pax_global_header00006660000000000000000000000064141522301370014510gustar00rootroot0000000000000052 comment=617beb4a13baa559b689bdea77dce26a5e983ada libxsmm-1.17/000077500000000000000000000000001415223013700131135ustar00rootroot00000000000000libxsmm-1.17/.abi.txt000066400000000000000000000361341415223013700144740ustar00rootroot00000000000000libxsmm_aligned_malloc libxsmm_backtrace libxsmm_barrier_create libxsmm_barrier_destroy libxsmm_barrier_init libxsmm_barrier_wait libxsmm_bbgemm_descriptor_init libxsmm_bgemm_descriptor_init libxsmm_bigemm_descriptor_init libxsmm_blas_dgemm0 libxsmm_blas_dgemm1 libxsmm_blas_dgemm2 libxsmm_blas_dgemm3 libxsmm_blas_dgemm_ libxsmm_blas_error libxsmm_blas_sgemm0 libxsmm_blas_sgemm1 libxsmm_blas_sgemm2 libxsmm_blas_sgemm3 libxsmm_blas_sgemm_ libxsmm_blas_xgemm libxsmm_blas_xgemm_ libxsmm_blocked_gemm_convert_b_to_a libxsmm_blocked_gemm_copyin_a libxsmm_blocked_gemm_copyin_b libxsmm_blocked_gemm_copyin_c libxsmm_blocked_gemm_copyout_c libxsmm_blocked_gemm_handle_create libxsmm_blocked_gemm_handle_destroy libxsmm_blocked_gemm_omp libxsmm_blocked_gemm_st libxsmm_blocked_gemm_transpose_b libxsmm_bmmdispatch libxsmm_bmmdispatch_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr_unroll libxsmm_bmmdispatch_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs_unroll libxsmm_bmmdispatch_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd_unroll libxsmm_bsgemm libxsmm_bsgemm_ libxsmm_bsgemm_descriptor_init libxsmm_bsmmdispatch libxsmm_bsmmdispatch_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr_unroll libxsmm_bsmmdispatch_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs_unroll libxsmm_bsmmdispatch_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd_unroll libxsmm_convert_bf16_f32 libxsmm_cpuid libxsmm_cpuid_name libxsmm_cpuid_vlen32 libxsmm_cpuid_x86 libxsmm_create_dcsr_reg libxsmm_create_pgemm_ac_rm libxsmm_create_pgemm_bc_rm libxsmm_create_scsr_reg libxsmm_create_xcsc_soa libxsmm_create_xcsr_soa libxsmm_dfsspmdm_create 
libxsmm_dfsspmdm_destroy libxsmm_dfsspmdm_execute libxsmm_dgemm libxsmm_dgemm0 libxsmm_dgemm1 libxsmm_dgemm2 libxsmm_dgemm3 libxsmm_dgemm_ libxsmm_dgemm_batch libxsmm_dgemm_batch_omp libxsmm_dgemm_descriptor_init libxsmm_dgemm_omp_ libxsmm_diff libxsmm_diff_16 libxsmm_diff_32 libxsmm_diff_48 libxsmm_diff_64 libxsmm_diff_char libxsmm_diff_i32 libxsmm_diff_i64 libxsmm_diff_i8 libxsmm_diff_n libxsmm_dispatch_getrf libxsmm_dispatch_mcopy libxsmm_dispatch_meltw libxsmm_dispatch_meltw_act_cvtfp32bf16 libxsmm_dispatch_meltw_add libxsmm_dispatch_meltw_copy libxsmm_dispatch_meltw_cvtfp32bf16 libxsmm_dispatch_meltw_cvtfp32bf16_act libxsmm_dispatch_meltw_mul libxsmm_dispatch_meltw_reduce libxsmm_dispatch_meltw_relu libxsmm_dispatch_meltw_scale libxsmm_dispatch_meltw_zero libxsmm_dispatch_pgemm libxsmm_dispatch_trans libxsmm_dispatch_trmm libxsmm_dispatch_trsm libxsmm_dmmavailable libxsmm_dmmcall libxsmm_dmmcall_abc libxsmm_dmmcall_prf libxsmm_dmmdispatch libxsmm_dmmdispatch_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr_unroll libxsmm_dmmdispatch_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs_unroll libxsmm_dmmdispatch_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd_unroll libxsmm_dnn_bind_scratch libxsmm_dnn_bind_tensor libxsmm_dnn_compare_tensor_datalayout libxsmm_dnn_copyin_tensor libxsmm_dnn_copyout_tensor libxsmm_dnn_create_conv_layer libxsmm_dnn_create_fullyconnected libxsmm_dnn_create_fusedbatchnorm libxsmm_dnn_create_fusedgroupnorm libxsmm_dnn_create_optimizer libxsmm_dnn_create_pooling libxsmm_dnn_create_rnncell libxsmm_dnn_create_softmaxloss libxsmm_dnn_create_tensor_datalayout libxsmm_dnn_dequantize libxsmm_dnn_destroy_conv_layer libxsmm_dnn_destroy_fullyconnected libxsmm_dnn_destroy_fusedbatchnorm libxsmm_dnn_destroy_fusedgroupnorm libxsmm_dnn_destroy_optimizer libxsmm_dnn_destroy_pooling libxsmm_dnn_destroy_rnncell libxsmm_dnn_destroy_softmaxloss libxsmm_dnn_destroy_tensor libxsmm_dnn_destroy_tensor_datalayout 
libxsmm_dnn_duplicate_tensor_datalayout libxsmm_dnn_execute libxsmm_dnn_execute_st libxsmm_dnn_fullyconnected_bind_scratch libxsmm_dnn_fullyconnected_bind_tensor libxsmm_dnn_fullyconnected_create_tensor_datalayout libxsmm_dnn_fullyconnected_execute_st libxsmm_dnn_fullyconnected_get_scratch_ptr libxsmm_dnn_fullyconnected_get_scratch_size libxsmm_dnn_fullyconnected_get_tensor libxsmm_dnn_fullyconnected_release_scratch libxsmm_dnn_fullyconnected_release_tensor libxsmm_dnn_fusedbatchnorm_bind_scratch libxsmm_dnn_fusedbatchnorm_bind_tensor libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout libxsmm_dnn_fusedbatchnorm_execute_st libxsmm_dnn_fusedbatchnorm_get_scratch_size libxsmm_dnn_fusedbatchnorm_get_tensor libxsmm_dnn_fusedbatchnorm_reduce_stats_st libxsmm_dnn_fusedbatchnorm_release_scratch libxsmm_dnn_fusedbatchnorm_release_tensor libxsmm_dnn_fusedgroupnorm_bind_scratch libxsmm_dnn_fusedgroupnorm_bind_tensor libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout libxsmm_dnn_fusedgroupnorm_execute_st libxsmm_dnn_fusedgroupnorm_get_scratch_size libxsmm_dnn_fusedgroupnorm_get_tensor libxsmm_dnn_fusedgroupnorm_reduce_stats_st libxsmm_dnn_fusedgroupnorm_release_scratch libxsmm_dnn_fusedgroupnorm_release_tensor libxsmm_dnn_get_error libxsmm_dnn_get_qtensor_scf libxsmm_dnn_get_scratch_size libxsmm_dnn_get_simd_width libxsmm_dnn_get_tensor libxsmm_dnn_get_tensor_data_ptr libxsmm_dnn_get_tensor_datalayout libxsmm_dnn_get_tensor_elements libxsmm_dnn_get_tensor_size libxsmm_dnn_link_qtensor libxsmm_dnn_link_tensor libxsmm_dnn_optimizer_bind_scratch libxsmm_dnn_optimizer_bind_tensor libxsmm_dnn_optimizer_create_tensor_datalayout libxsmm_dnn_optimizer_execute_st libxsmm_dnn_optimizer_get_scratch_ptr libxsmm_dnn_optimizer_get_scratch_size libxsmm_dnn_optimizer_get_tensor libxsmm_dnn_optimizer_release_scratch libxsmm_dnn_optimizer_release_tensor libxsmm_dnn_pooling_bind_scratch libxsmm_dnn_pooling_bind_tensor libxsmm_dnn_pooling_create_tensor_datalayout libxsmm_dnn_pooling_execute_st 
libxsmm_dnn_pooling_get_scratch_size libxsmm_dnn_pooling_get_tensor libxsmm_dnn_pooling_release_scratch libxsmm_dnn_pooling_release_tensor libxsmm_dnn_quantize libxsmm_dnn_quantize_act libxsmm_dnn_quantize_fil libxsmm_dnn_release_scratch libxsmm_dnn_release_tensor libxsmm_dnn_rnncell_allocate_forget_bias libxsmm_dnn_rnncell_bind_internalstate libxsmm_dnn_rnncell_bind_scratch libxsmm_dnn_rnncell_bind_tensor libxsmm_dnn_rnncell_create_tensor_datalayout libxsmm_dnn_rnncell_execute_st libxsmm_dnn_rnncell_get_internalstate_ptr libxsmm_dnn_rnncell_get_internalstate_size libxsmm_dnn_rnncell_get_scratch_ptr libxsmm_dnn_rnncell_get_scratch_size libxsmm_dnn_rnncell_get_sequence_length libxsmm_dnn_rnncell_get_tensor libxsmm_dnn_rnncell_release_internalstate libxsmm_dnn_rnncell_release_scratch libxsmm_dnn_rnncell_release_tensor libxsmm_dnn_rnncell_set_sequence_length libxsmm_dnn_set_qtensor_scf libxsmm_dnn_set_tensor_data_ptr libxsmm_dnn_softmaxloss_bind_scratch libxsmm_dnn_softmaxloss_bind_tensor libxsmm_dnn_softmaxloss_create_tensor_datalayout libxsmm_dnn_softmaxloss_execute_st libxsmm_dnn_softmaxloss_get_loss libxsmm_dnn_softmaxloss_get_scratch_ptr libxsmm_dnn_softmaxloss_get_scratch_size libxsmm_dnn_softmaxloss_get_tensor libxsmm_dnn_softmaxloss_release_scratch libxsmm_dnn_softmaxloss_release_tensor libxsmm_dnn_trans_reg_bf16_filter libxsmm_dnn_trans_reg_filter libxsmm_dnn_typesize libxsmm_dnn_zero_tensor libxsmm_dsqrt libxsmm_finalize libxsmm_finalize_ libxsmm_free libxsmm_gcd libxsmm_gemm_batch libxsmm_gemm_batch_ libxsmm_gemm_batch_omp libxsmm_gemm_batch_omp_ libxsmm_gemm_descriptor_dinit libxsmm_gemm_descriptor_dinit2 libxsmm_gemm_descriptor_init libxsmm_gemm_descriptor_init2 libxsmm_gemm_descriptor_init3 libxsmm_gemm_dprint libxsmm_gemm_dprint2 libxsmm_gemm_handle_get_scratch_size libxsmm_gemm_handle_init libxsmm_gemm_internal_set_batchflag libxsmm_gemm_print libxsmm_gemm_print2 libxsmm_gemm_thread libxsmm_gemm_xprint libxsmm_generator_gemm_directasm 
libxsmm_generator_gemm_inlineasm libxsmm_generator_gemm_kernel libxsmm_generator_getrf_kernel libxsmm_generator_matcopy_kernel libxsmm_generator_mateltwise_kernel libxsmm_generator_packed_gemm_ac_rm libxsmm_generator_packed_gemm_bc_rm libxsmm_generator_pgemm_kernel libxsmm_generator_spgemm libxsmm_generator_spgemm_csc_kernel libxsmm_generator_spgemm_csc_soa_kernel libxsmm_generator_spgemm_csr_kernel libxsmm_generator_spgemm_csr_reg_kernel libxsmm_generator_spgemm_csr_soa_kernel libxsmm_generator_transpose_kernel libxsmm_generator_trmm_kernel libxsmm_generator_trsm_kernel libxsmm_get_default_allocator libxsmm_get_gemm_auto_prefetch libxsmm_get_gemm_prefetch libxsmm_get_gemm_xprefetch libxsmm_get_kernel_info libxsmm_get_malloc libxsmm_get_malloc_info libxsmm_get_malloc_xinfo libxsmm_get_mcopykernel_info libxsmm_get_meltwkernel_info libxsmm_get_mmkernel_info libxsmm_get_pid libxsmm_get_registry_info libxsmm_get_scratch_allocator libxsmm_get_scratch_info libxsmm_get_scratch_limit libxsmm_get_target_arch libxsmm_get_target_archid libxsmm_get_tid libxsmm_get_timer_info libxsmm_get_transkernel_info libxsmm_get_verbosity libxsmm_getrf_descriptor_init libxsmm_hash libxsmm_hash_char libxsmm_hash_i32 libxsmm_hash_i64 libxsmm_hash_i8 libxsmm_hash_string libxsmm_icbrt_u32 libxsmm_icbrt_u64 libxsmm_init libxsmm_init_ libxsmm_isqrt2_u32 libxsmm_isqrt_u32 libxsmm_isqrt_u64 libxsmm_itrans libxsmm_itrans_ libxsmm_itrans_d1 libxsmm_itrans_d2 libxsmm_itrans_p0 libxsmm_itrans_s1 libxsmm_itrans_s2 libxsmm_lcm libxsmm_malloc libxsmm_matcopy libxsmm_matcopy_ libxsmm_matcopy_d1 libxsmm_matcopy_d2 libxsmm_matcopy_omp libxsmm_matcopy_omp_ libxsmm_matcopy_p0 libxsmm_matcopy_s1 libxsmm_matcopy_s2 libxsmm_matcopy_thread libxsmm_matcopy_thread_internal libxsmm_matdiff libxsmm_matdiff_ libxsmm_matdiff_clear libxsmm_matdiff_clear_ libxsmm_matdiff_reduce libxsmm_matdiff_reduce_ libxsmm_mcopy_descriptor_init libxsmm_meltw_descriptor_init libxsmm_meltw_descriptor_init2 libxsmm_memcmp 
libxsmm_mhd_element_comparison libxsmm_mhd_element_conversion libxsmm_mhd_read libxsmm_mhd_read_header libxsmm_mhd_typeinfo libxsmm_mhd_typename libxsmm_mhd_write libxsmm_mmbatch libxsmm_mmbatch_ libxsmm_mmbatch_begin libxsmm_mmbatch_begin_ libxsmm_mmbatch_blas libxsmm_mmbatch_end libxsmm_mmbatch_end_ libxsmm_mmbatch_kernel libxsmm_mutex_acquire libxsmm_mutex_create libxsmm_mutex_destroy libxsmm_mutex_release libxsmm_mutex_trylock libxsmm_offset libxsmm_original_dgemm libxsmm_original_dgemm_batch libxsmm_original_dgemv libxsmm_original_sgemm libxsmm_original_sgemm_batch libxsmm_original_sgemv libxsmm_otrans libxsmm_otrans_ libxsmm_otrans_d1 libxsmm_otrans_d2 libxsmm_otrans_omp libxsmm_otrans_omp_ libxsmm_otrans_p0 libxsmm_otrans_s1 libxsmm_otrans_s2 libxsmm_otrans_thread libxsmm_otrans_thread_internal libxsmm_pgemm_descriptor_init libxsmm_primes_u32 libxsmm_product_limit libxsmm_ptr_b0 libxsmm_ptr_b1 libxsmm_ptr_b2 libxsmm_ptr_c0 libxsmm_ptr_c1 libxsmm_ptr_c2 libxsmm_ptr_d0 libxsmm_ptr_d1 libxsmm_ptr_d2 libxsmm_ptr_dmm libxsmm_ptr_i0 libxsmm_ptr_i1 libxsmm_ptr_i2 libxsmm_ptr_j0 libxsmm_ptr_j1 libxsmm_ptr_j2 libxsmm_ptr_l0 libxsmm_ptr_l1 libxsmm_ptr_l2 libxsmm_ptr_null libxsmm_ptr_s0 libxsmm_ptr_s1 libxsmm_ptr_s2 libxsmm_ptr_smm libxsmm_ptr_w0 libxsmm_ptr_w1 libxsmm_ptr_w2 libxsmm_ptr_wimm libxsmm_ptr_z0 libxsmm_ptr_z1 libxsmm_ptr_z2 libxsmm_realloc libxsmm_release_dmmkernel libxsmm_release_kernel libxsmm_release_kernel_ libxsmm_release_scratch libxsmm_release_smmkernel libxsmm_release_wimmkernel libxsmm_rnaz_convert_fp32_bf16 libxsmm_rne_convert_fp32_bf16 libxsmm_rng_create_avx512_extstate libxsmm_rng_destroy_avx512_extstate libxsmm_rng_f32_seq libxsmm_rng_f64 libxsmm_rng_seq libxsmm_rng_set_seed libxsmm_rng_u32 libxsmm_rwlock_acqread libxsmm_rwlock_acquire libxsmm_rwlock_create libxsmm_rwlock_destroy libxsmm_rwlock_release libxsmm_rwlock_relread libxsmm_rwlock_trylock libxsmm_rwlock_tryread libxsmm_scratch_malloc libxsmm_set_default_allocator 
libxsmm_set_gemm_auto_prefetch libxsmm_set_malloc libxsmm_set_scratch_allocator libxsmm_set_scratch_limit libxsmm_set_target_arch libxsmm_set_target_archid libxsmm_set_verbosity libxsmm_sexp2 libxsmm_sexp2_i8 libxsmm_sexp2_i8i libxsmm_sexp2_u8 libxsmm_sfsspmdm_create libxsmm_sfsspmdm_destroy libxsmm_sfsspmdm_execute libxsmm_sgemm libxsmm_sgemm0 libxsmm_sgemm1 libxsmm_sgemm2 libxsmm_sgemm3 libxsmm_sgemm_ libxsmm_sgemm_batch libxsmm_sgemm_batch_omp libxsmm_sgemm_descriptor_init libxsmm_sgemm_omp_ libxsmm_shuffle libxsmm_shuffle_ libxsmm_sink libxsmm_smmavailable libxsmm_smmcall libxsmm_smmcall_abc libxsmm_smmcall_prf libxsmm_smmdispatch libxsmm_smmdispatch_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr_unroll libxsmm_smmdispatch_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs_unroll libxsmm_smmdispatch_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd_unroll libxsmm_spinlock_acquire libxsmm_spinlock_create libxsmm_spinlock_destroy libxsmm_spinlock_release libxsmm_spinlock_trylock libxsmm_spmdm_compute_bfloat16_thread libxsmm_spmdm_compute_fp32_thread libxsmm_spmdm_createSparseSlice_bfloat16_thread libxsmm_spmdm_createSparseSlice_fp32_thread libxsmm_spmdm_destroy libxsmm_spmdm_get_num_compute_blocks libxsmm_spmdm_get_num_createSparseSlice_blocks libxsmm_spmdm_init libxsmm_ssbimmdispatch libxsmm_ssbimmdispatch_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr_unroll libxsmm_ssbimmdispatch_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs_unroll libxsmm_ssbimmdispatch_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd_unroll libxsmm_ssqrt libxsmm_strerror libxsmm_subimmdispatch libxsmm_subimmdispatch_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr_unroll libxsmm_subimmdispatch_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs_unroll libxsmm_subimmdispatch_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd_unroll libxsmm_sububmmdispatch libxsmm_sububmmdispatch_reducebatch_addr 
libxsmm_sububmmdispatch_reducebatch_addr_unroll libxsmm_sububmmdispatch_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs_unroll libxsmm_sububmmdispatch_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd_unroll libxsmm_timer_duration libxsmm_timer_ncycles libxsmm_timer_ncycles_ libxsmm_timer_tick libxsmm_trace libxsmm_trace_finalize libxsmm_trace_info libxsmm_trace_init libxsmm_trans_descriptor_init libxsmm_trmm_descriptor_init libxsmm_trsm_descriptor_init libxsmm_truncate_convert_f32_bf16 libxsmm_typesize libxsmm_usbimmdispatch libxsmm_usbimmdispatch_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr_unroll libxsmm_usbimmdispatch_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs_unroll libxsmm_usbimmdispatch_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd_unroll libxsmm_uubimmdispatch libxsmm_uubimmdispatch_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr_unroll libxsmm_uubimmdispatch_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs_unroll libxsmm_uubimmdispatch_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd_unroll libxsmm_wigemm libxsmm_wigemm0 libxsmm_wigemm1 libxsmm_wigemm2 libxsmm_wigemm3 libxsmm_wigemm_ libxsmm_wigemm_descriptor_init libxsmm_wimmavailable libxsmm_wimmcall libxsmm_wimmcall_abc libxsmm_wimmcall_prf libxsmm_wimmdispatch libxsmm_wimmdispatch_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr_unroll libxsmm_wimmdispatch_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs_unroll libxsmm_wimmdispatch_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd_unroll libxsmm_xclear_ libxsmm_xdiff_ libxsmm_xdispatch libxsmm_xdispatch_ libxsmm_xgemm libxsmm_xgemm_omp libxsmm_xgemm_omp_ libxsmm_xhash_ libxsmm_xmmcall_ libxsmm_xmmcall_abc_ libxsmm_xmmcall_prf_ libxsmm_xmmdispatch libxsmm_xmmdispatch2_ libxsmm_xmmdispatch_ libxsmm_xregister libxsmm_xregister_ libxsmm_xrelease libxsmm_xrelease_ libxsmmf_get_target_arch 
libxsmm-1.17/.clang-format000066400000000000000000000012361415223013700154700ustar00rootroot00000000000000--- AlignAfterOpenBracket: DontAlign AlignEscapedNewlines: DontAlign AlignTrailingComments: false AllowShortCaseLabelsOnASingleLine: true AllowShortIfStatementsOnASingleLine: WithoutElse AllowShortLoopsOnASingleLine: true BraceWrapping: BeforeCatch: true BeforeElse: true BreakBeforeBraces: Custom ColumnLimit: 132 ConstructorInitializerIndentWidth: 0 ContinuationIndentWidth: 2 IndentCaseLabels: true IndentPPDirectives: AfterHash IndentWidth: 2 KeepEmptyLinesAtTheStartOfBlocks: false MaxEmptyLinesToKeep: 2 PenaltyBreakAssignment: 50 PointerAlignment: Left ReflowComments: false SortIncludes: false SpaceAfterTemplateKeyword: false UseTab: Never ... libxsmm-1.17/.flock.sh000077500000000000000000000020521415223013700146250ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### FLOCK=$(command -v flock) if [ -d "$1" ]; then ABSDIR=$(cd "$1" && pwd -P) elif [ -f "$1" ]; then ABSDIR=$(cd "$(dirname "$1")" && pwd -P) else ABSDIR=$(cd "$(dirname "$0")" && pwd -P) fi shift cd "${ABSDIR}" || true if [ "${FLOCK}" ]; then ${FLOCK} "${ABSDIR}" -c "$@" else eval "$@" fi libxsmm-1.17/.mktmp.sh000077500000000000000000000021551415223013700146630ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. 
# # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### MKTEMP=$(command -v mktemp) MV=$(command -v mv) if [ "${MKTEMP}" ] && [ "${MV}" ]; then TEMPLATE=${1/XXXXXX/}.XXXXXX TMPFILE=$(${MKTEMP} "${TEMPLATE}") EXTFILE=${TMPFILE: -6} NEWFILE=${1/XXXXXX/${EXTFILE}} if [ "$1" != "${NEWFILE}" ]; then ${MV} "${TMPFILE}" "${NEWFILE}" echo "${NEWFILE}" else echo "${TMPFILE}" fi else touch "$1" fi libxsmm-1.17/.state.sh000077500000000000000000000045731415223013700146610ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### MKDIR=$(command -v mkdir) DIFF=$(command -v diff) UNIQ=$(command -v uniq) SED=$(command -v sed) TR=$(command -v tr) if [ "${MKDIR}" ] && [ "${SED}" ] && [ "${TR}" ] && [ "${DIFF}" ] && [ "${UNIQ}" ]; then if [ "$1" ]; then STATEFILE=$1/.state ${MKDIR} -p "$1" shift else STATEFILE=.state fi STATE=$(${TR} '?' 
'\n' | ${TR} '"' \' | ${SED} -e 's/^ */\"/' -e 's/ */ /g' -e 's/ *$/\\n\"/') TOUCH=$(command -v touch) if [ -e "${STATEFILE}" ]; then if [ "$@" ]; then EXCLUDE="-e /\($(echo "$@" | ${SED} "s/[[:space:]][[:space:]]*/\\\|/g" | ${SED} "s/\\\|$//")\)/d" fi # BSD's diff does not support --unchanged-line-format="" STATE_DIFF=$(printf "%s\n" "${STATE}" \ | ${DIFF} "${STATEFILE}" - 2>/dev/null | ${SED} -n 's/[<>] \(..*\)/\1/p' \ | ${SED} -e 's/=..*$//' -e 's/\"//g' -e '/^$/d' ${EXCLUDE} | ${UNIQ}) RESULT=$? if [ "0" != "${RESULT}" ] || [ "${STATE_DIFF}" ]; then if [ "" = "${NOSTATE}" ] || [ "0" = "${NOSTATE}" ]; then printf "%s\n" "${STATE}" > "${STATEFILE}" fi echo "$0 ${STATE_DIFF}" # only needed to execute body of .state-rule if [ "${TOUCH}" ]; then ${TOUCH} "$0"; fi fi else # difference must not be determined if [ "" = "${NOSTATE}" ] || [ "0" = "${NOSTATE}" ]; then printf "%s\n" "${STATE}" > "${STATEFILE}" fi echo "$0" # only needed to execute body of .state-rule if [ "${TOUCH}" ]; then ${TOUCH} "$0"; fi fi elif [ ! "${DIFF}" ]; then >&2 echo "Error: please install diffutils - diff command is missing!" exit 1 else >&2 echo "Error: missing prerequisites!" 
exit 1 fi libxsmm-1.17/.test-dnn.yml000066400000000000000000000166171415223013700154630ustar00rootroot00000000000000script: - make -e ${MAKEJ} && cd samples/deeplearning/cnnlayer && make -e ${MAKEJ} && (export CHECK=1 ITERS=1; for FORMAT in $(if [ "" != "${FORMATS}" ]; then echo "${FORMATS}"; else echo "L"; fi); do for MB_NT in $(if [ "" != "${MB_THREADS}" ]; then echo "${MB_THREADS}"; else echo "32_0"; fi); do MB=$(echo ${MB_NT} | cut -d_ -f1); export OMP_NUM_THREADS=$(echo ${MB_NT} | cut -d_ -f2); for PAD in 0 1; do echo; echo "--- TEST ResNet-50 (format=${FORMAT} pad=${PAD} mb=${MB} nt=${OMP_NUM_THREADS})"; ./run_resnet50.sh ${MB} ${ITERS} -1 f32 F ${FORMAT} ${PAD} && ./run_resnet50.sh ${MB} ${ITERS} -1 f32 B ${FORMAT} ${PAD} && ./run_resnet50.sh ${MB} ${ITERS} -1 f32 U ${FORMAT} ${PAD}; done done done) - make -e ${MAKEJ} && cd samples/deeplearning/cnnlayer && make -e ${MAKEJ} && (export CHECK=1 ITERS=1; for FORMAT in $(if [ "" != "${FORMATS}" ]; then echo "${FORMATS}"; else echo "L"; fi); do for MB_NT in $(if [ "" != "${MB_THREADS}" ]; then echo "${MB_THREADS}"; else echo "32_0"; fi); do MB=$(echo ${MB_NT} | cut -d_ -f1); export OMP_NUM_THREADS=$(echo ${MB_NT} | cut -d_ -f2); for PAD in 0 1; do echo; echo "--- TEST AlexNet (format=${FORMAT} pad=${PAD} mb=${MB} nt=${OMP_NUM_THREADS})"; ./run_alexnet.sh ${MB} ${ITERS} -1 f32 F ${FORMAT} ${PAD} && ./run_alexnet.sh ${MB} ${ITERS} -1 f32 B ${FORMAT} ${PAD} && ./run_alexnet.sh ${MB} ${ITERS} -1 f32 U ${FORMAT} ${PAD}; done done done) - make -e ${MAKEJ} && cd samples/deeplearning/cnnlayer && make -e ${MAKEJ} && (export CHECK=1 ITERS=1; for FORMAT in $(if [ "" != "${FORMATS}" ]; then echo "${FORMATS}"; else echo "L"; fi); do for MB_NT in $(if [ "" != "${MB_THREADS}" ]; then echo "${MB_THREADS}"; else echo "32_0"; fi); do MB=$(echo ${MB_NT} | cut -d_ -f1); export OMP_NUM_THREADS=$(echo ${MB_NT} | cut -d_ -f2); for PAD in 0 1; do echo; echo "--- TEST Overfeat (format=${FORMAT} pad=${PAD} mb=${MB} nt=${OMP_NUM_THREADS})"; 
./run_overfeat.sh ${MB} ${ITERS} -1 f32 F ${FORMAT} ${PAD} && ./run_overfeat.sh ${MB} ${ITERS} -1 f32 B ${FORMAT} ${PAD} && ./run_overfeat.sh ${MB} ${ITERS} -1 f32 U ${FORMAT} ${PAD}; done done done) - make -e ${MAKEJ} && cd samples/deeplearning/cnnlayer && make -e ${MAKEJ} && (export CHECK=1 ITERS=1; for FORMAT in $(if [ "" != "${FORMATS}" ]; then echo "${FORMATS}"; else echo "L"; fi); do for MB_NT in $(if [ "" != "${MB_THREADS}" ]; then echo "${MB_THREADS}"; else echo "32_0"; fi); do MB=$(echo ${MB_NT} | cut -d_ -f1); export OMP_NUM_THREADS=$(echo ${MB_NT} | cut -d_ -f2); for PAD in 0 1; do echo; echo "--- TEST GoogleNet-v1 (format=${FORMAT} pad=${PAD} mb=${MB} nt=${OMP_NUM_THREADS})"; ./run_googlenetv1.sh ${MB} ${ITERS} -1 f32 F ${FORMAT} ${PAD} && ./run_googlenetv1.sh ${MB} ${ITERS} -1 f32 B ${FORMAT} ${PAD} && ./run_googlenetv1.sh ${MB} ${ITERS} -1 f32 U ${FORMAT} ${PAD}; done done done) - make -e ${MAKEJ} && cd samples/deeplearning/cnnlayer && make -e ${MAKEJ} && (export CHECK=1 ITERS=1; for FORMAT in $(if [ "" != "${FORMATS}" ]; then echo "${FORMATS}"; else echo "L"; fi); do for MB_NT in $(if [ "" != "${MB_THREADS}" ]; then echo "${MB_THREADS}"; else echo "32_0"; fi); do MB=$(echo ${MB_NT} | cut -d_ -f1); export OMP_NUM_THREADS=$(echo ${MB_NT} | cut -d_ -f2); for PAD in 0 1; do echo; echo "--- TEST GoogleNet-v3 (format=${FORMAT} pad=${PAD} mb=${MB} nt=${OMP_NUM_THREADS})"; ./run_googlenetv3.sh ${MB} ${ITERS} -1 f32 F ${FORMAT} ${PAD} && ./run_googlenetv3.sh ${MB} ${ITERS} -1 f32 B ${FORMAT} ${PAD} && ./run_googlenetv3.sh ${MB} ${ITERS} -1 f32 U ${FORMAT} ${PAD}; done done done) - make -e ${MAKEJ} && cd samples/deeplearning/cnnlayer && make -e ${MAKEJ} && (export CHECK=1 ITERS=1; for FORMAT in $(if [ "" != "${FORMATS}" ]; then echo "${FORMATS}"; else echo "L"; fi); do for MB_NT in $(if [ "" != "${MB_THREADS}" ]; then echo "${MB_THREADS}"; else echo "32_0"; fi); do MB=$(echo ${MB_NT} | cut -d_ -f1); export OMP_NUM_THREADS=$(echo ${MB_NT} | cut -d_ -f2); for 
PAD in 0 1; do echo; echo "--- TEST dcGAN (format=${FORMAT} pad=${PAD} mb=${MB} nt=${OMP_NUM_THREADS})"; ./run_dcgan.sh ${MB} ${ITERS} -1 f32 F ${FORMAT} ${PAD} && ./run_dcgan.sh ${MB} ${ITERS} -1 f32 B ${FORMAT} ${PAD} && ./run_dcgan.sh ${MB} ${ITERS} -1 f32 U ${FORMAT} ${PAD}; done done done) - make -e ${MAKEJ} && cd samples/deeplearning/cnnlayer && make -e ${MAKEJ} && (export CHECK=1 ITERS=1; for FORMAT in $(if [ "" != "${FORMATS}" ]; then echo "${FORMATS}"; else echo "L"; fi); do for MB_NT in $(if [ "" != "${MB_THREADS}" ]; then echo "${MB_THREADS}"; else echo "32_0"; fi); do MB=$(echo ${MB_NT} | cut -d_ -f1); export OMP_NUM_THREADS=$(echo ${MB_NT} | cut -d_ -f2); for PAD in 0 1; do echo; echo "--- TEST VGGa (format=${FORMAT} pad=${PAD} mb=${MB} nt=${OMP_NUM_THREADS})"; ./run_vgga.sh ${MB} ${ITERS} -1 f32 F ${FORMAT} ${PAD} && ./run_vgga.sh ${MB} ${ITERS} -1 f32 B ${FORMAT} ${PAD} && ./run_vgga.sh ${MB} ${ITERS} -1 f32 U ${FORMAT} ${PAD}; done done done) - make -e ${MAKEJ} && cd samples/deeplearning/cnnlayer && make -e ${MAKEJ} && (export OMP_NUM_THREADS=$(if [ "" != "${MB_THREADS}" ]; then echo "${MB_THREADS}" | cut -d_ -f1; else echo "0"; fi); export CHECK=1 ITERS=1; for FORMAT in $(if [ "" != "${FORMATS}" ]; then echo "${FORMATS}"; else echo "L"; fi); do for PAD in 0 1; do echo; echo "--- TEST DeepBench (format=${FORMAT} pad=${PAD})"; ./run_deepbench.sh ${ITERS} -1 f32 F ${FORMAT} ${PAD} && ./run_deepbench.sh ${ITERS} -1 f32 B ${FORMAT} ${PAD} && ./run_deepbench.sh ${ITERS} -1 f32 U ${FORMAT} ${PAD}; done done) - make -e ${MAKEJ} && cd samples/deeplearning/cnnlayer && make -e ${MAKEJ} && (echo; echo "--- TEST Quicktest"; for MB_NT in $(if [ "" != "${MB_THREADS}" ]; then echo "${MB_THREADS}"; else echo "32_0"; fi); do MB=$(echo ${MB_NT} | cut -d_ -f1); export OMP_NUM_THREADS=$(echo ${MB_NT} | cut -d_ -f2); ./layer_example_f32 1 299 299 ${MB} 3 32 3 3 0 0 2 U T 1 && ./layer_example_f32 1 13 13 ${MB} 192 384 3 3 1 1 1 B L 1; done) - make -e ${MAKEJ} && cd 
samples/deeplearning/cnnlayer && make -e ${MAKEJ} && (export OMP_NUM_THREADS=$(if [ "" != "${MB_THREADS}" ]; then echo "${MB_THREADS}" | cut -d_ -f1; else echo "0"; fi); export CHECK=1 CHECK_SCALE=1 ITERS=1 MB=${OMP_NUM_THREADS}; for KIND in F B U; do echo; echo "--- TEST ResNet-50 (precision=bf16 kind=${KIND} mb=${MB} nt=${OMP_NUM_THREADS})"; ./run_resnet50.sh ${MB} ${ITERS} -1 bf16 ${KIND} L 1; done) - make -e ${MAKEJ} && cd samples/deeplearning/cnnlayer && make -e ${MAKEJ} && (export OMP_NUM_THREADS=$(if [ "" != "${MB_THREADS}" ]; then echo "${MB_THREADS}" | cut -d_ -f1; else echo "0"; fi); export CHECK=1 CHECK_SCALE=1 ITERS=1 MB=${OMP_NUM_THREADS}; for KIND in F B U; do echo; echo "--- TEST ResNet-50 (precision=bf16 kind=${KIND} mb=${MB} nt=${OMP_NUM_THREADS})"; ./run_resnet50_mock.sh ${MB} ${ITERS} -1 bf16 ${KIND} L 1; done) libxsmm-1.17/.test-fc.yml000066400000000000000000000070551415223013700152700ustar00rootroot00000000000000script: - make -e ${MAKEJ} && cd samples/deeplearning/fullyconnecteddriver && make -e ${MAKEJ} && (echo; echo "--- TEST FC (FWD,resnet50,FP32)"; ./run_resnet50.sh 28 1 1 f32 0 F ) && (echo; echo "--- TEST FC (BWD,resnet50,FP32)"; ./run_resnet50.sh 28 1 1 f32 0 B ) && (echo; echo "--- TEST FC (UPD,resnet50,FP32)"; ./run_resnet50.sh 28 1 1 f32 0 U ) - make -e ${MAKEJ} && cd samples/deeplearning/fullyconnecteddriver && make -e ${MAKEJ} && (echo; echo "--- TEST FC (FWD,resnet50,BF16)"; CHECK_SCALE=0.001 ./run_resnet50.sh 28 1 1 bf16 0 F ) && (echo; echo "--- TEST FC (BWD,resnet50,BF16)"; CHECK_SCALE=0.001 ./run_resnet50.sh 28 1 1 bf16 0 B ) && (echo; echo "--- TEST FC (UPD,resnet50,BF16)"; CHECK_SCALE=0.001 ./run_resnet50.sh 28 1 1 bf16 0 U ) - make -e ${MAKEJ} && cd samples/deeplearning/fullyconnecteddriver && make -e ${MAKEJ} && (echo; echo "--- TEST FC (NCNC-KCCK,FP32)"; ./run_fullyconnected.sh B f32 1 1024 A 0 32 32 32 ) && (echo; echo "--- TEST FC (NCNC-KCCK,FP32)"; ./run_fullyconnected.sh B f32 1 1024 A 1 32 32 32 ) && (echo; echo 
"--- TEST FC (NCNC-KCCK,FP32)"; ./run_fullyconnected.sh B f32 1 1024 A 2 32 32 32 ) && (echo; echo "--- TEST FC (NCNC-KCCK,FP32)"; ./run_fullyconnected.sh B f32 1 1024 A 3 32 32 32 ) && (echo; echo "--- TEST FC (NCNC-KCCK,FP32)"; ./run_fullyconnected.sh B f32 1 1024 A 4 32 32 32 ) && (echo; echo "--- TEST FC (NCNC-KCCK,FP32)"; ./run_fullyconnected.sh B f32 1 1024 A 5 32 32 32 ) && (echo; echo "--- TEST FC (NCNC-KCCK,FP32)"; ./run_fullyconnected.sh B f32 1 1024 A 0 64 64 64 ) && (echo; echo "--- TEST FC (NCNC-KCCK,FP32)"; ./run_fullyconnected.sh B f32 1 1024 A 1 64 64 64 ) && (echo; echo "--- TEST FC (NCNC-KCCK,FP32)"; ./run_fullyconnected.sh B f32 1 1024 A 2 64 64 64 ) && (echo; echo "--- TEST FC (NCNC-KCCK,FP32)"; ./run_fullyconnected.sh B f32 1 1024 A 3 64 64 64 ) && (echo; echo "--- TEST FC (NCNC-KCCK,FP32)"; ./run_fullyconnected.sh B f32 1 1024 A 4 64 64 64 ) && (echo; echo "--- TEST FC (NCNC-KCCK,FP32)"; ./run_fullyconnected.sh B f32 1 1024 A 5 64 64 64 ) - make -e ${MAKEJ} && cd samples/deeplearning/fullyconnecteddriver && make -e ${MAKEJ} && (echo; echo "--- TEST FC (NCNC-KCCK,BF16)"; CHECK_SCALE=0.001 ./run_fullyconnected.sh B bf16 1 1024 A 0 32 32 32 ) && (echo; echo "--- TEST FC (NCNC-KCCK,BF16)"; CHECK_SCALE=0.001 ./run_fullyconnected.sh B bf16 1 1024 A 1 32 32 32 ) && (echo; echo "--- TEST FC (NCNC-KCCK,BF16)"; CHECK_SCALE=0.001 ./run_fullyconnected.sh B bf16 1 1024 A 2 32 32 32 ) && (echo; echo "--- TEST FC (NCNC-KCCK,BF16)"; CHECK_SCALE=0.001 ./run_fullyconnected.sh B bf16 1 1024 A 3 32 32 32 ) && (echo; echo "--- TEST FC (NCNC-KCCK,BF16)"; CHECK_SCALE=0.001 ./run_fullyconnected.sh B bf16 1 1024 A 4 32 32 32 ) && (echo; echo "--- TEST FC (NCNC-KCCK,BF16)"; CHECK_SCALE=0.001 ./run_fullyconnected.sh B bf16 1 1024 A 5 32 32 32 ) && (echo; echo "--- TEST FC (NCNC-KCCK,BF16)"; CHECK_SCALE=0.001 ./run_fullyconnected.sh B bf16 1 1024 A 0 64 64 64 ) && (echo; echo "--- TEST FC (NCNC-KCCK,BF16)"; CHECK_SCALE=0.001 ./run_fullyconnected.sh B bf16 1 1024 A 1 64 
64 64 ) && (echo; echo "--- TEST FC (NCNC-KCCK,BF16)"; CHECK_SCALE=0.001 ./run_fullyconnected.sh B bf16 1 1024 A 2 64 64 64 ) && (echo; echo "--- TEST FC (NCNC-KCCK,BF16)"; CHECK_SCALE=0.001 ./run_fullyconnected.sh B bf16 1 1024 A 3 64 64 64 ) && (echo; echo "--- TEST FC (NCNC-KCCK,BF16)"; CHECK_SCALE=0.001 ./run_fullyconnected.sh B bf16 1 1024 A 4 64 64 64 ) && (echo; echo "--- TEST FC (NCNC-KCCK,BF16)"; CHECK_SCALE=0.001 ./run_fullyconnected.sh B bf16 1 1024 A 5 64 64 64 ) libxsmm-1.17/.test-rnn.yml000066400000000000000000000057671415223013700155050ustar00rootroot00000000000000script: - make -e ${MAKEJ} && cd samples/deeplearning/rnndriver && make -e ${MAKEJ} && (echo; echo "--- TEST RNN (NCNC-KCCK)"; CHECK=1 ./run_rnncell.sh ncnc_kcck f32 1 0 1 256 32 32 32 ) && (echo; echo "--- TEST RNN (NCNC-KCCK)"; CHECK=1 ./run_rnncell.sh ncnc_kcck f32 1 0 2 256 32 32 32 ) && (echo; echo "--- TEST RNN (NCNC-KCCK)"; CHECK=1 ./run_rnncell.sh ncnc_kcck f32 1 0 3 256 32 32 32 ) && (echo; echo "--- TEST RNN (NC-KCCK)"; CHECK=1 ./run_rnncell.sh nc_kcck f32 1 0 1 256 32 32 32 ) && (echo; echo "--- TEST RNN (NC-KCCK)"; CHECK=1 ./run_rnncell.sh nc_kcck f32 1 3 1 256 32 32 32 ) && (echo; echo "--- TEST RNN (NC-KCCK)"; CHECK=1 ./run_rnncell.sh nc_kcck f32 1 0 2 256 32 32 32 ) && (echo; echo "--- TEST RNN (NC-KCCK)"; CHECK=1 ./run_rnncell.sh nc_kcck f32 1 3 2 256 32 32 32 ) && (echo; echo "--- TEST RNN (NC-KCCK)"; CHECK=1 ./run_rnncell.sh nc_kcck f32 1 0 3 256 32 32 32 ) && (echo; echo "--- TEST RNN (NC-KCCK)"; CHECK=1 ./run_rnncell.sh nc_kcck f32 1 3 3 256 32 32 32 ) && (echo; echo "--- TEST RNN (NC-CK)"; CHECK=1 ./run_rnncell.sh nc_ck f32 1 0 1 256 32 32 32 ) && (echo; echo "--- TEST RNN (NC-CK)"; CHECK=1 ./run_rnncell.sh nc_ck f32 1 3 1 256 32 32 32 ) && (echo; echo "--- TEST RNN (NC-CK)"; CHECK=1 ./run_rnncell.sh nc_ck f32 1 0 2 256 32 32 32 ) && (echo; echo "--- TEST RNN (NC-CK)"; CHECK=1 ./run_rnncell.sh nc_ck f32 1 3 2 256 32 32 32 ) && (echo; echo "--- TEST RNN (NC-CK)"; CHECK=1 
./run_rnncell.sh nc_ck f32 1 0 3 256 32 32 32 ) && (echo; echo "--- TEST RNN (NC-CK)"; CHECK=1 ./run_rnncell.sh nc_ck f32 1 3 3 256 32 32 32 ) - make -e ${MAKEJ} && cd samples/deeplearning/lstmdriver && make -e ${MAKEJ} && (echo; echo "--- TEST LSTM (NC-CK)"; CHECK=1 ./run_lstmcell.sh nc_ck f32 1 0 256 32 32 32 ) && (echo; echo "--- TEST LSTM (NC-CK)"; CHECK=1 ./run_lstmcell.sh nc_ck f32 1 3 256 32 32 32 ) && (echo; echo "--- TEST LSTM (NC-KCCK)"; CHECK=1 ./run_lstmcell.sh nc_kcck f32 1 0 256 32 32 32 ) && (echo; echo "--- TEST LSTM (NC-KCCK)"; CHECK=1 ./run_lstmcell.sh nc_kcck f32 1 3 256 32 32 32 ) - make -e ${MAKEJ} && cd samples/deeplearning/lstmdriver && make -e ${MAKEJ} && (echo; echo "--- TEST LSTM (NC-CK/FWD/BF16)"; CHECK=1 CHECK_SCALE=0.001 ./run_lstmcell.sh nc_ck bf16 1 0 256 32 32 32 ) - make -e ${MAKEJ} && cd samples/deeplearning/lstmdriver && make -e ${MAKEJ} && (echo; echo "--- TEST LSTM (NC-CK/BWD/BF16)"; CHECK=1 CHECK_SCALE=0.001 ./run_lstmcell.sh nc_ck bf16 1 3 256 32 32 32 ) - make -e ${MAKEJ} && cd samples/deeplearning/lstmdriver && make -e ${MAKEJ} && (echo; echo "--- TEST LSTM (NC-KCCK/FWD/BF16)"; CHECK=1 CHECK_SCALE=0.001 ./run_lstmcell.sh nc_kcck bf16 1 0 256 32 32 32 ) - make -e ${MAKEJ} && cd samples/deeplearning/lstmdriver && make -e ${MAKEJ} && (echo; echo "--- TEST LSTM (NC-KCCK/BWD/BF16)"; CHECK=1 CHECK_SCALE=0.001 ./run_lstmcell.sh nc_kcck bf16 1 3 256 32 32 32 ) - true || (make -e ${MAKEJ} && cd samples/deeplearning/grudriver && make -e ${MAKEJ} && (echo; echo "--- TEST GRU"; CHECK=1 ./grudriver.sh)) libxsmm-1.17/.travis.yml000066400000000000000000000223021415223013700152230ustar00rootroot00000000000000os: linux # default language: cpp #dist: xenial branches: except: - develop - results - wip jobs: fast_finish: true include: - os: linux compiler: gcc env: INSTALL=install-artifacts OPT=1 PYTHON=python2 MALLOC=-1 LIBXSMM_MALLOC=1 - os: linux compiler: gcc env: INSTALL=install DBG=1 LIBXSMM_SE=1 LIBXSMM_TARGET=0 ANALYZE=1 - os: linux 
compiler: clang env: INSTALL=install DBG=1 LIBXSMM_TARGET=sse MALLOC=1 LD_LIBRARY_PATH=/usr/local/clang/lib:${LD_LIBRARY_PATH} - os: osx osx_image: xcode12.2 compiler: clang env: SYM=1 OPT=1 SPACES=1 TIMER_DELTA=-1 - os: osx osx_image: xcode7.3 compiler: clang env: DBG=1 INSTALL=install-artifacts LIBXSMM_SE=1 env: global: - PEDANTIC_HIGH="PEDANTIC=2" - PEDANTIC_TEST="PEDANTIC=1" - LIBXSMM_VERBOSE=3 - RUN_LINUX_PERF=0 addons: apt: update: true packages: - gfortran - libblas-dev - liblapack-dev - python3 coverity_scan: project: name: "hfp/libxsmm" description: "Matrix operations and deep learning primitives" build_command_prepend: "source .env/travis.env; make -e clean" # consider ${MAKEJ} to potentially accelerate the build build_command: "make -e" branch_pattern: coverity before_install: - source .env/travis.env - if [ -e /proc/cpuinfo ]; then grep -m1 "flags" /proc/cpuinfo | cut -d":" -f2-; elif [ "" != "$(command -v sysctl)" ]; then sysctl -a machdep.cpu.features machdep.cpu.extfeatures machdep.cpu.leaf7_features | cut -d":" -f2- | tr -s "\n" " " | tr "[:upper:]" "[:lower:]"; fi - if [ "1" = "${COVERITY_SCAN_BRANCH}" ] && [ -e /etc/ssl/certs/ca-certificates.crt ]; then echo -n | openssl s_client -connect scan.coverity.com:443 2>/dev/null | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' 2>/dev/null | sudo tee -a /etc/ssl/certs/ca-certificates.crt 2>/dev/null; fi - if [ "" = "${CODECOV_ENABLED}" ] || [ "0" != "${CODECOV_ENABLED}" ]; then if [ "osx" = "${TRAVIS_OS_NAME}" ] && [ "clang" != "${CC}" ]; then sudo easy_install pip; fi; fi install: - if [ "" != "${UPLOAD_ENABLED}" ] && [ "0" != "${UPLOAD_ENABLED}" ] && [ "1" != "${COVERITY_SCAN_BRANCH}" ]; then ( cd /tmp && curl -O -L https://github.com/openssl/openssl/archive/OpenSSL_1_1_1d.tar.gz && tar xvf OpenSSL_1_1_1d.tar.gz && cd openssl-OpenSSL_1_1_1d && ./config --prefix=${HOME}/openssl && make ${MAKEJ} && make install && cd /tmp && curl -O -L https://www.libssh2.org/download/libssh2-1.9.0.tar.gz && tar xvf 
libssh2-1.9.0.tar.gz && cd libssh2-1.9.0 && ./configure --prefix=${HOME}/libssh2 --with-libssl-prefix=${HOME}/openssl && make ${MAKEJ} install && cd /tmp && curl -O -L https://github.com/curl/curl/archive/curl-7_67_0.tar.gz && tar xvf curl-7_67_0.tar.gz && cd curl-curl-7_67_0 && ./buildconf && ./configure --prefix=${HOME}/curl --with-ssl=${HOME}/openssl --with-libssh2=${HOME}/libssh2 && make ${MAKEJ} install && cd ${TRAVIS_BUILD_DIR} ) || true; fi - if [ "" = "${CODECOV_ENABLED}" ] || [ "0" != "${CODECOV_ENABLED}" ]; then if [ "clang" != "${CC}" ]; then pip install --upgrade pip; pip install --user codecov; fi; fi after_success: - eval ${CODECOV} # upload artifacts after_failure: - eval ${UPLOAD} before_script: - source .env/travis.env # avoid any dashes at the begin of a line within a test section script: - if [ "1" = "${COVERITY_SCAN_BRANCH}" ]; then exit 0; fi; source ${TRAVIS_BUILD_DIR}/.env/codecov.env $(if [ "clang" = "${CC}" ]; then echo "0"; fi) && make -e ${MAKEJ} PREFIX=${UPLOAD_DIR} ${PEDANTIC_HIGH} TRACE=1 STATIC=0 BLAS=1 ABSLIBS=1 MIX=0 EFLAGS="-DITYPE=double -DINCLUDE_LIBXSMM_LAST" ${INSTALL} tests && eval ${RUNXCOV} && if [ "" != "${UPLOAD_ENABLED}" ] && [ "0" != "$((10: enable intercepted malloc MALLOC ?= 0 # Determines the kind of routine called for intercepted GEMMs # >=1 and odd : sequential and non-tiled (small problem sizes only) # >=2 and even: parallelized and tiled (all problem sizes) # >=3 and odd : GEMV is intercepted; small problem sizes # >=4 and even: GEMV is intercepted; all problem sizes # negative: BLAS provides DGEMM_BATCH and SGEMM_BATCH # 0: disabled WRAP ?= 1 # Attempts to pin OpenMP based threads AUTOPIN ?= 0 ifneq (0,$(AUTOPIN)) DFLAGS += -DLIBXSMM_AUTOPIN endif # Profiling JIT code using Linux Perf # PERF=0: disabled (default) # PERF=1: enabled (without JITDUMP) # PERF=2: enabled (with JITDUMP) # # Additional support for jitdump # JITDUMP=0: disabled (default) # JITDUMP=1: enabled # PERF=2: enabled # ifneq (,$(PERF)) ifneq 
(0,$(PERF)) ifneq (1,$(PERF)) JITDUMP ?= 1 endif endif endif JITDUMP ?= 0 ifneq (0,$(JITDUMP)) PERF ?= 1 endif PERF ?= 0 ifneq (0,$(PERF)) SYM ?= 1 endif # OpenMP is disabled by default and LIBXSMM is # always agnostic wrt the threading runtime OMP ?= 0 ifneq (1,$(CACHE)) DFLAGS += -DLIBXSMM_CAPACITY_CACHE=$(CACHE) endif # disable lazy initialization and rely on ctor attribute ifeq (0,$(INIT)) DFLAGS += -DLIBXSMM_CTOR endif # Kind of documentation (internal key) DOCEXT := pdf # Timeout when downloading documentation parts TIMEOUT := 30 # state to be excluded from tracking the (re-)build state EXCLUDE_STATE := \ DESTDIR PREFIX BINDIR CURDIR DOCDIR DOCEXT INCDIR LICFDIR OUTDIR TSTDIR TIMEOUT \ PBINDIR PINCDIR POUTDIR PPKGDIR PMODDIR PSRCDIR PTSTDIR PDOCDIR SCRDIR SPLDIR \ SRCDIR TEST VERSION_STRING DEPSTATIC ALIAS_% BLAS %_TARGET %ROOT MPSS KNC # fixed .state file directory (included by source) DIRSTATE := $(OUTDIR)/.. ifeq (,$(M)$(N)$(K)) ifeq (,$(filter-out 0,$(MNK))) EXCLUDE_STATE += PRECISION MNK M N K endif endif # avoid to link with C++ standard library FORCE_CXX := 0 # include common Makefile artifacts include $(ROOTDIR)/Makefile.inc # TRACE facility INSTRUMENT ?= $(TRACE) # JIT backend is enabled by default ifeq (0,$(call qnum,$(PLATFORM))) # NaN JIT ?= 1 else ifeq (1,$(PLATFORM)) # JIT is disabled if platform is forced # enable with "PLATFORM=1 JIT=1" or "PLATFORM=2" VTUNE := 0 MKL := 0 JIT ?= 0 else # imply JIT=1 if PLATFORM=2 (or higher) VTUNE := 0 MKL := 0 JIT ?= 1 endif # target library for a broad range of systems ifneq (0,$(JIT)) SSE ?= 1 endif ifneq (,$(MKL)) ifneq (0,$(MKL)) BLAS := $(MKL) endif endif ifneq (,$(MAXTARGET)) DFLAGS += -DLIBXSMM_MAXTARGET=$(MAXTARGET) endif # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(ROOTDIR)/$(SRCDIR)) ifeq (,$(PYTHON)) $(info --------------------------------------------------------------------------------) $(error No Python interpreter found) endif # Version numbers 
according to interface (version.txt) VERSION_MAJOR ?= $(shell $(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_utilities.py 1) VERSION_MINOR ?= $(shell $(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_utilities.py 2) VERSION_UPDATE ?= $(shell $(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_utilities.py 3) VERSION_STRING ?= $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_UPDATE) VERSION_API ?= $(shell $(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_utilities.py 0 $(VERSION_STRING)) VERSION_ALL ?= $(shell $(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_utilities.py) VERSION_RELEASED ?= $(shell $(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_utilities.py -1 $(VERSION_ALL)) VERSION_RELEASE ?= HEAD VERSION_PACKAGE ?= 1 # explicitly target all objects ifneq (,$(strip $(SSE)$(AVX)$(MIC))) TGT ?= 1 endif TGT ?= 0 ifeq (0,$(BLAS)) ifeq (0,$(STATIC)) ifneq (0,$(LNKSOFT)) ifeq (Darwin,$(UNAME)) LDFLAGS += $(call linkopt,-U,_dgemm_) LDFLAGS += $(call linkopt,-U,_sgemm_) LDFLAGS += $(call linkopt,-U,_dgemv_) LDFLAGS += $(call linkopt,-U,_sgemv_) endif endif endif endif # target library for a broad range of systems ifneq (0,$(JIT)) ifeq (file,$(origin AVX)) AVX_STATIC := 0 endif endif AVX_STATIC ?= $(AVX) ifeq (1,$(AVX_STATIC)) GENTARGET := snb else ifeq (2,$(AVX_STATIC)) GENTARGET := hsw else ifeq (3,$(AVX_STATIC)) ifneq (0,$(MIC)) ifeq (2,$(MIC)) GENTARGET := knm else GENTARGET := knl endif else GENTARGET := skx endif else ifneq (0,$(SSE)) GENTARGET := wsm else GENTARGET := noarch endif ifneq (Darwin,$(UNAME)) GENGEMM := @$(ENVBIN) \ LD_LIBRARY_PATH="$(OUTDIR):$${LD_LIBRARY_PATH}" \ PATH="$(OUTDIR):$${PATH}" \ $(BINDIR)/libxsmm_gemm_generator else # osx GENGEMM := @$(ENVBIN) \ DYLD_LIBRARY_PATH="$(OUTDIR):$${DYLD_LIBRARY_PATH}" \ PATH="$(OUTDIR):$${PATH}" \ $(BINDIR)/libxsmm_gemm_generator endif INDICES ?= $(shell $(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_utilities.py -1 $(THRESHOLD) $(words $(MNK)) $(MNK) $(words $(M)) $(words $(N)) $(M) $(N) $(K)) NINDICES := $(words $(INDICES)) SRCFILES_KERNELS := $(patsubst 
%,$(BLDDIR)/mm_%.c,$(INDICES)) KRNOBJS_HST := $(patsubst %,$(BLDDIR)/intel64/mm_%.o,$(INDICES)) KRNOBJS_MIC := $(patsubst %,$(BLDDIR)/mic/mm_%.o,$(INDICES)) HEADERS := $(wildcard $(ROOTDIR)/$(SRCDIR)/template/*.c) $(wildcard $(ROOTDIR)/$(SRCDIR)/*.h) \ $(ROOTDIR)/$(SRCDIR)/libxsmm_hash.c \ $(ROOTDIR)/include/libxsmm_blocked_gemm.h \ $(ROOTDIR)/include/libxsmm_cpuid.h \ $(ROOTDIR)/include/libxsmm_dnn.h \ $(ROOTDIR)/include/libxsmm_dnn_tensor.h \ $(ROOTDIR)/include/libxsmm_dnn_convolution.h \ $(ROOTDIR)/include/libxsmm_dnn_fusedbatchnorm.h \ $(ROOTDIR)/include/libxsmm_dnn_fusedgroupnorm.h \ $(ROOTDIR)/include/libxsmm_dnn_pooling.h \ $(ROOTDIR)/include/libxsmm_dnn_fullyconnected.h \ $(ROOTDIR)/include/libxsmm_dnn_rnncell.h \ $(ROOTDIR)/include/libxsmm_dnn_softmaxloss.h \ $(ROOTDIR)/include/libxsmm_dnn_optimizer.h \ $(ROOTDIR)/include/libxsmm_rng.h \ $(ROOTDIR)/include/libxsmm_frontend.h \ $(ROOTDIR)/include/libxsmm_fsspmdm.h \ $(ROOTDIR)/include/libxsmm_generator.h \ $(ROOTDIR)/include/libxsmm_intrinsics_x86.h \ $(ROOTDIR)/include/libxsmm_macros.h \ $(ROOTDIR)/include/libxsmm_malloc.h \ $(ROOTDIR)/include/libxsmm_math.h \ $(ROOTDIR)/include/libxsmm_memory.h \ $(ROOTDIR)/include/libxsmm_mhd.h \ $(ROOTDIR)/include/libxsmm_spmdm.h \ $(ROOTDIR)/include/libxsmm_sync.h \ $(ROOTDIR)/include/libxsmm_timer.h \ $(ROOTDIR)/include/libxsmm_typedefs.h SRCFILES_LIB := $(patsubst %,$(ROOTDIR)/$(SRCDIR)/%, \ libxsmm_main.c libxsmm_memory.c libxsmm_malloc.c libxsmm_hash.c libxsmm_math.c \ libxsmm_sync.c libxsmm_python.c libxsmm_mhd.c libxsmm_timer.c libxsmm_perf.c \ libxsmm_gemm.c libxsmm_xcopy.c libxsmm_blocked_gemm.c libxsmm_spmdm.c libxsmm_fsspmdm.c libxsmm_rng.c\ libxsmm_dnn.c libxsmm_dnn_tensor.c libxsmm_dnn_convolution.c libxsmm_dnn_elementwise.c \ libxsmm_dnn_rnncell.c libxsmm_dnn_rnncell_forward.c libxsmm_dnn_rnncell_backward_weight_update.c \ libxsmm_dnn_fusedbatchnorm.c libxsmm_dnn_fusedbatchnorm_forward.c libxsmm_dnn_fusedbatchnorm_backward.c \ libxsmm_dnn_fusedgroupnorm.c 
libxsmm_dnn_fusedgroupnorm_forward.c libxsmm_dnn_fusedgroupnorm_backward.c \ libxsmm_dnn_pooling.c libxsmm_dnn_pooling_forward.c libxsmm_dnn_pooling_backward.c libxsmm_dnn_convolution_forward.c \ libxsmm_dnn_fullyconnected.c libxsmm_dnn_fullyconnected_forward.c libxsmm_dnn_fullyconnected_backward_weight_update.c \ libxsmm_dnn_convolution_backward.c libxsmm_dnn_convolution_weight_update.c libxsmm_dnn_softmaxloss.c \ libxsmm_dnn_softmaxloss_forward.c libxsmm_dnn_softmaxloss_backward.c libxsmm_dnn_optimizer.c libxsmm_dnn_optimizer_sgd.c ) SRCFILES_GEN_LIB := $(patsubst %,$(ROOTDIR)/$(SRCDIR)/%,$(notdir $(wildcard $(ROOTDIR)/$(SRCDIR)/generator_*.c)) \ libxsmm_cpuid_x86.c libxsmm_generator.c libxsmm_trace.c) SRCFILES_GEN_GEMM_BIN := $(patsubst %,$(ROOTDIR)/$(SRCDIR)/%,libxsmm_generator_gemm_driver.c) OBJFILES_GEN_GEMM_BIN := $(patsubst %,$(BLDDIR)/intel64/%.o,$(basename $(notdir $(SRCFILES_GEN_GEMM_BIN)))) OBJFILES_GEN_LIB := $(patsubst %,$(BLDDIR)/intel64/%.o,$(basename $(notdir $(SRCFILES_GEN_LIB)))) OBJFILES_HST := $(patsubst %,$(BLDDIR)/intel64/%.o,$(basename $(notdir $(SRCFILES_LIB)))) OBJFILES_MIC := $(patsubst %,$(BLDDIR)/mic/%.o,$(basename $(notdir $(SRCFILES_LIB)))) $(BLDDIR)/mic/generator_common.o EXTOBJS_HST := $(BLDDIR)/intel64/libxsmm_ext.o \ $(BLDDIR)/intel64/libxsmm_ext_xcopy.o \ $(BLDDIR)/intel64/libxsmm_ext_blocked_gemm.o \ $(BLDDIR)/intel64/libxsmm_ext_gemm.o EXTOBJS_MIC := $(BLDDIR)/mic/libxsmm_ext.o \ $(BLDDIR)/mic/libxsmm_ext_xcopy.o \ $(BLDDIR)/mic/libxsmm_ext_blocked_gemm.o \ $(BLDDIR)/mic/libxsmm_ext_gemm.o NOBLAS_HST := $(BLDDIR)/intel64/libxsmm_noblas.o NOBLAS_MIC := $(BLDDIR)/mic/libxsmm_noblas.o # list of object might be "incomplete" if not all code gen. FLAGS are supplied with clean target! 
OBJECTS := $(OBJFILES_GEN_LIB) $(OBJFILES_GEN_GEMM_BIN) $(OBJFILES_HST) $(OBJFILES_MIC) \ $(KRNOBJS_HST) $(KRNOBJS_MIC) $(EXTOBJS_HST) $(EXTOBJS_MIC) $(NOBLAS_HST) $(NOBLAS_MIC) ifneq (,$(strip $(FC))) FTNOBJS := $(BLDDIR)/intel64/libxsmm-mod.o $(BLDDIR)/mic/libxsmm-mod.o endif MSGJITPROFILING := 0 ifneq (0,$(JIT)) ifneq (0,$(VTUNE)) ifeq (,$(filter Darwin,$(UNAME))) ifneq (0,$(PERF)) DFLAGS += -DLIBXSMM_PERF ifneq (0,$(JITDUMP)) DFLAGS += -DLIBXSMM_PERF_JITDUMP endif endif VTUNEROOT := $(shell env | grep VTUNE_PROFILER | grep -m1 _DIR | cut -d= -f2-) ifeq (,$(VTUNEROOT)) VTUNEROOT := $(shell env | grep VTUNE_AMPLIFIER | grep -m1 _DIR | cut -d= -f2-) endif ifeq (,$(VTUNEROOT)) VTUNEROOT := $(EBROOTVTUNE)/vtune_amplifier endif ifneq (,$(wildcard $(VTUNEROOT)/lib64/libjitprofiling.$(SLIBEXT))) ifneq (0,$(SYM)) LIBJITPROFILING := $(BLDDIR)/jitprofiling/libjitprofiling.$(SLIBEXT) OBJJITPROFILING := $(BLDDIR)/jitprofiling/*.o DFLAGS += -DLIBXSMM_VTUNE IFLAGS += -I$(call quote,$(VTUNEROOT)/include) WERROR := 0 ifneq (0,$(INTEL)) CXXFLAGS += -diag-disable 271 CFLAGS += -diag-disable 271 endif endif MSGJITPROFILING := 1 endif endif endif endif # no warning conversion for released versions ifneq (0,$(VERSION_RELEASED)) WERROR := 0 endif # no warning conversion for non-x86 ifneq (x86_64,$(MNAME)) WERROR := 0 endif # no warning conversion ifneq (,$(filter-out 0 1,$(INTEL))) WERROR := 0 endif information = \ $(info ================================================================================) \ $(info LIBXSMM $(VERSION_ALL) ($(UNAME)$(if $(filter-out 0,$(LIBXSMM_TARGET_HIDDEN)),$(NULL),$(if $(HOSTNAME),@$(HOSTNAME))))) \ $(info --------------------------------------------------------------------------------) \ $(info $(GINFO)) \ $(info $(CINFO)) \ $(if $(strip $(FC)),$(info $(FINFO))) \ $(if $(strip $(FC)),$(NULL), \ $(if $(strip $(FC_VERSION)), \ $(info Fortran Compiler $(FC_VERSION) is outdated!), \ $(info Fortran Compiler is disabled or missing: no Fortran interface is 
built!))) \ $(info --------------------------------------------------------------------------------) \ $(if $(ENVSTATE),$(info Environment: $(ENVSTATE)) \ $(info --------------------------------------------------------------------------------)) ifneq (,$(strip $(TEST))) .PHONY: run-tests run-tests: tests endif .PHONY: libxsmm ifeq (0,$(COMPATIBLE)) ifeq (0,$(SHARED)) libxsmm: lib generator else libxsmm: libs generator endif else ifeq (0,$(SHARED)) libxsmm: lib else libxsmm: libs endif endif $(information) ifneq (,$(filter _0_,_$(LNKSOFT)_)) ifeq (0,$(DEPSTATIC)) $(info Building a shared library requires to link against BLAS) $(info since a deferred choice is not implemented for this OS.) $(info --------------------------------------------------------------------------------) endif endif ifneq (,$(filter _0_,_$(BLAS)_)) ifeq (,$(filter _0_,_$(NOBLAS)_)) $(info BLAS dependency and fallback is removed!) $(info --------------------------------------------------------------------------------) endif else ifeq (, $(filter _0_,_$(LNKSOFT)_)) $(info LIBXSMM is link-time agnostic with respect to a BLAS library!) $(info Forcing a specific library can take away a user's choice.) $(info If this was to solve linker errors (dgemm_, sgemm_, etc.),) $(info the BLAS library should go after LIBXSMM (link-line).) $(info --------------------------------------------------------------------------------) endif ifneq (,$(filter 0 1,$(INTRINSICS))) ifeq (0,$(COMPATIBLE)) ifeq (0,$(INTEL)) $(info If adjusting INTRINSICS was necessary, consider updated GNU Binutils.) else # Intel Compiler $(info Intel Compiler does not usually require adjusting INTRINSICS.) endif $(info --------------------------------------------------------------------------------) endif # COMPATIBLE endif # INTRINSICS ifneq (0,$(MSGJITPROFILING)) ifneq (,$(strip $(LIBJITPROFILING))) $(info Intel VTune Amplifier support has been incorporated.) else $(info Intel VTune Amplifier support has been detected (enable with SYM=1).) 
endif $(info --------------------------------------------------------------------------------) endif .PHONY: lib lib: headers drytest lib_hst lib_mic .PHONY: libs libs: lib ifneq (0,$(STATIC)) @$(MAKE) --no-print-directory lib STATIC=0 else @$(MAKE) --no-print-directory lib STATIC=1 endif .PHONY: all all: libxsmm .PHONY: realall realall: all samples .PHONY: headers headers: cheader cheader_only fheader .PHONY: header-only header-only: cheader_only .PHONY: header_only header_only: header-only .PHONY: interface interface: headers module .PHONY: winterface winterface: headers sources .PHONY: lib_mic lib_mic: clib_mic flib_mic ext_mic noblas_mic .PHONY: lib_hst lib_hst: clib_hst flib_hst ext_hst noblas_hst PREFETCH_UID := 0 PREFETCH_TYPE := 0 PREFETCH_SCHEME := nopf ifneq (Windows_NT,$(UNAME)) # TODO: full support for Windows calling convention ifneq (0,$(shell echo "$$((0 <= $(PREFETCH) && $(PREFETCH) <= 6))")) PREFETCH_UID := $(PREFETCH) else ifneq (0,$(shell echo "$$((0 > $(PREFETCH)))")) # auto PREFETCH_UID := 1 else ifeq (pfsigonly,$(PREFETCH)) PREFETCH_UID := 2 else ifeq (BL2viaC,$(PREFETCH)) PREFETCH_UID := 3 else ifeq (curAL2,$(PREFETCH)) PREFETCH_UID := 4 else ifeq (curAL2_BL2viaC,$(PREFETCH)) PREFETCH_UID := 5 else ifeq (AL2,$(PREFETCH)) PREFETCH_UID := 6 else ifeq (AL2_BL2viaC,$(PREFETCH)) PREFETCH_UID := 7 endif # Mapping build options to libxsmm_gemm_prefetch_type (see include/libxsmm_typedefs.h) ifeq (1,$(PREFETCH_UID)) # Prefetch "auto" is a pseudo-strategy introduced by the frontend; # select "nopf" for statically generated code. 
PREFETCH_SCHEME := nopf PREFETCH_TYPE := -1 else ifeq (2,$(PREFETCH_UID)) PREFETCH_SCHEME := pfsigonly PREFETCH_TYPE := 1 else ifeq (3,$(PREFETCH_UID)) PREFETCH_SCHEME := BL2viaC PREFETCH_TYPE := 4 else ifeq (4,$(PREFETCH_UID)) PREFETCH_SCHEME := curAL2 PREFETCH_TYPE := 8 else ifeq (5,$(PREFETCH_UID)) PREFETCH_SCHEME := curAL2_BL2viaC PREFETCH_TYPE := $(shell echo "$$((4 | 8))") else ifeq (6,$(PREFETCH_UID)) PREFETCH_SCHEME := AL2 PREFETCH_TYPE := 2 else ifeq (7,$(PREFETCH_UID)) PREFETCH_SCHEME := AL2_BL2viaC PREFETCH_TYPE := $(shell echo "$$((4 | 2))") endif endif ifeq (,$(PREFETCH_SCHEME_MIC)) # adopt host scheme PREFETCH_SCHEME_MIC := $(PREFETCH_SCHEME) endif # Mapping build options to libxsmm_gemm_flags (see include/libxsmm_typedefs.h) #FLAGS := $(shell echo "$$((((0==$(ALPHA))*4) | ((0>$(ALPHA))*8) | ((0==$(BETA))*16) | ((0>$(BETA))*32)))") FLAGS := 0 SUPPRESS_UNUSED_VARIABLE_WARNINGS := LIBXSMM_UNUSED(A); LIBXSMM_UNUSED(B); LIBXSMM_UNUSED(C); ifneq (nopf,$(PREFETCH_SCHEME)) #SUPPRESS_UNUSED_VARIABLE_WARNINGS += LIBXSMM_UNUSED(A_prefetch); LIBXSMM_UNUSED(B_prefetch); #SUPPRESS_UNUSED_PREFETCH_WARNINGS := $(NULL) LIBXSMM_UNUSED(C_prefetch);~ SUPPRESS_UNUSED_PREFETCH_WARNINGS := $(NULL) LIBXSMM_UNUSED(A_prefetch); LIBXSMM_UNUSED(B_prefetch); LIBXSMM_UNUSED(C_prefetch);~ endif EXTCFLAGS := -DLIBXSMM_BUILD_EXT ifneq (0,$(call qnum,$(OMP))) # NaN DFLAGS += -DLIBXSMM_SYNC_OMP else # default (no OpenMP based synchronization) ifeq (,$(filter environment% override command%,$(origin OMP))) EXTCFLAGS += $(OMPFLAG) EXTLDFLAGS += $(OMPLIB) endif endif # auto-clean the co-build $(ROOTDIR)/$(SRCDIR)/template/libxsmm_config.h: $(ROOTDIR)/$(SCRDIR)/libxsmm_config.py $(ROOTDIR)/$(SCRDIR)/libxsmm_utilities.py \ $(ROOTDIR)/Makefile $(ROOTDIR)/Makefile.inc $(wildcard $(ROOTDIR)/.github/*) \ $(ROOTDIR)/version.txt #ifneq (,$(filter-out 0 1 2 STATIC,$(words $(PRESTATE)) $(word 2,$(PRESTATE)))) ifneq (0,$(STATIC)) # static @rm -f $(OUTDIR)/libxsmm*.$(DLIBEXT) 
$(OUTDIR)/libxsmm*.$(DLIBEXT).* else # shared/dynamic @rm -f $(OUTDIR)/libxsmm*.$(SLIBEXT) $(OUTDIR)/libxsmm*.$(SLIBEXT).* endif @touch $@ #endif .PHONY: config config: $(INCDIR)/libxsmm_config.h $(INCDIR)/libxsmm_version.h $(INCDIR)/libxsmm_config.h: $(INCDIR)/.make $(ROOTDIR)/$(SRCDIR)/template/libxsmm_config.h $(DIRSTATE)/.state $(information) $(info --- LIBXSMM build log) @if [ -e $(ROOTDIR)/.github/install.sh ]; then \ $(ROOTDIR)/.github/install.sh 2>/dev/null; \ fi @$(CP) $(filter $(ROOTDIR)/include/%.h,$(HEADERS)) $(INCDIR) 2>/dev/null || true @$(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_config.py $(ROOTDIR)/$(SRCDIR)/template/libxsmm_config.h \ $(MAKE_ILP64) $(OFFLOAD) $(CACHELINE) $(PRECISION) $(PREFETCH_TYPE) \ $(shell echo "$$((0<$(THRESHOLD)?$(THRESHOLD):0))") $(shell echo "$$(($(THREADS)+$(OMP)))") \ $(JIT) $(FLAGS) $(ALPHA) $(BETA) $(WRAP) $(MALLOC) $(INDICES) > $@ $(INCDIR)/libxsmm_version.h: $(ROOTDIR)/$(SRCDIR)/template/libxsmm_config.h $(INCDIR)/.make \ $(ROOTDIR)/$(SRCDIR)/template/libxsmm_version.h @$(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_config.py $(ROOTDIR)/$(SRCDIR)/template/libxsmm_version.h > $@ .PHONY: cheader cheader: $(INCDIR)/libxsmm.h $(INCDIR)/libxsmm.h: $(ROOTDIR)/$(SCRDIR)/libxsmm_interface.py \ $(ROOTDIR)/$(SRCDIR)/template/libxsmm.h \ $(INCDIR)/libxsmm_version.h \ $(INCDIR)/libxsmm_config.h \ $(HEADERS) @$(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_interface.py $(ROOTDIR)/$(SRCDIR)/template/libxsmm.h \ $(shell echo "$$(($(PRECISION)+($(FORTRAN)<<2)))") $(PREFETCH_TYPE) $(INDICES) > $@ .PHONY: cheader_only cheader_only: $(INCDIR)/libxsmm_source.h $(INCDIR)/libxsmm_source.h: $(INCDIR)/.make $(ROOTDIR)/$(SCRDIR)/libxsmm_source.sh $(INCDIR)/libxsmm.h @$(ROOTDIR)/$(SCRDIR)/libxsmm_source.sh > $@ .PHONY: fheader fheader: $(INCDIR)/libxsmm.f $(INCDIR)/libxsmm.f: $(ROOTDIR)/$(SCRDIR)/libxsmm_interface.py \ $(ROOTDIR)/$(SCRDIR)/libxsmm_config.py \ $(ROOTDIR)/$(SRCDIR)/template/libxsmm.f \ $(INCDIR)/libxsmm_version.h \ $(INCDIR)/libxsmm_config.h 
@$(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_interface.py $(ROOTDIR)/$(SRCDIR)/template/libxsmm.f \ $(shell echo "$$(($(PRECISION)+($(FORTRAN)<<2)))") $(PREFETCH_TYPE) $(INDICES) | \ $(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_config.py /dev/stdin \ $(MAKE_ILP64) $(OFFLOAD) $(CACHELINE) $(PRECISION) $(PREFETCH_TYPE) \ $(shell echo "$$((0<$(THRESHOLD)?$(THRESHOLD):0))") $(shell echo "$$(($(THREADS)+$(OMP)))") \ $(JIT) $(FLAGS) $(ALPHA) $(BETA) $(WRAP) $(MALLOC) $(INDICES) | \ sed "/ATTRIBUTES OFFLOAD:MIC/d" > $@ .PHONY: sources sources: $(SRCFILES_KERNELS) $(BLDDIR)/libxsmm_dispatch.h $(BLDDIR)/libxsmm_dispatch.h: $(BLDDIR)/.make $(SRCFILES_KERNELS) $(ROOTDIR)/$(SCRDIR)/libxsmm_dispatch.py $(DIRSTATE)/.state @$(PYTHON) $(call quote,$(ROOTDIR)/$(SCRDIR)/libxsmm_dispatch.py) $(call qapath,$(DIRSTATE)/.state) $(PRECISION) $(THRESHOLD) $(INDICES) > $@ $(BLDDIR)/%.c: $(BLDDIR)/.make $(INCDIR)/libxsmm.h $(BINDIR)/libxsmm_gemm_generator $(ROOTDIR)/$(SCRDIR)/libxsmm_utilities.py $(ROOTDIR)/$(SCRDIR)/libxsmm_specialized.py ifneq (,$(strip $(SRCFILES_KERNELS))) $(eval MVALUE := $(shell echo $(basename $(notdir $@)) | cut -d_ -f2)) $(eval NVALUE := $(shell echo $(basename $(notdir $@)) | cut -d_ -f3)) $(eval KVALUE := $(shell echo $(basename $(notdir $@)) | cut -d_ -f4)) $(eval MNVALUE := $(MVALUE)) $(eval NMVALUE := $(NVALUE)) @echo "#include " > $@ @echo >> $@ ifeq (noarch,$(GENTARGET)) ifneq (,$(CTARGET)) ifneq (2,$(PRECISION)) @echo "#define LIBXSMM_GENTARGET_knl_sp" >> $@ @echo "#define LIBXSMM_GENTARGET_hsw_sp" >> $@ @echo "#define LIBXSMM_GENTARGET_snb_sp" >> $@ @echo "#define LIBXSMM_GENTARGET_wsm_sp" >> $@ endif ifneq (1,$(PRECISION)) @echo "#define LIBXSMM_GENTARGET_knl_dp" >> $@ @echo "#define LIBXSMM_GENTARGET_hsw_dp" >> $@ @echo "#define LIBXSMM_GENTARGET_snb_dp" >> $@ @echo "#define LIBXSMM_GENTARGET_wsm_dp" >> $@ endif @echo >> $@ @echo >> $@ ifneq (2,$(PRECISION)) $(GENGEMM) dense $@ libxsmm_s$(basename $(notdir $@))_knl $(MNVALUE) $(NMVALUE) $(KVALUE) $(MNVALUE) 
$(KVALUE) $(MNVALUE) $(ALPHA) $(BETA) 0 0 knl $(PREFETCH_SCHEME) SP $(GENGEMM) dense $@ libxsmm_s$(basename $(notdir $@))_hsw $(MNVALUE) $(NMVALUE) $(KVALUE) $(MNVALUE) $(KVALUE) $(MNVALUE) $(ALPHA) $(BETA) 0 0 hsw $(PREFETCH_SCHEME) SP $(GENGEMM) dense $@ libxsmm_s$(basename $(notdir $@))_snb $(MNVALUE) $(NMVALUE) $(KVALUE) $(MNVALUE) $(KVALUE) $(MNVALUE) $(ALPHA) $(BETA) 0 0 snb $(PREFETCH_SCHEME) SP $(GENGEMM) dense $@ libxsmm_s$(basename $(notdir $@))_wsm $(MNVALUE) $(NMVALUE) $(KVALUE) $(MNVALUE) $(KVALUE) $(MNVALUE) $(ALPHA) $(BETA) 0 0 wsm $(PREFETCH_SCHEME) SP endif ifneq (1,$(PRECISION)) $(GENGEMM) dense $@ libxsmm_d$(basename $(notdir $@))_knl $(MNVALUE) $(NMVALUE) $(KVALUE) $(MNVALUE) $(KVALUE) $(MNVALUE) $(ALPHA) $(BETA) 0 0 knl $(PREFETCH_SCHEME) DP $(GENGEMM) dense $@ libxsmm_d$(basename $(notdir $@))_hsw $(MNVALUE) $(NMVALUE) $(KVALUE) $(MNVALUE) $(KVALUE) $(MNVALUE) $(ALPHA) $(BETA) 0 0 hsw $(PREFETCH_SCHEME) DP $(GENGEMM) dense $@ libxsmm_d$(basename $(notdir $@))_snb $(MNVALUE) $(NMVALUE) $(KVALUE) $(MNVALUE) $(KVALUE) $(MNVALUE) $(ALPHA) $(BETA) 0 0 snb $(PREFETCH_SCHEME) DP $(GENGEMM) dense $@ libxsmm_d$(basename $(notdir $@))_wsm $(MNVALUE) $(NMVALUE) $(KVALUE) $(MNVALUE) $(KVALUE) $(MNVALUE) $(ALPHA) $(BETA) 0 0 wsm $(PREFETCH_SCHEME) DP endif endif # target else # noarch ifneq (2,$(PRECISION)) @echo "#define LIBXSMM_GENTARGET_$(GENTARGET)_sp" >> $@ endif ifneq (1,$(PRECISION)) @echo "#define LIBXSMM_GENTARGET_$(GENTARGET)_dp" >> $@ endif @echo >> $@ @echo >> $@ ifneq (2,$(PRECISION)) $(GENGEMM) dense $@ libxsmm_s$(basename $(notdir $@))_$(GENTARGET) $(MNVALUE) $(NMVALUE) $(KVALUE) $(MNVALUE) $(KVALUE) $(MNVALUE) $(ALPHA) $(BETA) 0 0 $(GENTARGET) $(PREFETCH_SCHEME) SP endif ifneq (1,$(PRECISION)) $(GENGEMM) dense $@ libxsmm_d$(basename $(notdir $@))_$(GENTARGET) $(MNVALUE) $(NMVALUE) $(KVALUE) $(MNVALUE) $(KVALUE) $(MNVALUE) $(ALPHA) $(BETA) 0 0 $(GENTARGET) $(PREFETCH_SCHEME) DP endif endif # noarch $(eval TMPFILE = $(shell $(MKTEMP) 
/tmp/.libxsmm_XXXXXX.mak)) @cat $@ | sed \ -e "s/void libxsmm_/LIBXSMM_INLINE LIBXSMM_RETARGETABLE void libxsmm_/" \ -e "s/#ifndef NDEBUG/$(SUPPRESS_UNUSED_PREFETCH_WARNINGS)#ifdef LIBXSMM_NEVER_DEFINED/" \ -e "s/#pragma message (\".*KERNEL COMPILATION ERROR in: \" __FILE__)/ $(SUPPRESS_UNUSED_VARIABLE_WARNINGS)/" \ -e "/#error No kernel was compiled, lacking support for current architecture?/d" \ -e "/#pragma message (\".*KERNEL COMPILATION WARNING: compiling ..* code on ..* or newer architecture: \" __FILE__)/d" \ | tr "~" "\n" > $(TMPFILE) @$(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_specialized.py $(PRECISION) $(MVALUE) $(NVALUE) $(KVALUE) $(PREFETCH_TYPE) >> $(TMPFILE) @$(MV) $(TMPFILE) $@ endif define DEFINE_COMPILE_RULE $(1): $(2) $(3) $(dir $(1))/.make @rm -f $(1) -$(CC) $(4) $(if $(filter 0,$(WERROR)),$(NULL),$(WERROR_CFLAG)) -c $(2) -o $(1) @if ! [ -e $(1) ]; then \ if [ "2" = "$(INTRINSICS)" ]; then \ echo "--------------------------------------------------------------"; \ echo "In case of assembler error, perhaps GNU Binutils are outdated."; \ echo "See https://github.com/hfp/libxsmm#outdated-binutils"; \ echo "--------------------------------------------------------------"; \ fi; \ false; \ fi endef ifneq (0,$(GLIBC)) DFLAGS += -DLIBXSMM_BUILD=2 else DFLAGS += -DLIBXSMM_BUILD=1 endif ifneq (0,$(MIC)) ifneq (0,$(MPSS)) $(foreach OBJ,$(OBJFILES_MIC),$(eval $(call DEFINE_COMPILE_RULE, \ $(OBJ), $(patsubst %.o,$(ROOTDIR)/$(SRCDIR)/%.c,$(notdir $(OBJ))), \ $(INCDIR)/libxsmm.h $(INCDIR)/libxsmm_source.h $(BLDDIR)/libxsmm_dispatch.h,-mmic \ $(DFLAGS) $(IFLAGS) $(call applyif,1,libxsmm_main,$(OBJ),-I$(BLDDIR)) $(CFLAGS)))) $(foreach OBJ,$(KRNOBJS_MIC),$(eval $(call DEFINE_COMPILE_RULE, \ $(OBJ), $(patsubst %.o,$(BLDDIR)/%.c,$(notdir $(OBJ))), \ $(INCDIR)/libxsmm.h $(INCDIR)/libxsmm_source.h,-mmic \ $(DFLAGS) $(IFLAGS) $(CFLAGS)))) $(foreach OBJ,$(EXTOBJS_MIC),$(eval $(call DEFINE_COMPILE_RULE, \ $(OBJ), $(patsubst %.o,$(ROOTDIR)/$(SRCDIR)/%.c,$(notdir $(OBJ))), \ 
$(INCDIR)/libxsmm.h $(INCDIR)/libxsmm_source.h,-mmic \ $(DFLAGS) $(IFLAGS) $(EXTCFLAGS) $(CFLAGS)))) $(eval $(call DEFINE_COMPILE_RULE,$(NOBLAS_MIC),$(ROOTDIR)/$(SRCDIR)/libxsmm_ext.c,$(INCDIR)/libxsmm.h,-mmic \ $(NOBLAS_CFLAGS) $(NOBLAS_FLAGS) $(NOBLAS_IFLAGS) $(DNOBLAS))) endif endif # build rules that include target flags $(eval $(call DEFINE_COMPILE_RULE,$(NOBLAS_HST),$(ROOTDIR)/$(SRCDIR)/libxsmm_ext.c,$(INCDIR)/libxsmm.h, \ $(CTARGET) $(NOBLAS_CFLAGS) $(NOBLAS_FLAGS) $(NOBLAS_IFLAGS) $(DNOBLAS))) $(foreach OBJ,$(OBJFILES_HST),$(eval $(call DEFINE_COMPILE_RULE, \ $(OBJ),$(patsubst %.o,$(ROOTDIR)/$(SRCDIR)/%.c,$(notdir $(OBJ))), \ $(INCDIR)/libxsmm.h $(INCDIR)/libxsmm_source.h $(BLDDIR)/libxsmm_dispatch.h, \ $(DFLAGS) $(IFLAGS) $(call applyif,1,libxsmm_main,$(OBJ),-I$(BLDDIR)) $(CTARGET) $(CFLAGS)))) $(foreach OBJ,$(KRNOBJS_HST),$(eval $(call DEFINE_COMPILE_RULE, \ $(OBJ),$(patsubst %.o,$(BLDDIR)/%.c,$(notdir $(OBJ))), \ $(INCDIR)/libxsmm.h $(INCDIR)/libxsmm_source.h, \ $(DFLAGS) $(IFLAGS) $(CTARGET) $(CFLAGS)))) $(foreach OBJ,$(EXTOBJS_HST),$(eval $(call DEFINE_COMPILE_RULE, \ $(OBJ),$(patsubst %.o,$(ROOTDIR)/$(SRCDIR)/%.c,$(notdir $(OBJ))), \ $(INCDIR)/libxsmm.h $(INCDIR)/libxsmm_source.h, \ $(DFLAGS) $(IFLAGS) $(CTARGET) $(EXTCFLAGS) $(CFLAGS)))) # build rules that by default include no target flags ifneq (0,$(TGT)) TGT_FLAGS ?= $(CTARGET) endif $(foreach OBJ,$(OBJFILES_GEN_LIB),$(eval $(call DEFINE_COMPILE_RULE, \ $(OBJ),$(patsubst %.o,$(ROOTDIR)/$(SRCDIR)/%.c,$(notdir $(OBJ))), \ $(INCDIR)/libxsmm.h $(INCDIR)/libxsmm_source.h, \ $(DFLAGS) $(IFLAGS) $(TGT_FLAGS) $(CFLAGS)))) $(foreach OBJ,$(OBJFILES_GEN_GEMM_BIN),$(eval $(call DEFINE_COMPILE_RULE, \ $(OBJ),$(patsubst %.o,$(ROOTDIR)/$(SRCDIR)/%.c,$(notdir $(OBJ))), \ $(INCDIR)/libxsmm.h $(INCDIR)/libxsmm_source.h, \ $(DFLAGS) $(IFLAGS) $(TGT_FLAGS) $(CFLAGS)))) .PHONY: module_mic ifneq (0,$(MIC)) ifneq (0,$(MPSS)) ifneq (,$(strip $(FC))) module_mic: $(INCDIR)/mic/libxsmm.mod $(BLDDIR)/mic/libxsmm-mod.o: 
$(BLDDIR)/mic/.make $(INCDIR)/mic/.make $(INCDIR)/libxsmm.f $(FC) $(DFLAGS) $(IFLAGS) $(FCMTFLAGS) $(FCFLAGS) -mmic -c $(INCDIR)/libxsmm.f -o $@ $(FMFLAGS) $(INCDIR)/mic $(INCDIR)/mic/libxsmm.mod: $(BLDDIR)/mic/libxsmm-mod.o @if [ -e $(BLDDIR)/mic/LIBXSMM.mod ]; then $(CP) $(BLDDIR)/mic/LIBXSMM.mod $(INCDIR); fi @if [ -e $(BLDDIR)/mic/libxsmm.mod ]; then $(CP) $(BLDDIR)/mic/libxsmm.mod $(INCDIR); fi @if [ -e LIBXSMM.mod ]; then $(MV) LIBXSMM.mod $(INCDIR); fi @if [ -e libxsmm.mod ]; then $(MV) libxsmm.mod $(INCDIR); fi @touch $@ else .PHONY: $(BLDDIR)/mic/libxsmm-mod.o .PHONY: $(INCDIR)/mic/libxsmm.mod endif else .PHONY: $(BLDDIR)/mic/libxsmm-mod.o .PHONY: $(INCDIR)/mic/libxsmm.mod endif else .PHONY: $(BLDDIR)/mic/libxsmm-mod.o .PHONY: $(INCDIR)/mic/libxsmm.mod endif .PHONY: module_hst ifneq (,$(strip $(FC))) module_hst: $(INCDIR)/libxsmm.mod $(BLDDIR)/intel64/libxsmm-mod.o: $(BLDDIR)/intel64/.make $(INCDIR)/libxsmm.f $(FC) $(DFLAGS) $(IFLAGS) $(FCMTFLAGS) $(FCFLAGS) $(FTARGET) -c $(INCDIR)/libxsmm.f -o $@ $(FMFLAGS) $(INCDIR) $(INCDIR)/libxsmm.mod: $(BLDDIR)/intel64/libxsmm-mod.o @if [ -e $(BLDDIR)/intel64/LIBXSMM.mod ]; then $(CP) $(BLDDIR)/intel64/LIBXSMM.mod $(INCDIR); fi @if [ -e $(BLDDIR)/intel64/libxsmm.mod ]; then $(CP) $(BLDDIR)/intel64/libxsmm.mod $(INCDIR); fi @if [ -e LIBXSMM.mod ]; then $(MV) LIBXSMM.mod $(INCDIR); fi @if [ -e libxsmm.mod ]; then $(MV) libxsmm.mod $(INCDIR); fi @touch $@ else .PHONY: $(BLDDIR)/intel64/libxsmm-mod.o .PHONY: $(INCDIR)/libxsmm.mod endif .PHONY: module module: module_hst module_mic .PHONY: build_generator_lib build_generator_lib: $(OUTDIR)/libxsmmgen.$(LIBEXT) $(OUTDIR)/libxsmmgen.$(LIBEXT): $(OBJFILES_GEN_LIB) $(OUTDIR)/libxsmm.env ifeq (0,$(STATIC)) $(LIB_LD) $(call solink,$@,$(VERSION_MAJOR),$(VERSION_MINOR),$(VERSION_UPDATE),$(VERSION_API)) \ $(OBJFILES_GEN_LIB) $(call cleanld,$(NOBLAS_LDFLAGS) $(NOBLAS_CLDFLAGS)) else # static @rm -f $@ $(AR) -rs $@ $(OBJFILES_GEN_LIB) endif .PHONY: generator generator: 
$(BINDIR)/libxsmm_gemm_generator $(BINDIR)/libxsmm_gemm_generator: $(BINDIR)/.make $(OBJFILES_GEN_GEMM_BIN) $(OUTDIR)/libxsmmgen.$(LIBEXT) $(LD) -o $@ $(OBJFILES_GEN_GEMM_BIN) $(call abslib,$(OUTDIR)/libxsmmgen.$(ILIBEXT)) \ $(call cleanld,$(NOBLAS_LDFLAGS) $(NOBLAS_CLDFLAGS)) ifneq (,$(strip $(LIBJITPROFILING))) $(LIBJITPROFILING): $(BLDDIR)/jitprofiling/.make @$(CP) $(VTUNEROOT)/lib64/libjitprofiling.$(SLIBEXT) $(BLDDIR)/jitprofiling @cd $(BLDDIR)/jitprofiling; $(AR) x libjitprofiling.$(SLIBEXT) endif .PHONY: clib_mic ifneq (0,$(MIC)) ifneq (0,$(MPSS)) clib_mic: $(OUTDIR)/mic/libxsmm.$(LIBEXT) $(OUTDIR)/mic/libxsmm.$(LIBEXT): $(OUTDIR)/mic/.make $(OBJFILES_MIC) $(KRNOBJS_MIC) ifeq (0,$(STATIC)) $(LIB_LD) -mmic $(call solink,$@,$(VERSION_MAJOR),$(VERSION_MINOR),$(VERSION_UPDATE),$(VERSION_API)) \ $(OBJFILES_MIC) $(KRNOBJS_MIC) $(call cleanld,$(LDFLAGS) $(CLDFLAGS)) else # static @rm -f $@ $(AR) -rs $@ $(OBJFILES_MIC) $(KRNOBJS_MIC) endif endif endif .PHONY: clib_hst clib_hst: $(OUTDIR)/libxsmm.pc $(OUTDIR)/libxsmm.$(LIBEXT): $(OUTDIR)/.make $(OBJFILES_HST) $(OBJFILES_GEN_LIB) $(KRNOBJS_HST) $(LIBJITPROFILING) ifeq (0,$(STATIC)) $(LIB_LD) $(call solink,$@,$(VERSION_MAJOR),$(VERSION_MINOR),$(VERSION_UPDATE),$(VERSION_API)) \ $(OBJFILES_HST) $(OBJFILES_GEN_LIB) $(KRNOBJS_HST) $(LIBJITPROFILING) $(call cleanld,$(LDFLAGS) $(CLDFLAGS)) else # static @rm -f $@ $(AR) -rs $@ $(OBJFILES_HST) $(OBJFILES_GEN_LIB) $(KRNOBJS_HST) $(OBJJITPROFILING) endif .PHONY: flib_mic ifneq (0,$(MIC)) ifneq (0,$(MPSS)) ifneq (,$(strip $(FC))) flib_mic: $(OUTDIR)/mic/libxsmmf.$(LIBEXT) $(OUTDIR)/mic/libxsmmf.$(LIBEXT): $(INCDIR)/mic/libxsmm.mod $(OUTDIR)/mic/libxsmm.$(LIBEXT) ifeq (0,$(STATIC)) $(LIB_FLD) -mmic $(FCMTFLAGS) $(call solink,$@,$(VERSION_MAJOR),$(VERSION_MINOR),$(VERSION_UPDATE),$(VERSION_API)) \ $(BLDDIR)/mic/libxsmm-mod.o $(call abslib,$(OUTDIR)/mic/libxsmm.$(ILIBEXT)) $(call cleanld,$(LDFLAGS) $(FLDFLAGS)) else # static @rm -f $@ $(AR) -rs $@ $(BLDDIR)/mic/libxsmm-mod.o endif 
else .PHONY: $(OUTDIR)/mic/libxsmmf.$(LIBEXT) endif endif endif .PHONY: flib_hst ifneq (,$(strip $(FC))) flib_hst: $(OUTDIR)/libxsmmf.pc $(OUTDIR)/libxsmmf.$(LIBEXT): $(INCDIR)/libxsmm.mod $(OUTDIR)/libxsmm.$(LIBEXT) ifeq (0,$(STATIC)) $(LIB_FLD) $(FCMTFLAGS) $(call solink,$@,$(VERSION_MAJOR),$(VERSION_MINOR),$(VERSION_UPDATE),$(VERSION_API)) \ $(BLDDIR)/intel64/libxsmm-mod.o $(call abslib,$(OUTDIR)/libxsmm.$(ILIBEXT)) $(call cleanld,$(LDFLAGS) $(FLDFLAGS)) else # static @rm -f $@ $(AR) -rs $@ $(BLDDIR)/intel64/libxsmm-mod.o endif else .PHONY: $(OUTDIR)/libxsmmf.pc endif .PHONY: ext_mic ifneq (0,$(MIC)) ifneq (0,$(MPSS)) ext_mic: $(OUTDIR)/mic/libxsmmext.$(LIBEXT) $(OUTDIR)/mic/libxsmmext.$(LIBEXT): $(EXTOBJS_MIC) $(OUTDIR)/mic/libxsmm.$(LIBEXT) ifeq (0,$(STATIC)) $(LIB_LD) -mmic $(EXTLDFLAGS) $(call solink,$@,$(VERSION_MAJOR),$(VERSION_MINOR),$(VERSION_UPDATE),$(VERSION_API)) \ $(EXTOBJS_MIC) $(call abslib,$(OUTDIR)/mic/libxsmm.$(ILIBEXT)) $(call cleanld,$(LDFLAGS) $(CLDFLAGS)) else # static @rm -f $@ $(AR) -rs $@ $(EXTOBJS_MIC) endif endif endif .PHONY: ext_hst ext_hst: $(OUTDIR)/libxsmmext.pc $(OUTDIR)/libxsmmext.$(LIBEXT): $(OUTDIR)/libxsmm.$(LIBEXT) $(EXTOBJS_HST) ifeq (0,$(STATIC)) $(LIB_LD) $(EXTLDFLAGS) $(call solink,$@,$(VERSION_MAJOR),$(VERSION_MINOR),$(VERSION_UPDATE),$(VERSION_API)) \ $(EXTOBJS_HST) $(call abslib,$(OUTDIR)/libxsmm.$(ILIBEXT)) $(call cleanld,$(LDFLAGS) $(CLDFLAGS)) else # static @rm -f $@ $(AR) -rs $@ $(EXTOBJS_HST) endif .PHONY: noblas_mic ifneq (0,$(MIC)) ifneq (0,$(MPSS)) noblas_mic: $(OUTDIR)/mic/libxsmmnoblas.$(LIBEXT) $(OUTDIR)/mic/libxsmmnoblas.$(LIBEXT): $(NOBLAS_MIC) ifeq (0,$(STATIC)) $(LIB_LD) -mmic $(call solink,$@,$(VERSION_MAJOR),$(VERSION_MINOR),$(VERSION_UPDATE),$(VERSION_API)) \ $(NOBLAS_MIC) $(call cleanld,$(NOBLAS_LDFLAGS) $(NOBLAS_CLDFLAGS)) else # static @rm -f $@ $(AR) -rs $@ $(NOBLAS_MIC) endif endif endif .PHONY: noblas_hst noblas_hst: $(OUTDIR)/libxsmmnoblas.pc $(OUTDIR)/libxsmmnoblas.$(LIBEXT): $(NOBLAS_HST) 
ifeq (0,$(STATIC)) $(LIB_LD) $(call solink,$@,$(VERSION_MAJOR),$(VERSION_MINOR),$(VERSION_UPDATE),$(VERSION_API)) \ $(NOBLAS_HST) $(call cleanld,$(NOBLAS_LDFLAGS) $(NOBLAS_CLDFLAGS)) else # static @rm -f $@ $(AR) -rs $@ $(NOBLAS_HST) endif # use dir not qdir to avoid quotes; also $(ROOTDIR)/$(SPLDIR) is relative DIRS_SAMPLES := $(dir $(shell find $(ROOTDIR)/$(SPLDIR) -type f -name Makefile \ | grep -v /deeplearning/tvm_cnnlayer/ \ | grep -v /deeplearning/tf_lstm_ops/ \ | grep -v /deeplearning/gxm/ \ | grep -v /edge/repro/ \ | grep -v /packed/ \ | grep -v /pyfr/ \ $(NULL))) .PHONY: samples $(DIRS_SAMPLES) samples: $(DIRS_SAMPLES) $(DIRS_SAMPLES): lib_hst @$(FLOCK) $@ "$(MAKE) DEPSTATIC=$(STATIC)" .PHONY: cp2k cp2k_mic cp2k: lib_hst @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/cp2k "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC)" cp2k_mic: lib_mic @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/cp2k "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC) KNC=1" .PHONY: wrap wrap_mic wrap: lib_hst @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/utilities/wrap "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC) TRACE=0" wrap_mic: lib_mic @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/utilities/wrap "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC) KNC=1 TRACE=0" .PHONY: nek nek_mic nek: lib_hst @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/nek "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC)" nek_mic: lib_mic @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/nek "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC) KNC=1" .PHONY: smm smm_mic smm: lib_hst @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/smm "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC)" smm_mic: lib_mic @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/smm "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC) KNC=1" # added for specfem sample # will need option: make MNK="5 25" .. 
.PHONY: specfem specfem_mic specfem: lib_hst @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/specfem "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC)" specfem_mic: lib_mic @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/specfem "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC) KNC=1" .PHONY: drytest drytest: $(ROOTDIR)/$(SPLDIR)/cp2k/cp2k-perf.sh $(ROOTDIR)/$(SPLDIR)/smm/smmf-perf.sh \ $(ROOTDIR)/$(SPLDIR)/nek/axhm-perf.sh $(ROOTDIR)/$(SPLDIR)/nek/grad-perf.sh $(ROOTDIR)/$(SPLDIR)/nek/rstr-perf.sh $(ROOTDIR)/$(SPLDIR)/cp2k/cp2k-perf.sh: $(ROOTDIR)/$(SPLDIR)/cp2k/.make $(ROOTDIR)/Makefile @echo "#!/usr/bin/env sh" > $@ @echo >> $@ @echo "HERE=\$$(cd \$$(dirname \$$0); pwd -P)" >> $@ @echo "FILE=cp2k-perf.txt" >> $@ ifneq (,$(strip $(INDICES))) @echo "RUNS=\"$(INDICES)\"" >> $@ else @echo "RUNS=\"23_23_23 4_6_9 13_5_7 24_3_36\"" >> $@ endif @echo >> $@ @echo "if [ \"\" != \"\$$1\" ]; then" >> $@ @echo " FILE=\$$1" >> $@ @echo " shift" >> $@ @echo "fi" >> $@ @echo "if [ \"\" != \"\$$1\" ]; then" >> $@ @echo " SIZE=\$$1" >> $@ @echo " shift" >> $@ @echo "else" >> $@ @echo " SIZE=0" >> $@ @echo "fi" >> $@ @echo "cat /dev/null > \$${FILE}" >> $@ @echo >> $@ @echo "NRUN=1" >> $@ @echo "NMAX=\$$(echo \$${RUNS} | wc -w | tr -d ' ')" >> $@ @echo "for RUN in \$${RUNS}; do" >> $@ @echo " MVALUE=\$$(echo \$${RUN} | cut -d_ -f1)" >> $@ @echo " NVALUE=\$$(echo \$${RUN} | cut -d_ -f2)" >> $@ @echo " KVALUE=\$$(echo \$${RUN} | cut -d_ -f3)" >> $@ @echo " >&2 echo -n \"\$${NRUN} of \$${NMAX} (M=\$${MVALUE} N=\$${NVALUE} K=\$${KVALUE})... \"" >> $@ @echo " ERROR=\$$({ CHECK=1 \$${HERE}/cp2k-dbcsr.sh \$${MVALUE} \$${SIZE} 0 \$${NVALUE} \$${KVALUE} >> \$${FILE}; } 2>&1)" >> $@ @echo " RESULT=\$$?" 
>> $@ @echo " if [ 0 != \$${RESULT} ]; then" >> $@ @echo " echo \"FAILED(\$${RESULT}) \$${ERROR}\"" >> $@ @echo " exit 1" >> $@ @echo " else" >> $@ @echo " echo \"OK \$${ERROR}\"" >> $@ @echo " fi" >> $@ @echo " echo >> \$${FILE}" >> $@ @echo " NRUN=\$$((NRUN+1))" >> $@ @echo "done" >> $@ @echo >> $@ @chmod +x $@ $(ROOTDIR)/$(SPLDIR)/smm/smmf-perf.sh: $(ROOTDIR)/$(SPLDIR)/smm/.make $(ROOTDIR)/Makefile @echo "#!/usr/bin/env sh" > $@ @echo >> $@ @echo "HERE=\$$(cd \$$(dirname \$$0); pwd -P)" >> $@ @echo "FILE=\$${HERE}/smmf-perf.txt" >> $@ ifneq (,$(strip $(INDICES))) @echo "RUNS=\"$(INDICES)\"" >> $@ else @echo "RUNS=\"23_23_23 4_6_9 13_5_7 24_3_36\"" >> $@ endif @echo >> $@ @echo "if [ \"\" != \"\$$1\" ]; then" >> $@ @echo " FILE=\$$1" >> $@ @echo " shift" >> $@ @echo "fi" >> $@ @echo "cat /dev/null > \$${FILE}" >> $@ @echo >> $@ @echo "NRUN=1" >> $@ @echo "NMAX=\$$(echo \$${RUNS} | wc -w | tr -d ' ')" >> $@ @echo "for RUN in \$${RUNS}; do" >> $@ @echo " MVALUE=\$$(echo \$${RUN} | cut -d_ -f1)" >> $@ @echo " NVALUE=\$$(echo \$${RUN} | cut -d_ -f2)" >> $@ @echo " KVALUE=\$$(echo \$${RUN} | cut -d_ -f3)" >> $@ @echo " >&2 echo -n \"\$${NRUN} of \$${NMAX} (M=\$${MVALUE} N=\$${NVALUE} K=\$${KVALUE})... \"" >> $@ @echo " ERROR=\$$({ CHECK=1 \$${HERE}/smm.sh \$${MVALUE} \$${NVALUE} \$${KVALUE} \$$* >> \$${FILE}; } 2>&1)" >> $@ @echo " RESULT=\$$?" 
>> $@ @echo " if [ 0 != \$${RESULT} ]; then" >> $@ @echo " echo \"FAILED(\$${RESULT}) \$${ERROR}\"" >> $@ @echo " exit 1" >> $@ @echo " else" >> $@ @echo " echo \"OK \$${ERROR}\"" >> $@ @echo " fi" >> $@ @echo " echo >> \$${FILE}" >> $@ @echo " NRUN=\$$((NRUN+1))" >> $@ @echo "done" >> $@ @echo >> $@ @chmod +x $@ $(ROOTDIR)/$(SPLDIR)/nek/axhm-perf.sh: $(ROOTDIR)/$(SPLDIR)/nek/.make $(ROOTDIR)/Makefile @echo "#!/usr/bin/env sh" > $@ @echo >> $@ @echo "HERE=\$$(cd \$$(dirname \$$0); pwd -P)" >> $@ @echo "FILE=\$${HERE}/axhm-perf.txt" >> $@ ifneq (,$(strip $(INDICES))) @echo "RUNS=\"$(INDICES)\"" >> $@ else @echo "RUNS=\"4_6_9 8_8_8 13_13_13 16_8_13\"" >> $@ endif @echo >> $@ @echo "if [ \"\" != \"\$$1\" ]; then" >> $@ @echo " FILE=\$$1" >> $@ @echo " shift" >> $@ @echo "fi" >> $@ @echo "cat /dev/null > \$${FILE}" >> $@ @echo >> $@ @echo "NRUN=1" >> $@ @echo "NMAX=\$$(echo \$${RUNS} | wc -w | tr -d ' ')" >> $@ @echo "for RUN in \$${RUNS}; do" >> $@ @echo " MVALUE=\$$(echo \$${RUN} | cut -d_ -f1)" >> $@ @echo " NVALUE=\$$(echo \$${RUN} | cut -d_ -f2)" >> $@ @echo " KVALUE=\$$(echo \$${RUN} | cut -d_ -f3)" >> $@ @echo " >&2 echo -n \"\$${NRUN} of \$${NMAX} (M=\$${MVALUE} N=\$${NVALUE} K=\$${KVALUE})... \"" >> $@ @echo " ERROR=\$$({ CHECK=1 \$${HERE}/axhm.sh \$${MVALUE} \$${NVALUE} \$${KVALUE} \$$* >> \$${FILE}; } 2>&1)" >> $@ @echo " RESULT=\$$?" 
>> $@ @echo " if [ 0 != \$${RESULT} ]; then" >> $@ @echo " echo \"FAILED(\$${RESULT}) \$${ERROR}\"" >> $@ @echo " exit 1" >> $@ @echo " else" >> $@ @echo " echo \"OK \$${ERROR}\"" >> $@ @echo " fi" >> $@ @echo " echo >> \$${FILE}" >> $@ @echo " NRUN=\$$((NRUN+1))" >> $@ @echo "done" >> $@ @echo >> $@ @chmod +x $@ $(ROOTDIR)/$(SPLDIR)/nek/grad-perf.sh: $(ROOTDIR)/$(SPLDIR)/nek/.make $(ROOTDIR)/Makefile @echo "#!/usr/bin/env sh" > $@ @echo >> $@ @echo "HERE=\$$(cd \$$(dirname \$$0); pwd -P)" >> $@ @echo "FILE=\$${HERE}/grad-perf.txt" >> $@ ifneq (,$(strip $(INDICES))) @echo "RUNS=\"$(INDICES)\"" >> $@ else @echo "RUNS=\"4_6_9 8_8_8 13_13_13 16_8_13\"" >> $@ endif @echo >> $@ @echo "if [ \"\" != \"\$$1\" ]; then" >> $@ @echo " FILE=\$$1" >> $@ @echo " shift" >> $@ @echo "fi" >> $@ @echo "cat /dev/null > \$${FILE}" >> $@ @echo >> $@ @echo "NRUN=1" >> $@ @echo "NMAX=\$$(echo \$${RUNS} | wc -w | tr -d ' ')" >> $@ @echo "for RUN in \$${RUNS}; do" >> $@ @echo " MVALUE=\$$(echo \$${RUN} | cut -d_ -f1)" >> $@ @echo " NVALUE=\$$(echo \$${RUN} | cut -d_ -f2)" >> $@ @echo " KVALUE=\$$(echo \$${RUN} | cut -d_ -f3)" >> $@ @echo " >&2 echo -n \"\$${NRUN} of \$${NMAX} (M=\$${MVALUE} N=\$${NVALUE} K=\$${KVALUE})... \"" >> $@ @echo " ERROR=\$$({ CHECK=1 \$${HERE}/grad.sh \$${MVALUE} \$${NVALUE} \$${KVALUE} \$$* >> \$${FILE}; } 2>&1)" >> $@ @echo " RESULT=\$$?" 
>> $@ @echo " if [ 0 != \$${RESULT} ]; then" >> $@ @echo " echo \"FAILED(\$${RESULT}) \$${ERROR}\"" >> $@ @echo " exit 1" >> $@ @echo " else" >> $@ @echo " echo \"OK \$${ERROR}\"" >> $@ @echo " fi" >> $@ @echo " echo >> \$${FILE}" >> $@ @echo " NRUN=\$$((NRUN+1))" >> $@ @echo "done" >> $@ @echo >> $@ @chmod +x $@ $(ROOTDIR)/$(SPLDIR)/nek/rstr-perf.sh: $(ROOTDIR)/$(SPLDIR)/nek/.make $(ROOTDIR)/Makefile @echo "#!/usr/bin/env sh" > $@ @echo >> $@ @echo "HERE=\$$(cd \$$(dirname \$$0); pwd -P)" >> $@ @echo "FILE=\$${HERE}/rstr-perf.txt" >> $@ ifneq (,$(strip $(INDICES))) @echo "RUNS=\"$(INDICES)\"" >> $@ @echo "RUNT=\"$(INDICES)\"" >> $@ else @echo "RUNS=\"4_4_4 8_8_8\"" >> $@ @echo "RUNT=\"7_7_7 10_10_10\"" >> $@ endif @echo >> $@ @echo "if [ \"\" != \"\$$1\" ]; then" >> $@ @echo " FILE=\$$1" >> $@ @echo " shift" >> $@ @echo "fi" >> $@ @echo "cat /dev/null > \$${FILE}" >> $@ @echo >> $@ @echo "NRUN=1" >> $@ @echo "NRUNS=\$$(echo \$${RUNS} | wc -w | tr -d ' ')" >> $@ @echo "NRUNT=\$$(echo \$${RUNT} | wc -w | tr -d ' ')" >> $@ @echo "NMAX=\$$((NRUNS*NRUNT))" >> $@ @echo "for RUN1 in \$${RUNS}; do" >> $@ @echo " for RUN2 in \$${RUNT}; do" >> $@ @echo " MVALUE=\$$(echo \$${RUN1} | cut -d_ -f1)" >> $@ @echo " NVALUE=\$$(echo \$${RUN1} | cut -d_ -f2)" >> $@ @echo " KVALUE=\$$(echo \$${RUN1} | cut -d_ -f3)" >> $@ @echo " MMVALUE=\$$(echo \$${RUN2} | cut -d_ -f1)" >> $@ @echo " NNVALUE=\$$(echo \$${RUN2} | cut -d_ -f2)" >> $@ @echo " KKVALUE=\$$(echo \$${RUN2} | cut -d_ -f3)" >> $@ @echo " >&2 echo -n \"\$${NRUN} of \$${NMAX} (MNK=\$${MVALUE}x\$${NVALUE}x\$${KVALUE} MNK2=\$${MMVALUE}x\$${NNVALUE}x\$${KKVALUE})... \"" >> $@ @echo " ERROR=\$$({ CHECK=1 \$${HERE}/rstr.sh \$${MVALUE} \$${NVALUE} \$${KVALUE} \$${MMVALUE} \$${NNVALUE} \$${KKVALUE} \$$* >> \$${FILE}; } 2>&1)" >> $@ @echo " RESULT=\$$?" 
>> $@ @echo " if [ 0 != \$${RESULT} ]; then" >> $@ @echo " echo \"FAILED(\$${RESULT}) \$${ERROR}\"" >> $@ @echo " exit 1" >> $@ @echo " else" >> $@ @echo " echo \"OK \$${ERROR}\"" >> $@ @echo " fi" >> $@ @echo " echo >> \$${FILE}" >> $@ @echo " NRUN=\$$((NRUN+1))" >> $@ @echo "done" >> $@ @echo "done" >> $@ @echo >> $@ @chmod +x $@ .PHONY: test test: tests .PHONY: perf perf: perf-cp2k .PHONY: test-all test-all: tests test-cp2k test-smm test-nek test-wrap .PHONY: build-tests build-tests: lib_hst @$(FLOCK) $(ROOTDIR)/$(TSTDIR) "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC)" .PHONY: tests tests: lib_hst @$(FLOCK) $(ROOTDIR)/$(TSTDIR) "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC) test" .PHONY: cpp-test cpp-test: test-cpp .PHONY: test-cpp test-cpp: $(INCDIR)/libxsmm_source.h @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/cp2k "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC) TRACE=0 \ ECXXFLAGS='-DUSE_HEADER_ONLY $(ECXXFLAGS)' clean compile" .PHONY: test-cp2k test-cp2k: $(ROOTDIR)/$(SPLDIR)/cp2k/cp2k-test.txt $(ROOTDIR)/$(SPLDIR)/cp2k/cp2k-test.txt: $(ROOTDIR)/$(SPLDIR)/cp2k/cp2k-perf.sh lib_hst cp2k @$(FLOCK) $(call qdir,$@) "./cp2k-perf.sh $(call qndir,$@) $(shell echo $$(($(TESTSIZE) * 128)))" .PHONY: perf-cp2k perf-cp2k: $(ROOTDIR)/$(SPLDIR)/cp2k/cp2k-perf.txt $(ROOTDIR)/$(SPLDIR)/cp2k/cp2k-perf.txt: $(ROOTDIR)/$(SPLDIR)/cp2k/cp2k-perf.sh lib_hst cp2k @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/cp2k "./cp2k-perf.sh $(call qndir,$@)" .PHONY: test-wrap test-wrap: wrap @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/utilities/wrap "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC) TRACE=0 test" .PHONY: test-smm ifneq (,$(strip $(FC))) test-smm: $(ROOTDIR)/$(SPLDIR)/smm/smm-test.txt $(ROOTDIR)/$(SPLDIR)/smm/smm-test.txt: $(ROOTDIR)/$(SPLDIR)/smm/smmf-perf.sh lib_hst smm @$(FLOCK) $(call qdir,$@) "./smmf-perf.sh $(call qndir,$@) $(shell echo $$(($(TESTSIZE) * -128)))" endif .PHONY: perf-smm ifneq (,$(strip $(FC))) perf-smm: $(ROOTDIR)/$(SPLDIR)/smm/smmf-perf.txt $(ROOTDIR)/$(SPLDIR)/smm/smmf-perf.txt: 
$(ROOTDIR)/$(SPLDIR)/smm/smmf-perf.sh lib_hst smm @$(FLOCK) $(call qdir,$@) "./smmf-perf.sh $(call qndir,$@)" endif .PHONY: test-nek ifneq (,$(strip $(FC))) test-nek: \ $(ROOTDIR)/$(SPLDIR)/nek/axhm-perf.txt \ $(ROOTDIR)/$(SPLDIR)/nek/grad-perf.txt \ $(ROOTDIR)/$(SPLDIR)/nek/rstr-perf.txt $(ROOTDIR)/$(SPLDIR)/nek/axhm-perf.txt: $(ROOTDIR)/$(SPLDIR)/nek/axhm-perf.sh lib_hst @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/nek "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC) axhm" @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/nek "./axhm-perf.sh $(call qndir,$@) $(shell echo $$(($(TESTSIZE) * -128)))" $(ROOTDIR)/$(SPLDIR)/nek/grad-perf.txt: $(ROOTDIR)/$(SPLDIR)/nek/grad-perf.sh lib_hst @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/nek "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC) grad" @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/nek "./grad-perf.sh $(call qndir,$@) $(shell echo $$(($(TESTSIZE) * -128)))" $(ROOTDIR)/$(SPLDIR)/nek/rstr-perf.txt: $(ROOTDIR)/$(SPLDIR)/nek/rstr-perf.sh lib_hst @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/nek "$(MAKE) --no-print-directory DEPSTATIC=$(STATIC) rstr" @$(FLOCK) $(ROOTDIR)/$(SPLDIR)/nek "./rstr-perf.sh $(call qndir,$@) $(shell echo $$(($(TESTSIZE) * -128)))" endif $(DOCDIR)/index.md: $(DOCDIR)/.make $(ROOTDIR)/Makefile $(ROOTDIR)/README.md @sed $(ROOTDIR)/README.md \ -e 's/\[!\[..*\](..*)\](..*)//g' \ -e 's/\[\[..*\](..*)\]//g' \ -e "s/](${DOCDIR}\//](/g" \ -e 'N;/^\n$$/d;P;D' \ > $@ $(DOCDIR)/libxsmm_compat.md: $(DOCDIR)/.make $(ROOTDIR)/Makefile $(ROOTDIR)/version.txt @wget -T $(TIMEOUT) -q -O $@ "https://raw.githubusercontent.com/wiki/hfp/libxsmm/Compatibility.md" @echo >> $@ $(DOCDIR)/libxsmm_valid.md: $(DOCDIR)/.make $(ROOTDIR)/Makefile $(ROOTDIR)/version.txt @wget -T $(TIMEOUT) -q -O $@ "https://raw.githubusercontent.com/wiki/hfp/libxsmm/Validation.md" @echo >> $@ $(DOCDIR)/libxsmm_qna.md: $(DOCDIR)/.make $(ROOTDIR)/Makefile $(ROOTDIR)/version.txt @wget -T $(TIMEOUT) -q -O $@ "https://raw.githubusercontent.com/wiki/hfp/libxsmm/Q&A.md" @echo >> $@ $(DOCDIR)/libxsmm.$(DOCEXT): $(DOCDIR)/.make 
$(ROOTDIR)/documentation/index.md \ $(ROOTDIR)/documentation/libxsmm_mm.md $(ROOTDIR)/documentation/libxsmm_dl.md $(ROOTDIR)/documentation/libxsmm_aux.md \ $(ROOTDIR)/documentation/libxsmm_prof.md $(ROOTDIR)/documentation/libxsmm_tune.md $(ROOTDIR)/documentation/libxsmm_be.md \ $(ROOTDIR)/documentation/libxsmm_compat.md $(ROOTDIR)/documentation/libxsmm_valid.md $(ROOTDIR)/documentation/libxsmm_qna.md $(eval TMPFILE = $(shell $(MKTEMP) $(ROOTDIR)/documentation/.libxsmm_XXXXXX.tex)) @pandoc -D latex \ | sed \ -e 's/\(\\documentclass\[..*\]{..*}\)/\1\n\\pagenumbering{gobble}\n\\RedeclareSectionCommands[beforeskip=-1pt,afterskip=1pt]{subsection,subsubsection}/' \ -e 's/\\usepackage{listings}/\\usepackage{listings}\\lstset{basicstyle=\\footnotesize\\ttfamily,showstringspaces=false}/' \ -e 's/\(\\usepackage.*{hyperref}\)/\\usepackage[hyphens]{url}\n\1/' \ > $(TMPFILE) @cd $(ROOTDIR)/documentation && ( \ iconv -t utf-8 index.md && echo && \ echo "# LIBXSMM Domains" && \ iconv -t utf-8 libxsmm_mm.md && echo && \ iconv -t utf-8 libxsmm_dl.md && echo && \ iconv -t utf-8 libxsmm_aux.md && echo && \ iconv -t utf-8 libxsmm_prof.md && echo && \ iconv -t utf-8 libxsmm_tune.md && echo && \ iconv -t utf-8 libxsmm_be.md && echo && \ echo "# Appendix" && \ echo "## Compatibility" && \ sed "s/^\(##*\) /#\1 /" libxsmm_compat.md | iconv -t utf-8 && \ echo "## Validation" && \ sed "s/^\(##*\) /#\1 /" libxsmm_valid.md | iconv -t utf-8 && \ echo "## Q&A" && \ sed "s/^\(##*\) /#\1 /" libxsmm_qna.md | iconv -t utf-8; ) \ | sed \ -e 's//~/g' -e 's/<\/sub>/~/g' \ -e 's//^/g' -e 's/<\/sup>/^/g' \ -e 's/----*//g' \ | pandoc \ --template=$(call qndir,$(TMPFILE)) --listings \ -f gfm+subscript+superscript \ -V documentclass=scrartcl \ -V title-meta="LIBXSMM Documentation" \ -V author-meta="Hans Pabst, Alexander Heinecke" \ -V classoption=DIV=45 \ -V linkcolor=black \ -V citecolor=black \ -V urlcolor=black \ -o $(call qndir,$@) @rm $(TMPFILE) $(DOCDIR)/libxsmm_samples.md: $(ROOTDIR)/Makefile 
$(ROOTDIR)/$(SPLDIR)/*/README.md $(ROOTDIR)/$(SPLDIR)/deeplearning/*/README.md $(ROOTDIR)/$(SPLDIR)/utilities/*/README.md @cat $(ROOTDIR)/$(SPLDIR)/*/README.md $(ROOTDIR)/$(SPLDIR)/deeplearning/*/README.md $(ROOTDIR)/$(SPLDIR)/utilities/*/README.md \ | sed \ -e 's/^#/##/' \ -e 's//~/g' -e 's/<\/sub>/~/g' \ -e 's//^/g' -e 's/<\/sup>/^/g' \ -e 's/----*//g' \ -e '1s/^/# [LIBXSMM Samples](https:\/\/github.com\/hfp\/libxsmm\/raw\/master\/documentation\/libxsmm_samples.pdf)\n\n/' \ > $@ $(DOCDIR)/libxsmm_samples.$(DOCEXT): $(ROOTDIR)/documentation/libxsmm_samples.md $(eval TMPFILE = $(shell $(MKTEMP) .libxsmm_XXXXXX.tex)) @pandoc -D latex \ | sed \ -e 's/\(\\documentclass\[..*\]{..*}\)/\1\n\\pagenumbering{gobble}\n\\RedeclareSectionCommands[beforeskip=-1pt,afterskip=1pt]{subsection,subsubsection}/' \ -e 's/\\usepackage{listings}/\\usepackage{listings}\\lstset{basicstyle=\\footnotesize\\ttfamily,showstringspaces=false}/' \ -e 's/\(\\usepackage.*{hyperref}\)/\\usepackage[hyphens]{url}\n\1/' \ > $(TMPFILE) @iconv -t utf-8 $(ROOTDIR)/documentation/libxsmm_samples.md \ | pandoc \ --template=$(TMPFILE) --listings \ -f gfm+subscript+superscript \ -V documentclass=scrartcl \ -V title-meta="LIBXSMM Sample Code Summary" \ -V classoption=DIV=45 \ -V linkcolor=black \ -V citecolor=black \ -V urlcolor=black \ -o $@ @rm $(TMPFILE) $(DOCDIR)/tensorflow.$(DOCEXT): $(DOCDIR)/.make $(ROOTDIR)/Makefile $(ROOTDIR)/documentation/tensorflow.md $(eval TMPFILE = $(shell $(MKTEMP) $(ROOTDIR)/documentation/.libxsmm_XXXXXX.tex)) @pandoc -D latex \ | sed \ -e 's/\(\\documentclass\[..*\]{..*}\)/\1\n\\pagenumbering{gobble}\n\\RedeclareSectionCommands[beforeskip=-1pt,afterskip=1pt]{subsection,subsubsection}/' \ -e 's/\\usepackage{listings}/\\usepackage{listings}\\lstset{basicstyle=\\footnotesize\\ttfamily,showstringspaces=false}/' \ -e 's/\(\\usepackage.*{hyperref}\)/\\usepackage[hyphens]{url}\n\1/' \ > $(TMPFILE) @cd $(ROOTDIR)/documentation && iconv -t utf-8 tensorflow.md \ | sed \ -e 's//~/g' -e 
's/<\/sub>/~/g' \ -e 's//^/g' -e 's/<\/sup>/^/g' \ -e 's/----*//g' \ | pandoc \ --template=$(call qndir,$(TMPFILE)) --listings \ -f gfm+subscript+superscript \ -V documentclass=scrartcl \ -V title-meta="TensorFlow with LIBXSMM" \ -V author-meta="Hans Pabst" \ -V classoption=DIV=45 \ -V linkcolor=black \ -V citecolor=black \ -V urlcolor=black \ -o $(call qndir,$@) @rm $(TMPFILE) .PHONY: documentation documentation: \ $(DOCDIR)/libxsmm.$(DOCEXT) \ $(DOCDIR)/libxsmm_samples.$(DOCEXT) \ $(DOCDIR)/tensorflow.$(DOCEXT) .PHONY: mkdocs mkdocs: $(ROOTDIR)/documentation/index.md $(ROOTDIR)/documentation/libxsmm_samples.md @mkdocs build --clean @mkdocs serve .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(HEREDIR)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(FTNOBJS) $(SRCFILES_KERNELS) $(BLDDIR)/libxsmm_dispatch.h @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @find . -type f \( -name .make -or -name .state \) -exec rm {} \; @rm -f $(ROOTDIR)/$(SCRDIR)/libxsmm_utilities.pyc @rm -rf $(ROOTDIR)/$(SCRDIR)/__pycache__ .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(HEREDIR)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(LIBEXT)* $(OUTDIR)/mic/libxsmm.$(LIBEXT)* @rm -f $(OUTDIR)/libxsmmf.$(LIBEXT)* $(OUTDIR)/mic/libxsmmf.$(LIBEXT)* @rm -f $(OUTDIR)/libxsmmext.$(LIBEXT)* $(OUTDIR)/mic/libxsmmext.$(LIBEXT)* @rm -f $(OUTDIR)/libxsmmnoblas.$(LIBEXT)* $(OUTDIR)/mic/libxsmmnoblas.$(LIBEXT)* @rm -f $(OUTDIR)/libxsmmgen.$(LIBEXT)* @rm -f $(OUTDIR)/libxsmm*.pc endif ifneq ($(call qapath,$(BINDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BINDIR)),$(HEREDIR)) @rm -rf $(BINDIR) endif endif ifneq (,$(wildcard $(BINDIR))) # still exists @rm -f $(BINDIR)/libxsmm_*_generator endif @rm -f $(ROOTDIR)/$(SPLDIR)/cp2k/cp2k-perf.sh @rm -f 
$(ROOTDIR)/$(SPLDIR)/smm/smmf-perf.sh @rm -f $(ROOTDIR)/$(SPLDIR)/nek/grad-perf.sh @rm -f $(ROOTDIR)/$(SPLDIR)/nek/axhm-perf.sh @rm -f $(ROOTDIR)/$(SPLDIR)/nek/rstr-perf.sh @rm -f $(INCDIR)/libxsmm_version.h @rm -f $(INCDIR)/libxsmm.modmic @rm -f $(INCDIR)/libxsmm.mod @rm -f $(INCDIR)/libxsmm.f @rm -f $(HEREDIR)/python3 .PHONY: clean-all clean-all: clean @find $(ROOTDIR)/$(SPLDIR) $(ROOTDIR)/$(TSTDIR) -type f -name Makefile -exec $(FLOCK) {} \ "$(MAKE) --no-print-directory clean" \; 2>/dev/null || true .PHONY: realclean-all realclean-all: realclean @find $(ROOTDIR)/$(SPLDIR) $(ROOTDIR)/$(TSTDIR) -type f -name Makefile -exec $(FLOCK) {} \ "$(MAKE) --no-print-directory realclean" \; 2>/dev/null || true .PHONY: distclean distclean: realclean-all @rm -rf libxsmm* # keep original prefix (:) ALIAS_PREFIX := $(PREFIX) # DESTDIR is used as prefix of PREFIX ifneq (,$(strip $(DESTDIR))) override PREFIX := $(call qapath,$(DESTDIR)/$(PREFIX)) endif # fall-back ifeq (,$(strip $(PREFIX))) override PREFIX := $(HEREDIR) endif # setup maintainer-layout ifeq (,$(strip $(ALIAS_PREFIX))) override ALIAS_PREFIX := $(PREFIX) endif ifneq ($(ALIAS_PREFIX),$(PREFIX)) PPKGDIR := libdata/pkgconfig PMODDIR := $(PDOCDIR) endif .PHONY: install-minimal install-minimal: libxsmm ifneq ($(PREFIX),$(ABSDIR)) @mkdir -p $(PREFIX)/$(POUTDIR) $(PREFIX)/$(PBINDIR) $(PREFIX)/$(PINCDIR) $(PREFIX)/$(PSRCDIR) @echo @echo "LIBXSMM installing libraries..." 
@$(CP) -va $(OUTDIR)/libxsmmnoblas.$(DLIBEXT)* $(PREFIX)/$(POUTDIR) 2>/dev/null || true @$(CP) -v $(OUTDIR)/libxsmmnoblas.$(SLIBEXT) $(PREFIX)/$(POUTDIR) 2>/dev/null || true @$(CP) -va $(OUTDIR)/libxsmmgen.$(DLIBEXT)* $(PREFIX)/$(POUTDIR) 2>/dev/null || true @$(CP) -v $(OUTDIR)/libxsmmgen.$(SLIBEXT) $(PREFIX)/$(POUTDIR) 2>/dev/null || true @$(CP) -va $(OUTDIR)/libxsmmext.$(DLIBEXT)* $(PREFIX)/$(POUTDIR) 2>/dev/null || true @$(CP) -v $(OUTDIR)/libxsmmext.$(SLIBEXT) $(PREFIX)/$(POUTDIR) 2>/dev/null || true @$(CP) -va $(OUTDIR)/libxsmmf.$(DLIBEXT)* $(PREFIX)/$(POUTDIR) 2>/dev/null || true @$(CP) -v $(OUTDIR)/libxsmmf.$(SLIBEXT) $(PREFIX)/$(POUTDIR) 2>/dev/null || true @$(CP) -va $(OUTDIR)/libxsmm.$(DLIBEXT)* $(PREFIX)/$(POUTDIR) 2>/dev/null || true @$(CP) -v $(OUTDIR)/libxsmm.$(SLIBEXT) $(PREFIX)/$(POUTDIR) 2>/dev/null || true @if [ -e $(OUTDIR)/mic/libxsmmnoblas.$(DLIBEXT) ]; then \ mkdir -p $(PREFIX)/$(POUTDIR)/mic; \ $(CP) -va $(OUTDIR)/mic/libxsmmnoblas.$(DLIBEXT)* $(PREFIX)/$(POUTDIR)/mic; \ fi @if [ -e $(OUTDIR)/mic/libxsmmnoblas.$(SLIBEXT) ]; then \ mkdir -p $(PREFIX)/$(POUTDIR)/mic; \ $(CP) -v $(OUTDIR)/mic/libxsmmnoblas.$(SLIBEXT) $(PREFIX)/$(POUTDIR)/mic; \ fi @if [ -e $(OUTDIR)/mic/libxsmmext.$(DLIBEXT) ]; then \ mkdir -p $(PREFIX)/$(POUTDIR)/mic; \ $(CP) -va $(OUTDIR)/mic/libxsmmext.$(DLIBEXT)* $(PREFIX)/$(POUTDIR)/mic; \ fi @if [ -e $(OUTDIR)/mic/libxsmmext.$(SLIBEXT) ]; then \ mkdir -p $(PREFIX)/$(POUTDIR)/mic; \ $(CP) -v $(OUTDIR)/mic/libxsmmext.$(SLIBEXT) $(PREFIX)/$(POUTDIR)/mic; \ fi @if [ -e $(OUTDIR)/mic/libxsmmf.$(DLIBEXT) ]; then \ mkdir -p $(PREFIX)/$(POUTDIR)/mic; \ $(CP) -va $(OUTDIR)/mic/libxsmmf.$(DLIBEXT)* $(PREFIX)/$(POUTDIR)/mic; \ fi @if [ -e $(OUTDIR)/mic/libxsmmf.$(SLIBEXT) ]; then \ mkdir -p $(PREFIX)/$(POUTDIR)/mic; \ $(CP) -v $(OUTDIR)/mic/libxsmmf.$(SLIBEXT) $(PREFIX)/$(POUTDIR)/mic; \ fi @if [ -e $(OUTDIR)/mic/libxsmm.$(DLIBEXT) ]; then \ mkdir -p $(PREFIX)/$(POUTDIR)/mic; \ $(CP) -va $(OUTDIR)/mic/libxsmm.$(DLIBEXT)* 
$(PREFIX)/$(POUTDIR)/mic; \ fi @if [ -e $(OUTDIR)/mic/libxsmm.$(SLIBEXT) ]; then \ mkdir -p $(PREFIX)/$(POUTDIR)/mic; \ $(CP) -v $(OUTDIR)/mic/libxsmm.$(SLIBEXT) $(PREFIX)/$(POUTDIR)/mic; \ fi @echo @echo "LIBXSMM installing pkg-config and module files..." @mkdir -p $(PREFIX)/$(PPKGDIR) @$(CP) -v $(OUTDIR)/*.pc $(PREFIX)/$(PPKGDIR) 2>/dev/null || true @if [ ! -e $(PREFIX)/$(PMODDIR)/libxsmm.env ]; then \ mkdir -p $(PREFIX)/$(PMODDIR); \ $(CP) -v $(OUTDIR)/libxsmm.env $(PREFIX)/$(PMODDIR) 2>/dev/null || true; \ fi @echo @echo "LIBXSMM installing stand-alone generators..." @$(CP) -v $(BINDIR)/libxsmm_*_generator $(PREFIX)/$(PBINDIR) 2>/dev/null || true @echo @echo "LIBXSMM installing interface..." @$(CP) -v $(INCDIR)/libxsmm*.h $(PREFIX)/$(PINCDIR) 2>/dev/null || true @$(CP) -v $(INCDIR)/libxsmm.f $(PREFIX)/$(PINCDIR) 2>/dev/null || true @$(CP) -v $(INCDIR)/*.mod* $(PREFIX)/$(PINCDIR) 2>/dev/null || true @echo @echo "LIBXSMM installing header-only..." @$(CP) -r $(ROOTDIR)/$(SRCDIR)/* $(PREFIX)/$(PSRCDIR) >/dev/null 2>/dev/null || true endif .PHONY: install install: install-minimal ifneq ($(PREFIX),$(ABSDIR)) @echo @echo "LIBXSMM installing documentation..." @mkdir -p $(PREFIX)/$(PDOCDIR) @$(CP) -v $(ROOTDIR)/$(DOCDIR)/*.pdf $(PREFIX)/$(PDOCDIR) @$(CP) -v $(ROOTDIR)/$(DOCDIR)/*.md $(PREFIX)/$(PDOCDIR) @$(CP) -v $(ROOTDIR)/SECURITY.md $(PREFIX)/$(PDOCDIR) @$(CP) -v $(ROOTDIR)/version.txt $(PREFIX)/$(PDOCDIR) @sed "s/^\"//;s/\\\n\"$$//;/STATIC=/d" $(DIRSTATE)/.state > $(PREFIX)/$(PDOCDIR)/build.txt 2>/dev/null || true @mkdir -p $(PREFIX)/$(LICFDIR) ifneq ($(call qapath,$(PREFIX)/$(PDOCDIR)/LICENSE.md),$(call qapath,$(PREFIX)/$(LICFDIR)/$(LICFILE))) @$(MV) $(PREFIX)/$(PDOCDIR)/LICENSE.md $(PREFIX)/$(LICFDIR)/$(LICFILE) endif endif .PHONY: install-all install-all: install .PHONY: install-realall install-realall: install samples ifneq ($(PREFIX),$(ABSDIR)) @echo @echo "LIBXSMM installing samples..." 
# ---- LIBXSMM top-level Makefile: installation and packaging section ----
# NOTE(review): the original newline/tab structure of this chunk was flattened
# during extraction; content below is kept byte-identical. Covers: copying the
# sample binaries (cp2k, wrap, dispatch, nek, smm) into $(PREFIX)/$(PBINDIR)
# as part of install-realall; the install-dev target (test binaries); and the
# header of install-artifacts. Each copy is best-effort ("|| true").
@$(CP) -v $(addprefix $(ROOTDIR)/$(SPLDIR)/cp2k/,cp2k cp2k.sh cp2k-perf* cp2k-plot.sh) $(PREFIX)/$(PBINDIR) 2>/dev/null || true @$(CP) -v $(addprefix $(ROOTDIR)/$(SPLDIR)/wrap/,dgemm-blas dgemm-blas.sh dgemm-wrap dgemm-wrap.sh wrap-test.sh) $(PREFIX)/$(PBINDIR) 2>/dev/null || true @$(CP) -v $(addprefix $(ROOTDIR)/$(SPLDIR)/dispatch/,dispatch dispatch.sh) $(PREFIX)/$(PBINDIR) 2>/dev/null || true @$(CP) -v $(addprefix $(ROOTDIR)/$(SPLDIR)/nek/,axhm grad rstr *.sh) $(PREFIX)/$(PBINDIR) 2>/dev/null || true @$(CP) -v $(addprefix $(ROOTDIR)/$(SPLDIR)/smm/,smm smm.sh smm-perf* smmf-perf.sh smm-plot.sh) $(PREFIX)/$(PBINDIR) 2>/dev/null || true @$(CP) -v $(addprefix $(ROOTDIR)/$(SPLDIR)/smm/,specialized specialized.sh) $(PREFIX)/$(PBINDIR) 2>/dev/null || true @$(CP) -v $(addprefix $(ROOTDIR)/$(SPLDIR)/smm/,dispatched dispatched.sh) $(PREFIX)/$(PBINDIR) 2>/dev/null || true @$(CP) -v $(addprefix $(ROOTDIR)/$(SPLDIR)/smm/,inlined inlined.sh) $(PREFIX)/$(PBINDIR) 2>/dev/null || true @$(CP) -v $(addprefix $(ROOTDIR)/$(SPLDIR)/smm/,blas blas.sh) $(PREFIX)/$(PBINDIR) 2>/dev/null || true endif .PHONY: install-dev install-dev: install-realall build-tests ifneq ($(PREFIX),$(ABSDIR)) @echo @echo "LIBXSMM installing tests..." @mkdir -p $(PREFIX)/$(PTSTDIR) @$(CP) -v $(basename $(wildcard $(ROOTDIR)/$(TSTDIR)/*.c)) $(PREFIX)/$(PTSTDIR) 2>/dev/null || true endif .PHONY: install-artifacts install-artifacts: install-dev ifneq ($(PREFIX),$(ABSDIR)) @echo @echo "LIBXSMM installing artifacts..." 
# install-artifacts recipe continues (copies the build-state file as make.txt).
# Then: ALIAS_PRIVLIBS selects per-OS private link libraries for pkg-config
# "Libs.private"; ALIAS_INCLUDEDIR/ALIAS_LIBDIR rewrite absolute paths to
# ${prefix}-relative form; rules generate libxsmm.pc and libxsmmf.pc.
@mkdir -p $(PREFIX)/$(PDOCDIR)/artifacts @$(CP) -v $(DIRSTATE)/.state $(PREFIX)/$(PDOCDIR)/artifacts/make.txt endif ifeq (Windows_NT,$(UNAME)) ALIAS_PRIVLIBS := $(call ldlib,$(LD),$(SLDFLAGS),dbghelp) else ifneq (Darwin,$(UNAME)) ifneq (FreeBSD,$(UNAME)) ALIAS_PRIVLIBS := $(LIBPTHREAD) $(LIBRT) $(LIBDL) $(LIBM) $(LIBC) else ALIAS_PRIVLIBS := $(LIBDL) $(LIBM) $(LIBC) endif endif ifneq (Darwin,$(UNAME)) ALIAS_PRIVLIBS_EXT := -fopenmp endif ALIAS_INCLUDEDIR := $(subst $$$$,$(if $(findstring $$$$/,$$$$$(PINCDIR)),,\$${prefix}/),$(subst $$$$$(ALIAS_PREFIX),\$${prefix},$$$$$(PINCDIR))) ALIAS_LIBDIR := $(subst $$$$,$(if $(findstring $$$$/,$$$$$(POUTDIR)),,\$${prefix}/),$(subst $$$$$(ALIAS_PREFIX),\$${prefix},$$$$$(POUTDIR))) $(OUTDIR)/libxsmm.pc: $(OUTDIR)/libxsmm.$(LIBEXT) @echo "Name: libxsmm" > $@ @echo "Description: Matrix operations and deep learning primitives" >> $@ @echo "URL: https://github.com/hfp/libxsmm" >> $@ @echo "Version: $(VERSION_STRING)" >> $@ @echo >> $@ @echo "prefix=$(ALIAS_PREFIX)" >> $@ @echo "includedir=$(ALIAS_INCLUDEDIR)" >> $@ @echo "libdir=$(ALIAS_LIBDIR)" >> $@ @echo >> $@ @echo "Cflags: -I\$${includedir}" >> $@ ifneq (,$(ALIAS_PRIVLIBS)) @if [ -e $(OUTDIR)/libxsmm.$(DLIBEXT) ]; then \ echo "Libs: -L\$${libdir} -lxsmm" >> $@; \ echo "Libs.private: $(ALIAS_PRIVLIBS)" >> $@; \ else \ echo "Libs: -L\$${libdir} -lxsmm $(ALIAS_PRIVLIBS)" >> $@; \ fi else # no private libraries @echo "Libs: -L\$${libdir} -lxsmm" >> $@ endif $(OUTDIR)/libxsmmf.pc: $(OUTDIR)/libxsmmf.$(LIBEXT) @echo "Name: libxsmm/f" > $@ @echo "Description: LIBXSMM for Fortran" >> $@ @echo "URL: https://github.com/hfp/libxsmm" >> $@ @echo "Version: $(VERSION_STRING)" >> $@ @echo >> $@ @echo "prefix=$(ALIAS_PREFIX)" >> $@ @echo "includedir=$(ALIAS_INCLUDEDIR)" >> $@ @echo "libdir=$(ALIAS_LIBDIR)" >> $@ @echo >> $@ @echo "Requires: libxsmm" >> $@ @echo "Cflags: -I\$${includedir}" >> $@ @echo "Libs: -L\$${libdir} -lxsmmf" >> $@ $(OUTDIR)/libxsmmext.pc: $(OUTDIR)/libxsmmext.$(LIBEXT) 
# libxsmmext.pc (OpenMP-enabled variant; ALIAS_PRIVLIBS_EXT becomes
# Libs.private when a shared library exists), libxsmmnoblas.pc, the
# environment-modules file libxsmm.env, and the start of the "deb" target
# (derives VERSION_ARCHIVE and the SONAME suffix from git tags).
@echo "Name: libxsmm/ext" > $@ @echo "Description: LIBXSMM/multithreaded for OpenMP" >> $@ @echo "URL: https://github.com/hfp/libxsmm" >> $@ @echo "Version: $(VERSION_STRING)" >> $@ @echo >> $@ @echo "prefix=$(ALIAS_PREFIX)" >> $@ @echo "includedir=$(ALIAS_INCLUDEDIR)" >> $@ @echo "libdir=$(ALIAS_LIBDIR)" >> $@ @echo >> $@ @echo "Requires: libxsmm" >> $@ @echo "Cflags: -I\$${includedir}" >> $@ ifneq (,$(ALIAS_PRIVLIBS_EXT)) @if [ -e $(OUTDIR)/libxsmmext.$(DLIBEXT) ]; then \ echo "Libs: -L\$${libdir} -lxsmmext" >> $@; \ echo "Libs.private: $(ALIAS_PRIVLIBS_EXT)" >> $@; \ else \ echo "Libs: -L\$${libdir} -lxsmmext $(ALIAS_PRIVLIBS_EXT)" >> $@; \ fi else # no private libraries @echo "Libs: -L\$${libdir} -lxsmmext" >> $@ endif $(OUTDIR)/libxsmmnoblas.pc: $(OUTDIR)/libxsmmnoblas.$(LIBEXT) @echo "Name: libxsmm/noblas" > $@ @echo "Description: LIBXSMM substituted LAPACK/BLAS dependency" >> $@ @echo "URL: https://github.com/hfp/libxsmm" >> $@ @echo "Version: $(VERSION_STRING)" >> $@ @echo >> $@ @echo "prefix=$(ALIAS_PREFIX)" >> $@ @echo "includedir=$(ALIAS_INCLUDEDIR)" >> $@ @echo "libdir=$(ALIAS_LIBDIR)" >> $@ @echo >> $@ @echo "Requires: libxsmm" >> $@ @echo "Cflags: -I\$${includedir}" >> $@ @echo "Libs: -L\$${libdir} -lxsmmnoblas" >> $@ $(OUTDIR)/libxsmm.env: $(OUTDIR)/.make $(INCDIR)/libxsmm.h @echo "#%Module1.0" > $@ @echo >> $@ @echo "module-whatis \"LIBXSMM $(VERSION_STRING)\"" >> $@ @echo >> $@ @echo "set PREFIX \"$(ALIAS_PREFIX)\"" >> $@ @echo "prepend-path PATH \"\$$PREFIX/bin\"" >> $@ @echo "prepend-path LD_LIBRARY_PATH \"\$$PREFIX/lib\"" >> $@ @echo >> $@ @echo "prepend-path PKG_CONFIG_PATH \"\$$PREFIX/lib\"" >> $@ @echo "prepend-path LIBRARY_PATH \"\$$PREFIX/lib\"" >> $@ @echo "prepend-path CPATH \"\$$PREFIX/include\"" >> $@ .PHONY: deb deb: @if [ "" != "$$(command -v git)" ]; then \ VERSION_ARCHIVE=$$(git describe --tags --abbrev=0 2>/dev/null); \ VERSION_ARCHIVE_SONAME=$$($(PYTHON) $(ROOTDIR)/$(SCRDIR)/libxsmm_utilities.py 0 $${VERSION_ARCHIVE}); \ fi; \ if 
# deb recipe (middle): resolves author from git config (warns when unset),
# creates the .orig tarball via "git archive" once, unpacks it, and writes
# debian/source/format plus debian/control; the package description is
# fetched from the GitHub API and folded to policy width.
[ "" != "$${VERSION_ARCHIVE}" ] && [ "" != "$${VERSION_ARCHIVE_SONAME}" ]; then \ ARCHIVE_AUTHOR_NAME="$$(git config user.name)"; \ ARCHIVE_AUTHOR_MAIL="$$(git config user.email)"; \ ARCHIVE_NAME=libxsmm$${VERSION_ARCHIVE_SONAME}; \ ARCHIVE_DATE="$$(LANG=C date -R)"; \ if [ "" != "$${ARCHIVE_AUTHOR_NAME}" ] && [ "" != "$${ARCHIVE_AUTHOR_MAIL}" ]; then \ ARCHIVE_AUTHOR="$${ARCHIVE_AUTHOR_NAME} <$${ARCHIVE_AUTHOR_MAIL}>"; \ else \ echo "Warning: Please git-config user.name and user.email!"; \ if [ "" != "$${ARCHIVE_AUTHOR_NAME}" ] || [ "" != "$${ARCHIVE_AUTHOR_MAIL}" ]; then \ ARCHIVE_AUTHOR="$${ARCHIVE_AUTHOR_NAME}$${ARCHIVE_AUTHOR_MAIL}"; \ fi \ fi; \ if ! [ -e $${ARCHIVE_NAME}_$${VERSION_ARCHIVE}.orig.tar.gz ]; then \ git archive --prefix $${ARCHIVE_NAME}-$${VERSION_ARCHIVE}/ \ -o $${ARCHIVE_NAME}_$${VERSION_ARCHIVE}.orig.tar.gz $(VERSION_RELEASE); \ fi; \ tar xf $${ARCHIVE_NAME}_$${VERSION_ARCHIVE}.orig.tar.gz; \ cd $${ARCHIVE_NAME}-$${VERSION_ARCHIVE}; \ mkdir -p debian/source; cd debian/source; \ echo "3.0 (quilt)" > format; \ cd ..; \ echo "Source: $${ARCHIVE_NAME}" > control; \ echo "Section: libs" >> control; \ echo "Homepage: https://github.com/hfp/libxsmm" >> control; \ echo "Vcs-Git: https://github.com/hfp/libxsmm/libxsmm.git" >> control; \ echo "Maintainer: $${ARCHIVE_AUTHOR}" >> control; \ echo "Priority: optional" >> control; \ echo "Build-Depends: debhelper (>= 9)" >> control; \ echo "Standards-Version: 3.9.8" >> control; \ echo >> control; \ echo "Package: $${ARCHIVE_NAME}" >> control; \ echo "Section: libs" >> control; \ echo "Architecture: amd64" >> control; \ echo "Depends: \$${shlibs:Depends}, \$${misc:Depends}" >> control; \ echo "Description: Matrix operations and deep learning primitives" >> control; \ wget -T $(TIMEOUT) -qO- "https://api.github.com/repos/hfp/libxsmm" \ | sed -n 's/ *\"description\": \"\(..*\)\".*/\1/p' \ | fold -s -w 79 | sed -e 's/^/ /' -e 's/[[:space:]][[:space:]]*$$//' >> control; \ echo "$${ARCHIVE_NAME} 
# deb recipe (end): changelog built from the GitHub release notes, rules file
# driving dh with prefix=/usr, then debuild with staged PREFIX/PDOCDIR and
# SHARED/SYM settings (-us -uc: unsigned). The trailing bytes of this line are
# tar-member residue ("ustar" header of Makefile.inc) followed by the license
# banner and the first MAKE_VERSION parsing statement of Makefile.inc; they
# are preserved verbatim.
($${VERSION_ARCHIVE}-$(VERSION_PACKAGE)) UNRELEASED; urgency=low" > changelog; \ echo >> changelog; \ wget -T $(TIMEOUT) -qO- "https://api.github.com/repos/hfp/libxsmm/releases/tags/$${VERSION_ARCHIVE}" \ | sed -n 's/ *\"body\": \"\(..*\)\".*/\1/p' \ | sed -e 's/\\r\\n/\n/g' -e 's/\\"/"/g' -e 's/\[\([^]]*\)\]([^)]*)/\1/g' \ | sed -n 's/^\* \(..*\)/\* \1/p' \ | fold -s -w 78 | sed -e 's/^/ /g' -e 's/^ \* /\* /' -e 's/^/ /' -e 's/[[:space:]][[:space:]]*$$//' >> changelog; \ echo >> changelog; \ echo " -- $${ARCHIVE_AUTHOR} $${ARCHIVE_DATE}" >> changelog; \ echo "#!/usr/bin/make -f" > rules; \ echo "export DH_VERBOSE = 1" >> rules; \ echo >> rules; \ echo "%:" >> rules; \ $$(which echo) -e "\tdh \$$@" >> rules; \ echo >> rules; \ echo "override_dh_auto_install:" >> rules; \ $$(which echo) -e "\tdh_auto_install -- prefix=/usr" >> rules; \ echo >> rules; \ echo "9" > compat; \ $(CP) ../LICENSE.md copyright; \ rm -f ../$(TSTDIR)/mhd_test.mhd; \ chmod +x rules; \ debuild \ -e PREFIX=debian/$${ARCHIVE_NAME}/usr \ -e PDOCDIR=share/doc/$${ARCHIVE_NAME} \ -e LICFILE=copyright \ -e LICFDIR=../.. \ -e SONAMELNK=1 \ -e SHARED=1 \ -e SYM=1 \ -us -uc; \ else \ echo "Error: Git is unavailable or make-deb runs outside of cloned repository!"; \ fi libxsmm-1.17/Makefile.inc000066400000000000000000002344431415223013700153350ustar00rootroot00000000000000############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### MAKE_VERSION_MAJOR := $(shell echo "$(MAKE_VERSION)" | cut -d. 
# ---- Makefile.inc: GNU Make version and environment detection ----
# Continues the MAKE_VERSION major/minor/patch parsing, computes a single
# comparable integer MAKE_VERSION_INT, decides the parallel-build policy
# (MAKE_PARALLEL=0 forces .NOTPARALLEL; recent Make gets "-O" output sync),
# determines MAKEINC from MAKEFILE_LIST, and defines the "which" helper
# (prefers "command -v"). Then: OS/arch detection (MNAME/UNAME), default
# library extensions per platform, and utility discovery (stdbuf/cp/mv).
-f1) MAKE_VERSION_MINOR := $(shell echo "$(MAKE_VERSION)" | cut -d. -f2) MAKE_VERSION_PATCH := $(shell echo "$(MAKE_VERSION)" | cut -d. -f3) ifeq (,$(MAKE_VERSION_MAJOR)) MAKE_VERSION_MAJOR := 0 endif ifeq (,$(MAKE_VERSION_MINOR)) MAKE_VERSION_MINOR := 0 endif ifeq (,$(MAKE_VERSION_PATCH)) MAKE_VERSION_PATCH := 0 endif MAKE_VERSION_INT := $(shell echo "$$(($(MAKE_VERSION_MAJOR)*10000+$(MAKE_VERSION_MINOR)*100+$(MAKE_VERSION_PATCH)))") # Automatically disable parallel builds # depending on the version of GNU Make. # MAKE_PARALLEL=0: disable explicitly # MAKE_PARALLEL=1: enable explicitly ifeq (0,$(MAKE_PARALLEL)) .NOTPARALLEL: else ifeq (,$(strip $(MAKE_PARALLEL))) # Force parallel build for old GNU Make #ifneq (0,$(shell echo "$$((38200>$(MAKE_VERSION_INT)))")) #.NOTPARALLEL: #else ifneq (0,$(shell echo "$$((40201<$(MAKE_VERSION_INT)))")) MAKEFLAGS += -O endif else ifneq (0,$(shell echo "$$((40201<$(MAKE_VERSION_INT)))")) MAKEFLAGS += -O endif #MAKEINC := $(abspath $(dir $(filter %Makefile.inc,$(MAKEFILE_LIST)))) MAKEINC := $(patsubst %/,%,$(dir $(filter %Makefile.inc,$(MAKEFILE_LIST)))) ifeq (d,$(filter d,$(MAKEFLAGS))) #SHELL := bash -xv SHELL += -xv endif COMMAND := $(shell which command 2>/dev/null) ifneq (,$(COMMAND)) which = $(shell $(COMMAND) -v $1) else which = $(shell which $(firstword $1) 2>/dev/null) endif # ensure a number or instead return zero (by default) or 2nd argument (if given) qnum = $(shell echo "$1" | grep "^-*[0-9][0-9]*$$" 2>/dev/null || echo "$(if $2,$2,0)") ifeq (,$(HOSTNAME)) HOSTNAME := $(shell hostname 2>/dev/null) endif ifeq (Windows_NT,$(OS)) MNAME ?= $(if $(filter AMD64,$(PROCESSOR_ARCHITECTURE)),x86_64,$(PROCESSOR_ARCHITECTURE)) UNAME ?= Windows_NT # Cygwin/MinGW based DLIBEXT ?= dll SLIBEXT ?= lib else MNAME ?= $(shell uname -m 2>/dev/null) UNAME ?= $(shell uname 2>/dev/null) ifneq (Darwin,$(UNAME)) ENVBIN ?= $(call which,env) endif endif # Command line utilities #PKGCFG ?= $(call which,pkg-config) ifneq (,$(call which,stdbuf)) 
# Python interpreter selection (python3 preferred; a python3 symlink is
# created next to Makefile.inc when only "python" exists on non-Windows),
# cp's "-u" flag on non-Darwin/FreeBSD, "main" regex patterns for C/Fortran,
# and the SYM/DBG/INSTRUMENT/TRACE debugging knobs feeding DFLAGS and OPT.
FLUSH ?= stdbuf -o0 -e0 endif CP ?= $(call which,cp) MV ?= $(call which,mv) MAKE ?= make # Python interpreter per PYTHON=/path/to/python PYTHON3 := $(call which,python3) # Python3 by default ifneq (,$(PYTHON3)) PYTHON := $(PYTHON3) else ifneq (,$(call which,python)) ifneq (Windows_NT,$(UNAME)) #SHELL := $(ENVBIN) PATH=$(MAKEINC):$(PATH) $(SHELL) PYTHON3 := $(shell ln -s $(call which,python) $(MAKEINC)/python3 2>/dev/null) PYTHON := $(MAKEINC)/python3 else PYTHON := python endif endif ifneq (Darwin,$(UNAME)) ifneq (,$(strip $(CP))) ifneq (FreeBSD,$(UNAME)) CP += -u endif endif DLIBEXT ?= so SLIBEXT ?= a else DLIBEXT ?= dylib SLIBEXT ?= a endif # Regular expression to match "main" (good-enough pattern) CMAIN := main[[:space:]]*(.*) FMAIN := ^[[:space:]]*PROGRAM[[:space:]][[:space:]]*\w\w*\([[:space:]][[:space:]]*\!.*\)*$$ # Regular expression to mazch variable name (Make-key) VNAME := ^ *[^0-9[:punct:]][A-Z0-9_][A-Z0-9_]* # Debugging and symbols (e.g., when profiling) SYM ?= 0 DBG ?= 0 # Instrumentation level (trace) ifeq (,$(strip $(INSTRUMENT))) INSTRUMENT := 0 endif TRACE ?= 0 ifeq (0,$(DBG)) ifneq (0,$(INSTRUMENT)) SYM := $(INSTRUMENT) endif ifeq (0,$(shell echo "$$((1<$(SYM) || 0>$(SYM)))")) DFLAGS += -DNDEBUG endif else # debugging enabled ifneq (0,$(shell echo "$$((1<$(DBG) || 0>$(DBG)))")) DFLAGS += -D_DEBUG endif SYM := $(DBG) endif # Optimization level ifeq (0,$(DBG)) OPT ?= 2 else OPT ?= 0 endif # Optimization flag derived from OPT flag OPTFLAG ?= -O$(patsubst O%,%,$(OPT)) # Kind of Clang-analysis (thread, address, ...) 
# Feature knobs (SANITIZE, COMPATIBLE, VISIBILITY, TESTSIZE, PYMOD, STATIC,
# PIC, PLATFORM, OFFLOAD/KNC), LIBNAME selection for host vs MIC builds,
# internal helper scripts (.mktmp.sh/.flock.sh), threading/OMP defaults,
# and further build options (PEDANTIC, IPO, ILP64, TBB, SONAMELNK, SPACES).
SANITIZE ?= $(NULL) # Avoid more sophisticated flags of the GCC tool chain, # and improve compatibility with compilers supposed to be # compatible with the GCC tool chain COMPATIBLE ?= 0 # Control visibility of symbols # 0: hidden unless explicitly marked visible # 1: default visibility VISIBILITY ?= 0 # Number of repeated calls (tests), # or used to scale the problem size TESTSIZE ?= 1 # PYMOD=1: enable Python module development PYMOD ?= 0 # Static or shared binary ifneq (0,$(PYMOD)) # req. for Python module override STATIC := 0 else STATIC ?= 0 endif # PIC: PIC or pic PIC ?= pic PLATFORM ?= 0 ifneq (0,$(call qnum,$(PLATFORM))) # NaN DFLAGS += -DLIBXSMM_PLATFORM_FORCE endif OFFLOAD ?= 0 ifneq (0,$(OFFLOAD)) MPSS ?= 1 KNC ?= 1 else MPSS ?= 0 KNC ?= 0 endif DEPDIR ?= $(ROOTDIR) ifeq (0,$(KNC)) LIBNAME ?= $(DEPDIR)/lib/libxsmm else ifneq (3,$(AVX)) ifeq (0,$(OFFLOAD)) LIBNAME ?= $(DEPDIR)/lib/mic/libxsmm else LIBNAME ?= $(DEPDIR)/lib/libxsmm endif else LIBNAME ?= $(DEPDIR)/lib/libxsmm endif # Additional library search paths LIBFIND ?= /usr/local/lib # Internal utilities MKTEMP := $(DEPDIR)/.mktmp.sh FLOCK := $(DEPDIR)/.flock.sh # THREADS refers to foundational TRT (and not necessarily Posix Threads) THREADS ?= 1 # Threading runtime ifeq (0,$(THREADS)) override OMP := 0 endif OMP ?= 0 # Code conformance (beyond -Wall) PEDANTIC ?= 0 # Warning about unused functions UNUSED ?= 0 # Embed InterProcedural Optimization information into libraries IPO ?= 0 FAT ?= 0 # ILP64=0 (LP64 with 32-bit integers), and ILP64=0 (64-bit integers) ILP64 ?= 0 # TBB Malloc enabled (1) or disabled (0) # availability depends on TBBROOT TBB_MALLOC ?= 0 # TBB runtime compatible with oldest supported GCC TBB_OLDRTL ?= 0 # Enable absolute library paths ABSLIBS ?= 0 # Embedd soname into shared library SONAMELNK ?= 2 # utilities to handle paths with spaces SPACES ?= 0 REVERSION0 := [0-9][0-9]*\.[0-9][0-9]*\.*[0-9]* REVERSION1 := s/..* \($(REVERSION0)\)[ \S]*.*/\1/ REVERSION2 := s/..* 
# Quoting helpers: with SPACES!=0 the quote/unquote/qapath/qndir/qdir macros
# shell out to handle paths containing blanks; otherwise they are thin
# wrappers over Make's abspath/notdir/dir. qsuffix/qbname/qxdir/qname derive
# path components; ABSDIR/HOMEDIR/HEREDIR are resolved absolute directories.
\([0-9]\{5\}[0-9]*\)[ \S]*.*/\1/ CHAR_OPEN := ( CHAR_CLOSE := ) CHAR_HASH := \# ifneq (0,$(SPACES)) unquote = $(shell echo "$1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12}" | sed -e 's/^[" ][" ]*//' -e 's/[" ][" ]*$$//') quote = $(strip $(if $(filter 0 1,$(words $1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12})), \ $1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12}, \ "$(call unquote,$1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12})")) qapath = $(call quote,$(shell export "VAR=$(call unquote,$1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12})" && if [ "$${VAR}" ]; then \ if [ ! -d "$${VAR}" ]; then cd "$$(dirname "$${VAR}" 2>/dev/null)" 2>/dev/null && echo "$$(pwd -P)/$$(basename "$${VAR}" 2>/dev/null)"; \ else cd "$${VAR}" 2>/dev/null && pwd -P; fi; fi)) qndir = $(call quote,$(shell export "VAR=$(call unquote,$1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12})" && \ if [ ! -d "$${VAR}" ]; then basename "$${VAR}" 2>/dev/null; fi)) qdir = $(call quote,$(shell dirname "$(call unquote,$1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12})x" 2>/dev/null)/) else quote = $(strip $(subst "",$(NULL),$1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12})) qapath = $(abspath $(call quote, $1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12})) qndir = $(notdir $(call quote, $1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12})) qdir = $(dir $(call quote, $1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12})) endif qsuffix = $(suffix $(strip $1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12})) qbname = $(basename $(call quote,$1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12})) qxdir = $(shell if [ -d "$1" ]; then echo "$1"; else echo "$(call qdir,$1)"; fi) qname = $(basename $(call quote,$(shell echo "$1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12}" | sed "s/\.$(REVERSION0)//"))) # absolute directory paths ABSDIR := $(call qapath,$(ROOTDIR)) HOMEDIR := $(call qapath,$(HOME)) HEREDIR := $(call qapath,.) 
# ---- Makefile.inc: compiler selection and version parsing ----
# wildcard1 picks the last sorted wildcard match; OMPRT derives an OpenMP
# runtime name from a non-numeric OMP value; uniqadd/uniqmov de-duplicate
# list entries; ctest/ftest probe whether the C/Fortran compiler accepts a
# flag by compiling a throwaway program (empty result on failure);
# ldclib/ldflib probe -l<lib> linkability.
wildcard1 = $(strip $(lastword $(sort $(wildcard $1)))) # Pickup OpenMP library name if passed as OMP=libname|name ifeq (NaN,$(call qnum,$(OMP),NaN)) OMPRT := $(patsubst lib%,%,$(OMP)) endif OMPRT ?= $(NULL) uniqadd = $(if $($2),$(filter-out $($2),$($1)) $(if $3,$3,$($2)),$(filter-out $2,$($1)) $(if $3,$3,$2)) uniqmov = $(if $(filter $($2),$($1)),$(call uniqadd,$1,$2,$3),$($1)) ctest = $(if $1,$(if $(shell INFILE=$$($(MKTEMP) /tmp/.libxsmm_XXXXXX.c) && \ echo "int main(void) { return 0; }" > $${INFILE} && if [ "$3" ]; \ then $(firstword $1) -c $${INFILE} -o $${INFILE}.o >/dev/null && \ $1 $2 $${INFILE}.o -o $${INFILE}.x >/dev/null; \ else $(firstword $1) $2 $${INFILE} -o $${INFILE}.x >/dev/null; fi 2>&1 || echo "FAILED"; \ rm -f /tmp/$$(basename $${INFILE} .c).* .libxsmm_??????.c.* 2>/dev/null),$(NULL),$2)) # INFILE cannot use leading dot when used with certain F-compiler ftest = $(if $1,$(if $(shell INFILE=$$($(MKTEMP) /tmp/_libxsmm_XXXXXX.f) && \ printf " PROGRAM test\n END PROGRAM\n" > $${INFILE} && if [ "$3" ]; \ then $(firstword $1) -c $${INFILE} -o $${INFILE}.o >/dev/null && \ $1 $2 $${INFILE}.o -o $${INFILE}.x >/dev/null; \ else $(firstword $1) $2 $${INFILE} -o $${INFILE}.x >/dev/null; fi 2>&1 || echo "FAILED"; \ rm -f /tmp/$$(basename $${INFILE} .f).* _libxsmm_??????.f.* 2>/dev/null),$(NULL),$2)) ldclib = $(if $1,$(call ctest,$1 $2,-l$(strip $3))) ldflib = $(if $1,$(call ftest,$1 $2,-l$(strip $3))) # Automatically pickup the environment (make -e is not required), # or pickup the Intel Compiler (if available). 
# Compiler pickup: unless GNU=1, CXX/CC default to Intel compilers when
# present (icpc/icpx, icc/icx). INTEL becomes 1/0 depending on whether both
# C and C++ drivers identify as Intel; FC then defaults to ifx/ifort.
# Non-Intel fallbacks revert to g++/gcc when the named compiler is missing
# or does not run. ICX=1 marks the next-gen (LLVM-based) Intel C compiler,
# detected by preprocessing the __INTEL_LLVM_COMPILER macro.
GNU ?= 0 ifeq (0,$(GNU)) ifeq (,$(shell echo "$${CXX}")) ifneq (,$(filter icpc icpc1,$(call qndir,$(call which,icpc))$(INTEL))) CXX := icpc else ifneq (,$(filter icpx icpx2,$(call qndir,$(call which,icpx))$(INTEL))) CXX := icpx endif endif ifeq (,$(shell echo "$${CC}")) ifneq (,$(filter icc icc1,$(call qndir,$(call which,icc))$(INTEL))) CC := icc else ifneq (,$(filter icx icx2,$(call qndir,$(call which,icx))$(INTEL))) CC := icx endif endif endif # adopt extra flags from C if not set individually ECXXFLAGS ?= $(ECFLAGS) COMPILER_VERSION_FLAG ?= $(ECXXFLAGS) $(EFLAGS) --version 2>/dev/null CC_VERSION_FLAG ?= $(COMPILER_VERSION_FLAG) FC_VERSION_FLAG ?= $(COMPILER_VERSION_FLAG) CXX_VERSION_FLAG ?= $(CC_VERSION_FLAG) # check if the Intel Development Tools are available # 1: classic, 2: next-gen (opt-in) INTEL ?= $(if $(filter 2,$(words $(filter icpc% icpx% icc% icx%, \ $(shell $(CXX) $(CXX_VERSION_FLAG) 2>/dev/null | head -n1 | cut -d' ' -f1) \ $(shell $(CC) $(CC_VERSION_FLAG) 2>/dev/null | head -n1 | cut -d' ' -f1)))),1,0) ifneq (0,$(INTEL)) ifeq (,$(shell echo "$${FC}")) ifeq (ifx$(INTEL),$(call qndir,$(call which,ifx))$(filter-out 1,$(INTEL))) FC := ifx else ifneq (,$(call qndir,$(call which,ifort))) FC := ifort endif endif else ifeq (,$(call which,$(CXX))) CXX := g++ else ifneq (0,$(shell $(CXX) $(CXX_VERSION_FLAG) >/dev/null 2>/dev/null; echo "$$?")) CXX := g++ else ifneq (gcc,$(call qndir,$(call which,$(CC)))) ifeq (g++,$(call qndir,$(call which,$(CXX)))) CC := gcc endif endif ifeq (,$(call which,$(CC))) CC := gcc else ifneq (0,$(shell $(CC) $(CC_VERSION_FLAG) >/dev/null 2>/dev/null; echo "$$?")) CC := gcc endif ifeq (Cray,$(shell $(CC) -V 2>&1 | head -n1 | cut -d' ' -f1)) COMPILER_VERSION_FLAG ?= -V 2>&1 endif endif ifneq (,$(filter-out __INTEL_LLVM_COMPILER,$(shell echo "__INTEL_LLVM_COMPILER" \ | $(CC) $(call ctest,$(CC),$(ECFLAGS) $(EFLAGS)) -E -P - 2>/dev/null \ | sed "/^[[:space:]]*$$/d" 2>/dev/null))) ICX := 1 endif ifneq (,$(filter-out 
# ICX probe for the Fortran driver (ICX=2 when both C and Fortran are
# next-gen Intel); linker defaults LD/XLD; DEPSTATIC derived from STATIC or
# an existing static library; LNKSOFT and the BLAS default (0 unless soft
# linking is off); NOBLAS honors an explicit user BLAS=0.
__INTEL_LLVM_COMPILER,$(shell echo "__INTEL_LLVM_COMPILER" \ | $(FC) $(call ftest,$(FC),$(EFCFLAGS) $(EFLAGS)) -E -P /dev/stdin 2>/dev/null \ | sed "/^[[:space:]]*$$/d" 2>/dev/null; rm -f stdin.f))) ICX := $(if $(filter 1,$(ICX)),2,1) FIXFC ?= 0 endif ICX ?= 0 # linker setup LD := $(CC) XLD := $(CXX) # Secondary static ifneq (file,$(origin STATIC)) # prefer user override/preference (in any case) DEPSTATIC ?= $(STATIC) else ifneq (,$(wildcard $(LIBNAME).$(SLIBEXT)*)) # prefer static (library exists) DEPSTATIC ?= 1 else ifneq (0,$(STATIC)) DEPSTATIC ?= $(STATIC) else DEPSTATIC ?= 0 endif LNKSOFT ?= 1 ifeq (0,$(STATIC)) ifeq (Windows_NT,$(UNAME)) LNKSOFT := 0 else ifeq (Darwin,$(UNAME)) LNKSOFT := 0 endif endif # BLAS is not used by default ifneq (0,$(LNKSOFT)) BLAS ?= 0 else BLAS ?= 2 endif # Explicitly disable BLAS by user's intervention # Makefile defines what happens (perhaps nothing) NOBLAS ?= 0 DNOBLAS := -D__BLAS=0 ifneq (,$(filter environment% override command%,$(origin BLAS))) ifeq (0,$(BLAS)) NOBLAS := 1 endif endif ifneq (0,$(NOBLAS)) BLAS_FLAGS += $(DNOBLAS) endif CXX_VERSION := $(shell $(CXX) $(CXX_VERSION_FLAG) | head -n1 | sed -n "$(REVERSION1)p") ifeq (,$(CXX_VERSION)) CXX_VERSION := $(shell $(CXX) $(CXX_VERSION_FLAG) | head -n1 | sed -n "$(REVERSION2)p") endif CXX_VERSION_MAJOR := $(shell echo "$(CXX_VERSION)" | cut -d. -f1) CXX_VERSION_MINOR := $(shell echo "$(CXX_VERSION)" | cut -d. -f2) CXX_VERSION_PATCH := $(shell echo "$(CXX_VERSION)" | cut -d. 
# CXX/CC version strings parsed into major/minor/patch and folded into
# *_VERSION_NUM = major*10000+minor*100+patch (patch defaults to 0 when only
# major.minor is available). FORTRAN=0 clears FC; FIXFC falls back to
# gfortran when the configured FC is missing or fails to run.
-f3) ifeq (3,$(words $(CXX_VERSION_MAJOR) $(CXX_VERSION_MINOR) $(CXX_VERSION_PATCH))) CXX_VERSION_NUM := $(shell echo "$$(($(CXX_VERSION_MAJOR)*10000+$(CXX_VERSION_MINOR)*100+$(CXX_VERSION_PATCH)))") else ifeq (2,$(words $(CXX_VERSION_MAJOR) $(CXX_VERSION_MINOR))) CXX_VERSION_NUM := $(shell echo "$$(($(CXX_VERSION_MAJOR)*10000+$(CXX_VERSION_MINOR)*100))") CXX_VERSION_PATCH := 0 else CXX_VERSION := $(NULL) CXX_VERSION_NUM := 0 endif CC_VERSION := $(shell $(CC) $(CC_VERSION_FLAG) | head -n1 | sed -n "$(REVERSION1)p") ifeq (,$(CC_VERSION)) CC_VERSION := $(shell $(CC) $(CC_VERSION_FLAG) | head -n1 | sed -n "$(REVERSION2)p") endif CC_VERSION_MAJOR := $(shell echo "$(CC_VERSION)" | cut -d. -f1) CC_VERSION_MINOR := $(shell echo "$(CC_VERSION)" | cut -d. -f2) CC_VERSION_PATCH := $(shell echo "$(CC_VERSION)" | cut -d. -f3) ifeq (3,$(words $(CC_VERSION_MAJOR) $(CC_VERSION_MINOR) $(CC_VERSION_PATCH))) CC_VERSION_NUM := $(shell echo "$$(($(CC_VERSION_MAJOR)*10000+$(CC_VERSION_MINOR)*100+$(CC_VERSION_PATCH)))") else ifeq (2,$(words $(CC_VERSION_MAJOR) $(CC_VERSION_MINOR))) CC_VERSION_NUM := $(shell echo "$$(($(CC_VERSION_MAJOR)*10000+$(CC_VERSION_MINOR)*100))") CC_VERSION_PATCH := 0 else CC_VERSION := $(NULL) CC_VERSION_NUM := 0 endif # disable Fortran per user-request ifeq (0,$(FORTRAN)) override FC := $(NULL) endif # fixup FC-default given by MAKE ifneq (,$(strip $(FC))) ifeq (,$(call which,$(FC))) FIXFC ?= 1 else ifneq (0,$(shell $(FC) $(FC_VERSION_FLAG) >/dev/null 2>/dev/null; echo "$$?")) FIXFC ?= 1 else ifneq (gfortran,$(call qndir,$(call which,$(FC)))) ifeq (g++,$(call qndir,$(call which,$(CXX)))) FIXFC ?= 1 endif endif endif FIXFC ?= 0 ifneq (0,$(FIXFC)) ifneq (,$(call qndir,$(call which,gfortran))) MKL_FCRTL := gf GFC := gfortran FC := $(GFC) else FC := $(NULL) endif endif ifneq (,$(strip $(FC))) FC_VERSION := $(shell $(FC) $(FC_VERSION_FLAG) | head -n1 | sed -n "$(REVERSION1)p") ifeq (,$(FC_VERSION)) FC_VERSION := $(shell $(FC) $(FC_VERSION_FLAG) | head -n1 | sed -n 
# FC version parsed the same way; old GNU Fortran (<4.5) triggers a retry
# with plain "gfortran" and is dropped entirely when still too old; GNU
# Fortran <4.6 limits FORTRAN to 1; Intel Fortran <13.0 is rejected.
"$(REVERSION2)p") endif FC_VERSION_MAJOR := $(shell echo "$(FC_VERSION)" | cut -d. -f1) FC_VERSION_MINOR := $(shell echo "$(FC_VERSION)" | cut -d. -f2) FC_VERSION_PATCH := $(shell echo "$(FC_VERSION)" | cut -d. -f3) ifeq (3,$(words $(FC_VERSION_MAJOR) $(FC_VERSION_MINOR) $(FC_VERSION_PATCH))) FC_VERSION_NUM := $(shell echo "$$(($(FC_VERSION_MAJOR)*10000+$(FC_VERSION_MINOR)*100+$(FC_VERSION_PATCH)))") else ifeq (2,$(words $(FC_VERSION_MAJOR) $(FC_VERSION_MINOR))) FC_VERSION_NUM := $(shell echo "$$(($(FC_VERSION_MAJOR)*10000+$(FC_VERSION_MINOR)*100))") FC_VERSION_PATCH := 0 else FC_VERSION := $(NULL) FC_VERSION_NUM := 0 endif ifeq (GNU,$(shell $(FC) $(FC_VERSION_FLAG) | head -n1 | cut -d" " -f1)) ifneq (0,$(shell echo "$$((40500>$(FC_VERSION_NUM)))")) ifneq (gfortran,$(call qndir,$(FC))) FC := gfortran FC_VERSION := $(shell $(FC) $(FC_VERSION_FLAG) | head -n1 | sed -n "$(REVERSION1)p") ifeq (,$(FC_VERSION)) FC_VERSION := $(shell $(FC) $(FC_VERSION_FLAG) | head -n1 | sed -n "$(REVERSION2)p") endif FC_VERSION_MAJOR := $(shell echo "$(FC_VERSION)" | cut -d. -f1) FC_VERSION_MINOR := $(shell echo "$(FC_VERSION)" | cut -d. -f2) FC_VERSION_PATCH := $(shell echo "$(FC_VERSION)" | cut -d. 
# ---- Makefile.inc: Fortran finalization, suite/GCC identification ----
# Completes the gfortran-retry version fold; FORTRAN defaults to 2, and a
# missing FC either clears FC_VERSION or flags FORTRAN=0 (keeping the
# version string for an "outdated compiler" message). CXX_NAME/CC_NAME/
# FC_NAME are derived from the compiler's own --version banner (or from the
# executable name as fallback).
(,$(strip $(FC))) ifeq (GCC,$(shell $(FC) $(FC_VERSION_FLAG) | head -n1 | sed "s/.* (\(..*\)) .*/\1/")) GFC := $(FC) else ifeq (0,$(shell $(FC) $(FC_VERSION_FLAG) | grep -q "Free Software Foundation"; echo "$$?")) GFC := $(FC) else ifneq (pgfortran,$(CC_NAME)) ifneq (,$(findstring gfortran,$(FC_NAME))) GFC := $(FC) endif endif else FORTRAN ?= 0 endif endif else FC := $(NULL) endif ifeq (,$(strip $(FC))) DFLAGS += -DLIBXSMM_NOFORTRAN endif # native GCC? XSMM_GCC ?= 0 ifeq (0,$(shell $(CC) $(CC_VERSION_FLAG) | grep -q "Free Software Foundation"; echo "$$?")) XSMM_GCC := 1 else ifeq (GCC,$(shell $(CC) $(CC_VERSION_FLAG) | head -n1 | sed "s/.* (\(..*\)) .*/\1/")) XSMM_GCC := 1 else ifneq (pgcc,$(CC_NAME)) ifneq (,$(findstring pgcc,$(CC_NAME))) XSMM_GCC := 1 endif endif ifeq (1,$(XSMM_GCC)) ifeq (0,$(shell $(CXX) $(CXX_VERSION_FLAG) | grep -q "Free Software Foundation"; echo "$$?")) XSMM_GCC := 2 else ifeq (GCC,$(shell $(CXX) $(CXX_VERSION_FLAG) | head -n1 | sed "s/.* (\(..*\)) .*/\1/")) XSMM_GCC := 2 else ifneq (,$(findstring g++,$(CXX_NAME))) XSMM_GCC := 2 endif ifeq (2,$(XSMM_GCC)) ifneq (,$(strip $(GFC))) MKL_FCRTL := gf XSMM_GCC := 3 endif endif endif # Fortran runtime library MKL_FCRTL ?= intel # adopt fully equipped archiver CCAR := $(call which,$(CC)-ar) ifneq (,$(findstring -ar?,$(CCAR)?)) ifeq (default,$(origin AR)) AR := $(CCAR) else AR ?= $(CCAR) endif endif ifneq (,$(FORCE_CXX)) ifneq (0,$(FORCE_CXX)) override CC := $(CXX) $(call ctest,$(CC),-xc++) endif endif WCHECK ?= 0 WERROR_CFLAG := $(call ctest,$(CC),-Werror) WERROR_FCFLAG := $(call ftest,$(FC),-Werror) ifneq (0,$(WCHECK)) WCHECK_CFLAG := $(call ctest,$(CC),-Wcheck $(WERROR_CFLAG)) endif ifeq (0,$(ICX)) ifneq (0,$(INTEL)) NOLIMITS_CFLAG := $(call ctest,$(CC),-qoverride-limits $(WERROR_CFLAG)) ifeq (,$(strip $(GFC))) ifneq (,$(strip $(FC))) NOLIMITS_FCFLAG := $(call ftest,$(FC),-qoverride-limits $(WERROR_FCFLAG)) endif endif endif endif ifeq (Windows_NT,$(UNAME)) ifeq (MINGW64,$(MSYSTEM)) MINGW := 64 
# MinGW bitness from MSYSTEM or the __MINGW64__/__MINGW32__ predefines;
# LIBEXT/ILIBEXT selection per DEPSTATIC/MinGW; PGI and Clang detection
# (PGI env var is deliberately overridden); GCC_VERSION is made available
# even when CC is not GCC by probing a gcc binary on PATH.
else ifeq (MINGW32,$(MSYSTEM)) MINGW := 32 else ifeq (0,$(shell $(CC) -dM -E - < /dev/null 2>/dev/null | grep -q "__MINGW64__"; echo "$$?")) MINGW := 64 else ifeq (0,$(shell $(CC) -dM -E - < /dev/null 2>/dev/null | grep -q "__MINGW32__"; echo "$$?")) MINGW := 32 endif endif MINGW ?= 0 # Library extension ifneq (0,$(DEPSTATIC)) LIBEXT ?= $(SLIBEXT) else LIBEXT ?= $(DLIBEXT) ABSLIBS := 1 endif # Import-library ifeq (0,$(MINGW)) ILIBEXT ?= $(DLIBEXT) else # MinGW ILIBEXT ?= a endif # Separate control on how to link against the BLAS library BLAS_STATIC ?= $(DEPSTATIC) # PGI: ignore env. variable (same name) override PGI := 0 ifeq (0,$(XSMM_GCC)) # not GCC ifeq (0,$(INTEL)) # not Intel ifeq (0,$(shell $(CC) -dM -E - < /dev/null 2>/dev/null | grep -q "__clang__"; echo "$$?")) CLANG := 1 else override PGI := $(shell $(CC) $(CC_VERSION_FLAG) | if grep -q "PGI"; then echo "1"; else echo "0"; fi) endif endif endif CLANG ?= 0 # Make GCC version number available even when not using GCC ifneq (0,$(XSMM_GCC)) GCC_VERSION := $(CXX_VERSION) GCC_VERSION_MAJOR := $(CXX_VERSION_MAJOR) GCC_VERSION_MINOR := $(CXX_VERSION_MINOR) GCC_VERSION_PATCH := $(CXX_VERSION_PATCH) else ifeq (0,$(CLANG)) GCCBIN := $(call qndir,$(call which,gcc)) ifneq (,$(strip $(GCCBIN))) GCC_VERSION := $(shell $(GCCBIN) $(CXX_VERSION_FLAG) | head -n1 | sed -n "$(REVERSION1)p") GCC_VERSION_MAJOR := $(shell echo "$(GCC_VERSION)" | cut -d. -f1) GCC_VERSION_MINOR := $(shell echo "$(GCC_VERSION)" | cut -d. -f2) GCC_VERSION_PATCH := $(shell echo "$(GCC_VERSION)" | cut -d. 
# GCC_VERSION_NUM folding; MAINTAINER build detection (pure GNU toolchain
# >= 6.0, not under Spack, not Windows/Darwin); x86 CPU feature flags read
# from /proc/cpuinfo or macOS sysctl; SSE/AVX code-path selection (AVX=3 +
# MIC=1 for KNL when avx512pf/er are present).
-f3) endif endif ifeq (3,$(words $(GCC_VERSION_MAJOR) $(GCC_VERSION_MINOR) $(GCC_VERSION_PATCH))) GCC_VERSION_NUM := $(shell echo "$$(($(GCC_VERSION_MAJOR)*10000+$(GCC_VERSION_MINOR)*100+$(GCC_VERSION_PATCH)))") else ifeq (2,$(words $(GCC_VERSION_MAJOR) $(GCC_VERSION_MINOR))) GCC_VERSION_NUM := $(shell echo "$$(($(GCC_VERSION_MAJOR)*10000+$(GCC_VERSION_MINOR)*100))") GCC_VERSION_PATCH := 0 else GCC_VERSION := $(NULL) GCC_VERSION_NUM := 0 endif MAINTAINER ?= 0 # detect maintainer build and limit to SSE3 ifeq (,$(filter Windows_NT Darwin,$(UNAME))) ifeq (,$(SPACK_ENV_PATH)) # not under Spack ifeq (3,$(XSMM_GCC)) # pure GNU toolchain ifneq (0,$(shell echo "$$((60000<=$(GCC_VERSION_NUM)))")) MAINTAINER := 1 endif endif endif endif # Select x86 code path (if not selected otherwise) CPUFLAGS_X86 := $(strip $(shell if [ -e /proc/cpuinfo ]; then \ grep -m1 flags /proc/cpuinfo | cut -d: -f2-; \ elif [ "Darwin" = "$(UNAME)" ] && [ "x86_64" = "$(UNAME) -m" ]; then \ sysctl -a machdep.cpu.features \ machdep.cpu.extfeatures \ machdep.cpu.leaf7_features \ | cut -d: -f2- | tr -s "\n" " " \ | tr [:upper:] [:lower:]; \ fi)) SSE ?= 1 ifeq (0,$(SSE)) # discover AVX ifeq (1,$(words $(filter avx512f,$(CPUFLAGS_X86)))) ifeq (2,$(words $(filter avx512pf avx512er,$(CPUFLAGS_X86)))) # KNL AVX ?= 3 MIC ?= 1 else AVX ?= 2 endif else ifeq (1,$(words $(filter avx avx1.0,$(CPUFLAGS_X86)))) ifeq (1,$(words $(filter fma,$(CPUFLAGS_X86)))) AVX ?= 2 else AVX ?= 1 endif endif else ifeq (1,$(SSE)) # discover SSE ifeq (1,$(words $(filter sse4_2 sse4.2,$(CPUFLAGS_X86)))) SSE := 4 else ifneq (0,$(words $(filter sse3 ssse3,$(CPUFLAGS_X86)))) SSE := 3 else ifneq (0,$(words $(filter sse2 ,$(CPUFLAGS_X86)))) SSE := 2 else ifneq (,$(CPUFLAGS_X86)) SSE := 0 endif else ifneq (0,$(KNC)) MPSS := 1 endif AVX ?= 0 MIC ?= 0 # Select aarch64 code path CPUFLAGS_AARCH64 := $(strip $(shell if [ -e /proc/cpuinfo ]; then \ grep -m1 Features /proc/cpuinfo | cut -d: -f2-; \ elif [ "Darwin" = "$(UNAME)" ] && [ "arm64" = 
"$(UNAME) -m" ]; then \ sysctl -a hw.optional.neon | cut -d: -f2-; \ fi)) ASIMD ?= 1 ifeq (1,$(ASIMD)) # discover ASIMD ifeq (1,$(words $(filter asimd,$(CPUFLAGS_AARCH64)))) ASIMD := 1 else ifneq (0,$(words $(filter 1,$(CPUFLAGS_AARCH64)))) ASIMD := 1 else ifneq (,$(CPUFLAGS_X86)) ASIMD := 0 endif endif ifneq (0,$(INTEL)) SUITE := Intel Compiler MKL_OMPRTL := intel else ifneq (0,$(XSMM_GCC)) SUITE := GNU Compiler Collection MKL_OMPRTL := gnu else ifneq (0,$(PGI)) SUITE := $(if $(filter-out 0,$(PGI)),PGI $(NULL))Compiler MKL_OMPRTL := pgi else ifeq (0,$(CLANG)) COMPATIBLE := 1 endif ifneq (0,$(COMPATIBLE)) ifeq (Cray,$(shell $(CC) -V 2>&1 | head -n1 | cut -d' ' -f1)) SUITE := Cray Compiler LDFLAGS += -hsystem_alloc CRAY ?= 1 # prevent codegen issues ifeq (0,$(OPT)) override OPT := 1 endif endif endif endif SUITE ?= Compiler MKL_OMPRTL ?= gnu CRAY ?= 0 PGI ?= 0 ifeq (,$(strip $(GFC))) ifneq (,$(filter-out 0,$(INTEL) $(ICX))) ifneq (,$(call ftest,$(LD),-nofor-main,link)) override LD := $(LD) -nofor-main endif endif endif ifneq (Windows_NT,$(UNAME)) ifneq (,$(strip $(PIC))) ifneq (,$(call ctest,$(CC),-f$(PIC))) PIC := PIC endif PICFLAG := -f$(PIC) endif endif # prepend compiler-local library directory LIBFIND := $(call qapath,$(call qdir,$(call which,$(LD)))/../lib) $(wildcard $(LIBFIND)) ifneq (,$(COMMON)) ifneq (0,$(COMMON)) CFLAGS += $(call ctest,$(CC),-fcommon) else CFLAGS += $(call ctest,$(CC),-fno-common) endif endif ifneq (0,$(DEPSTATIC)) ifeq (0,$(COMPATIBLE)) ifneq (Darwin,$(UNAME)) ifneq (Windows_NT,$(UNAME)) ifneq (0,$(HARDEN)) ifneq (,$(strip $(HARDEN))) # explicit DYNAMIC := 1 else ifneq (0,$(SYM)) DYNAMIC := 1 endif else ifneq (0,$(SYM)) DYNAMIC := 1 endif endif endif endif else DYNAMIC := 1 endif DYNAMIC ?= 0 # enable MKL (if available) ifeq (,$(strip $(MKLROOT))) BLAS_INCFILE := $(wildcard /opt/intel/oneapi/mkl/latest/include/mkl.h) ifeq (,$(BLAS_INCFILE)) BLAS_INCFILE := $(call 
wildcard1,/opt/intel/compilers_and_libraries_*/$(MKL_PLATFORM)/mkl/include/mkl.h) endif ifneq (,$(BLAS_INCFILE)) MKLROOT := $(call qapath,$(call qdir,$(BLAS_INCFILE))/..) endif endif ifeq (,$(strip $(MKLROOT))) BLAS_INCFILE := $(strip $(wildcard /usr/include/mkl/mkl.h)) ifneq (,$(BLAS_INCFILE)) MKLROOT := $(call qapath,$(call qdir,$(BLAS_INCFILE))/../..) endif endif # Compiler is used for link stage ifneq (Darwin,$(UNAME)) ifneq (ld,$(call qndir,$(LD))) XLNKOPT := -Wl, endif XLNKVERBOSE := --verbose linkopt = $(if $1,$(XLNKOPT)$(if $2,$1=$(call quote,$2),$1)) abslibrpath = $(strip $(if $(findstring .$(ILIBEXT),$1)$(wildcard $1/), \ $(call linkopt,--rpath,$(call qxdir,$(call qapath,$1))))) XGROUP_BEGIN := $(call linkopt,--start-group) XGROUP_END := $(call linkopt,--end-group) ifneq (0,$(ASNEEDED)) XLIB_BEGIN := $(call linkopt,--as-needed) XLIB_END := $(call linkopt,--no-as-needed) endif else # OSX ifneq (ld,$(call qndir,$(LD))) XLNKOPT := -Xlinker endif XLNKVERBOSE := -t linkopt = $(if $1,$(XLNKOPT) $(if $2,$1 $(XLNKOPT) $(call quote,$2),$1)) abslibrpath = $(strip $(if $(findstring .$(ILIBEXT),$1)$(wildcard $1/), \ $(call linkopt,-rpath,$(call qxdir,$(call qapath,$1))))) endif XLIB_BEGIN ?= $(NULL) XLIB_END ?= $(NULL) absliblpath = $(strip $(if $1,$(if $(findstring .$(ILIBEXT),$1)$(wildcard $1/), \ -L$(call qxdir,$(call qapath,$1))))) ifneq (0,$(ABSLIBS)) abslibpath = $(strip $(call abslibrpath,$1) $(call absliblpath,$1)) endif abslibpath ?= $(call absliblpath,$1) libpath = $(call qapath,$(if $1,$(shell $(FLUSH) $1 $2 -l$(strip $3) $(call linkopt,$(XLNKVERBOSE)) 2>&1 \ | grep "lib$3" | tr " " "\n" | sed -n "/\//p" | sed "s/[$(CHAR_OPEN)]\(..*\)[$(CHAR_CLOSE)]/\1/" \ | xargs -I {} sh -c "ls -pd {} 2>/dev/null || ls -pd {}.* 2>/dev/null" | grep -v /$$ | tail -n1))) ifneq (Windows_NT1,$(UNAME)$(DEPSTATIC)) abslibfile = $(strip $(if $(findstring .$(ILIBEXT),$1), \ $(if $(patsubst lib%,%,$(call qname,$(call qndir,$1))), \ $(if $(findstring .$(ILIBEXT).,$1),-l:$(call 
qndir,$1), \ -l$(patsubst lib%,%,$(call qname,$(call qndir,$1))))), \ $(if $(filter $(call qndir,$1),$(call qapath,$1)),$(call qapath,$1),$1))) else abslibfile = $(strip $(if $(findstring .$(ILIBEXT),$1), \ $(if $(call qname,$(call qndir,$1)), \ -l$(call qname,$(call qndir,$1))), \ $(if $(filter $(call qndir,$1),$(call qapath,$1)),$(call qapath,$1),$1))) endif abslib = $(strip $(call abslibpath,$1) $(call abslibfile,$1)) ifeq (0,$(DEPSTATIC)) LIB_LD := $(LD) -shared $(PICFLAG) LIB_XLD := $(XLD) -shared $(PICFLAG) else LIB_LD := $(LD) LIB_XLD := $(XLD) endif ifeq (,$(strip $(FLD))) ifneq (,$(strip $(FC))) FLD := $(FC) $(XLIB_END) ifeq (0,$(DEPSTATIC)) LIB_FLD := $(FLD) -shared $(PICFLAG) else LIB_FLD := $(FLD) endif endif endif LIB_FLD ?= $(LIB_LD) FLD ?= $(LD) FREEFORM ?= 1 ifneq (0,$(INTEL)) ifeq (,$(strip $(GFC))) ifneq (,$(strip $(LIB_FLD))) LIB_FLD := $(call uniqadd,LIB_FLD,-nofor-main) endif ifneq (0,$(FREEFORM)) FFORM_FLAG := -free endif endif endif ifneq (0,$(FREEFORM)) ifeq (0,$(PGI)) FFORM_FLAG ?= $(call ftest,$(FC),-ffree-form) endif endif # CCE: resolve linker issue ifneq (0,$(DYNAMIC)) ifneq (,$(call ctest,$(LD),-dynamic,link)) EXCLUDE_VALUE += -dynamic XLD := $(XLD) -dynamic LD := $(LD) -dynamic endif ifneq (,$(call ftest,$(FLD),-dynamic,link)) EXCLUDE_VALUE += -dynamic FLD := $(FLD) -dynamic endif endif LIBDEP := $(LIBNAME).$(LIBEXT) MAINLIB := $(call abslib,$(LIBDEP)) FORTDEP := $(LIBNAME)f.$(LIBEXT) FORTLIB := $(call abslib,$(FORTDEP)) EXTDEP := $(LIBNAME)ext.$(LIBEXT) EXTLIB := $(XLIB_BEGIN) $(call abslib,$(EXTDEP)) $(XLIB_END) # provides libxsmmnoblas to satisfy BLAS symbols NOBLASDEP := $(LIBNAME)noblas.$(LIBEXT) NOBLASLIB := $(XLIB_BEGIN) $(call abslib,$(NOBLASDEP)) $(XLIB_END) ifeq (0,$(BLAS)) MAINLIB := $(MAINLIB) $(NOBLASLIB) #EXTLIB := $(EXTLIB) $(NOBLASLIB) LIBDEP := $(LIBDEP) $(NOBLASDEP) endif ifneq (Darwin,$(UNAME)) ifeq (0,$(shell ln -fs this-file-does-not-exist .ln 2>/dev/null && echo "$$?" 
&& rm .ln 2>/dev/null)) solink = -o "$1.$2.$3.$4" $(call linkopt,-soname,$(strip $(call qndir,$1).$5)) ifneq (0,$(SONAMELNK)) solink += $(shell cd $(call qdir,$1) 2>/dev/null && \ ln -fs $(call qndir,$1.$2.$3.$4) $(call qndir,$1.$5) 2>/dev/null) endif ifneq (0,$(shell echo "$$((1<$(SONAMELNK) || 0>$(SONAMELNK)))")) solink += $(shell cd $(call qdir,$1) 2>/dev/null && \ ln -fs $(call qndir,$1.$5) $(call qndir,$1) 2>/dev/null) endif else # MinGW solink = -o $(call quote,$1) $(call linkopt,-soname,$(strip $(call qndir,$1).$5)) endif else # macOS solink = -o $(call qbname,$1).$2$(call qsuffix,$1) \ -install_name $(call qndir,$(call qbname,$1).$2$(call qsuffix,$1)) \ -current_version $2.$3.$4 -compatibility_version $5 ifneq (0,$(SONAMELNK)) solink += $(shell cd $(call qdir,$1) 2>/dev/null && \ ln -fs $(call qndir,$(call qbname,$1).$2$(call qsuffix,$1)) $(call qndir,$1) 2>/dev/null) endif endif ifneq (0,$(INTEL)) ifeq (1,$(STATIC)) ifeq (0,$(ICX)) SLDFLAGS += -no-intel-extensions -static-intel -static-libstdc++ endif ifneq (Darwin,$(UNAME)) SLDFLAGS += -static-libgcc endif DFLAGS += -D__STATIC=1 else ifneq (0,$(STATIC)) DFLAGS += -D__STATIC=$(STATIC) SLDFLAGS += -static endif else ifeq (1,$(STATIC)) ifeq (0,$(PGI)) SLDFLAGS += -Bstatic ifeq (0,$(CLANG)) SLDFLAGS += -static-libstdc++ ifneq (Darwin,$(UNAME)) SLDFLAGS += -static-libgcc endif endif endif DFLAGS += -D__STATIC=1 else ifneq (0,$(STATIC)) DFLAGS += -D__STATIC=$(STATIC) ifeq (0,$(shell $(LD) -static -ldummydoesnotexist 2>&1 | grep -q "\-ldummydoesnotexist"; echo "$$?")) SLDFLAGS += -static endif endif endif SLDFLAGS ?= $(NULL) ifneq (0,$(PYMOD)) ifneq (,$(PYTHON)) PYVERSION_STRING := $(shell $(PYTHON) --version 2>&1 | head -n1 | sed -n "$(REVERSION1)p") PYVERSION := $(shell echo "$(PYVERSION_STRING)" | cut -d. -f1,2) TESTRESA := $(call qapath,$(call qdir,$(call which,$(PYTHON)))/..) 
TESTRESA := $(wildcard $(TESTRESA)/include/python$(PYVERSION)/Python.h) TESTLIB := $(call ldclib,$(LD),$(SLDFLAGS),python$(PYVERSION)) ifneq (,$(TESTRESA)) ifneq (,$(TESTLIB)) LDFLAGS += $(TESTLIB) IFLAGS += -I$(call qdir,$(TESTRESA)) DFLAGS += -D__PYTHON # Avoid (unresolved) BLAS (alternative: BLAS=1|2) ifeq (,$(filter environment% override command%,$(origin BLAS))$(BLAS)) NOBLAS := 1 endif endif endif endif endif LIBATOMIC ?= 0 ifneq (0,$(THREADS)) ifneq (0,$(LIBATOMIC)) ifneq (,$(call ldclib,$(LD),$(SLDFLAGS),atomic)) LDFLAGS += $(call ldclib,$(LD),$(SLDFLAGS),atomic) DFLAGS += -DLIBXSMM_LIBATOMIC endif endif endif LIBGFORTRAN := $(call libpath,$(FLD),$(SLDFLAGS),gfortran) LIBPTHREAD := $(call ldclib,$(LD),$(SLDFLAGS),pthread) QUADMATH := $(call ldclib,$(LD),$(SLDFLAGS),quadmath) LIBCPP := $(call ldclib,$(LD),$(SLDFLAGS),stdc++) LIBRT := $(call ldclib,$(LD),$(SLDFLAGS),rt) LIBDL := $(call ldclib,$(LD),$(SLDFLAGS),dl) ifneq (0,$(INTEL)) LIBM := $(call ldclib,$(LD),$(SLDFLAGS),imf) endif LIBM ?= $(call ldclib,$(LD),$(SLDFLAGS),m) LIBC := $(call ldclib,$(LD),$(SLDFLAGS),c) # (default) runtime library dependencies ifneq (0,$(FORCE_CXX)) # incl. 
undefined FCLDFLAGS += $(XLIB_BEGIN) $(call ldflib,$(FLD),$(SLDFLAGS),stdc++) $(XLIB_END) CLDFLAGS += $(XLIB_BEGIN) $(LIBCPP) $(XLIB_END) endif FCLDFLAGS += $(XLIB_BEGIN) $(call ldflib,$(FLD),$(SLDFLAGS),c) $(XLIB_END) CXXLDFLAGS += $(XLIB_BEGIN) $(LIBC) $(XLIB_END) ifneq (0,$(INTEL)) AR ?= xiar ifneq (0,$(SYM)) ifeq (1,$(SYM)) CXXFLAGS += -g CFLAGS += -g else CXXFLAGS += -g3 -debug inline-debug-info CFLAGS += -g3 -debug inline-debug-info endif ifeq (,$(strip $(GFC))) ifeq (0,$(ICX)) FCFLAGS += -g -traceback endif endif endif ifneq (0,$(shell echo "$$((170000<=$(CXX_VERSION_NUM)))")) CXXFLAGS += -std=c++14 else ifneq (0,$(shell echo "$$((140000<=$(CXX_VERSION_NUM)))")) CXXFLAGS += -std=c++11 endif CXXFLAGS += -Wall -diag-disable 1879,3415,3948,10006,10010,10411,13003 CFLAGS += -Wall -diag-disable 1879,3415,3948,10006,10010,10411,13003 ifneq (0,$(UNUSED)) CXXFLAGS += -Wno-unused-function CFLAGS += -Wno-unused-function endif ifeq (,$(strip $(GFC))) ifneq (ld,$(call qndir,$(LD))) LDFLAGS += -diag-disable 1879,3415,10006,10010,10411 endif FCFLAGS += -diag-disable 10006,10010,10411,13003 ifneq (0,$(THREADS)) FCMTFLAGS += -threads endif FPEDANTIC += -warn all,notruncated_source -diag-disable 7025,7373,10237,10342,10382 endif CPEDANTIC += $(WCHECK_CFLAG) -diag-disable 177,981,1419,1572,2547,10382 ifeq (0,$(ICX)) CPEDANTIC += -diag-disable 593,1599,2415,2591 ifeq (0,$(WCHECK)) CPEDANTIC += -diag-disable 2259 endif endif CWARNEXTRA := -Wremarks ifeq (1,$(PEDANTIC)) ifeq (,$(filter-out 0,$(FORCE_CXX))) CSTD := -std=c99 endif CFLAGS += $(CSTD) ifeq (,$(strip $(GFC))) FSTD := -std$(if $(filter 1,$(FORTRAN)),03,08) FMFLAGS += $(FSTD) $(FPEDANTIC) -diag-disable 10010 FCFLAGS += $(FFORM_FLAG) endif CXXFLAGS += $(WCHECK_CFLAG) CFLAGS += $(WCHECK_CFLAG) else ifneq (0,$(PEDANTIC)) ifneq (,$(filter 0 1,$(ICX))) ifeq (,$(filter-out 0,$(FORCE_CXX))) CSTD := -std=c89 endif endif CXXFLAGS += $(CPEDANTIC) CFLAGS += $(CSTD) $(CPEDANTIC) ifneq (990000,$(CC_VERSION_NUM)) CFLAGS += 
$(CWARNEXTRA) else ifneq (2,$(PEDANTIC)) CFLAGS += $(CWARNEXTRA) endif ifeq (,$(strip $(GFC))) FSTD := -std$(if $(filter 1,$(FORTRAN)),03,08) FCFLAGS += $(FSTD) $(FPEDANTIC) FMFLAGS += -fixed endif else ifneq (,$(filter 0 1,$(ICX))) ifeq (,$(filter-out 0,$(FORCE_CXX))) CSTD := -std=c89 endif else ifneq (0,$(ICX)) CXXFLAGS += -Wno-pass-failed CFLAGS += -Wno-pass-failed endif ifeq (,$(strip $(GFC))) FCFLAGS += $(FFORM_FLAG) endif endif ifeq (0,$(OFFLOAD)) ifeq (0,$(ICX)) ifneq (0,$(shell echo "$$((150000<=$(CC_VERSION_NUM)))")) NO_OFFLOAD_FLAG := -qno-offload else NO_OFFLOAD_FLAG := -no-offload endif endif endif CXXFLAGS += $(OPTFLAG) $(NO_OFFLOAD_FLAG) CFLAGS += $(OPTFLAG) $(NO_OFFLOAD_FLAG) ifeq (,$(strip $(GFC))) FCFLAGS += $(OPTFLAG) $(NO_OFFLOAD_FLAG) # flag specifying output directory must be last FMFLAGS += -module else FCFLAGS += $(OPTFLAG) ifeq (0,$(ICX)) FMFLAGS += -J endif endif # avoid turning OpenMP limits into an error (-Werror) ifneq (,$(WERROR_CFLAG)) FCFLAGS += $(NOLIMITS_FCFLAG) CXXFLAGS += $(NOLIMITS_CFLAG) CFLAGS += $(NOLIMITS_CFLAG) endif ifeq (0,$(DBG)) # consider more accurate -fp-model (C/C++: precise, Fortran: source) ifeq (0,$(ICX)) #CXXFLAGS += -fp-model fast=2 #CFLAGS += -fp-model fast=2 CXXFLAGS += -fno-alias CFLAGS += -fno-alias endif CXXFLAGS += -ansi-alias CFLAGS += -ansi-alias ifeq (,$(strip $(GFC))) ifneq (,$(strip $(FC))) #FCFLAGS += -fp-model fast=2 ifneq (0,$(shell echo "$$((130000<=$(FC_VERSION_NUM)))")) FCFLAGS += -align array64byte endif ifneq (0,$(IPO)) FCFLAGS += -ipo endif endif endif ifneq (0,$(IPO)) CXXFLAGS += -ipo CFLAGS += -ipo endif else ifeq (,$(strip $(GFC))) # debugging enabled ifeq (0,$(ICX)) FCFLAGS += -check endif endif ifneq (0,$(shell echo "$$((3>$(DBG)))")) ifeq (0,$(COMPATIBLE)) ifneq (,$(filter environment% override command%,$(origin COMPATIBLE))$(filter-out 1,$(INTEL))) ifneq (,$(filter 3,$(AVX))$(filter-out 0,$(VNNI) $(BF16))) ifeq (,$(MIC)) CTARGET := -xCOMMON-AVX512 else ifneq (0,$(MIC)) CTARGET := 
-xMIC-AVX512 else CTARGET := -xCORE-AVX512 endif ifneq (,$(filter-out 0,$(VNNI))) CTARGET += $(call ctest,$(CC),-mavx512vnni) else ifneq (,$(filter-out 0,$(BF16))) CTARGET += $(call ctest,$(CC),-mavx512vnni -mavx512bf16) endif else ifeq (2,$(AVX)) CTARGET := -xCORE-AVX2 else ifeq (1,$(AVX)) CTARGET := -xAVX else ifneq (0,$(SSE)) ifeq (1,$(SSE)) # default CTARGET := -xSSE4.2 else ifeq (3,$(SSE)) ifneq (Darwin,$(UNAME)) CTARGET := -xSSE3 else # no systems with less than SSE4.2 CTARGET := -xSSE4.2 endif else ifeq (4,$(SSE)) CTARGET := -xSSE4.2 else CTARGET := -xSSE$(SSE) endif else ifneq (0,$(AVX)) CTARGET := -xHost endif endif endif ifneq (,$(filter 3,$(AVX))$(filter-out 0,$(VNNI) $(BF16))) ifeq (,$(MIC)) CTARGET := -xCOMMON-AVX512 else ifneq (0,$(MIC)) CTARGET := -xMIC-AVX512 else CTARGET := -xCORE-AVX512 endif ifneq (,$(filter-out 0,$(VNNI))) CTARGET += $(call ctest,$(CC),-mavx512vnni) else ifneq (,$(filter-out 0,$(BF16))) CTARGET += $(call ctest,$(CC),-mavx512vnni -mavx512bf16) endif else ifeq (2,$(AVX)) CTARGET := -march=core-avx2 endif endif ifeq (,$(strip $(GFC))) ifneq (0,$(shell echo "$$((150000<=$(FC_VERSION_NUM)))")) OMPFLAG_FORCE := -qopenmp else OMPFLAG_FORCE := -fopenmp endif else OMPFLAG_FORCE := -fopenmp endif ifeq (,$(strip $(OMPRT))) OMPRT := iomp5 endif ifneq (0,$(call qnum,$(OMP))) # NaN CXXFLAGS += $(OMPFLAG_FORCE) FCFLAGS += $(OMPFLAG_FORCE) CFLAGS += $(OMPFLAG_FORCE) ifneq (,$(strip $(GFC))) LDFLAGS += $(XLIB_BEGIN) $(call ldclib,$(LD),$(SLDFLAGS),$(OMPRT)) $(XLIB_END) else LDFLAGS := $(OMPFLAG_FORCE) $(LDFLAGS) endif endif ifneq (0,$(SIMD)) ifneq (0,$(shell echo "$$((150000<=$(CXX_VERSION_NUM)))")) DFLAGS += -DLIBXSMM_OPENMP_SIMD CXXFLAGS += -qopenmp-simd CFLAGS += -qopenmp-simd ifeq (,$(strip $(GFC))) FCFLAGS += -qopenmp-simd endif SIMD ?= 1 endif endif ifeq (,$(strip $(GFC))) ifneq (,$(strip $(R8))) ifneq (0,$(R8)) FCFLAGS += -autodouble endif endif endif # workaround for certain bits introduced by GCC 7.0 ifneq (0,$(shell echo 
"$$(((180000<=$(CC_VERSION_NUM) && 180001>$(CC_VERSION_NUM)) || (170006>$(CC_VERSION_NUM) && 0!=$(CC_VERSION_NUM))))")) CFLAGS += -D_Float128=__float128 endif else # GCC assumed ifneq (0,$(SYM)) ifeq (1,$(if $(SANITIZE),2,$(SYM))) CXXFLAGS += -g CFLAGS += -g FCFLAGS += -g else ifeq (2,$(if $(SANITIZE),2,$(SYM))) CXXFLAGS += -g -fsanitize=$(if $(SANITIZE),$(SANITIZE),thread) -fno-omit-frame-pointer CFLAGS += -g -fsanitize=$(if $(SANITIZE),$(SANITIZE),thread) -fno-omit-frame-pointer FCFLAGS += -g -fsanitize=$(if $(SANITIZE),$(SANITIZE),thread) -fno-omit-frame-pointer LDFLAGS += -g -fsanitize=$(if $(SANITIZE),$(SANITIZE),thread) #$(call ldclib,$(LD),$(SLDFLAGS),tsan) else ifneq (,$(filter 2 3,$(XSMM_GCC))) CXXFLAGS += -g3 CFLAGS += -g3 else CXXFLAGS += -g CFLAGS += -g endif ifeq (3,$(XSMM_GCC)) FCFLAGS += -g3 else FCFLAGS += -g endif endif endif ifeq (0,$(COMPATIBLE)) ifneq (0,$(shell echo "$$((50000<=$(GCC_VERSION_NUM)))")) CXXFLAGS += -std=c++14 else ifneq (0,$(shell echo "$$((40700<=$(GCC_VERSION_NUM)))")) CXXFLAGS += -std=c++11 else ifneq (0,$(CLANG)) ifneq (0,$(shell echo "$$((40000<=$(CXX_VERSION_NUM)))")) CXXFLAGS += -std=c++14 else CXXFLAGS += -std=c++11 endif ifneq (0,$(shell echo "$$((40000<=$(CC_VERSION_NUM)))")) CFLAGS += -Wno-pass-failed endif else ifneq (,$(filter-out 2 3,$(XSMM_GCC))) CXXFLAGS += -std=c++11 endif endif ifeq (,$(filter-out 0,$(COMPATIBLE) $(PGI))) CXXFLAGS += -Wall CFLAGS += -Wall FSTD := -std=$(if $(filter 1,$(FORTRAN)),f2003,f2008) CPEDANTIC += -pedantic -Wextra -Wno-variadic-macros FPEDANTIC += -pedantic -Wextra -Wunused-variable \ -Wimplicit-interface -Wimplicit-procedure \ -Wconversion -Wintrinsics-std \ -Wcharacter-truncation ifneq (0,$(shell echo "$$((40200<=$(CC_VERSION_NUM)))")) CPEDANTIC += -Wno-overlength-strings else ifneq (0,$(CLANG)) CPEDANTIC += -Wno-overlength-strings endif ifneq (0,$(shell echo "$$((40600<=$(CC_VERSION_NUM)))")) # can yield false positives with older GCC CPEDANTIC += -Wshadow endif ifneq (,$(strip 
$(FC))) ifneq (0,$(shell echo "$$((50000<=$(FC_VERSION_NUM)))")) FWARNEXTRA := -Wuse-without-only -Wc-binding-type \ -Wrealloc-lhs -Wrealloc-lhs-all \ -Wreal-q-constant -Wconversion-extra \ -Wline-truncation endif endif ifneq (0,$(UNUSED)) CXXFLAGS += -Wno-unused-function #-Wno-attributes CFLAGS += -Wno-unused-function #-Wno-attributes endif ifeq (0,$(MINGW)) CPEDANTIC += -Wformat=2 else # MinGW CXXFLAGS += -fno-asynchronous-unwind-tables FCFLAGS += -fno-asynchronous-unwind-tables CFLAGS += -fno-asynchronous-unwind-tables ifneq (,$(filter 0 1,$(PEDANTIC))) CFLAGS += -Wno-format endif endif FPEDANTIC += $(FWARNEXTRA) ifeq (1,$(PEDANTIC)) ifeq (,$(filter-out 0,$(FORCE_CXX))) CSTD := -std=c99 endif CXXFLAGS += $(CPEDANTIC) -Wno-long-long #CXXFLAGS += -Wno-missing-field-initializers CFLAGS += $(CSTD) $(CPEDANTIC) FCFLAGS += $(FFORM_FLAG) FMFLAGS += $(FSTD) -pedantic -Wunused-variable $(FWARNEXTRA) else ifneq (0,$(PEDANTIC)) ifneq (Darwin,$(UNAME)) ifeq (,$(filter-out 0,$(FORCE_CXX))) CSTD := -std=c89 endif CPEDANTIC += -Wno-long-long CXXFLAGS += $(CPEDANTIC) else ifneq (0,$(XSMM_GCC)) ifeq (,$(filter-out 0,$(FORCE_CXX))) CSTD := -std=c89 endif CPEDANTIC += -Wno-long-long CXXFLAGS += $(CPEDANTIC) else # Clang may run into ICEs under macOS ifeq (,$(filter-out 0,$(FORCE_CXX))) CSTD := -std=c99 endif CXXFLAGS += $(CPEDANTIC) -Wno-long-long endif #CXXFLAGS += -Wno-missing-field-initializers #-Wzero-as-null-pointer-constant CFLAGS += $(CSTD) $(CPEDANTIC) FCFLAGS += $(FSTD) $(FPEDANTIC) else ifeq (0,$(COMPATIBLE)) CPEDANTIC += -Wno-long-long #-Wno-missing-field-initializers FCFLAGS += $(FFORM_FLAG) ifeq (,$(filter-out 0,$(FORCE_CXX))) CSTD := -std=c89 endif endif endif # flag specifying output directory must be last ifeq (,$(filter-out 0,$(COMPATIBLE) $(PGI) $(ICX))) FMFLAGS += -J else # fallback FMFLAGS += -I endif CXXFLAGS += $(OPTFLAG) CFLAGS += $(OPTFLAG) FCFLAGS += $(OPTFLAG) ifeq (0,$(DBG)) ifneq (0,$(IPO)) CXXFLAGS += -flto CFLAGS += -flto FCFLAGS += -flto #FLDFLAGS += 
-fno-lto LDFLAGS += $(call linkopt,-flto) ifneq (0,$(FAT)) CXXFLAGS += -ffat-lto-objects CFLAGS += -ffat-lto-objects FCFLAGS += -ffat-lto-objects endif endif endif ifeq (Windows_NT,$(UNAME)) LDFLAGS += $(XLIB_BEGIN) $(call ldclib,$(LD),$(SLDFLAGS),dbghelp) $(XLIB_END) else ifeq (FreeBSD,$(UNAME)) LDFLAGS += $(XLIB_BEGIN) $(call ldclib,$(LD),$(SLDFLAGS),execinfo) $(XLIB_END) endif ifeq (0,$(COMPATIBLE)) ifneq (0,$(PGI)) OMPFLAG_FORCE := -mp OMPRT := omp else ifeq (0,$(INTEL)) ifneq (Darwin,$(UNAME)) ifneq (0,$(XSMM_GCC)) OMPFLAG_FORCE := -fopenmp else ifneq (0,$(shell echo "$$((0!=$(CLANG) && 30900<=$(CC_VERSION_NUM)))")) OMPFLAG_FORCE := -fopenmp OMPRT := omp endif else # Darwin OMPFLAG_FORCE := -Xpreprocessor -fopenmp OMPRT := omp endif endif endif ifeq (,$(OMPFLAG_FORCE)) ifneq (,$(filter environment% override command%,$(origin OMP))) OMPFLAG_FORCE := -fopenmp endif else ifeq (FreeBSD,$(UNAME)) # avoid include path at begin of compile line OMPFLAG_FORCE += -I/usr/local/include else ifneq (0,$(CLANG)) TESTRESB := $(call qapath,$(call qdir,$(call which,$(CC)))/../compiler) TESTRESB := $(wildcard $(TESTRESB)/include/omp.h) ifneq (,$(TESTRESB)) OMPFLAG_FORCE += -I$(call qdir,$(TESTRESB)) endif endif # account for missing TLS/OMP ifeq (,$(OMPFLAG_FORCE)) THREADS ?= 0 OMP := 0 endif ifeq (,$(strip $(OMPRT))) # fallback OMPRT := gomp endif OMPLIBFILE := $(call libpath,$(LD),$(SLDFLAGS) $(OMPFLAG_FORCE) $(foreach LIB,$(LIBFIND),$(call abslibpath,$(LIB))),$(OMPRT)) ifeq (0,$(CRAY)) ifeq (,$(OMPLIBFILE)) ifneq (,$(shell INFILE=$$($(MKTEMP) /tmp/.libxsmm_XXXXXX.c) && \ printf "$(CHAR_HASH)include \nint main(void) { return omp_get_max_threads(); }\n" > $${INFILE} && \ $(LD) $(SLDFLAGS) $(OMPFLAG_FORCE) $(foreach LIB,$(LIBFIND),$(call abslibpath,$(LIB))) -l$(OMPRT) $${INFILE} -o $${INFILE}.x 2>/dev/null >/dev/null && echo "OK"; \ rm -f /tmp/$$(basename $${INFILE} .c).* .libxsmm_??????.* 2>/dev/null)) CLDFLAGS += $(foreach LIB,$(LIBFIND),$(call abslibpath,$(LIB))) OMPLIBFILE 
:= -l$(OMPRT) endif endif endif ifneq (,$(OMPFLAG_FORCE)) ifneq (0,$(call qnum,$(OMP))) # NaN ifneq (,$(OMPLIBFILE)) ifneq (Darwin,$(UNAME)) ifneq (0,$(CLANG)) CXXLDFLAGS += $(call abslibpath,$(OMPLIBFILE)) CLDFLAGS += $(call abslibpath,$(OMPLIBFILE)) endif CXXLDFLAGS := $(OMPFLAG_FORCE) $(CXXLDFLAGS) CLDFLAGS := $(OMPFLAG_FORCE) $(CLDFLAGS) else CXXLDFLAGS += $(call abslib,$(OMPLIBFILE)) CLDFLAGS += $(call abslib,$(OMPLIBFILE)) endif CXXFLAGS += $(OMPFLAG_FORCE) CFLAGS += $(OMPFLAG_FORCE) else ifneq (Darwin,$(UNAME)) CXXLDFLAGS := $(OMPFLAG_FORCE) $(CXXLDFLAGS) CLDFLAGS := $(OMPFLAG_FORCE) $(CLDFLAGS) CXXFLAGS += $(OMPFLAG_FORCE) CFLAGS += $(OMPFLAG_FORCE) endif ifeq (3,$(XSMM_GCC)) # pure GNU toolchain FLDFLAGS := -fopenmp $(FLDFLAGS) FCFLAGS += -fopenmp else # mixed toolchain TESTRESC := $(strip $(if $(GFC), \ $(call libpath,$(FLD),$(SLDFLAGS) -fopenmp,gomp), \ $(call libpath,$(FLD),$(SLDFLAGS) $(foreach LIB,$(LIBFIND),$(call abslibpath,$(LIB))),$(OMPRT)))) ifeq (,$(TESTRESC)) TESTRESC := $(strip $(call ldflib,$(FLD),$(SLDFLAGS) \ $(if $(GFC),-fopenmp,$(foreach LIB,$(LIBFIND),$(call abslibpath,$(LIB)))), \ $(if $(GFC),gomp,$(OMPRT)))) endif ifneq (,$(TESTRESC)) FLDFLAGS += $(if $(GFC),$(NULL),$(foreach LIB,$(LIBFIND),$(call abslibpath,$(LIB)))) $(TESTRESC) else TESTRESC := $(call ftest,$(FLD),-fopenmp,link) ifneq (,$(TESTRESC)) FLDFLAGS += $(TESTRESC) else # last try FLDFLAGS := $(OMPFLAG_FORCE) $(FLDFLAGS) endif endif TESTRESD := $(call ftest,$(FC),-fopenmp) ifneq (,$(TESTRESD)) FCFLAGS += $(TESTRESD) else FCFLAGS := $(call ftest,$(FC),$(OMPFLAG_FORCE)) $(FCFLAGS) endif endif else ifneq (0,$(OMP)) # clang: OMP=libomp (NaN) ifneq (,$(OMPLIBFILE)) ifneq (0,$(CLANG)) CXXLDFLAGS += $(call abslibpath,$(OMPLIBFILE)) CLDFLAGS += $(call abslibpath,$(OMPLIBFILE)) endif CXXLDFLAGS := $(OMPFLAG_FORCE)=$(OMP) $(CXXLDFLAGS) CLDFLAGS := $(OMPFLAG_FORCE)=$(OMP) $(CLDFLAGS) CXXFLAGS += $(OMPFLAG_FORCE)=$(OMP) CFLAGS += $(OMPFLAG_FORCE)=$(OMP) endif endif endif ifneq (,$(strip 
$(R8))) ifneq (0,$(R8)) FCFLAGS += -fdefault-real-8 -fdefault-double-8 endif endif endif FCMTFLAGS ?= $(NULL) ifeq (undefined,$(origin TARGET)) ifneq (,$(CTARGET)) ifeq (,$(call ctest,$(CC),$(CTARGET))) undefine CTARGET endif endif ifeq (,$(CTARGET)) ifneq (0,$(shell echo "$$((3>$(DBG)))")) ifneq (,$(filter 3,$(AVX))$(filter-out 0,$(VNNI) $(BF16))) ifneq (0,$(shell echo "$$(((0!=$(XSMM_GCC) && 80000<=$(GCC_VERSION_NUM)) || \ (0!=$(CLANG) && (40000<=$(CC_VERSION_NUM) || 0==$(CC_VERSION_NUM))) || \ (0!=$(PGI) && 190000<=$(CC_VERSION_NUM))))")) ifneq (,$(call ctest,$(CC),-mfma -mavx512f -mavx512cd)) CTARGET := -mfma -mavx512f -mavx512cd ifneq (0,$(lastword $(sort 0 $(MIC)))) # MIC ifneq (,$(call ctest,$(CC),-mavx512pf -mavx512er)) CTARGET += -mavx512pf -mavx512er endif else ifneq (,$(call ctest,$(CC),-mavx512dq -mavx512bw -mavx512vl)) ifneq (Darwin,$(UNAME)) CTARGET += -mavx512dq -mavx512bw -mavx512vl #CTARGET += -mavx512ifma -mavx512vbmi else ifneq (,$(filter 1 2 3,$(INTEL) $(XSMM_GCC))) CTARGET += -mavx512dq -mavx512bw -mavx512vl #CTARGET += -mavx512ifma -mavx512vbmi else ifneq (0,$(shell echo "$$((0!=$(CLANG) && 80100<=$(CC_VERSION_NUM)))")) CTARGET += -mavx512dq -mavx512bw -mavx512vl endif ifneq (,$(filter-out 0,$(VNNI))) CTARGET += $(call ctest,$(CC),-mavx512vnni) else ifneq (,$(filter-out 0,$(BF16))) CTARGET += $(call ctest,$(CC),-mavx512vnni -mavx512bf16) endif endif endif endif endif ifeq (,$(CTARGET)) # fallback to AVX2 ifneq (,$(filter 2 3,$(AVX))) ifneq (0,$(shell echo "$$((0!=$(INTEL) || 0!=$(CLANG) || (40800<=$(GCC_VERSION_NUM) && 0==$(PGI)) || 0==$(CC_VERSION_NUM)))")) CTARGET ?= -march=core-avx2 else CTARGET ?= -mavx2 -mfma endif endif endif ifeq (,$(CTARGET)) # fallback to AVX ifneq (,$(filter 1 2 3,$(AVX))) ifneq (0,$(shell echo "$$((0!=$(INTEL) || 0!=$(CLANG) || 40400<=$(GCC_VERSION_NUM) || 0==$(CC_VERSION_NUM)))")) CTARGET ?= -mavx endif endif endif ifneq (,$(filter 1 2 3 4,$(SSE) $(AVX))) ifeq (,$(CTARGET)) # SSE-4.2 or default-SSE ifneq (,$(filter 
01 02 03 10 11 12 13 40 41 42 43,$(SSE)$(AVX))) ifneq (0,$(XSMM_GCC)) ifneq (0,$(shell echo "$$((40300<=$(CC_VERSION_NUM)))")) ifeq (0,$(MAINTAINER)) CTARGET := -msse4.2 else # maintainer build ifeq (0,$(DEPSTATIC)) TARGET := -msse4.2 CTARGET := -msse2 else ifneq (0,$(SHARED)) ifneq (,$(SHARED)) TARGET := -msse4.2 CTARGET := -msse2 else CTARGET := -msse4.2 endif else CTARGET := -msse4.2 endif endif else CTARGET := -msse2 endif endif CTARGET ?= -msse4.2 else ifneq (,$(filter-out 0 1,$(SSE))) # better to use TARGET flag directly CTARGET := -msse$(SSE) endif endif ifeq (,$(CTARGET)) # SSE3 ifneq (,$(filter 30 31 32 33,$(SSE)$(AVX))) ifneq (Darwin,$(UNAME)) CTARGET ?= -msse2 else # prevents Clang BE error (CRC32 and others) CTARGET ?= -msse4.2 endif endif endif # stop here as SSE=2 is implicitly present (64-bit ABI) else ifneq (0,$(AVX)) CTARGET := -march=native -mtune=native endif endif # CTARGET endif # DBG else # take user's TARGET into account CTARGET := $(TARGET) endif override VNNI := $(if $(filter -mavx512vnni,$(CTARGET)),1,0) override BF16 := $(if $(filter -mavx512bf16,$(CTARGET)),1,0) ifneq (,$(call ftest,$(FC),$(CTARGET))) # inherit CTARGET flags FTARGET := $(CTARGET) else ifneq (,$(call ftest,$(FC),$(patsubst -m%,-x%,$(CTARGET)))) FTARGET := $(patsubst -m%,-x%,$(CTARGET)) endif ifeq (,$(call ctest,$(CC),$(CTARGET))) # revoke target flags CTARGET := $(NULL) endif # avoid unnecessary state ifeq ($(CTARGET),$(FTARGET)) TARGET ?= $(CTARGET) endif # Intrinsics support level (0: None, 1: Static, 2: Dynamic) # 1003<=INTRINSICS<1999: CPUID (see libxsmm_cpuid.h) # For example, LIBXSMM_X86_AVX2=1006 ifneq (0,$(INTEL)) INTRINSICS ?= 2 else ifneq (0,$(CLANG)) INTRINSICS ?= 2 else ifneq (,$(GCC_VERSION)) ifneq (0,$(shell echo "$$((100000<=$(GCC_VERSION_NUM)))")) ifeq (/usr/bin/,$(call qdir,$(call which,$(CC)))) INTRINSICS ?= 2 endif endif endif ifeq (x86_64,$(MNAME)) INTRINSICS ?= 1006 ifneq (0,$(INTRINSICS)) ifeq (1,$(INTRINSICS)) ifeq (0,$(INTEL)) DFLAGS += 
-DLIBXSMM_INTRINSICS_STATIC else ifneq (,$(CTARGET)) DFLAGS += -DLIBXSMM_INTRINSICS_STATIC endif else ifneq (2,$(INTRINSICS)) DFLAGS += -DLIBXSMM_TARGET_ARCH=$(INTRINSICS) endif else DFLAGS += -DLIBXSMM_INTRINSICS_NONE endif endif ifeq (0,$(SYM)) ifneq (Darwin,$(UNAME)) CLDFLAGS += $(call ctest,$(CC),-s,link) FLDFLAGS += $(call ftest,$(FC),-s,link) endif endif ifneq (0,$(INSTRUMENT)) ifneq (Darwin1,$(UNAME)$(INSTRUMENT)) DFLAGS += -D__TRACE=$(INSTRUMENT) ifeq (0,$(PGI)) CXXFLAGS += -finstrument-functions $(call ctest,$(CXX),-fno-partial-inlining) $(call ctest,$(CXX),-fno-optimize-sibling-calls) CFLAGS += -finstrument-functions $(call ctest,$(CC),-fno-partial-inlining) $(call ctest,$(CC),-fno-optimize-sibling-calls) FCFLAGS += -finstrument-functions $(call ftest,$(FC),-fno-partial-inlining) $(call ftest,$(FC),-fno-optimize-sibling-calls) ifeq (0,$(INTEL)) # e.g. Intel, Clang, and others do not need/understand below flag ifneq (0,$(shell echo "$$((40300<=$(GCC_VERSION_NUM)))")) CFLAGS += -finstrument-functions-exclude-function-list=_mm_,_mm256_,_mm512_,__rdtsc ifneq (,$(filter 2 3,$(XSMM_GCC))) CXXFLAGS += -finstrument-functions-exclude-function-list=_mm_,_mm256_,_mm512_,__rdtsc ifeq (3,$(XSMM_GCC)) FCFLAGS += -finstrument-functions-exclude-function-list=_mm_,_mm256_,_mm512_,__rdtsc endif endif endif endif endif endif endif ifeq (0,$(COMPATIBLE)) ifneq (0,$(OPT)) ifeq (0,$(INTEL)) ifneq (0,$(SIMD)) ifneq (0,$(XSMM_GCC)) ifneq (,$(CTARGET)) ifneq (0,$(shell echo "$$((40900<=$(CC_VERSION_NUM)))")) DFLAGS += -DLIBXSMM_OPENMP_SIMD CFLAGS += -fopenmp-simd ifneq (1,$(XSMM_GCC)) ifneq (0,$(shell echo "$$((40900<=$(CXX_VERSION_NUM)))")) CXXFLAGS += -fopenmp-simd ifneq (,$(FTARGET)) ifneq (2,$(XSMM_GCC)) ifneq (0,$(shell echo "$$((40900<=$(FC_VERSION_NUM)))")) FCFLAGS += -fopenmp-simd SIMD ?= 1 endif endif endif endif endif endif endif else ifneq (0,$(CLANG)) ifneq (,$(strip $(SIMD))) # explicit ifneq (Darwin,$(UNAME)) ifneq (,$(CTARGET)) ifneq (0,$(shell echo 
"$$((60000<=$(CC_VERSION_NUM) && 60000<=$(CXX_VERSION_NUM)))")) DFLAGS += -DLIBXSMM_OPENMP_SIMD CXXFLAGS += -fopenmp-simd CFLAGS += -fopenmp-simd ifneq (,$(FTARGET)) ifneq (,$(strip $(FC))) ifneq (0,$(shell echo "$$((40900<=$(FC_VERSION_NUM)))")) FCFLAGS += -fopenmp-simd SIMD ?= 1 endif endif endif endif endif endif endif endif endif # SIMD endif endif ifneq (,$(filter 1 2,$(patsubst O%,%,$(OPT)))) ifneq (,$(CTARGET)) ifeq (,$(filter-out 0,$(PGI))) CXXFLAGS += -funroll-loops CFLAGS += -funroll-loops ifneq (,$(GFC)$(filter-out 0 1 2,$(XSMM_GCC))) FCFLAGS += -funroll-loops endif endif ifeq (,$(filter-out 0,$(PGI) $(INTEL) $(ICX))) CXXFLAGS += -ftree-vectorize CFLAGS += -ftree-vectorize FCFLAGS += -ftree-vectorize endif endif endif ifneq (Darwin,$(UNAME)) ifneq (0,$(HARDEN)) # not defined: enabled ifneq (,$(strip $(HARDEN))) # explicit ifneq (0,$(shell echo "$$((40900<=$(FC_VERSION_NUM)))")) CXXFLAGS += -fstack-protector-strong CFLAGS += -fstack-protector-strong else CXXFLAGS += -fstack-protector CFLAGS += -fstack-protector endif #DFLAGS += -D_FORTIFY_SOURCE=2 #else #DFLAGS += -D_FORTIFY_SOURCE=1 endif else CXXFLAGS += -fno-stack-protector CFLAGS += -fno-stack-protector endif ifneq (ld,$(call qndir,$(LD))) ifneq (Windows_NT,$(UNAME)) ifneq (0,$(DEPSTATIC)) EXCLUDE_VALUE += $(call linkopt,--export-dynamic) ifneq (0,$(HARDEN)) ifneq (,$(strip $(HARDEN))) # explicit LDFLAGS := $(call linkopt,--export-dynamic) $(LDFLAGS) else ifneq (0,$(SYM)) LDFLAGS := $(call linkopt,--export-dynamic) $(LDFLAGS) endif else ifneq (0,$(SYM)) LDFLAGS := $(call linkopt,--export-dynamic) $(LDFLAGS) endif endif # Linux distributions may apply similar hardening LDFLAGS := $(XLNKOPT)-z,relro,-z,now $(LDFLAGS) endif endif ifeq (0,$(PGI)) ifneq (0,$(OPT)) CXXFLAGS += -fdata-sections -ffunction-sections CFLAGS += -fdata-sections -ffunction-sections ifeq (,$(filter-out 0,$(INTEL) $(ICX))) FCFLAGS += -fdata-sections -ffunction-sections else ifneq (,$(strip $(GFC))) FCFLAGS += -fdata-sections 
-ffunction-sections endif # --gc-sections: relies on section-flags present at compile-stage LDFLAGS := $(call linkopt,--gc-sections) $(LDFLAGS) ifeq (0,$(VISIBILITY)) # -fvisibility=hidden may cause crashes CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden #FCFLAGS += -fvisibility=hidden CFLAGS += -fvisibility=hidden endif endif endif endif ifeq (0,$(EXP)) CXXFLAGS += -fno-exceptions endif endif ifeq (0,$(VLA)) DFLAGS += -DLIBXSMM_NO_VLA else ifneq (,$(VLA)) DFLAGS += -DLIBXSMM_VLA endif # Information which can be displayed by the actual Makefile ifneq (,$(strip $(FC))) GINFO := $(SUITE): $(strip $(CC_NAME) $(CC_VERSION)), $(strip $(CXX_NAME) $(CXX_VERSION)), and $(strip $(FC_NAME) $(FC_VERSION)) FINFO := Fortran target: $(if $(FTARGET),$(FTARGET),) else GINFO := $(SUITE): $(strip $(CC_NAME) $(CC_VERSION)), and $(strip $(CXX_NAME) $(CXX_VERSION)) FINFO := Fortran: endif CINFO := C / C++ target: $(if $(CTARGET),$(CTARGET),) ifeq (Darwin,$(UNAME)) # avoid Homebrew based GCC AS; apply the flag only to the non-GCC components ifneq (0,$(XSMM_GCC)) LDFLAGS += -Wa,-q CFLAGS += -Wa,-q ifneq (,$(filter 2 3,$(XSMM_GCC))) CXXFLAGS += -Wa,-q endif endif FLDFLAGS += -Wa,-q FCFLAGS += -Wa,-q endif ifeq (,$(filter-out 0,$(COMPATIBLE) $(PGI))) ifneq (,$(filter-out 0,$(THREADS) $(BLAS))) CXXFLAGS += -pthread CFLAGS += -pthread endif ifneq (Windows_NT,$(UNAME)) ifneq (0,$(INTEL)) ifeq (0,$(OFFLOAD)) FLDFLAGS += $(XLIB_BEGIN) $(call ldflib,$(FLD),$(SLDFLAGS),pthread) $(XLIB_END) CLDFLAGS += $(XLIB_BEGIN) $(LIBPTHREAD) $(XLIB_END) else FLDFLAGS += -pthread CLDFLAGS += -pthread endif else ifneq (Darwin,$(UNAME)) FLDFLAGS += -pthread CLDFLAGS += -pthread else ifeq (0,$(CLANG)) FLDFLAGS += -pthread CLDFLAGS += -pthread endif endif else ifneq (0,$(PGI)) LIBATOMIC ?= 1 endif ifeq (0,$(shell INFILE=$$($(MKTEMP) /tmp/.libxsmm_XXXXXX.c) && \ printf "$(CHAR_HASH)include \n$(CHAR_HASH)if !defined(__GNU_LIBRARY__) && !defined(__GLIBC__)\n0\n$(CHAR_HASH)endif\n" > $${INFILE} && \ if [ 
"$$($(CC) -c $${INFILE} -o $${INFILE}.o 2>&1 || echo 'x')" ]; then echo "1"; else echo "0"; fi; \ rm -f /tmp/$$(basename $${INFILE} .c).* .libxsmm_??????.* 2>/dev/null)) GLIBC := 1 endif GLIBC ?= 0 OMPLIBFILE ?= $(call libpath,$(LD),$(SLDFLAGS) $(OMPFLAG_FORCE) $(foreach LIB,$(LIBFIND),$(call abslibpath,$(LIB))),$(OMPRT)) ifneq (,$(strip $(OMPLIBFILE))) OMPLIB ?= $(call abslib,$(OMPLIBFILE)) else ifneq (0,$(INTEL)) OMPLIB ?= $(call ldclib,$(LD),$(SLDFLAGS),$(OMPRT)) endif ifneq (,$(OMPFLAG_FORCE)) ifeq (0,$(shell INFILE=$$($(MKTEMP) /tmp/.libxsmm_XXXXXX.c) && \ printf "$(CHAR_HASH)if defined(_OPENMP)\n$(CHAR_HASH) include \nint main() { return 0; }\n$(CHAR_HASH)endif\n" > $${INFILE} && \ if [ "$$($(CC) $(OMPFLAG_FORCE) $${INFILE} -o $${INFILE}.x 2>&1 || echo 'x')" ]; then echo "1"; else echo "0"; fi; \ rm -f /tmp/$$(basename $${INFILE} .c).* .libxsmm_??????.* 2>/dev/null)) OMPFLAG := $(OMPFLAG_FORCE) endif endif ifeq (0,$(OMP)) EXTLIB += $(OMPLIB) endif OMPFLAG ?= $(NULL) ifneq (0,$(TBB_MALLOC)) ifneq (,$(TBBROOT)) ifneq (Windows_NT,$(UNAME)) TBBLIB_DIR := $(TBBROOT)/lib/intel64 TBBLIB_DIRGCC := gcc$(GCC_VERSION_MAJOR).$(GCC_VERSION_MINOR) TBBLIB_MALLOC := $(wildcard $(TBBLIB_DIR)/$(TBBLIB_DIRGCC)/libtbbmalloc.$(ILIBEXT)) ifeq (,$(TBBLIB_MALLOC)) ifneq (0,$(TBB_OLDRTL)) TBB_LIBDIRGCC := $(shell ls -1 $(call quote,$(TBB_LIBDIR)) | head -n1) else TBB_LIBDIRGCC := $(shell ls -1 $(call quote,$(TBB_LIBDIR)) | tail -n1) endif TBB_LIBMALLOC := $(wildcard $(TBB_LIBDIR)/$(TBB_LIBDIRGCC)/libtbbmalloc.$(ILIBEXT)) endif ifneq (,$(TBB_LIBMALLOC)) IFLAGS += -I$(call quote,$(TBBROOT)/include) DFLAGS += -D__TBB LDFLAGS += $(XLIB_BEGIN) $(call abslib,$(TBB_LIBMALLOC)) $(XLIB_END) endif else # TODO: Windows support endif endif endif MAKE_ILP64 := 0 ifneq (,$(strip $(ILP64))) ifneq (0,$(ILP64)) MAKE_ILP64 := $(ILP64) endif endif ifneq (0,$(MAKE_ILP64)) BLAS_BITS := 64 MKL_BITS := ilp64 else MKL_BITS := lp64 endif ifneq (0,$(BLAS)) ifneq (Darwin,$(UNAME)) MKL_PLATFORM := linux else # 
macOS MKL_PLATFORM := mac endif endif ifneq (0,$(BLAS_STATIC)) BLASLIBEXT ?= $(SLIBEXT) else # shared (DLL) BLASLIBEXT ?= $(ILIBEXT) endif ifneq (,$(strip $(MKLROOT))) ifeq (0,$(PGI)) MKL ?= $(BLAS) ifneq (,$(strip $(FC))) ifneq (0,$(shell echo "$$((0==$(XSMM_GCC) || 40600<=$(GCC_VERSION_NUM)))")) MKL_DIRECT ?= 0 else MKL_DIRECT := 0 endif else MKL_DIRECT := 0 endif else MKL ?= 0 endif else # disable MKL := 0 endif ifneq (0,$(MKL)) ifeq (1,$(MKL_DIRECT)) ifeq (1,$(MKL)) BLAS_FLAGS += -DMKL_DIRECT_CALL_SEQ else BLAS_FLAGS += -DMKL_DIRECT_CALL endif ifneq (0,$(XSMM_GCC)) CXXFLAGS += -Wno-unused-value CFLAGS += -Wno-unused-value endif else ifneq (0,$(MKL_DIRECT)) ifeq (1,$(MKL)) BLAS_FLAGS += -DMKL_DIRECT_CALL_SEQ_JIT else BLAS_FLAGS += -DMKL_DIRECT_CALL_JIT endif ifneq (0,$(XSMM_GCC)) CXXFLAGS += -Wno-unused-value CFLAGS += -Wno-unused-value endif endif BLAS_FLAGS += -D__CBLAS endif ifneq (Darwin,$(UNAME)) ifneq (,$(wildcard $(MKLROOT)/lib/x86_64-linux-gnu/libmkl_rt.*)) MKL_ARCH := x86_64-linux-gnu else MKL_ARCH := intel64 endif endif ifeq (1,$(MKL)) # sequential BLAS_FLAGS += -D__BLAS=1 -D__MKL=1 MKL_THREADS := sequential ifneq (0,$(INTEL)) ifneq (0,$(OFFLOAD)) BLAS_LDFLAGS += -qoffload-option,mic,ld,"$(LIBM) $(LIBDL)" endif endif else ifneq (0,$(MKL)) # multi-threaded BLAS_FLAGS += -D__BLAS=$(MKL) -D__MKL=$(MKL) MKL_THREADS := $(MKL_OMPRTL)_thread ifeq (0,$(OMP)) ifneq (gnu,$(MKL_OMPRTL)) BLAS_LDFLAGS += $(OMPLIB) else BLAS_LDFLAGS += $(call ldclib,$(LD),$(SLDFLAGS),gomp) endif else ifeq (gnu,$(OMPLIB)$(MKL_OMPRTL)) BLAS_LDFLAGS += $(call ldclib,$(LD),$(SLDFLAGS),gomp) endif ifneq (0,$(THREADS)) BLAS_LDFLAGS += $(XLIB_BEGIN) $(LIBPTHREAD) $(XLIB_END) endif ifneq (0,$(INTEL)) ifneq (0,$(OFFLOAD)) BLAS_LDFLAGS += -qoffload-option,mic,ld,"$(LIBM) $(LIBDL)" ifeq (0,$(OMP)) BLAS_LDFLAGS += -qoffload-option,mic,ld,"$(OMPLIB)" endif endif endif endif ifneq (0,$(MKL)) ifeq (,$(BLAS_INCFILE)) BLAS_IFLAGS += -I$(call quote,$(MKLROOT)/include) else BLAS_IFLAGS += -I$(call 
qdir,$(BLAS_INCFILE)) endif ifeq (0,$(BLAS_STATIC)) # shared BLAS_LDFLAGS := $(call abslibpath,$(MKLROOT)/lib/$(MKL_ARCH)) \ $(call abslibfile,libmkl_$(MKL_FCRTL)_$(MKL_BITS).$(ILIBEXT)) \ $(call abslibfile,libmkl_core.$(ILIBEXT)) \ $(call abslibfile,libmkl_$(MKL_THREADS).$(ILIBEXT)) \ $(BLAS_LDFLAGS) ifneq (0,$(INTEL)) ifneq (0,$(OFFLOAD)) BLAS_LDFLAGS += -qoffload-option,mic,ld,"-L$(MKLROOT)/lib/mic -lmkl_$(MKL_FCRTL)_$(MKL_BITS) -lmkl_core -lmkl_$(MKL_THREADS)" endif endif else # static BLAS_LDFLAGS := $(XGROUP_BEGIN) \ $(MKLROOT)/lib/$(MKL_ARCH)/libmkl_$(MKL_FCRTL)_$(MKL_BITS).$(SLIBEXT) \ $(MKLROOT)/lib/$(MKL_ARCH)/libmkl_core.$(SLIBEXT) \ $(MKLROOT)/lib/$(MKL_ARCH)/libmkl_$(MKL_THREADS).$(SLIBEXT) \ $(XGROUP_END) $(BLAS_LDFLAGS) ifneq (0,$(INTEL)) ifneq (0,$(OFFLOAD)) BLAS_LDFLAGS += -qoffload-option,mic,ld,"--start-group \ $(MKLROOT)/lib/mic/libmkl_$(MKL_FCRTL)_$(MKL_BITS).$(SLIBEXT) \ $(MKLROOT)/lib/mic/libmkl_core.$(SLIBEXT) \ $(MKLROOT)/lib/mic/libmkl_$(MKL_THREADS).$(SLIBEXT) \ --end-group" endif endif endif else ifneq (0,$(BLAS)) # generic ifneq (,$(AOCL_ROOT)) ifeq (0,$(BLAS_STATIC)) # shared ifeq (1,$(BLAS)) # sequential BLAS_LDFLAGS := -L$(AOCL_ROOT)/lib -lblis $(BLAS_LDFLAGS) else # multi-threaded BLAS_LDFLAGS := -L$(AOCL_ROOT)/lib -lblis-mt $(BLAS_LDFLAGS) endif else # static ifeq (1,$(BLAS)) # sequential BLAS_LDFLAGS := $(AOCL_ROOT)/lib/libblis.a $(BLAS_LDFLAGS) else # multi-threaded BLAS_LDFLAGS := $(AOCL_ROOT)/lib/libblis-mt.a $(BLAS_LDFLAGS) endif endif else ifeq (,$(strip $(BLASLIB))) ifneq (1,$(BLAS)) ifneq (0,$(OMP)) BLAS_THREADS := o else BLAS_THREADS := p endif endif BLASDIR ?= $(wildcard $(LIBFIND) /usr/lib /usr/lib64) BLASROOT := $(wildcard $(patsubst %,%/..,$(BLASDIR))) ifeq (0,$(BLAS_STATIC)) # shared BLASTEST := $(filter-out -static,$(SLDFLAGS)) $(patsubst %,-L%,$(BLASDIR)) else # static BLASTEST := $(SLDFLAGS) $(patsubst %,-L%,$(BLASDIR)) endif BLASLIBFILE := $(call libpath,$(LD),$(BLASTEST),openblas$(BLAS_THREADS)$(BLAS_BITS)) ifeq 
(,$(BLASLIBFILE)) # newer distributions symlink non-decorated library to threaded OpenBLAS BLASLIBFILE := $(call libpath,$(LD),$(BLASTEST),openblas$(BLAS_BITS)) endif # most people expect to pickup OpenBLAS (if available) even when libblas/liblapack are available; use OPENBLAS=0 to avoid this ifeq (0,$(PGI)) OPENBLAS := $(if $(BLASLIBFILE),1,0) #else ifeq (Darwin,$(UNAME)) # Apple #OPENBLAS := 0 #BLASREF := 0 else # PGI OPENBLAS := 0 endif ifneq (0,$(OPENBLAS)) # OpenBLAS # OpenBLAS also carries the CBLAS bits BLAS_FLAGS += -D__BLAS=$(BLAS) -D__OPENBLAS ifneq (,$(wildcard $(patsubst %,%/include/openblas/f77blas.h,$(BLASROOT)))) BLAS_FLAGS += -D__OPENBLAS77 -I$(call qdir,$(firstword $(wildcard $(patsubst %,%/include/openblas/f77blas.h,$(BLASROOT))))) else ifneq (,$(wildcard $(patsubst %,%/include/x86_64-linux-gnu/f77blas.h,$(BLASROOT)))) BLAS_FLAGS += -D__OPENBLAS77 -I$(call qdir,$(firstword $(wildcard $(patsubst %,%/include/x86_64-linux-gnu/f77blas.h,$(BLASROOT))))) else ifneq (,$(wildcard $(patsubst %,%/include/f77blas.h,$(BLASROOT)))) BLAS_FLAGS += -D__OPENBLAS77 -I$(call qdir,$(firstword $(wildcard $(patsubst %,%/include/f77blas.h,$(BLASROOT))))) endif else ifneq (0,$(filter 0,$(BLASREF))) # BLAS (reference) BLAS_FLAGS += -D__BLAS ifeq (Windows_NT,$(UNAME)) # no particular action about static linkage (use DLL) BLASLIBFILE := $(call libpath,$(LD),$(BLASTEST),blas$(BLAS_BITS).dll) else BLASLIBFILE := $(call libpath,$(LD),$(BLASTEST),blas$(BLAS_BITS)) endif endif ifneq (,$(BLASLIBFILE)) ifeq (0,$(BLAS_STATIC)) # shared BLAS_LDFLAGS += $(call abslib,$(BLASLIBFILE)) else # static BLAS_LDFLAGS += $(BLASLIBFILE) endif BLASREF ?= 1 else ifneq (,$(call ctest,$(LD),-framework Accelerate)) # macOS BLAS_INCFILE := $(shell $(LD) $(call linkopt,-v) 2>&1 \ | grep -i frameworks 2>/dev/null \ | xargs -I {} find {} -name cblas.h 2>/dev/null \ | head -n1) ifneq (,$(BLAS_INCFILE)) BLAS_FLAGS += -I$(call qapath,$(call qdir,$(BLAS_INCFILE))) endif #EXCLUDE_VALUE += -framework 
Accelerate BLAS_LDFLAGS += -framework Accelerate BLAS_FLAGS += -D__CBLAS else # fallback BLAS_LDFLAGS += $(call ldclib,$(LD),$(SLDFLAGS),blas) BLASREF ?= 1 endif BLASREF ?= 0 ifneq (0,$(BLASREF)) # BLAS (reference) ifeq (,$(GFC)$(filter-out 0,$(XSMM_GCC))) ifneq (0,$(PGI)) BLAS_FLAGS += -D__CBLAS endif endif endif ifneq (,$(filter-out 0,$(OPENBLAS) $(BLASREF))) # OpenBLAS or RefBLAS ifeq (0,$(OMP)) # Fortran compiler cannot link OpenMP runtime ifeq (,$(call libpath,$(FLD),$(SLDFLAGS) $(OMPFLAG_FORCE) $(foreach LIB,$(LIBFIND),$(call abslibpath,$(LIB))),$(OMPRT))) BLAS_CLDFLAGS += $(XLIB_BEGIN) $(call abslib,$(OMPLIBFILE)) $(XLIB_END) else # common link BLAS/OMP-link flags (all compilers) BLAS_LDFLAGS += $(XLIB_BEGIN) $(call abslib,$(OMPLIBFILE)) $(XLIB_END) endif endif endif else # BLAS library is specified via BLASLIB #BLASTEST := $(NULL) BLAS_FLAGS += -D__BLAS=$(BLAS) ifneq (,$(findstring openblas,$(call qndir,$(BLASLIB)))) BLAS_FLAGS += -D__OPENBLAS endif ifneq (./,$(firstword $(BLASDIR))$(call qdir,$(BLASLIB))) ifeq (./,$(call qdir,$(BLASLIB))) BLAS_LDFLAGS += $(call abslib,$(firstword $(BLASDIR))/$(if $(call qsuffix,$(BLASLIB)),$(BLASLIB),lib$(BLASLIB).$(BLASLIBEXT))) else BLAS_LDFLAGS += $(call abslib,$(if $(call qsuffix,$(BLASLIB)),$(BLASLIB),$(BLASLIB).$(BLASLIBEXT))) endif else # fallback BLAS_LDFLAGS += $(call ldclib,$(LD),$(SLDFLAGS),$(BLASLIB)) endif endif endif ifneq (,$(GFC)$(filter-out 0,$(XSMM_GCC))) ifneq (,$(LIBGFORTRAN)) ifneq (0,$(DEPSTATIC)) BLAS_CLDFLAGS += $(XLIB_BEGIN) $(LIBGFORTRAN) $(XLIB_END) else BLAS_LDFLAGS += $(call abslibrpath,$(LIBGFORTRAN)) endif endif ifneq (,$(QUADMATH)) ifneq (0,$(DEPSTATIC)) BLAS_CLDFLAGS += $(XLIB_BEGIN) $(QUADMATH) $(XLIB_END) else BLAS_LDFLAGS += $(call abslibrpath,$(QUADMATH)) endif endif ifneq (,$(LIBM)) ifneq (0,$(DEPSTATIC)) BLAS_CLDFLAGS += $(XLIB_BEGIN) $(LIBM) $(XLIB_END) else BLAS_LDFLAGS += $(call abslibrpath,$(LIBM)) endif endif endif cleanup = $(foreach flag,$(subst //,/,$1),$(flag)) # cleanup 
eventually duplicated flags and slashes cleanld = $(strip $(shell echo "$1 $2 $3 $4 $5 $6 $7 $8 $9 ${10} ${11} ${12}" | sed \ -e "s/[[:space:]]$(XLIB_BEGIN)[[:space:]]$(XLIB_END)/ /g" \ -e "s/[[:space:]]$(XLIB_END)[[:space:]]$(XLIB_BEGIN)/ /g" \ -e "s/\/\//\//g")) # common runtime libraries LDFLAGS := $(call uniqmov,LDFLAGS,LIBPTHREAD,$(XLIB_BEGIN) $(LIBPTHREAD) $(XLIB_END)) LDFLAGS := $(call uniqmov,LDFLAGS,LIBCPP,$(XLIB_BEGIN) $(LIBCPP) $(XLIB_END)) ifeq (0,$(INTEL)) LDFLAGS := $(call uniqadd,LDFLAGS,LIBM,$(XLIB_BEGIN) $(LIBM) $(XLIB_END)) endif ifneq (0,$(MKL)) LDFLAGS := $(call uniqadd,LDFLAGS,LIBM,$(XLIB_BEGIN) $(LIBM) $(XLIB_END)) endif LDFLAGS := $(call uniqadd,LDFLAGS,LIBRT,$(XLIB_BEGIN) $(LIBRT) $(XLIB_END)) LDFLAGS := $(call uniqadd,LDFLAGS,LIBDL,$(XLIB_BEGIN) $(LIBDL) $(XLIB_END)) # no-BLAS flags: cleanup and extra flags; merged ("=" rather than ":=") NOBLAS_CXXCLEAN := $(call cleanup,$(PICFLAG) $(CXXFLAGS) $(ECXXFLAGS) $(EFLAGS)) NOBLAS_CXXFLAGS = $(NOBLAS_CXXCLEAN) NOBLAS_FCCLEAN := $(call cleanup,$(PICFLAG) $(FCFLAGS) $(EFCFLAGS) $(EFLAGS)) NOBLAS_FCFLAGS = $(NOBLAS_FCCLEAN) NOBLAS_CCLEAN := $(call cleanup,$(PICFLAG) $(CFLAGS) $(ECFLAGS) $(EFLAGS)) NOBLAS_CFLAGS = $(NOBLAS_CCLEAN) NOBLAS_FLAGS = $(call cleanup,$(filter-out -D__BLAS%,$(DFLAGS))) NOBLAS_LDCLEAN := $(call cleanld,$(LDFLAGS) $(ELDFLAGS)) NOBLAS_LDFLAGS = $(NOBLAS_LDCLEAN) NOBLAS_CLDCLEAN := $(call cleanld,$(CLDFLAGS)) NOBLAS_CLDFLAGS = $(NOBLAS_CLDCLEAN) NOBLAS_IFLAGS = $(call cleanup,$(IFLAGS)) # regular flags: cleanup CXXFLAGS := $(call cleanup,$(BLAS_CXXFLAGS) $(NOBLAS_CXXFLAGS)) FCFLAGS := $(call cleanup,$(BLAS_FCFLAGS) $(NOBLAS_FCFLAGS)) CFLAGS := $(call cleanup,$(BLAS_CFLAGS) $(NOBLAS_CFLAGS)) DFLAGS := $(call cleanup,$(BLAS_FLAGS) $(NOBLAS_FLAGS)) IFLAGS := $(call cleanup,$(BLAS_IFLAGS) $(NOBLAS_IFLAGS)) CLDFLAGS := $(call cleanld,$(BLAS_CLDFLAGS) $(NOBLAS_CLDFLAGS)) LDFLAGS := $(call cleanld,$(BLAS_LDFLAGS) $(NOBLAS_LDFLAGS)) CXXLDFLAGS := $(call cleanld,$(CXXLDFLAGS)) FLDFLAGS := 
$(call cleanld,$(FLDFLAGS)) .PRECIOUS: $(BLDDIR)/%-cpp.o $(BLDDIR)/%-c.o $(BLDDIR)/%-f.o \ $(BLDDIR)/%-f90.o $(BLDDIR)/%-f90.o $(BLDDIR)/%-f77.o \ %/.make .SUFFIXES: # applyif(A1,A2,A3,A4) evaluates to A4 if A1 (precondition) is non-zero and if the basenames (A2, A3) match applyif = $(if $(filter 0,$1),$(NULL),$(if $(filter $2,$(call qname,$(call qndir,$3))),$(if $1,$4))) # derives the extension of a filename extname = $(subst .,,$(call qsuffix,$(1))) # derives the name of an object files for a given source file objname = $(foreach ARG,$(1),$(addprefix $(BLDDIR)/,$(patsubst %$(call qsuffix,$(ARG)),%-$(call extname,$(ARG)).o,$(call qndir,$(ARG))))) MAKSTATE := $(sort $(shell grep -how '$(VNAME)' $(ROOTDIR)/Makefile.inc $(MAKEFILE_LIST) 2>/dev/null)) STATE := $(sort $(foreach V,$(filter $(MAKSTATE),$(filter-out \ $(EXCLUDE_STATE) EXCLUDE_% .% _% XSMM_% HAVE_% LIB_% %STATE MAKE_% MAKELEVEL MAKEFILE_LIST MAKEOVERRIDES MAKEFLAGS SHELL BLDDIR REVERSION% AVX SSE \ COMPILER_VERSION_FLAG CC_VERSION_% CXX_VERSION_% FC_VERSION_% GCC_VERSION_% CINFO CP DEPDIR DFLAGS FINFO ENVBIN EXTDEP FORTDEP FSTD GINFO DYNAMIC \ DLIBEXT FLOCK LIBEXT ICX ILIBEXT LIBNAME LICFILE MAKEINC MV ROOTDIR SLIBEXT XLNK% CHAR_OPEN CHAR_CLOSE CHAR_HASH MKTEMP OMPLIBFILE BLAS_INCFILE \ CPEDANTIC FREEFORM FFORM_FLAG FORTLIB FIXFC INCDIR OUTDIR SRCDIR XLIB_% GFC CLANG INTEL CRAY GCCBIN FMFLAGS FCLDFLAGS EXTLIB PYTHON% HOST% TMUX \ GNU CCAR CLDFLAGS LIBGFORTRAN PGI FPEDANTIC FWARNEXTRA XGROUP_BEGIN XGROUP_END SLDFLAGS PICFLAG PIC OPTFLAG OPT OMPFLAG_FORCE FORCE_CXX VNAME \ NOBLAS% DNOBLAS MAINLIB FLDFLAGS OPSYS LIBDEP LIBFIND LIBDL LIBRT LIBM LIBCPP BLAS_FLAGS BLAS_LDFLAGS BLAS_THREADS SUITE PEDANTIC CMAIN \ LIBPTHREAD CSTD CWARNEXTRA QUADMATH BLASDIR BLASLIBEXT BLASTEST NO_OFFLOAD_FLAG FYPPEXE INCLUDEMAKE LIBDIR OBJDIR ONEVERSION XDG_DATA_DIRS \ HEREDIR HOMEDIR UNUSED TEST% SHARED PYMOD MINGW FMAIN VERSION ACC% LESS% HOME% USER NAME LOGNAME LANG %PWD %PATH TERM% MOTD% SHLVL %COLORS \ RUNGCOV RUNXCOV CODECOV 
UPLOAD POSTPROC \ $(if $(filter-out $(FTARGET),$(CTARGET)),TARGET,CTARGET FTARGET) \ $(if $(filter $(BLAS_STATIC),$(DEPSTATIC)),BLAS_STATIC) \ $(if $(filter 0,$(TBB_MALLOC)),TBB_%) \ $(if $(filter 0,$(MKL)),MKL_%), \ $(.VARIABLES))),$(if $(filter-out default automatic,$(origin $V)),$(if $($V),$V)))) ENVSTATE := $(strip $(foreach V,$(STATE),$(if $(filter environment%,$(origin $V)),$V))) VALSTATE := $(foreach V,$(STATE),$V=$(subst \ $(USER),$(if $(USER),$$USER),$(subst \ $(HOME),$(if $(HOME),$$HOME),$(subst \ $(HOMEDIR),$(if $(HOMEDIR),$$HOME),$(filter-out $(EXCLUDE_VALUE),$($V)))))?) DIRSTATE ?= . PRESTATE := $(shell echo '$(VALSTATE)' | $(DEPDIR)/.state.sh $(DIRSTATE)) $(DIRSTATE)/.state: $(firstword $(PRESTATE)) %/.make: @mkdir -p $(call qbname,$@) @touch $@ .make: @touch $@ .DEFAULT_GOAL := $(NULL) libxsmm-1.17/README.md000066400000000000000000001236751415223013700144100ustar00rootroot00000000000000# LIBXSMM [![License](https://img.shields.io/badge/license-BSD3-blue.svg "BSD 3-Clause License")](LICENSE.md) [![Build status](https://badge.buildkite.com/2e962d4cfc7ddb10a6cd6c27b0d8033edf179a799e156cb363.svg "Buildkite Status")](https://github.com/hfp/libxsmm/wiki/Status) [![Coverity](https://scan.coverity.com/projects/7405/badge.svg "Coverity Analysis Status")](https://scan.coverity.com/projects/hfp-libxsmm) [![ReadtheDocs](https://readthedocs.org/projects/libxsmm/badge/?version=latest "Read the Docs")](https://libxsmm.readthedocs.io/) LIBXSMM is a library for specialized dense and sparse matrix operations as well as for deep learning primitives such as small convolutions. The library is targeting Intel Architecture with Intel SSE, Intel AVX, Intel AVX2, Intel AVX‑512 (with VNNI and Bfloat16), and Intel AMX (Advanced Matrix Extensions) supported by future Intel processor code-named Sapphire Rapids. 
Code generation is mainly based on Just‑In‑Time (JIT) code specialization for compiler-independent performance (matrix multiplications, matrix transpose/copy, sparse functionality, and deep learning). LIBXSMM is suitable for "build once and deploy everywhere", i.e., no special target flags are needed to exploit the available performance. Supported GEMM datatypes are: `FP64`, `FP32`, `bfloat16`, `int16`, and `int8`.

For a list of questions and answers, please also have a look at [https://github.com/hfp/libxsmm/wiki/Q&A](https://github.com/hfp/libxsmm/wiki/Q&A).

**Where to go for documentation?**

* **ReadtheDocs**: [main](https://libxsmm.readthedocs.io/) and [sample](https://libxsmm.readthedocs.io/libxsmm_samples/) documentation with full text search.
* **PDF**: [main](https://github.com/hfp/libxsmm/raw/master/documentation/libxsmm.pdf) documentation file, and separate [sample](https://github.com/hfp/libxsmm/raw/master/documentation/libxsmm_samples.pdf) documentation.
* **Articles**: [magazine article](https://software.intel.com/sites/default/files/parallel-universe-issue-34.pdf) incl. [sample code](https://github.com/hfp/libxsmm/tree/master/samples/magazine) (full list of [Articles](#articles)).

**Getting Started**: The following C++ code is focused on a specific functionality but may be considered as [Hello LIBXSMM](https://github.com/hfp/libxsmm/tree/master/samples/hello). Build the example with `cd /path/to/libxsmm; make STATIC=0` (shared library), save the code under `hello.cpp` (below) and compile with `g++ -I/path/to/libxsmm/include hello.cpp -L/path/to/libxsmm/lib -lxsmm -lblas -o hello` (GNU GCC), and finally execute with `LD_LIBRARY_PATH=/path/to/libxsmm/lib LIBXSMM_VERBOSE=2 ./hello`.
```cpp
#include <libxsmm.h>
#include <vector>

int main(/*int argc, char* argv[]*/) {
  typedef double T;
  int batchsize = 1000, m = 13, n = 5, k = 7;
  std::vector<T> a(batchsize * m * k), b(batchsize * k * n), c(m * n, 0);
  /* C/C++ and Fortran interfaces are available */
  typedef libxsmm_mmfunction<T> kernel_type;
  /* generates and dispatches a matrix multiplication kernel (C++ functor) */
  kernel_type kernel(LIBXSMM_GEMM_FLAG_NONE, m, n, k, 1.0 /*alpha*/, 1.0 /*beta*/);
  assert(kernel);
  for (int i = 0; i < batchsize; ++i) { /* initialize input */
    for (int ki = 0; ki < k; ++ki) {
      for (int j = 0; j < m; ++j) a[i * j * ki] = static_cast<T>(1) / ((i + j + ki) % 25);
      for (int j = 0; j < n; ++j) b[i * j * ki] = static_cast<T>(7) / ((i + j + ki) % 75);
    }
  }
  /* kernel multiplies and accumulates matrices: C += Ai * Bi */
  for (int i = 0; i < batchsize; ++i) kernel(&a[i * m * k], &b[i * k * n], &c[0]);
}
```

Plain [C code](https://github.com/hfp/libxsmm/blob/master/samples/hello/hello.c) as well as [Fortran code](https://github.com/hfp/libxsmm/blob/master/samples/hello/hello.f) resemble the same [example](https://github.com/hfp/libxsmm/tree/master/samples/hello).

**What is a small matrix multiplication?** When characterizing the problem-size by using the M, N, and K parameters, a problem-size suitable for LIBXSMM falls approximately within (M N K)<sup>1/3</sup> <= 64 (which illustrates that non-square matrices or even "tall and skinny" shapes are covered as well). The library is typically used to generate code up to the specified [threshold](documentation/libxsmm_tune.md#auto-dispatch). Raising the threshold may not only generate excessive amounts of code (due to unrolling in M or K dimension), but also miss to implement a tiling scheme to effectively utilize the cache hierarchy. For auto-dispatched problem-sizes above the configurable threshold (explicitly JIT'ted code is **not** subject to the threshold), LIBXSMM is falling back to BLAS.
In terms of GEMM, the supported kernels are limited to *Alpha := 1*, *Beta := \{ 1, 0 \}*, and *TransA := 'N'*. **What is a small convolution?** In the last years, new workloads such as deep learning and more specifically convolutional neural networks (CNN) emerged and are pushing the limits of today's hardware. One of the expensive kernels is a small convolution with certain kernel sizes such that calculations in the frequency space is not the most efficient method when compared with direct convolutions. LIBXSMM's current support for convolutions aims for an easy to use invocation of small (direct) convolutions, which are intended for CNN training and classification. ## Interfaces and Domains ### Overview Please have a look at [https://github.com/hfp/libxsmm/tree/master/include](https://github.com/hfp/libxsmm/tree/master/include) for all published functions. Get started with the following list of available domains and documented functionality: * MM: [Matrix Multiplication](#matrix-multiplication) * DNN: [Deep Neural Networks](#deep-neural-networks) * AUX: [Service Functions](#service-functions) * PERF: [Performance](#performance) * BE: [Backend](#jit-backend) To initialize library internal resources, an explicit initialization routine helps to avoid lazy initialization overhead when calling LIBXSMM for the first time. The library deallocates internal resources at program exit, but also provides a companion of the afore mentioned initialization (finalize). ```C /** Initialize the library; pay for setup cost at a specific point. */ void libxsmm_init(void); /** De-initialize the library and free internal memory (optional). */ void libxsmm_finalize(void); ``` ### Matrix Multiplication This domain (MM) supports Small Matrix Multiplications (SMM), batches of multiple multiplications as well as the industry-standard interface for GEneral Matrix Matrix multiplication (GEMM). 
The [Matrix Multiplication domain (MM)](documentation/libxsmm_mm.md) contains routines for: * [Small, tiled, and parallelized matrix multiplications](documentation/libxsmm_mm.md#overview) * [Manual code dispatch (customized matrix batches)](documentation/libxsmm_mm.md#manual-code-dispatch) * [Batched multiplication (explicit interface)](documentation/libxsmm_mm.md#batched-multiplication) * [Call wrapper (static and dynamic linkage)](documentation/libxsmm_mm.md#call-wrapper) ### Deep Learning This domain (DL) is detailed by a separate [document](documentation/libxsmm_dl.md). It may be inspiring to have a look at the lightweight GxM framework, which uses LIBXSMM for end-to-end Deep Learning. ### Service Functions For convenient operation of the library and to ease integration, some service routines are available. These routines may not belong to the core functionality of LIBXSMM (SMM or DNN domain), but users are encouraged to use this domain (AUX). There are two categories: (1) routines which are available for C and FORTRAN, and (2) routines that are only available per C interface. The [service function domain (AUX)](documentation/libxsmm_aux.md) contains routines for: * [Getting and setting the target architecture](documentation/libxsmm_aux.md#getting-and-setting-the-target-architecture) * [Getting and setting the verbosity](documentation/libxsmm_aux.md#getting-and-setting-the-verbosity) * [Measuring time durations (timer)](documentation/libxsmm_aux.md#timer-facility) * [Dispatching user-data and multiple kernels](documentation/libxsmm_aux.md#user-data-dispatch) * [Loading and storing data (I/O)](documentation/libxsmm_aux.md#meta-image-file-io) * [Allocating memory](documentation/libxsmm_aux.md#memory-allocation) ### Backend More information about the JIT-backend and the code generator can be found in a separate [document](documentation/libxsmm_be.md). 
The [encoder sample collection](https://github.com/hfp/libxsmm/tree/master/samples/encoder) can help to get started writing a kernel using LIBXSMM. Please note, LIBXSMM's stand-alone [generator-driver](documentation/libxsmm_be.md#generator-driver) is considered legacy (deprecated). ## Build Instructions ### Overview The main interface file is *generated*, and it is therefore **not** stored in the code repository. Instead, one may have a look at the code generation template files for [C/C++](https://github.com/hfp/libxsmm/blob/master/src/template/libxsmm.h#L36) and [FORTRAN](https://github.com/hfp/libxsmm/blob/master/src/template/libxsmm.f#L32). There are two ways prepared to build and use LIBXSMM: * [Classic Library (ABI)](#classic-library-abi) and [Link Instructions](#link-instructions) (C/C++ and FORTRAN) * [Header-Only](#header-only) (C and C++) **Note**: LIBXSMM is available as prebuilt package for Fedora/RedHat/CentOS, Ubuntu, and FreeBSD. Further, LIBXSMM can be installed with the [Spack Package Manager](http://computation.llnl.gov/projects/spack-hpc-package-manager) or per [EasyBuild+EasyConfig](https://github.com/easybuilders). ### Classic Library (ABI) The build system relies on GNU Make (typically associated with the `make` command, but e.g. FreeBSD is calling it `gmake`). The build can be customized by using key‑value pairs. Key‑value pairs can be supplied in two ways: (1) after the "make" command, or (2) prior to the "make" command (`env`) which is effectively the same as exporting the key‑value pair as an environment variable (`export`, or `setenv`). Both methods can be mixed (the second method may require make's `-e` flag). In contrast to [header-only](#zero-config) which does not require configuration by default, 3rd-party build systems can compile and link LIBXSMM's sources but still avoid configuring the library (per `libxsmm_config.py`). The prerequisite to omit configuration is to opt-in by defining LIBXSMM_DEFAULT_CONFIG (`-D`). 
The zero-config feature is not available for LIBXSMM's Fortran interface. **Note**: By default, C/C++ and FORTRAN compilers are needed (some sample code is written in C++). Beside of specifying the compilers (`make CXX=g++ CC=gcc FC=gfortran` and maybe `AR=ar`), the need for a FORTRAN compiler can be relaxed (`make FC=` or `make FORTRAN=0`). The latter affects the availability of the MODule file and the corresponding `libxsmm.f` library (the interface `libxsmm.f` is still generated). The build system considers a set of given key-value pairs as a single unique build and triggers a rebuild for a distinct set of flags. For more advanced builds or additional background, please consult the section about [Customization](documentation/libxsmm_tune.md). To generate the interface of the library inside of the `include` directory and to build the static library (by default, STATIC=1 is activated). Run any (or both) of the following command(s): ```bash make STATIC=0 make ``` On CRAY systems, the CRAY Compiling Environment (CCE) should be used regardless of utilizing the CRAY compiler, the Intel Compiler, or the GNU Compiler Collection (GCC). The CCE is eventually suppressing to build shared libraries (STATIC=0). In any case, (1) switch to the desired compiler (module load/switch), and (2) rely on: ```bash make CXX=CC CC=cc FC=ftn ``` A variety of build environments is out-of-the-box compatible, see [https://github.com/hfp/libxsmm/wiki/Compatibility](https://github.com/hfp/libxsmm/wiki/Compatibility). If the build process is not successful, it may help to avoid advanced GCC flags. This is useful with a tool chain, which pretends to be GCC-compatible (and is treated as such) but fails to consume the afore mentioned flags: ```bash make COMPATIBLE=1 ``` In case of outdated Binutils, compilation can fail to assemble code when building the library (this has nothing to do with JIT-generated code and it does not affect how JIT-code is targeting the system). 
LIBXSMM implements some functionality using compiler-intrinsics and multiple code-paths which are scheduled according to CPUID. In contrast to `INTRINSICS=2` (default), `INTRINSICS=1` enables a fully static code path according to the desired target. If no target is given (e.g., `AVX=3`, or `AVX=2`), instruction set extensions cannot be leveraged for such code-paths. Try to fix failing compilation by building the latest GNU Binutils (and `export PATH=/path/to/binutils/bin:${PATH}`). Binutils are versioned independently of GNU GCC and other compilers. If one cannot update Binutils, work around with a CPUID-value as tabulated in [libxsmm_cpuid.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_cpuid.h): start at the upper end (less than 1999) and decrement until compilation passes (make INTRINSICS=_CPUID_, e.g., `make INTRINSICS=1021`). As a last resort, rely on a fully static code path: ```bash make INTRINSICS=1 ``` To test and validate a build, please consult [https://github.com/hfp/libxsmm/wiki/Validation](https://github.com/hfp/libxsmm/wiki/Validation). To run some basic sanity checks, remember that each set of given key-value pairs represents a different build (and test): ```bash make STATIC=0 tests ``` To remove intermediate files, or to remove all generated files and folders (including the interface and the library archives), run one of the make-targets below. An additional distclean-target recursively cleans the entire tree (after version 1.9). ```bash make clean make realclean ``` FORTRAN code can make use of LIBXSMM: * By using the module and linking with `libxsmmf`, `libxsmm`, and (optionally) `libxsmmext`, * By including `libxsmm.f` and linking with `libxsmm`, and (optionally) `libxsmmext`, or * By (implicitly) calling a SUBROUTINE and linking with `libxsmm`, and (optionally) `libxsmmext`. **Note**: Using the Fortran module or including the interface, requires at least a Fortran 2003 compiler (F2K3). 
FORTRAN 77 compatibility is only implicitly available (no interface), and the available subset of routines is documented in `libxsmm.f` and marked with [comments](https://github.com/hfp/libxsmm/search?q=implementation+provided+for+Fortran+77+compatibility) (part of the implementation). ### Header-Only Version 1.4.4 introduced support for "header-only" usage in C and C++. By only including `libxsmm_source.h` allows to get around building the library. However, this gives up on a clearly defined application binary interface (ABI). An ABI may allow for hot-fixes after deploying an application (when relying on the shared library form), and it may also ensure to only rely on the public interface of LIBXSMM. In contrast, the header-only form not only exposes the internal implementation of LIBXSMM but can also increase the turnaround time during development of an application (due to longer compilation times). The header file is intentionally named "libxsmm_**source**.h" since this header file relies on the [src](https://github.com/hfp/libxsmm/tree/master/src) directory (with the implications as noted earlier). The header-only form depends on `libxsmm_source.h` which is *generated* according to the content of the source folder (`src`). LIBXSMM 1.16 (and later) provides header-only support without invoking a make-target (zero configuration) for any given checkout of LIBXSMM. To use configured header-only (non-default), LIBXSMM_CONFIGURED must be defined (`-D`). Previously, it was necessary to invoke `make header-only` (v1.6.2 or later), `make cheader` (prior to v1.6.2), or any target building the library (`make`). The zero-config feature allows 3rd-party build systems an easier integration of LIBXSMM, which also holds true if the system builds LIBXSMM from source (see [classic ABI](#zero-config-abi)). Fortran code may [include](#header-only-fortran) `libxsmm.f` but still requires that interface to be generated. 
**Note**: building an application applies the same build settings to LIBXSMM! For instance, to omit debug code inside of LIBXSMM `NDEBUG` must be defined (`-DNDEBUG`).

## Link Instructions

Using the [classic ABI](#classic-library-abi) (including [Fortran](#fortran) code), requires linking LIBXSMM against the application. The library is agnostic with respect to the threading-runtime, and therefore an application is free to use any threading runtime (e.g., OpenMP). The library is also thread-safe, and multiple application threads can call LIBXSMM's routines concurrently. Enabling OpenMP for LIBXSMM's main library is supported as well (OMP=1), and mostly affects the synchronization primitives used inside of the library. All of the "omp" functionality (function postfix) is served by the `libxsmmext` library, which is automatically built with OpenMP enabled. When using this "omp" functionality, `libxsmmext` needs to be present at the link line.

Library | Purpose
:-------------|---------
libxsmm | Thread-safe core functions (same routine can be called concurrently). Contains routines that can take a thread-ID and the number of library-external threads.
libxsmmf | Necessary when using the Fortran MODule but not when including `libxsmm.f` or relying on implicit interfaces ([Fortran 77](https://github.com/hfp/libxsmm/search?q=implementation+provided+for+Fortran+77+compatibility)).
libxsmmext | Provides library-internal OpenMP-threaded functions carrying the `omp` postfix when compared to function names of the core library.
libxsmmnoblas | Supplies faked symbols for `dgemm` (and others) and thereby removes the need to link against a LAPACK/BLAS library.

To ease linking with LIBXSMM, `pkg-config` can be used.
For example: ```bash export PKG_CONFIG_PATH=/path/to/libxsmm/lib pkg-config libxsmm --libs ``` Similarly, an application is free to choose any BLAS or LAPACK library (if the link model available on the OS supports this), and therefore linking GEMM routines when linking LIBXSMM itself (by supplying BLAS=1|2) may prevent a user from making this decision at the time of linking the actual application. To use LIBXSMM without GEMM-related functionality, any BLAS-dependency can be removed in two ways: (1) building a special library with `make BLAS=0`, or (2) linking the application against the `libxsmmnoblas` library. If an application however uses BLAS already, the [Call Wrapper](documentation/libxsmm_mm.md#call-wrapper) can be used to intercept existing BLAS calls (and to rely on LIBXSMM instead). **Note**: LIBXSMM does not support to dynamically link `libxsmm` or `libxsmmext` ("so"), when BLAS is linked statically ("a"). If BLAS is linked statically, the static version of LIBXSMM must be used! ### Installation There are two main mechanisms to install LIBXSMM (both mechanisms can be combined): (1) building the library in an out‑of‑tree fashion, and (2) installing into a certain location. Building in an out‑of‑tree fashion looks like: ```bash cd libxsmm-install make -f /path/to/libxsmm/Makefile ``` Installation into a specific location looks like (`PREFIX` or `DESTDIR`): ```bash make MNK="1 2 3 4 5" PREFIX=/path/to/libxsmm-install install ``` Both `PREFIX` and `DESTDIR` are equivalent and can be relative or absolute paths. An installation can be repeated for different locations without triggering a rebuild. The prefix directory *inside* of each of the [package configuration files](#pkg-config) is set to where LIBXSMM is built (staging folder) unless `PREFIX` or `DESTDIR` is specified. The effect of `PREFIX` (or `DESTDIR`) with respect to the pkg-config files is independent of whether the install-target is invoked or not (make). 
Further, performing `make install-minimal` omits the documentation (default: `PREFIX/share/libxsmm`). Moreover, PINCDIR, POUTDIR, PBINDIR, and PDOCDIR allow to customize the locations underneath of the PREFIX location. To build a general package for an unpredictable audience (Linux distribution, or similar), it is advised to not over-specify or customize the build step, i.e., JIT, SSE, AVX, OMP, BLAS, etc. should not be used. The following is building and installing a complete set of libraries where the generated interface matches both the static and the shared libraries: ```bash make PREFIX=/path/to/libxsmm-install STATIC=0 install make PREFIX=/path/to/libxsmm-install install ``` ## Runtime Control ### Handling Errors The library handles errors with mechanisms available to the C programming language (no exceptions). The backend uses result codes passed by an argument rather than an actual return value. Such an argument is often a descriptor (struct) guiding and covering the state of the code generation. The frontend however may not hand-out any error state, which can be a big relief on the call-side. Instead, the frontend implements a [verbose mode](#verbose-mode) to inform about unexpected input or an error captured from the backend. Guiding principles of LIBXSMM are muted operation by default (non-verbose) and no unexpected exit from execution. ### Verbose Mode The [verbose mode](documentation/libxsmm_aux.md#getting-and-setting-the-verbosity) (level of verbosity) allows for an insight into the code dispatch mechanism by receiving a small tabulated statistic as soon as the library terminates. The design point for this functionality is to not impact the performance of any critical code path, i.e., verbose mode is always enabled and does not require symbols (SYM=1) or debug code (DBG=1). The statistics appears (`stderr`) when the environment variable LIBXSMM_VERBOSE is set to a non-zero value. For example: ```bash LIBXSMM_VERBOSE=1 ./myapplication [... 
application output] HSW/SP TRY JIT STA COL 0..13 0 0 0 0 14..23 0 0 0 0 24..128 3 3 0 0 ``` The tables are distinct between single-precision and double-precision, but either table is pruned if all counters are zero. If both tables are pruned, the library shows the code path which would have been used for JIT'ting the code: `LIBXSMM_TARGET=hsw` (otherwise the code path is shown in the table's header). The actual counters are collected for three buckets: small kernels (MNK<sup>1/3</sup> <= 13), medium-sized kernels (13 < MNK<sup>1/3</sup> <= 23), and larger kernels (23 < MNK<sup>1/3</sup> <= 64; the actual upper bound depends on LIBXSMM_MAX_MNK as selected at compile-time). Keep in mind, that "larger" is supposedly still small in terms of arithmetic intensity (which grows linearly with the kernel size). Unfortunately, the arithmetic intensity depends on the way a kernel is used (which operands are loaded/stored into main memory) and it is not performance-neutral to collect this information. The TRY counter represents all attempts to register statically generated kernels, and all attempts to dynamically generate and register kernels. The TRY counter includes rejected JIT requests due to unsupported GEMM arguments. The JIT and STA counters distinguish the successful cases of the aforementioned event (TRY) into dynamically (JIT) and statically (STA) generated code. In case the capacity (O(*n*) = 10<sup>5</sup>) of the code registry is exhausted, no more kernels can be registered although further attempts are not prevented. Registering many kernels (O(*n*) = 10<sup>3</sup>) may ramp the number of hash key collisions (COL), which can degrade performance. The latter is prevented if the small thread-local cache is utilized effectively. Since explicitly JIT-generated code (`libxsmm_?mmdispatch`) does not fall under the THRESHOLD criterion, the above table is extended by one line if large kernels have been requested. 
This indicates a missing threshold-criterion (customized dispatch), or asks for cache-blocking the matrix multiplication. The latter is already implemented by LIBXSMM's "medium-sized" GEMM routines (`libxsmm_?gemm_omp`), which perform a tiled multiplication. Setting a verbosity level of at least two summarizes the number of registered JIT-generated kernels, which includes the total size and counters for GEMM, MCOPY (matrix copy), and TCOPY (matrix transpose) kernels. ```bash Registry: 20 MB (gemm=0 mcopy=14 tcopy=0) ``` If the call-wrapper is used, an additional runtime statistic becomes available (see [Call Wrapper](documentation/libxsmm_mm.md#call-wrapper)). **Note**: Setting LIBXSMM_VERBOSE to a negative value will binary-dump each generated JIT kernel to a file with each file being named like the function name shown in [Intel VTune](documentation/libxsmm_prof.md#intelvtuneamplifier). Disassembly of the raw binary files can be accomplished by: ```bash objdump -D -b binary -m i386 -M x86-64 [JIT-dump-file] ``` ### Call Trace During the initial steps of employing the LIBXSMM API, one may rely on a debug version of the library (`make DBG=1`). The latter also implies console output (`stderr`) in case of an error/warning condition inside of the library. It is also possible to print the execution flow (call trace) inside of LIBXSMM (can be combined with DBG=1 or OPT=0): ```bash make TRACE=1 ``` Building an application which traces calls (inside of the library) requires the shared library of LIBXSMM, alternatively the application is required to link the static library of LIBXSMM in a dynamic fashion (GNU tool chain: `-rdynamic`). Tracing calls (without debugger) can be then accomplished by an environment variable called LIBXSMM_TRACE. 
```bash LIBXSMM_TRACE=1 ./myapplication ``` Syntactically up to three arguments separated by commas (which allows to omit arguments) are taken (*tid*,*i*,*n*): *tid* signifies the ID of the thread to be traced with 1...NTHREADS being valid and where LIBXSMM_TRACE=1 is filtering for the "main thread" (in fact the first thread running into the trace facility); grabbing all threads (no filter) can be achieved by supplying a negative id (which is also the default when omitted). The second argument is pruning higher levels of the call-tree with *i=1* being the default (level zero is the highest at the same level as the main function). The last argument is taking the number of inclusive call levels with *n=-1* being the default (signifying no filter). Although the `ltrace` (Linux utility) provides similar insight, the trace facility might be useful due to the afore mentioned filtering expressions. Please note that the trace facility is severely impacting the performance (even with LIBXSMM_TRACE=0), and this is not just because of console output but rather since inlining (internal) functions might be prevented along with additional call overhead on each function entry and exit. Therefore, debug symbols can be also enabled separately (`make SYM=1`; implied by TRACE=1 or DBG=1) which might be useful when profiling an application. ## Performance Profiling an application, which uses LIBXSMM's JIT-code is well-supported. The library supports Intel VTune Amplifier and Linux perf. Details are given on how to include profiler support, and how to run the application. * [Profiling using Intel VTune Amplifier](documentation/libxsmm_prof.md#intelvtuneamplifier) * [Profiling using Linux perf](documentation/libxsmm_prof.md#linuxperf) At build time, a variety of options exist to customize LIBXSMM. The library is setup for a broad range of use cases, which include sophisticated defaults for typical use. 
* [Customizing performance](documentation/libxsmm_tune.md#tuning) * [Tuning auto-dispatch](documentation/libxsmm_tune.md#auto-dispatch) To find performance results of applications or performance reproducers, the repository provides an orphaned branch called "results" which collects collateral material such as measured performance results along with explanatory figures. The results can be found at [https://github.com/hfp/libxsmm/tree/results#libxsmm-results](https://github.com/hfp/libxsmm/tree/results#libxsmm-results), or the results can be cloned as shown below. ```bash git clone --branch results \ https://github.com/hfp/libxsmm.git \ libxsmm-results ``` Please note that comparing performance results depends on whether the operands of the matrix multiplication are streamed or not. For example, multiplying with all matrices covered by the L1 cache may have an emphasis towards an implementation which perhaps performs worse for the real workload (if this real workload needs to stream some or all matrices from the main memory). Most of the [code samples](https://github.com/hfp/libxsmm/tree/master/samples) are aimed to reproduce performance results, and it is encouraged to model the exact case or to look at real [applications](#applications). ## Applications ### High Performance Computing (HPC) [1] [https://cp2k.org/](https://cp2k.org/): Open Source Molecular Dynamics and the [DBCSR library](https://github.com/cp2k/dbcsr), which processes batches of small matrix multiplications. The batches originate from a distributed block-sparse matrix with problem-specific small matrices. Starting with [CP2K 3.0](https://www.cp2k.org/version_history), LIBXSMM can substitute CP2K's `libsmm` library. [2] [https://github.com/SeisSol/SeisSol/](https://github.com/SeisSol/SeisSol/): SeisSol is one of the leading codes for earthquake scenarios, for simulating dynamic rupture processes. 
LIBXSMM provides highly optimized assembly kernels which form the computational back-bone of SeisSol (see [https://github.com/TUM-I5/seissol_kernels/](https://github.com/TUM-I5/seissol_kernels/). [3] [https://github.com/NekBox/NekBox](https://github.com/NekBox/NekBox): NekBox is a highly scalable and portable spectral element code, which is inspired by the [Nek5000](https://nek5000.mcs.anl.gov/) code. NekBox is specialized for box geometries and intended to prototype new methods as well as to leverage FORTRAN beyond the FORTRAN 77 standard. LIBXSMM can be used to substitute the [MXM_STD](https://github.com/Nek5000/NekBox/blob/box/mxm_std.F90) code. Please also note LIBXSMM's [NekBox reproducer](https://github.com/hfp/libxsmm/tree/master/samples/nek#nek-sample-collection). [4] [https://github.com/Nek5000/Nek5000](https://github.com/Nek5000/Nek5000): Nek5000 is the open-source, highly-scalable, always-portable spectral element code from [https://nek5000.mcs.anl.gov/](https://nek5000.mcs.anl.gov/). The development branch of the Nek5000 code [incorporates](https://github.com/Nek5000/Nek5000/blob/master/core/mxm_wrapper.f) LIBXSMM. [5] [http://pyfr.org/](http://pyfr.org/): PyFR is an open-source Python based framework for solving advection-diffusion type problems on streaming architectures by using the flux reconstruction approach. PyFR 1.6.0 optionally [incorporates LIBXSMM](http://pyfr.org/user_guide.php) as a matrix multiplication provider for the OpenMP backend. Please also note LIBXSMM's [PyFR-related code sample](https://github.com/hfp/libxsmm/tree/master/samples/pyfr). [6] [http://dial3343.org/about/](http://dial3343.org/about/): The Extreme-scale Discontinuous Galerkin Environment (EDGE) is a solver for hyperbolic partial differential equations with emphasis on seismic simulations. The EDGE [source code](https://github.com/3343/edge) optionally relies on LIBXSMM, but for high performance LIBXSMM's kernels are highly recommended. 
[7] [https://sxs-collaboration.github.io/spectre/](https://sxs-collaboration.github.io/spectre/): SpECTRE is an open-source code for multi-scale, multi-physics problems in astrophysics and gravitational physics which runs at Petascale and is designed for Exascale computers. In the future, SpECTRE may be applied to problems across discipline boundaries in fluid dynamics, geoscience, plasma physics, nuclear physics, and engineering. [8] [https://ceed.exascaleproject.org/ceed-code/](https://ceed.exascaleproject.org/ceed-code/): The Center for Efficient Exascale Discretizations (CEED) is building on the efforts of the Nek5000, MFEM, MAGMA, OCCA and PETSc projects to develop application program interfaces (APIs), both at high-level and at low-level to enable applications to take advantage of high-order methods. The CEED low-level API, [libCEED](https://ceed.exascaleproject.org/libceed/) uses LIBXSMM as a [backend](https://github.com/CEED/libCEED#backends) for high performance on CPUs. [9] [https://github.com/romeric/Fastor](https://github.com/romeric/Fastor): Fastor is a lightweight high performance tensor algebra framework for modern C++ and can optionally use LIBXSMM as [JIT-backend](https://github.com/romeric/Fastor/wiki/9.-Using-the-LIBXSMM-MKL-JIT-backend). ### Machine Learning (ML) [10] [https://github.com/plaidml/plaidml](https://github.com/plaidml/plaidml): PlaidML is an open source tensor compiler aiming for performance portability across a wide range of CPUs, GPUs and other accelerators. Combined with Intel’s nGraph compiler, PlaidML is targeting popular deep learning frameworks such as PyTorch, Keras (TensorFlow), and OpenVino. [PlaidML/v1](https://github.com/plaidml/plaidml/tree/plaidml-v1) (development branch) adopted [MLIR](https://mlir.llvm.org/), an extensible compiler infrastructure gaining industry-wide adoption. PlaidML/v1 started using LIBXSMM as backend for targeting CPUs. 
[11] [https://github.com/intel/intel-extension-for-pytorch](https://github.com/intel/intel-extension-for-pytorch): Intel Extension for PyTorch aims for a smooth user experience of PyTorch on CPUs by the means of good performance. The extension pack started to rely on [LIBXSMM for achieving high performance on CPUs](https://arxiv.org/abs/2005.04680). [12] [https://www.tensorflow.org/](https://tensorflow.org/): TensorFlow™ is an open source software library for numerical computation using data flow graphs. TensorFlow was originally developed by researchers and engineers working on the Google Brain Team for the purposes of conducting machine learning and deep neural networks research. LIBXSMM was once [used](documentation/tensorflow.md) to increase the performance of TensorFlow on Intel hardware. [13] [https://github.com/IntelLabs/SkimCaffe](https://github.com/IntelLabs/SkimCaffe#skimcaffe-specific-description): SkimCaffe from Intel Labs is a Caffe branch for training of sparse CNNs, which provide 80-95% sparsity in convolutions and fully-connected layers. LIBXSMM's SPMDM domain (SParseMatrix-DenseMatrix multiplication) evolved from SkimCaffe, and since then LIBXSMM implements the sparse operations in SkimCaffe. [14] [https://github.com/baidu-research/DeepBench](https://github.com/baidu-research/DeepBench#deepbench): The primary purpose of DeepBench is to benchmark operations that are important to deep learning on different hardware platforms. LIBXSMM's DNN primitives have been [incorporated into DeepBench](https://github.com/baidu-research/DeepBench/tree/master/code/intel/convolution/libxsmm_conv) to demonstrate an increased performance of deep learning on Intel hardware. ### Automated Driving (AD) [15] [https://software.seek.intel.com/accelerating-eigen-math-library](https://software.seek.intel.com/accelerating-eigen-math-library): Accelerating The Eigen Math Library for Automated Driving Workloads: The Need for Speed in Kalman Filtering. 
An article in [Issue 31](https://software.intel.com/content/www/us/en/develop/download/parallel-universe-magazine-issue-31-january-2018.html) of The Parallel Universe magazine ([pdf](https://software.intel.com/content/dam/develop/public/us/en/documents/parallel-universe-issue-31.pdf)). ## References [1] [https://sc19.supercomputing.org/proceedings/tech_poster/tech_poster_pages/rpost244.html](https://sc19.supercomputing.org/proceedings/tech_poster/tech_poster_pages/rpost244.html): High-Performance Deep Learning via a Single Building Block ([poster](https://sc19.supercomputing.org/proceedings/tech_poster/poster_files/rpost244s2-file2.pdf) and [abstract](https://sc19.supercomputing.org/proceedings/tech_poster/poster_files/rpost244s2-file3.pdf)), SC’19: The International Conference for High Performance Computing, Networking, Storage, and Analysis, Denver (Colorado). [2] [https://dl.acm.org/doi/10.1109/SC.2018.00069](https://dl.acm.org/doi/10.1109/SC.2018.00069): Anatomy of High-Performance Deep Learning Convolutions on SIMD Architectures ([paper](https://arxiv.org/pdf/1808.05567.pdf)). SC'18: The International Conference for High Performance Computing, Networking, Storage, and Analysis, Dallas (Texas). [3] [https://pasc17.pasc-conference.org/fileadmin/user_upload/pasc17/program/post116s2.pdf](https://pasc17.pasc-conference.org/fileadmin/user_upload/pasc17/program/post116s2.pdf): DBCSR: A Sparse Matrix Multiplication Library for Electronic Structure Codes (poster), PASC’17: The PASC17 Conference, Lugano (Switzerland). 
[4] [https://sc17.supercomputing.org/SC17%20Archive/tech_poster/tech_poster_pages/post190.html](https://sc17.supercomputing.org/SC17%20Archive/tech_poster/tech_poster_pages/post190.html): Understanding the Performance of Small Convolution Operations for CNN on Intel Architecture ([poster](https://sc17.supercomputing.org/SC17%20Archive/tech_poster/poster_files/post190s2-file2.pdf) and [abstract](https://sc17.supercomputing.org/SC17%20Archive/tech_poster/poster_files/post190s2-file3.pdf)), SC’17: The International Conference for High Performance Computing, Networking, Storage, and Analysis, Denver (Colorado). [5] [https://www.computer.org/csdl/proceedings-article/sc/2016/8815a981/12OmNCeaQ1D](https://www.computer.org/csdl/proceedings-article/sc/2016/8815a981/12OmNCeaQ1D): LIBXSMM: Accelerating Small Matrix Multiplications by Runtime Code Generation. SC'16: The International Conference for High Performance Computing, Networking, Storage and Analysis, Salt Lake City (Utah). [6] [http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post137.html](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post137.html): LIBXSMM: A High Performance Library for Small Matrix Multiplications ([poster](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/poster_files/post137s2-file2.pdf) and [abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/poster_files/post137s2-file3.pdf)). SC'15: The International Conference for High Performance Computing, Networking, Storage and Analysis, Austin (Texas). ## Articles [1] [https://www.nextplatform.com/2019/10/09/cloudy-supercomputers-join-the-hpc-petascale-club/](https://www.nextplatform.com/2019/10/09/cloudy-supercomputers-join-the-hpc-petascale-club/): Cloudy Supercomputers Join the HPC Petascale Club. An article written by Rob Farber, 2019. The article covers LIBXSMM in a separate section. 
[2] [https://www.nextplatform.com/2019/06/26/counting-the-cost-of-scaling-hpc-applications/](https://www.nextplatform.com/2019/06/26/counting-the-cost-of-scaling-hpc-applications/): Counting The Cost Of Scaling HPC Applications. An article written by Timothy Prickett Morgan, 2019. This article is about CP2K Open Source Molecular Dynamics and not about LIBXSMM. However, LIBXSMM was key for application performance. [3] [https://www.nextplatform.com/2019/06/26/counting-the-cost-of-scaling-hpc-applications/](https://www.nextplatform.com/2019/06/26/counting-the-cost-of-scaling-hpc-applications/): Azure Benchmarks HC-series Across Twenty-thousand Cores for HPC. An article written by John Russell, 2019. This article is about CP2K Open Source Molecular Dynamics and not about LIBXSMM. However, LIBXSMM was key for application performance. [4] [https://software.intel.com/sites/default/files/parallel-universe-issue-34.pdf](https://software.intel.com/content/www/us/en/develop/download/parallel-universe-magazine-issue-34-october-2018.html): LIBXSMM: An Open Source-Based Inspiration for Hardware and Software Development at Intel ([pdf](https://software.intel.com/content/dam/develop/public/us/en/documents/parallel-universe-issue-34.pdf)). An article written by Hans Pabst, Greg Henry, and Alexander Heinecke, 2018. [5] [https://medium.com/@rmfarber/libxsmm-brings-deep-learning-lessons-learned-to-many-hpc-applications-9143c6c93125](https://medium.com/@rmfarber/libxsmm-brings-deep-learning-lessons-learned-to-many-hpc-applications-9143c6c93125): LIBXSMM Brings Deep-learning "Lessons Learned" to Many HPC Applications. An article written by Rob Farber, 2018. [6] [https://www.rdworldonline.com/largest-supercomputer-simulation-of-sumatra-andaman-earthquake/](https://www.rdworldonline.com/largest-supercomputer-simulation-of-sumatra-andaman-earthquake/): Largest Supercomputer Simulation of Sumatra-Andaman Earthquake. An article written by Linda Barney, 2018. 
libxsmm-1.17/SECURITY.md000066400000000000000000000013671415223013700147130ustar00rootroot00000000000000# Security Policy ## Supported Versions LIBXSMM is distributed as source code package. In case of an issue, please report at https://github.com/hfp/libxsmm/issues. There is no formal commitment to update an existing release. Please feel free to write down such an expectation as part of the issue report! Determine accurate build information in case of an upstreamed package (binary distribution): ```bash export LIBXSMM_DUMP_BUILD=1 ./application_linked_with_libxsmm ... ``` Build information is available when LIBXSMM was dynamically or statically linked with an application (not for header-only). ## Reporting a Vulnerability Please report any kind of issue at https://github.com/hfp/libxsmm/issues or leave a reference to a report made elsewhere. libxsmm-1.17/WORKSPACE000066400000000000000000000001621415223013700143730ustar00rootroot00000000000000# This file belongs to LIBXSMM's support for building with Bazel. # For an example, have a look at samples/hello. libxsmm-1.17/documentation/000077500000000000000000000000001415223013700157645ustar00rootroot00000000000000libxsmm-1.17/documentation/CONTRIBUTING.md000077700000000000000000000000001415223013700226542../CONTRIBUTING.mdustar00rootroot00000000000000libxsmm-1.17/documentation/LICENSE.md000077700000000000000000000000001415223013700212022../LICENSE.mdustar00rootroot00000000000000libxsmm-1.17/documentation/README.md000066400000000000000000000006231415223013700172440ustar00rootroot00000000000000LIBXSMM Documentation * **ReadtheDocs**: [main](https://libxsmm.readthedocs.io/) and [sample](https://libxsmm.readthedocs.io/libxsmm_samples/) documentation with full text search. * **PDF**: [main](https://github.com/hfp/libxsmm/raw/master/documentation/libxsmm.pdf) documentation file, and separate [sample](https://github.com/hfp/libxsmm/raw/master/documentation/libxsmm_samples.pdf) documentation. 
libxsmm-1.17/documentation/conf.py000066400000000000000000000025321415223013700172650ustar00rootroot00000000000000############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### import sphinx_rtd_theme import os project = 'LIBXSMM' copyright = '2009-2021, Intel Corporation.' author = 'Intel Corporation' user = os.environ.get('USER') extensions = [ #"recommonmark", "m2r2" ] master_doc = "index" source_suffix = [ ".rst", #".md" ] exclude_patterns = [ "*-" + user + "-*.md", "Thumbs.db", ".DS_Store", "_build" ] html_theme = "sphinx_rtd_theme" html_theme_options = { "navigation_depth": 2 } html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] html_static_path = ["../.theme"] templates_path = ["_templates"] pygments_style = "sphinx" language = None libxsmm-1.17/documentation/cp2k.md000066400000000000000000000003321415223013700171430ustar00rootroot00000000000000# CP2K Open Source Molecular Dynamics The [CP2K recipe](https://xconfigure.readthedocs.io/cp2k/) ([PDF](https://github.com/hfp/xconfigure/raw/master/xconfigure.pdf)) has been incorporated into the XCONFIGURE project. libxsmm-1.17/documentation/gxm.md000077700000000000000000000000001415223013700254622../samples/deeplearning/gxm/README.mdustar00rootroot00000000000000libxsmm-1.17/documentation/index.md000066400000000000000000001220711415223013700174200ustar00rootroot00000000000000# LIBXSMM LIBXSMM is a library for specialized dense and sparse matrix operations as well as for deep learning primitives such as small convolutions. 
The library is targeting Intel Architecture with Intel SSE, Intel AVX, Intel AVX2, Intel AVX‑512 (with VNNI and Bfloat16), and Intel AMX (Advanced Matrix Extensions) supported by future Intel processor code-named Sapphire Rapids. Code generation is mainly based on Just‑In‑Time (JIT) code specialization for compiler-independent performance (matrix multiplications, matrix transpose/copy, sparse functionality, and deep learning). LIBXSMM is suitable for "build once and deploy everywhere", i.e., no special target flags are needed to exploit the available performance. Supported GEMM datatypes are: `FP64`, `FP32`, `bfloat16`, `int16`, and `int8`. For a list questions and answers, please also have a look at [https://github.com/hfp/libxsmm/wiki/Q&A](https://github.com/hfp/libxsmm/wiki/Q&A). **Where to go for documentation?** * **ReadtheDocs**: [main](https://libxsmm.readthedocs.io/) and [sample](https://libxsmm.readthedocs.io/libxsmm_samples/) documentation with full text search. * **PDF**: [main](https://github.com/hfp/libxsmm/raw/master/documentation/libxsmm.pdf) documentation file, and separate [sample](https://github.com/hfp/libxsmm/raw/master/documentation/libxsmm_samples.pdf) documentation. * **Articles**: [magazine article](https://software.intel.com/sites/default/files/parallel-universe-issue-34.pdf) incl. [sample code](https://github.com/hfp/libxsmm/tree/master/samples/magazine) (full list of [Articles](#articles)). **Getting Started**: The following C++ code is focused on a specific functionality but may be considered as [Hello LIBXSMM](https://github.com/hfp/libxsmm/tree/master/samples/hello). Build the example with `cd /path/to/libxsmm; make STATIC=0` (shared library), save the code under `hello.cpp` (below) and compile with `g++ -I/path/to/libxsmm/include hello.cpp -L/path/to/libxsmm/lib -lxsmm -lblas -o hello` (GNU CCC), and finally execute with `LD_LIBRARY_PATH=/path/to/libxsmm/lib LIBXSMM_VERBOSE=2 ./hello`. 
```cpp #include <libxsmm.h> #include <vector> int main(/*int argc, char* argv[]*/) { typedef double T; int batchsize = 1000, m = 13, n = 5, k = 7; std::vector<T> a(batchsize * m * k), b(batchsize * k * n), c(m * n, 0); /* C/C++ and Fortran interfaces are available */ typedef libxsmm_mmfunction<T> kernel_type; /* generates and dispatches a matrix multiplication kernel (C++ functor) */ kernel_type kernel(LIBXSMM_GEMM_FLAG_NONE, m, n, k, 1.0 /*alpha*/, 1.0 /*beta*/); assert(kernel); for (int i = 0; i < batchsize; ++i) { /* initialize input */ for (int ki = 0; ki < k; ++ki) { for (int j = 0; j < m; ++j) a[i * j * ki] = static_cast<T>(1) / ((i + j + ki) % 25); for (int j = 0; j < n; ++j) b[i * j * ki] = static_cast<T>(7) / ((i + j + ki) % 75); } } /* kernel multiplies and accumulates matrices: C += Ai * Bi */ for (int i = 0; i < batchsize; ++i) kernel(&a[i * m * k], &b[i * k * n], &c[0]); } ``` Plain [C code](https://github.com/hfp/libxsmm/blob/master/samples/hello/hello.c) as well as [Fortran code](https://github.com/hfp/libxsmm/blob/master/samples/hello/hello.f) resemble the same [example](https://github.com/hfp/libxsmm/tree/master/samples/hello). **What is a small matrix multiplication?** When characterizing the problem-size by using the M, N, and K parameters, a problem-size suitable for LIBXSMM falls approximately within (M N K)<sup>1/3</sup> <= 64 (which illustrates that non-square matrices or even "tall and skinny" shapes are covered as well). The library is typically used to generate code up to the specified [threshold](libxsmm_tune.md#auto-dispatch). Raising the threshold may not only generate excessive amounts of code (due to unrolling in M or K dimension), but also miss to implement a tiling scheme to effectively utilize the cache hierarchy. For auto-dispatched problem-sizes above the configurable threshold (explicitly JIT'ted code is **not** subject to the threshold), LIBXSMM is falling back to BLAS. 
In terms of GEMM, the supported kernels are limited to *Alpha := 1*, *Beta := \{ 1, 0 \}*, and *TransA := 'N'*. **What is a small convolution?** In the last years, new workloads such as deep learning and more specifically convolutional neural networks (CNN) emerged and are pushing the limits of today's hardware. One of the expensive kernels is a small convolution with certain kernel sizes such that calculations in the frequency space is not the most efficient method when compared with direct convolutions. LIBXSMM's current support for convolutions aims for an easy to use invocation of small (direct) convolutions, which are intended for CNN training and classification. ## Interfaces and Domains ### Overview Please have a look at [https://github.com/hfp/libxsmm/tree/master/include](https://github.com/hfp/libxsmm/tree/master/include) for all published functions. Get started with the following list of available domains and documented functionality: * MM: [Matrix Multiplication](#matrix-multiplication) * DNN: [Deep Neural Networks](#deep-neural-networks) * AUX: [Service Functions](#service-functions) * PERF: [Performance](#performance) * BE: [Backend](#jit-backend) To initialize library internal resources, an explicit initialization routine helps to avoid lazy initialization overhead when calling LIBXSMM for the first time. The library deallocates internal resources at program exit, but also provides a companion of the afore mentioned initialization (finalize). ```C /** Initialize the library; pay for setup cost at a specific point. */ void libxsmm_init(void); /** De-initialize the library and free internal memory (optional). */ void libxsmm_finalize(void); ``` ### Matrix Multiplication This domain (MM) supports Small Matrix Multiplications (SMM), batches of multiple multiplications as well as the industry-standard interface for GEneral Matrix Matrix multiplication (GEMM). 
The [Matrix Multiplication domain (MM)](libxsmm_mm.md) contains routines for: * [Small, tiled, and parallelized matrix multiplications](libxsmm_mm.md#overview) * [Manual code dispatch (customized matrix batches)](libxsmm_mm.md#manual-code-dispatch) * [Batched multiplication (explicit interface)](libxsmm_mm.md#batched-multiplication) * [Call wrapper (static and dynamic linkage)](libxsmm_mm.md#call-wrapper) ### Deep Learning This domain (DL) is detailed by a separate [document](libxsmm_dl.md). It may be inspiring to have a look at the lightweight GxM framework, which uses LIBXSMM for end-to-end Deep Learning. ### Service Functions For convenient operation of the library and to ease integration, some service routines are available. These routines may not belong to the core functionality of LIBXSMM (SMM or DNN domain), but users are encouraged to use this domain (AUX). There are two categories: (1) routines which are available for C and FORTRAN, and (2) routines that are only available per C interface. The [service function domain (AUX)](libxsmm_aux.md) contains routines for: * [Getting and setting the target architecture](libxsmm_aux.md#getting-and-setting-the-target-architecture) * [Getting and setting the verbosity](libxsmm_aux.md#getting-and-setting-the-verbosity) * [Measuring time durations (timer)](libxsmm_aux.md#timer-facility) * [Dispatching user-data and multiple kernels](libxsmm_aux.md#user-data-dispatch) * [Loading and storing data (I/O)](libxsmm_aux.md#meta-image-file-io) * [Allocating memory](libxsmm_aux.md#memory-allocation) ### Backend More information about the JIT-backend and the code generator can be found in a separate [document](libxsmm_be.md). The [encoder sample collection](https://github.com/hfp/libxsmm/tree/master/samples/encoder) can help to get started writing a kernel using LIBXSMM. Please note, LIBXSMM's stand-alone [generator-driver](libxsmm_be.md#generator-driver) is considered legacy (deprecated). 
## Build Instructions ### Overview The main interface file is *generated*, and it is therefore **not** stored in the code repository. Instead, one may have a look at the code generation template files for [C/C++](https://github.com/hfp/libxsmm/blob/master/src/template/libxsmm.h#L36) and [FORTRAN](https://github.com/hfp/libxsmm/blob/master/src/template/libxsmm.f#L32). There are two ways prepared to build and use LIBXSMM: * [Classic Library (ABI)](#classic-library-abi) and [Link Instructions](#link-instructions) (C/C++ and FORTRAN) * [Header-Only](#header-only) (C and C++) **Note**: LIBXSMM is available as prebuilt package for Fedora/RedHat/CentOS, Ubuntu, and FreeBSD. Further, LIBXSMM can be installed with the [Spack Package Manager](http://computation.llnl.gov/projects/spack-hpc-package-manager) or per [EasyBuild+EasyConfig](https://github.com/easybuilders). ### Classic Library (ABI) The build system relies on GNU Make (typically associated with the `make` command, but e.g. FreeBSD is calling it `gmake`). The build can be customized by using key‑value pairs. Key‑value pairs can be supplied in two ways: (1) after the "make" command, or (2) prior to the "make" command (`env`) which is effectively the same as exporting the key‑value pair as an environment variable (`export`, or `setenv`). Both methods can be mixed (the second method may require make's `-e` flag). In contrast to [header-only](#zero-config) which does not require configuration by default, 3rd-party build systems can compile and link LIBXSMM's sources but still avoid configuring the library (per `libxsmm_config.py`). The prerequisite to omit configuration is to opt-in by defining LIBXSMM_DEFAULT_CONFIG (`-D`). The zero-config feature is not available for LIBXSMM's Fortran interface. **Note**: By default, C/C++ and FORTRAN compilers are needed (some sample code is written in C++). 
Besides specifying the compilers (`make CXX=g++ CC=gcc FC=gfortran` and maybe `AR=ar`), the need for a FORTRAN compiler can be relaxed (`make FC=` or `make FORTRAN=0`). The latter affects the availability of the MODule file and the corresponding `libxsmmf` library (the interface `libxsmm.f` is still generated). The build system considers a set of given key-value pairs as a single unique build and triggers a rebuild for a distinct set of flags. For more advanced builds or additional background, please consult the section about [Customization](libxsmm_tune.md). To generate the interface of the library inside of the `include` directory and to build the static library (by default, STATIC=1 is activated), run any (or both) of the following command(s): ```bash make STATIC=0 make ``` On CRAY systems, the CRAY Compiling Environment (CCE) should be used regardless of utilizing the CRAY compiler, the Intel Compiler, or the GNU Compiler Collection (GCC). The CCE eventually suppresses building shared libraries (STATIC=0). In any case, (1) switch to the desired compiler (module load/switch), and (2) rely on: ```bash make CXX=CC CC=cc FC=ftn ``` A variety of build environments is out-of-the-box compatible, see [https://github.com/hfp/libxsmm/wiki/Compatibility](https://github.com/hfp/libxsmm/wiki/Compatibility). If the build process is not successful, it may help to avoid advanced GCC flags. This is useful with a tool chain which pretends to be GCC-compatible (and is treated as such) but fails to consume the aforementioned flags: ```bash make COMPATIBLE=1 ``` In case of outdated Binutils, compilation can fail to assemble code when building the library (this has nothing to do with JIT-generated code and it does not affect how JIT-code is targeting the system). LIBXSMM implements some functionality using compiler-intrinsics and multiple code-paths which are scheduled according to CPUID. 
In contrast to `INTRINSICS=2` (default), `INTRINSICS=1` enables a fully static code path according to the desired target (e.g., `AVX=3`, or `AVX=2`). If no target is given, instruction set extensions cannot be leveraged for such code-paths. Try to fix failing compilation by building the latest GNU Binutils (and `export PATH=/path/to/binutils/bin:${PATH}`). Binutils are versioned independently of GNU GCC and other compilers. If one cannot update Binutils, work around with a CPUID-value as tabulated in [libxsmm_cpuid.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_cpuid.h): start at the upper end (less than 1999) and decrement until compilation passes (make INTRINSICS=_CPUID_, e.g., `make INTRINSICS=1021`). As a last resort, rely on a fully static code path: ```bash make INTRINSICS=1 ``` To test and validate a build, please consult [https://github.com/hfp/libxsmm/wiki/Validation](https://github.com/hfp/libxsmm/wiki/Validation). To run some basic sanity checks, remember that each set of given key-value pairs represents a different build (and test): ```bash make STATIC=0 tests ``` To remove intermediate files, or to remove all generated files and folders (including the interface and the library archives), run one of the make-targets below. An additional distclean-target recursively cleans the entire tree (after version 1.9). ```bash make clean make realclean ``` FORTRAN code can make use of LIBXSMM: * By using the module and linking with `libxsmmf`, `libxsmm`, and (optionally) `libxsmmext`, * By including `libxsmm.f` and linking with `libxsmm`, and (optionally) `libxsmmext`, or * By (implicitly) calling a SUBROUTINE and linking with `libxsmm`, and (optionally) `libxsmmext`. **Note**: Using the Fortran module or including the interface, requires at least a Fortran 2003 compiler (F2K3). 
FORTRAN 77 compatibility is only implicitly available (no interface), and the available subset of routines is documented in `libxsmm.f` and marked with [comments](https://github.com/hfp/libxsmm/search?q=implementation+provided+for+Fortran+77+compatibility) (part of the implementation). ### Header-Only Version 1.4.4 introduced support for "header-only" usage in C and C++. Simply including `libxsmm_source.h` allows one to get around building the library. However, this gives up on a clearly defined application binary interface (ABI). An ABI may allow for hot-fixes after deploying an application (when relying on the shared library form), and it may also ensure to only rely on the public interface of LIBXSMM. In contrast, the header-only form not only exposes the internal implementation of LIBXSMM but can also increase the turnaround time during development of an application (due to longer compilation times). The header file is intentionally named "libxsmm_**source**.h" since this header file relies on the [src](https://github.com/hfp/libxsmm/tree/master/src) directory (with the implications as noted earlier). The header-only form depends on `libxsmm_source.h` which is *generated* according to the content of the source folder (`src`). LIBXSMM 1.16 (and later) provides header-only support without invoking a make-target (zero configuration) for any given checkout of LIBXSMM. To use configured header-only (non-default), LIBXSMM_CONFIGURED must be defined (`-D`). Previously, it was necessary to invoke `make header-only` (v1.6.2 or later), `make cheader` (prior to v1.6.2), or any target building the library (`make`). The zero-config feature allows 3rd-party build systems an easier integration of LIBXSMM, which also holds true if the system builds LIBXSMM from source (see [classic ABI](#zero-config-abi)). Fortran code may [include](#header-only-fortran) `libxsmm.f` but still requires that interface to be generated. 
**Note**: building an application applies the same build settings to LIBXSMM! For instance, to omit debug code inside of LIBXSMM `NDEBUG` must be defined (`-DNDEBUG`). ## Link Instructions Using the [classic ABI](#classic-library-abi) (including [Fortran](#fortran) code) requires linking LIBXSMM against the application. The library is agnostic with respect to the threading-runtime, and therefore an application is free to use any threading runtime (e.g., OpenMP). The library is also thread-safe, and multiple application threads can call LIBXSMM's routines concurrently. Enabling OpenMP for LIBXSMM's main library is supported as well (OMP=1), and mostly affects the synchronization primitives used inside of the library. All of the "omp" functionality (function postfix) is served by the `libxsmmext` library, which is automatically built with OpenMP enabled. When using this "omp" functionality, `libxsmmext` needs to be present at the link line. Library | Purpose :-------------|--------- libxsmm | Thread-safe core functions (same routine can be called concurrently). Contains routines that can take a thread-ID and the number of library-external threads. libxsmmf | Necessary when using the Fortran MODule but not when including `libxsmm.f` or relying on implicit interfaces ([Fortran 77](https://github.com/hfp/libxsmm/search?q=implementation+provided+for+Fortran+77+compatibility)). libxsmmext | Provides library-internal OpenMP-threaded functions carrying the `omp` postfix when compared to function names of the core library. libxsmmnoblas | Supplies faked symbols for `dgemm` (and others) and thereby removes the need to link against a LAPACK/BLAS library. To ease linking with LIBXSMM, `pkg-config` can be used. 
For example: ```bash export PKG_CONFIG_PATH=/path/to/libxsmm/lib pkg-config libxsmm --libs ``` Similarly, an application is free to choose any BLAS or LAPACK library (if the link model available on the OS supports this), and therefore linking GEMM routines when linking LIBXSMM itself (by supplying BLAS=1|2) may prevent a user from making this decision at the time of linking the actual application. To use LIBXSMM without GEMM-related functionality, any BLAS-dependency can be removed in two ways: (1) building a special library with `make BLAS=0`, or (2) linking the application against the `libxsmmnoblas` library. If an application however uses BLAS already, the [Call Wrapper](libxsmm_mm.md#call-wrapper) can be used to intercept existing BLAS calls (and to rely on LIBXSMM instead). **Note**: LIBXSMM does not support to dynamically link `libxsmm` or `libxsmmext` ("so"), when BLAS is linked statically ("a"). If BLAS is linked statically, the static version of LIBXSMM must be used! ### Installation There are two main mechanisms to install LIBXSMM (both mechanisms can be combined): (1) building the library in an out‑of‑tree fashion, and (2) installing into a certain location. Building in an out‑of‑tree fashion looks like: ```bash cd libxsmm-install make -f /path/to/libxsmm/Makefile ``` Installation into a specific location looks like (`PREFIX` or `DESTDIR`): ```bash make MNK="1 2 3 4 5" PREFIX=/path/to/libxsmm-install install ``` Both `PREFIX` and `DESTDIR` are equivalent and can be relative or absolute paths. An installation can be repeated for different locations without triggering a rebuild. The prefix directory *inside* of each of the [package configuration files](#pkg-config) is set to where LIBXSMM is built (staging folder) unless `PREFIX` or `DESTDIR` is specified. The effect of `PREFIX` (or `DESTDIR`) with respect to the pkg-config files is independent of whether the install-target is invoked or not (make). 
Further, performing `make install-minimal` omits the documentation (default: `PREFIX/share/libxsmm`). Moreover, PINCDIR, POUTDIR, PBINDIR, and PDOCDIR allow to customize the locations underneath of the PREFIX location. To build a general package for an unpredictable audience (Linux distribution, or similar), it is advised to not over-specify or customize the build step, i.e., JIT, SSE, AVX, OMP, BLAS, etc. should not be used. The following is building and installing a complete set of libraries where the generated interface matches both the static and the shared libraries: ```bash make PREFIX=/path/to/libxsmm-install STATIC=0 install make PREFIX=/path/to/libxsmm-install install ``` ## Runtime Control ### Handling Errors The library handles errors with mechanisms available to the C programming language (no exceptions). The backend uses result codes passed by an argument rather than an actual return value. Such an argument is often a descriptor (struct) guiding and covering the state of the code generation. The frontend however may not hand-out any error state, which can be a big relief on the call-side. Instead, the frontend implements a [verbose mode](#verbose-mode) to inform about unexpected input or an error captured from the backend. Guiding principles of LIBXSMM are muted operation by default (non-verbose) and no unexpected exit from execution. ### Verbose Mode The [verbose mode](libxsmm_aux.md#getting-and-setting-the-verbosity) (level of verbosity) allows for an insight into the code dispatch mechanism by receiving a small tabulated statistic as soon as the library terminates. The design point for this functionality is to not impact the performance of any critical code path, i.e., verbose mode is always enabled and does not require symbols (SYM=1) or debug code (DBG=1). The statistics appears (`stderr`) when the environment variable LIBXSMM_VERBOSE is set to a non-zero value. For example: ```bash LIBXSMM_VERBOSE=1 ./myapplication [... 
application output] HSW/SP TRY JIT STA COL 0..13 0 0 0 0 14..23 0 0 0 0 24..128 3 3 0 0 ``` The tables are distinct between single-precision and double-precision, but either table is pruned if all counters are zero. If both tables are pruned, the library shows the code path which would have been used for JIT'ting the code: `LIBXSMM_TARGET=hsw` (otherwise the code path is shown in the table's header). The actual counters are collected for three buckets: small kernels (MNK<sup>1/3</sup> <= 13), medium-sized kernels (13 < MNK<sup>1/3</sup> <= 23), and larger kernels (23 < MNK<sup>1/3</sup> <= 64; the actual upper bound depends on LIBXSMM_MAX_MNK as selected at compile-time). Keep in mind that "larger" is supposedly still small in terms of arithmetic intensity (which grows linearly with the kernel size). Unfortunately, the arithmetic intensity depends on the way a kernel is used (which operands are loaded/stored into main memory) and it is not performance-neutral to collect this information. The TRY counter represents all attempts to register statically generated kernels, and all attempts to dynamically generate and register kernels. The TRY counter includes rejected JIT requests due to unsupported GEMM arguments. The JIT and STA counters distinguish the successful cases of the aforementioned event (TRY) into dynamically (JIT) and statically (STA) generated code. In case the capacity (O(*n*) = 10<sup>5</sup>) of the code registry is exhausted, no more kernels can be registered although further attempts are not prevented. Registering many kernels (O(*n*) = 10<sup>3</sup>) may increase the number of hash key collisions (COL), which can degrade performance. The latter is prevented if the small thread-local cache is utilized effectively. Since explicitly JIT-generated code (`libxsmm_?mmdispatch`) does not fall under the THRESHOLD criterion, the above table is extended by one line if large kernels have been requested. 
This indicates a missing threshold-criterion (customized dispatch), or asks for cache-blocking the matrix multiplication. The latter is already implemented by LIBXSMM's "medium-sized" GEMM routines (`libxsmm_?gemm_omp`), which perform a tiled multiplication. Setting a verbosity level of at least two summarizes the number of registered JIT-generated kernels, which includes the total size and counters for GEMM, MCOPY (matrix copy), and TCOPY (matrix transpose) kernels. ```bash Registry: 20 MB (gemm=0 mcopy=14 tcopy=0) ``` If the call-wrapper is used, an additional runtime statistic becomes available (see [Call Wrapper](libxsmm_mm.md#call-wrapper)). **Note**: Setting LIBXSMM_VERBOSE to a negative value will binary-dump each generated JIT kernel to a file with each file being named like the function name shown in [Intel VTune](libxsmm_prof.md#intelvtuneamplifier). Disassembly of the raw binary files can be accomplished by: ```bash objdump -D -b binary -m i386 -M x86-64 [JIT-dump-file] ``` ### Call Trace During the initial steps of employing the LIBXSMM API, one may rely on a debug version of the library (`make DBG=1`). The latter also implies console output (`stderr`) in case of an error/warning condition inside of the library. It is also possible to print the execution flow (call trace) inside of LIBXSMM (can be combined with DBG=1 or OPT=0): ```bash make TRACE=1 ``` Building an application which traces calls (inside of the library) requires the shared library of LIBXSMM, alternatively the application is required to link the static library of LIBXSMM in a dynamic fashion (GNU tool chain: `-rdynamic`). Tracing calls (without debugger) can be then accomplished by an environment variable called LIBXSMM_TRACE. 
```bash LIBXSMM_TRACE=1 ./myapplication ``` Syntactically up to three arguments separated by commas (which allows arguments to be omitted) are taken (*tid*,*i*,*n*): *tid* signifies the ID of the thread to be traced with 1...NTHREADS being valid and where LIBXSMM_TRACE=1 is filtering for the "main thread" (in fact the first thread running into the trace facility); grabbing all threads (no filter) can be achieved by supplying a negative id (which is also the default when omitted). The second argument is pruning higher levels of the call-tree with *i=1* being the default (level zero is the highest at the same level as the main function). The last argument is taking the number of inclusive call levels with *n=-1* being the default (signifying no filter). Although the `ltrace` (Linux utility) provides similar insight, the trace facility might be useful due to the aforementioned filtering expressions. Please note that the trace facility is severely impacting the performance (even with LIBXSMM_TRACE=0), and this is not just because of console output but rather since inlining (internal) functions might be prevented along with additional call overhead on each function entry and exit. Therefore, debug symbols can also be enabled separately (`make SYM=1`; implied by TRACE=1 or DBG=1) which might be useful when profiling an application. ## Performance Profiling an application which uses LIBXSMM's JIT-code is well-supported. The library supports Intel VTune Amplifier and Linux perf. Details are given on how to include profiler support, and how to run the application. * [Profiling using Intel VTune Amplifier](libxsmm_prof.md#intelvtuneamplifier) * [Profiling using Linux perf](libxsmm_prof.md#linuxperf) At build time, a variety of options exist to customize LIBXSMM. The library is set up for a broad range of use cases, which include sophisticated defaults for typical use. 
* [Customizing performance](libxsmm_tune.md#tuning) * [Tuning auto-dispatch](libxsmm_tune.md#auto-dispatch) To find performance results of applications or performance reproducers, the repository provides an orphaned branch called "results" which collects collateral material such as measured performance results along with explanatory figures. The results can be found at [https://github.com/hfp/libxsmm/tree/results#libxsmm-results](https://github.com/hfp/libxsmm/tree/results#libxsmm-results), or the results can be cloned as shown below. ```bash git clone --branch results \ https://github.com/hfp/libxsmm.git \ libxsmm-results ``` Please note that comparing performance results depends on whether the operands of the matrix multiplication are streamed or not. For example, multiplying with all matrices covered by the L1 cache may have an emphasis towards an implementation which perhaps performs worse for the real workload (if this real workload needs to stream some or all matrices from the main memory). Most of the [code samples](https://github.com/hfp/libxsmm/tree/master/samples) are aimed to reproduce performance results, and it is encouraged to model the exact case or to look at real [applications](#applications). ## Applications ### High Performance Computing (HPC) [1] [https://cp2k.org/](https://cp2k.org/): Open Source Molecular Dynamics and the [DBCSR library](https://github.com/cp2k/dbcsr), which processes batches of small matrix multiplications. The batches originate from a distributed block-sparse matrix with problem-specific small matrices. Starting with [CP2K 3.0](https://www.cp2k.org/version_history), LIBXSMM can substitute CP2K's `libsmm` library. [2] [https://github.com/SeisSol/SeisSol/](https://github.com/SeisSol/SeisSol/): SeisSol is one of the leading codes for earthquake scenarios, for simulating dynamic rupture processes. 
LIBXSMM provides highly optimized assembly kernels which form the computational back-bone of SeisSol (see [https://github.com/TUM-I5/seissol_kernels/](https://github.com/TUM-I5/seissol_kernels/)). [3] [https://github.com/NekBox/NekBox](https://github.com/NekBox/NekBox): NekBox is a highly scalable and portable spectral element code, which is inspired by the [Nek5000](https://nek5000.mcs.anl.gov/) code. NekBox is specialized for box geometries and intended to prototype new methods as well as to leverage FORTRAN beyond the FORTRAN 77 standard. LIBXSMM can be used to substitute the [MXM_STD](https://github.com/Nek5000/NekBox/blob/box/mxm_std.F90) code. Please also note LIBXSMM's [NekBox reproducer](https://github.com/hfp/libxsmm/tree/master/samples/nek#nek-sample-collection). [4] [https://github.com/Nek5000/Nek5000](https://github.com/Nek5000/Nek5000): Nek5000 is the open-source, highly-scalable, always-portable spectral element code from [https://nek5000.mcs.anl.gov/](https://nek5000.mcs.anl.gov/). The development branch of the Nek5000 code [incorporates](https://github.com/Nek5000/Nek5000/blob/master/core/mxm_wrapper.f) LIBXSMM. [5] [http://pyfr.org/](http://pyfr.org/): PyFR is an open-source Python based framework for solving advection-diffusion type problems on streaming architectures by using the flux reconstruction approach. PyFR 1.6.0 optionally [incorporates LIBXSMM](http://pyfr.org/user_guide.php) as a matrix multiplication provider for the OpenMP backend. Please also note LIBXSMM's [PyFR-related code sample](https://github.com/hfp/libxsmm/tree/master/samples/pyfr). [6] [http://dial3343.org/about/](http://dial3343.org/about/): The Extreme-scale Discontinuous Galerkin Environment (EDGE) is a solver for hyperbolic partial differential equations with emphasis on seismic simulations. The EDGE [source code](https://github.com/3343/edge) optionally relies on LIBXSMM, but for high performance LIBXSMM's kernels are highly recommended. 
[7] [https://sxs-collaboration.github.io/spectre/](https://sxs-collaboration.github.io/spectre/): SpECTRE is an open-source code for multi-scale, multi-physics problems in astrophysics and gravitational physics which runs at Petascale and is designed for Exascale computers. In the future, SpECTRE may be applied to problems across discipline boundaries in fluid dynamics, geoscience, plasma physics, nuclear physics, and engineering. [8] [https://ceed.exascaleproject.org/ceed-code/](https://ceed.exascaleproject.org/ceed-code/): The Center for Efficient Exascale Discretizations (CEED) is building on the efforts of the Nek5000, MFEM, MAGMA, OCCA and PETSc projects to develop application program interfaces (APIs), both at high-level and at low-level to enable applications to take advantage of high-order methods. The CEED low-level API, [libCEED](https://ceed.exascaleproject.org/libceed/) uses LIBXSMM as a [backend](https://github.com/CEED/libCEED#backends) for high performance on CPUs. [9] [https://github.com/romeric/Fastor](https://github.com/romeric/Fastor): Fastor is a lightweight high performance tensor algebra framework for modern C++ and can optionally use LIBXSMM as [JIT-backend](https://github.com/romeric/Fastor/wiki/9.-Using-the-LIBXSMM-MKL-JIT-backend). ### Machine Learning (ML) [10] [https://github.com/plaidml/plaidml](https://github.com/plaidml/plaidml): PlaidML is an open source tensor compiler aiming for performance portability across a wide range of CPUs, GPUs and other accelerators. Combined with Intel’s nGraph compiler, PlaidML is targeting popular deep learning frameworks such as PyTorch, Keras (TensorFlow), and OpenVino. [PlaidML/v1](https://github.com/plaidml/plaidml/tree/plaidml-v1) (development branch) adopted [MLIR](https://mlir.llvm.org/), an extensible compiler infrastructure gaining industry-wide adoption. PlaidML/v1 started using LIBXSMM as backend for targeting CPUs. 
[11] [https://github.com/intel/intel-extension-for-pytorch](https://github.com/intel/intel-extension-for-pytorch): Intel Extension for PyTorch aims for a smooth user experience of PyTorch on CPUs by the means of good performance. The extension pack started to rely on [LIBXSMM for achieving high performance on CPUs](https://arxiv.org/abs/2005.04680). [12] [https://www.tensorflow.org/](https://tensorflow.org/): TensorFlow™ is an open source software library for numerical computation using data flow graphs. TensorFlow was originally developed by researchers and engineers working on the Google Brain Team for the purposes of conducting machine learning and deep neural networks research. LIBXSMM was once [used](tensorflow.md) to increase the performance of TensorFlow on Intel hardware. [13] [https://github.com/IntelLabs/SkimCaffe](https://github.com/IntelLabs/SkimCaffe#skimcaffe-specific-description): SkimCaffe from Intel Labs is a Caffe branch for training of sparse CNNs, which provide 80-95% sparsity in convolutions and fully-connected layers. LIBXSMM's SPMDM domain (SParseMatrix-DenseMatrix multiplication) evolved from SkimCaffe, and since then LIBXSMM implements the sparse operations in SkimCaffe. [14] [https://github.com/baidu-research/DeepBench](https://github.com/baidu-research/DeepBench#deepbench): The primary purpose of DeepBench is to benchmark operations that are important to deep learning on different hardware platforms. LIBXSMM's DNN primitives have been [incorporated into DeepBench](https://github.com/baidu-research/DeepBench/tree/master/code/intel/convolution/libxsmm_conv) to demonstrate an increased performance of deep learning on Intel hardware. ### Automated Driving (AD) [15] [https://software.seek.intel.com/accelerating-eigen-math-library](https://software.seek.intel.com/accelerating-eigen-math-library): Accelerating The Eigen Math Library for Automated Driving Workloads: The Need for Speed in Kalman Filtering. 
An article in [Issue 31](https://software.intel.com/content/www/us/en/develop/download/parallel-universe-magazine-issue-31-january-2018.html) of The Parallel Universe magazine ([pdf](https://software.intel.com/content/dam/develop/public/us/en/documents/parallel-universe-issue-31.pdf)). ## References [1] [https://sc19.supercomputing.org/proceedings/tech_poster/tech_poster_pages/rpost244.html](https://sc19.supercomputing.org/proceedings/tech_poster/tech_poster_pages/rpost244.html): High-Performance Deep Learning via a Single Building Block ([poster](https://sc19.supercomputing.org/proceedings/tech_poster/poster_files/rpost244s2-file2.pdf) and [abstract](https://sc19.supercomputing.org/proceedings/tech_poster/poster_files/rpost244s2-file3.pdf)), SC’19: The International Conference for High Performance Computing, Networking, Storage, and Analysis, Denver (Colorado). [2] [https://dl.acm.org/doi/10.1109/SC.2018.00069](https://dl.acm.org/doi/10.1109/SC.2018.00069): Anatomy of High-Performance Deep Learning Convolutions on SIMD Architectures ([paper](https://arxiv.org/pdf/1808.05567.pdf)). SC'18: The International Conference for High Performance Computing, Networking, Storage, and Analysis, Dallas (Texas). [3] [https://pasc17.pasc-conference.org/fileadmin/user_upload/pasc17/program/post116s2.pdf](https://pasc17.pasc-conference.org/fileadmin/user_upload/pasc17/program/post116s2.pdf): DBCSR: A Sparse Matrix Multiplication Library for Electronic Structure Codes (poster), PASC’17: The PASC17 Conference, Lugano (Switzerland). 
[4] [https://sc17.supercomputing.org/SC17%20Archive/tech_poster/tech_poster_pages/post190.html](https://sc17.supercomputing.org/SC17%20Archive/tech_poster/tech_poster_pages/post190.html): Understanding the Performance of Small Convolution Operations for CNN on Intel Architecture ([poster](https://sc17.supercomputing.org/SC17%20Archive/tech_poster/poster_files/post190s2-file2.pdf) and [abstract](https://sc17.supercomputing.org/SC17%20Archive/tech_poster/poster_files/post190s2-file3.pdf)), SC’17: The International Conference for High Performance Computing, Networking, Storage, and Analysis, Denver (Colorado). [5] [https://www.computer.org/csdl/proceedings-article/sc/2016/8815a981/12OmNCeaQ1D](https://www.computer.org/csdl/proceedings-article/sc/2016/8815a981/12OmNCeaQ1D): LIBXSMM: Accelerating Small Matrix Multiplications by Runtime Code Generation. SC'16: The International Conference for High Performance Computing, Networking, Storage and Analysis, Salt Lake City (Utah). [6] [http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post137.html](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post137.html): LIBXSMM: A High Performance Library for Small Matrix Multiplications ([poster](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/poster_files/post137s2-file2.pdf) and [abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/poster_files/post137s2-file3.pdf)). SC'15: The International Conference for High Performance Computing, Networking, Storage and Analysis, Austin (Texas). ## Articles [1] [https://www.nextplatform.com/2019/10/09/cloudy-supercomputers-join-the-hpc-petascale-club/](https://www.nextplatform.com/2019/10/09/cloudy-supercomputers-join-the-hpc-petascale-club/): Cloudy Supercomputers Join the HPC Petascale Club. An article written by Rob Farber, 2019. The article covers LIBXSMM in a separate section. 
[2] [https://www.nextplatform.com/2019/06/26/counting-the-cost-of-scaling-hpc-applications/](https://www.nextplatform.com/2019/06/26/counting-the-cost-of-scaling-hpc-applications/): Counting The Cost Of Scaling HPC Applications. An article written by Timothy Prickett Morgan, 2019. This article is about CP2K Open Source Molecular Dynamics and not about LIBXSMM. However, LIBXSMM was key for application performance. [3] [https://www.hpcwire.com/2019/06/13/azure-benchmarks-hc-series-across-20000-cores-for-hpc/](https://www.hpcwire.com/2019/06/13/azure-benchmarks-hc-series-across-20000-cores-for-hpc/): Azure Benchmarks HC-series Across Twenty-thousand Cores for HPC. An article written by John Russell, 2019. This article is about CP2K Open Source Molecular Dynamics and not about LIBXSMM. However, LIBXSMM was key for application performance. [4] [https://software.intel.com/content/www/us/en/develop/download/parallel-universe-magazine-issue-34-october-2018.html](https://software.intel.com/content/www/us/en/develop/download/parallel-universe-magazine-issue-34-october-2018.html): LIBXSMM: An Open Source-Based Inspiration for Hardware and Software Development at Intel ([pdf](https://software.intel.com/content/dam/develop/public/us/en/documents/parallel-universe-issue-34.pdf)). An article written by Hans Pabst, Greg Henry, and Alexander Heinecke, 2018. [5] [https://medium.com/@rmfarber/libxsmm-brings-deep-learning-lessons-learned-to-many-hpc-applications-9143c6c93125](https://medium.com/@rmfarber/libxsmm-brings-deep-learning-lessons-learned-to-many-hpc-applications-9143c6c93125): LIBXSMM Brings Deep-learning "Lessons Learned" to Many HPC Applications. An article written by Rob Farber, 2018. [6] [https://www.rdworldonline.com/largest-supercomputer-simulation-of-sumatra-andaman-earthquake/](https://www.rdworldonline.com/largest-supercomputer-simulation-of-sumatra-andaman-earthquake/): Largest Supercomputer Simulation of Sumatra-Andaman Earthquake. An article written by Linda Barney, 2018. 
libxsmm-1.17/documentation/index.rst000066400000000000000000000023621415223013700176300ustar00rootroot00000000000000.. mdinclude:: index.md .. toctree:: :caption: LIBXSMM Domains :hidden: MM: Matrix Multiplication DNN: Deep Neural Networks AUX: Service Functions PROF: Performance Profiling TUNE: Customization BE: Backend .. toctree:: :caption: Example Code :hidden: Collection .. toctree:: :caption: Machine Learning (ML) PlaidML tensor compiler Intel Extension for PyTorch GxM Deeplearning TensorFlow .. toctree:: :caption: Scientific (HPC) CP2K SeisSol SpECTRE libCEED Fastor EDGE PyFR .. toctree:: :caption: About Contributing License libxsmm-1.17/documentation/libxsmm.pdf000066400000000000000000017460011415223013700201430ustar00rootroot00000000000000%PDF-1.5 % 21 0 obj << /Length 4893 /Filter /FlateDecode >> stream x;is6+TژA\\{Eo|κ챴z5UZʲuv4G&])s\l9B5~"!+=*H ojH&QT>ayv*|z q"Aa`b2RYu{*@^)-;vr7sX;;"X"ɐQKY\Ze-2YxO&ֳX\ް6Kmryaw-[jtG+h Fvi6}K-sT\?$ 4nz6 { mC7 i{֑.ؑ퐗 Z@Qқ"7VvK<6 UׂX;,(\&Ȋn~fum:˅59)U-[؀ŠiQRTp/Oq}%#ɭ@KMLnl]ЂX:ع'?W\}8Kf=s0^FOp0@r y\[:&v ӿ[ nZ4r%tWx B+%$~ _5f$dq=QsL:]ͧ `\kyL[r݈AϑYG6C"u0D /K O qjlQBV#Yl qiD4>dgI9w`4UϋlߍL? ljy9/)o= cٮt/zLhY@_eMC@)8h'w6\XaAjलYlC_3܏x4}xQER&3:L Q-$vmm3_94q6i{w/3ߦ6\ȫ2en0lk FllP}dFk ^"=fsX Ʈ˘ 6s)2m\ȏǂj.2V<0{=&B퍻Np?,T^@^ufEN>u"0AF+绿2+ lm͝췮/߃h;~=>8 |݋wϾ9}j4cN*ӰtNiAĞGrd/) )Coz+*8MoU 3ܬIE~%(Xn(]S )+B5\aNJLE|7 Z'Ӧ EoCG2e![f񜇈d4,+J/)Da) C5TZt @!tБny bI@{2ķ|(TΩȎQ<80U#Bъ{(Z!q+cڌv""|ؕM 6^d F Eߕ3 t j;oߖ^9ml[Q:}iY|Ket#w{?|.F>-WHCEȺyV_1w zѧ㰣7>t6ttzSpFwv]r,C P&]*-& ͟0")3F}~4mwx7&F ?<<aw0_ e(`!_I㯏_AP?@#ED#+*=%w^D~{b%\~DTZ8!LJB74 ҕӝiK!nr6>)i7/l;0_,>*cH /w\DAFp^#z\TcZơiΛ?*/MdfuAjzwǜHNW\X |Uظ$fmtT'Fr7/Blnft-G tT,jj幄Dk =F"0vm-z6V58}M[K_Yz^b,/>jX(+]CyCۆ._>{y\my=eZeثx .5/ '^%ʍ3jLTlyDΙW-1\|]JSV$ $r ]‰T]8 n,hdp))pLJ:v|'h3B)b E =ZBR^tNZ2YPYCO"(aȥp5~/zaa V<8y05YcU*2a|xhayhEǡ_+&>. 
cbĿi>d6ET)D!8ev Jm]]tQsU=H߫a{_A՚~6VGSǁe*.x* So&ʌ̶]ymZɉ}plUEDm`'ӦF/nןH+ Z '#q)"mpP/=7҂68n}X*~pTJ_y%4+~Rf2;QYǧ|[q9#NE}γ'ACkǬ &aC#Qhy{GY GlBMمPTl?pÃ<@ͷU՛Y7w+="J8*I˩ @i7Ԣ;7zn@Rw—]+=z{3enj0m$"?~W+ޓASբ-p*w 7a,dgFR endstream endobj 86 0 obj << /Length 2368 /Filter /FlateDecode >> stream xY[o8~𣼈P[]^eL-3vwF,KMš3 ~;j|D]!|^G%l߀M5]og([|uj|Byv5`1bj^dA2lAR]]e(=[i:Q`t 8ZZ^wsDz`+V ]o{^p %zXdr`~f7fՊntˮ+J?TZ//C 5Yazdi)CT)T8߇ f&cRtrY'ʵK~ݏcV~2ܓb Ѵ"|]zkP?+e\3}r] \Z]zN?7y=sL\MI"_H(Q ;)j~yJGreWݗq.}uhCu3V^0@ŭ4_EW֛ĹXbJIU?WUkg tr> stream xڥZYsF~ϯ`%`E NWA-GYJLD )Aa[׀DJ 1uvLίT/r8/bPL(mL-.+{qd A慓i z"@z Z^59̺iv4ӡXˍ=s.E-ҝq80P3ۆp㱥A[XQ/KA]Y??K;2 ua#GbP̯SqMB#(]ފAHR fIRH/ h[$F`"@K*V?=9dri"m|6m~[='4@hyٰ%<ّ>_$ .e)O7zqS'I 0$NZQ/ZC0m,p-ŋ"t<#NBwT<$-PjnPVjPҭ CZ UuzAgRxC~0ɵ֗_كFsyvL0='=10lJ;.߱ hN#Cڇwi ~o:meU>41}xA'M'uI8W[Vs^b͵.r2N޼ MxLOKd!3OA\4fYI4"y~SO5 Cwκ@t[&M[^{q10OG1coM>'X0Q2\` 0&0u8b3Ӊ>˂!He˭GC4fa BO$  PtB>F;[(߱P'x!M߃ObFR{~mZ'Mn? m7NK\]Mt\2<؊;yVunzgcj,ɊX~x0\"h䋢`>u^WN+P9nly?8^Ax)0|F! |LK My NwLsYp ڲo+#0ug"KFRcR뿻> }?+2Ss]c76r JfGn lv sl<Tq~NVQu 1ҭN ݁bUGQڄ EPpk>!Sf$U&k8R'4#@BBD2xJ8RH\?D $u~kv`M߫N>F׋G6V`  ^ 8]?ڡV\@>E.}A: %\2SU뫹zN--o_]1قG ;1ыW`~!a <"]W nL>F4(2B6ݎǶ˶J Ar<@m-c#6D=pq*. 'C-!3t 4F `J3ؐ7Nz$3i0/P<4D?Rc GTcP\5`U܈fͽ؁A']9@y (NXO^nTYܮ\Fn?r;riŇi㎗K:c, j@؜>WH)v<`<p=@ PHQ?ZppefLO >K>\e SD|@t$/lI&XEڶ)c)I25{y(ͫ`hRW׺ؠP= !]w}ͳ!K$W()>FLL D䂙 %>xEҜRNhGq8y 5Dp'#ZzB/ mW · ˗*pX:r!!c;+$7UQT6CePp#Tډyn  Ďo}/ /k]Ρc/"rpArz}|v%OqSck-? 
ֿ&+xw<AKB' vг5_~Wuhհ+5ZkM\h x~k!=x#Ϣ:8ky/$c6.o oȅ7˥0z×_;6[n{wCo^o so砒ڿr:_ʦ!6M,xµ+kz  9̐ Iӭл3]῏iO%C A`P|N>!,+PeVSp++UF*4sہ0N.OtA@^[Wt(=1.d?khcJy6NCmvf>G=~ ;*F *|MK*kD ^ ݡIk^_o endstream endobj 2 0 obj << /Type /ObjStm /N 100 /First 801 /Length 2614 /Filter /FlateDecode >> stream xZm bEAy)NR]q(tڹ;B}(iuVk7q^QCR#`i SF2ˌ1$JȤIVLZWR1f2XɴbJh\RicS^LhqqDL #Wp]4s%"ӸhRqKk{(XB1-~ X3`&%1a|/@ ~!!$::Ђԅ#(Q c"$XhVNc&V r; v` [=sO' ; yҽ V,@xWD`,"xROh<2`p4Aɂ۰tC"XEY*YA@ ` 1 tdp+ fAMY Ja5ޕĒY %4Qu5Zzsx \E8E,ZG 1Pn8Œj8E$clJn Sb do4%˟W;<^_il;*fߖ#)掽6Z<&y+iGgmvo4~0A5Fzȑ9ɑWMNV\炤+{Dhn[#V a.ݫemaYW6Ou몿_7W.&V[G*~+g 7N?X7iݔ|BPP"@xntfWnbx9[A>m)>9do n}uYK82iV1[D;@ōGxQ檿Ep=U=lf}WYЮhNuVRۥ}ʴY85B r ۔SspMq]ڦρFfe}Im=ᣩy,m_-;YDNK #d`h_V7yuK^qies(!\Q)G>ـ_Q//h^kAE"",?"A>.—{M )ՁXTQq|eo߬U+#xrl:NrBjPƑRiı%i?SZ]YҘ8ktx@5CTsps%lHQ( 3((s(2pg%(x#PDGkMѷ]AVJ4(^hr(d@":)mڣTġmڷY pX,G)= KL~;Lфq)zjb&hvjM-p8ԖTs:;T}R .W9,6/U]۴]/;n,nD,Åر'䰛 2d0:* Vz"1}OQSR9*;d\~^Nl~3}'6M(L1&(JAO4m}pڢqأqؤ1Ґԙ89Jn%G,rf nS[ 2/H$̙6"F7*׽}ߵqgA[XsWC7oN"G;nS) m<:*2,:_ iؕ= jG_TwVmݟ6 '(aK9ѺIfͶ~%ɏb֡*XL 1>`l~z[]k_0XL)f72 6nMB~$%}UA7 endstream endobj 158 0 obj << /Length 4113 /Filter /FlateDecode >> stream xZY6~u+FE~ZsR13v*ݘ>yWW[}}AH$2n|-_^V+yJaf*Sqz[pY7Z`[[qPrOws;~zYV8 :6b'Q̆*6۾QL@l6ATyfâ5eՖۖ CL@Gtp mNEG{ݮ?8nf8T\UKZ+ΟzwBZwGWY'!M8pyuŻcS܀Pw>{eth" =k#Y2U(n 7T~m:ڔ|ə#eεeC$p6-wV?I/f 'rʏk#k`ޅq6KD/{_OHRJ3=cB汳*4F'!z tAz5V 7t9PNn-o}D- yG]="2~[: Y)w$tP|}nLhR řB|Q{$D S_bXn[ m:!џpxBhC0ӵT(S4ȚpoO<3`:dq~$T\ 2p'˺!-?LC;$O`ԱM}b?-oVfkX&x!Pj\ 6 _xש:Jߝ&FF%aSzA~a,m$_ 6o=GKj_PzTbכkAC$KPȼN[ F`%,j[+24(1 YP`Ÿ88a u.c90l\t Ǒ4%(Qʳ{ >˪1~80R<L u#MGo9%YWq ȢyBnKA3~NH_[T @09}54>V)]1"|-}|I(-cQ d0lZ 4nވJ ({G@zSt͐EeQbҾ ԩHt*ߌRv틫x[C}+ 0͈cfC}yhB5>xO$B̘: aՇ!qBIg6'1WCOx.?yc-H:XQG@zC9$Rp Yq`]| \0b <T16mi]dezHp*LfaB l3ơMweEe::&VaIfK6ާH|3< NQų{X4>zRe,{ϪyN0 Qg)Lp,7 m&vGWg9 FmA|Y*4iF2ǔDL3K#_7=g{e[~Gŝ&X@iԌwe{/ĠR󝊜}uSgQvbV~(66TPC4VRkVEƚ!ȂcʌJxT"AsG\Ym:hi4`ᚒԍc儤$Ʋ/GD hN@ /o<V+Z+IPmΝ/mc)iMFőh_A|"*lv_p ڇre]JDǷ /={BbTFJUmZ@G9pR5B{Dt)L+O|;RS"gؖזtdBc_9WK,^&U٪Ȟ41zXk'>Gyl;$?!Kf%p(Y9b4ޤj 
ꎥ^,9B\e>%nsr',%+)ƕ<*,Qr|j#6`Á&,矆o>ğ`hс$.~lu8nzz ^NIA9<@[!h*i|FK# șjz⍡2e~.4$lNjiCd&\\w`C0Ӝ=oKѦKY$ &RF%^ajuPlPlkF~H>ae@[ C<S &J}3hhv+6KcĆZ (Zn}N$]Skk] Hz%Bu@N!(#\ ٤ԒsIi=9 =ON Y%|,Vmf7z7{r\.n* M"s0“Ԃ86|WSAk4^mܸeݷr<,`ۺ%I^I5czzB_Om0C_l47E~B*X[7  AbPgzjJOg.>JuqG#0W=Fh F-[(9UjK.UPt0剜vR5i0olBwE7@!Td݆2#nQ"g'N(Prі>#2m:t@xEO;+# Ԟۋ=XOS;K<֝[ ·m9NI("\\ۑsT{I(X Lм)C +E2e&}\>w8 (%qOGNzXPFܬx='n3iQcy[yVYQ|,{% ^Ҡ-. $AEIn^,oxO" <Uyi!>K3l s/_~Bvꇗ<-,2K#ԑ &uUͫ[/V-Ƣ[S2ƛc_+ endstream endobj 177 0 obj << /Length 4169 /Filter /FlateDecode >> stream x:rܸ] \56QbٮLUrՍXf X $ꖥ$/XFEtP*,$^\ns8o\4 Λ~j~?!PQXD"0a@Pb0OrW5`ߨ{]}\Z`Sj~@Lx톮l]` ?Ug{n@A8ޟe4½u.Lret\Pyqd20 jw ]34쀆D\sfÔjg@DmY7#`9Z}?-_+f?x6D:QY0> %Ox!8qd7s'/turbTnV}%R &L-X۝Ɠk^CLǽ޵j[6}YA}̓ v㥌xBlm}rR%Dlxx-ʿGZ $bW5Bڦ'ipD(LmW"XF>ËjVv_8IUհ>1Y}!}XՀq7 q-i |C* ^={6|+lM^3"45d$I:Iv੼S8Be#8 HWT|UؔS@w\w]zERQ" KS Thdޅu-}ӻ{w&UbnqL7c\`;;Fk~p@5+Ϣ; qĸ-^h|Q'N-ϜJmXgAJy"#J)&˥в'삻Klګ=w-&'yŦvCO^GL=͵D4LTCBf@ߎt5eG'R^:J9aU/I[3'oFB;O$0'_W`d!Lbs]`|*b$$ߨܹRq&w<+pW0p<M%,j|Dd {My [*+*anbB+:D+R 'EqQQyEeh?@_*B8"**H;8tS"-1d@+ cEk2F, nc5WI u.)DtJT#L]"ȯKRpZ(([qUa̛#]F۫z=yqNP 5Oo)ga:jiӧ[@\xn\QƆŻr6€z _?QD{rHG]ڮ1}hz60L^r$9U8+52Q:;4"'7Y0KdG953E {/a^_RSGh*G@?UWܬmw^Xċf&KK%3i_qXExp1"ތ|ɟ1W# F3@ Bޏ=[zfޞ#sVS>3:x'` X?;r1LtRÎɍPslq]%QH*Ae0= CYòL8_Vޱt5tƤ 4=bWdIGy܈N1ΕSdNj:3P 4ǔ':Ёc.o-$ q]ƤpI慘h=xn஧i1>>Ol J]&tGEYAfz81%*>c3_p"V> qrl~s +Fa!wc3WDs/rY4JjV4 \MSBHĪS`V#ҩ@p;Vv5#ؿ?r7hQb撂j=۾P:f|u<7 yv _j8>EQƙm,cPrrsr נgGu(cre$jfA@eIe-%qPqnS~Nqge`DqPylRP -+*k걏Ҥ(wFꈼf]Ĕky]3X[=Llc"yԡ uǍu:fb޼|?Fdp>< . Ycm(fG!%Aßde1%Kg*Rs*Ii%CWMH|2$IT$N}G-z7NиWь\pyJ@BmFQK¦󣃻dSPW_ εe)~6 Hxf AAw<0«̻-^Yó*_Lu~S$$ br艿Rj{)?Cvlotl=Jo;:N^q7tHqp$Lu$-{,CyZn ui"<N+ SÁό2Xv8K#Ӧ`$NRyn(wNYAw*R.UcͿ ݣn}=|H0bx߀E},eT\m.w xkS?("Oaϗ4w!k?Bp/@&=Ѱ}#{]E[T~ťŔ쨃ZeFp6dɃm}}S~ ZEFyϲ&ٷj?@= ؞ƟzSZ.yc]9mWAj{7nsȉD8q[LLu6/[-"͏\ Z1'ýGW2_t72!^XOz.bt FK!BbtfI$> stream xڕZYܴ~Wt*ؒmɹn%!@P,t4^H_&/==x%Y:9XW''v~ju} SFzeC+-޻ zCU~։^4u6ů_ߡ?  
2NRUVʢ3m [ႆ+?\شY{;Ly_r?pko/ŞEWI [/+lS 84roǶٷYU񶠽e/AԍPCbEm#6- e!kV<#i?1`x˼ʞ[ܷcuqgrV˻U^"PrRzOIi9I] [Єs׉_m[uQ/֏detk0_]9 ȃy:6hBl#!@>w}p|F;):#l* _nZ<3m4̝TiQл}Hag$&$#fC1PmW9MzE~F0KDYb"xϳ#'MRT2'<(%6(,˹!|ai+ $w WͫGjr807L Lhs`@<ޫ ?E-e>~yǫK͹BJørM}O;7@קg!N]kR9BB1CC[SUc?j VJpK<8S0pNH^XٹuRh>&VE'rx jGnG-jxqʈY؆k"dv\1'0)hJ4)ˠ͋D&-# x%!"`;6Cz"v$9k}@a2`Pf&N HI5joĬVؿ :rV)zɊ8M*fI[EJ oxDlGQpGshԉYI[ikvn5;n1\:1C9n |dR%i;uj 9u(6[Wlr}$t"-xɎPRrAO?ko\ցRbm#t ɤJt MYC%ҧ7TDBMx<.[B~uK "x~ 8A2:kElQ;8.5{Gϟ$x_S5=L^P-=߰P Ic,= 5 ̛2K! ?{*z2]jUD=qPZ,HM TܱHDq.pzt 9ݡ*IE%ߋM_:Pe>\`&墐̂yEY(NtR F+F+G50xl?~EX!,HU r[ X\e@7^#ڙ0YNs )e<]PXr3;ɹ/Om.7YO|3>-GԕcN(S޽zyнX?I%YT VZ`Z`̸ь FVb933K(817%L(elw;SY:-{}H)?$ fzɸx1J,`:''eo2~jpybX`jPH{X{D!E\OyP#@Pɰ+ i2\__t"*]tyfD2k6'>ԇ*Q]ドkqO7FK%(AApt:vEIq{2.nÛry{mc(^T/@&U&F`΀煨Q ic0٧܀GuKrHv4'h>Ũ-;¯`*'Ǒ&p!Ngnß4ܷ#c,qxzB;}٘ۧͅITY<*-ΘH&e{DPõ*#R^5홠 Squ>-_= LEK/cKw,Ymnf3†hbe[Ff~)}^Dc"t`9bٷ}+0=" Ѳ S:d60˷F`Kع 1zPڸ}bTC܋{Pi, Y 1@hry Q}S7P `Ha%EEޡ HNV2LjлFo˼, 8NFdۼn8 @X]WV "GqonU 9yp ;;=S0BhU¡k*\(#Hƌ  GPFncPqI5<.1Pr={6Pd1` 1t`;t$c41(x|8d < #J媙ܭ2Jj;Y"(l$B>4>i {nV\E@Ĺw o[qqñ-}L06b;%đ9U@’ I|US(I N63iɥ]_=brw?MjtADH>U=vâ;G1Z/:0:Liiqi(쨥rK[A%5*J+uTy_ߤuڞrd9=+ Hx8RRNRJR^cZ)B:E$#ܠoZx_륹Pl֋F/!Jd+noE^*:_o*x[#R7E{/D!GvY1ݩeoo MO w6}~Β;cN@EU~jO `Mw:=}挮n:koDpp^G k/'r+0r|H@1 p<;58ӁB)g Wtuz2Ef*i']0] Nj-a4%Ee]ߐ嚊iEmƩH|oiZ_`p6 @i5PvC鹣@݊G ?`APi0@1@$և˦i E Q2=Dwxw*1. _Gn 2 Z;c^G:г'(:#K*#*zGe;=,ןY0 endstream endobj 241 0 obj << /Length 3781 /Filter /FlateDecode >> stream xڽَ6=_>uPW?d'ݻ"YnjWbO~"uǞ`H$EuWQn*Xmz]/"\ms0(Zm纮כ8ͼN?80#€V`ZxYiZȣ]Yp׾p~ٙ e]vIyV_"Z (0$JϾ&Yy: >Z~f#TʲЏt8d츭krbWήmL[vC/;B̋te=#DqGj +66̓ImUw Kmb6 p<<Q 7BLG_Y0SM[~çu5(zzir7 )ر׾0B$q,1+c4ܹ%Teɮ=f8QF =PGLmC} *\6aq  XAh[z^3S%`d& +ӂO9ӳ,(ar3(N pvH#80oap t+b9"52MEUokk`DKI *ip7?4 ǑEr?j/Dzg?$FHZqdCͯ}OA `ԱZ$ $cAMgw|rDhQҙuNWa!y'fe]  ڎ+sM (4}8H10[յ͉۲o1BY34ݾ^G9O&.u mMAɚgj]Sn2n-$\R0PtO< ^`5+Knn BChLgK#_e걄~*S5DP9OB?D$zϓHn` P ܖۗ/>J`8Y ^R[-qx^ucIWÓ~ee3=w`F74ۮbi 0ݲ4 ÷<|[Dæu#4zxA6B|I1%*XT12.Փ4w. 
~̦a~2'Co˃;ͱ#Ȟw@L8X_PM3O`4'S0&ȱ;b*"U iE}|@þɂctjOԧ{V9~6M!X,Fkfh92bQ {kϮNeFV~S\a5ahvwNYؔdp8&bj#:d]:)IJ>mjv`Pl3ŞCLRE4O7`ܬ)f8>(5%L-d\` T\J0_p]vuT* ]Y6__T>!9LWsSȊ.fdP8xZ^pT:y0,Ie|*>\l:KhvI$n mwp“nv2Z (N=GNe+nZ7-V`ķVI ` . aM]z% @B} Y,yȕR[,X OPn,cU- `syQnm&Ǥ;r@IE(:Hޞ ThǪ'A,w% sXaٶFݸR//‰,=l =wlD" /+DIgѾGhG,U跌Xh.j(Rݤ0VP]ͦ[7K۬]ʦ"Hd/#%@5qPMe L~T5@ IQ\jzoqHa K 4obChbjpZ @ -.7͎kGX! q~x`LKvz(QNw8L7 )?3HDLC(tu9U@_Mܡԑ\HuUY;f+iYo֏H_IɎN%^ ,Q-֩>,\mb( _ąfr fV!`j#Xp s7Ux%ˈ=leq ~_H:XV'6 gy+K>'$LaXl~,jsokj+.$]G'T򚲧٘;nV nf@Ixuuqgq$ =1",J.`oHLg,:E@s!gA jQ/vYfbAqt7I}Ib]#13Nl^vO'@ΝaWvjV$裸eYqC^YG- "6\Gkmk9׶2ZC4^؟3^Z -:YbZ^&ԌŔΖaC#q u- ld.f I0F[L m+Q0zhF0ȥ. g XV2‡7xqPà oY5)ŮDy[XPb4O}t[Q% 1Z6;'I$r79thR~/B(N`{W99M-\z/!M]PI;̊ 9gxҜnr¶\oaL"@JSӳ玞ߐ8RD *DKx ("q%/ag+~"&HLO3si2p3L <Qѳ(s'^OaL [zx 4vy)-=?NNyO*.ջ3o?Ϝ CN9퐓:Cqd~5πI}toJ9A겲 ZP"&T WゃECE摒Ҧ+MJ~ds5w>Y/e$sgA t)+Mm"t1YG+_e'uu+dE\}1:b˒cG-褔6<.SL> stream xYm_A_%NkaE褽%s>>A 3<3AemT1ڤ(sV>%UHmPڤb$Y%`Q)uVe#=pΩsxg\-bdy6Vβ ~{9|E#feIbTdYFeE"efX&EpͽtβwFy)7L:3H&  8d'J:/փ&΋cfwF,bxE !yb1^89C嘔dabK$ O& Qd-)Zeb7pL0ž>b/[1;L8ȞG#P3Jƨ`2dPzrz:,P=M^Tx|Z6oͺ֫ 䈴)$b9ltL\l\UWM}sT{A_M}Y6?O+ /6깪puBPMjpY.?;,9- :2ldNVuYG2,d/Z#kf,r 6a=l!k!,qR  LYK`,$eX!֩T4fgR1g3໻>(oK`n`y#JVϺvFPճgzY@lvUOp}p.!^^4ڷjGYMV8la ߡq"Pv'$P[ڷnhFGt-8X5hpxl/l]լ˛E}MׇGurM$#򰬝d A"T>nTl,f2I`M1ljӄ;hoY7iVe ?tfQ/N.n>d~ KV ʌR@GGӭl}pBqTd )$! e¦_HYeR`ikGaTz9znz؏N%?BeX\8Kڡ)"P&TLܱ|m Cp)¢ә !s --U#@Gdq!i8cGE`{{O'#+GD Č;_ζf>E\fݻ#0!Cý<?$y<#QU!N= Lڃ=[eGg7F*6\xfW^-4-_fSw g ?N j`gYop2zJZy^n;ȵDWQd3EPNQn EaHc$5IjeJL.:b!V2x_8o̅$[L\I[_BNX`]*â41=Crw?VۿiQwUǤqO:>]lOqM]zj#p}x Oo.g7o%_}߬Q_ӝԏ"DiQgC:ߙQT#`-[ dCjq)5!h.;BʿCteNǎr(ԠxF؏D1l^e(DŽ2l͹+>q!jd`тlR*Â"F -mV1%˶آbJ= . 
Q/ޓʟOŸt{}@iU>9ڷn?6h endstream endobj 305 0 obj << /Length 4532 /Filter /FlateDecode >> stream xڕZ[s8~_Sj#$[f6ʸb舘lmA,"$o(YyAq W//O* C?*L,(v[˦&U?IAb'4J}j=$n'*lLTS+lw}Y z{ `@(A& 8u70''|c`Iydzϻ ?6zoM 3 7;Ӎn2:~NЗaQ5;-& 5o*[*2Ǘ)̦DiLL|ч;lEkڗ#J-񴆮aNkݶ Mj~8ֶ㾛6phsVoLߖx|vU_ƟL$זI-Wemzh e,v]J.(i5m'sy*5`%UQ(]phGTdj]oZaZySSʻ6򃓇_>n>gԜv a߾O20B^}vIaWLĽ3V]:R_И~ uv9ib-^±b) ]Z뷤6qߌ%֘Upn9MwoauHl+l{s~dvy`z= 5\%)TՖ#z( D\> JJ7D3oTb)i/|*SC XdύHq=][JxST$"K7=yЯy g44=h %p{*S:F*:@HQ:5X?RSJ}J7D_rzCy ҟ F%8f k ؠw-=ܺ#¦e]4;4 N|DX gG#-G|^BB9Zy![Rf?;ܻ}D _RmW)̥/` baXQB5siG)"@>?bKܦdRRN΁IᙽӁY@4vc|eo~ײ}ɾGe20xO&mdtxv*CV! FZ=Ce-ȢnG8JPgG^2! #/cvޜ [qКN^M'&B 7qFl>O6rZis{sɿl+g Y$~9$І[t6JJAuy(sGy҆ҖZhv7N<<؏.@4K</]_dno rD!pwe4bg.+TY8َ'្:-S($Ri+% e,ڲ)@y bYfzVh\CnO=˚s2ea 9A1s\.ǿ 4Ϧe .-田8LxM#lSq\nvN\N9^ZN"=ۿwh"phza@QO`O,wlX+\)"C G֟zu6.Zٹ*릍0m듬)?Mrȉr7(Y4'8_O/;m1Ȝɵk6I}_N@C:?,D::W@7CJ6\nL11R2ruB#!"rCw y7ee[ndP9mJN+&xp͕,@ii9D|[Bkj0, ϐ=ЈP[1Ea)60 p]b`wsd7m55-3+U ޴+:HݽՖZ`-Bk]U៑RxeL! n("-&RBԓ,X^d+f?5`[cg]5Gi`YC7 Ԫ}F/ 5މ~sn ʔ|L@мLW9CGrF xأi{wŠfzc5oY9t o:P-(~M:cGŽ|6nf6j[ZJ1bJI,L:G.GC4%n0GC˷0 )~Բ-c}n)ϊ}+bKbD)s;!s{(\hJrt?݆yP\(-L#phg)%Ee1OV`׬z($qpc]w+z Ȓ_B;֧ [[ 3=P:=!a iAB/#P%Tv <ǓP9=eli =xzB`mB|EjϒW!=ӾsU )xV{I* 9EderخLn'@?u|ƦnoWa/0ʺh p;vVi<((7l%?h9f~£0aoWn\r-#3M7[{:T1Z }FhIhØ~z/bP`~1 Km0Rz˼ktzt\xwЄO6e( 6aו5\qVJ9'h7Gj3_g}CGZo` y?ީ,`oOꎔBs }؁> TZ  ))vqei,:t b(S#e9]AO?H^īhp]-:'Д}t!?k6E՜aqvHvETS`Gwuʽx䕛*z2[4WGiIv53_cCIڛ_ x^ endstream endobj 354 0 obj << /Length 4148 /Filter /FlateDecode >> stream xڭko @Q ).T?.&4riq%fC*IRKsEHynQg* Sg7wgB ihR}YZaܘ{X]z/you]6/7{/k5]5葳5&Cwm'6h:nNX(w[cBxb-S<"EeSw=&^~&^uyy[p6״}^/!tAU겾^5 )Zyk="l8jW]n;ȫ?Vd_;bA % 8}!i+zg$00&3:3EܾU-Ԕmw0hH^ e9Pk˨ΏH?B*ٹ(n6@Q޾oB8W>: ~tבΰ־c+qYMU2*v~/sV=695Z;78%qu$ʙv:h"sܵnLm Pl-wsF/Tή3ӱG *G~&Z aࡾ/Q'"}#]y۲ MvTA8|feEA6TMX 0kawV;V K^mICYE :.*SQޒxw76$XJ" Y0q%ڌE%"?S&jw:Q >E@NNUQl紁(m)kO*uѝv4FRVOXfv.l5lwE yombkxQb enU=N%ԜU}!Ը{e})x `$_WEc#/tHVqY;jh{  ShT'ڢo.7MTN ׁJ A9XԤxI,hL2JYޥ0N3 =;4G>J?qՅ%(~g7+bSI͹/%hE=' flɘf89_epf0݁n)訯y7fxr4O9Xf$.'[-@)E,Ⱥ8%Ĺo!eccMtNz/nr1Fqk<w4l-=BcB$)!wc͔E˫է, 
]>`k%Nnm9]Uv\+WݥrMX힄IJ/ÆeF8:^4s1x{( >G(R o"2q˔Wu3ԏ Obx+ `Ӊ y0N=p8JKvm{ vqnN*nB~Ć@</e _ol< 8 Q'FGDw ($ѱ[f-NјPyHC_F`cY O&G fk%Bf߂xrjlxgkrk89hKn]vjzR{<'X29!DD='N,Gk`&߬1',dc;,Vb-V0(90G W6Ւ(A?[REEux*3jW{SW,}7`I2NfE$t # 1v۟= mp2 8h}lLWeÏ:bӁgw=0"F^ΔgPch@_ʫ_ %JbC.o}0r&^ <3lHaAy W?"4Q]ckG?-vvytH?59Eqqİ9+A+DxH)ILI3=]BFj+*"ֹ|A؏;teLBsvOUvd~oaũbH;Fp8C+ tF8@#b`\pjϊԾ4rD #-eY0A0~9_|$Ȓڏ|n4ZS߯9Fc*X6i%+S{=Z8i~@mEC`c!k5 P (b9MȥB-!bLE2@O.9 sdͩz9̇k&=iW pU< Dd5DC[V{[S%ESâ&|b9}ޭsߢ G nmm:mTn$>5abe^h_1T5+h bt m'WLN&+DWVJ8CEOTBlpA3pkk$o ⥎fOqoYVBfJ|>Z"CH@RЧx #:ݜ6%6Px?>4P ^ӫq8 Gq7ҖRF bܸe3JtxhGn,Ĺb0 Q(5Uo5{Tz_b,[ʊuI׍89 }`#.hvVhTV0=&8ߍ{?:8h'dͩ3\㚶GRAHt`g,!gWZBu=1;DPL<&D\IYj9A .uwZdwې޼t9Ս"?ħzsumY_pm:"_ y;!^1ZȧA!Jo$r8>t,t, ?QE늪TɠABw~>BT: +bA؝4قQW rEBj B\ +T0BP-|@DrpȷJwr6ՌrTqQ8DA" SߘoH5ȑ,;ໞJmᗇb|E !U4$zSũ** jDRSVvd" XlKNz#fHJylb /M+[Eg!gY5w@EZ4tVji y.g* װ &zı$^.[ʦڛ*|mDsU5nj ªﱄ 0~J_Ρ:iKmv7q(ϓ%UֺajC|vi0 nÁ}8'mauBgx/wv e٩s8G'WN^ 47q:[̏%WGt܉vNTŤwW[ԏ7T2uQ:8!X.3:)-BODS}^;3?9Rsa+&a׌&Od2#Na cG%GrmZ2vвՍºpϥt&qԎĴ8⻒s@2 Ci0j*!4> stream x[koܸ_A (a(E@6mRqvck Ҍ\ΣMv'#"C:M<y庡))LiCa*^muZXVJ[Ӄ.!8;=S:9B\f)cfߝN=v=XFI2=j2*tFM}'f M\g p:CwyĖm]H8~G'įTPuP>xHۻy!)g@02GŎZ~<*aS6)l\כ)`eJ/8RA_O\lүV,;$cA?OJ<@l +d- endstream endobj 371 0 obj << /Length 3356 /Filter /FlateDecode >> stream x[m6_ˇZE !I"E89M%$no^HYnm:CgbL"8>?L&a($L$$P~?y3X{T,/ο? T),d+x7Y[7<̛]"_dm^iv4}<+S_z>)YD |Hdm3x^UKڵҔnƱES7d_ eg/dTLEؗ!Xz!6ڻOO#Wrw;+ e:Xw"U*`g*cY"crR!4w.~ {~ uQ)mD)'7bR5iu?|L~EUiYb6/5 8YD5AD!ݷb^^j(f EJL=?dYr]sҬ28(/vC6OŻ@|1l\p  Wgy\ٴYj DE{ '\ޓW X[T-̚7Dvj)7ˬ EtП΢(q  KSL _7~^Yo+)@P..ŮJ;cGjb:<8Qޯj60kO}:{¬FbS&Od<—H1 bF(L 9}EHV$ ^B8 eLrJ_۔ϡ!9t} #ā;^MiFiIiJїl]QA"uޑ*)$JD,̄~d FY Ȣ:&Oa1etG%9kJ ,iRc -_8DCPU*Vl`[&1*AK4$A58(}v3.Pcsn '׼ Pt  {F| 'A_Z%X')d:Rqi"ix ԋc^ow@ӿ))@b%/QX>کW~d1"ЬEa5݌pGJ{<$/_:H`d'!Y {,=/. 
NAV'i^.ݲÿۥt]%W\,,j4'Zȼ%Kѿh|y4ficU%'Yhp trEx/c9c[ 6N-{lk¶APmJR3 Uag9U҇ii#H:1`&d&C_$|}>bGPn#Jm0̈_}ř*5͊\n '3(YlXV6 v|Bn%&rKP\3qf*CXQt vXͮ ىlˎۼd1%1  bmYy,*WRwYjEjS\ N[bԼmzáx(/ 7 8ߐq1dhJ7ǒ !OVLBnЩJP&4Fdĺr.YP(- t2b=uAiAFOT(I,U(ӹ30G3@zPD+e(_zP39H\0+~A:ٱ~kRts)VJ^!HIE vU_u9U٠jޯ]~/Zcs1Z?*VSA+l4tVJljʯ(DS$dܗ?QJ" 9"%QHfI#Y$jDy'II;N54&Uz/Q? _.<2SWY}߻"9Bn.,Zk4 Br<u9z1o2($=}ꅻEf {'3o}0HCGj8ft1z<6b' iQƸ}^1`Li& ?Va]weҭvad?vY [W(v()&/vWk.]8Um V{k66=`Mc9H\䠴./<-ve891}?̛mҐ$2o6sz)Wxo;:P#zӥ|K ŗzQ%\DX%nw`'āsѳJaOȶdqk%v#]]OGOf:}z͏u4jcU*\UX/b:gGM?Ic=b%۬&XY ոmR$lz<'\ܮk-ɀ]>ZN4.sU;fɠMQHf v$ j~7jł0~z/f[7ۀ"P~ÅpBaeE:,ATX ZPʪ#K+A_YonHAp`kCjRHxoE_"m]_P:4u6^0`%w _atGMQ-@PO{8^XK~Xx&B _FCgNG`_k_d; endstream endobj 416 0 obj << /Length 4746 /Filter /FlateDecode >> stream x[ms8>Bb^%=I%rlNhXD IM~(M궶\ht|~/==@ga<-Á`<2Hdy< U<U K`xt<Z h2+p x$xx)++4h$^Kfd[V*Ў zGk:+W'+_&ɚ HbJ<īl$%>=W@{F);{*#v(> ,Vjxz<e 㑉F 7TN\QyBS*[1Tp@a_ l;kJ'VfS5@MOXN0 |xx4A~ាFv jHizC)_- kBZđ|CiI-+*sTn%gٔ_҈ɑeULO0_6!`nE/L0a37)[\"/9s)>5Q3:M3D5j6ׇJ=šʸrEA@rb@҇ڿHF QmMrI]* GtֵΩ2wF,E g|ř=~p6[OtEx*VpmL'4FN㭩\R9a#ܛH~D$hp Q!( \o)|->@xESyA埩T*?k*=ʷfEtCi+±#R&2g%477ɩcJFoIY )^ D!,?_I$66ZS&F,t9adSZ0P1@JA_,#22#'/X] }-g91g뗜g{ ]״6kxCoh][g7cJONPZ9,':!aI0T.NjG*}H $p rE jZC,,PmԮ bwi<-2ge`lL"N &WP7"_ዚ,Mf~IH)wd5#B1_,GD%yḪojvr'q̮DdW-y+2P\e>eIe%Ѵ>"*6~Gnc&\ӬZ uWQ e*MHQr pJdkZ=F"![Q!mOkmtde=..ʻ6U ZfϦf'9ϲ(;+"i|4Z#ZI>h^KA8O~1Vme >v9]rimqImq%&AMl_`Ëۗ|svH 4SiX}CiJV5(,-U=>nYovהPKIUmʫ1('a ;cX {765׺ii +a܎כQY߉R^l܇tPO$G>eR;B;ф"xEHнqCJ$cl%-!=8 Lhkni8qV㺼l|& =iS^H_0ߔټU.q mkJ/(}#ܥqWjte>rj>:r}vUZ3^3Ikf6gO>Y~Dq~G<{fhUX|;lY3A{vߡgJ{Pk};=c}d44mSi' le QĖב0FGRyگ5;e0Vׯ^]\_]_|Zy :ru7کڦI)KPqѨ rkk=w}(BJoڄy[UĠ-*❺|ZONܐ֣L}5|Vv9@ Dp-[2>cPko8>T2i ΧZ:Йeϊ&ܻ #|W9hP\jDr'mi# E^-}Ѳ0zp/,Sf`ZpɎ"znZma?N))5.D@v-G KZi v 1-%*I6b/) $.Hۦ%Kb<5W# ?6>D]$}p2/]5'> 8;E~4gGg!gAd'dPjRPxq}cnN[J(X"QۈY0 z{g鱶\r(=L!U}N &[Vfqd8L䬢ӆlj!g,v] D^ B8uJq%V\IG:|xE< 9v`vLwWNђу'iUHP^}ςyL/" 6c:%Q~sOj?0v1 ɘ=hШ[๊&{yЈeԐ3 ~0/78ch Ex{'"H*Wsز^ ji0B܄`!,EPkc.eF_`Ζ͕p'o(r:XA}XJ2Zv$ 0J"swPN ւGjWv֤r搎cp D|*jժvY ^Ƥ:רp4wx#LCUԅU@A>} ȑ@uTVG\J ,c:牝y3xy;! 
yB#X Z5/v}rv1LHwGgk~o.!q!]m BxMcą:cӫjcM+;oBcBGIzH[Cu-u+V*''Lds}<^7tԦ.U^`fN0|+أg tg&PIyo4SUpn29'ӄonTssnsիĚU=h[hPt~4@^7/VS~m;ϯoZU" 6~!CKAAf9E_܏ߊ}q?>ԯ?SXWCj$ȟ$tPy0ه J6uNoZO*쵈bexwnь+[֜M)1H^PE{u}ДSNkJfAs v{CݞrhEnʗ61#~PqCsh;>yT=ן(l:<݂L.Syp =ʿ{0yxw| q1u$Y*5= t,>_ȩvʐ @^ snVM46 Kў:lV:E{h ԣE lULBk.QZ7o>='37\ jJ-&{6 1P绨٣*/N 0tw2=`e 8q1g,7Ė-\Z"Kp*( sWknEbGrߏm9)'hKPhKÌ Z]8lul% /Jn8Uȏygio];߶[k0GXHo Uwn-ˑxQ{2Zk\.52e4E{(β4xvLDŽsgDA5-[//ZJ ,a {̦;רeQؿ 3.meՂh=? Y]qnM|2\.GeV;_`19v m޴m N1+}ք{6w endstream endobj 367 0 obj << /Type /ObjStm /N 100 /First 892 /Length 2190 /Filter /FlateDecode >> stream xZio> @}׆r8H L!'V*+ro0 QÓ W%(eUE cPC#) b" 1bZ$I1 /) *m! 4,4HC t`E'ƉLEY` -"J,ѫ>3i%u~)Ow)>(NWen&1q8"Ƚ6@&(~ވ'''I3mNyj3M7B |yh~ l?f1fzߏS>_< ioV3k @i#2؟mZ|:x슝S䷯9E\#X@,G-]^^<,q݊79ta6mEըB?4Vc9~Qt\@Z:0Xd/!=haLTr1qr L2vSnRMG;?ߋuٵ}z@=m5w裇1W(]8تo,lo]0C[zn'qU>!yj%Q\bƀc?$ԇz{ xҾw@|A(l ,-=$@aߝ|uvUg2'sWm6|8bNu5'J|dv|KbWGY\[EQfGq,==5)-=a}P ¼L.],sPWu$VwO,~.Aatd;[ҠX7gih QXM,b]OB(4|!*2u<WEt8Uuޝa# L&+x9\"&^ }*#vx9ߣ e7xz(#nr &[ol&YLlGj=^NC_vֈ[R).JVpZmon 7նa^0G&ms{pVek$WΕaJ -!dQ/_Mܖ?nM(`U+fR,ºxyPe1(;]u.ɔab/m,-sR,.CvPPA28/FNT֗ůޤwEX縳e5mLSl!nKe&n3e& ~(3ҚG>[%PgIgW2}!$]krLc@*{\c@6 :/E]E$pxs6ҡo#8k37ۮiuη|sOКr7`Ԡ(x~c+:_~3@[b+UR(;ڠgzl*to/1)^=BAJ4$W1lFqy۱Mƶm!eX6>2,jmsۭX SIXa\!VOcB}1eX1:£1ʕaj+s nӅXz^E x(Q endstream endobj 467 0 obj << /Length 4394 /Filter /FlateDecode >> stream x\[㸕~_QHViEb'HA0`4dKUZI`eIˮn^h>Upo}{͛*m۫0Mr&+I^o3ᨰ} . (^ Ƨ 2t[QKQ4u-f^Az&+(c2 !'$ EE(s5VPc;\VqWTScM*{N`)c(~R'ɨL'93. 
xL D #j8(^s#yy^&" xR#B3 w.Rԯ.X^..xێp8<,?Hf u u{{'ktv՜*>7[xOrּw%CM6) ڻ+C炀cǕ nl[ʑ=!,cb͎}t`l1!7s],/,?C,:+cd7iuPx"ZDWqi]l4Bpڶ a]9 UYɸ0Mַ=mmUl eu%⶘ׯ ~lJR^Kq]dTҾ1QCQۗ%5*bV$uW!lEY%kq(kDztyWDfĦ8d-p[:an'i% vf-UYK4S  Nm6,4S?Sip`@,%sKs Ʋr~:ͮ_4 #&D{WpխN~8&X u[dUJYb꬐d"}mEsUE-f%|8 ^;W]]>Av'vK*ٚ ِ2/rȎq=dCT!o.W+GTc8\1GWeeg,m RQ:CY%W(>.P8zt8#X{ u9ڡKg=f4HR7,8둈sqӽ*yt7hZ՗tHdd+T#-)~T yJ,g9*ёF-S6p=8Uf{U>݄xf בnwm.ZLh+7۪}GF%x  ]4:cd&de,!|b/kˢGK*᷸/^P@Xl}[ TL}}ѽN6;h7n\#rM>%wTqʼXzl^!|Ke|9c>'!tE F!&b j܈ÄY0hy8tMcrhݱf$ñd&+/ MAᑮMv޺Ϻݓ{H;nJhcOs'ja^" Q.׏떈B7>rym67\ Lޅͪзjl)p7Nâo;yB4|QwX$dP)޻[jvmW<!ښ/C.CۄpXj6g#D@,@,#bȶisX@NycϪ<ǽG3te;˼m ˒w%c.rT妬ٯG^ i줧28O7g~+K+$.Y{IntHŞf-ZVŞ0^d| %D J//q:~n&Hg5jD* `ƅ )&$Jͳ^ڶQ#Իі]=".q{VozW+oAWUǩ;'X?X7wYLJf%{E 'hc&'-/ 0/ /_Yfg=Ňo`,}:!oҧ< GOaY~d%}BK+DŠGgH sVlԣ{ \ ODFᦁ;줽Q0~b̴J|1~L[ @U FʬbSjF0r6d"tfu3,PJAwOe*5 <8Nh$8!&ƺ&xS:27VtHh@|#Au2yIy #c43@te[!P=HޓrCs h2KW l',lAzĔК}*\xRYե|*㴮/n}.7HcИ\x[\qǃ'3cӡ'߷Fog[SGj<}mӁ I~t֘NdUBuaxdb? dIWf~IH%~*YR,q:6|X_A9YEM4!ظO0"u;ASmQlW&͈m a'RHC;ď@܉؟qGwfRxIDMlHUV:l<"~ Vû/_Az?#VeBRnLɉx$MZ|)Ȝ灱וwu6C)<2XrgϘA*GL,DxD34~ hEG?vp*80%yOhfD[!Dh|xl:S5`yCip9P۪%#\z8ń-Bo؜MEa#fB+&΄6`]& !5&GHۄpiMH%RCbƳWjL%,)CY~,/&m=~{~Mn; 9E _<<}7~.o z+DwavR>"eFSe pKp=`Kk 52tM2 #`byAcb;t(~ox~W%mR(T d.?g Ϟn0_RG7  endstream endobj 520 0 obj << /Length 4500 /Filter /FlateDecode >> stream x[6oqFDz;سI֋8Yg d[-X-u$_"qM]bp] WϾMUI.oV: reVYr[Y교f^Cam}Z$i#KrE+TEk) +LunkbmKln(S~8q̉.kçcAg#缮Z q~u8),xN|nATg*(xշuEWHRNݒQrjֻ| b]'O$ImmWO݄AV4g,vBYeY^fxQfn`f3En8H.=$~_r;"5$K$ E!L>3Xufh%G\צ[ƶFAzUP\̤^QKYFH+Yw9gYAAXQlE] gMY1DC{TXYCX['E^̏k񢻦{nR5\6%Cy"֮d[ZFvZ,Jז=[d}wKp@ɡp!W Ǹ@HdzL!DI9ܙBkvfBm^FTR6gz6p+K?Q 1ӍW&qq+'u+K!섊N(w԰CQhT(^_ƃɍǹYDo}~ߞeڞ~UΕ\ޞm,=JAA{\VAVdq%T ѕYsmuWNRb&^a`ÇjqMBSc_}5 Q4/^д?aҖ w v#jâ0(`(Бb`TW;_|uh>h>`Uvu# &=TY:!Ύ}CPQabo@k$#j Yw& R1U6fsgdOY[O+Vl.Xu!hE զOi"k*Phqq-P5n=csې9nvV$`[T4B?2$ q՛#G{+neeQ99 5Ϣ~CC25ڥ"EZC{FdE*X? 
4կ,aBW ָJ6şϰϐ<~0 c~O +J_RWJa{MyH(ػIP(" R&opFZ 3Lϩ$#VZF^E~:2B FS~e@Us#ҔFJ m-)6McCGrO ƄoEJaMT\zNupC\p OF$F!ŌXa`ڿD#3[\/ٗ/塢/~_/ &rbZbC?׾<(㟸//OPIx(,) XY}˹cUo, M?}{12 bIԠ]8ab׶s{k'z‡s?7q 6gqFUT&龤g@z'buVm%`Hb2ٹRj¾>nv}/͛b_ ]EZwɸĘ;+㠄"7Z2 ҧ# ~lwֽvm/xIxQ#/,:bx ϯ{|ȫw|cA7n `F)U'd"=3:7=1bdh!vDޟb7&ҩH0M;pCwlM:EÅk葨{vCX vu?r$:0:So|LSׇcO{hѶYݞœ\22ˌ׎…0n<_-aF3 `Ѻ'sr(Ŝ-#p e$GFCrfn걄&.D@1Fdg #%ʕ]7R0I%(t\AX,DB`X2Pw~JiGխ(îฦwA eًwk*+Oj!<ކȃU;Gөx0|Ip$z&4ˍz& 94O).EQQj?3[I$~(-(E9Ԅ t_pzl9&4)1PP ^! ^c  _z&S d7TiyƸH=S8Z$q?U8#00c "UĴ EP E0kͯ,s: G 6J(‰C()N56P'9Jrd|wn(Rm#._p'B{bMhwLmd,l-*>L-ӬMfc+ >L p!tkJwz?cvLNivbjNаK(ttM ™>qYU)/6Œ&r\R.r|L>IلI(Q=}0d¤EX>ufvIU㮇K^h>I=3\đ/X:02 zqQ4.Fdr`sy.#3 iax,񇝌k]`OxonR*12lmٿcI66+ (CG̅/>0X'Pw'9Zp$32 36 Ww9fvSA:Hstvq'f,r刺[udqTFTOIAƶ A~cM(M8I?Lh* ?MI)44/Gi4œ[@pö{'_ Fwq1bBk29~ dY=UTeAͩ7 lQls{(gOx>G<# Xܪ? t7EO ~BrZb(P93x?PFx.d 5f+8sع+YΦȧ([@ $nպkff̜j9p^Iპ0K8u^n?T;w>_#}7_N endstream endobj 461 0 obj << /Type /ObjStm /N 100 /First 873 /Length 1611 /Filter /FlateDecode >> stream xڵXn7}WK`.08 h NQH&*KJu.}= L,.ťGuqěd<T.f; q˓'%XN⚓ !3p[ɮhI4S\4I]-/ŵkYÿ@0#+QB+9s^r,NK?N T`»TpH[$CTQ]$4NQ nhT1Y؊\!v1E)+%(E 3 `DX,p35Xoc&x`!r02L7IqFDDF{);N`lM)Bj%Č3 rF97 %#pJ{aJc֘D5%[Ƭ?FD=Ƒ(j4f1TISᘜX O#LJFg?f)\Tf68!ƠLBOrځp=wtrݯkI|n<?vzp|LJ~=jآOWvXW?; x!M|ĖY0tN~Zatܻ%ݺoi|dOlX9鞮7f1u?v/g4P_愀jZ!I|~ṿ~nѿ<=V^݊ E 8ubqy\l:/9Cx͞F}!rJוtf6aK CaK6lka E El*b4[(J>-8\`NFlNd7Z6/Ŗ:V-upUl#).R7bG',OU̔|b/>uh6bM^)&lAE-ԉlqIֆ2"zj,: +JNeGF*/R4#oI/? 
F,Q`ԋQ>f)yo}r(8rE!P)'$U/8ce,֫ {\̮LwݶOb{W?7Dl` N.}]v1\f~.\u]fn~m bX H Rj^QcWΖ4>X 6o6%6܋܇UM ۰Bܒ✩_lVPu!hQӖ ؇BV}"Fl%(i!a-2'aAdeC{4^lϡ&,k X醸 endstream endobj 562 0 obj << /Length 3927 /Filter /FlateDecode >> stream xrܶ=_ΔxiL&8j%3MGCr%\ZV2 )Jvl8~@0ػ  '_=.{" {=:0 g{olQL'TqQή&I PAt&\ /[bTYPp>㵳WYQAgOLSY.덙]+mzZpg'' f l LC{,ƓJ/Ujڬn+#X $A5s\W "?QG^64.q[3)4LJ/xvr"b:fOX{v@sv!\ _1ɫ-a2S)cd q,Q5,wv~p~۠OJ _$'LB\X7|>Kwzd%駑Տ'i]|LO y鴞=Lo'*C!=`'{-0:BHuq09!ҷ6.l*FT,),H?qF3:`<jSţYD11$Pҁ ] MtIgъYI-\Rj/ ~Kp˃}Fzcpc`%Rwwݹl ) KmLƔ7%8#uWH t{HB_'*ޛP'| 4_YݏfD~v=PH$+}WXR\yI.Fcm=_֋7E{d+KisYє4Sa+:䐛7k-ncڮydYc3%<塳5ewnnuOIL\WRF~ PizwX^4Jқ!QU3ALCSU@6crVY-rЊ|l1$WKۇMUbRP*yfT]_9AA00)2U+`\mvv.r7U5CW57[L>Y5tF^ PQ L$.8+LFfhcz#o~Y}@ՌsQػ. ύXVI?Ԛ]4yb;ӔY9XX; }pZˤ("aL]KC~Tof XVek`J}a l,KUeegfT!Øtzf ^gu];vZ-Wa/)D B? , I-VDP9G9!_s*1Gġ" -1~08#<Ȃѩt6-jlO{03(cQB(9UcgOK4:sxzl=M,4ܝ39:tPi'9w+*#BNXlQ@HF E=-hgx 6*́nPd7CM02l^ 2n$y]H g1JS2L]SH5cDڇge Dk5Is0X"m3 rP}'zL_L)9;;+]~ăi(6)X )X W*|bS-贚sMТ!k*|!D\C$[ JM4)N(J*uʙ}EQ0_LLBxoBX nS ج ^~$R-^#}.3}T|}dr0~Έ0.cD'89D2CM'F|y&;6թf=#Na;3/lĀ6t$D!!)!#S,pI&5 U+q[egUqMHrQ OHrDHXUdWC,1Bc0;3;i/%g^~fPt!uw߈160v 1c=XٯT)6R߅6n1. 
ckb9Ns!iTG;wkٙ{?BK dxu3+\^,!1(ŝ懿kOH{ouAx]jn UHu/UN/KSzYfQo=,,W͝~',aN,̉~Ӷ R|pfDЁ33VdAX(@y*Ѵ5V聶v][SRͺ 8 V$}z9&ȧ`ǧ`ǧa[Y;Vl}!ad>e]Tom7÷l^GOu4R ށo ^&5d,h_+ 2Χ5O.J# (KHo3^vٌf&3_ހ&lY/!R5NxNpNpݽPK~-&vVQ*DNF(Թ]=/^WG̝uW^~\PUKGYҾ7~lS„6 Y8N%gYt %.=qO x@+jЍOBGLDhS]Az딏9 :`ih͠Cj,׊1m'S7 K{4ð0+"r>AI+3!$r)L"kQ~b0|m.b,$u*.HT V S'f;n{6K[@s +{1y?WZS1"vrfͣk\,"'90 >sUEx67!h,v8Q.6ʏ4e/ÉCgʘ< Q:ϢK&X*4vИ^9.(~K,`;jt(&>PM*`qp- AR!HJP-<Ӡt5A(5W@"fݖ|?`[̽K1Dn<^ah3<,sQ﮿o1wFfY!c+:vґF  ~d䈡H<GErļ 89~Ө kuE҆ZZG䷎Etz9l*f;Fpc{7N@* Iv$l؝$`X?^{bg/s:= {4hаw@ o"_ s2>=/確!gQx(^Gr4Yiii iIIrQu~/SWR?hFH2F^{U"QG`͉mٶ7tM*8U] WpBAj,Bb8MȊY I8_͋ॏ=&oht%\{e&l˵ ^*^% FQ.FF6lC޽]W'o ^)"o $FH)7_OG >mKt).ҥν6M(% [hp-7 f!6GүL,٬@،i*?IA^S.DK3/^O-MvlGPHN;cWcaG%%1HbmDXvsKr ΊJ?(\Vp{W 9!쒄K/IXŒ J(iB8%d)džxA&C< 2<]r6>vD$&$yZf!`TԬS:?#G) o  f{Bl,a$u'{,6CWϱv>Hso޹\xת1&eMMH5}.Ĵ<4߅$Gz$n7%\1׍r8&m<) endstream endobj 555 0 obj << /Type /ObjStm /N 100 /First 880 /Length 1491 /Filter /FlateDecode >> stream xXKkGϯcr"@ZKbwfe;2S`AڞoSHd 9c* $5k LVZ$õ p=HC))di`/7Ɗ?P{V[jdK d@y達Pp5ơ aT-:*>إ8eOԌ=vf]ja@6Cԩ9$;z )JG5pZ'7ńo`8k]V*:5a834pn@8n8`; ;,=taF P ОAKfHY;B=R $A8gxO!;8AXL]MlYalik1>\!xAKc3ν` U 4(!QfB)U&B% :-dʐ NNePE.0`h(-Id5  Ir5 %Yt,?`  Ҡ$P`d \h RtB`%;8 PEjASNNE8+P/aṤMװ{v.K6S4k). 
{Tk揄 j䄖:NFzNNxԇȜ9?U { i$:`5n.LpW/NvۇOp3j8qZwtli^L[?Oų]8kI/02dl|q|9> tnB^@;$HiNEi.>D <ǟ?ǧ'' Ӌrߌ~i߽ov?r]ج܌wwjoi\-vi;i7^Nv\_}A 2yT.5i(|3ƫ@!j߽kq^_/>LǫB wj@AI*Ѻfon.z%DtbbS9I䉆1RbKՇNJQ, ŅEfV}X$f R1.,S pطМcvD2NN͐zYd콸 Q0e͈2oH,hsĬÒDa+r(st$䳃ʅE3nyhq2A3jC ^cLB \!y n= L ₱ˇ̘Պ{y~LƤ>OpB'|ݕ+U`JRAң+iD.,a+(ɨ>W`*4rA6DxbRgP(P%ioY`HO#g_EabU̾ն=!QͿG&WV)r,rgvԙHi^ywa:i^+Ӽvr!e>Q|6!oɛc endstream endobj 702 0 obj << /Length 3088 /Filter /FlateDecode >> stream xr8=_apOx6ɤbTlh YH-IFLIvoe}&,'|Q|wJO`qDLjdRBbwjʛtn3F޻&iW*,@|2SaLΦ^45 7*ͼޔf`>,>L b'g+}V0KaTF=u7 LKҩb:Heˢʑ[/nWuv@kS^&sæZgWt-&4|Tf16/*Y9(-^#(D c 8U@|2͗Fj^bͺS޻.a ] mW䭓 k%!q@6r6>-RږU1\^{Y&"3Cl"x8S:$8+o]8."U߻4 *m dI @UҪebk2LQ|֦T&4)+Τ 75A+237]yM-зHJMf `Y[ӻPm'X; Mi-?dA('WLꈸz:oN$C)vXdM4P5uY&69ڬqbBVm`e@1584/uzm5 xE`-῞͖&7eBn-72oQUXhhy0bh-$Y PY _VHɢ \",.b5(ΩXRacPQÙs*LDu3i8?b~خ{#g" 'mow;DCTr. Dԟ>? /4|lpOW4j.7r d" E DŽDŽ  'P1~G~)eگƴ c1M V߭<Ӡ_b^!D& x3BCI|Ih/Yuwۊ^ > E gz;XWw (P#@N1%@)p?Mܾsr񗓣ׄ1OH DZf{7 )Z{T^:e^b/xQ0t?t8O'pVt8F89"|H]zF QhE|7OO~>JBfiU5F ^ !S| |xyVp}ʇwj rdƌwٚثpUTItTۦ%,~sw" LFGUuC59wuXd0<`ZBBu-bX&oQ,D(ǧ{)ЈɎ]'woC_{>1G}9O@Nݎ%;禳Yk_Jm$Rϙѳ~6-ԭ8h Py?5g2_%N{I,~rȈj]!z(aqpm&,nFwkYIC^-F1v.`mLѰz7b>|q& vf]Z/M}^'<8OգHnŰ]RAIpi/7Rm(Ap v2u0H b v|T.{T ^?%Y6ae)}bEe^†tJxwo%Kݾ1Nw2%*&Q5)'ܘiy#~c2LNʢy },Anf`{)JKM5BꁍY["( ;p+N!A(U{\HKȄnI + uvOA)%IiIS;y\Zw(_~xWly&჈ So.{ ȝ1pꄌ-ވ: 2=Cy]TcO)&٭Tۍ)~s;r}E30/U7zٻP}|ѝ&Kڅ4e|ns?l\ 5kƯtW/ %>ouwu-,gvvd-Ɔz6x2g6\Nҙ|ڸ?Gm i-+\:r]J̚سX,UZg×x]$a)\>2q?wRd5]U9Y̡t $.ÓhU*ZN=*"Kz)t{^%J^Z9U5s/|hmgeV(V jmNBi|57Џ<" t^/;ǥ Asm56r)k[c>?P#MSpXlI54#>U~|rdș nU8apًȵ endstream endobj 741 0 obj << /Length 4578 /Filter /FlateDecode >> stream x;ْȑy":p>i$-$OHû4H RyTUl-ǼB*3+JD"Z@wo(Ld<LPujsNdIb#D$Ax׏oHn\*PCGo"EUx 9k톟0k,Fyq,D~"JPDN94w޾q篟}1 >lй 4{XoZtWa|$yFQeiP]q@?4 G2`a"@EC*5ޫY*0޴|ERǽ+|__\c%P< 4Xݏ ?@lq=!:E&603<+̲ȱ2$qBR$aOc?-ꄲr=O)C%2dj8 (D5RsgI[FO`yCd7B2vU`d)$ T`=pq4v7c2Q$(GE@4䈖`m-Rb>W[H52Rr ̏PMp95|6i~.І)YƑ=_Ӆ9N}]}'_T@($en;h#0c&46UPefWk0 <{.?gmevks(MTq J0qcKpb|$I)Oih"AwرBFP -\C$[ _p`.RRQ`5=&7gC 4W 6f),=K3a#tP& |k aC:8EH=#TPcxbcp0U;(q ,&-Z,f)zَH83#^W 
BI<Ÿ環D'ѿQ~iH#=[э vȜB; Rď=l7 L4X%8>ZD-56%_Ҟ|fh9m6cvd>Y ΧLxuD92pI8W(zj_|pu2ҟp23}k&q"X >)oe}m_>",:Tq6~L Dy^#c9.P@ f+٬MySsW;(Ռ7 =)8C9<"H%P: M7$2@Qu2SCx׵PȦc}6]U=Q|UU/{G`ו@6l#a3iUތ͎GQlj6^gEa ,d>DdzSHN!WTTN7Q;eKe}4Jq%OF$N}ApBؚluΏ;׷jU[&z :Ӝ<&4)I"O;(mIR՘9m'PLX[SLO1-lzn )ݯB1ŔVE WTT2-(쨬l)_AuW񃲨p" N.2eQ ( )f[&![*;*1֙D !5+ W &y`FQ;_f[_S"8ByQ@D7#"[-GDB`గ!iURС CͱǬeq'gd;=Kmj|;Fi=C"g̕#olwJQRD$1PxӘy|*Hk A%- b![4DH7nzM0R8Xm~`wB&2{T7?_vStPGBmq7 [A?^CvC]rSlV÷H83P:?Ŋ~BwjxY; aꁈK߅XhiRqX5~hߓR/i z ''a8?s9Ȩx{PLJgŸ i^wF?.Gpn ˸m#<>f:nkՏl~p>/j (I믱*@gZ՞ۑV^z0볓ퟟ כٸo^? OGr %t}Fp`V*oÃÃn^ /?7ax7]\z9\r8_χv7nj~{3ވG¢̰ mÎcrvqsގ-׳ǂQQ*wh?oڜSWn7]w1ߘ - (7Z mn3,؞-/3rlWj*+pJ)"%TŻb;ǰΔ}esA$WX>,*V?2rÒ;/aN,2җ8SmN ;/%_؊FgL5;e jܲd'N2|Xs 92$ivٗSg9'hmK>˵#;1Cc Ꝙ iI ' Euv|74o/vӋlށË-?3lG̺].]+7m \-ip3ORbԇiĉa ]w@ endstream endobj 787 0 obj << /Length 4060 /Filter /FlateDecode >> stream xڭZ[s6~e!^Ӈ\mxbNE"$X=7P$%7Ng_w fw$}~ݓWZ͔^fW3z>y3(M %w}ߟ]f:WKs<iM-!䂕Y͖YYs?vYW->65yZEW+!3)8iZ SaE'+ZÔ 2x5OXf咖oZhmʬmW?LhzŚ2nϣw`:]v5!A 7٦TFLpĺ'&q 9LʚO\t ͨkcִy M.eU=rsfC'DFpzH dB$t0%Le7e.MD јa2 zO0 )hMQnWqEY Z&t5'sZb$@<;O|Y.aɜ74i&dD!ZnV8̃GcZ։|ۙil8(ٯTW7]W;qG؞3e[O?gX1%`^aSV/ח/><;NBśggW,D#T*b })%z8GkV?^qvC+쁿]A:B>rqAO^E,q( T*rSϮrXy|෠oO\+OasLn8d5JGuP݂ ϶x- 7Z$熾[[7uFÒ{?H L2>2-FK-+\&ATĶ˘{J|OY؞#KB8XH1%BάXj ȃQZ k5*O\%VR۫qb(51O*6X܎ei*U5QzMt6&c-HXanZjavQ3mͼn aJ@ep ~zH !M&d2Ce>4Uf2ﭙ?ע8r+<Ϣ.e>XfHy [aL44h6Vu+vTԖb:2_Υ4\[=-! 
[k^5[u/OIN'iW%.keDEPN ]vŢLּ%[N`znd]ak+@d.y}>6!)QĤԉ8sN06X^x[s20= ǏM$&$C<Լ{bF hA@Cfx0]d`Y},6ⅰxt@ =01{9@'BxY/SYi;}7l S,YP sX#*e 2+礵|}Ie;5fG jH՝faB^~Ks BB5vS\ Ͳˊ<2(kjE䛆̽YlXaQQq,t : ,88F[W܄*o NH< BsC R%)=K% Rsno|Q`Bp [NlZQmѰ95zbQ,ՕhPfʲ6ӮK2$HZY$e$F/h]+<eMlo%|&G!U uhAk.q W# =7t`-:X #.RQx)kd\H뽋 v̰c@P0BCU[:E _A*~Mt0`*ǣJ񜟗VT|n0?%D g-f0[.am=E]ܼeE'Fr^40zs^XZbJ$,8|G*΍V#FnvάbH`A.tz4+_yߠ[LL d-գnfBGJjn>z5lx_z.ٜ.HV]ЫABƒۉP^6moB6YUԝ5wU'tv-%Ʌ#nV͜a KA8Vsj$ HT 5`َBM(z"9Wo1QDBbklI؜#Fȸ[9kɹaAl,zLivg6?.7Va/@uX!2DxU8!Sްu[gXg8<(76<nmũ1 y%IɆ4Hd>+5{=P*L.%T޴fjqÁ2bs|KZqǬW"'B"N9U4 td uC^ܑO>)_g8@ Ջ`xT:?^cnw@]ĺ]O<)MSӎ>-s8ީ@x4b7Sd8.:Z;̈e?;xu8[|8V?coO2|3%7c\'iA*Kd#/O_W V"Y܎uIVrJ?"qƧ܀"qeћb},3/L;/UfE.[.W^Gh|]!%X46B>%?.NmL]#h|PrWeI]e`;:/ܭrѯה+zɭ}_}|tT՝uAk ;b7BFۑ[8Gp=}FNة`ӫ#p endstream endobj 808 0 obj << /Length 3316 /Filter /FlateDecode >> stream xkS{~?ʙXtIHKsS0E,q%B:prJeN-p#Ț!LgTNgzL'6ihv|!T?zε_vĵ=qk}y!cM6ڡ1D5sk{ = cأ0((= rIy?joYp DGD] #1? [H8s@)'T.OTֺ'T^jjN(TF:uG d\k#v~קhQϋh"Z B{|B. 5'$xWT-l\ `06=7kJl$U/I&g3K) FCZ' 8՞k=%xt,B!Tn}&3蟹ƺz(/ iɖ4ںDy>Q^Kp. @Fa$3EEe~g w`wQ%#&%N(XyR)-2]}C;qnJފb?6q"o˼㵱rPؾ<,NjI/C=S:Cuc*NPq fg\oM+7M!_fe@2"Pa7bT= A'WcSnlE 5. 
M/rtqYS#&Gk; PC9:Q"y{)d]H;Fs^r,Gx`\aw[؝kv'o|G)W ;7Z¦N skЖ}1bZ?җ zF}FoR7ßI _&55ߊVC*mUZ(W@,eVw\`G+?zt )m[GX.I:QL@a!3]㳨\kV)  3߰7m(XX@|8u5)s"y >1R' UP+cCϰrlI~xHcX}EW٥I?v ;*ͱ T}ߗg|~O'?qb[ڱ\_ʷ䣿ɇ2 7HjD60aA/לiA`ŝ/⏞]jGfXT&*ڣ{FK- >Cߥcʙz6B=aO༆eau.Auȍ㹾 uU.ϵҞ:x7$}#`O`vF+͒ fh ސ1 x`mJx\">@YjMO-c&< êkc'X?,qM*iES9 0բU~^=Zs 3Y*pЅ>̑-|Q!Zu+j6((%Be'В `Jª 廸 EY"@N(6S4 4 - 5AUopy~H'69@$.Ϗ)W3֔Ȭ*ҼsE:*d -Lӂ$)-SU]2ԗpuڭZK@9<:YEJޥZ 7IV qKV *1Gcd)0 9ig-S~ZK;*y;;7[D6L \n3fd.)rWi'JƚYEX`t-V1f0~*Yib 7.|6jq;Q'Zw]9Ƴ8p Vz͵ 3f[eӲj14 㻂:q-209+Sk ʂJSVP)Ht+mX 4sspe ԕRS*Oϴ\OI<7=^uc_V!iifkqx:"pʃe) unv2c{ endstream endobj 776 0 obj << /Type /ObjStm /N 100 /First 873 /Length 1652 /Filter /FlateDecode >> stream xڽX]o[7 }II耢mA08mc4iC;شC"\7Ē}ac9}a#dY'ub%&_H8Z+ua辌$BQ4i9!}UQ|)GYS64Sj u >(}bқ=&v_%[3 jx\ /xNyňӷdxzI9n2W W?'u8#J/u`x"o$#zykoVYfby2.7*`͈cXkEiQ1F"?pn2z_^>OՏv><Njp6;^Gg|Za6?><oq>=;[` dj'jCfČ~+f:|~\-Vx0)lDbQk\biWSh͐ȿx`㼩`~Q*5M2Isc/$ln5^pUd iqr*^L+u+͘-*ܘWF1B{T-Lݬz[&tcPL*Q!>==g 30)oV0lM>wB|3+qb<`L6\n7^MR\ n{3G&'2:f\ [1.ąM0ĢĦ(N}NPk>}q5y׽dVeݮ{., /m>r'Ġ2Yx~?0Ż,Q~5/aZ6S?n]~L b*/zlwN,/ۇEy6*;;(/U|mf5*igۍeTϽwc>V܍}-,QqbKB[Y1N|>`=> ; N(G_Iس/$qP0HSw$ n8n:|+!}%Ͽ_k?/UEF,=| I*W endstream endobj 860 0 obj << /Length 3728 /Filter /FlateDecode >> stream xn_1Zh(v&@{@f(YA=ul <{꺻j-}usc/V˅r= }Zc"RZ/Ί/ˋN-2i|xDuVﱪk"iUҵ_*'݇Һt#/`HNZY]ò'/~::r޿9y{|zξ?|Ə4#7}}r kƩj=u,ly8݇'oN$=72fw累s^#;UNknSuUeU._5ҕYr=f v$-/;MիyV^qQ;NxBsH9])==` ҆s+"|Mb"8 H-%&iΓbgYZ"t.oqkˀfp +qq/URJh+EBXee[?FyNt#7\XhE~Z֨"vyD) ]_H@9 <6p@Y iO*@>+n]g&Op1vRZI6<uekl}C( c%AJ12UEX\&~&,XXўt5F6;#_hTMe7ٰVX^˼]^v9×|amZT[ 5-i&z-:+tNKT[i5 74V%ZF&dm"cf? y0!C$@?_BC62 4{t-{<D Ԣbj):?y)eZ<ؽ_32#i ^X01h-Ҋ$)}=#J!L$ejV]FhThq|n%ʒ*@zk'dCpGpNpK`XCyX e*o<\6hbX8! 
TXPjھ/S*WTTJ4>#<[:o?zD/_`j!R򍣎pBoX T$isD2S/GG,P&T2 ӅŸFN>q)ôtʎ]v:g ,݂j'PYP22L+A'qA?ZP j,_7 O LPm@B#"F;.:%[@ s3W1"|r P"ۓ]*iײN"=>,7I F;ͳz>1`<*ۇUvVEps3]]3hGW}  J'HRaH !@oH1%A gkn*lH9ghjVFu4"jӚ_8΅ލB'wa zI9C2rV$}OY\ (IG'ylf= *y+^@Fl&߃>A`f$&C W/!ޖl2rpt]NwiC_ZV#Z ^ nEn",Bڨ5a%uTUF=2tQI$t OwuRIb c3eǚi'Ow$,*zv~oZ3wk1xlŬ/y SUv!kn(6U[\u}¨Zdhp2\UE1Lv{w.Pƌ &`>&TY.|dˢqF?_`@}e3$TT>9: 4XwI!(ǾBm{4,ҕ4HVϱ9ۀ]eFdoJ&aHKɬgee.bTlڤ\yF2=}إ5[93֠ [ZW }?8a I+D &ڊ6՗U]QdFV)2P0  ûEh Z C}"%1Oij4N0 C}sՌ#):ɬd",vĤ&V4֜⁃OjLO h$ѡ0sRٛ2@01ج"e(G;@^m904 x;ɪm6JSs+pSYo #iJ'Vˆa<ǤG<j0YHlĻlcG:z1} /+zMR]0h.vNB`o vٳ-T SٯvtRW`=h54jzgׯeuĭ,8|_K2E(;0}i0b:b|ϭ$t.,_sOsߴi͒Ţz|-ݑ~kgO4٪N\=(='1 DQoy(W\%7}lr5\fSy-4XefMpDW֠y0EW&  Qo("G!l[4h ?z!ָy''B b'c; |ɮ7y_b~L}r[T@/6=۝Lx͜,?PmVvw6B&u{[G./AқvV%,ωդOؽȅ`Ѧ XlUF '+^ZRÐPO1jfj5] AXh@^hLKGEUȆuȦ;;BV[靖?91`O_x̣LuL)&+~hF> stream xtT6Z/݁ahO[%힎 6 .!8D #{Ϳt'f3fgժO]Ut DR(+(x+IGHGʊ#Eʂin0B78\;(XY T 8ey`Yn<" Yv43Lꌐ8b3\.HB "1e8䙁8 V+rg܁ dsY"7bdbH8%B%G@;T"jr#\jX"$p ۰WDEK>*(9@s"(n+lx#AO}mp"1Ytnp z !LB.;8G)4B=\)5 l+ig dErc]]Tp韁$ n!R " 3g!T.}7 niC^B0ya(B6 D?D]* V G!Jpt_Ûn̴Y6"AX:Iډ)v"ņB,D0Y&"Hd4E!$2D="Ldŕҙ'L-7SObi$#wfAzGO&hqb)tG;@y.IHTf7eֆN4o&@ZA݈.dTC,3zjC3^*TrW]нrD.tck0FT0~T6l*̭qKkq1հDodp1^67A@X>J1~N;KwŲЙ MX7d+VV^,V$ *a'3d4oʹ~ <]01MnqO3tvÛL  nD&"MOѻ4 BSj.pCo oɋfrӐ[y9:s2NoFFgpzpӃ\F =0GgKNfz2ߡ-(4~DFECۄejhe^6^rUSֈCߣ} W:M |8wRߤ]jh_Zx `V̲ 2ECԇlb:Ht2ˁӖ2 i@: `d?f\QJ10]h.yG a6v7 6>fWenKX6p̓gws[)V"Պ4?nEl||{:llvv Dq}ZAr66VÍD{[Ht[A!Jbz *'TMvhW wHׁܿ?ߺ6{YC9%c Z}V?ŏk Z@LSzR44f0+W81A>:oʓ[vកLG2õh[.[~7É{NGK^n{b. 
V.骂  hwJ!QJۚ Gu(P6SuL:L{|$]|Rg#nhӁ p쏖BS)*Dy&Tl'ZqkV"׌Vm=8ֶu򏖿lBNk؎]ǃvp;4Q`pr;em S>4Z 4:"[TJ$ahbjX0$YHQ~,(B&CY0 hȎ(n"[Iddm邝fEJEG ~Ƈ` 2́G\X[e.֫dbm0#H}~HGRPqT6aQ]J0x2tA{.?V2 fwDh_Ǐͤp1\~D\3aT߄HV༻A໮LGv!h~/ ჱԶk`;x~"ߟe&/D̾\/p Ep_ˀ ҽ86/L´d:%cJ1҅|31cPz10VDRJd2f{6"iHE pdx`m@ctAf9|Y62F°3~ M43Iqu%dXyz;8qv])*dal:Bа~ _&z^Z7 )p=/E\8fkvS[_>7"tnM!~[;ouȒǁ>u}/못 6q؆G^-%U?^6ou!$6E9ciu> C!取'CϝNǏaνiG$[}pXΏتDpbgB}4e<{!ΌdDI#Ŏp-g>,A۳o+6HMɇ!%%sV.ZfNL`cgqSdMv$aNOX=DX60ofQ5+%Rl<_҉<@LQ @b maD̓/HوZR!8' d= 4ezFa}9,GG>GG;.#3izdSu 5=NWH89Í靅x(K-ѡ!H0lL gDj+ LtDKH5 /e@/DŠSl"p7b_pĵC^h3W +,~0_1Asgy 7 aR#WY9h:'t 'Fޟ/f%RPQK%zBX 9ՁNjFT/Bb1Dyя?8)&Ћ>\' ?Q!IIPPi^L,LfF ` ϝt xWP)LLօH }` ȴ nD5T[J Xi6pd#èH.vbЏBBQ):"݄znBJJsn 2WfG\K _ D0f veg[B` L,b zy#5W 14``DVf TмЏ,HTC5? 6A h4@zQ RtP|8`gqⶽiKJ_Y6DH &'CN)ih$)ɩVӨݾ77UOm\YӼη{{nC8xEyAHȳa(L]fOf6ăQ&3E˦k}4)<;z}aF9=6K;udR噽?s&.Cc=jA?^)JEBHЕExBN- &wxqsH*de[ W&bfƼ$ YFpmৃб^-(`Y(3 PTd9`4t/M1?yB;iG-@+|ׁ Pk`,=g b ~Rtd/C,БR(-2S3Jĕa~iF=50&'ӈTk J؄;Yœ0 /d{p=XD*{ŕ!<t1Z8+_hwJem=*}6"]M(l>A-Ѓ&b֛i 2<)`Ah~Ơ*_]d# ل$4W <{@]G@M>\F<\ h7!$ЁCI T$ M<Ha(1QIW S9DOYft#MGfd,~h0 4{sd#||4s|R3H ƕ6CA:ɢldqG,2A'Ɇ2@dR b 2tX&J2H&I }y8!N 6DY 9r {8!IB䙽$Hqh7!k%z:!v"Z9Aԡ6 q"]Ct@p@@Qhm!YNH5 AخH=\FE6{=\u*?s?Uױ`:B.BU0[B\SYn,D=ncT9/ @g%@^(9 /"e<` ASh1!RȍdyWK<Sk5h5 l]˯q ; h Aɼ/^&rRD=Ony28&$Ց`ʵ1u~Em $M/+CDLL 愀ꦠwH@=!QД ,B|*IO%D,wzl D})D~' 7-Y d./ D%!^4! $uPT`0?IFW}^sUeѶ̈́-]׎xyh[߀^MЃYFD,#!|:$@*d!m :~BT/Vڪ.0T7]9po6[4!'QҠց,o?mr(i-=, yؑxG BJt ;x P0G.] :Le hw~j2rӐŵuF +/:j4x*\.y(3J1j::>7`[-8u2uu8!ϊUe"ψhc m|Ƴ8`+N#Ylc[Lg S[mrv\YaL˵q-_myƖ+ZΏ|ڰ ~T[BȈFBd6B@EU*u1"(A$H0T5%Bz2&N4MP,*n_OEUStJKzcr*Xz258`?44LN,-!Y,&\N}'=͇ Oug-|E^:YwɎ8AtuueӶkOJOXfOٻ//g\_} _2{1=[a_X Pz$6% P(Yӊ-ž{fez);+jW[?ؓ{e_>UbgNK䅹-( (t.XDh0,t7Wcmb):r`DLq}閎6^kRBHMrGiI% q`-3y`ɗDQ`63uB0)Bd٠ R %g?X2S鋥Tb {/;Ƨ,trU Ėnl]Æ>Klf;gr/')Lҥno[6_x5FzHdƏ@EB%ST!}9jg,l* #9rr<ƔPCzS}7p~YF~"s/y l`˄H0+cKDq ETyRAC[d4sI`"II c"I5:D@iVnWS`dZvC'Hpis)!eOJ#'}R4DZrS>j QGXEMWBwe ͡%@`u8:?u`\3P?P,$.)矺-ϗ :2z qRh=S|9&t9_NϕNu0m .gJ` . 
¬l(Xڀ$`#rIV+!"/8}L=mkGnD=-@p<7>PđN8PE^E|E]nsO# Ӄ m7JtW`1g\@B|vf<J,ptBlB$%'w`8G IY/S) h ؁h-ACRQ ␳nL0E󬁘!R[LO=U>0|k$1uǡ/c6oX=QM_1<⟽dv>_?Db7F/(NI B(0E]h'|"0%|-9E+wL_u?Oy'D'Jc((XǾX7EF0' n ?*+שTd,l4KqQhW %<A]d)BGGT  W!ʵOMQA>bе5w ~}!To(Le93cKDT>t f)#$p+= Og-~V=v)ˏ0" S4[Fciq~(>.AM1T' *(p61>"deWIVf_JZqf\ u8Ʈ@DEGO#11"9YEBbc֜0 g+"S]s᝔%0i%N~SDg,GT0qc#*h1l_;47|00Jp'_~b0'g7f8?zytPνc_% 9!5.&BYaDqBC'N8}r&k{#Cם﫚⍦u7Fی[>XtY G>t/|{.5jWͽ5͉ơ>ɚkoz4-Tsu'GҴAM/D㱛挱3CyQ4qe{jnwiN z' ې?RӢ7ݼ|-G;?GEqO^{њ3:8Kd.p{ܦ3((2}[G~y;(k-at=;Qx3k}~L+4O+w\?~W.y^`'AO5uyk5356Iؒiʫp˫ޝj)vNVy]]3?Pӭ~+7[yA}G.ZsN_PY4 HLuXmܖ3 w{ zS6:zo׽{Z^2xݡ .=禳w֝CR[}6}8uN>zIyɆZv2bQS[V0xG}q5q>Qkh9/.;HVNf7]6^; 5,k&$ b룄a뎎Xw=vಽ㶞7"D:tgdCfuKM7Y:e>|к` Xskfpg(!#VN7idI|x3wwս|䲳ԅ2loRzdot9G£}ٽIU\{^]fHCJQjp?^Vy/FOECf8xuC틭u/8[nJiyM+_^w gw|jn N| KݹfnܝOql$e8U bʇ]U 恞b'/ۻUͯo}) ~V@"gPz<}z7rt=SIO8xe̓pb(1"g?n]KlfՇؓ&7[߀ ¦?=)2)\qӇ[~DKl-/~~mYQG7?hNaB'4Ǯ]8$̐'~3>qK|ǖEΤ2z 9w^zhÃov)rÔ=W^~_yɣ?6|ZH{N|զWn=>Swwՠ?:hoeŒdy8={Ɔuf7^QP`37Y-۷VU~/[$hYvsMu&J3m{1{c^ʬa˶J}ۯ,ܜb,do7x+尹 DNO"VؕA[쬊`N=ހYK5^jta9Zdr=r'⦓(Ww'r=%6T`\:7 ein=uϪ'_`0"S<9|h%oKSNq At} MBqGu>0L}xtOvTi4U>*+zJ8&cC(,9KwJ8}ן}Ŵ (V7<{ʵcX-=~KL󖖊52'OM_tkR>Uz'r {2w w?/6LaT#vK+)eˍj'߹hӽg8Usũ$鳏~q1 Kr ߔ]^]w-U.z0u;s=T($';,AK^n~۷V_hl:v_=ki~=152tǵ絯߲m@*dk n.t>`\~\K+X~ v6tC+i@̐5*} 5v\{Z_kz mAYսk4o(yup4+nϥ9)r`ʰYX4(95WyzBc4[e "p'MYRX+*>W|ǰ2Gь.weYµ'2'-؍Cp䝤gS4} C NHFo8s#N Ɩ2S's]~?cU? 'ض:g^ayERdygn2gC,H7oДLdWPM_3(IF#8wYsBmбBiѸn5{&2 k^}{QIijik*_Qs{ %\~ղ ⲇ~|햳*.=~3#{@sںC{:}֨7ϻEKí|OLIf^3__y3voګͯ>yjː O}%Cɝe_o=u}കiI}W/>ja@$v}|>~"ՀAg\3hT$.!W%cf2V4s|_iԜҫϚ~n諊'"wJOok$xo1m|gIO1^KbKeXԞ=Źc MtU`+fXwL\IQɲ*v_yd=(7*r_:s*{ILQ"X*K}㧛YBiQp4^}EQ4Йߦ <{bie6Y~>S_/EI(}i0Ûfl[z^O;[Q͇o=w7c3F/1s1:6le37,æ.w&VDȄui|G/6M×ﶏYxG4 D2kN,'r톃ga,Z!mBy*/l]Ճ3v1S@ta?ɘu⾫ы knaPdđbǠ' bt5dҳMACx= k.=~mW+T2{Ӕik\GDPB=\zUul>ē4ZԍcRAI/-U몟n~oK},? 
)ns7#҉/Vs_q6ȃɫ,ѱ/7g +IM'@ \KXbEpѱd?ݗm$:8O7PcaA=:,w8\@q⹉=6evPԻ&ÌƉV/*_~/_; /ZL5Kʯ߯dЀ]zP1,_{_ev0ȓC/li!w슚ӗ揘҃ƷLv_[W'&kS̢!L[*?xr.lg4y[W-wޗ+Tko@YN_j?7~d|] O_(:-5}~ョ78y/8+[sS"c:ɇL=v7յWӲ?7ٶľ?uWp mc^|;o>O0\xשW\|*MVmltQ֞>A^65ipG4 Sstͷ|1hLO1w˹;on{i颣?M[z7o_5ߟjw(7ic+35[9g]_}'N ا? YĊb;Zƒ~9qs:cQ+SE7oO7 w#"+N9~fl=u>vba4Iه 9`Eۯ>[x{Spjj~ʳEAE[mUvjާ Wl:V :gڸ`@5EqcW4"lsEȒ5z!|_Azg%\7nEq@H3  xZc/vf|l;ɋgxKOyV/iS_MDp 6rՖnɒwYXd cgR3WT7>;t+#7" ԁwif iEhUqœq+NXv&`gj+-oՅ #o"S˷CwQQq2=(O?ǒAեxic*au۷b!at6ҭ@Y`ignbݍw_i|wI˽-{mœ>u6$ [ǫk|_~HKUܽ O~]l;rIbN:o͉_jL͟M?sۥ?\U<&V]<ȭoٰmO]E1D:b͹]y%'YjG`ŒoGX"4s4} !;Ybggl;vg򓯾GTջ/}'ۇ(>qZVRØJ_Ѵ_:rFQ᧠>r~ehEv@ ?I܅gf [T7d:pͪʪī2:f&S &>d)k  dE /4S+H, 'A5ȆXqIV2'"ᙒN\-QuNjlP<(Y{S|]򃜙kzRd]XY+#Lxd}.ޓfU?sA(3=:3fiEuFr7IeZ}6w4c._u<:I5}f<)I|#X*KrP;nt ɲ=xs=px㉇6]yw cVzw on9AUfG-rm->4<ܭG! Vzb@5RӍ%/r'kϧ7i 1[Ѫ_ʆ[[n~]C/7'+5CnW-~ ʶV_S~~Rxw?u7n}rν/~}?޵,x/5 ]­w^l>s͙4pdËK.oZjo>waHU7W6|}"kC ^7eftB~(f b&m??Hd 0SeuGj#S9[&x'MKV (҆~c’X|=מGϋO5~mC^$Ã>8i)J尹kk,hzzk1l-7MfY=)6賲/Ve$KGMvYtF7,=̭‰A4eiШl $8un~d}a$[-W5ҏ]5Ntg儭g"P0vӉur'uqs#C7O^F#?|Sfg^ i?g?9͟kO-P)RX%)IڢyVDW1k5~ͱ0U?e-ϔՋt6%'8y#v~E1W/Whi=QW]S,S*JT}ȓ2 "E!b oҌ"$7V7[4BGn۰ŧo|CM=N Y?q; б;XbX@?FҢXWgQ˅zup@ ~ ̷8C5/Cҏ@aן9C3bQ O6~7i^ʬO?qU݋^b5>LI\sͳ9TكYa?eo6]aE]j0YQʊoV?CUVM۱S ;vbN9oz/[}>q_AhK={{?e[q,rT0O[rgk{.ؑ"c"Lق$#6|c,?I(´n8qƁ`1%9ؕʻ>uí3'-ialm$eOy^}?_[.=pŽTj9x۽7xyn\^;ڴaWïO|< .3e [x“rKL߾\t@g)YM_m9}ӹp_Um6E+M%>?JYl+(JBtQV@s/\{"88]t̝v]`Gopy?/4qj Zw{W;Ϗ/ywΝ;syϙ;o~LkծĪn/#κܱth%+ϸZ5{#of;<>d%83~xzKuͩ=rS }`ğq=/Gwף患޾+}+_y߽n!`s:|5eArȗM x$.,з5t/>緾}/_yg?B3 7jw#i {.B(Cef /ǝ+.qN۲t˦%kB'k~w0fFC&u),A_nj($ۺ uZhqەqiDK`K`zƶ$qx; ,Ӷ-et<'ZК,${:VۇFk|Ȼ*NߞXȶfhZ G"#R˳炍H_鎕26ҵ-n Hr&\(uK ][3Y5t,_(FIָk$lR;ru ]d*:`ԛXZ7u.[܏ Zu.}gQ窅Lkͣ{Qdo??+庫$t&Ok~Ï/8lsξB[Xm0yTO1w m5Dw\`QT{й-ξXu sJ/iEpdOHu`˜\*mEMpyCh 1e=%d#ٞ<`}pg4Wb X;KQ5XH-0hDuA͑wT8ErlVwo,3*-:3tR{40Ğ2>2O Ty3}jPb=i˳܉k2!Tr [m%|ZgG4Go/";3Ŋ2Ə.J=T+֥Rr>ޮ\tD1SȆRGe ї>x ؄ 9O b #\\eX`E]ʅ b΋l}~uE쪰[i9bSG)GԾ|p*p DQS9tWaCU]aJ&SQ>8SAr&ׇxZLړx< j)Z,QAXрi}=v郯g<|]N" S!{Lc>zJ=#hnZM#;*8_7k*X%-[KULW7-+se**40/-c IœC󕹇٨ ٢\e,ڞ̇zO!a}KbܻÝvRق02~H=@ 1'n*og 
Smj.gҝ[Zu߹?}kz~w\+_?s\7^[> ݭ^7Ͼ}ů_wn-/}~֛z'^׮{|>{?o|}_;?GWW1d*O$"dB[3B9tͳvRWlec.Z sx.w%-VL͆F6ZmJ+cB! Lq ]6˓(gyCrɷ=&)B<2b!M' d|V>Ñ a !o,?(]Ws]ToB2bEe/5liB%/zQB6!=;zEN B[J`1ۡLUυsI.Me6D=Z66*o~} /8 <r0@"'<&jOXK 'bL%h gs.eȍ+}"rdl D)\Ji(HtV|/N2I gۓne4CO>ݱ/4UI҈9YiY-De:o'_5_>-|+G_,-K(60T*²(.r'“Ӆ{3n6״0 %\C&0V߾=B["Guɽc\k .cB5?́|_9hX苗GTV 殥kw:˹(vgNث)r(YU)Mq@d&G[}o%!v$hb.PKb@! pD|LXc| R!+L/&k0$M*ӳSB= L3i<14J|@2' HUK!M,$) *(x!\Efwei)RY>>54gc pB!/v#yHFKu{#_ϯ7+zX;xUt{Tk/o;s6 ^!fETFQwk'fNӷb~˓&t|3cҺ%GIS_|NRME)ޛf/U|=NJwM4)W)#ZB]'.ҧ6t3̟ ZM|aMIo.C}W554JcTL{b4ggMN'g :D$6* GSda͝dz9NX3]Dr*SMޅBf C9AF*L:ߌVi DQh5MaR3\?|CNvϵR_B[@k4&٤Ԃ..QQl2~yhh%(+sc}q{{&d`_A7Bbr,6 _1I/lrV# :N']Y0 F 0ͷQ)t[z>2#(<~%\yO e\LV><=!eC%uV#R20ugx%I ?&3$l"oQT=X1{B55#&$샹R2ߞ$仛tI2E<#9k6 ɺ xE!~#ijyNZ3𕻉I>6xybYdk,N+$yU!PR& Gog%p9(x6 廋EX> .U_4S*Ws!M2ݷM|uu5Z_B:W`P)2Z1"*g/Kml+Ш)dYNFB]DCm n}Z%yr{USUlr)jܦaψbS0b]#=i5V3UȃfYR, erf1?h%B4`1rVۇ^3ydlYpw(ӝ(fzJ0|t-Lq e+JpL~%G$)<_/kҽF+Ҁ恇V$7dFL/9VS5 ZI N/<·"1B*'_C$Blfj|@kfOrJOw&UC|pIڱBeB>y$sADyb9 d$9Y%^r!WGXAA Tj# *ˀFU'4 ֟PZM9ivufNګQ>Td隼^+m]r7 R%S@zELSmx{H z7|(&gsd5Yl#r(ZxZhCmC=s9=#|T'a0aRZhE)3D)JC3*>&gv" c*oJc$!wF6d#3XfoDJ=숼8s ]%Q@+Ƹ['YgBEJ<8J{jCJO%9D$rp %َa2 =0eFfi3=;9!>crf<%|SuzPsWh'3? o)cɯOv-'7XԳϬwKL,Gb I4R'ۺޛZ wwOLkJRhLqώ+E302 (ar:+1t>3J(G[?i)h׷GyБsKΥSȠ+:Zb ӻx}8ӹȕɣ+d(H)~R ?KQb[RLN80fHaV_]p(:r0Y9V-o(]0K1'Jq.IJvT\&!sDx0{lqɖoCeL%|WW;hك 1΄Z$HU߿[Y+Oo>e0"8w KGJ)Vދ!I`kEs˒Cr*@vR%CNٗ߹REueŽIDG9H?\RM>MpA|@82y|~IQ,C"W" 䚌5Y˧Şӝm%g'Ӛe*7E DB+%2ٖ^"0D("wwo0kMNa ob'kߟEC?lE\d)Ϲĉ C#)2#S19P 6PZ̔?'pU,N~ 6r,q98GGI.NOg27˕9t*fCj&\xf`)Zg:KĘi!Y 9w*[y3}B{2ǕvW|"'qk3cQ[j{oS(ɤ7p?իP3T4;@d?t}(S\BV5ʠ p^[2ӅڝQ e'MTȂI<[4ɷFq2m|J.櫦N?9 Ol>5Wp "NQI>gI+OA3-=T.j>2. 5R]g_,pxp1>Ա<ӱ"Ӿb}mw7w J|%WeF+kL-FE> 0]%BHaLP0MSASۙÆ X߇&achzǰ>TY"P|#j):(p"9Bgz}=N ]*쫯k9P ~+ޞae,ӄC!Scbo%4t@U^G=9frd2HTw5T` Q" @`:I b9sϵUT_?Xf:p.$TNⲖF528#R]w BP<#O eK9h6E+?!5G*xUn*ɚ6ShE#i <JRhkͥxQsC pH.``>kp5B6\DҾ:~c X2BzsK u_AAUR֓,:P3B--Up*gP)T4~hWKER+‹< 7Ъ7Tpʛ,"H+HX Ķ2WƵO&Pjm+PUU#UhN  }T ;: Έm9bo .*$>շUJwg wGVPӹݵݱKT;WaQ#RIY(e냛% XgGpj"uR_H'DM)tE+QUH! 
jGtW!&*žEbo)@/rfHShU}`ޞ2_d{;TꄜTx* =r}7X\C쬒BIRg E`lnB!~+4"I4x<=Q"jлa}j)Y$e=5h; 7B 4R"kEV4= OWhw7ڣU" JwfVS>=6ܻƎ";!')9vsrݦGZqp-!W?4ZlGƎ6Sp =5V##GWӿl#6.Y}ت:zcViŪc@˖dQCc9G4zSB)A?U{۷QG߿tctSZx,fJJ̗N'ė9luX1bc7:6`{b|+'V~禶%.V"$K͹Buɖޕ'8> ^ڔX~||I˷u6E6ɴ4T ߜX8T$}|xU0͡КRHyL^ݓzm߁zmW~mϮP]>]vy]s~;)_#ޟH RԶ :g7#/[yͽˏL9őKZlْc#+ƆV Z6rЊ#X!Y]k6lB:J}Y_(9B= +71uiEwH/8 Mh3,q]L`@ݵj#\ ckd'꣩0\Ir36V`X q+f/'NrnIp\a[z})Y}A&q.nf;9P:pWWӜI0v9u:yG298iw JNVr!^Rs+hMpmdNonO%`9):Lmh`ik(ZLL&GklX-5hhҧ idoF+qGvٓ;w#  RJ)T͵=,8QRFuQ2Yoi5:Vu&m0j댚Mh(X,plz ص#qԉUN͵wH<1P.OҲN-Y]hi 9U9sLQ,O*Siؾl=% Z$Ī8|)/n; 0)j8zXa<V[P5X(Z+1Zf`3 2Y5[ = 5UI*._Egs)P[U/Q&L`2e&d,B+J}Q H4#6u[Hz ƭ=|}2^LlK"}" ֮'_> tvN ?wwjԮ$~ifOv5{J%LDMu.\glq߬! kx-6b#5@zLѬ`P6;{f¹b|\W 1QD2 "9ʹ5!Bu>^Vova4ꢚUƆ VVsj䗨sϑ"Iil…Yo "g45S`|z)vS+*FN`ЙIY(dhKi5[ E:+ x1(%WHBx5B[Ab]x)B1P$@J6?^QS^LglQlYsgӣ:TR\1Xn?=Z!]X U֬9lS<׮8}hߙ}as s/ܴDif-@Ye2]iQԵpިՠ($ ҵhk*vHۃ!L@Xd ҃D-81Eo-2e6Nd$8E=mNѡo10C! ҅gey@( 9pV֏Cp)>nLS[ƨidsx7W)owr*)_{!# G:4Ug邺U-G_yUU]ǘxh9F!KJ(U_SuE02>\uB%l bmr9JϘ‹ҷ[V[(kjJNXY6bfJFWX GP ( 0FHQŔAG<&R MJ)v ER+`hG;OPo%q`/_vFڗ rgUP uwĝ/}`}ejbC2HݷwѦ\)8^;4{'"%-Hp;;n fF'(/JN΁'Hٹ+P*Y)yÏWy7~Ox3j4'+BL &YdL754C++16iw.JGZehSqf k5xφFÍ4) $K1l]m(&Hdxw!4&FVʆPl 1(E$ /4OI4YA t9T ]1Q٣Yb?G|$PZTϹl-k?|Kܸ$v>𳗮8N}A^hfE+0 gUC$J e M "/ 1 HB9=X߀ j||~< ຶ9fIӢu96b2S`&i O3!Ս͍MNc$NdIf$MFdJ(3C[9ւgŁg :ssa8TRդeD"=-6c꺶n8*=ǵ.}7dI҃nfʵ8ri+ז.l0f-gxTfdFiM !f IcJ ;l'%J݉fglii(-8y )!a?q".C,P+BvK5PhdoA.b'$mQNA"5ró(dCjl[D֞\" |P*烽k7]> Ϙ-:fiiZT[mMu9sNVIEl2zo8MU>ڳ]m%Mbbnd@r۶#~'paj }2_n#c}{Hb\5VT|_>3̱uv,u,?ui \k(aE<.TƶAlvL{؝mw=۶λ_rSO=n5b3H K3sUk;\Nm0e1.atv~ZuFj$۝=FwxJ1*!"ƞHLӢ\H(im9z=:C,9N@u(OG($* ^w:EiQ2Z8 AMO$@W"E"kOZQ]qzΜwzEJm}9n[ߓm/|Ow)%sFˡ)2A0('>#GPHP;T)V&.9a/ks(ģ:P3MZxkËj]:׈GƻP7w:|w{{sv gKv۬bSV=&#ڮ1x9PNbhn20t546CoX}a Z_'Ub0+iơ& J}IHJnj8[9)iy]{pȘވV+C< a3F&25 -jo4ȿ4'3ZnN䏜"x })B/Cn&}W5՛RcBO.Bwߝu`tf˧)ZhG$;+R_x;v_g>;sdپ{l3u;'|~y@Oۛa^Զ;1wٳoN~ӎ$/رo߁gQBy0w?';˘Hyt&2bےMWpi< I_z׿wm| O> b(#oϒyg#\SS m! 
hV7d9bo- 5ǩ,u.rRLSt_kXt.\UtIXQ7߹KW9w}//ݻ}g쌦>啵M @kM#KO;'~4o0` I# ?y"bXh"=GD(9YX-lۃݸXTUH\뗍a m Яkoh.85]<2dx?x{}xg~wMw~yzAp%fNGjYa0)I^4"A\ /Q6V[Tm8+u|] xh%Zla pUf2ME0'rNvEUWҼ`ɨ/p߂j҈AC}Aξ@"E+{Zx &hH ny Y=UHFkDE7s(݈Yc]AL~JˣdDnG_&u7m@V[A!&? Aa Zof-~Z#MsSkҸc:Gb0l[-%E*}vz2/Ľaj6qϼ5rScvAJeOeHR!xS Ȏv)he@,n(M3MNMYFSZPQ;'w-?Vo>T[[oki'41YAZ4+zG{9߆Vd&:+뢫k1?4@6E3}c '߅ H N2;X UOSo0ľ=E>E5M-Ft|Q1mxd+K9uGehix[}!EQ&#xʼn._S[g]pܢ8 -:^ C( QZ.aqB xCZ٠4̔ !ف[F v(g(C,2T.(ː-CL:-@lݩ\e3}l}kO1@+hfMjKJ=pyO=r]MS>8oY>z~ן-,Zc@kxӓL<ij}}Hth ]輨)acs8'.[^][>v'A@3kNY,|pPw<V/?=`6HvP jpYWSEdwxVT&-W17h4ds} Uv|߬o(6Kζū VyS.3: .:WnD}Vfʤ:ozY+[`]?iW)*:+fM9Q)&cB_w`>Zkl,fr;{`hH$~Ǘ5oQHC5HܢeСB"66^C!M?c7 [mD AEAnۃx )"]{1pTcSrYg= />ܲPyW-VYjў@O<~p <ʓWB uz2<VHo:uuM=EX :3`M-h,;[j+2BiZ)7yԉ 89vM %V_ >G=i5?VCG>RGϙ߻*ChY\)xߗDس_ڳ{Ͼzۭ_/72:'!b±_opzn`hǮ/>CkO?Qi/ i覌4ۻ{%Hw2oDӗOX?KfNx֦bkKUֺ/;ߏ\-'ݠ>\#| A  K^?D;~w?_E+dđTYB16|tek2 ήsJPϖf^W-tWX7CQbϧ;;ď~K.z[&Ϸ⾇me j5zb>A\)nI᪣o R x:l) cBXn/YO鍨t{|x#1n.7E|-Jh_$z(4f͹ȾnM=r[zS^^<9mIԋw\ט3'&Zz G1 c'#f@cNҫ; οPRMM7!eEZu?m;8Ԕi 50<̲,\X+HgnW,@|>Bc%(F- x+@z &hl%-o2ZQm!IthA+( HZ5x7*qFM3g55'ŐJr1URWr1vdaX ζ ~o<[pGhGf2̮2qћ~8-[Nd SwuuC7 H) ZGWu0^ ^!= IϨkhA))!G9 #֮h4hE' 9)ˎ/ʀVdT`iD,u'a5"E\w9]{"moPz er҉_wM@;?ڱg'mYMZ&+H7? 
uTVV⯶eCyPN\ޑbny|}{vnݦ4?Èy]<_|)޻;nE^nmӏ|_23y[},zw;wok֭[p?y8nwn;?,oᆱQOm;*#۝jС, =}kO:F56څN.,-@+?xk9me'wwLʟw_jޛYc$Kе?p,W[]ogyOesFyFoO<(Zoߺwz t]wS}]>Cj察5U/YR\Wbݖc?ckkM@`[\$K{`l{' tHB1{5M-]:sdI ޻:ϙ3g32Zcc ھW{*ZŗZyO\|h"euZMCcO}˗.{8?w{-bmVg@LP^@p^*q|{3S)5_>7,ew'HQwoF z78 x;>WnOIIS\ fKUã|,Vb~iIr[=?!~<ɡKҢ{#o?H5JV/f6ڣt4]f#N`lnJ^kR<4'0.V] -'bGICr H38 I^hE|?+3]4a~TWUNfF=i-_S]>uJqte;#SO.;vpfF @W$%Ƣg!1=M ;ۖZLT9sя&gg|'}i^N.+͘"=;N6J` pciy V$B /4h SW7]y.z:9Թ/IYG[L>P(H'JMIB~R4#$^ϋ&M]`{2SB٨oB|+/Q_Lgw%ܳ˿XY|\d J\E'rCC:yK-8҂CCƳ1U[5 Lϊ.Yf֌RIԑCGqT,MMI~o&Ҽk7̚yLNLy7^\ٳ+n|%i/% IS KHϻ_V81VY<h ;p9pj8<>/L8x#sV6/$qO<5l _q2)<>6@CB ~ iD}8qӦ@D!BB,z B@h!0RR1Uw~ A3?ҏ?`x|B g>oYSnG'h"0!lV`/1%94}Z1 }>t(L8{M76myEȤl҅y٫V>ګ/&}"xzg_b,,rYdi;"ၯ-ڢٜGGt߸L٬{>/x D\'XDS3lpǗ-I*3(?8< W&;{x̊: ':|ļ|dfc 8ru&,|}Xn˕A{R)Mi$o,?oQDD%xK֜هk_?g7qD}tDbG_4S_{MPm Oeesm?o\uحdP:izOEy)H>{Yfc/]|~khj,m6{@l pMV\ĖG:-vusf kGW~τ7?odzE1,gÏa@ }ɏ xA9}Fah !q?826y|+4GKO999 ׃Ei)EK 1s7WW?'_{Ͽ{_W>zKW[_O[p0"pCBCCsss S<3,鰬O|-{*9)lgqɻk/\4;+BEyG=g_͘1QQQ ?ܹs֬Yl6pMMͼy&Ndb8::ޗ_N:]ll4Yf,V ʖgΝCbc'd:x,+3_&b7(iqSZj%R 9y&ń@"٤ό a`tEO ɸPvAü>RwnGbS¨|oq뫔dVNNgyBP ~X$yu əPuK鬙%(Nߒ%|}}-Z'ϝ;7cƴ۷Μ9311~ƍŅ,$99`MrrrP ¹9sL>/ D Clv CHHC׀0P*y[YIΟ߮[鼼 Uה<&%%6,ŕJr.I*1D1,2ajUQ)! V0ԬT FDNu։Gy\5L(K=.ㅇO8i8W& 8Օ\.EΙ3 MKKٱcǔ)h貲%K]/~P.͛(!>|'ؚ}\#(-SOK7 .%Eo, *BJ<#ͅ6& &;.~r='n6QN2y)kr35mwu'>Mp.ܭ쌑J+K^=@BTԾ+8׮)O-oQ-iOn T!Q E]ihzݼyӹk7RntxqJ/)]є K]y/2ՆK&[ nGIݜ&O=VuTޘ$bNemnv_\ ApF<~ᄇ%O>ÓDxQ 9?3kW+o|j~!kpxQz0G&svW-.:t谘X024"b<;4' '?AiPq?Cc'?3|H}(` 'a=QQ⤢eτҟAٽ>X6۩Ew~`Nvי( Żd0<Zz5W >?y'N!;q(;?HJLe9"[8iʤ{v|U[|PxbB꤂ݻjav޽`܁*--EiiiG4iXPPW\ uPڵ ] BqçNm?_P`#YycMIX7yf >pjŶ%@apѳJ3e_.&M}jU, .23X0ȫcTRyO^^=+s,oSuT=k_}(l  V\=u<|X$D57Dkڴi_|Evv6XaÆ)SƢ<+QVZK!7^_mkϬJOܺ2__8n]RxuJv/4ZO =kW_͙v;|s̃<:4??C!{&<<pA2nT7!w ćѓ) IL YΘ.hāzyy 6lĈސQF'n4 ? rG\pBB٢``@YsPU^(=m$pϾFSR4}Vy罗^z 0aO? /0{ J"˗}Y@Dy I>Qoƛo&{뭷`ڥl3{1;%߸aE`2HO.<$/N[U$.rÃ~=uJ`>U룼zjQή^ϊ^o.hpL$KG?"qOR=+8%%bq)ʪgLUZR1obV. 
`v$Wo|흘Hq…1iVO$-$# f< f{)~,QV9N!&>Yo*$4|_L9]v]SJr=]ܿѭSP,?bW }ÿR &CAta422;Sڶu',/@(H(˯_~etTR8ٳ@'|> k ]͛ L~'PA(LjJ7[Pi[6"y+Ň ["e{L=X<^iqq/rn$WeYhYUR]:OQw[Y@]AGJ>^U8fq"@Y7l\n$p耒b?zz}-PӠ&>GHk>~ՅEx蘘5^^XhXX$tLU-[~AS&Y`Y\l2o`8}駳{O< @, Z-6o~ճRv#g3[EV"Y4*<419?`cǢ1c@g 2hL qj -"J ekA+RV94S&0U<{x.]pmaM!vJMMpՐ@D* B¼KL_rO^ 8\ P*p j$ħsŦgd_KHU-P?L &dVz*XEJ`̚w?T4SQ}z{íWugWt$s"Ȉp$Co6&MHΜ1s6ގ>`Ϙ1 b3::iT|h{J .y"*? B3ի3gdX;oqQA08QZ ̿O:ycEVN֛G FU{0o^ 4mv)oYyy+ 7UZ'Μ bWePw O|8Yƽz漫 6;ufs?\RorPp+6QԞ]VV&|vnh9VFuG0eۦ=F3t.ڤj]hɺzt/8Gʎ ߪֵeGIfAξ瘝M^rb[{FJss<˕Jya~ GxMpԅϿ4sƼw (ܳ/BA9[Y &482fTE<( 0@s$s<3X"Y#aAQzibRhlDu'䆚wR' -#y"82!l|Gk<2=b姟| Ly'.6i׮ X,O9%Gk׮ 퇮~zJ*cq*=szwq*ҥozS͚=OB/^$ <^SOCC{/Y E+@P{A!b-rY6}QJJTՙ0Ԝ+q3 &cE=hB[Y=|> 4 'qe̿|}} <၏7@yo$G/2_ $@sh[y*IXZ餯Zv헌  {@(2uWprU* | gx011?fFЈ(0DNw} ye;kn)YqXo$ //\`א E]`ↄx"-ɱcFW94[)y.5Jxu,N+)zuu愉~ `wb~zf}v޴ORn+W&$ ~􉢜Ymf:&Ȍ\"HeujiE *̞.$9o>tu`7i)|?_thڽţ',~9B =*xH2+vo^y};~SS؜v[v?7o< 7$H gKbǍ  =|eʣN=x`d IḓI^@{ N!<=ިEdݜ(oDŽ!)KZrcuܸƌ>z(7n#F<oXb$!Q= =(ܧptJJ*0 W([vp8"׼IYydsyqDHJ-T.A"0/4tòEdV$̅הtHfH6눍I͜1ór%,E3h5TOa潢 Y@W <4uWRU=Ud/I}+pW(1+V(J~>٩#8xPo&ZPYT840W8 ^gd9RP=Qjj:5&&Ê{{(<f #8(,5%\|(4"Z ^A *~2G!8:ABB}eef$'{ziSS<}wp0m,#orf%v|衇FɈ#Ofx Zdzd$ٯCl*-B Abø3(#b̮ j'+$ISiAJpkyᅗ x)'SNDp^# sJ6҇W|]f `U\=/[1(Yh))z^dfNEAbV`Y /gdDރ$ 7^^ wIC ù\z<ڗ ` j(3u(^^^}4p A~W a+ ~N`鸥x0po=Y e,|-"*l*85*gx yƅOYd9C  P?Yho`$O|Q&d'9C46 lqJK` Wܗ%. EdHF<ʠ0:+'~ެ>Dzdz}ǎ cÕAfW fIb$6T;X\t@q61Dn݄dai4jo Zt`)2Ûc>3/%kd2(L܁vBa Xɨ/\!:oF!1BP Wh*W7oܔ D -&bζ"%M]xl D׋W^G?b8~|_T2HDvF|h :eC |7DedE"`!ٳѵK3(l܄CBtH4*];$0?*Z:!'-R2(b0"C7hLVGcat)0aH9ԟLC Q$"Gp%1|ijޜ3@;PF[=*MZ {/Rdp2!!C01`رcI" ”yBM)O&cƠ?j Ϙ>oKJJTq@9% y:0x X|"Ki>r`P }YxpulԨ)ʪ(ոdy] Ɏ@0VH(KYo ӟ\"AdWCNpQ@CYdxX f#yH1&lF4 0 M4$c2Af],Xv'cKpZ_! 
X N,Z~P~`Ѥ#'x2y64r8zpK|,2~8^xˆpA;ȔFF79n(}%Cg:d_JtUK7e qCQMȾvb9DZv͆ށx4uF @H 1oW Fd , =8񰘛P2 >qCeK3@"',xz4}0} mpdPA#|+.,[.~,;)1N!fąȓr Nxd9& CS#Lg$Ѧ)"048 AQ5x+ ^æ%РBXZ7~|pl- _ 5Q# ȸ{(3MLp7OL 7wcǎڟ3v<, r`A$w xdfNHlJOyxK}[҃eyC&C5ȌåCy%DKd>6""?e3QVlV`l, %P#0HAYӠ 'e#`dҐ!^;&5GW 3Є#7cB#C䖜 F»`o5`%$z{ Ń4{&ăH(6^dÇeiQlrjCQɓ<#NO~HJG`slqtx' R(C9Іl#B}VH+hF4#"#o; YYj'/:0=`~"165|^?Q2B4\8,02>JA+񸏢T1.g1zququb/40?4(އ"%d8a8qZPL !G2c3K  B26#pi2R1$cV&HING}x 3CBy#3.'3L`66G>35--qhO̒dAvC3RbB sF2 3 2b<,`N Y*Bٜ ht@<;=dY$/!O$Yb(6Ѩ83OΈ fFki1)b:3c, At?g&g#Azz:Hp:7V(&eʒ 1,"s(p~\A{t&zJa%^HC%8-6n#Q9϶;"b z'%zХ?aqYN>E6d15:<6ӳ:^ZGGWEF.Y ]\.'`+s$[k36 7> /%Ι%(E`v(*YQƗ \OW$ B aq"XǴ RԄ@[CV0p<`nу\r@ni ׁaiqGqc Es mL_0S r1q,%z%8 "8 I8}_iJ޼'BQA3lA`)9&_xl\!pѱX!SP'sy+` pYHl?+L=rGF$ bG&?=V[dyD#%]SJpOF%{d㣲s=5# #qjBH({r!~04Ae^'q :g<'4kY$!l$/m0;aHH8qj0 ,20ZX_7tpɱ_Ov;/w7I;M>}oy\}\&8 ]7ydD|/qcxR VXδ~ P +͍#Aˆ }xlP&CtAr@>1c&B'&/ (6'8*=\:yؤtA4&:_~'[XA":0߭ Pѿ;2YI.QQO}I<(_ q|,%EB+ALs$#J-LGҏ' nn"p2:&/'FHiO<ÎN #`M fe Q-W۵D$.DDz4#0~R@$oi/`S8HL* {lG, "RqO"?*L IEGCwJ%qi\P'eZ\L($KRP͎ D1jW,hh%6Ta$tD]7z?MTjҹ0Pk:FM3ҚkVUt===jW^o0TۅNAZ]d@'n6):UJ&=]j]lE5ѿfjC"tٺtmQOv=^77 zMcwȍNCrnRP{vA!cOMU&/i\bZwt:ZRtWiz-D2iA%ѭ#1btJTݣTkMfJducP@7uDn=SICv('swD ԓ-+s8rb݃16P '|gI|-woz'6=$N ,$sOܓEf7漧Xn=Д]ZۭZvڛ:zQ3:FҢ(FeK׮7Z^Qug/)n_'joIQnE mn*}kRpvCdagD2Zw$a/gwr矍WpQv|mhLFq=>Ch :zfgEgnC_E$ 9=s;TLNPaѺ&qg4Nz{x>rvQ.3;Pg'6S.3`D*sqFhQ#n;YJkSd&ɉZ*CI08nsGܰ.ghI1]\N{\- FJ_m䧇<+vݴ 0tY#eci4;ˌ4NWpn^"_9a.FW lFb]_~iw^K_=҉#5K\^&fiulnMCz(vifBiw.=nrNǝM'|HKow޸[w8Qd#Ifg#ft8-;%d'p.-n:dD':(BiN-!+?ӓ8d2,VƢ[aڷL栭6=y)Fy\d)db2k~}׌Eᾕ|S;!EFi'Eu}sH8UT6n^tKra:5*jH3n D` ГZ]&a6f#!e596Aζyc%[#[]NstH`w/0^2 DbJEOXt:ܲ4z`ۭzFgSn0Z-2XuVEJ?8jݔYmV4XFbm@^F7mzp3OђI{pvBLh,^%~zĚeF-,J4p'mK45F}eS:Ϣ !.DF!28cfJND8I%rN-ㄘj>9-mͮYN:;N5ZScsvl$# ? 
M }b"+ d9ɡzBrVCHjJ.ewƚҍ;7B Zn2 W$N,$, w>Q=SRy"m>#Mf8Vg8~In0k\7: ]oNV(ڛ u״JUBRZU- );::~~^rܸqCgзuv6*MmmJBmm-JE{V@̜ήQ)M:m+u6l;CWN֮Tw6꺺Z e0ׯ V< N̍ u.EV镪6Xuot*4-؄:QVK7ۚ;PvU'v(q5Aa*sUs2wK ~oBo-PQP+3:'P :;;;Z-Kˌ{JFHG$^jt ֈpoԽ6eWBݡ*!]4߮/Gn!7d2:%O8VWL_gdӔin/S;ęL~.O 7ڔUN,{F_4gQi5* Bri5z }']G:[Q3M(:|L=Ng;5@@Z pYZE6hPA(YF4X&n4TȠbjzt]z= @hu7 wVٜۢ^_`RTG"܀$~Ʒg NEsB:kE+et zcj雦3YbhKlFե'D(0.- !-7f'2 0,'\%kOjddOVC^ՑQArmf\6d$m-Ft1kSSܫ7qEuLJ/҄)[6-uMYS@[F;%&>3`H+Z A nKDwgWH funZn\zʻ7~ 9 r!>tܰnhsXn8VvfFNƸ:nVcsZv Hܔ'] . Y(!׭dCe lL &3(0y WGl@W-7jUC>( 0 *x7MccQ5døEHfyfss3(6 g3Z=d0:*2$% Skų*h`U`Zf[MHs7 w ]!k=z ߭L6߭~w>˟mrͭM ѻI㔕A+JRzwЪGC]{^YۨT P@-1~P : l4ikkC/\PW{P]p/I72F^txdouͲKj@ɭҥKn\&- x;rآEfp:::Pի€;xѱcGV~ԩ-muu.\q{[l+**BV9rPZZX{/[QWxxXԑ#-͍.]|FR@f>]9Yұ&?;+@+V jP[g{-F{nz7 w_3T36mdV6kebQk{txlt!Fˣoݸv}ɢ7o"kW.ZpߞŅEY?l੣ةE_ x{:t14rA!ĔBt@ طo_2 ;*/njjmʘoqEIq邉/\YU{ ,1cVqqq`` <*n'|؜W1:m{gɢW:.]xI EE׭[GOpcpZ^{X8_uW.NzMJ K١`ӦL=rpFZ: a#abtGISC~Ώ,o}#),Y^dSvxsԱr2RN?B0;n4vMŠ7XH1%Z55*ԝr&WWx>u|).]:~tqѴp7nEݻw) cMyմܻ',;Fa{iRZ:ZU6[h}Sۥ5(x^P#hmڿhR{ ֯X\4}ƭEEENXR).\pҒ aЃ늋fξ9c3bwϽ[c ض|~EW]3ƙE;wޱg8Zjkͧ8nm[~t9Z/w*3矷 lǢLaU(e[HNy`i(YC._;39~ĉy䑿/𵦦(p+_\\o>V3nS:sU6 ^K_zjƬ'OV>1=f45e5‘/,:u*b^]zʹ Μhko#n޸ƙyly<N{ꩧΞD޹Н :[Vwhﮱ}wy; K_/ ᄏ]DVxɥ%3߁ 8~_7^-;SO%hm?e < 7=b}ܙݮ]}Ͱ#+[*[{7\ш?&ʪ !eaÆ0z_yP &~~>A+­:=Z55}ֻ1Mɒŋѣw}… $@͛6n۷/@Mc n< [_thECm,:x\[s㋟.9\ HeK-|(N`NH2I ->܀~y=~{ʳ'^ó}]u/o5)s,n{>XfIgyA>}{]wճEw_u벟͹;m(ݾmOA+~u<#~h 4W_oonjjkAU?'xC\@H%r}i[q*%uK%>PwOPt8'[0~幊zQRxi`Ҭܙ6Q3=ZRYE_]^L"1s*xj.+uNR4MՑD.mKTTTcǎ={6 H$hPԞ+<Ij|-Q*-7‰t&D_x_$9`D:#jikr̩z6NKLB(=\Jm-Qeee@k(ܞLE6 eUk \Yh2OA4J&7_k,eS&6LEЭmaݠ)cQ* B/$(*6Kgy<ēK `J(?p>隷kq pD ws"xo߾.6)dDha", >c]+ -Gt q ~4WY& &u^ד2{'߻5U4PM0ꆁz JE-Cb7zCyr`]>5'! 
E'Us93/^rOwEܜ(+W~` 1Pn~Mnފ@ |:rA y970Hd-%29u(?7E*Le-s'|ʐhm۶B-[<17lEdM M74U {_w}?vB׷AJF6ou3"{]A.K/ ׳ǵy=]yE|o{ne35TC6 `g L  +a@<ē>E x_|ht֭dLdYx}@U89M#˄̘Vޏ¡`㧫VV8̙9_Ʀ]_rC|i{-G4ƲQSG ߶q*hŖ4xDB2pv@9Ah۴j W/ 8N2 FO<~$hK7 Y*GΊ( l:nу @n?~bPqp-d»n*lU҆T fLkskHCHELHE"Pv&*" 5$@lD9b[|@b8(~I(+@;3rd1Kx'Zxʶ0N9&.0 ZA1e1 Vl9: <ѳdѽƈض@)4]LS]ٔ[lqK;֝|+z/ɁG?x{ʧ/VE9=`6B"hO'SQ@+ݝ"8r=\O},_<z'xhcB+Lal1TO#.?ybPb3NN<ciUкwz^M{`Ԍ@*%>Z+PUx_kѳo;K|eٚ >lR&i}1B˽Hcɓ9\~/~ ܫǟ|JgEUoѽ&{{1@W_}t!'xՏHTEuY$ J2?j4hm͡bqSV" V=*,L,\x[\/ CO<l]Dlko3pǛ:JphIEwىosב:'#/yX$H6 p/DYzgx'|ojj"tߣWx≇V?2R+x4uMx8Ϝ,?utey 8pـhPoɒ'o5gyE  Yf93#uP%1P}K7ӧ9QzTShЁ#=mZ>mҤUyM=Rɚz@+Uתkxo]dI[[ DV@ޣO<1(p@XC7Y^t¦f 'w('l)Y*.)lqA 9v܈渡e§#1ؑ< g64jq 3O=):U8g@ Jt;td!\ף[^w/zpM9[^4MO'$.'xoGx׷l .2aT X6l(4 X6;/'Њ;va`\䂣iOa2Y6lCD(\V(a91I\FxIPE9j2/H(s5 "rȲ2K 14Ki& 2d9*#0\˦S ̞G.#|zzO~8hDVX{M6]vժU6y}ȚEP,:(E]ʚ׮-*Z[\vu7PJ1Tu7m-^k^qu8O5EVOV\TxúWw\b\6A5V*ˢ5Zp)eц_R\۹n՚_X̮]V^0יY!9emGI },{TTg$~cϿ{wFD9r6w _/߼owU"|4M|èh"@P:H<\=C67B6l5$ 8}*nP2j8R& 0IBTxf#̂J4 rݶ-GrLȱԠXʚeؒR5-aOTMOl۴(p}$p|/X,\ؕvlݰT|gκ 7],S.t4c2p&s᝿6NءtĴ@ߴINg! ,STt\oݖH6T3l?Pd؊تmˊ D~j! =l T%XEAap~Ұ\ =${:ul_:aHFg$|eYLn7bg9ʀV""i.{ ^`b"ت`ˀVWAp1Z!GDy PmM"@D],"+T2EaW_SIE,(gjkQR< Y0 #a 1 GF*3Ue _ 'T;k?U Nh0TU$wYq1"쀓AR5VZŭU0P-WRUtS1 v t.>A"P>V[U9mlwbA(lw񋀕+Z\ٝg"i4N-i`%,u*ƹmEX RV=7rWɝ$MU?77RR8dH"-"_s$IP. 
Sphf&Z!a­d%vG*n4V+A 0\FbgE:q/b, 3$A$4ŵ嬂+٠,KTvu=I#R:˓8oA _K8w<g7[qA#;-(AV@u颕u#E&h%ΧRX64 ]xł?߼(8`4q+tdÛdӜ![e l y@*eX6p6m8ɦ9ޤQY\iHm]1D΋V$e9T[]rhXx"5Hؾks \ ptkP"7GgHe-zB9vb\;sוPX-g sE`EO I&۠=Qp# \ Fl"'@lAl㸅v'~v ?"~Gv갞u%NKWbS0b%Kŋ_xᅗ_~駟~ꩧ^y啥K;Z] ZْG3/8YUDсV:cB 2 l%X$$$:luP*R45gҮ&_D,C He 褙5A"F&h*cc ]GyL`d3~_ A܊D,X(r2MKt\z01PHsv(ZKhuI kR-g敠tE+r'hKn}hOC*C8%t:fu]x<^p@6M{GwF)nZFfH2;!wUpqc .u8W%۪݀t`c%"н9?{H;SrJ<q'PCUDWIIiš4|F"G}t߾}dI~Pu˦+`SS˱s'*dCqӶCm;pS6#K5,NLaʆ i=[~ȡñHw9TcX>X^y{ RD?Z]ZhE%dM9JG>!I} &"Ix 'uwױٍ D.lG,7)vS=[*ߊVSe6 &Rl-F p&Hk:,XRݻA7LM{1RA+b dY瞋be˖[͖/_o!x)yx##0uW=!1!G[B4!@2it0l 2k*qJSI&%d}ҁh0jDȲWFu)N8A5mjs-K9Saˣyc4ty/>2cB5i~C]bkE=?Qu6D%NޣO/<ҋRN%u[зlf q{P8brmCG TJ;L&S"ٻOG}Xd3cUDQ#&u{'"pDxF7;x`2\ḭ7޸hskƙsv9%J+Kx>x>!^쮍iܔu6/69u5 wj]"$Iq5^D (q]UXcbe=A3`0bŊ{/q`^Z]X. >]Z{}z2x O ĊJNٶma>z!_{[S#x}?wKf=K%V& ]IUxV7_).D|S_~kR"]8v/=DGh3$H6p|B,F*o]Pm3WEt}E1g|dCFTb}}tH:^T)nIhbp. ՟ PozbJ{+vFr0%ע:('XZ6.5UMH1#׏>w>-^˘ ɃFغxɲw=0Jā&DxMY`+ON:mYȎ v6Uoj6nVs*߸oV\Τ@ZKЯ+V67LZX~J۶egAxVlk*ؿwNr 5h#G>ܲ$B+lRRcK#6w\r∈$YU6/s6.=3wJ᪭0Q$o穠zF TI'*GqVQVW&'pEh;!NIKH8e۲ln6n;$)Uxݟ@G`kFpjJM"A@_qf(-L BT M (aZ[˪1,Rtr%*ʦAǁ"Ak6߄cΝ$d5?Bg9`"Ń.4 ,& 'cub%ǸEdj[A}Puuu>P'cXE|=Kɗ.]z ho܌G2.{7<@&Y 6=D?sJw1c0n)3}kʫ6vpT̩?wStZ^r%婷_[U6qaԵoh`]5+(.~rC= ln}PpĨ};54̞=ءR1@ܪ[YLdNp!<[̺'z}ٟjv}}ͯM+c3&]]?d+V7y?^+/=~ z5wbv0ӭe _?r S9O榙8R6F-~؜[-4uS/=RlKo( r݆QkI雿ޔ&XⒽyc50s~X}y搱v-l tTVQDzdmJaRx~{׶}T;gNkj?si h~-/(8^wϮT2r33 g{ Inٴ} -;w4x#},|O55e@A~s.YҫեV*4$ڽ?^?v )q集MhNVb: Qgo=78V䣥UéG?yqjC̾}ʏ[c!o/Xqûf,\lKl}/m,5;_铵o/LxE-E&Zc؅!@HzrK #s kV~~?eٹ㧊>rXAUS䂱y͸/w_~5@IL#nu u{8}Z-|wseZEF.Zԧ5~vC U|ŗ;xk6 ߾-\8[ʎnm} ntޚf{ћ>QЄtKp֐q'M0EY`6l4b##VRLTɄC{u:/ܻD쮙3_;j[oPEs~{ؙ3ܳ3YlUS}n\Q#F|l_[;gz__'ayw;hܸںf673_|Q'6L/,,//w߳{ll{huiI;HO ÏxO0b[ܶgGMsGQx9vxdf(^ U]Sx零ܸj-)spS™9uf`󮾶O^?PviI)Ӗ/+\E?zEԭo?Rq#?{ Bp}'mC̴~C6'R`hnGh)`}[AcL%x7x.h?zt􉓮_an,Ѷbx?[K猚nу$5yCOm޼NÇ+VУB+D `RX,LBD"0.DpL,d;VFKH0oSq5RPhYbPw[Hj 3"22?H'jTx]@C ƂLu-Ep`&"RZ,Hʄ!{C 'R4K(M%M`*@WWW ή*A_$~K3PK^GyKlpl3$*M @4CAH@(v_! 
m8ǣ1x;dY ,.)¹$^2(k#Rn1xضe2o"28wMf٩#F޺^CtE_u)?O7:s c&j !woOT3T1+N"3}hѓ'bA񗋟yjͲ^y9/4Ȇ4U7FEU%dR2F XWQNe œR^GOV'ɉ N.EFS ߶YG-ٳ=lߥ0d.Fwgk3OW}KK?lYk&VO̝ Bl8wbSA=1w̤5{Jiwuq 66Qp9_Sh£%x:x?:N56U9d߮rӡиO51. JΜ}CςWugZZZw^X۷o'~S<ъ4:6cO\Afp;?aqHѥζ;n|7>e ǩVa)†NtdyL $4 5CҐX+qi:R+77^ҶҎٱ6I/%quܘ0.]_ mhkecaNcU$M*ei Ԉ[g$ЅgʟT0@$˧"1Æ#Uo>Qv:k(gjM8gF̈Q9񔖢alcHe%'2Iو>\\2uGJU^F޻*Z&p~/&Qei&^,󥧟Lڎ9y\A2OWarܛv9zZ _-$F"Xt㭷oxosfJDzzZ\0E GH5lÇB>JǣG}iʔ8B___{#9p06p}67ރK 54`^FKl6 A(.COg;vVg/hE~='Z8> T}]u[k{XYsoM[5 nhuI ܽ}ˌ酿͕W^맞~p\uݤ.8dc0zwyY=0fIޱm#\nGO}5U,QX>6o#h'fϞ_rT,m느Wz 4j gPT6 iQ-~l58"I:555-Z_~}6lb?|-ē=7z[ %nd^d9~D^ qq9p BUZw[yhCFE OwD>~hCi vjGC`^Wljf3P@5,TR*rT iD8'0īZL6dMUKښ+"D@K{YSNj\,OhHixĽ-aE6, v2 C"R zTT($g6DP .oLK*TȍgІ\BC8'\: HEꇌVy? Y4Қ~;!PI@dnl@EnP}/4\'ulsLؘsE1-JirRuD`T C1lce)FBa&tÑu Q6AKn~В6BIN+2u#ӮfḚ.~9xR)h L4e%N)+M@H$"3!9V&Ċ?$F*@[%W;DtDhj I/B &+鶬#YjБz9F\t-ҙ_ꇠh_ɏ !O*jfxg<}T,IO Q&*gIsPB7: dZmg2m;o |[P4b$x r,$)nٔD xlc$9v+"L2$N[I 8 I5N(y9 q:`!5a+*YL&H px\p\3Gw#My8xh .vzhtJbw]pF Â;Dk0 *t#Nu+mݝ%&؃>!Assx`c'P w\ΦhjcY!m@i&+ȆBB~ W'7|JN$VJ>-M%AmLՎ"%ksX" 4'ڌr+hr,7GO5hkQ9;N4ASNJ,>\31R@+ܷHٗdQ Nu)C+N±\<vDSlD<#LSB2e*˩-Df,yO#F9rH)sFJ$d4?dxZ;n2Mvc<bOWB.Mb%fCi*Q!>)Hei rdIZJ3X`&-U%YӖHd"{ O.U\\L v":iz8s* aU9R<PW$,`$;bK 22(#CGm!qMY@1B1!\u^jsD(GwhYfVT \<2ѥ@Y“U.Ӑ ;d OՖKP ƤYǝEA% 6謩c5?RBp\?3[A"w'+ŝx':7D [gb6EX2 t$#X顕'*[(4BxQ&!$*LXֱH `@J4Φ\S%8IM2\,骉ݭh@UJRoE+b p9Hr@;f:#xQ 5Pn@ c;H+eX&-[*MJYƚ 0M]V;!nKIWjs) Kr>elMQФTR]dЧ2I10_"f$Vxɿ{T*ESLB"Zȥ,OӨs*eP qЍ|!c~Ptd =в)Rxxo*&Ǩ, P:p/N-R(iTF$8sExh.?S&G+^,0"ЊD#"G 5]U 'w2TKĉ! 
e!w\)8@*HY) ="%%&l<x<<\n?^KVcCC<>ʙ|ɳ6jnZZCY oH6J NFC8%+U.,ap1JO< `emmmFeee8L]!ѪsF7|x;歠&\:ё'/aT[cabEL?]ƫ3?Qˡ|!hp4MyT~([#E ;FFρND}pZP3,2Cs :A8I$^P4{Rmh i"-SVmi!#Ңt$$K[y2q 1#B5oY&EmAi/{G O2ᆲ6`ldْ-g, % 0Β-d[,49tOL.o}_׷UuvM}pdl:c $;']QqfF2m$+ - 4]>?hZ VU27iiipT(cƌ6mæjiiy jnӷ?OiKGn81GNU(Z4I -Xw^oԩ::z=X˫YHtQo$V?T 7 3ؽ$!՚yyG^1+x̂x!}@DDHfVpXcD59* 'vQ1Ό=,q_k HBOw[ׯ_ޖ^c}?o3(ol/'ݍ c1z+& 0em3^ywƪn_1W䭾^‚Bʡ.M&4&ca  4Xmmml["]}}}RF7ꫯٳ_NIIq:( cz]2CD'^_rӆ/6iԆ\t&ǹ77Ac#hT$e?= &6>q?aԹؚ,%)L{J"i&&IMR8!6n9V&~,Yd'p%Q}D EsxH^HU7);W|rCu}2%o7s%o"O@*͝3+!!>b;otÆMϜy/ ӸUt?N2tgcaꚤY/O}:;)s_#sq=GcPiPPX1ʘ(0$6o|術1CoU{R0 ;lr"_~sF<";GM{\$:l- 7:&'yn"_&!zi?9|s5I ȝ#)stY;y̏)D@*ZM!`̚tY[#wz6men@wɅSXgӡS3:fRZm:#{WF^:p89]~/;V A`XZHڵ U v3`9m,`|@8T+.rQ^05x^oԛnФGwF!FFpcq.rOf+5ɓM?^p_t% / ;x=$uK_~M:)+@)m>R0$1BR.uc".,/K!5_^\h-եKRC2w4uɉ L%FH8|I~Inuq8phEn1.^vU%(M.-w9dNL UEG;xv=;T6l+&+Ny) />/Vm`.0S%i)>t~RK؉+$/O21X7nn > E7|\ۢDƮGVnA`=bsS4*J&UX> f L3h')$i$/ Q6@P( sPyv1(Bld@0S  pjǎ@Z^ 0]&VvPSͧfOU0g41Ռo|} C=V-[RӦN!(%ZZtKIx:i0!26Mr&%oElH^&.+^&!(<{8+/I;>?#뇯JWN3iZʽce7br}+-Xy-jd@Q`:mm` 2U*r* c5u[3o[2[%\\[_[FSމ'N_Z.}WBtc+wu=qSsS1X]? 獻+Z6?< 8I"ȅY LxeV\l qK箎5}> /iqc<]V6!0" xVO!i=y/5gҰC?0xL\<-M [Y-'W߿D٭q!Nz;y酝;Qd@O@h p ~# FE[VmrV>ZHh1at(~|dt0!BrֺA WA D3kMysJj;/lq,Mw@N/dEZ/_[?dU:z+WNꍽ9|c3 1/YQ ꔱ}˅)RET1?+}\HZFEGE7.2:YHkJN0H;H)PyVe'N_Uz(WkzN+Wu3,Aў'CW_6sJ뙗(o,жM5s4+o*Z-G" rKHcB壤 3A\rs#KI$1hO` |>s\dMc,,]C8\R0XM-8qozu_^VҵMǧu7y?U$=1n%$ƧR'M\?H!:Qy~ J* Ȉ$P,pƎudO[T֕;}$grfG{׶Wmoѵ/Zk ГTŃ)0 KvLT+_oW L(h]wkiilI)*M_|LheIZdul_ռwNOd&ƻR,sΔ̼Uq Ls5$ys<g Ǐ29G5KJ5'/_h-]/?6!ƣ\\تPڵSf/=!A6)DGh FD ITrDMscd]whxҩNKߕwj=EۦNt ^u%pxﭬz (RJ AVvSZy#q6|#|7ޯSG:I@pNi9*rw3&ɬoϊp酖RƵ67=sbʥ\tW(]WH dYI#q 3_?}}̻Gw=Ƹęn$4 9fg.ڼ2͏?H +fr(Um {/o.9k. (\0.`30Ay&7.(cg |TZ Vrf \WӍ'3R/>tdjt]HL^KohyU9)}yy9קO^bƀ0GxFmrGpㅓ&ddNɭk?/^87{vaZFj~Ěu+^N^!Q$!8OAyBެ^,x tb &O yBB$遊=T7!7uDĔ?Ɛk/[~JXcx]E$7#O 0#AWy4)&lMY/4ԄǽuKmDz^vw^ Ryإ )9@^z9 ;qbڤɩ&dL|:ս[_{9'=}֮=|1,XYw+(PH@J?cMLz|ۧ'q:絚]E@; ;ay.5#q'|÷~OX<|h5S޼bӅ윂Af\ ^&N^G ov-'! 
{E-OE4h4ᘻ{/U+m֭^"{UZNs J>&e0B,ń: ɄYya' lA6j\QVRUy3澛S"19=~!Pzѽ:a0QgAv]v1XQ!h>nB@A hGϫ/whS:v.?$?6|(B؀]J&I4H.pn9F穓a>HӲàsM#8cã>:q/f<))w GhD^ qQ$IfA/]wƦ3!e!=:ft[RbmjbyUV.HW0!suLypuV|:&.vf PXm`G,N#9=sC(mD Spܘ 'М0P2!gצ.Jh0>U-Ovv#oւiڠpF CF"1 Sg<٘w#67"`ԟߋbHi}u+Ç-SLE9`~ ' ^-7b`.̡M C9_!a&.C d䰕y((yMXb45A1m[[0eK yWAa0)(eЊ.يKA>,':,MR[],5/(MCo%W7Y<Щg&qV~V@7vx \Q#߼."EJ]x@d_<&iFHaPi(&䭼5f>7r~܈ٷOL{U."yEiy@F&ڋ b9G0-~=f~NC5o2^ h O"+B;t(50P2vyq~bo߾3[WQT P,CBD?3~xh{yB)q/9c$ `h4}J]@9|-mhHPOa|!QTgC(1{=~,ksi&啧҂@d@$D[QAEK,%qt弔$xj: mZyr^k4K{ocmgq#PdH8/!Ɋ-@("h~>"2aɜSPr @^e:Om}kk;/Cx'@T dyVO༂LA и'B %c$c,'ĀRJH$(, dB" `W:zJ >Z.j'VlJ@' \ Ai`<((YB>eY H6q{1Kc },;kIV xGht7yzgtqh7CF]]Cm@_ %3~JKKc'pݱc_m09R]]_ʎ;ٳ9]YUZU]VSU[SP[qpmeEE+JWVW֗VV֔VVW6T5V\>I)èpJM\ "P?)a*񊛵kx0b%aI/yyw􄝼px#q8G@s8sx7{<<}4KYH'R{{]@&GȪqY!M!NһU$bxh"]e8ݨ|nՏa&T`NAPuZEMpz}@,̻ K㱱`$, dSFk?y$.MfKP7J6s{X7J;^;.|»Q88 4iqn Ef&$e!h ء~#Rv3K"%Qn%nE@_ (8ne?(J[,1c[~]n'8M в $ES c0} |4Ҳ (4;]fIy.Hԋ8@v4yIMiD=Ciҡ1_^n(^t%n"=Z>U#n炖>M+dRJ5ɥMjBzLZI8jlnէ2RO_.}~ַyLcrVu,Md蕄ϫUIzêl 1V:^͵3E'9Og7KY4[k2]iSBxm :\5SAқ%SdnUN#S@^Mr{Qv툹n4JVD }.Щ>Zz%ZlY{Ԝy ptlWPu'i#I sA\)JNIjA.꟤QuʢnJ[Lhu;+!5Q%ur-W"}ݒAv$a̭koc¿5Jf 2iā8ISSSdہ[̊m;0qC 2b}q{b۟6jӠ -Æ{Ŋ{ߊw p( ˇܵrbŪRqW&^ o0 b۷ ;sC0uCo}1! R<4&y((HdyȢ;=bmbw>zI? 
:n$!wmvFx࡫?de~غmɲa,kdY7duܐq~0YnOro'q;/S,zdžc18œg:(ᄽi_K1xlOK!)}`᱘A3~*Y+K=OslOsh*j3z߯By!m Hqpg| Av(dI~Bk2Hc]zvo&5EYeS&.5IOLf檔 Wx0XA\v>Z[<:D>0QF > ތ`h9L6Zm$GMI>GBAj"s@Mq.vc݉tB@@@{@z='  Ba {hX8e?؄Uzq#yJܴuDM*Y!xD/n6w.Ƴb:rHe tP΁ 3S "q/Zn.ɠh0,6E9VBE) a>f ى5Cnr?.\7}, a4h >'b' Ht څ TEI]`TO3%2l)4mv6;[cȱ13ۘ)_%ےiL1e2Ӎ1-pѐlȆ2!5#e'pŞnO˶g.?|13g:&nrbbM i6dĢEӐ)ENGodƛ.q'X~$iz4|2?z ?%X_:;__Yv2Tegks =9 &:y`NX覛OUZz_e{+߽Ϟ}GkU"no|Nxmɒ-͝;b3&HC F$g%I,M9w/ `."id9BcEd{9@u9u*ulWSIZEyfv-%чWj<j'Kv zM  %pn\zp%00Fc$io9a+;<(Z(!X.}v3{AC * @!tuVVk0^|D7aQ$KRRGumߜwT0"!:,$YYY[l of||IP}Q\+GzgdeLHIL~O|b>#'WVx%*{fHb;?0'353#Ozb?s9pKqy-2`;vI 0,>|35-13+w߆du<yɹwVO$!B#oS+pP Fyev7 A9z}6KǍ'7m*oh@x`a[[ނWo8ICȊ8,.Q[f [!ɱ4f#13f坹pGN2i7zA)x?<ԉڳ%sg;wzD!*fF+ݨݍZ]MmN8Wӭۻ8`ȑK'T{,_ͲVf˯MS^Q'Ũ cnj.>EՎ9[_z6!adnNƮx$ 6m/]OP3)B )N{})cP~gw>t֮#M,qCeu@9tUgkmktvW^򩪬w"ā9Y;uA0PZBF}x-͹v͆<8m.OW<$44ppў\+gxJމP?6_pVH(!yG$I0N!k0 <_ ^ă^^vChG<B0S`cӨfݍ/]Z+L,>2:t_!@9?l?Og-h:.E(wٺ-G/t4/^0Wz^|эVՑ"׷7ӯ C !@6 n(bf}pRyrͪϝi0{ۻ/Ϛ=eҔy.uzN&g lODnȄb½TҽdxzB"($L'$pqǁ# Ą豑dj|.#fb8N.A1Ze+0^~I64$,w wZ?puWkG& !(O1=?F'ǰznsY,6ŃtQ3Z>U@vY`McQov`f7 /(|˴Ύ]6{bţMAZ2mz+䇔h6ˁsͮ4{TeQ P7F.D !:iu 2'7o bSE mmmBAiTZ"9N@ 凘,n@"Zzb1^G XF8 n,!č]~~BKk\8eGf\^K5(C|Je4#b1#׮vM,/E\ H8 f.?z#s}w5tՌ  04%n;Lh]NEn+8z$̈0j j,N/+BG4כDɻzޒ V?b!5Z d.xÅ]Zs]SYUA 1w0QT:O_w݋Y婧;z kREA acZoKKnЂ>ۍXdCv˫w?7?iM^ 呗ޒڞqƌlWDjܸpx!gGO4ޟ0>Ee HLqqѣ"xCzZ}^Ɗ8lT;mVy).WdlRO@GoYMěT+gO;ud8wNqբ{7m&=S<űRMhsML,Am`e oiiqcbbB+{dO+h~7&O<1;g z01rfnjkg7nx^޷l^3%gҽƼkbӗohbh96 vCK-99?=3klȗA䑇|I3ss~5u KW<Ӆ G,HDC(P`PSO=uIP2pt}晧 SO>&c׬ޤ:{W_E rK+^x歖ʭ[_wU*J(+VItGGp{P,iE@PRsUF0(a[?1Iy̑p^O Eb/ϟTn뺔(m8Ïf-6]€.Eg/sZ[[l6AT~7L@h3?zp'zr7\笺r0V`ݵp"sƌS _}OfJr_.f1-YyutU ,%i<0gmOף7 NJMwÍK_6veZUSg/K'^` jD-k_W=Iq;;[ Xɰ,Jx0t;$xWӮa|6U ~KsigY,9=31%};:iy ܼysfFSpZGj] }WkO6/]Ig .OMowL Ns)*sy]T"^,m42֛7֜r0/mUjk jvCN*|ŷk<3/]+"4ΖOjr̕9]:bUksϜ3ۚn-=g[z׮g-tܹ#+WYV*қ>m7/.a\~پk3.GWq.m߽M-N" @8,3yUm,qJ\d˞m5n'{3|#s[={ģ$UR+';}Պ(ZH)Er_(tſ|I'g۶m>hSSS.PG /] )]MLs/mv^7!pK8 \7Ħ}i<ǎ~Pv*wڭ@2nWϾV~q{vS(K!R3&Ϟs'8"p+¥uwΚ:X*H@,%?SE%-}[|XxU^h:{LԌnEd');ؐٓ 8{k" 1< dV(̅"`Rճgl8pIb5{}w~D 
Ұ/s׽iH!ry}DqbآҊΞ>L`zHNPЫ%TndS[kʫ,ћ[;#.XcFA&VA 1w-]Q-LϦ%b)@1䄱1x(4uTH88ɤđ##ˏmNOZ`:hEt*h#OtaC{P! rѴu6U0ybUeKnXj_Xx>[& WW7ϷS#RfIg5+xY)&:P-O<:+jR2A W/[ժ~y OW/4yz㘇eMer{ɧGq~PtrA၆c";[&m@iپ 3Ҳ*:U@t䂅ךt D<6=` s &9$y'w5PFeϊ6]39|>OAgOvrF[kJJ?S&K-Xa,NC{oeWUZ(@* acPmEmEAA!@B<Tj~<}wyso u%uY^wwIdNaOx ̙}@4ұQA] vBoٗJX._H -Z. .7}$ `~w70( J!6EZ4evnyU:_k~׾z=D 4v{T|1Pv嗞_xnMoZr-[__&-K~d]0BO׽5;6PM膩\T~T4fT%'!nU6~*5HGˎhXfxE/r<o`4+&㩗}U xϟcG0hɂ`ϬU容n]R{᷏g%ʕԌ];:ǼF/Xw$k{iϪwwEV^ڳ#e;fq֌.J于y{=j+e g^sYO+{#Ľ lݳ9/vϖ0zBAS@7ͦ}A(p=ҕbt{;lk.(coߑ Gw>/bj=oO$ZDK2M"澽ټ$I d2`d'w}P洩-_G~wR,E*%Vj-kb*|?MKl} /?{[G=5W!~zꩧ8ACeKʡ޺}8\u@+2E>}緟㛥RYd =?wyaP * 샬F믽U)'TqY'~_y.LFA0@+8i,or9^aM$aI;wm]zC׭[!B~Ove٬AɊDYE)}奋7Cj~[zE]˯?S9] Eñd-tگonWiYfůlaxjr7ޱ1d1XjLD<E\xծ|җ!s{ [hD6Z8(8 xNdʞ[Gt!޴!؛?m$#D]supwuG2[޶k/U0֤џJ!yg 3 _{藿xjŊ+B]Cˤ/]0mno2W~oںZ,#ߵᬺ33畦5s-L46IS6ƩϚJ6LԉjTgpMS&)Ʃζsмe(p0tБ'"L2W9фs:/='y]£Kٽ_77:h_O/|`̮{yͽ[):]w?|wԾwlt3ɬmFe[6`[u"ru%zʘ4^7}KGHDZ[fz|JWXF#{mz|7̼mo=$$\ٹЎ`*8uNQB2kxeͿ"}/w2L͛7g=:o*Z [뮹Ks%eㆍްʷvOUhאލw:v{9|`%1a~_OOk=;oڍۆ6nvPh֛mKճau{E!hsFT'hjh=w9s<㯽M7pœ~d2np+MI=5~􃟵vjjhpE iݺ+|OO%\)\ymmmVj5GN1\=<< $p3E hIEKo/7vv}Y!R;.QFT9 zxJllF@Qq#F0Ǚjǎmp(p.VP~CSo) M7o#_(kfӦMH% Ӟ}IE`O=&O>o+&bHA!15{2{f{"YW~w첫]z6f$gNw+ͯ@33Rm3?ں'?khn-Y[pipfk]hgjG6h0j 2h$$l/hԅLtTf8I0"|.%ĪcPfKT>O)GAr^VJ44q:I'_.蚦Pt%s dH?o E8j+V#_̹܁?qaz."O3 \XQmnF'r E  O4"̐J$@qz,|ST ˑ 2-i#;tUr\NixBQ4PAtk(qU c8 X~bFyg2:,qteI< `E,A08Y(,DL 'Y-Spy5?ۻ7ֶ7[vϟ{֙wOc{5skw'lwtugWDgۺw,sol=ٝ_k|HPxR,aE4D(Nj',A"x[aL| rėN`PJt"($h"Ye p" ȇT0ckb0BX8Bі8R+lhhohj|enIj9fhE@*t 4.Vb(.ګx%] + ,<"i٩`ChFX,2re  X# +V>~ڔKw ,0d`(eh[!А@+ ՑH4ʝ(&p0@ O+fT^GK-س,(kJMx⦦.ֶtŕCl8 *I UUȒ4zKxym=˗wRC;0AQL'Gc[E]mE^кvU0qPPc7JCn4_ ^ߴ/ӳWPЍJRd̗W,w|)M t.Z8wLN%@R"xsO'JBn^p➓/IKEUqJx 2T&[ebڷn=<Q(YQ1adD Vp| @(eb` `k&dܲh,cL& B ,ՠ5`Af F] q % KBVSjE-Hu8TX1 `ʵD2M43.\, xSELċ4],2"*2`%ty Ek<=Ig;;::n8s?]wۤsVOVk?],]Uʨb ^a[8?Z-C{?ΪSu,_TƒJ ^.ad4Eej.A/y܁?b(HT9Ȗ.>`5ՕZ2 @W/xA=I7::鼆1bU>bkxso$xWPXZѤuW^{mP$ X.]ɔȒ(H1y Adή3_~i;\<+q8RCZ5ZP3 Pُ(Q(2⏡$ր[%={wy`Gx8A1>F[i<hl X*bɄMݴ No+lϷeXr[X^r[X9SpHs[ӎJ)]pO4R0_0Z/"^2cϦ)@d$!d*h8!*l)b%FJ2TaG8VhUA 8 !tY8{ɥfrF 
C< "/0 ,X K$L"$q%RX91DTl$Hb5X]ȅ,R \>Me8 jpBf 09P 7 *XR&U(=sld#5'NrpvGasHC9\B8W$LxcB ӥBgLeN>c`WAKљ<hEQB&ǰt",\sg\!K"(̢Pk$Q(zUJXT:{;,HM +C-ACA/x>hNl2P.hST EıªT b%J Q;q(% bY.B-lppʕ,#ҹ(@U$Vq{4/@"x.,JxXibL" C1 CY5" Q. ΩM.j'<^4u֡Qˠmt:P88FN9Ⲏ8{O ؃@0 G>Lplt6t9(qF~(  <6G*ڝ' xh:wwOBA@ n?xo {('/C"^Oh,9KO XaD,i!)o_g i;Zc;tdIܩQ`4F-6'@NVo㌇^5q>u42rVsR4^ E Hz2 91Opś9GJ8G,N}ZB7s8R?sV7 ZG@$ Î\ۣH,* C4`2^TiT,ETYDE.a:-$.p.gqE#Ȝg̶#hE=xe"<ǑY)  ]KR@;-GrWEz4p+:Wd@ݤUk,)肦Td X,CD(]fRD2SdAjpDThUt.!E3g.AXؓhID͈ &QeʜfJ2XD?‘SUYE4Y7'WI8HE!h7`EAU:p:" "<)MxUp?h XO郧|2'ΨMxSvR]`sL}C;c|-ҴqA0YwN$bv0@ѧAj!C 1 -w<&t5Ll`ԑ&woHalڐ|>贀*UU-*2r!cVr#R$ UT.38^W$JR.N,U||o@{b6@AT{  NLZc-^As0(QXR*zĕK6ScB! K URhA* U`ESKkP(s =P患3˜jL1V!W,q9V×Ҡc b g@D(hصk_G JS*^GU's%iz$)~d<3U'3TLIo*Oe MM /(=0rcWwǧꖜw^E_<ϟ3_3Y-9e_[uN~fgp9n3sT͝r9C6hsV@3i@.GCQ01SLO*A-XDjʆ%pҌN 7 *"MаdV$ePԜVUt 4E8e}&HT߇"XU(; ZZ0GO3&Z!D(㣇?|LhU#qP˨KILKUdyi F:2V/)gUA|#deVh)H2](#:#U]ott|ߞ5wwguΞpk׉c-m7jwFӑO6u} vCuw螵{οOӦa(PBIm)U%Y#hXq"2P%i8&ʚI$đ!#0c5qq;-)I Lp _@1LGY'5C2+!"cjMδh#+a ԜV<řni8f~vȟ;'Dp+l 3j*U,DH -WRɜ?Tx ˶l*—{)"F$132Z'Jh#+jGTc)a2# g׽>-ճ4|QJ6sӅlC;[hkNMoLh̶7W[F+[?v6!eZ[KvgꔐͰL'[e/Ͽη oym{mNpd DJ HRc,717*&`Qb7c9b 7Xq꽾qK prj\TYxv c ZhDUİ1ǍLHDVHV/4!X@9<+!kk´j*ȻXK|͍?0!Er3Q>&hU1%^z= :=P8o~}}3gv1?ܭ.شƕמ$ڼ}{SCuuΙ4g6"/e.$u5ghՆZ hӦZ_/7LcSkmSb4~|OElS}us[-_sdB@O2X2op,OP؜|AԔP {> *aiK*&Z ,'4Sl*x񑨞^ȁüacV4 qh2:e5 P@8 N "EÐ.(\5Z;έX`7O JA"Ç~ڙx )[^WϟhVkͥfCVZן׃cXWo43IHr 2H|g?eHoxsUAJ䥟{@ki&Az0\vpS&S& S&3S'&`;:I `7A4lko]:Lh2}>JDF2͟Q$l{{[2Xu*ZHœb*@'X2E^ֽtxWzLb䋥r&53c)ClI2D f ċZO /lxc߉dUҫ <ǹB:zu=fJ,GjcN(1G*2cژI9ylJQlI O|n&fLè*hA7nx>hUJȏULl2q}V15W^HlxӟrjgHh{?>|Q"h{ >%Jzп7D! 
̂U@f)B,o<.Mm4_ircR} XknZRBRkj4sS`7њ-7yJ>dRTVՐ|K\w @ ^ o*W_{MS;g.޻W#nvKZ[^xj!v!/sY[Z_8}F׫jz<'Oe=wȧ‚Yf̹[ߏ$7,X:O]pSgDȂUY'-97I^>0s̎{YNXlISz,{*W\~Ǎ7ݰ~mƆkd SdO{*JhixOOO45Ǎ (n5n IbU /~sލRj:GНwޕHN˛7\FQ7Xkݞpڛ_w~=AW=w=cv7" 7u%ISK.OIw|9yQdD{cZkh7kԚimw4;/tN84ܾȌ%u>-Lx⹕@ЛlOg_ }od_|uLRXod%*< Xr>J@J7nu86R:yA=5U^y_q#!knt: |ei@0t^߿۽'q|;nzrChuY\\lo0cjfIiɎBk F4vo۲m-M)ɰ9˵.);رb fHEOϽLz(\ywbɄ57Y%MdÞz[sDUwXO-]wz DIG^|T{[G yƥ.ɁsniLOoŋv]S&S8vȑs͟ ::gK\f~Ź`Z-K_~஻M?%qQ:,vBKO|H?y{[l<ϨJ=k q͠W<˱ "MR=Џ~#[A+DY&*<k/_>wVKFI?c'E]K|'w\{å {=ݳ[Z9; dyV{$EeJx {-N@ V~ŗ6IqvS4,d&SuۚyԦziS|z;|K&LL"]DL$f$^L=v?_x0e1Q Sd|kvTH wv<h{~##))Rk-g ډ+׮9rb\y}`081j| \v{ +oy}oz0Mgt<2:xjy`w֢N&]p᦭xȽ?_$fDMvN@Ba߿ۿK ._vQ@nL*[JZ=0;w/ ܑy4s,rC6r+4V(P2Q>6heEɥ+^p)p muW]mv7SPڱc`y^\nЈ]hF3mصkjSde`5Aֈo~YW}0,Q/GRMj/mjk5SӔI{d">v{3`sjlƒVhTzs~Mou{zn{vfI9̘5y+ٲbM=K64ϙ߳cYg*t~╇+]7l8yYsYcSU߿_7y/3{vvW]_}I™sl!,e~w;ߺ^{womk:k95J`a1q7o3g^sS{OW^{][_}d^tK,9wt/WLhk fϞte(cV,c2OSd{O#B;:l ;6 C`UP.]g>9W_/he=|z3Lro}FU2 nXsm8S4W a{5ճ4xRX<)^Xlή]&V-F<#,_!L!O4`Da %Qv5sɒ/p9㲠st ~d(BRKU0K$$"`Uő4>'T2 M]B:{ljSuB>K(E]c`DID9G*5׍>cHR88Hd3)Ƣ)ITWʘr*354&OX1xx2F)J|>@-'lZKcV(&0bY[/R x4%T(:nƵ0N͓$ 'U,b7ыc *p*=ėl!to{w'O=gCg3woC-gS uCܺ+?O>{;ObEήTݍ;3D =V$#6$TFXcp,Ą@f)0FtLbv[T(k^kF$r5I#o1Jh3cҚ(Rg7.GPT\%I[X̛-&릴@ EOE0!P3sNCqd<$ZISE hof'DV:VWO" W+P3|vhʘt~!##QN#t+-$\P` .d:Kx)**-,J8 1 (y\T y< ɰQD8%哌VFBFM ; ؂B.hUf +)(@7UYƫ/2.t;ng?T}}Qo"Ky~vxN8Aa0D(Sx<Г|>lAZ~:l޼bpχ`Hc-TÔ@?}No KQ$v?q<~t_ žH<04z (bḭd9 '6݁'@ѿvGmC`'Gw~⨫C3݃c~{CFB'~?JQO @Vf8)9R(FFFx8'QJYQTAyMr4삒zc%4|>ZQ4 骘iagIC] ˉ*h\TM*23f(i%VYY@5a83ø$)UFU?N 0)M@|[۷/M5 0,$ FL(4ڊ ]frFѬsdd(U]Sl6Ѻ!ds8\iɧydaUJ0d_Tu%6HSIz"ACFWzI1,c_.ݘuDu]5A3paE44gؕ $v:12EvD+3$,2k2B3ds8%3%jLa]IۋȢ6"ʆUN#.ʊQ=JQ"*2(&SFЌa~ׄ1_ZN3#n, ԏebNE3T4MV@ӳKY@\4%͉03ܙt Nϝ ιY󹺵]}7Ƴ>}6|;gwOCmiЮ1n5fI-y< ƀ U(J E6lM^44'A{f3>fWe×];tI -dAf|uݴ>N׋(AFzkta%Ir۶mX _~x6ltacLxX v2$I [ ?)mxZ! 
3C}06/?{={ΰ{d̤xhE50HpZREc #;E0,IT 4 Yz^ko-z߱o\Q+92E"s)&xp@N^}h/"ӷww.әBUM?3i7UR:4Ro1K;ܧf3o8Z1QOцi ,-JsM~(c28>{U6oޜL&:4O#ϭZ{ZYS4Φ˗n+ *2#OtwN2UY]0iu7"q83<3gΜS/YrD8GMcԭS#im|K'4up.m*R}Zik64gu2k_ 28g6Vh]̥VشR(<^gӅL04bKry-^vxrl -+aUQ #K(b򸕙rJٔKV~c2M4HP*]ͥUD<[,I^,5 T^(?jضVX'xOb^VjݵV{,ZT""HB IyHfL2y~3{g|f3=J?vmś^ _}оaŵ]lkOpj@vh7aџ6thn }w17 ygV|Qqobffʹ9D;%>`b8Wb'!ޛfA4"IM=1.@AVk+荢UZh Zczk$WNkn L2KvYڕxYܔ7F9Ź옘MK)+..==*@(͛f׬YSYY zѢE\.D>===-%ٺs^?qu;>K$hcIPq[SdLLЧ$挣Ug*[t_E*j[<@z;wʵ~- [`G$/5*{0; <3>iԉ#T\C^ 4!j|3;v~~檠~ǎwyd2lذ!))ٳz>''g֭ H4é.wtYiY} 0:x?*y{vGit1DzSw콢W{1LR;O UճDBQe{vfҚ>˗7G&B+Q!J.{_{mZi!_ZǁK yOh(.{M7Ӆzr<.+4KD ,okQ ۻg͞S\?o; fWOLmlzqbfe(V2hKߜ:ɖ;DƎeA\μ 1Ԩ$riX2!Qo\=tY4HFx%A/yp~ʅS~4uu岚9rP+eDjJ XY`>#X_ssҊgxK.uuuІX,hiiAYӡ?'Mp ,YvI)QI&N.+d'q9_X R?> t1TR͒^ NƕקSY: wZ`znŎ? xBIq[\Šu !1|ݚoH, aV4*J<")/M</p銽 Z=rHT9}^aĮ> . ueNEYA33o?dtm9uXEE}6A{t)Kg!0cI,z=FSp3j'Ek&}|Q*VȾ1b׮}aM?1zsXڻk];JI50V,x7S:O PK5_Jl炔oja׹s0 g̘ T*\BɒEO_\:izYP7#VS7=a]Cw|>'mT2Q~+sO:x˃_pquۑ gZ۸\^Gef k.mA1qtU DhEo@fR,^Qk#hRHU=eTU>Zgz]fʏ3?mnnpj@e'|K:ÆvXquoAr滂_~'92cBh' ,oB'`'8yUH:Vϛ.Fr账K aH@+RF/uv=|aprAuintT]YȥS~Vo-"q}s6TUz{!nu8~۶m CCCMMMpEkk+(i`uacMQsҸiÚ191hgtzM,g.FZeժ^׈\2ٟLPR^4ʂߞgVtN_]N(W`>K_f6bɆz#Z(JQB+ h廎V=AB%W<™T@+h54 Xjyoi!A %է^VMþ9~ L]]13V0媍lv/S;,[r-Yִwl|O$PeN5'll6Jr ֙"Oʛoca"Qz }FNHoK;w_6u_|xnb Q&|vO]~Ea U͜ñKEWn'&&RR^|E|rPVV&v;\˖-WZT6v$7mܔJj͜JJ]z v4?( ryNsm 94N+v굑! &N=ߟ_c>K9qml|2ˉSX8}۶ +m\6Qι*H+e~7sN807 /rI |CKA? N/xPtvrlLc8&(ðjkģ;<*HgZ &ߍQj{%oWiM6g3-'e|5>u & RJ0Dj}(?U١I"' >L[l={fCci! Ct(ʕ+Lb񎎎(J 1~T#e]jA˃kjj|>Ĉ%QBQ(vSSՉC: <&iᅓo@蹉#'!iF{n줅KS{o??:Nv=s&S3 H*gg [ϣ4ƈ̥Q`pp玗ˉ yCYF|~M(j׮]Ν;"Qp0~][4*Zрuf4G  ȷ0hι  T* l9g!BRF[ſ"VE6U"d]^ Kqx}ί s -nKp!\FC?lA W=Ce[ |;0`KK +{CE)J!&BϞ=j,,`r0eᜟ}o-G {E‹]ZiLЯW\v_4+dBbaDxYJ :/ke߀QE ;. 
}}D}tFE$󻺺 LL endstream endobj 881 0 obj << /Length 4759 /Filter /FlateDecode >> stream xڽ;ksF+x2teB8k-y峵JR*(`$k(Zp===a0[ٟ͂ ~_}w>63i, }3-g?z?i|~P'A)$RgW< nzu{K-73n_d5.h̗G'+Wr|o?R\).>fHn'HdOcPϒq2 Uڻk,^}%ܰCf`,.{=~nC,ԏM8] HG#2ǻga.0aLBPۇ0=-H@ wtSc5jEsrlȺb] ;OAhʜvRv;N66ns9#N؏Ɖs%Ϛ?>j\jjYȽ*x^e.$5vm^OuM(i`3V )g9s*_qdi-^&=rm/U8YXgU?`e.sAUN: [:ɈM@+r͠e\yk{AW,\ xruX~;SOH#:dz9(&fٴݺ*~@"u7'QYxlڽN (8IK}(Hz4k&ϖvEQ@9~Kܯa+1l}<:99}HqD\i;>UVTFX);@)[MJAŒZsMt&VR8hu5:)xS vH~vfSw'J&PpãOs<3܊bK5ᶭ6L.?Hp_+F}+>6_DP€B{+fA :cVr$}#捚@9F8Nưv[oHCƃF>4񼕩Y-& f'Q.'])e-`# {2D/2Nߧxccp+fa9206 E'&3W2khȆ$}욆d)X+8:*\*+Ę15XM ..O$le<`@A\E8r\P>}jXMsV˛Ggo>}rYRc{w1|c^D>|yr160汈D:I"ɻ'P*OS"1&\1Jq8^\FIwp+3]P٥RN DEI.\Xi5P^+rT4fSH7}ŹN#WeM ߦ )!<(K=u[ԋ&\^InR70^ʽԁ PKrdl+/mr'; "_Ej)w_~y*|d.r0 0(VL񽄌  ZI.ZZ7,T/4 ͂ dVpbK}bF>Pr|6䒁ܐ4"'`OB#e]*kӺ KIsY-I#ؚaV_=);z+y ꢭ{Ip .z$ULjɊI ^,W"ԝb\(rv U.71nN=sW";Vnb[@g |eu1IO r&ޏQ]8(T\# "qB@FqZBN>|i5ӫ^t< 䆻Fo~f:msn97VX4%=3/e\ssqBW t3r`?gEF~2wh[N}h_?Swώ'8NE'~Ȉ$ޒ3((&-:M)mlpI-jO5>f 妮A, n }^$ =48BIld0o*nfo%2!`0rvtvUhg7&ʒ)\5#`ezs˭'1 (>=0*2.e c!mTѝ8!v@c*e9Gǒʞ,=?|K0F}8SpY]7g[B#?_YU,^)±+_ 2.ؽ+ % 2z^ WrNgxe+)Iyk;z9$ǝ< &]<%ߔ7Mv onP#C g>&Lo)J#O|BS*Q-_,e{H2`hz LtKɗ-[*WT\SyM *̍f[LDz+[uBߺwPʯGZ>zϨ֌S tؓzONu nz;'oF oGT0Mn`]GQ=d&?O#v@ s b0&QKF 'qVx.mұte/cM\#'+3[6oF.X]8ïT6q[*54[V<]`ؔ KFfŗ?דixawMAƾѻ%w  endstream endobj 905 0 obj << /Length 3213 /Filter /FlateDecode >> stream xY[s6~xX4Ai6ulglu/q" I%i} )N>spLMɏ|8~f2DƏ[~bp5WSj/5 {[shomJmUԟS˳--R~#4tҀW ,@Á ϩ}M4 b-QhϨW@I9L<O)?*,&.jig]?&qGQg$6ҨU?(T0an nJi2"KlJTa2Wd+Ψ=әو%?ˢ"+w=um ݪRsjۑS5tz%SD<؎iB~&ү5NrcF _Q8#?ѷVq%tǠN{R7 1 tgٚD0jKFfgԞ_rvVӤ2R*S_Q?~H}C}`|Fȁ~!T,HB`#fSB )PFXW^ȘtΘD:*(8L{1:A7  (Vg~ g|˃"_oc#?!fd)@F#)@F6 <>9 I5sd#G3eHLž f+ikKj4 ,dy =x|D۲[z= fC6;zNg%d<VfK w:=5n*);hS#DC@"EM6Qiz &[-gMmN#- j+)qA\f^Y߻Q5-iFAwA&M!=?W=~w2ŷal&0 I~ {ۖeQ;a}*Xj=R`5=^z=~~usv*¨8}%/޾>{}un O݄l?YQMz5sQ4U<{D(~y3f ;gVռ9QG<|9O"S_h@iY̑!<،,+'xxQ^WM 'ㅕDFx'+[o[9 cH{kx8T^wpUAtx_:fioZ"1/U;s@} ҞË\E/90i`˷#mc&ϯf8ۜtj}Zտ8,d|Vk%nՂ M[prηv\Ul"o3%:0q :X# |Yb9Ү ;te|T<ܴ;{t|WdA0n^7NA03]ɪZvC!/y$g9|<.MDJy%=@ ЩHv/h 
fka酙NQ`wH V<$;!=ks1ۖs U[oJ9L_^I) }ϭ?6$.LkB98FYbCΡKu٣24!W̽2|i~Jѽd³nd͓6NjVj^5*+`ERa ]0D'X$\$v@XME<󧠓j3--cԫRR,6LE%6d|ZڒI8H-3WuEsG˜ldכ>N_Y ??jMj " #NNͽ:]Yc/Ou.y]mKA|wuB;k5uZ=f <'[lr]߲M^(6ykb`>C1gP7(d'[˂BΠ{w߁U^K4SAaV7āgqx+6Y އ@o.]tֳu ڛorRg:R1A)=CC?H GgkFg0 | ~;%0 |̡T,~.nWM֕ pݫj2xf6 q;l[u\(MA4|~j$"pϪhx9$c4NT$}&ƛyko.v紁Vyo ]\{3VzWBA%WzB(`FN=9`.njj3D˵> B[.c,,-1bAaHOaH#uK/*HadݚXP8 'cg3g>OZ?Yj'tN[ZSaY4z>Tn "uc֢C16$8Aazs&a@ás7v"w!:Ke]@^ʊ&_֮xiw%3*8ٕ'sW(Bn+[ZBMQI0u)uۺ!j8n ޫ v=\-Jx//6b鋢`} 讂Џ3Qnݿ`ܩMQ[ ){`DKQ/Mzf^[b&9@M u3U9 d|3>HsT`!CȈfj#KҌ9߱8 Odud"A^̳'RMk\ ^s~-nPg xVuc bQ>_, Gn/Y*ݓڞ ކjKx3]HgؒkxP-y^dؚiڥ{yU/vuj!)Iփ+5x=ʿ Su/sQi4 l0~`*@m^\)ʽgYc&%mb.^b}Wxj=4j}m&TOR;nE endstream endobj 950 0 obj << /Length 4027 /Filter /FlateDecode >> stream xڥZ[oF~J/$ oة;1b.d6%ίsr6/ps9CoyWy~8SKjn,7rssNv{Ō~DF(T#8,q((O SY@7GD~*gx%9~/|:s7D`?t[&Rlv_VV0>>ߕsǟyVP>5.0h%v[ڇ 7Qv2_g6~Nué;HkYHde %iNȃ(a*MIڣw/~s >VI+4ݞۂ64oQ)rYup#WίGLJX<:W<1LҖ{"\90ai-}Duy Mn9QOEų+@ 1 ޲16R-ط|@lnIBlVE$EVmki\+ 3jΕwA[.-Y>'V"mY?DB)#U}|1{>0:(TyÑm e?೒񇮼+&^/HD F 9>ק='e qu@ܒC)c`˵dUI76?D窀iSl[9Y֐;[Ԡ}lI0;;hL(j#*WyEJ؈YAJe"68VPf}4ڣ&Б )El`b10AJ;eԈDIM7L[BL& Ad4l/ Ĵ›TCq L%s R Hb۶F"E* ba,TVv^,&l]ZMSHmqs5r/Mз˜@u<;z)=i cLiIX=&OπjVlDž4N8Q&P4%y٠4ѐ9a`,Xܳ!:LDZ'9XVՎM N;0fkJ̡~>88e?9/qͤ3)?E: `TEL: v0 { eb #3sI԰褚TI5ԛ1/v#4 ūwg7'?El6(DHw 0E%K}A`Ao$ }*bW9!G$bQ+|[}-WNSo_TϮg{/F ySzQ oyB5?i"%=*bÌT e,/XfKք6X9QGq5|2 85P [(&l.@H/+pL\wtaFc^<2 Ddd,bH;6}&ơⳠ!,=tu\T9НxέLАtE2[\髆I +{ôj.4oG>7J=%dC€1li&- ۍL 4 hrU(BC0)7D]-m8:\LiL)Q!npp6qgj %'MI)}LQdu1uv>ڍaR 묹Oh7$io cae[UT&LMqΌ3~bR`fPv}1NBtaD3p I4>xC>JzǶjlM>jߚPQ@ 3sT9BPpi1G3yB{" *IarhO HňLbɷ5J`%n` ;Li=gm#🨽8+l*kڦI]67' t[ bt'~L p*] DkjuX d ã}5kD GLHd>Iw^ⱸR ?C_zә"sy@9*6u&p#&&WvJC/bq#a5k|//?\oI Mr(Lb:o -& 2j$"fwxW.޾p7t'1~b'o2lȘ! @tԷ)KθB2_xpɑTJ{B4zXn7[@& #0, r]&%5w[& =|n6_UQ' u`lav$5Ewf M%뙋A/$Of~Al5\2 cQƀ&_xmcr7Fd8'(y B~xN3 ȪK!l [͏B$_Kq+m"1- uʻ 񵹲678DF|ư|bs=:_l p*sJIȍ|Ĥ9eR-[T[oA,A"DT#8=8+M&T20*@WO"74HTF!yBX` p$dF0A`/ߝpB. N~ IEc ,iln}P! 
ihLPlSXDh6뇳(}hǗ<yTFA_[ ;ɠ7_[)s$ux` =8^6{O`&d\.RSRӷwji@A*ZcDUz_DHYm鸄n!&BR/H.7PIm!'Ulbd4 EQߏ}$Da%Hjpfz]X13M 4Kհf)9Qv6Ů+vOG!&̹DMN_M R H9Q`dCCEl_k1x(DOf*e!J}'aAǺG9U54Qȗ>}{> stream xڵXn7}WK`.oCr #@.0E'FV"i:3$l7D΃pn.00; G]A6Xʏ!&7:ncLL`1@sr4:R' Xdb\!_sCr9!BH0(sd9 |\0aPMG ~ w 812#/Fh)"hO9BrD,Qf!A2 H s0$gĊ$|HTcN\/'HxpIWup9+eCˠ"z!{@28bgJ aiרpQ̜TcӼ/P kY]-G>eg VA PEd[mi2yDoƈ]8 G߬Q͋l74/U{1Vyŋy;jbvY&Gqi60[;tD0dOڸ jA@aҭI~N_SƟShHM ^2(l>j]pXy<̻U ]mj\|1_ Ez/wb-64.JlUde_.YdBb*,R\,kur*,غtX2>1Bdb6)?^rGlN~w%?1'oe"ϗ8Ae@ O*a{J1A ؅RTC(f!85Oյ˲]vbM▯T1b#lBLQjXj(b.*J^VBkjRbX:Q7=87UX.VI$ 4UUݐ̶VC">+U3 }plC"#*>Q.qnݦa)aD%iaCWw%t hP=Wv d<}׮fμ8NЃ"dpSljf|0/Ǜ.JW]ǖ"BEBaAU؂*8t\mE%tߋdI^HBQ\9B>ň  .a B* EbYPP 2Y=+X.)#˔UXJk]j{([C<8VN#+[/'R endstream endobj 973 0 obj << /Length 4062 /Filter /FlateDecode >> stream xڝZYܶ~pvhx8Jɲ,eY*yN03fyyHF7cG@G6hh_~"*f<PM.0u%>.IhnX T9wt9òߡ[ӘNT7۝Tm~NTovWfsʾlj=Tqˢꯑ޽OuUnwREOg;|ou=lo8&+04xq'ݶpKL,לmGv$iGH˜[VCa Me.(^\a" qhkXG<pa 9:LeX LM7i@vǞg3 ^xztUT,QzJ]#]cxԆWC-esA`S$ `_6{a57?i\AK,S|5 AGXQj+ʎ wXV ,ox؝@=ig0G7*zYchدH$+!JHQVINB/ [? ʃ*aFhk`"uؓ{ak ΤQPXZ @__eSZi Í&ٌg'(]I!,b;-U4<5&TIXUp@]َ`Y) 77m4>w* x@9et+FZx=<)҉Vr׏uŋkWBe(T,^-,()<@+h9~NFz}R`בrf:x\=tW>In2CMAn5wf '˱Amkw(Ul2i""<^mN+6]}Lm,:!YmILH8VS-Zfj8\<Dz',gA'ΝE聝pă 0rӓ\i0,htsjAhZpsӣuڛu|^6=+ ǁos'ؼ?~k;oHSI'މ4,ts}8Qm<sɃg?-UYvEA 'Wo }ERF}G٤ `~\)6r4D#[KLٮd=cCTFaf$ {@IA22Ga?]y03uY pJÔ<0jjttgz|_^ͽ?wh\0[T!NL03qx^V[Aڽ30A+!uw8/,q>0v۟h$:ߝpa`FP*&SO @DA0 ,Sh/b{ qMqJ%PP}tVk[ГAq(ecav(?̒ h%wy <(r'ú8-J`.h2FW ă(!𪻱i CEAZEyίXf< Pr8ox?yJ4P)n&tj0llbI'` Rv}w]qJhr[Y(d"1ä̪m\P==e#~P~~fGyxV8䪲?&L@{apS,sf{e^wh<݁xڑK`dŔCٖj "csMLg7v `pt>i>ں?RtG5RoAۜ4H0,&5rЉ z!jXlT @9o,+'8:&q>p>WU+ٷEr+ ,*ɽ'IOG΢=} 8G|Bb}W}]Nfj#&91`TB$:A{{m3%pdj:uhCcAQWgLv4Dq# X:ϊW: ,uY3JU6wD67xe7!rX μ? ,V s`du戸Wz8IQͻ5嚬h[ָ;dƘ;L8p$8wPOl/ \k'Dk·&@݋J2W~٩(re@`]qcnwS͑>4_Mz:Y Gqh<)j[i\2x$BE^8X+Vc ۊ fMjVYw3 Xw"Jug oFk8>I?aJL).eLF4[[oxMVvp.̦\6A4rB:^SB"RgQ+a>Ǿ7ՓxTW?w ?w,KUS&+\&+\J'g`\ f2qp'DqkxcC3->a'NO3b($Q ƑeH@scD _ӫ?07=vCVѧ4 3ޫeT`ix81X80OpundMRg|ʥ ZBq!=hgBe$Б4. 
xGx7ЋT Yl}_׿9Sx_FBxwBGv&߱'\\a٩8'MGw6`82۽I_W- pBo֋xrjRM2ĩ Hͮpi2Q4RgPSHP(  ϥWcq/+IiMYyFۓҹT0NX ?oH^zܑ;Z)DuL份qޅx 9&Ѧ.ȋ0.M*6El'z;56ߢu=1O təS+wOyl;<#τGtoۧ5CwH_G N U,==j.ߗ STV\ CE@7)3 ucc]7y) VKL=_?ePj'R =g@ &)Z;=d6&K]%V]ek9|TRY.dV.gܴȖנB ঩]&2%FF=)>_6K^'DžؾdCŅe;Պ,T@;{x70s2pwpBO*Jf!.<  f! |22342oI'CH XĚjƦ.WIѠE\wJwΛ/З:fΪ G08C8]\Oʮv .i$ҹl1aEoZw1/܍T?]mvNLH]nsF.Tݭ,sAtjr]9] v*e1ðLo_ $⋂эs&>y 4xli/v F+"2 ` |' g?Q8BaI˝-k[lv\. #p63ly /;$xQ@#駧ƚj5~-4ZG5M~Zj.#q&4]J01^\S5 endstream endobj 1001 0 obj << /Length 3135 /Filter /FlateDecode >> stream xZYs~ D >FR-Gr"*Ur50kp,RI^sOwO36+gw滗}w:v~v׳=[䫏V_}߷b%V.Սtj`Mu4jezߵ<3 eYǢpj6NWص_槁ڕ۱֞o1סXtuaj !˷ݽ` ڽjS*kuL`mfZya2Y"YRWj'2WqD[Et!/JV6oѡ#nt~`qwr \w,-.߳DW|,_h5*/2NUPSݐrnۉ7u8: SGJ*ȇjpRV7㞇/ӼP};[b"-"FN,\qX}[l̠;]G"ڢ­7%һ&2,fE[ {5](C{`q%F| (}m"r;5Ł;m}=/RW{&dHܼ- p|d(TTm[ooVHc-D U{ _'O6*mn9xzO,6 ;jC׳99R\ұ@:Z!;b6fAv`[kU֕1!]ũIhhAtcM$ȞThԠ(FD-]lY$UVr};`pu|9\D0.!L,༊_;Mxc;r8B4Utv U„Pmaԝr3W`P)xgR,P(05[8d2zܭ7I,/F0W-eL.~̦^%б.eC5ݪ1=aȾ.Xh{zYsOonΙHT /x?;~B*skߚ@3[ `[[)Iʁ|́\Gq+UϼSF>s\AZR2&FEl 7]A" ]5I< py s!B7c)fh zĎ)E)!Bo>1h/qq+1mol!m:yO$[&)19%/H:vnd?<0d5ˆޛN?aqYBё*'O\#օx-!kûdT;'b:ᘧpq_nW endstream endobj 1040 0 obj << /Length 2158 /Filter /FlateDecode >> stream xXݏ۸_a2E`{HC,lsmbeɕ$wFK[IsHo~d[$^%n_ާK\,n\R &l2Nyy),sRe"ñfmjci= U`E^U.aϪ6[Xy60(N{4W^ʻ78~^DK%"~"8aJBne %MIZ| op`^ETKleyT˸Q{jhYG4?Ǘ*2T҄O}Ї(2VwonsEg{KQxYf̱2]Wu^7(~h]MNW{%'So0 jI@2Uk8dD?,"hA֩Ϫz%L$ %ERpz^wPAu*okٴu7ִrqTUN0\D3uRJ9'É\2xC9^5Mr&˩gvOpTfw?x,ayZ``<+ H KEk>__BA]'!k2k8#/)<W" Txnԟ |c |iƲ G'Y % .CN:<:lQ7V{/7~_HަpcԞhzU;4SGؿYlGa=b0]m k[D=NW2"Oi*#܂7\k\ JS. 
@5'hFJbvRkeTg׷]w}1Hˤ_@2*tW^8Ps6a:==#u+tt3 ?j09J 56t&PBY񤧮ӨȺ!ߜWj6dQ2)H15ISV[=wIJ>,1\8pq/% /6) e j#'ݍZ}W0tnI:k}E_~9@[:jC-{DwpSwv4P_93 Etmk?vs 0:d꼭ufԮ&3T5A9?LgPP0w?SAn98a"<󒉬 yN7+o类 >(p>P`yiDP;gmc)yyE7@+)i|dii?{ 6"_}1]g}QeE9ŷ"u~C(b)0߸B 1[Ut3:(7zR]~}ʞ3(}Egt :|#q')U>y ܧ=SoO[LXw4Rmn'9t5x8L/=@JmwMij޵VwPO9 SBX!"DCptUoSz=bE`0p㗌<մ7Ɛ=$eS\~J35=/ٕៃ b#"+Ye8F:wSR!;sj E]Ǯ2*_z) u}pţ74W> stream xXKo7W%9א$p$F Kr~C88=~;oT3LlR27>zBX &@s&ಉ$b"ӄs5)V kCEVg2㽒L%'23 xK5E2R&\> qC/W5 k"`vpQ\OAJlj< HwԽ1yT &4cObpoN֐X7 lݛ/|qrx7GW~ܛOӽbu9]Ξ)BG7Ef/v\RYeoqީm,y\4$cS-?6GcJũg: V 6d),)sR%+6lĘrd^vi0b>nnBxYvMەkޮeDζ[Xb>%}K:[/W?tb}vubgEwv|8.u?vq֝~uzj"Wf7p\B!4TxQ)<4oȋauM3n|)A2QdZH2^K-a [\8 vOŇ~|bvhY>{ IpUr޷j8s~uqIN[ID[1fPZcK݋bKFFB]ʼntKT?\{ le4̊)i.FoWPr \Z1J Q +m.)[uD%KQ v [2 FC&%8i$%G|V$c~VG9L(k+CׂcQ&8KV֠CQVF%(3Ѓ|т)#PF+`v^YɡQ6"#ڈ|6ӁQ%-8}+6XwyN`IleChv>]=<ǽb9 E+!Z?;joxzs|@{L(^W-8:t1y,nIkxY38綕tDm0*+ӗЃONr\-y&TZ (N *s٦q;OZ0c6W!cD7,& endstream endobj 1082 0 obj << /Length 3089 /Filter /FlateDecode >> stream xksܶ{~dA23,ۉ2JF3IC$V,#DZd+ݞhdhX4wQۺTAH^|ݒIi઱=W6yy=CEh ZF p;=Z~{~9/q0 LѡQM|ddeKGnDK?X) IUdI8Xc-\l֨/RQ \# u> ?~ީzW$d&gp?/@z2)2920@QeŦ&zuƩu t:klǣno5npZS ix H 2#t+%+WBR*Jla9-ȹx ad( ye"/-=:tHɋkl8Wb>+ʖ pETvHe=((a!lщG7q h@T=>wac[nA쑌/_>#NCfR;_-pPYdDM? 
kFx29-+>(0=u1㚕=& K"ώuʀ bC~δ@$$(8Ŝ8%6q2 15(e7fdU:PBΨd}i <- Tj&>`t@adsIrbr~mͽ$82N"=:NrDBRFID^VmrVoޟN4U^֋X#y[L3FE+6vI(elZ|M┗<+via(4XZXUm'[`1rMI[VN ـ}U U|[F# %^>U$"I(\sjbбi7M8StMgi  Xo ˌ5"81 a=P!hm)L 'mɮ"N/{_Pj/ Dl󵥟P-*.RyG!؂=PTz?R{N+hTRI6qf" /+O & D) NQ "(*A^"v5 bxA29Z-QzNc3 hx҈Pxp|{Z7|'/.O+&mv:Emy ~uNVa z Fԉ%.ġD^4U`;Y}.4P9"EX;CکCb0D@a:dFPfc[_e֔2LTee1fȉ{ YyJrず8(#y8RB*!ۈ6!u崵ThnTci!ټjx'UޑuLB!vZ{g9~hQVjV !S 6=.j fs1L=Yr\c8h"C`YC-+Q/.Ж#KU+Y11$HF+jbT4 Tp_&*F6?%6醾Sc3aXK$1&&񐱐]>)H| Yt/<" eD&53=kXY%f߁ַMӃ0tAl}}BY2xAaEg>tYrtgYx֗]qW%+-+E^*n]-o8Mˊ' ;~~G%-dYB~f,kuVGo }CV ]g!> OM ^xqgv5eQKKzrKAweí&$=MS\-֦/}<(Ѧge19w&kyP<|3ѳX(q<;zP2TZy^ ;!^;v :?J endstream endobj 1121 0 obj << /Length 4162 /Filter /FlateDecode >> stream xڥZoD_-֑2^?/tJ pn p3&=,HvWo>f':L|ͼ?:37UAny ):*/KnVywʾؕwy_4ֲ'4>HRQxpQ]0qևAl6_-~/k﷚,uߩ'vᨳ#=߼pPL$0}}tմxΤԡp~($S| pwS0ۢ5xug _E(973)ߏ7K<ֻCt"7E)izR@fzW*xm:gQe9t} ACCP)&^u y rmƑbȜE M6u_nW8Q%FD(*3zÝ9'[.KT{R*s.2,.ۼt⠳%5X^ x:;}uQ@v<;$ó-f^N* ;>?mmS_:B[Ofr/fV9mN ZEԴmh6+X< F̽߱0k3tu۔)j>J-wMQj<ˡ19zPPOUjz)M֓d| f =7D0 Q.9=jU8O6mo,ᆽ_NleQvk! A(M1R3 .QKjdyD=v؎8lG?,1KR7 FxfqO&{q$N{x/ř{;{|55 "W ,4{tC^=)^~ѱ?x%:>q6^bBP(0: J&bL77)ѱl "+_x0=&2`tj/ΗhLvYKLP)3<,[)ŭT͌|aFib͇O\S3$02D;1Gѷ<_4M䛎Ab.̥wNZ"lRH6ȡp XYteyj=6Mo 0N縞 ("0Pֳ,IrPH&NS21 ,>%Q~Dֺs’ohYw(`AG/-MH4ld*֡Jʸ%m`Ut7?_i{l8JCb{pyʴ **rֆ{oPDr'EI"_i,Hej=6/4X=NQj0d[U$.5ղo`A`"՞+[c dHCu{ִB])J̩X;vTO!vqCi”"Fp7戆) /#%Ra Ibʌ.2[xYHw,M.h5znSGOq 7X"7<,Tś/{4W# xv滙3'OF2jh->0@ӆl#0BlUaTh8$9gI7 ]:NQn;OAֆ;s<g%=EDHGs )!X+(1Yql \J)M%EwR='mQQ%(iw $F2XDؘ;Jʋ4(]-Fl\19j?09!*6)˙vId1Ml+Njuwsl -2A˪07]ZhBHq2tV)PdU.ZCG1!T\9"Skt;XaI™Pq &w rf|mmnRH+wm4&ĸ d]#-͛\>r%(wBܮ sؕ nJd %-bcr -nY dv=|9Tago2w9-,YGodw[ ƽ \>"A(*mj"`0pGxɻB^<+3z徺z?>Ѹ],?ڳ jBsPu;AL#Fԍbآ~ ދ@RzbMwH /$v~t(y\V ~OՇgT p髫EJ1|uRj\_ţOՔcMNAljx7+i=R[ln8>ymY#d0/KatB@)V.١67QtݰcxKiz? 
`!|t1V =TU60BՒo~ |!S^5jAwϬʂ*IVb`]Ab\:[T8NmQC|4=$-U.gхkXdʆ|u EdS!D /I,;m襓lvcvQuž)zJC15RDHjTH9EMLE=!Z1#pK7-ڛg ɾpgd-"jRLyx$j9,>C~zF>ՄBĥ?%1xVJtD%ۥ;ˑuqray`/{s]ٓϦiw" ֙ۄ#㴎Vr$c9USX5߄utf!SS hvwL>Q7 S@0{~&z#mQbOǦ4 aIV3ẠigEiX,&dR |6k´P@q:zVy?Pn(" LwЦVN`oO<;wjhRf2]0YI IMaq~0 endstream endobj 1151 0 obj << /Length 3590 /Filter /FlateDecode >> stream xڵZ[oF~/`X؉uFYEAS#k^~m(Rv}s?\(oy7y7ҙ﹙4 f\f99/8qcC٬mdǶ yLɳb]q+~">Mot/|,=?x}q/eSV[xn^ڭeS7?~FaƄpTO;0 89ԇo#h$x즆=o6Q9E;s;_=;u K2,pn*s>]N>#+Ju_jY$VJG?>|zy}g>vt[ml5K $9){|@Eιd*!Rac?ʎF„ ZDD_o$jlv5Q6]M.}Aζ]KY'0xjRMTʎv^74vڈشle}]pT w弞\4q}^o+9{n@)CMT$>_d)u9Hkj_PPhhq0_$3!O2瞞 $ =wSf1Ł3,?g]89pSӳ=Ut|8|VTÁqyG =k:\lv j ).LzܿpIwpw^_4UH><ԗvNOc{XЍ=맣맣맣 =OΛA7(P2(04゛ 0:(N_+3W:xxz(hZ 5ZF[M^6eWvn-+YD['^d x}[?,ֺ=IKͯyQ;#pQȴW_qx435a?C,="Ԇ#HO} *g0r"c N).is#6z?G0&t EDx_i}r) rz~09zԃ$v iW?O1$t0,}ؑ0G8젭A>$'BWP |^KG b+G8ǢkC뒲V{ azȃAC)ESUi#9Qe` SQTw{9p%0hE)!Rb} FS ܗ^y!tجy\((}9 ~)y`z]K> v8ɘ=\g]c\Eךv%p-|3!˒IBy&dZ̥!X}J@"+ּбߑF Znk#oca=ϧ;ab2[K;3X.P+7y|PT%4q$.A a1|礪x$kh;<IRX~[kɎ=Y`JM/{WK[6f*p͖2{ovKmZ Q_ FEŮÜ5u.ͱ4 D:ޱb5wldb=% `4/+/?  
c%#LH WU{v[v+ysC0-EBй>Z;>3 =* xƹdNr}^C7W3!CЏ;T #؀Xm%V Eh?o0܇&.̔%+qQU.1"$Ң q2 P qROifn!!X;\S;q#&^d-"> <$UL B˫[mL,,M)#_oJJ9kv,%,םhf"0Nxaem$_ݡ\̇W1mB "]=3n,`d]VHEH9`:`:6'BҘLG):xRķR5-9Cg5Yqqt÷ įQ$ldb\)%BV=e+\ŗ8lqC=şǔb EM7jgk'tŻqt)7l$5XCVoQЈhZA[I!{ȼ<$ae; w1j4t|Җ(J]_-=zA@$8`ALpp]0[0$*XB{<ދ3%r]>^F6Maѩr U2Z9hC<@%v]-|"ظiJ'8BL"^Q !P^iIDžc dDo,ʂ hˁxE#8b<牗llqYW Z;е msd-䊓7E+~48/DЀ2b*!- +i\Thh%X+P1Gr͙9 ',>bjAJA Tyc`OP+7%Ykb^Exd l!Eis!KmPPB<ܠg_ FqZP*)IE[[d)?l~Ҕ&9jC,sHV)Ym>*aRSF_OEsSY`Q2@6LJ}ˌ NE` s!ZYJ5Y+hHN/@OFc )uMeS{_`/0Vm!Le)|/ t ~@3rNt8E)p!"iBcb>uW Hg7I endstream endobj 1074 0 obj << /Type /ObjStm /N 100 /First 981 /Length 2135 /Filter /FlateDecode >> stream xYmoF_CR-+p$\-16QIԑT>3fbwgwfgYiPBEAhˊ(tbMư& I?V&jakpʟ@46 9Ƌȏb֎% О H[5>6$وw#[2-%LY3̂92M(:)Z̑0Z.B =P˴i  I c&@J8:"07rH#hEuѤ?$Z6-F,,/1$amҘC[a=-{,l%y߈H6g7 "Ye)Ѱhw[Mt.eG#c#\&hkWuRdy^vFHl)bK3$G@DyGNR,!gɈ(J"h v"x0 "vl'8lNTuaXD$7Bx6 qDh8"$Z^} H@pEY9\`aϞa>sG^6@q bz&^!2sQ2I8W4DINm`/3\v#ٔQl fRy_ 6O|ѭGqv& F~ q'.P8a2fCU?E3Wم^67s7 WI36qdy3t~ <źE[݈WK&eʯ1_c~?_; jW(,.B U1=~E|u)&XNI$AV'SZb{9b?ǯcۭHMU]RλUufS-˛a޵/8?}w]vճaU/U=l]fk ՘a<.z ;ϐjs/e;?լup7j+-JQ%7Jȣö*)xBPS9/yYfTpg$߲NI%y$N)hxiTRb1} ਥR'$"uZVV8ߘ8$9}DJ WvJb3(2 -f/H½VFU<472( lK7Cg^/'ݥ=XSv ڣG{!9LfV0{^{!Oqۊ$$M8_%ΡTƅ$#v "fKEBT^v~ގl` '#X/Ax܎77ټ[[*G* ^.IG-2u>*D%ǔ7Mf psլVw6f1cﰨu3<9^{gQV8ѻ[yv[='7?5U,A m_vC3[u 7@#Z1Etd- <}scϬP`J %;YWP}c,E^OS /Ԩ;48 %>>\Kfu@^ &wI5(nSQݬG[岻 Y_rtqX|L[0 H[0 izOBOӗSS`X&{ GvNL(4i?MS}݆k2p2B3L2Lq8B!OĐHVhct%|&i"]sxDnOݞ&=Mt᠋30 (_3@|rxZox, ۮ:/ah3< X9oY1Ls~`7`oګmܵ@a$;g f>v񦺀f4[+]ioe*ٞB@(ŗ8d~~ D co!DR΢Fvʞ8_2/{ngfm.U.=pU=!?LC%S`cȈr 6ӌj -,⾱py) +{FJ#!~T-\ u~rx: K-ζ l À=q endstream endobj 1179 0 obj << /Length 795 /Filter /FlateDecode >> stream xmUn@}WX}UՊR% "R*cZ;Y }a3gnm-n}gurquE,汰%b k%kfm9Nص'gC)O[TnsI+I.!r|rXaȏb{x ggrl:w-F>JvkGV*$f|枿U$#ŹZVt (mL<ȀNR(8> stream xXn7}W1)]-ԍ5,'MkA7`Y2tI~}P1WԮrnȃw8sf ɕN0΄piMߊC Ä]0)x a'3+H9 xɼGα",e.ɹ%TyR! 
YRH g$Fl8Y.`Ő)I$AICP8HO+$H+F M ΰʑ$!QJ$2 )) $M eR!iIҦS) CjH hR)h 8BRVJAȯ,Lx€( p$,|`Fz3E!IHbJkRH৲*l+/L)O(V KX TD4K$&@$Y+IQ ZYZk4|؂i;X3 v(V ׅ"T( Hj@B)CEdQDa aY+8$jdPXŠ V Ɇ⭀<rʚR=I2J!H*4EU`74`P=Jfek{{+gg>a3bK2W c3LŶ* ){ ݃pʶY~$n怚AqPi.XJ),?-,Z>}/{]n{Rڿf^9iawwGdD=w1`p8ѳJJW8X0P )pydE? qEj&7m;i4;ۓCN'uz3z}>_>Ld2+'7/ːz'Ӯ39i347G9@0?'u6&ʟ(UFۆ&s98M[Kt@3J]4f'/hIH?Rf1i_m3b0}O[`oMuw^r]>ozoR(yZuK uFvņVF2T(x0i#+>4*G'<Ʈz#v佥-V~<*ͯN-]?*o~<_V>H;=Rk|}gagAVSz`ngy s {MiO=tī W;ϽUnji:"#K7q O{wUf񯊟Oy56Ys]ҭma@MobQ06lr ߑɖ.&w|>NvhY^uU)lEp endstream endobj 1420 0 obj << /Length1 2178 /Length2 26097 /Length3 0 /Length 27436 /Filter /FlateDecode >> stream xڴeT[5 R;!{4@p o9F\:Zk] 1P΅ +lokdH'loc `gdd%#qXۉ.G'3##,@htP=r@#UO /hBglڙ[>\D<,-\`鏷0=@`dg ۻ-vc )$U?:8;U5 ZN PSQ oN Wa]NLUHUKQL7埴ÍojfN%PZ8p30ӛ:;;Ond v*G9], %YK3pG!\Ĵ94F**l,\vFv&.F.ÿdo)ſ"NNr[4.l2]o_#협?j6stvqWD Y%SQ<;:9ѻxe',7 `R1;S{[ΰ'jQ'{'Ojk;{w;Yڙ+PJl?D́.F0`I#f#(kiv6r\\T7eZ| F+=_&V_)ڤT;` 4eww7RwAD*T_U\cmKg| ǡc1cM?j?|? -ARLA^T/#1;{SK;s3;cLl kH v..W_F8 #.#Ao`}>8?F Ao `Pq4>##GB"1|P2dA^?x~2} fA6d?8?k{l\,|w`o_{J>[a'{kǝ9#'KƏC/J@oaa{o:VNv3LL,,k+cc9@vuބ'*%Oh _S:b5s [4(PE^l/+ɭhWIa՞\=}k$o'燇(&4N%PCBu"WU:ՑAP8}ezGN%ѭ(t/Y`jEwAXA[uyB1Z7, ÜrE֦I+T5hY:6dzUWZ"d4$@O29#85KZfJdPէl x*@oS%ۛǘ(hF\;-k^S]|t)DF&ډuI&X_ Lh L]4963B7Ō.6-s#B>TaE pgYs2 р\{\ H-ՙU$`e'}⤢ eJau0h!*2LU8`/z1V9ԔVGα:|5!CY8טIMR. 
ynE()Quj-8wù0?v|:))蒘fNsRLBcCe`ƲskT B5lr Uw%mQ\ixW*$f6sn=2ŎƩU߽Џ|(Y_Iv@|kk*I1 Oi+"^)1#uO7ru j~qЋiڈ SE^{*K3M`ʷ>-`#`i?(}:^Hyid0DqҬ͖,jsWeU3u\{2lr0'H>ߵ!zd΃;h-&hhޞxrHSƚMv(nyu?a=&p"\,B-<ef:RzN;fW655Z^#nQ1rfXo~6v ydch-@^bY")N(ѣ)BO}fp/làJK|$nC,㟆m㫋ݟduO5vCXrrE,ǹd){]}Z7F6wqX3o~OhpԬƄH>ߛ27KBRJ~8rL1ӷ_*,Ϲs<JՆQ*<=EYL785T˭taVvw6rxȶz&{̧nQm6گȻ\S [B"Oϙi7'5:()[R%}"`MMG H"kI Tg& Gj>aFZv;6 W\ڻ65dcUt*琤b: ӏR/L+zٟb,Y?ܞ^{د)Z8j{YUWp[fScN-Bo xHeWTz;XvԥV=Dd#O4ܰZC a196aF>-h èɥ0)v88%{ 4ĀH._ƫ]n T%M`[3w(G݈t(v_G+ߝçߗ~/b"r8zϠ: خ}9 Jۆ FJs-9goIlNx./MY497ÔP"QT|{YAjp- N\30_F(FHXois%8L*j $e G9bf6>$.˶U/=ڹQF?dɍ#}#XRE*ݗA '#/-d;*@d:UL ߽B 9""vq*1M4'a kf딡Exd|-hY ^;qa ]O!OW[xU7iSD?EŴnЗFCh/̑y ջD94!50B@QK1'h{PMc_b$ϞWV"`[Gթe@e?f`]!i;/hJ8|KKYm&86p}`c9_͔7LpxUڰU3 ̷8E+.o._PXձ;D%[H $Ž!{A̳l@sȟ49UHuFGV:vu)eF?4%RK䖱SO[/l_f̒WjХ폮lZJVNYxADx; $B+?2m qT>)$遁'b学!54[K"I6a0~?Ę宿`<+6i^Ujb ~I۪Cn7Y_Xɗ&Fأ-ܭVDǝkBtFue=#qhئĨk6|;J[[3{d.{[˻x\3;1eς}]x[?Ml|M2#o7_?!AJ3Z 231.& єn<̥K=|+13Qu)afȿ }pEGw u7;T~ih`u nYOŒ@ż9w֧H^td$z`|\ T f2Wz'*~һ.
8¿6[yZwWP +,& wdc⹡sZlme<=g@R!kՑ[ uozJ_(3 k`5އ7b3-[hG5B F8r[Ϋ\-S;tSlFx?rJLsX6gels-vkPjYϾbp (bQ9Půqtyp#Q~ϓ4w$\:vT~>^\Tmk+,29(]qߑZW\J60MlJ|9cI-22 h#`"g_ KE!,Iఠ@R51Q;4L2s*хGϤd}]{J44GP(,NmQmE?uLBm)甚' R)Gj;ymm4vm̻'" j\jĭOBwaUw}rU#,KUl}kUh9 59>Lv$OFG' hDX_w%D_zz1Eƪf4a4DETFq ql3PcsΈᲃvۢ<tϻl{U塬"d&p@^N;~'v~T??DIk5+V RJV2 x͏*Zv6y pvU"@w4Ȧ4.Sx.syZ f#s^nlڢwK`\ LZκ Ί]gpy VkPoZO5@icvmbu/_!BvB%XڂU!no}c7}Zɒ[Lj3W^LkfxLtĔ*#*<r"+/RWHw ;yYb+lCѬCK#J:~Wz Pj2M4kAix]3($C< _'4ҹßc'DխGڳ-zF=[I/ٟzl8 M穖Ōk'|pdJP-{ſԉ5W/c '0waF0[DʧAzE]+>%#2#+#]5x_XV㷙 @}ʮƝt3D23#[Z:+/}cgYqO!q]o7p;7&,MO$rURv"xJ𯉢:ۍͮ=Ѹ3ClE봌|wD ;#9ytmW]"C} QLsنτ)ْ𰉌w9!Xڭ"6nOfG[# fZ\j+V0pM>kEtUnjIJqQYف>ۿ| b^GW:_Z?&K0> mʨwġgƧĠOgZ5 5E\:yIک,g FƁ}8 "Pwkƚ6Lco]i%T#0K :(ކ[ϤqCrvI Vp,%2t8Q.4`=(1/@):tSw7Tjr.tHقl 3u_`0_/x+f#BJ牞Ff,ZO u]o\76hm"6h_I?e}Ӹe>xs#F=JF<|]T VBX$ivx~B.q:gYud ̎F|n2U;&ʼN.QMZ *3BF 5`F=: 3q+M70#brJkO0b*Bk"(9Le}p]0'$ͺHlzm}[T(`ㄘ,Q fPa8Db&)U-KbYzHoظXD ;h0"wv^C-97ʴ1Y?z9A$ڊh6qGeP_gu0m%-2~iV:n@_AJWآKLN&&j̙Z$:^Yh[3j>WUZZlRܧ ,U|h3HDMyiE70d009pQ!j[is_#j)*f&`wb1c(={DyE}f~D0& Qv`[Qq󺊆$jH3TՋlA1>W0=.Ttg99Lv'xǴA\wkBnwk% ZjTP+%I 7A} 0.0DQ_@uTko.҅&{]Z+aji25Mlx .Nڑ+Ҽp}@?Xpfmյ}~[Ӂ0 18~Zu ?!LϥEX}E~Uro~W8I)"n=f8\%):+D48÷iQv j GFȹNSYbP9V&ȵhz&FZcm̳;7,~qp=;9)s%㗄oҪhg"I{ὼ~15Yw$_z+<z곳A6پ쌒3$[yײܙRؑ<}m65)H;7B@Y EeD %gg{v`}L\LV@}B߁BvWܣh.*~pmmk#@Aiрh{n9Zg0asK9/c󕆥fRV{ÚT2A1,71>W bE+4=mRwd/埫V+y2$tWY⷗vďI#d\M2ԠBbQWU$羝ܚ!` *+6+udpnFJ7֟ y^F+kyM1|8oSDWWnQV\eF\끉$^t"nA R'8^[iikVTX] ^d]*hy"ăko^(O2>!I"NQ]}nESk7yɃ%g{upW[W jy x}jZaSqkm,|_Qɇ b]u"{z7QpE]%@lLa,oKT #Tf!w`TV,Wi3t%D EN7|N5zjs>E&ynm]nSxoJo?H6*Wt\`5|YN$s?[X +҇gev4D\QX4ZL@ZIpMB3؂9yR${%Q,%7VO @k@sIWlqZ\H&vt_4T$1CkY]"<$Te5g)n{wP@ዙej~@wl 2dyIHRgFsIc#R//G˷"c<hᖿeReaW#Ǘ^7$?u--w1(Q`N)u0*' Jq=Vewo!ylE dף:Z=KϿӡ0lySP8ĩfj;D0)|F|W$ & rY;^9I5ޠ}'mnLOW4'vO@%Oeh!;ܽRSÍ50irypƚRZO 33h2y͘7|wc)*ޕ.P ӄ/ΫxڴS#{ٌ47q5t^WmTAiIr7D &o @hv/\ _[*ُ+d xVjs\qKfŴ@#\05;nd@0ASn<{uYtp x^Lhl}1CS|vKVJ1G; dpص>ir2$sj *Y J.ukmVV W 'HbhP`Mdmj,Ô lS.밧V0gDl!IMU96mAG >U;ixU>jF&FNcsqp[ {"&nTu0xDt|H(9ٳ5 QHz6W3U$ZKrM'$<<s| 
ܘ>yG^xj֢=jQ(ڑT31یiNtpGdjpb@T}Zda<E\Q$փL޲7H[.E>XAtKT~ Kt퐚FgSojz_(z#NE~/M4L&bv=lMx̛Z3aYʛZ&  w@6Pn{o:sk KPԴ %}/fy]30!GݪʫS79<{'l(( !-z`AhóOR4RΨ0 BlpN酓P9Q&7NJE':_[bP&/ꔼ}sN(]~ݰ{K >Prѫ- ~ %ۭ$,'={=W\BuC觱唐Dy pOY1imtiwV}%G>08X&Yw6#s17θ_ɦk34S]K8٠2}ze[|ǃ2R@W>p!nX6_nGrM\>e:ŬEȟT_D}02mQCi{PG9Fq'ek o]V),H} ndj埀"_>|`47u?.Xy`9jߩev^mC2WådDV(𫙠2A]q Nkj&_;>QrEy>!+>wʒJUҌY_hZ҉-:AfT!{$fxJsGU(ߨ"ex&nKyMEbI.hpS;Sroø=b jDmWGZ]@-%J*"*˰_GqLzfCK;dpJ_< GY!fֹE@'OGPk{;~qfՈ5C:uOs9R95 %~9OqMlè)W֗dwAt+Ek޸tͱWӤoF/Tp+[2M%pv$Ь-e|}b;Soh ?9Y[ދIߴo^}SJAW-ccݚ85.+ze yr}䠍E%@KyW&dS#$Ն7 oPީ?`ظws0UZ0γ%=ɑ|mzםlѧ"0eѾ>+L`SXr iʑ-᠆ep1 $}g*!k2h#ugn̡'"Lѐ:R̢ىǾUcGH q)Xv?呵$` ".}[>NcAc[ǭ}hw,GoPSd$Z'Qd^9(2x: 3 %E~еqb%]%{pB $Ay;e&Ou}m'˵8GZf=L]Qph;D}nWlz*%6QvHXRho1WMҎU,/̂zTUA("Q>Y^.'Qd\ b-$vvQǚ)~6zb (f(pt¡kK|EuHK~&ƇpRj~-'W\qAB[g$/n \Qy #5l|/ciZƀOMp1y0+Ee sb,c$BY^:H~$0趡ۆ7O!0WrYi*"ܪ#)kfVJPL;Sx(Hi%6/dw% MUâ]ry_MPe 9Q+zwH]K%C3?0,ro KU|Oܤ}0 V /}bҡH` 0'f}xɽ ?(5" n$Q X^Gx3>ĩ?]R4O4%L;Q<Ð~08ZgIQz ,@U8ԜfBԬe%E[wTЂ5FC)~Q1⻙q~x)Pts+7js9 gsl+@IJ,-`}9ѱ2m;lAp&Bb|ŗe3ÃKqacn.]NQ2}\O;U[I8\LݩcD7ǶwS٨FRj:1'j9FTD67{xEر]T8XlʼniH iO&¹/4et5:`6O{хutt+vΗ$F\I"$ƌ^/s+&;\cN?)ynYe?wx+?D$J#s1a&V Z/o*x+֛,0o^sfTn Lǔ-CFČ6eB:ϱ mAkcW"ʕ:>8KiѩE<0!d b=ɿx;JuqO$fAN.UVMIBmϫc9Ըb龫*}[0TRH/cΑ\Cj6kPSi=db9t3bqjG7шa⸅jRPuđv, ')Dh4 }Soy8ƀR֑d#V""0dp^])fxav6{0'E&8^ؼzeJI"Im5{d/`ҊhWmAE')("懞[ˇukgU(ݚV d8DAZ"1S QYIy#RE}g2K1aSX+Y=qeIZhu҇J+3؉)Pl ӿ>=[YuΗu1i\?֚<BD6) KѸt`w\/EƐf0}5`w7oF̍jPA5 p.|95GOFm*4{d-qpv4u꤀-lpX ;Uյ}a=uS6iItT'fp[ ΃}hՍN#dpGYL kx2 ]mȌ6 &_W;/IwpG7+7B39ϼDQ CF0~;ԗUj. mxPý–?J,0vCfw|7M$;Pқ6LC]x._[22ADƞP7mg!2D/ etqMT5)]c3Wp t3f{_ĥ?;E/# 7ܻdJhcګ:$byG,>'3G8 gIg2ew] J.k_yJ44T+iTڐ233TPd槛ݣm_)vH]?_O2Av>Hzǫ?6L}-SdC CC̑ `0_H4ŰmIIs$ *15kmܩQ]nw*gct! }o] ϖut8ÎHs_Ҡ7;Lm0"M E \3d F'3e!3d8:<,H?@r=3_T_vsG-x<ӌAbZWVfzu1Bk!cvIupaBSY-$p=e:4 Bpŝ! 5Ybp,Z8:(e,Y!])y^w涣Yx$q_ݲeO%=¶צS<䋸p1;aq=_. 
-d7f\ݖ:2}@.x#E;!AdVuX*$r,Jj-Gvs$!r֢F\cV{gOn06IKl6DR- Lp5 _"ؒn:JwÕ6|m#>VcW<=cprOb=-6ŷdj:.4ix(, J=Ɛ4yS\~29Ĺ0៧@W?ausԋ&ݯR?>vd *3EͼFTkYuUXيmda l/k|ua3Xb&ځ']K%ܱofǙߑ_~'{ZT'$wߢQe"?9V>lΐrwY 5hջZP,۳W<|>6bԪ/vq٨0,hQ"I9'9jla>j(b\ ^nCDkxWCԏ&^`^w a7n#~c05ۛܮRQwq_+N !nGHC!77x&ě2~T9G4N7Րy-JG4$BMO [Q&sFɁ;jHS גoQc%Rff`n׽?52aĩxB$b>zĚCFPع1 갷Fu ]}e>.*aj!|(;b *<4&{d.D|m3ROjS\#.u[ôv HU竫k'& }r]bvO bFMXhH3DSR%z}UᜋjH.;>F9ySmP_(JlАd;k8rVﯕW|nTFvrڜoWmjd#'%6"\`'ZEAMOSMX[:7yi JY}7tbm*cDl:ҭT-<đ»Ƣ֫o$7:$Aߓ-Ek/rѱ+Д6@P*8 ׮ʽis{T7.ް\1g'?ޮ&fB}ARz%z Y[lRNc.J,0܏f/I5#% vav ʪ6? l9mܖ@Qtn80f> )K=q"_=t0~ɤ@dI*CRuơ.mޮzg;a WF Ƈ=l,,^O)& F^Xl Ln6K.Vc_9%n\$PO ;[1P!fZi(%h!yy6W53I|ڬR m5\`u)7?>~?AG* @WKtWkg6'2Rp yLpF|_Bَ-whŜ/y|^7J}Ze$D|3 hFdžغQoOIZti\{3=q?|ͤLZ/_J2.,(U [wub&w[{RBSFT!Q@A#r;7nY -&u?UѪ9`sX~Wy_v'S]ا"Y`/7dgeIȘIJrɾ_, GD)2hYnVV' MaOW/C,?ҦGؾѾcG۫q` i&OT~IY!OE24ƒRbǥvT(lx(0B*Lyd{ 9fE1܃/H2ð$XEEfxo9ю`%1b7Z[krKF?ƫ&L^Lz 9w6,Zb-=óKv$WN> stream xڴuXZ>LCwwJwt34CttwJH7R ]x{zQSPu?Ips7Ɋ'FC9-z7~dW1@tZ΁tpݥc:MݜBh<(j(74O"p*5k9\BKhh-cB\&gaJ 6k-w\荍F"TJwⶋN\"<ϸf^&8_оf5.þ\NU^w9`_O q<}cO;mSupvXK5 Qt+ת hu+UaB&b/楞1?9 GGA™(P~7%ñ\՜,Ɵu~* $\P`ߥuiӠNNEk3+_va 2Gn tHZwQ׺D\uQ,9=qԦ|s *^ Sbeka .fq^)tq s5HTR}'sLP9 cak,?ud (Rn? 
T=@zwl0R;{HX=pr-@L 7k_) OF;Wcg %]ⷖT96Ƃ,Rvr㷏֯/fO!hz%"B:>;Kመbve HݱkЇ= n@OeՔCԃV+fm?d/3g_S3rlNI9 ykM^>C7^`<(MxcĊ{XrѶ㚲)?!&%̠q:\) :/v͕fW`CM%©5 |/m"x{H܊F7D˸&V9bFx]IG\ ˋ|&yNTP2E֋v(VdsBs[ǍT 4>!xWY~-;\Qn ^ /mklɗIUXZ7i8"->7gzWwhg!(_I&|{EJ9_=(3[@hܣ=o= /oSU!ʓVPDl<8c+glTg'/7D ߎvf-Œ'{]䝆˞R1aۄTEm|z*%ղi&Oh/D}E4H,;ĿZZv!wfb1gRM-=rF8T("jyEO=b5,@^F^1N,.'Oxu%T87JD+Qj3VK{o[|!k*'U6 *{#"rM8ۃ>FdnzPAa)3r 9 `Jckۅ7U~BʼnMT0z8t_Dw9B_3-J,wA؉OYrrQ`8a!(AY|e?3锿ڵ*T;5SJ *ў[k0 ׺|D^x4-dFҰnW4>>orr*y~LT7@ X0~CNNo+sߙҔONaz 56#pdd"(>vK]OǻԝJnٮ[``;)@軞Z6 36T"Pcά v# bZ^<iUZqnwʿvv҈6Ww;b-Cpcr$aX͘GSg>Bl*saP N/',DϹY4+>J(^5b]0}Ys Xj#ј*JQ3DF`yC\ҥ;[-3&b?:a9:p>D wω~ΞB89d鉢!RzarY aj].cr_O>Q3iSV˽?!(HodA9п/h!*Tev L_~\P%g,%E+'!OlLѮjcn_=yTَsDFNP.h}LJٽ$, D3c{PG5aLD{\1²"^vWàmu MYPQxqߩPVO|bB( W/+ (u>G[a@W,jӞIq٧pa uF]oyZY3"bgN)JiaAqvC5.莿2W(ӠA;~IEmkFe?r#(6tP>) t0;6%V`_ E?k4(KN㴳|&Sok7SP` ps|8C ^6@,> =ڱ~J:[Ip;>B0착E -*!>B}T5 {B3!8q> 96/q79$Fr"wz X]ūZgܡ9dɜ8hnMQdW"GBfSW=wsg~akg\YICvmwr>.5xZ@?D dz܅ԒhŊFX%ڗ(ϊܟM]nZ Yl4nŲ"B3aěQNkԸQMxTK AC_5pl;V .${PFjCOA3Qfk10IfkZ=pHc~+Оzؒmcu1r>4B4>4ﴸ#&jܿCǵ7ӏa}-ǴWd}0 p`OJjǣ'LX3Tr+s'b.[sRTg(Ҟ7oN嗐=Xh+qЋOq2L`[dMT$51h) CSTU}4.R`‘a|xdOםiǦ),%G w"αWD3só<E)0a7k\ڰirv{)ƁTE݂J>{07"_h QUVEb!OrGo9\~x}&S)xf X\Q"a]>Og.]ZMdrvksC 'Ķpv <~ZbM=FQ*1&xi3IbkRs\-'S뜍7a>Z*91-C``2%;B!M9u:ΛWva*v7ܠaD,ǨttSLqmn|~,)Ս3kKdP#6gL~As-Ӈne>zYD'UyQ{N$U0@"kwp1ghw]ܞ7eĵQܥ;E^e? fMl36PEEJ1S"dR/Uv| Vl織-W46=X:8Ym{T+ s T^)|M&Fu"]pJ*d~@jj٠WH<;>oQU'\ov BI=iBfvE4C}.NV<ڳ#g7mIU!co%WwQ\41$LZx(D̼q8Z #my)M3ӏ#pQgkwtaNZ c&.gĪE؏*k/)ᡯFX_CBk34z_ `LRnI s扊&繮_u-$IqHv>?U]dž%} ڎ'03/t/cB k~! JLw7HqSn`pr% v)Dp2(|J e8pxĄIԤixAG_(SFMJf|%S' "U~Ď&vb-І{6~u Z码N*cH @Hg8"NW;'Ь{4y¡0X1tyekS=B?ϊ`qCis<0v_2fٌԗފaP$7YDLedLNJYh̙AndFTӠE6ND$uVE7F |fvXY!}cܪ̏H T09!XYTϪf2z!٘OU?Ԩ0A??Zwo#'S~F uŒ|sO,/eŃAFaWyA+QuȳP+m,/EqAmcQپ(?`cgFF49e\<}!{d6!€6f˞ 2֣Sl!QU;[݌B.jPi;bi_hG<LkLuȝ7eeڿu!6a[&&d$lQEDAvR~XdV/H~_!Dɨ~)9#p`jڼJ4|L6RBKMpssOV<Qu9 [miK-O!<^e!r[\"I=8p.7̗d`wEKxfjY-qKE>O6A[8\)ԍZ}cK1*M#Bc)h'>\͊m$d K`2m\'`]"vGhB,u+"bsrᒍs3' KVmт1=-q6yT%>#|dJ PK:N),Z6CbH8K'RkiGl\JI{aJ:E-˫ADBN&) QV._1FD? 
~XeR^&ҀCnR_ JdR=hP{_:gRI[Y|gu'2wC$e 7F_J|ݜ}SB9a$O1Di7 ̷4|&u#~SiV[I՜0zиR{T ,acAv7׭ݤ!dYE)0]\Qny6r d 6v?D2⁶ÃT"bs}}t$>Jd>qĝ¨,TGJn/#I >jsCW@]>j@m8Pl46ncE~" P dmD¤F_$ *Ի8:kMPY"YU*.3zy#~uy&=Gć +F P.c_?lD2c'S8ʢ"0 RfS\[W,4mJ!-~;uegS6'bo.Ljj4M G(cx~ y[A=S.wb:8͹t?EGnftTnv<-l:,N[,^}Ks.5ũ3]HfXrLP&nf-$s_ɐ(tIu1ڵ@{ʝt`W.۪Jj&"bpofi9ln"L(}dNe {8R-.JxԢivUH;o̷֑bC,϶J,v#޶!òx"g4pXB\s _ pfm-H}|bd=LK鈓~vMAqIBa2}tǒ̔.f޷j#d&=$'d}(A!ԻLΰBr[ sfWK !.=?d¤cU`ҥ̝VܠAaOw~c56=3F9 Aa;nNm k$+,kUuEU6c\  ׸[EwNjS&O`$[h~pq^kOv``[zK좴~&bI|.^mȮV׽Lʝǜ~ -G޷L_7,/*)U3V++M$s(]үJrh6I"RsH1G)cwԝDRc,-/ٍt])b D6qVL,uEժourQ;jlF v'(ffᇇv,\J@,OGar8;fj#[.pYr\[k,6R(ٍ  s5:$rU.krc18HĄv|rXTJl2ևj""pN0d QfqX.;; Pʒggj|t<4}yih\xD/XJK$y,dUSrop A`Mr}vqGgee?ӯm_ď\A{VEvWyehO 9{c2N}X>!{TæѯyV>)Aǁr_ B؟t{ZIw0̬BT8"r#ISV-iM@ ;vFV2q}dWJc^4KL4hA#rH`IP}GOCObq=wQN)2M$WJ67Tь8r ?#B2k .˦)X..xH!}huTS M o W${h *z}2tme'3Ŧ`(YߘL1$z/9Xv(%jJo؎wdoI?Oji2Kq#nvK|\6#! su*Q;;~ϖ/6<,,8߶$2Mdt n-O'8'ê N`:jxɑR^HS{AK1DkbU{*lЏp~H)m#Qho|wj4 L\z9;S 8bJ3}UIzi薵=pj)?la;ZZ2Nn^J+q=߶_7y5aF\]'\J ur}"h?m T*Ǽy+ 3鳎 -TQgW:ŏZ|6'˂n$F䣪M^r9#Ҭ{a'=i* ?B  [4DS mYML@] b60bTh~qVrQwi47N,%}RF0D7kKZ yjGS$' 0}3`$1|Q:8G1kqoUSWU@=,P 'TL-y8ា`p*NSz5[>2Y _\u~me 1]ūȇZnK0{&i2.ҙ'A@sAذ^Dž@C3N@T4_nM.]?N( ))`~p&(,{?oFP׃v (b{R'3B$8{D_s+7- ~,mXy1SK<(rsCť=|)h V߅KE6Yk5Uek.93h 7 q.{. ^Z@@\ GhJzuӎ6x\v)A!Deg"rǢgg~w!sj-s5I43Ծ.N`[պar[ʍHj'OSazoGnVFLELk3n+%k4c?jmG+ƪd!9x!b7 { َ7}kM̺;=HL.'2)m}k_\<V+wɪ`󀝰#s.~r_A1BX"NhۓX[QZm y^)cEdwĽ ?K6 Sטuۄ,5֪lUc+1Wג73kzTp+O8QJLM2V̲KyK&,yZ|<עpoPvc]͜uHk;Q2:K?RD,UY֭/p#H^~%:ۇ \~m)늝dXedBw|Xk9:# b`Y!sP4þIJpHYw˥FĶ8-E1;ڞϸ6dSMQ*^ 0Z!QP|c!KњR}8DP4ȌaG|sBCrk11©d4;+F}FNX^_ d+ec%P4iq9]a ;eZ(EA?k$-Rq )h@sܶI ggEjH E e5@`i7mApa$dUR 'LnWv}`h3zh@~Ѽsp YJ$*k>{ }rNAdBkMke W@x }u-wچ2sږM=ALeq%ɍ 0TB8>dTCNgTCѕb@ַP@@ L熕6(Ђ!m1>O>dsZH@6uL_*3_RO`Eb!u*\<֎t+a'7 sˢv%^fE1* 1LP|+s]vFjf 8!鞻R ` <B ;KޙZdJ2Ca=(AgTbēְ6}m|Lm&z|η D9}ęڴ-L} oη!5J? setVg)fj6/5?Hx>>Z/5(1c6ڴWZi.J^KbƺK Ou 7?n+tk/HHE߰C}c˗\Pn .IM7zz]AogIN㑎۵ `L;I>ObpѪ:o&r8~X|ݞYUU56bOf*,=/eBR':N§e{/X%U|f(Tbj-EvWdݟhXtoY2˶x/dD? u1=S/ֹJb r )Њ:˳3qԛo,WȲ&/RpU 2 q!ݺ ws ,5DZPDG[? 
bml f`J>9}H'T[HG~ aZWݩZ@Lȝj5^iN{AŊC\Ow?َ(;)tj%Y`|G\b,ƍ `B,BrVq$GyF;V/o_3j`_kf* \A7 t%Gz"Sp4 HaУ4x լ۽̇t9x7W.Gy/(Kj +n<^|; )AKg }p;q TK,.=LIBoV+3{6 w$WezSo/0&ϚrI-a0YA#-|w:Yo#Ǥԫ^1 `ڎ +]v54BKXqH 놋27"+S¢%\51.||BKKL%jlWQ:tAKmrK=gOÐ=2ǣKnm4VQxf@c1dh?؀xN.!)bP\2O wƃ@%F46~.~й!'wɔ1Evi,^0kShH"FMܢt 1Pe݂b-w'9{T@/ _Oư1y95ET+W]o»{=Xؗ3V#V͛_ut"5rql$\\6Uo=Օֆe@HClm%mZRfjt@{%"ݬ`AD.x 殝4YۙUݽ HfV@J+.h} U#=b̐CI)} {t?4hҳ浚BJCV۟A7򨍈=R [ {͵x&߉H5Aod3\Ifc{(<ӥVGe&ȏq3AdodM%5bAϸH/'WOl!!W5' J!T6 nZ{9JrA9ξpI%80 T0i&s+Ӌ\%%ݏ'ǖp }’uEK`kVZ򠨉=<}@10)" ^#C'.^3/>I~ V.C^}4̲ (qt6~(/LիxM˽%7ws4./Mc^QqRP'K2Y:CiXsC P·@1>9Nt^TzFQ@FΣ։MlZ;;*I:/I3 V!i,,<ǹ'<&бd=γ$T*`L&HusFև[)|cL]jJ?<K.t0\=L{ Ms\f=LX*`.naʐ=pf':@99fRCYhyQ>@ ~ kJl})^ʟlHhRɆ!]R!:y&<-eL%"@r:pD92 b2[nUuo8?;nrHZ>rKGʹ R-2dv{(1 a:B啔*#g3sst8qFU{Z`"^SpDk:N⑿!A(K}NXF{GivX냹Ry"&֪)SR:{z+5rrخT7"n6Xȯ]A4Avbh1RRWq49 F9ߣ56gEXz"O$ t2ʣV;ʾK5q&O @ 4+Wj5ށQ-8Xbm됝P5ڝ9g0Ҿ{V* baB1wցR ,cW6$/^빎RE$d7 y|,P,g?Jݤ@VBRI#S%I-NyԶ¹~L99v%Fx:FRQ D #Th G6yNR|$\&/:zAYi-45O7-Ț٤MYH͑JhX5\4)YC[3ۃȣ3MN8j~ɒ<2zX F|7nD_Ŭ{֗(_h mJ{:8A17kF_zf/䪕Fov*iRR[_h C&%N !o`̉a9œ'eoXHs>`# `o\굙XeAz3;2cӸ]qmONwAmR7<*᭴BDƓw=O\"XZnE."}PbB#PαldJx֋_u_YCj9b h=*ISx|Qk+E{GenL$/2F*⛴Bz\q'9Id"jM'^1!; 8s ;Vx>_q⥾Q>yq==BIuN<wvٮ;yrfwٝhT=˧gj "ߪs }WGBΎWf]\%-*[hDm;#ȅIAnA PMZF[M@TͲ'<д>kWgdR%)Moy-5(\7g((:λ[hbR*pa7B{a)NfVk'MCy:"Кq kO>8JWK1c#k`R$ / rwcL` f Z1+p5!<2$\UWzx`TA,MqMo/*& 0Sx*U輹{M=?T:B14R:*q,r%٫W 4I˿Z6 @m9 aKLleJWai[ l _dW{\U-˫PN8sn`4R#q;c뫴*H.OKCED⹱ U’ya@"rsRaޟS~e5/MvQHQ >鄫=7õYj/oN9>a? s=.$y=ԗʟ\!+L`5ݶ&/mGKu=i}qy\-\m)tz9Wz 5>\W d|ua ͔r1o|>PhLh˨{u9 k N K̵3in'T?i11?2qByvtW6 +-]<)Es؂U2Zz_.~D#HJ 1v&۬-.sxm ^1e| B 0LgEFN,_fHwTvC>lG2V2I ls7on,j~MF*k~/qv$bz#G$$gXb40t(38`Tkmr \9IuO]Lx ϝ34.d; b p`pб!(@BbHJh#ħL-7F䝓M-uH AH/{gHBhr4I_Z0Pu*& 'etG! 
|$>WI|83|_\wmZ8C>1.g#LV'E~)H6E1=*bTU#AtQ/{i<ylq&Lb̎dBݞ] t(*%JZ^U_KG݈(r_GD;e)Hg34 kffM <-eEn-&jM&8ͣ PRhҿbcT?ݼu=-d:(j*Nئq*; }9|ʟpz[Hx0a$@feH8"Ѩ5}:Ul AH"]-78M5HE!e{ }M%Ok֣ ~uvjBGq0Ga44~wi)Y^s8ͨ&+|Ɏx#Uz$?G -aA&M2$[z :~]:SARӹ UJ$#}76SXMcY8c$6/Cf/U0 c%zN6~QU[DiBw baJoxFYU؝рl4NutFSő!*hlN VV?PT䃑16YZ(ƮPWΈtc`@CW|I:ga4GNhBH<@az1ÐBo]9>k`]5] *yI_( #] pS]$7I̱Q⃡daAQM6C쳴"MХ#]yT:xT/stn  xx7B }B%on.GN Ĭ(}5 H(c7;ͲCIJ&'-5E^elE2'!]}" հHobD-zKW TU W9eJڭ㺺RvdGz|kEqbK(cFB;8|`+~aB 5>ٸX^ߟ' ;40]5}~;x> qmrkM\N F-yƣyڨ:*וM4YƶKe<< Jf_޿WGݩ:km7륥Rixs׸5reh#o8|y\QԊY4|f{K񶷖Tc}4b95XL`(59 \ Ak?2{Xl77 `xrDyݴ"`ॴeCv5&-rliζ;-R !P2@Ú*c&ʂ|<ؿS,lV; `bD$(=]Ay0,Uvn h6};GÞ ^ _QI5=:Xn1}f?@N[ +$Fg un^IMMN$`N *?ko4I~rnRbz{etW~҇M~HLZ5@j@bE?(U]Ƈl[(.IloaSLݽ!TX?H!&Qp7e'-ŝƍj"j^-ӱxt4>jfīlO5vԌũYuAgN#fEY:h^iҹ!(7,mfsFHXOR-7dA/}b 孒}/`XbIlD&!FƣNeS!$h\6+P fϓ}3=9o-n]/{^[g!F u?l:/Ê{ć1Oxe*{+={+<X)-IC~Lm`#4rT(iÖ3R[ezI /2 ʻ"j3gD;s%ig#Dl3@oA:d.TGv8;MfWH;1eXʙi3VeO"0M+{Fdx{OJ+gNVH hqE7sQݩa9H}^c +^a+7O@#!q-MD7?0+pM`"5:QS1!xK3Z_r _Ʒ?}S}Z'Lpu1 =q%s_ɇ魄TX٬3 ~v{% Quکi ۀ|1}kk 4E&,:laI\hKQ*J̢]7,r  KiWB`Gm:y+wizK"WʊF( !>QP=Pp7`933M%NI֯iˆ(Ub?%#Ћ xtz-<hXBx{5S]Ng P bhO3jfߝVq_[.2䚋4ģk 0AN?LAAJ+O[/t(ãz CZ:P<CStay!t,Kk80,G2_Ò8bK{ᦂ}e[H+`/Q|~FeS3eu)"?@LWMGc'Pc б+ԝ4 qI,A f}nrf6|?y8$osu H Uv>YZ>c׾60ɃrS3C[?}9*ewLM$Uw>S֊ظyĂE!I~&l}8␾$L&&} UtxZqSju\"N}MQ ?96po-,~]p, ?QvV"`vnw+Zs 8Ag·eD2#Uf#پ_}M+h?˴Ep)cypD, 쌠n=Qj\b,MnNH܌,n/v~bN^G)rW&i2%uI}u`nh9PoGXB"U_zWӇR^;:; MSFuc?X?yĈN2MQ'ۂJu)ڇZ> 94e3 I%3.;FQs/ jaE=p(@h&G'"g=1[rv1r(x^Rܟ\p;^`=GRxcT| JcGTgzO\[!AﻺȷG~eYVEʕj] Rw8D3Ɂ+έz? B v 4,̇VˣXmiRuBOb)p ~v1lZ|H|$F.J^~R#USܕ>l;MyK.~!䵄 V gr6L|ɴ1KhZױH>]4?aIQȋɶ8*տ\-Vi{H+%piR'$fM$PTަ2],0Jv) m*jYt.(z{l)4qeyN5i磨ZA 2㯣D_I8ZsfKP9a'SF- s7)]寪9Gů"pyqUKI)_x~' t)? 
&&⑥n74IPT Ohi}QNg]]iG:^2&+5HuG@-ɀ&75%2fL_R:ф(^Q_c*nவ*" M .ENEWGo# ?>b<Ҵ=A9n7,AS][Pu4m M2&A z@Ph9nS:H'o5QgD88Z24a5hnzrJR% uLςOta\~QsajIMS^R8Jp~7xf/>-!|a#[Ļ&ORT/B 5bքU0PX a%sWdNH[a(%e}fBqey]3 R84D8et _-b]<#g vKi`g>nQ}_a!1.&Lf!{5ˎ3BH@ =ei z~D;^Q?'/",UbM,bP+[>0PύBYYH]bQ$5lb!cޮrZ:&Da"J$ `4wƷjV2vsw[OګbIHF3=οNӹmڙ H.f7l8e&SwRv&yg_g&u  `/1b릆!N$Y{M:J\yy='78Bzp@{j 2xā498r.L:b[x}RʡGTxZ0&kZs95fCW U&-@a_Xxl KKxRy-CƴMvހ)O'ut_fg< EQ:}7q-P|zēqP*@bQB5Џsroí흀Xۃܴ7VszHgrBx*ok?E2LR-E=7/4ߌSA5 Mtf%T,@9=D-K "_[ + r ~EHf6w=>SUCJ 5xrGJ uϟEB/}1gk`KQGņjWCu_>֠oVqsFlIt' yMYc\ yl B1BT0ƈ-QGp{_brSx9]'g"[Jlu|ATc ҧ%|ފacH.sc-0ksySH\# :eݘ&*rC;zglX5N~$lY>$7ʄŨnTURRқ$xpM-XlB 876?:w8mR.ꌮm4ZzC.3?Ey DtK8vDZSFm= ky }ӗ ޼X`HX,=d`+'W'u \q$GSd)iH] `U*"v,Y[TTn_z2hHW vGXm&;SQHGH 7^dMNtI"r.pdc]ZL 71l鎘Mt`MpUr2.j-kiA`ax2i8(Nʤv:04vtXx]-{ȴW1Kyi߼EVC1inj",{0/Lw jHK3tW#j ǾjOE9G谶1|IКHzg>#FR PU#u>l׃QD/Q6^]ۑ|'_~D_HFOa/:W9^[TdikYEDrohwMܬgBD0SP۶ݯoevYk'wNj;GF:TM^fcNbջx-nD?Ή G ScB5D=n8itpTqmW@z̏2hm~BC$ Vn`o;ɴu){CAqw9d>}SGJ+-FIkBeaHjk`S$"As>|Enzd Ee ],Qҝ\羃39ZkTh< LP w9qq4إ)cɊ)ֱֿ=]om[%}r{a % 1c%3z&thUon4ZH!uww쮈 >w`T H$Ov)$cO@J P`83]gJ A7#Q2u:KѪDHnVLV9`ϴ3bvF4ur> Ӈ[?(# c,99=M?bQ/J2vVL֨f5-ԚVi~֩BkE#oyndfGq a6v@/4\{6[w:)ݹRxiH_e/ Nmlz~nU0 %YTCR\QW쾘K= 4yK[VP0Bxm5Z}NѬ[tFar7 +#*;e4(1|g0B.ԇ.r2 zؘ̥"*ʁxhqLFX\)豽U~LkU/Ǜ7V dre*4Xqsz䳄(W\[ʮyM[#Uz0J'gyom{WR6։i @~_y;Y5Ib*V7tR7E[(J.,r*H7iS䬘XqUIVYL`+ȧz3Wwqr+M pyr|;J$qzթNN{g}}E$AUG E?GG&GKgzH 8҃{%KPA?ύ.4h0;ո8͜0gz~(d+Rd #I)0ugeLH1@ڈztPҼ\X|L)Jf?\N#[#[U}wLP|MӿLٞZ´JWCk9{ ol.] |ʬF7Xm`c$72A|J?0z:9Cz3` #5AL{VX^%cGH \d1!p4-w"J$k-X(m&'*L,~me XkoŖn^,j¢@[VLCy Wɗ"+Zƥ a"("-=a`#K E6kEF7IAue%) x +VPθl;#_A5tBQ[tM]ۃ=m`yIayGn<4L&Z0ϗ5|;%/NG]"UjE{=#Ug5e ,-cf-'rgJ*2X$ZLeV)H#,M3/?5b:_~@YYZd1'G"Z'P7LϡyfTZ = $y$W-)dž.}/) n@I&,FĞ'5> };c>OOĎW( VT9( uSm3I*^ʩ"?]Ç5O0FLb#8)D,5zKR))jҙuA9+E^B*-<.@SO0|0Ir rlZKN(MB: ÁJ+< |Y~ȥ. rѬhVfn+ح\! 
4x8pcߧp}TSiC=pdcaJ} `E$` >{ 1r#KGSs 7_颪u*}+vd\y;=Ė4Ο"ظCih~;oY6nۿv8(?EH eܜR9XOz%,d¹,ЯĈB vplR_dǶ-0k2H{j`M׆OF?$%egr.a/1o,N/Y<y1Y6BH7Y3B ~D+8J>NO̔sw5'Q' g<N1h0^"l@Jo!yD\wBK„2ds9X5W֍ď͈؟Mv ָ UDn/UY+.&&ۛBjh7J}P}µt%Guq#&߉/ ??=^d_ *˻T>,wB-C;jX2ߑˁF@%z2Z5^#M6~ >NudTq^eּ(*?l]2Rh;ѯ(i{}e$-)ϤVƒ}+Lk[ kxp_$@i.tMaDuj؉e3Gdv`t9؄ dp-i2.7WTZ+ZfAK/yF/H M`q_(c1MnpZ0Q ,>ĄMp$.d+MM!VMk@-W9D}5LߠFLY8!X3d{=225#qBK o#IPu?Gq 8{,"Ɣ]6_h"/ $578|Pc,/Ps!*@DQj=Kefxs i:gxЏ~`)S\ Y2\g`FUϑ3*A4q]%v$Zw@>,XV{Ņ|!#+NyqW>%DF喑|G?hj Kli.vqv-Nh9Y؁a endstream endobj 1424 0 obj << /Length1 1750 /Length2 21511 /Length3 0 /Length 22628 /Filter /FlateDecode >> stream xڴuTݺ- wwI)EܭwPwwwskVR63J؁xr*vF .z# \hli3r8-&LLI a4{Fj@f? %;'gzc#3dn R{8Z[8JO7h%d ag(ع}*;hadc3*IEu%UjĪ.vՋ$@LXAMԠHT>7(}p./&$wf+o3jhgO=#39?YX:OG b\@t:[,M ' m?OcD8i/woe,SRY #ɇ?hJQGǿ5w"v;ӳ1r38y/n}&v 'K'ge,mw{f0yai qU5z>18;7`8Tk'Y~ldq3K_M]A.@i3:@Ăof eog03qX?^NF@ } 04qǰ]dfKT *ǔځl<@3xF;AP3gQKFHǿy&oT vF6att*Y:XF?H8|{uoIXNNVL `T K3LL-Av<ӇX^6##`0s{F3i|LF0ϒ ?K>bl, `tp,)Y?Zf&ghg Դ4_.FΎLBd?^M h;w/zYYlܜdaX]J Q݁&Kv&AVߛB| &ˡȹN+d!';prI-2( x}AZAX6oIULv|} Ņr42wRkMgŷǎE;Y~]ꕷA27c:ڠ/w,Nv;?bF /|84@zBy5X6Sˁ鎯,TJ&8Dς3?!-[aTmmpq<8Q- N0R4wIoRe1b)Q JnV>BTpfN'6WF@IO,xuʴMFE)Y@]%HF"Or~(CYK^SWm(]|X+^X91 $% (R[a\>>|\%"k✯Ss2S.4 ڋMM04۔FH[}ab2D S~;zuFSF_šb Tt;?}*8&1\) ¶ 䑖QE[D-;IR')'*tkxC.oGyяQ%QƂ3QC!0/'zP#u qXG|d]%qب͘ߓ2G z;K>mE%Ec> &ՈGvӋ@}ȾHsvOswH` JCt"3dj}"ǫaC]{Qo7?Wn%{"Fa;TCP]8!j៵p[2`s:Ї+ OOtg0%Ko(ȆTf7Ȯckp௑J@lޑ.%T@%g[ 5_&,Mhw-)k~Ý:|\ HyGLyކP8@fX\evrчFB_=G^7~:Wt􂵣 _ \*8W5Z_%~&jaܕѦ1N.j-g ]d :O61 ;ĐIt;rRL~$mDt1<*Pm@j:RliLm_@4%C %jܼԅ@1CF-p^^#lN ]Z+ w(bv"ZՍnHFꮌ&NXb5iŪ%~>X)U?)l)x]9ĤïĸaC]Z0vQ&O{_c dmfU`g/[u==R{Q.W~HbGm'M=ʻ& !QbaBQErjv`p|1+4 g&wJ^3Rj":nNd96&,XqVu"7=yרoT c%ӥ]^M[ d"BZTpfe^;E>;*w@DHlr LvBW'ݽL!eCsJ1>yG4pѦ (!hS?w1tPԅfŨ_2=&|~58,Hi FjkƁI"k߻+ P(7WJ36fJ9>iF~K )`7nQ`G$C `(Oږ4Az>_cVhW}'rIFwx]t̃Y ZGRqBHϨ&txŅ¢ĖB j(Y&zDM:=qnqb=I"[x|fmy_0<ܼʠI?-0/_{~dy-M/9=0Qا%Wp:`G ]2Yͭdzk͹L>{<${A+_w!tś03Daw̟4g c -krκZn[cN.lY_$SKMD3ҹ14T0(7G5G)15Iu'QQ+'*py{ULjnTAm:AkL]ch}4fԌcC?"`V](*sM4ɆG~=& 'Z" V 
aCՕ72tdJ9=gv;i]VaaiO7f}=_k\xjߙuQ|Zxu S{A 5?kH оTʸ1/ϟKR7`ůr) ZGV#PpBM(KNkO0R9ÜDP]cU"f?J?) z&*Zhc hihTu7-C RB.ypwͲ&M>nاV2N+uϜ q 'Rn&%_Hf%'V?IAnpR0 HfxKWI"/:4y5^ G;Z;FFm>=?^ " RS5%}Q{D2"DYiΉ[ǦF@Ő'LcX&҈0}J~W;O@|fVY'mߕ+HW]j@ϻ1t]2_ҞfW]kJ z5cdQ6E*׷2 tkSV8̅p:Iz~nLֺzaGG-ۢ ZfڇpM̷XW1 <>yGS`N?&U㨤LgNvB ӚaaŚVQ)m wUKc N>?vz.o}ZuMDYbn&\|k#<T;XE TL;u]b2"1tZ:CE8kG}vw(Ǖ=wDtg9'$4Ǯy[tH%Q~u;Nx{`B毢isHja,p6FFed0ԚsPkrq+Vo XT)j;l>ȐCR'Z3rk^:#mrgQ[nN 򩉛R}5׶EtBٜ ʫ ) >YCPBEUS(v~txF,fWm: a:)o,_&7٧8m_19,A sGjP! AÀ4fS"Q,B{εܬpaF'Tngt,w7$USބR)(%*DTMJ~D0?zD_- 3䇃ILYg,>Ǧ;J`\p# 6EM"`p3D[,wC<rwڜe{rAzN.)'_ڐXO /`gȬoz2x,Ǵ [N/q hQC?)Ftc]JC7S8yCh! Hsqab+6ٯҗS/mQjW`\`3/_$Ԅn*h$HI5(Y6f%$HRpo3g^G0zJ9mq>:}UkdI'e4YxWXRS&{?`T%->UxyԎw۽8@Q74+ 6 9\6tS~%swzah%t$ ?q2Z1V45WY|@K؊?2U{;B/ΐq뾇2G= aڕw}5e9!p>mDN(9l 4vZ6eZo 5b}ђYop!ևwGn: ~ {7UZ[5NeطNV*2"lr;Ql1i{=ybEQBK΀Vplٿte-OizjRbUn;s;nx7'A *_Me ꇅ!jZd7uhc(8b4f:J ͦWtYW97]<C|9HXUAsVR>2'%uAe5ݐJ|zg3;f,9ϯMNEԧr tI:~'DYU#lHn}*%8~Z8u6ʲN]*8^bBY|ՠ,I[}ٌGڑ<;"xu}kW8Ü>v{z7}!Um-&Tf QT:a=rOQzwF |X"+αk/  &xϋ'`[BX(:gD&\zɵ_,6jyu&C M. K6"NkQz p#Zg1"ga)Lݦ$)5~?Qd eqpd\ڧ$ U9>5/ܟnQF:"z21=4U]x)^.jP'ļk~xH2'1OTvk@)I8E +7 ':~ϑA^0$ T^,7K<|. 
{i$$-QVޚXb{!+u||!, Nnpe<6-Xbd͂ R&}w7)DGNiPGb<}8@ 7eq,=8kZz9U^6GHM숵 2E `Z`aȡm%Y`̭S,IZ3Yh=j@]D-֙𙶖´?&ΩJf!#؂,˪ z;]M7p(3H=w#Z1KP%XOރ'$Qx|ѐS5`؀pE3zi)r M jb+]7 3I= 2e} )SEB~7[l'8Y p<2Dtm6$i*GJMlA{dH2*z2(c%q(ϳ*Ⴏ"F!գU\@'›y:H,q,MR $r(njo9ȒF_տv`9E䌽1MyMW{OsxEV!doGJ0.#fO.bK™F<'+o2mbھ QaĎN[=rB!]U*.;2F[ oRb?*8:DotERa[ڑ>ÝL[/'\XmӍ">U6HNI0Km?LU02m)|ň3|"EՕu 2OkZ]YX3 M g7S,]_&L 3e8bv8&M`$ey2Eר O, 7C7]J.bʖHH )gf <ɿtBUK%`ܹbG1~s4r~?*9X2T=So57)U4;ExKd蕇 T[?ײ&G<լnYHtgLH,Zp:ˈ1^BWZ/7'/BRR,lu_𳋋)~.aҊy.^x5 Sb^Ojk)FIV~i7Lpaz;{h!Y/9˲du#GʔruߣtP&6X?.I f&r 2>=CYl%abGZ)c[5xJCm@`-ӘbA_6~w>g:QtJo T0}rN;wR/pUi=Jy _ܗ:G-ygaKao{z)hYD7T312EBY e X rkYwI5y~bWq`~k|h'7pO[ПY\VA)-te%!$zg4_~ŭmPi[yk+ͫx<+VR)qXQ@*Ox)"qIbvVˀU)vǍ>Gȿ8\׫ Ӥoq>H#7?o?q]{zOCpfga v\9|R2{m,4-oB#) ;-ǝJl4؀5>踸"1G"!LG\6zgЖ|C j}LnXV\X{VnޤvLDkl(/˄-EJL 9ll|*m׆ qr 9N?5)F9O ^pTptOm(||kn+-ַwUNMY%?Iw,qĞ5~ΡlYSbn#5neQ Ot;2;#-W -E$pU Uו14._)Uڲ.2àg/{壁EvʸLX7ņk&*1 0r .Et CԔ Wh+M~W≎X\[x*WU{!B(5t}3A&x`C@zNZ/<ݷ2!džsX@p︣ThA,Cr6sýL?w!L6u՚~ThTAvSI"Dy=Gx:NhW ׈L{̚=$5Xy1U9'Kٰ^=ƙiN|"# hp="^٪8p=VEG$X;oѩ +1߲AU)BV3YΊFf9Tf[i>z參I7^(W=ZA͞F5nz39g*vG7pY#.u8q/F<*$鬛&Q0T #J E_Ƥ$H}き,xIP@$ۤ@bQsӆY=V7QS,%wh_#tjU_beq_eyhdm I*,o&Ya 53i+S.~y G&y`)+`B .ǗԤu ⲌZ{2`_>l8U =5>HnyH@'wݐA&W,n92v>>!c)=Ky{|1FЄx!'ZQre/2|i+%" itnvp7=V}ǭWp6_mDࢷb^C;[̉gJ!&ZKjMA{yY[Y_x,!R({5Q gz;),.Ij0A5?YZj ˜[q0VtYMXF[qhWn4$RaA[pLd G>?:5G5=9d40 uf; +l+VF,pP^h_0 WOh?Rv>CD(F@jJFgjIpKƮٸ$zGA8,V2~:E-];zk0A6Ǟ9^ӆNm`޷Cɸ\lQGڥHC4c)d?uᏨP N0s4sAA˸o91 OOq:jnE %<N5Kzm>ѯ`7푖0b8=;Kywb7oYHX"S{vs0|Ү{'G(-3Ks]B L%${]*[l iK{;%57D'Y $ku?Cy$L2,N]k aJouHpzuo)\=Y-mm3tG l-fN,X+'WW!̤sVxҎ0x_|sF՗K[KMEIR H**D:r ᓉ>uiY,"٘L僟" /K ƻPk_Bg˽Ih6xtqL:py[Xv2)@WgBclh0XSv6S3P? bY- 峀^x@ʲ|=Vw`d{p9>7p) Al2B̖gƑiRd=D][&{a0]@[v!D3vxe^e.}m/S?amH.Arc.kjs!SSv }G`5ѯ&6 (n rzo}POGǗoAY 4No߀3ɯlY)QJyAC+NHѿp|䃉 "wgp>\ul2 êjc{9L_|v;Aȹnfu/Ugvu'vܛz+f?,o#I UwR 9Nޔt-sWTP@Cd;[ps/Zijl!Q^ ;TÌ -ooq杰L}TƓIkCDnTS,6r44e7A_![uGNI3 J[d[ 0S#fOq QabsAPnҺ{nz&&DeH:b෵Hs_Un󁥻c6+s!]Yq5ٸw89 nh*o*M簢կcvgDiӷgǘwP& w0HUsԻdqJә)a8&e^߄n/Ә@x$:gg>َ<gҫxW)Z8+U1ۆ4VUYCo Pgnx$/7?" 
2l<52c+~q@#v۲4G@r5&9]/(]tK_#{nHA=JCf6+sWΎqt'n?O1fy I>GO=sЍ.6l1RIfmӓxn@eX`z`W"Ҳ;s.>刮oωdY!Ŝme*oawrZ_"J!9sĵm!Kd R'~+ J |7˷ ɇGM,&Z Ourj4@\(CK樿bY[ l~[fhG PF-Zzzt7NΒw"agpG>`?'6:;@$y;h))IbE&Ù{DRҮ`yk"BCѿFߦ" 7F3ٖ`HreiԯIP+DX!hVe4n^OXu?QG{趾fgⷔDnyrA)۱CL ¡+ҩy!RϪjlt\QQٳ_0nDҗ㪾m &[ ?m ǒX`3$!pIqt[OhG˅kƽ9(>ą1qT{i'?Gk *q@`)W2=8dgfu'C06yaJ `"ݤh1r|e1ǎTU?0}/axuՐG(rz%%];mOk>GM[{@cN>,EpSʗnRw @ Vfncu*l/1*5,ϷtUj!a yQVHPtVjlmO uoa)%c>?X#j1&g)Ql%'Ϯ$WdFpfdq2mĄxXV^X j*ؽW7d?F9`RRruM9j褶I[O S hսCfE ee՗ Z^Շ03ғw]X`" \(l Xe!T=VU1͞8<Ȧ))K$Y# bwr]j *0[G"<[ee##[ILsBI M\U|+#D#pq볅:XQ<#BgVݍF:ZC6x+,[Q7o&%_!,̈́|vS~ߙAHG,-ӣ|*cF~1മ?oāHxq0]pٜt)R7uTkraV.kPW$ca,<^RE"{FG[I0+ߋDgÕTeآS. ';R Pc_n{62g*} 񶞟cX8[gBXn*bii$_XlJ, j?_Dkל'Gϣ71 |$,d9?(osP5鴇}֒Q*o͖B.?o'Ȩۻw7[YODIhчSK/CY2j-!,ʟCIf6_"xhmJ/]tLj#ֲc][_,r9|Cj(~+7;[ys:m<bz%GTa5K ϝ{qۤjZC2-WE= %*<.brCjD݃C`0VJ<)ὶRաdhKQ8& S됛.!v`FjF1ß/B?Il؆ȶ}9BUC+.?{Ӡhmc@"b:kwODI&Gy 3{Pk}ujwIʀP7rt`8#q!_KZGi<{}Brf!@ .خiM4td)VJ/?kM7"@BemSC*[bwT:X̑9e,z X82_>UV 基z9!CKPUȳ"k'2V!/Ibtyc@p7h3{b}ㆾtT$SZ2 xMk'˶1`-'YRIc33Gꑲ:}62%R;l}6G#ig`46ɋCsxJ6̧~sw[Y\{?%c ,D1Bmi/}D=\]/s66unZPsZ($l:F؟=RQy<pA1;:RbKNT81g #2`He磺5 cOZ/赋Eq턁yJg{N6#R]ǺWDN6<n0j_Dz§OY̦ 31^CjmXM@T7%gIzi- ^DN'tc=9YC7f`(O?۪iC_k'i;F2nEPןwl-pI=\_,  dZ$yA'&1=_RtKI8n}i'mw~H`N*]8eh]9`s}#Br:Qm=Qa`8g_jy!8E,{);oQ0&G?繗 wa NlˣVGUwtz`Vp?x#u\=jdpC}(Quoliq@< Ǿ%\۝1Y§l$xV5&O?{QBPcA9QӢ+}N+[Y; )@t} ; =Φ*rDeCݞXƅ?W/aTxzes$."Vva&6W{1yLwL^y|q' H!i5ɛ~@N ٷ 33f LiW5{IDeS`h_A;i#>;{Yܮd(#߄tyyʋ|秖Qqy'kY|b+.޹^8.NW3=tRy FH^?C@+Qv)tȈֆh\ӻ&X,W,]9~WXr2  zlm^QC<|vxu"1Dc2÷nI1 \f4rwcsפI/+ NO>?+.0J?h} {>W E˖2 3>{!ᴅX)ZWDmks}FnV94dMEt]P 'S{'{gn,2|SH|\E<{nayO˴ApOi}`~u~{.3\NjTGuYAw^c8:=Ťޠ 503n5\I7 a^#Eh;a޹;:co53^q͉Uàƴ9qx&;<gE ~rD7I/8D0'O|cqޡw݀{Nj4gvnl_bxp/iʸ%3.D˜m߯JUB$*S|AsW}xLČZ?qvfB0F4V0 tN3s[3%笯ġ<*+XóUP' Dm4 EVP\ 0h̓l3QZHSVZ~G _< ߳ f0 u㭊CHEI:!7-ߖ3v,4O8"$%"n\M8: e{Qr,S\Y̘Sl㘸lI'#FDZ]:ĬS`i\N[Cۉ)x '!+\`:+o4k0 # J(mK2Z%o13n%*WFBάp(CxUT񞍎_x,{1w Ls|aOc)z_IMѭ; lw$ɂB۩u#a*;yy>/^)¸:iV._e&\},CAʛ1`x۳pGök#0LЂtw43Eq5֔|g~,o@tиm(IJYps~orhD\ǐnmy+ gP! Zru~Bw0p ᗓCo v8_;}M򣟡ퟬ@[MxS&Grѿ=<5q"u4] S?6V;^/Adꇑ[Zݬ[uR! 
jp)50kL' g"cezWԺ}e@fnNEkAś7'-~qVL]IztV=|O"8b/.DY;FC_ mFSBx~@zO=+~ ;OPr@ ޹[+z7)=0>J̦KU12GYbƥ8:]t"1H\ka7˄8c=w3aMʔK l;3wZODtᡧTo?UꬨrmH|ng9`xUVnp„vC[Z+ӟqams.M|t4݄ʧiN|]Ǣo .٦$Zg+/:4^D6]gϼrP?&ܱX#{\%&@Yp֣ N 9i! "%۸j'Oƣec2'X8{UF"IP(&C qڹMxq̡{v JKk:[)6s/۾i%6hTMJ92.w ڭyళz)V^i9w@\ 6QM$H1qIl5dO( zJ;mLgK,0Q^C !7|/:Xu(R*Nl/7YEƍU{ yq+ t~10>EDѵflvbL˲n#x#\3gͪvC^GUӦ|?6;LUxy.RP2xнI"6_)I )G-2x^p/${dl9ϊkN6м}(1DE)*['௝ \q4Q@-q㢷sts ȲςLͧMhj`^Sf.6ف%Q1N bc͂7H{X n:FCQ 1};ƈlT] L'Vy?X بSx;mÜ*E7>q(+bx[`!>G6tӒ扈|u!$4- Ѯ Vv$ybZ6@?F K+}0%6 V##\72r1k~|1rUky2UC[DuID??ژcD9:l7pFDQoМ8ࠓ aD4wM%Ejը5ai_SijN_4W1o:L΋ZaJQoW%YS؋c, ##cCl'R<0T^8WL1N.%?9lB8(W+J9%b!7qH95\(vNjal1k< V<  ? f%#"9a{C<򔢛P>7 1'Xv"<Ei6xl$fzXoBp(5!S$'+O&2 uSO5uYHy| #PϵP4q M!9?ibD]o_}y"--(ǝiZ ;׷Zz .K>Dg)yQmᴹ jTD!1HL!ckNA!يk֒`rt}_RymZQ6t߮CWR]TCmAݒ$tsKBE2cFWx{z%U IK?I@t=bP|B ĥwV :bg-,{ :;ɝaRg$m!cf#S\͙c.43{kG74HsCALL9^ G&>]=/ r>^30<Q_3-X*>%囌G=o,qM{QY8m `Mǽ(?TxizϏ06ORvHQܕ ,(7$+]F/0F࠿UWOT M{e7bm1$F>Oa~"珵65ˤK3ȕq cT@k05fi%Ej(ͨ0ZFkpټR%Ԗw0oTĄ/MW#zYPe&Gh?kf$H2j_;c!(j(ȏ}$ yp*Ӎl^UT#ʛ9Ĕa|cr挗gpU'cRuD.%^ķ< ra>[poDa@HґvWLWB4Շd|mk$HQJ)Ԡ}BZ 0wdGL˜2~F9dK#Aw[TlwnZ3q_E( ~6c.˼U}5t*W2<(%䧗J3z;W.}̼nN@VFKzO7 h>M!nP]]ʙ3 !!hj`emE>O^-B9ȠAmZrEe_'F=g3YU\IĬPWQ{ i)B? ڬNT3ն* 0oKPI/p,EB 5̗1tEuF 7vJ_)Ghz9M擋$2z^.[~"p5'Jjso8O 1$I7Sm_nP;OV;_mtYZ[c(ez9ï/*̉qW/ s&\aPL)|AD4C#SzNnlQZb~Bt"#-D`Ưe q endstream endobj 1426 0 obj << /Length1 2526 /Length2 21447 /Length3 0 /Length 22954 /Filter /FlateDecode >> stream xڴzeT\[-[ݝ N ]{Wso00ε{R(1YL,|EUk+ Qhif`cba@yVt0K@|nS‹  J#+@Twhek{F=D AqkW;3S1G-ZX;[`#"ཱུ3Dh @@Kc1@ PTUH*i(2A9X4@B$PSW7aWvWTU,+ dgo;p0q5+ٙށ΄/~fgk; iG^!l$e JDbF8i94@|V@3 B d_ѻ vvs([e4.f 죥W vwGolCkAc3Ko LQ:d֐\OTB`@Tl$nmeam}f>9X۹2[; ldF6`3[GĿ!"?2\ MkZ~Y!mp-Af 89&ZHBj3u1aB*À R!tX! 
O:555k8X%B !fVHusBf.PCqrp2?F+UAi(_|,N-@ZfF(\tY 2+Dzퟻbb.edㅜY N5r2UƿoF2DY67OX);^ OH[.n!u@"k H8ϧ;:ZAO3\y1Ǖ&Sѓ]Rt8SI7Mq޻ \f·oiM1Mdqޖ7Y_YbcϏB;ܿ v.M !tlA ?lAѽFfY.hzӕMcV!ָ2ڵ^'sUӳpc7 kK6z{m5dU˹M'"JOdu$/KЗ+{ϙ?L#Į]@sn1&r{[kqx/r܁?b@ ѩ',(CZSZIK#2hmMm^C^鯟&^K+JY$Ɂb@f{ht̹ĢaƢw>D~3^~`/*ƞđ81KOá **2<' / ㉌6"бP ' /EGJ[ ֙gF\~J!?BWXb>#'FߕDLOXfO,afrr0TRHoY6gp&Vf ”CM: 0PrF!8V_jRmA| SJq#eOr㶪 KaE&4"$1RT‰Mp>gU{~t:'e"_:9)xE>o%YKgw(G4cP7Z:FqX(y8ij:idlPLjvE`|$,T7^cQ{A@ցSU.B"Eun/)FvY̡#'];Ta +OM$,TUM$}=A=3 kwN&x'.ne*P%cJc֘mE/̃ ZSiz|Ĩtb~j~?_g)2WڥS(~UθIǶ3v LRCYe^ R'xa=PW#,k[ k1w!~5l"F~R.56.XFÌNRwBU.(rh޸4Ӊq@z&1hX_O&?>IE昿VkQYN2w)L|C1c'vvh[5P<+Aq|ph0x@o%uUwrnVPTڮCHgUl1>:2rB,b]}H@DOX5g֗._0Y^ZuxJ%orS &y6SX3϶U2S!)qE6au-{ߤ>-`Gqbozi/F b 0reJ;]`du\vM9؜ߞy¶&$33(۬%HO-8$jEW jzn]nQ+Fu4XϬsx =^uݮ*f%#H<¨.aVT5t2 BLg*ȇo+ZF&(둓ySq51w`\bWv0? p(+ܮ҅;,u;㫶qut=PuBvERFs^RXFzYpՊDdD'p$~8'%=^z "e{(kt P)+CFc(7*ڳIy2=وbP d%FvH.F8R_S{4w1oe!땋5esM̠%Ty2.YێOG)Z OC{Vp$xv}S>@SPpӍm[S`kg47ͳ7 ey`'d#) ?2>c>!Mq%=~Fxh댽.ڌ}%K?S!T~Yc] P=Ľ"&ܝ.bs'Ieٓ~hA=O ۅ"%ik헃A޵WqFGC4U{r#woݏ4HJ}8tכ [-~H偍$70Gr3^1͈Tֆ<[o'b+w10'nTac7SPO\S:997,8ZE*>?\~|HRo[^ɁZ1꒷[%wC#q-^1ҴcQ|1,7At>rQLq?[5lމ:iUê ^d?> p֘\5]8@n-m*,DH `#ˇ 6CEcOWtcCaPSxR2LSVH@ Lof#?79ǑGE@c.^[D= sx5վKb_A]ݮ|'du5Mu9/@=Mh D xM<[MRYfn6lOoNچBTyRipKB>-;7?5] {/X 8Fs~/gFf:ί\zJAXUtBn4\a_~<]y:TSnj~2:7{=m5= :W h;5SkHEe[ ɂd~UdNqq3m V߽ؓ%1 B[,܆0+[¾FZj DCXu峰=c  ط~%QC A9ݯH*Mȫ4 o?>/$68*ThhI?*UgsȞ uZB/Rm>j&݉ l]ӎdAhVʥ1»`G2vu+NE9HJD[|G3SXݯ#Gb8i9 ?;/@t]#Ei (E"^MbҼ/L_}3jZ5Gڜ`25qc1vӀ>N핈 ;3&Wx~s-Ӽォ Ÿ=b8K8[. ٝ Yph 0y5垎p%*1T",WY6m`j7v"m"KSj}l4fӔc})Cy?3Zݒz?GMH~S%zRk(x\#&uzpKr SZS6;$ ]KsD7 wS':ZRoi%^?SJIqux2UbM=Rsy(ldل('x%Tt*7Y忶]~몑DcRy[|.8XۉbuZ"m6;v29K:S3#+# oq؈DDHO0jW8{oF>^G#ݞ'~xwxoUK SR\|@ܕB]JMT5B'Q ׋bV?s9{+1C"-AGxE| kh$L-?'=3%-φ;$IU3ҰpKTJadm[3/۟)Ƀyo~c mkJh4mlOXS!- yX|PO(L%vM-stܯW̄{>]f/pOB73!9J P=@ go6#0m{ 7+i6?^jӏqxvߝ 첞 M 4zMSox3{dͧv 7Q;ifUev&n޻pN:mMңV5]r_siJ%r?s7^aU <`~zOZ\@€֮<["n}6T+r%>KFu,ܵh,B@hڀAŕyeqkt1cq,I{e./8VnMϛ%7S%%X؊B"hOAa5ݻ|!=5f&.#gf"9; Dj hv. 
{^a7_O:,킔l9aF]U]$t9dXc?{ѣiE=A(MdS>oɋ|Ͻ5i!m"#A@{+\"Zqv dž/ٶf#TZB{}< qA.ocG7rjMpJUu@ŎzJ`e v= (0.hFu9JȈL:JârpnhGWH"7?O& ppqQWcU6 YRo '͔k-T:MP˱A!4uva}+Z]u\Qr!|/6 Iܤ -H&Ҭs-gY3hoO *HPJgAcοzn Ȼ#[@|!>wC=f뫴\=nuCʩ9>nutiGdLn,Q9i>:޻FR#Eެ':2J#zjޡ7=CC7g$x{a>'Y_!=0H <Ū 0[D11h~ Ml:øv#S1hܶ-Q 3H`>6+Xa|O/mx^71t87ƶ=v7O>jҍH#b*9_3ws^%o7h$N_siO?Sw)RSk"X>j|-vll'Kop=e1G }0 8\3`Ik? ,KSh#2ft;7+:VOSP&&2|_tMXt jS2mmЫߋBAԾUn*nԵbNi̞o>CW:x:|0>=.ST1/vY&_&ZK­00WNkkh]jN?#{ |L#-%{/l2e{ cz"LUe4G'W\3>2’+hQQJ߸Y 1k3I铁zx HMc\ON]{+×~p/ao񓿴}1P&ts[%1[tn BUEG̔AZ *bE⛰~pӬs:oLxܚ2vYgV #[N םAu%a'hH ?8 %dBcz MPLAΰS$\tp$jjq(zt)/2Ď=<ˈd(: \56VsOϥ@p Y"x/ **2˯ +<u[Ɖk"C7:1zB.zH _ ЌdV4uH^ ,,+!]ܲ๣zb2` ܹGJL4ݴ/BꩡV65wqҁ9G1lʾ-nH|k[yw5jff4~ Bl"m]ggG|0ǭ=|] Zb{# 0b:qX<yv"`.-)79R'L$I6jU%$.fvbAmp Dw\DRބ4](5fr3-SVӀY²󍛜7^ңZ׻&k{Bܥ񅴽 |=ӏON|̩!%ȟKI)xz .CwcU `X?kiG_Xfzi6\ӵ[15c2~h6Ӗ8*՞[Sܧ@[[#ESpO D%v [>-emuu{6 =(u^NR<ee0U*jmģ L؉_9sIZzSIHgn?P?cZ8o&BE=^ ]b S]M _LX2c+b_.ajSF]%aTcgغaz! LRIly.%|D.S!s2C~`.$98k_}lƁ{mkDfMbzv^&_KgAt4n*;v@QyK$˰Uuޮv0(+ԓc&%|D*' 롒-)m`=xGD(|I-fh5브ifm(^B}!OXdľ*dt~{ :EduFuرZ[*Uo/lH*İa$4-&#"c-+V،e+2Ͳ̙swK=QSK 0 yO/nd4mB;}7GWt^"l]ꊲ1'!SתA@. A#Gn4dL%D,5R<,aw&ՠk]FBI<^+7ѓ~b|4y:COq^kO4  {B̭n,n(ƦϐG/WT/lRg^0Ux eUmp[Rjihi ofJzSdby5$ +<8|@<[_|Szxu_ӁdO1Nsg7Fչ+f]~EB?+,l d<\f 6:wF62%e3! E,mQoŊ&7LLN@싞2\ɟkC>J *&kT ڦ^˟z/Vh87_x|%9Y@>~ڗ2U#_;#w8|,B&6N}7W9|R)|bt%\_)1`lvVǞr)ͨxjQpB 3bYyF^LTF{ avl$n.ԅʙ/&Xi7`x6&]m9kBEtx; {ez'>ll)_, iokrƋJea`A;Q"q/nX*ire:=&:y'Up3ͩvَjIܓmѲ[ J]t3]M|hnfi:g gr/>n.()2ɜ{dVb W;:ٮDB6 ?#m]wt%d_y(Ȃ\^sŽm24շ>\e9s2c04ߗ kb"SR:Iҏ ԇq@o4ՙ3 ȎܞdsgUv8@g8Qe X|IУV14A%}*,ZS1 8)Vr - ӸsnwѶg#+{0ـ|5q/ W;=L4!4Ϊ,7\Oeø!8^#_u1ܾ {_z__>kI'E`xFxtr9c9;SNS)g#Ţgi/!5_dP/)WTNUeBe9Q}~W¿6Ќ2^:o{\R, H:)Xf)ݍ-Z<6,A]a%W΂1앴5_(6h=)e]`5F:͇7}W\BEG* FC$ (GOt3!)KROAP5:%%)a7;=>yn+ai.uP[̥?;UɧzcB$t4Ś"i\56+_ÉJV5T:۴T$ V.G7a3obZ|acI@Uʨ!,n^d$ B~4(K{ɍ AwczO[Se b{L;Iy*fqxS. 
eDƔ9Dk ~b.M'jU&)-ޅX4i.]E9ꀴZ ׎oȿ4hP<{q`}eU#,QλGБ %V4i3ɛp$ާ}>-SVjꗄxQ& غE_LTM*,~B>7ߟ/Mzj&V֚zS'r\\?pR&l=~&OW -okq+ %Ugֻ#ɔf ~*:X*F6"xc, yNV8/PFA)Sgb~<<X&`T$rax 1ιG,wf:<_ϊ }aWRձ-Cjйij zo3BvSj據?L=G7L=}qW{$,1A 5 ߙφdJfWwo1w{ESWaef5>"_|Rm(U*OioV_tk>|q816;rLַ]f|C)dHVa'5Li7VcVBˎoaŜ Z6SxmE.'5ԾeXlt v 7Fp˦$ i=no N&l05\ s9r^wUKO_ӇC.5҅I)RS?pMisuGbd@Z bMrΗpkB_;/ 9GBԂFЛ)gobo-ۥ>A+ o}t BgFeD]f~" # C}B&ke %bMy#sdXĘl.Ӫ4cdFϬ%B0/3=ozHH_5W !>n:#(>;u<%uLsg G e""o̩r6IDssm|T3ejөkv/nceK ;4aU8 u.y.!" xa1fBLwPiqEb +8m7Z-[{*[_ jW(~Hw•,߅ٻO+O}'a+~e]q/($OwZi@٨ ).4/B: h2ݠ<]EJ>VP̺&-6C&i.0%~=[2S=r9 "r.-pč !l@qEL,a9g_Q{ !,!; pt,ein{,*o} WTDrݫ#XA㉰WR9 ԁFORъS`~e PߥPͦ@4ǟrK+ a}vn%Q{a4Ũ*N8GOi{rQ%2 g'UL",'kAΆJVMIi0m`]߹谨/N2lc'b 8wxl)#;"yP&lٝlܻF7^|[5]3}SbCfIK;N2"-]1SWݥ~< CƷ6gb\K8 }w1*26<ڭp"p~iGY!g9hÉ$C|[ug+<(m1`9,cKol>/,Ae1(ң'/F2J^o]ާ|:cC$g{ELxk uWPr2eVgg)|ҚY [ 'czaLƁ?A+-؏oYkr31^v);te薠k:*ydQ? PKX0ڀvsAdɫzeA~).ڋ\xbuD"ܪl ABRp7i_i%|U`Xkg1ɉTdxOzp5{iZw함P@hLNT2N7kD "Fl5fMF:`PTb=v$mW{/_8IdALAv\{35NS3!?[Ψ8 9+"vK1!FTjNK2|N/Kw((2OF\f1\4ϯ!٦C$៫ͭjcí$]'bg,j XZBv[F?qR8 680unOR\ CiigMLyzZ]"O@u3w6M0wnMKkzM}ᐅ[ Npx>1خeUcW} u!Xf#n_„S"om.n|nv`&҂WȯfM:s["_ND%,QYh:Æj|8PK9{~qf߲Xvj!6=6}w,6/U|w0.[ 3n7MM1^GaZ:#F{AKq&f{[ISP[]XE mViC0ך]+*d׭|]':D_Z`lfhnKna>Mߤ&RG ۚn1̈h4Y\5WŞ-N5O)dsL Q&dQlW[;jG4e4րMsgW3t_1Pȕ9.a+d I#m}&NH9L{=t I@}%Eg~vAqg]y'96=X?xaGY4`>MhG\|-FQ4RWjg\|zZubgs(?'0:4(**;@6o] ڸqr{}[[sU9P9꿱7c͆|S~S~ l6ׯs RqրCЉVrh1*45~P?>F_2?$1@sA]$"QKI)znnnIkcɽp'9 $lB%7f$S:}\[( ,e~MMjRSW*ѦȤ͂XYɆ@Jg7r`$.qZ{{hpҠш:&vy 'fY3#FKkWKW")[MS9({p(,u[Zd;D{p-|qbh1l>|g(XB1IRNM牅.eJ{-ºZR7M&Q 斢/ה 75P,Y.C.VboEC;n9>B@ ~<%_q m~6< мwÌ\2Ӑ:"׍r}؜~|EYO |8u^Ge쿧rı}o%Wܻ%wTM.'kr\i;N Xt+n³TzeSmNǿ6BAh/iյ02Ն?5V_B<ۊú9'XXt7Ɏ˖QaJ*VrkE(NNqOwly']6ݞLz0WXBg ^|cxOٯqB)^(ŠA< Z펡u{ï TP7 _=wRMv85ăD?ɉmHxUoy7%=0xd^P;;ңeTc}Pmt DcC=j&G?V6f]h-a̫f&doMm>4vYj~ iCLeY'V RF`0hVJhjMdw(ZB`!:"[w W oq2xwFT:D|r/V%~"Cߣ49Rʄ>zM4>|KW]msNo|oW|`GYy&&鯂f m-鋦ܙS&v@YL?mI3P-n̋ Orn0nqD#2F(M̃FFKwHZiOe4җ4^q?-]2>"Ukn <׮VXi ^FTr[<;Hfx%qCg%rSzYorS47| oAP둽.] 
8 //8mmvp *ӏI|5N*""4\Up7^T=/Z zMF=zj 4ЫMLk^~x EjjSHTŵHK[UvD/0S:Dhn>^;ef>۞E(3 Z4g[%X(UvfOObѫR7bÏ$t=s0zOJ سݢH,x<-H}9 Ph9h-DPl'2F7Pu}< Y+aOF߆T]~+WOTke/uWH 쓴hm4EpcopQ=~JmJq X <0O8C̨,U0=,N:iSXSɣy7t/"Ǹ.)Ցٖ`@ HjJ} i_tk$6K[ q*e;rBǕX;A{AI0H}ḉ_cJ͒.%*tWX tDdϞ) c#Dqi.NoDnt?/L%J@ЏaΛB )MGg?{sHݗ)ME ]M FsoIܨ2 qU&㖲GbX*-ON:pbP kח[xЂfEz./J$ PF2Z4-BQ*}/wȮW2*{"V#^}, endstream endobj 1428 0 obj << /Length1 1675 /Length2 5838 /Length3 0 /Length 6942 /Filter /FlateDecode >> stream xڵuuX6%>H#ݝ'] m!" ")(HH tJH;<<>{\7F*P=L  }C+)- 8; ,L:wLw@LDD Ђ!a8'aX& B~/( VaH'8ƇKQC}N؋/*]d @P@WH_0@ypa#ra怉exl'+lFEDKPW10`w q>t} cc Q掁_ 7_hTGw3r`PNBho|p rwpow$G'ř  ]$i8]qTpv^D 00?8C0sA`0pXtb!X `ۆ <5wvͿp;FC>1?s($bTp=6}M #cANxHA};:HDeH5P5+5>u8',GoY Q^HGB/xzMp7q&ڜ`X@0ogfra0HCЀ#a 0usE&*@XqB1/S>܌BQH9 89ziz W_q#|W*h½aP0??TN (*.$r[Rb8_N.Hܖ 7.PF: K-;J适‘N$qw &!d y ,Daq)8.T\F!a?k)@`c?1/_FXw ýDpq\dťeA11@T\ Gßxq{}1 s C9ȅOy<@#c%.e qDi,Ô ҹ P m{I"s0z̛]]EH~ lS!tҷ|ky q lI75'bn'[6L^*sGxOP353O|l:{Ү2y]^C.1vƙF)x##ό&rIC;6a&8KC+b76?I9}‡i ]K\2;SJ̑4$!"Hr_G]_MB_mnw8>s:%SEۛ˓ߨ0AabxAo X(|{ީdg]%l8#$pQVF4> /aqQz3J%ABU⶗_$^qo3"H60a[ 5ȧ ٟzdzt_/S{U +[ LSY -MD *_X峽iD}[;z[x %/b\>_-tȯ놝i.M+7 D/e/{T8ṽx?6^P]0э"!y^jsmmyLO 7 يˇ߶HKYON1;ONS7%&`sϛūČ6X۾aĵý1u!DD%V)Q<6ro 3`uJkH<c>] wDeQslΜ݀Ͽ]fV'\)^o'ܴpkga=%f[y+A&EAl|UHBR/v)yLE{>VS~Qm۠l]sPm⏧.?|xŮrxT_C#plR%Sy^fO}5(UqşJ\zJpyk)qu_Г1u+W.r;] ?~ɬ:%ɟ^/m="C; 3x߲.hlGYW.+ɡ'99l脼A#١1HTQ!ҚOv]5Bt:ÿ31Bt=Ѝ'bAs]r(uc~tF;Ú2wpϽ\7;w"l nuzM6Av"fOkGj.V,`5}mj{QC`ڂГK9a繣-6ڥhY<(Y|;V-lNow@AFOFGtdKPc =>~㘲h]o\ku{|O77b:Fztkt.>Y ֪LaYAse|v xm:P0jxFf&FKn'vѻmfIR(]q(70);w>VmxTG'-ah)$Z[EOqSK厁f}CJfN W-FfȰ>IrbxO|%3?d)Dz2Cf.3VGu|dh؞kc cܻqΏjGsn@*WR4C ]b4HԚ% C *JwC;..xu*,"j\`'ֺN=;ĭ C9|^[.U_<6g #>iVe=h+Ww.TUAMCƗxجkrtՐ ǁ[UǧDegy;pS(.Ctmq4}r1EdD3˳ٹ%Чni>a'+ƷJ,<̜R|OVTi3;;N'$d'p|ֹes/xwHBPpÛg<`_搋 Ў2z'ϗ_I/y~L(},1ef>i͵BXϾ~YcvR! 
`veQeBqDJ`.UDDvmV]YN]RմuՊ}yJnO'`'X/Em]J[HiwW C8yD!mb-ݍ7*M!V\cZ:_QD1;:{ˢJu>EBĨɐN>Ϫ]~Ri)'r~^` 9kb)By=.[/fxbvuujyةF%lf)Xj5inI^rqz%ҧ8w)|M/ F۾ %'މK:>(P1f9N$pD7٫e"u3c7^ zwQ$ 4 3d]bn= C|DmRb[&`p R@76+UN(,DYWl}w.TȊ7v]NƜ}sPgx-T(ɖ)*Wܪu¥tc堾$BvW-e̽U|!3BL4~ƧAR(fqgmxezOO7(t<Э NVNЋ:N'6SYmM )߿~3!ѹ]'sȆ߈?u6׼orԯەX|{d>X7R!պYs]?0CE~N7~ߤ5ʮ6#BR'} k+Q癒G=F #ٷU?y/ Dwg|Z_E#?saV -r>;^NL{/cWJ! L&Z7wEsk&&fѦfȡ(]zf| rE/c݂IkPҧi\a=ԫB#3 ]3RTUn6 RB<ۭi~[Su>G)9ƔM]EatC"qwnZ.c*h! v)v3tFK'T"٤[|Wƀ~9S|ye&9֚;LJqC;mS5sAb-dߢ*m;̋ =mj_-֧lLKGiSOrL] u VgNzw"5lgL"ᳯ4s~]ӪuzKPb/dn.?rTPLqe97~KٙT}@)ɡΨ(sM272(+L4~1E(LZF%U!%Hġae/d w+*_?jsS&6GdbN DV> stream xڴxuT6t 1twtK -HI7ҭ7sܻ~~EogAG&i ʁA6.vNaȝM d fEvZ@  P [A~nnNN!T:<t*>U B` h!lP5dk2A].>nv1x~G-Pr{;,@%vUv *0AK lteZL..`qѕgHzy]m?u ([VT;𷻪,\OÍ  W"naٲ8Ovs@5d m'w[ANr࿕VBrAos;_i,UP8[؃ @ jxA =~P?iM] 쵓_NwV`;@7{3%STSaS.M  wk *]?'ry89Y8kdl_fcMwZkCrEt%AN@<>CNe^8+/{hyRж& ЗגRg]Fdnn>qA׆8A` ؀PO![7pHA?H! 8d8rCp(A<?A(AP.*A BhAP.AAP.A AuANr4pU/ q;ajq6\P9W?O?~lqC?ހuM^ƿV B][;d6GVʖLV# '3PJAXʙ$})SE / i Υ/( L֔ضT $Ŕ-c U] f:P*|kX;۞N=MsA}ѫl {'X-~REHQ%$n2;-bf3Yɞs$/(D(teٵ42d]('L ޙBH<~9'>x]0]F &gJUm̯@Kv#c )jg,+}I`!\LI3jU|^@ZTÙ U&w6 +v')=$ϰRSl3 sK$C,+5|R0y]|D;dVLc鳲.O=˥W퍤nç-%u7n-rqr!Kd5bE8֜|z?Z"5`ܰіК^vki)zFon)aԑ6'3${ GU?l{Yz6j0v|I $r&1!23r)L='⥤=Ty}чF1)@[~\Ne76!5-bp9M\i%NH_0'n͕Ad /TPMm-!,+|#4:O_påm"ҩ‰b@r|5F[GPv'#EG&͕Q|UA*pnKGUUoP CQFv.nu/IMYn1v-< HGMp1Y!NhYSǶ3ͦJ& ygӺj*>z}I}J0s8/megMa5/KR2\;so/ Ƥ"ȍxJ%O6Sů1 8vfpz2c?9] Ա_\^["h[|^sLU\EںZtҳSQSbJ5-ӮZ%yslC$v!aS?j68u6:'r GRE4mRsx bRQg_)Bwu.iNJ @TSa([. F6$atBJEb85R"ҷ\v. _}߿Q2uzv-]!`c?0h#Ҳ܌6n燖.P5fW}DrZf4nѧϣ`ז m'.~6(}()gy[@.obvtиЙbIge]+drɖ5f ps+s˴j}iH2G5v'KrJ~5C%GJ]Uf H34.*GUcg^"j[. 
7%l9 K3j0nV~\wH'.s펑$$@z5MZ348;((od8^ZжA^sX], M{ܢxbAE?qH--_ɎWl5/ +[)v<m/v<Ύ `F=%|=ors*͸^-ʽ}VPLU3CH&eG-A!|@8G(6Hn})2qN]P?0U 7.ЬzcI`bbnW=ڐqT~aQT[/UMNrun`4Bqݷx+m\xRS£]/w%Z%Bp2MGI?Dcqzٷ'{K((<*3>q#d ףȶH0)K;>k`:*)}MsHay}u xMX»>,n- 9}A}+ q^#VOW9ڥNB%sMK:6onYbۦ:dd;>-䟚NzKRdXB͹/V~(dAy#Zp?x˫٢A5Vt#rٌ~pa%C<Ůmb~3,!}Nv@pn/Qu/Hc2E!YMˇ}i* 5D^&xS Cjudxznwh 牭|R'ڸvŶHQK8/Mn1wMP"зk[߉MwKb_Iߺ O>G@+c/Ɔn͐ţ!Er[ͻL: /,ǣ]f _˜22ustd͖KI bd4jmyJd r#jabݛv#9Ԭ%N<IZ.C1Fa0xRIW%B0Upx%p\^rE67ĸJݴ|M[Iruk}ka9Cb̯ػ,2owH9+ꏢ h|张1?)/f]G+ق"y5 HL: Eo{f[7.L'Oʬ{='y =c9p%@nˆrǠ2¤{EJ:{B]_be_J `<&$}Y,jv.[[``O3a6Gk' :@X3aZjXAaF7؍':RE_Iݩb%k Mm?[jr P1q9cnK 3S갏ELQ1kg |25WUMgph,+Gjb?U0.7Q/![*av'MT띨r7WLϺsTMZ" DF&\фB0Ka#c+3iR('5i#bF֚8b*ԹKee4'n,&C8gR+ܻcHYܽ3q 3 gڈr[+wψGR 1^:%O;gOG:\i~ExF$Ӓ7hsS2qs{pLG5 푙9 .-*N8J׫ 'Rlk{>c (^% |y4 nt^x ޒz~4I=t#:U0oN|؍PmbScx+=xvTST觘R>B(( !"p3_yrVP}(Aܰ/LFP0uvT v.l~w_#@n=tRJ-{GKwox[%'% --%%dBfYgEG+;"q8Lc\6y/:Q4M{,M;Cs׃3JTȍ_MG$ɕ>{{F`ǛSpOzjj ^ROln K)5?2$ IewG[o,Kͭ?2 ې䴛[~ȯ eZR_|.U(x$)/NQKݪ}[&˩A]^-i/G&yO)xpQ>}:ݚ6g飩/ٟTN֛[/+zm߿rp'C2Ѽ؋?r3bnI>M $R*\UgHI,(D~g%w'`PM>*SҲeL߭.Nf?=IPo.,:74ߵNCQ߃u$#KPf%<$͖jfW]մ?q=cd'DyioCǔ)/%yFS}wX,Se)Saf1 |/.vv+Icz,]c= H|BЌd^ >U8O򜚏C!u/| 2HqۣrkΰXUrc〈/ZB1+bx}u?+O򀵔}ݜE^dѣ[z0 ^ynvʙl3eSRxE>@G&^b+Y/ KT&eШr]T[+>p袅y_"uY=9FKW/ƈd**%MqiDm2Xx~zJ +ȳH6)}w+DfL5&K' PFQ 6;Y[XgOU jBV.c]zbӈ s~g{[!P,@++c]½ԗ]JT\G?hrI7s&"BK9[oRxrR _BQn0ada[t˪Z?z0VvwO1Q_U/Ocἤ81#b2kX`3lGj\~0xܴV9OaI͖roPq?"kkzpJW>Nb _FXsq[3"8qYhضK33S-xW]{9=)eEe+P{,wKBC+ރj!};ؑ ~&0ymԛ֒ǎs PIlvk6NmF#iy[T3_n_U]>Tl6pn OO׋j˚jzн#g6i{`'X\M2^@E9[!l,I{HQ8Lwv{j.hm!4 6_nyQZڪ @H3_oΡ݄YS %ÿ ,Ql9y_nCcR1٠)ARkJ':k,W6 PLƻzD/QH>VEGa,N)ݲC︝bR>"Nt~C{c?R <Fsr1at }v`t̛9L|Jw`S/0=.*u Q:*PpS[wDvxb%yn6Qޕk@0IRa_xQ;|A]B =,!4Co~qjA."0%9኱Gm fC;9uzU,GxiC$h'X;2fY;RΒpc̥قیDkˌnX!DvDrH-#ԮȾZ 7fXwߞrY_Ť>$r ;/J$Ҋ1daz cJ  SFlpBh1>+jLt Eaz$6a]1$= [dĸ &m4cjg~ -AsɄK{ Orhj,WJCsd$T_\բֶg*Y䌜#H^[7Z>IJ'HYߩo h~րqU9fybT~\*Br%{0̝Ý%Jԥ^4πUh6ޓl+@JJ\>ZU#F UӮ׈tq4YJlT#rKEjk wZ`|gً̢@>5X!\=F(kg*(9qj2-SzIi0VFYG٤px+͕jM- R;͜V`EM³0- X]δ^:kqt` ɃJ$ݺ,ÿh#95$cg CK}#CnDʺp@*faL5yD^_ϵ3{BuJv*9AB F宬]RߌU+L ( 
"QFYȱ>cDB!݉8?'^ʛc&z1 t U4]9EzH&j>ax 3y#]au3>js>% `+'BlU=/Xmtr)u9Ȉ"Ce-<̐O&pfjYCBԣB"DԳL1VW$s+sniVVkE]]S& LT $ŷi)a7RbYdRt\ 1`<\x! a6xTOQluWS!Fa.xϪvv9""ZxLٍcZYV<4ZVͧ_T&lnaܷ  5m,^J "s:H5 s^}?Z2!/%?3ۀ3W0y=,F#?b%lX|{[ FR8!iAy|NEB`6*smj>ԔOLǛ@}r8R vTqNW6EC4->K^e<*GWT+ڀ[lSYƗo}tX5܋d!#PчcO(2`JR>F)L ۓC Wg#0If'2y=bgER퇐i*.jIJ B_C {M%KD"/AfzۇhMry׃錄! hHYO\fg-b*x}9 9!ƞ^ԳEv:Y)q%֦jדw7"eM飴jQAµgG܆b`pؓdBg#`PsQ_ , ƥ7-i`o0|x-&oAk:PPi Kc OŹդH+jٽP`w1#3q6xSOk1 hjrc"n Y/->7dqXs` hNxEsO66kkdEqh/,ZT#xgT/P6a\yBcXqߵ K +_Z|,oެSwEd;de'W#:# iō,8˫%<ʞ V4aBtZ@@"y=_,O+Cw[[O:sܔvT,8ܽ!&MkdUHvE1*ȭ1%I#ɢ0&!?0Մ{}WJ5¥*A7, zؚ|1/- ̀S'mHUo4⫂׋TU8Q!H%bs_((_6URtYWbaLize*F%w"F"b>y1p19.lܱ.ļpHs̞;9n1 3~尸;Q?mc+ ܜby~II̟[ڵٞXM"]ȓNY<  Kfr!9DŽjc$Wtjwn >r*ea~fWxgTI2vC|&2!A C;%PiGz}}җ}+??#VvTu0͑&DB]BGBOBaa‡Ts(S£^jdѠB)uUw^Xغ}qz?Rdą.vOj Ιa`l0"K}bD C(HlP&yQC(j\0=G,dдtY^ٵ$c3%oX~7a=Wh(.9x3E|2h5r=RB!Ber37=gwVG.n)+T_iN>ck|/{:*[wM*$=XaEVBiߖҀ/)# ƚDQ0ߝ q4 b]cVϻDfz0*DE 4 Of2[K \c.)⧩ΠL7ü+IES {U 3SSg͒ Iq_{k*WDF{$ Fae|×9tZgV6Î}9[bk?qxrXf̯wb=Ԃn+4uW~ ^3}MpȻ|yR}aDZSZy0R7;EWB3 K,I~8!rJ Qeq^Dݥ_8s"G wp7U!iI?I>m~nOV'>sD8ܙ~\F/SbIpy\ߚ_>d%eYiq pEL ~&X%ڔR@aOΙhc<&o2pi{'@M՟ɫD3q̄ѹٟUCYBge(gqQLmztjU+J2%`$BIpn[l؛ wYSVn?w%(%(1ϰ3{] `{ٕEEUO0DT;db mPws:22W9Y ,QSx3mZ=V1p+pku"Nb9lUlR@U:Y>GSO7*uQI4(xn}ϋ9OJ{^7bҩ)Y ܐqsB .\GMo>EZ{%ZxHѲqgm]i"730*ŴpPA'{<=WfI[qevuǪhԽpd G1*f58˃[_P4?(U_0}%{+ )C-]?t&7&|9?`OƋJ,p&?yi3@ Ә"U 5hq ȑ-)qי]Urn]`yÁRIs7nAj^ih%P뀆lЬz.>$#Ԝ,=fZ\I;#C2>$N3sXPT] Ҫ/ShО[ncѿz iq呐J:@ Z8k;f3^!pJr&G&|ڮĹIDqf} eLZyƢ+Yhr1Ugq3gcN|V"Ҁcr QHj9gIeU+k]nNdeR!3xi?!Uozy 8I[ J: UT]ɀӳ%fb4th''.{^Ba~/ShWKg\n"kS5nM\&Ѣt/~@'B|Hxʌk u;&ϔE;0G2=D JǞ٭`\^>YH;R ]Q5,>qDBg5I/>ƇO8n6 -C$VJStgvfc xUЍ9T qx${֋afżGVr3s뛮$o6~;{Cz5~OY}w;` lg?є #ZuU9JTA5q `sW;Ћ0_XhrjBimh6-1I޾Oƈ ?'Y"2um ?@kbc%2\2kM._ 6o.Qy6tm| Z]٘V"Nvf ed_MhW 1tg4]덋o~v5VꚔ^ƥvacfz-̰%L*Waj]!$1@(1fC]'kד軬79iݩ,hA5NJ?ʢf(.%FAI,GlǮO|fu\@P96ZXp;@~X>cV/źgm {Gx%nĘ2Jt#Kl/2-pݙAiXA;ǧtM#QIa1瞄Cro"cߙ,\[.Y8s*,lXvs=t1*杬P9{;,; #c>L{g~: ғ3`菵~ a};6.-j:\QTy؃Z~Rx)@KcW ň ]aq<ޞI Snx#Wy@Ka +I̱F}w*N1[Ubĩ>_=̈́ri~XmĒ mQAw9grYӹgֲ'SF}kW' 0Sz !qK N_YHxnH|}O3GRs ڟI ~c 
ۃ[zΉ==8?j=̣aIVrVS& O!^N_Fq`CyO.g|U7Wpj*>IK;P$h[:I&Lb *C4妇p=ހ ^,ĊY6TH*Xq1V0G .UntaW?e:%Yg>t )9Ղ ԧc\lAh,%ƈ,Da(á}k꨷5>U50@ !S<A*C>>:-"})8aCv&w :ܣ&XwlR_o;̋w-I\ =+!7$;yYB}4E6x?p0.XFsş[.1;a 7?؜Wrsm夥?(ݿFs2\'y X #sH+9;dvW6ki@\GN#7*0416Zf |ed\cё&C Oq*X4 jDRu_Ĺ"0LylP!Cw9~Ws3i6Q -@=E4ryAd{M*ﵤ$t*tBywtKЛUl>l&3ZH(]3Țt i7ٚ9 3=ʶn׶Nib ˍݗn=f->kXG9!ȏ־oR2YQll)i@;sG ]ϡ~ļ Fm]?"B2r i< ̚Af+dG+jo|Id7ٜW0!Ea_rS7{#Mh2?\=V6t2pgW*fl3<ѥ%O !d G<} :j <Ъۨ a|WX} t7mҲ @*zwy7Be@HΩHť$%mT_L3$~@(Xﺦ?=M`Hi%6b/,Sq쭉>y((&ׁ}i77~[,F\}t:Gxv:t4?(MSq^q9CTY l6ׯ>.#q2z1M1exkÌ.0AC{z^Me51O<]ɍ#L-OM.g98:)a`MzHU@:heI0X'bFT9xzɯrfđ餗SC8]҆?j5eLV3.&rV[Qe۔3$C}IҨ'-ޝ 8KR WbZ_8'vTe?eJ^ۏxbn~UhEϗ%I4ފTPu*}+vdSUX87'/̩^%3`*3:|I>Ϗg'9I`LFꚺ"f^n[H>N%x-l<[lb}ő_ԄqbdWGrrPٸ]BEc[:a9T l+jhiY^^[a1w6g;r3&kXU`tT*nرpu!%7Rs=c-eVT>=)ej-y ohX&oASDIh1$ex^Pxx.$䇱 8ZW[8پ0 l8(Křz{W Z _{9ׂX@~m%,O6-{̊nJoM*Ml`V^#Zp?cSSFP܀g'ҵ~҈m_]"p|<W6R`nWY1G,OcG/mFG#+(+ (їt*;zoNOАA-,ćE(@LEYTC0f/Jr>U9{:ҜC48SDRwșKv葻ps3~(@ qjqMw$9_-τտ:y[#VNQW~lù~h_qB-4EOݑ1 cLw ?EQLT*RDw ֈڞ߬ݪ̴43 &ŵGFWJ}3~eX2';p0>/帼X_t?͟ J\Xƨ*ԊU~g +]}0fq^ 0lTJ EQ-Pՠ'9el U&FS(^G ǃה9o9t| "{X!*{HeɸP?v+}1~²$j.JXcUf16MGYX*VuKڼ섥op:z)sG d37/0%gF/"v--K`ףy rÉM"Gf ӊ>=jdm%ĩ70Z6d-y9ͻy 8Q\"1bHe'k)\HgUO:eWJ0Iۨ#>Ҟ;Y{S?E:}n0iRfG݀{=󩵅IrbCæXKF(tD)'^@30Av6e/Lĥ&Af[ly.Ď̄^3uчZZ;ߦ~|ЈatBNqoɷE*/ŶtZ 0Aba;|bd7%带q$;GS;ֿlZ|x𗩬hIg]ǵݲM1E*"jgWO"$̜-;L)rA1 j0,Fi{D%馩V7GdwXqRD9iy8FS 9IMQpl:KQ|~2E'hlAk7g?=%qʶ.lJgh)u?9Y%-"Ix^oyf' cO mY;&A"njCG[L g@uÊtZҳycȘV=EL*nKHJۈ }3u~Ӄ)B2 TJ?[p\$It\,F^K2{鋊Pؘ1"~J}ә[7guRw}hSBTM H#Drf]#,ZgY|^=l:#`K/[F ``^~h, -y]No.\.jέK%azX;sBO1V hw!ɨddAp_=\'NSA ȩ$ƕ_hOsIſUU>`Oȁv6& 0ecvӔw vueuˊVYl⾬N&piQֹ N crLZAf$efC7 d!'?F[19W~XhVf~}oTܽ \`=CF< s{V?3;2OHƜsΎa-[a<15 ޝ*1(0UkݣL/DzQI*(j9T́$'AGs!T ֡jR'[$~B8R{__Đ< `hf<ǺVO;8*)&Zś(RafCABr σF'nƮ4Xu֩Npީi8o"Wthޟ5ט)#]rsB ؿd*,?$1M\8n1zM^o\;Z=3oI=i۩}aDDMP~ʮ ~#72yQ 1IF_;. 
SߗU#E ٺuy³\H _7*9\؉y29f{Ɖ)4?d]^ͧ)*/L>x|cɾ'iD%W_P !2tn֛4D?bz@ZVM{RGK=#DfJt P1"}^sm7'BDZJ6#~pIٓAYE傡g'`--GJ:!|ũ8oѤq+Y!bLLA_hW5{u=U2^lҬN)wYIXTdm^?1=IV,'^2PѰYwUSZ\Jhn^- -iT4 ;44Db |y'j`_Lf3pEi 99)|Dpch Ac7$-x Ƌ:e @RCBbքD!i1X$Ц f*谄7=JSL *Xu\YS㬼.gT9=Wk/|`+i}h">IbPZ5pΗĐ[|_I͸᳊K}L|ie' _ͽvOTND8G$>VnpU^Lr$F@l "gdV,@>F6ٱ<Ҥ\ѩtq.nƹ/ځ9hmM>txaD3ӍqAZ$~Mu2P;(q';y'V$}J:<>Yf4݄ d'Th0Bδ ?F-OCU^*Rvu"vƨĂ#bM49P?4mZ6:f!ek}y.3Ej(w]՝⢯;2uLqx; ۛhVq$P f?Lv<#υ @ZޗKMKM280<\VXBOd9!Bʿ{_I'sr8i.0bs*h}}-OulD'gG:J8qwo|F&N. eJ6ߊj A9E:+=QR=`QUx0.m)Ү +!vv #AF<+9Qmv{ PFy;'cRܜVk?!}9g^6Yb3VR2"mv}E_StZKBJg3w=A]T&l^%;;Bn;:ѥ)5wI8vT1yVH{jPmˊP^1 @''ky:j퀢4Y'ě^ f{5`ooH^|}~ݝv7:&S;m'ڼWVaicX8uRD$bZW]]kDˎ$r&$=[$g5力,ӺfooÂY "_lFB@Pn/RʵCKqJ:$I൚\Eѫ—UsҔJBP;-Ҫ׍i/ h kq6cԼ%/ !C]p:ɔD_iz _7-F ֔qu2m{x"K%ĉ|@Ȓ=󭖑"5W/g\ E6j5#ݎ6ŀTcS'Ӓ[-!q҆+K"OpGlkf:oLk]|)?a_=ƻ,Gm+jylrLql+&<<U iIߜTihԒpnP-xRCOFlTDx޳t1F, %[dx ]xBWV4&l6[4ʃKe{a :%0A0|`1#U|Q' P?wnMKkzM}ᐁ5'pP5+DyIgWF$'K3(ds9|Ǡ bz\+O ̇ca薞'WÏgVjRh98n.1X4 {JՙY~_\vUjP?R&NQ _+ȿE <M>,7c /Qлdavѭik=aBZ>U~7NT'nI Lz6 ": 6[}!@=‘kmN"(Lpyl}TbKk/'LX vz%_r2tZbFe8u*m[EZ-DCOU2 )5 N}᠀.iA0&~U. +:'N8bT(7TR}d wAӞİ_Fy:o[W7$xÍ;w83Hݫ6u9@Gd2|eML͛T%847ݎő}%qwL|Ƥ};"l`7ͫ?\JDߍ~FkpBjʰ#W8)xxM@)X!_TaK%4wLMDu\8~ Ay>xT_!ˮ IGs=i+33{ԋMD}VΘ:p[&흽X^s`sy_<S$7%9bt"tQ}Ωh7 hVnuֆu$只mܙg\-=jQP2tS#F"5^ʛTOCCXTfX{4*;C-+͹;>K_>,x ڋqcb́bhJF}WXAJܫRn5/&"iTUWl8"`o-L$ddr[ W4X3}ȿ-% 0EH&?׹X%ӱX=v0)^Td=Qä^ "\f"P o_ϸ^Ӝȓ@(\"V <΄ Ȫ. 
5qƇcT|q=w3(rQ1N!ޢI(?S" n`1~]qu\uXʃLH1#;aipsS/,5!s/Y"F=/|BQ [CL13x MҰe.WE0>IhgY*}׈AO?d[3@ԑ -hӻiB=pL_?v"ɨ,z{AZCE8(^gnS*M~3LnRDaV|CaG?<`S su| E|}''YEhH_e?)tSS;UZ gS\xf8ʰ`PkH==ۻV~%f?$MOH %5y?-9=uQ +(x#4 aAus>& \<[x\b|[G%zzO[c:%ʼn r+]`2rB(Mļ!y1Ƈƈ}V^JبMxf(~>~v]]NIUvVbǚdYD#V\J;s,.w*@׶]J?E k٩l ZɒeuZq+ϚЕཕ0Nh(@,e"OBRt2 Q*H.?_Έ^NI;j'C=nO,ٻ J}=r5S%]e휑I3(%Hp *&xg)0AdG|vW{+?MS\:_f"aۆ V7Tcg2A4Xǚ^~QGogME =4q -W Fu^@2m%_X:ވkqI0 8=TF5̒zo E4 ư{Pݰbf-8ϸ^y]}o_ WEœ`WcM IS'({ }UѲ'bUA 'l eݗ$b@iBuʶôDz$ M!sUPPgpoָZx}i*%ih U椊3tʞWn@z iDrDP8VER2:Ozynq[-9A#eŨ!< uڨR*⋘5w|[D%8 fuٸyrK4T7 ED]!=@+<ϱRz [Mh`.ҟ+& >\>y}3Imj+-*H#D]-=zAB646ץdG~ㆡC@Huϔ(zeDB4_,r(o{64\Qp$/}rinƟ5 t"x7mNPD5U_?ڮ@abU4KR,+2Пmq&F$$/}{*J-7M{'!n {M|eb'DnYrdUMC ۅCODŽRr-|TpSnA!G4+b7X/&-T<<1{)Q5!#7^(gݹVY)0L#ņthYYjKe^[6URr\n${{oau&76^Cd'%r /E<3㮥G{BFU Ti?񉇞߮ ЯOp1$y*Gs1}d줁4M^nlu\BdVg~`إp3G7NKLBi&(O9_ip1w"LCMN& 3VX-`GHlXIAW^6嶇 4zgيIiE*ohsLxZh ~&i{]O["qS3(dv"' l5}8)s|XbA|"ws&U4I;zisd-lBޫA\ѐ>ϿͯFHlv(ʈ?#yD+ 90o=(LTMGaQ -۟$KpeFnB&U0 endstream endobj 1432 0 obj << /Length1 2246 /Length2 27489 /Length3 0 /Length 28779 /Filter /FlateDecode >> stream xڴeT˲3qwn 6qw]Cpw|dsZ{7pUWu]]DIA(ag sfaf6223QP: -l fnFfFVff8 $a6yΆj@_d`d1 5|{8Z;>`c`3ӟhF (Ps0ZlF@sCkS)@ PWWQH(+0~Lboo?ZDU%b j=@R]U_5ь1'χpyq5a5m%q?k\N6e}:@mlhhhho>5s 'U[# @h$aAR~}؝-; 4N))l -lΆ.Nl@ D]!_"v+ӳ1t3uqGmsvNNN0Qg,l +HK1}-Gulݝ3/ `hOq[Q;Np'fQ'g;Gnh+[;7[ljakb&.L.@iq0m3:@؜Oz叙>^vSCk')d 8;}900vhҶv?4_tZ{LpL v@)\. 
6@XX{h?-$,܁&J_figÏ5~l_&?Gg??>fkQQ?jLRښ"t0yۙXؚX98p]bhg_m`bsػ8Ll%'I_ `L7Io0L Iob0IMl>M| G>G>#Aob0MU>iMM<cӇ!z?C?C?C?C?#?#jbcLC?C?#?c8?jw>|zk89c,]F96ҟ[7_hgԴ0q6.a#߷?EDܽ9 lbcC<>jw_)_"@cE;c ˔R_2H ƟXZ2q+8b9@ ;9)^}$"- Lʩ;O{H~e]4'2ymqmDSQo7ɤzem빐n,h(xӝ1Q}+sy!X2P(:t)yjqڏ:p5fzq)'7O`FN$]QɯFOvvRcoQ^''v˰=t:I;Efw X-j摭5Z1s~ʨF̓j $^Πn@)2`6&p>`_cT@ht/ F X4F+80%2BH#zƓIka ^)[.#q+l(Ɗ1tI1M(d: E%?zM_;)2*"O~}NJPsbO-WMwEKz[7Bf(&ıMóܹfvK|͓g.qV4#%FQ o|: \4e^ 67*z]HJje8˥Xrw"q~͕k%G\PTZUq q |efa %[%|BjfGm>nן=5}\,ˢU]iFd?+1Iqin1{GJ2f&v 稰2m"ZWKa5|㕵W၅!u~j̧moLg 0Stv$H<*MX k8X  _~H#g7Out:pLW_޴v1՝ݺzȾUM=l?b ѳ (ADڎ!&5;yG%([$iݍ:n/]N}fbG<{ 7PGYeHS 'n #=TNIVl!6'VVeFEO yݴy0Ds(;L7^s&v1A9U2S\{am2.|U.Psɍ/cVЋjUOٙec\)6iVLKO#ÓE_-)JUzeJ+"\uzi&(+.cxTjjBa:fS~npJpBvB܌$zZ$'?) 6UWYSjFQ :% ֻh5~{^ZK8*_5 \>r(9v~w@bMn(o.3Gc h}c/6'?cR%B,*^a|3k荊O[v`pi@McXP5iq*&Z ˍ>'%(~S_ƥŎ̓: "k@.!B(TmJ38x_*˻d8(ȑpJì"ؐhkQE"E+}uSL) A\lאWґk{!qV`@=61 ?r3KE䯓r!CO3+6]TޑyzI&5k]g.vĄsŠ!39 GU^eLV/OEͽܧR,ˢi㠆z%|ҽ/2Ze^{SOBk$8z, #/툋EYq4Wڵ*E}U9B>ˬ}lGѵ5o;_w %eG5Xd؟\փ|PT f*#r;M\U\ 7o6xfDFh-Ԉ$K DnhKP3sWWƚ0c6Mft }X:d&!8Pk ͝=c(5g %C'Lن> 9R UK$ @p#.A= X>Sۊ30Ǭa_^ms I!%cDˮ/ݛbd9;Ϩ35WA-Pͺ jlӡ,EF47K&f^ʅhq0CHNEaԱP:pmZ:F?,bR e>Qsh\!( v ɰ~͆c{ u:vж=kzz|BKY-RIŰ%\g?$xь)oo 5.Zbظ gDDDӊ4Ki r=F e0O{R'Nڝ|n33|EIs{f^OD PQ$?;G:z<$AC_ "f(ibUѪxAd lmḳYb@\11Y pg~Eg W܎{ s L%F7J<_ʇ){|1'ND*M۪סp)&IM!R.lFD]hh Slr-_ϴsن:OJ4?y jFtVy;5<jF:}K:BUz%wfӠz@,nL_5 }#j(ǔMbcbPR=-*m;!< +$i~ԋR#$zp-?SoبRHfJ՜d+. 
I$Ʒ5f||v+D̪dkĺv!lH|3\|.J:r;1aNQ8>jmNDx72.ʾ!q˧ALs=b`bo@++޲-W5'%,7YR+=φA?21?#bѺ0V:`CB+;Krϧ93i~whvsTb&D=~]oyEa&:e⦓W2^_9&{ +FPlv.Qu >xJm 'r' DbsoU~e wA آ X$ `U+QPO!׎σńR~n0sݘ,_}ʍCUI2"NcP*㒮ڡKo0WwN ?vAe#{RucIu) [T÷?:zz$X~wy{PC:L.tֹW5yvP?)'6T}Ɨb`  yʣ^iC+u.\1#+jmoeHߚ~U/z%sԊwĕ֎IJUhwYSk7#Qj GY3F)p̂!x85 /}/v9~tGJ!hgp׊^UV 7l; oW ܐI?R8 oKw-r|s*Fޢ^֧:pܮޭFBg=bqMH2@H!Y۷@ ʲN\o|lmGt], xH~̜zwT=;Cqީ}/h\|!O/*a^70\rE2z?AU[MrH}6B 7 Dw5 OQ\X5豞ϴbU A]o4*e{Lz Znsn *dnڐ`& LsetMCp-ٴXV3O_E}zBUqgBejK+$([XڠۋcAuXQl9ZH+q V;޶/,Y,y6 ^[E#orĉ(r dp+P+N/{,SşډOi={ezc߯nV'Ty[,[VwYb7/X 2LPf!d<ў(Ex- 'J`ڮKM7ű>lBg!?ui,B4,tUvo_:k,"((`~vsx" Fl3?.Sj9*ƙǬEokK^w b':FGWgzCq:̤}Ik>L vϹ% \JZ+ xC|ѦSqdvӍeoQ6Hp9!)ClqrL`ZA[-9ibSke+;|{r%d.t ѮWx6 >oSf臃w0rgMEg]W%sd~0T=)/;[mDUu ad&{$1M?A<+K4/QNXfOS x+*WgwdfCщL&hoĮN׆m]ĥY2u6V-ֶ})EZx<:3z7sϖMFi[' Lpp}|CNZTW|L5{"oIq@񑌶n鋒@(.RpRbW~.b0 i8(i/_0*0-SO䫔| |5JQ+`p$;,!n?rZG>0b׹X_{R;zΠ"=c_xJ8Syz5pj4rYWp :cĢ#FRG'ABn*Nlscm&ݑSbܓ~K t Y[jBH *,GglW,;(R&D*F{(y׿c0ǼviY~VET3 Gb~X d|lNW Z:`((xib g)$yix*f3g2֯ X+W_7JI(Sh$ّК=qs4_=up݃\婠/_|JlҦ}k\ڭlM& 0XK%jGYȪjt6,S\VpEoy#kry͕ܳէWwpi2}Jʤo[ ,k$VMf{$jB"TȊ\NXzR\)bFFI׼XVd81OϦPOW>/#@2 / BҜWU0xaȓrZ=P1a`ގq]é\QOGJ jLyMO5>(ЫzTIxuY!I#^\NzWbgBT1"s^e޲Xqv݋QfU5p={+NPA$BJHyhWB1ee'^UgVG% ZCm]`/q شh0Ijbqߢ: "h+@Wz  T\vc"*jVALux]+C8EO92Ã11\m}`TOͰ AQA)"p_+_|XB\L8oT!A.N#Jb:tgó4C;/[Gfwn8Ol@ˌ%+|<]! ײ!e&$ DmHNw揫]6v>7J\Re 5enm/6ǎwX1Lh6γU]l-Σo&AhQ@n_pb:cɐoO6׬%3|f Τf:pTz0Upl)b2'gUFfId/Hc'VMfj ~si*х(ڨh1s t(`5`3J!Uk/CY)9$Qw~SYJ?9mHue&a!T#}6>|**tlϭh常Fo>QN/J)5/k-Wlj3ĵ%ՐdMǭyG4(g +F`.P=]Z߉jQl8 Hf{| F\|˜`d$$OP! ;L=,CS<O kPg$A)H1jLTfF  шLtSjUEAEؒ\D0*C rikJ7I^ g0yS_$k« RnyeՓT [ЛalH I410AN|SRcVhR Lu`B6:[{07hR؋T7FXlP C@;Αh/׉Peov= Qayk D? 
Lv@fPJ /OM]EՀn uݘ幺 0TY Ԋ91@4{+¯&c6&ʦ w n]*QfW0-ޒ?@a& HYS= _{f h8T_Bsjg\@.`-]pq < ۚ+3}S+(0fjM;w &;߿ݡ8w:qqB"ʚ1B^yU Y'\)0ݪFB`~lh?D`2a;m>W;30ULKWW͝ }`1gBrXL%|10#F*;8/9!\Z(EkU!L`%IQCB@4.l:<ƪeAOncLhatvT1"gїsGϐp I6^h$Z0z59rdnKzb,V:Uފ$oMb2h'1辻IG 닕r1pbrX7M{q9O; ⫙Lr5ope]=ZoSEyeqqvĜUq0f 6rFS+{T|$P6ecsSaG#msVsB\ ȔyFc>io|I`_.pJmu.\5.ExR׵qn#y0 {.2vH6mvJ)UZIV9?0mI܎ņ~.?;,Ub;R~Y"#%@4^>qJFVC v;Iᐉ8RGiZY-@f'첋WMZ n[6ӢmBIb=|ln ocQ,NaZIaq85 ̃7W&Yzny#Wo %5raD6!Opd- n&B| (V[~K^k7=fM!-Ad+䖃ICP\ղ>vddN 4RzG7X?dĨ_DAK1w0.'3xϯohhdZ=9iM͟O!T1KIwZw3dObN]GG**8Y*[?nr/Wr)M:M(ib?ٚxTehTLjխP L]u-.Œe%@)S7@~<1E LlƞL.ۿh Iw'H m(LJ4.ܒ+˫zkZmܓXbD/)( 1|=CNPfboUA^+2sѩeHV^LθA#~{z%%$–{`)CbsLS}FyV7Ê(~c~-:uercxm >+- 9>^[H} l&'Jy\Kɉ\z ȑ'1>J(I]|Bڬ)86H=І|{r5[8 d 3Ić4*_Gw\*ENOu|-,ZL 1p&.`rιU /FXXA;ȔO@BpyoՔraS]U=.K3+pϓ޼r?Q:3{Vp7" SQhc`G,d(__TA|-]CU=V mM"=_Aes-[f^)hF=}D'[GxAyJpu\Kq]u,-B#ƢmC|CO}u1^wx}R7Пn+B"8xpӒ(Z\8/pg;Pn9u ;䗣XEH#stQC,S~` [=C_DAMn׫QӄBͰ!ۈZzQڴXE;2k`zrVt{vؤ*ET7,I>]#׌i ~_TNBk֚k{x, JRq٥^*rx9mn~1lO+2;o^uO~)HpaH_WIl>~{]Zpk2a-p_* b]~Oh]wZ|PrLWT7y%f:S <bLz^Ah wҾ~T #z휯nL::۷^DUnTZӮ.WP3 iR|iBG4e,[wNI(|QX? kYY͡|P<P!ے˪ytLk8}⼇h 'g:}W,qr dbgyTM1.rU| }Z{LxVJ`oM$zF}{%A[L"JkWˀ*)!TlM#-0%{>$d( H"GL鸿v -= 8A%+6nGˆFw3t;(8}O h#/TW4ƣuZr;Ҥe{▋He7d($Us_xn#q@I}h D6ش ^#s="3~'l^)'Xʲgz`^m2$it+emTER~a?=**89^M*q$Zé}UW:LTD:c3ӽFG4 J-NTUMYYut{;0̪ٔwS;0Gۄ﹕$H$Lko13&6U3jWp{7 $Qo%ΕrG+nA*+m sK-B7y떘 `b{!|JTNsXh;1h2'mq\>@ WI0?Sh(nVDvζ|JWF{,X rv:' =dE˩9mdcuU0YH6B*E""]7nS61&-k Y3Pn)-Q,c^Щp 5! JVW#"713 %@ՃV#9=lNX aTZCӮ4| u9:Bwјf{ynBJhPPL&wA.t6),/&?#'5UZi).++O)Ԣgz@mR0x%vS0`P~M Y[yH7d{T)!BȑnubUOtn`'|wS}. 
Y&'LssYW:@S;]7bK ZZJq/Wcw N$pt@qLpHA-A/uS3hdV a⻽CDJbK%+VEF}}S`0T ,r#NN$ݒa&@$@vcs\Qdfz"x@&bB[;S˼I~5 E9O7(%Ρhx#'x k~օGմ}~{Pikg;]W?Kh|j *c1Lts@[p¶dX7F<ϫQ4 oDԠ4ΠnzoOfnyCn*`dN.6mRJ/[,ٟb)E;|\MA{8CIdU CQ._>P8S}Vdށw˅ fF6e7Pϝ7^W5"j`P klh z:گXDG̏ 'z\q^/b(Sɖvnu 'ٹ5nKk46 q͕ڶm۶m۶ձcc۶mzU~iQ5V)%d+*?mŹ'Qdxu!Z.S a4䦮ӥ;n*s9TTȒDpwA@a}TMzT14Tȝ V:,+¥\ZJ(x?*a_D\ڜ/K^kqtĽho_3W$ YSxXtyݸI\ l<8Wjoʄj"@; 9C87L'qPl`LJfLAUD)mbWY#e=/U_EѐUGrmL\+"6 r_)e $Ci~/ה]iu"x:(k#G n&(uz(hemQi*x2ċ7$49L{i-^CMbkor,Sdu |d;l7\tTiԅMC 2{|4+U3*3l͢(+Mv~@PW305鿨^\INB, Uj N^r9js{BP $:PdOJ&y1Myt^Ti?N>{5w7gk-6o[T<5j֠.F]9ewЗġ ~0l8sƎT, uKt6VnLzLOT,k%hxhYU#dY99K\SCCEQpc c(1si~#~Rԩ`Ӳj)'Qktp+| KBt7$r^j٨Ur!` \11D&dB/Al)z{I}pSp ksI^[Ls]/Dܙ5kIb^tI5{쟈R@e }UdQqG'XŌxHCbjjތZ^,:2!J,Oȉ"6|1aʠ|' '#i6MA -\C >o9E%;ڔk}?ifx "i`{ZyخzͭSOtfox<\(U[[<]L~;A7meI5o?x3::a9ObB@pmUO(OSp"u]xR깄9b: AV7fE =Sէw%=_pצ(:a$v=ÿ9Qg(xCjQ_ z{Khf VlOߑZo\fgsCI+&uk {'nw $K~kIht|,J l@ U7Kt8<'MK~@ZW@ Tkwu_2Bۖ 9ࠎ>yo hÐk#vjtNaTu~]%45$siu5zA^5Ę r 褧޺l =V JNIIyܥ.G,ѵjs ޣ3|'/|fr+bm?C6EG\#2wKn}Ƞ;b"0+}ViC5baTٓkJήcAp8BqY d{eP}R(Pv$JjNhPX.<~s484(j`?9!9o2~zv9/NndnyQ)z]0:ҳbLûHv)'R)X )M3V Rے |S:%j7kʦծ8gǎ1 zPeX@o(m3glf$Y(%KrU_ރN[ ѕI<|s=$ˤcW68tLXs,|*U6fWr qN..y% iyj&C SءEK MfgZ]ڻ+0,JJE>9DcAZJVCZCyޭ$b}!J+k+vyy5,8^3vl[>c^n? 
X5*uɆs/Bll|FSw&zX<1n^b̧ .+-P&m#eYd2?+A>|G4XtL<&K ;`)Cu'lqU_)lj# =YLZkE:/n0ͤRC"[Tpc XRܙMÁ:=I皼E[&޶ᨁtEڂ(MϵLd*zS#4ĪJTs 4XBPq;?_Hɒ;mmq9m#"b$._f&kD+3벫a8{%Aݬ/lg YK $sxtTn HQ}W5'Bw$E oDRڝȐgl(*l9>+bgoi@w7r{MJ^tU%%f68yS5o0yzZI(XTHC- }Rm"\`l͈3U;\\Y5Z #@ӿ|<P=E-\w#MD/~v]TH^PCf>ٴ 4~J{(FAR8( W&%;E/rS5֖C, Si`KĽ2_?L WO׎A ^yΙ aot]CCEscWZ2X`VH;M |'}LtaoxCz6m6qv 7oҲeBEA7!vi7yPiY74k?̸#NO+f+=k~O;'ax%o:05E֙Ƣv& L' 0S h ^:U7"LH?KH&:NBMmV턈~OQ\ɽ : u9?[ + &pS݄<´3PU5N煵a\ow^7Mcړ%лPwzךa5kZ;yHobQ=XX0r.VIpASa[wd0ٮuŤ^K`}TS?4ʒ~ZE k)dٷeR[i %zUVJSakaDZ?-mor,iٷy B#'P%eR*teD1U%ͮn:،ׯ Oє"ѴD`z=03\ͳ:aavd'H1?8Hx޿]JiSN YQ u4 I򲉕8::Y RM &(Җ(#}Y\aW}g-$,pD%H$jRToOwY¶8EYceDBGz߭|H!ebJd aQo Lۋ!`8#@g_oDqy{Gxeܿb?t[qK=~l$Rf~LDXD  `*؛0)8wGm=i>\^K 3q:lTazj5FnNa)>LLߡLk&5 rle>~=d۶f.RU9GPFgp2W=IthoBe7py4ǐ[t]ְْBCа{dJ@hjɆGLS2&K%ӢEM k!O:wQȫB8u/^݁[X IǒuvYى'xu[pa&ˆkspnƵJ &6; PX~kLQx+X$U4C$_VEuQ]-OQցƚëYX-:bP:p k1O+hu(֜$QWw%*ѦoZd{@ ޮș Ua˘@# - 5Ҭ^AO,eBb`]q5Nbk e']N04knPT&峨JGM l(5曼(,27%#Pz.kx8K*Ld!J[Vpp* /)E͹%#gbp7{1ՒqPe1>*TY^xS@^RZk4{UX-rU!WLRXQ B{iZJDiV҈ AnS>b8%f Rb_rGhwl?vȿNUnT^KE *ZtN޽e&\^N!T~I2w0gON/nҲcHUw'?1{XT~ΛQOJjی> _Ӽ9PK[zs:8dWt'-O5^A+<3Ǯv?o0yFHJl'_ZrHK"/S2k-4]&xAn8cOuWgɮzgu]Y6VZ٦IL=L3U9ЅnP 8!y-Z#{X0G#;O׆ ձ3' ~;a 51WyO^55مMq#7:.y' *48  9ޕrŹpf3;+ֵ-JD T zeix&CJZ ]~t'<k|i*"*룺_"+}hr}LgN,1-3iY<[oW{ׇJxQ䂥U#DjJca~C-D`뷦)P]"?%5eYl bvx3)[w]nOCY؝(l%UeO9\5텚VPg!c7'rq&8l~y,m/Ҡ=4?$W; dؓu0.Y-@夾aw?OpW-@?RYZܚ[W: o=:"ߡ0Ru.vE?" (_BѕEyJ Fshwf?E38;? 
{31v!r泃v_.f?23=gSAS<K 9}EM {'f %vfA !R84z<}YX 9NGn]5VeJANa8vvS"~g)քT( *7bxS~7~:dswȖwj;,a`񗫐IUQQRc+!zb{79wp{}bo72 `vTlIh C!eچcy̆7.1C)ąG\+?'x|=Iyl$fvH>>qsUwmk߹`Yd1| (JU]v6 nV&c~۸ U7HҨH5p/mJe& |Eh~z⚛iyT\!KM#{@A.rW-B4#_h$њ񴅗ds.86~SCF<毡:.'km^ّpR%0;uqAo=Ew=}@tM.E.GPw]c\mԴMV2ؿϩ2y(4ė>j$`h!>_nIn^2z8|8g; %c֎IJh?j4ZԌ]l#tYPYjYQ'fB3.49!Ѽ]e;Tfrkw)Wb<^{`HzuCti1xk` T:vDK"H@0y![Aqx-$R5:y| +%潟mޅYMN Jo@]zi=G{3sѭdXrp{5/Dʰ4GKJg.ń>͂,HǧIt[ e3SfDEk䟼tz3ή-'ܽhHktb'0M3Ƃ߼G.>ZsC˶\p, sRqav٘`vd2im.U jQd.!WkH`&&BL5lpcwdӔ`ך 0`WS)R}d6t35L>5 4*sMD9]|0 1ɉCuѢ]@˻=;D]z@(prNz0Ŋlo^Iq>ZT nn`ts{Yז D;-_Xf<7;fى o~\.j=;ۂ:RtFB ]3Ľ":8F#>`QIr؁++ qƋ.P ~Z(' 8ԖЕ36}%jЁpЃrCQ ,I $|o?bc6} d2OB;ȩP5Sʍt u&Fms<ȘqYƄL6Hٕ -X'Ɓߩ4;^3ۑDo7܉h0?B }u/S-nYܫ[Tޥʣ5(Ƕ3i#np CjYvAd7A߱j+\Q~2ù'+({qIo5y{dBjJ<5#Q5w ^N8 [WE^z%}'PMq|][L:o Qf$3mU˼|/&Sf4~&xDJ-c$a~QR!$~yΙW*tYCGh;@ڒdӄ l_U,`=1>P/K4j;]*+Z"ͻ;3цcƨ]sI/ "{a% 2J=<~ڗ7:xYh4 xO`TG#l\.>v/Le, *gcL.s,rk H2IXu+K$9 z%2 _H-2'K wZn0?GlP)⦓9Gc3l5* V'"NfO H =tA1HJm}%P-L2KgM'pUWfj5RgF߲"?dÞ2'+w{覾źt 4(+J ˥]B/x|Vȥ+s\JC%a"\xg_Tx>MQ#E )}j u ]'qUTL!̐7x,n55`Lv&xeIt@2ZڞaϨ (!b mUKMhL>2&vO#@oZ {+Hڰ*6ᘯY3>O6pʛDPDr'jp8#n/pH-QTRk>oYQ/RO(ڀI> Ҷ^&=Czl`m&e[yƲWfKʐ/sYƭPֹe836%Du9se-$[e3(´`͔"}qsJm.SItsQ"T,gP8&Y*C ` 4R,m4ŧ'qگg2U8|Ѭd9,We\ VS:,@vtSZaxM\&ZN]wiǕuGP9U1K `ɅrD/$%H#2ABSb|GBYTVÿY^p4ds-KA\k]1F)T!pځ.Uw+s7|R4j%dIjޠ.[ȋdjG2jtԖQ;iROQqv%as̬Z#xLGuJ}+ W?!”*]gU@Ufɦmw":v@ZVճOV}S@Rf[b{k[6h'@a͢L2 u cNr%rL# ?_KǝkRm!2o?/!/=ڳ;NKߩ,?އ~9nڞ& %nU_̵l+':\:@T0T!5>)UezS<&%+W'Uz׌iϦ ޱ7z9Ꚑ 65/زx>gt@1ǔ4y"q.n4NVߔbGaBA m8u> stream xڴuTh HwHס;nF:%%C@)q],8<; *2U &1 PPTR9ԁVnvvfVVND** g Aq=|Ty^@6_@dfVlt` LL"gțۂ<\lmyf%f2,Ђf@kS;K hiHkdUT55AEBCSK ))j3d44:[15_yܕ44uUX~`]l~7j03oj`WKg_ ֮,,Vn. g+fGiZ۸<@ζ3Wc,tךḿ._NҠVr+ _iM]UTUU؛8Ln.do 7g_9r'zwL\Ϳ69ظbkl))IKih2)ρI _ֿI*G`W$m}r9{wm@>/対[9h98$,B-X@'ܚWf嗘?G#gc  n@??Fl< sW𘃷 _,A`&SM{`Z"(\@.O.i7;;eS{ :_3S{;? c)2?:iO_=[,j {1+; xEiIv;6N-ui4uyR=]p++I1w^r0Y8XعΦ^!`הX@`k%,bD#no`x,E `,2;E7FE7sQ\~#0E F`.ohF`.ڿ˛E7s-MA`KS{G5r9gߎ`Slb.v.H9 2s65閮OF'*b[8\? 
Ld*%]1+x1~nOά`A%wr36J(]+'iQϴ^<YߎY{y*8mt-%®mw7d;bJ?;ΗPBu FOhK6Ad˽큑+i* rn[~/&ȱ Q Q/A#*KŹ*=`\>*-ɾ)W";3Ї#}-"3d^R=rIY.&6II?'s֩C3Ne[ b&{%nmW v.*C_a=IaJLt]fJc9D&ٲD+Lg~1kwW9U8l̇?g 8"!pNa`]Xɝ}Z]P)yzxQ6O ߰tpE iOVPe$iwJ{&L&qEk t&z*axDGӼ*frwQŰa{}޺9Tx`.wcefC.y~p4|HO٣$Wo1zN?-orB3+_Jlu|KS?L`ABA_T4y@+ p5DѨqXP3i|Z ,Y9C4%@fêT'mpPH[#bk+I i"mXE(=i}R֛>EUږSV:}ڑz82nMr(fkOu֦ew٦"H XN5F-ݜ͑P h wkql7ROn,T|YJ],&|?:uՆQZ'&7;ڞ$&T۞s?R"F^%4nDvConYYTci+YCAs)̄(l7}hi29DKS(H13Wd;nђfC :ji"?H fMW&4 |a/6_]E)\nnM 41a,bFMґoSA \Gv|S^Ӧe8CmM4͎mhߌWO]x˦z2(p&\Mpڷ ؉ 'z(_j|išAcW_hqj‡4q`Llh[z_[WұW x{QNg9 6r8zFy5lJ@f(92p33|Pl2,4aZM0zk e2M!}1x&m%S0pH4I㎹ 蓈#pV7= O7*m,N(_pc%T c|+x֩fh kVE@.;nW[tKwRI-a<7Kd'YfWb[zR.*}#ج=McZ̖=ӞF351 V>EE2VMQ蜶ؿYeSm XޞbވZJKԷCIƉYbP$wpk$XLX$Opn_Oe,pchPiDbU4Rg\,t'O:Av`~>h~>ܗ |!hߝ.ȵC)3nY@;;˼5=y廞N'M#ΚGcs$Rqmsωw FV"'SnOj1KKƙyY6lj* ڲF.{LI{4W:V l;h%x:b.GPޑSSJzb\cGJPfهħ꟫ViX ]dֳ<? LBiS|y<¦&=D_TfPA%+gܶx i'z$("їsa03Vm섘*i+ Wn_8 nƗ BP"Gԙ D>XM&v44wWKv}n{!%yIŘ/:(RNHlc!.}j6c cm1NK]@n$1&K6i./'D:db+u4Zӷ_9fŮܽX{Ux*рҊavW5M0uyHǀmEh3=*٠ŅV nThRvhP7خD>pbӺk.޿5c%m/Hs_uݷ:!*tL䜥1R sWs@IX,X7YXg, : xX{E-V.ǚ9PO: cSdZRc>!YdptŘ^k G(}9P {M#M,6^0bU7;ZwB'gVu_*QXn 5׋|tX#;V&NpkٝE^,挍\~8J9'Pєs,$DS%{ŃyyK`2њ84`Ժucp·GXjW xHw'lZIj~Wb|l]9 V8N1I8c>d: 楩>I+Rfb e VKB";r135s`eѶCy2\jII8L#$19{iku5mh۸eCcGHo1<ĩxVS}0lBC=,6BnkvBd.)UTdF&I)lg α`>`tQ4}s>4sÁ@{BWǮyMFQ@5%tF/9zW+Y65ŵt"dxk 6uzl!D;o!B*Ӑ0A<KAqOR=>XZ[(bk}cqA7;S;w 7Li] bF&8*lUZ=q`i]]>hY&'?H`d yjЃ݁֫D6x۾-hG:e+2 XH')rḅ:# n iyHEKւ"I]ՒずI,Ii3q8dOS)ahmK^8^?a;в%2ˢŲ~m:3ݥΔXi{Ğ} m0KXocJi#"ZE n3~ MxP~yy~ EBxɮKZK?MY27YmFf 3ucN7τL;'>T=T bna#KE^Sk( ++hzaQlsɘ= db𓕥-@QҮnHbm3]Caj'+{+| nEH$[qDᢒb[~I%? %g3 nD2sJ,b̖FBAſIh͈D*Y>= 2!˾Fr`)i=or\D4N=>D&BNSI{p&/O}g}UfH<sˆ5ajH|%Q͚ :AgZCq{G)$]6>z?cfL5;{Cs! 
ɿ\(Qэ18hߦ[/t4uX$J{-},zlE8Ox#WRB+b^ʑw  y|hm;8w,4ET?wS*=j>!3} AAD*n ‚"(d4OkRTŻ?C~-ArK ~Ѫ{T}%s_YSnn`nY4h!afƉ?}bOگ_#G9ؽ^h'VQp`c1A!6HhqTzlfZSضt=}O 9j-}&E=lYoz'U.!Yy+v \p֪ Ua~1ܬAqg^~u[0PCv3{y^R>͙Ae'Y>+]ʖ crYLQ%Lm.؃=L׀cxY "y-UHNhݞXۼDrWQ/uV1(TAd -m+ؙuFDpHi}^BIocKڙ~'~Td"#IQnȊq]QӞ2{vL.l홎Í\rJr]" 2YHw&N(in?jf8^5Gom9I|* 8RCWةA/wrH4mT"AGz~'B&(;!N&̙[nT%'sld~Hyg!*/Q[do0/tD禟+:.#QMc~.j.I ԏnk]ᓶ: jk3|/r;HwFJURaXílhr@nʯB(GUڍ?AM df릂y%;^ЉLs8֢ec(ǁ""t4#T:Y~5#ґb'M CYrmAbTv 9`ҨUY-)ɢԠn[-5 pO@E[8ѻCp7G%#!\ :(AeS&:"*8rx 9vPR]̹.,&!kLQj<njiSVtr)gqOc|WUjFeq?.*9APs1vXF_k#ֻhTnF}/g:Fg;,RZ$0S}U;Ai߼U{4 jflfO )jȇC)fKA&W-He%GS]]\|:lj{FsRPm< ox.##䯓4_LȅhiU]=IX*=]B߶S> YS4:^2{gu \UkT[64}ɯGbJxfJI^{Q}܎8t~ƩO uKU%\HLEwƂ7 -d8+l)ź/FpoQ P 3`.5z[:0Ic?Zt%mg Wg_mE6ztfi ?FGet-<0~nm^ӡ:wŒAU7~iŌL}W;PUfսr^5fXgknQ(?YA@$) {*` qx4`'$XgBƯOP;8| D|?]w둇*XlE\d]fFv#eFZoc@ [DD@]=>җBOFMb_>t𐼼\vܓ(}zÑ}`1/D:+7 d>3I]MKD6 E>ܝ<^%uPI7G"8u3 T+vk@rӄ|hҿy߆q}]SГyྤ&_K̈gR.HFӑBqQt環@9x~Bob M{Xor}zE"1Юb$ksa2zrL…X<6dT=:Gs Ȍ9}Gnk𒦙4xt`|d9g2m(h6J8 k|XVfthjPOg.jߤ .îD|<n:zm8޲ʋGܞGC^)a1-r#yіF4TFO鶁erS>+*Ҥ~Qبa;}TBU篛[9J4^p5rM'wMc4汪ٽ B|.iFЀYosSY5,R'J7~Nff,̰8U%31B"E3#lېI5 є dkuRoC!xH6"ăUA!皵+rzF?|%ƛ 6ILtִlNV)l*"7qB]\'Xˎ(H~<{I˝ղRh-it #$!*Q:"&vwgt[交~(VטZ}؝Hq-( b7ըx“ 0QGhi(w`&z a}+3\35uA!6PRkTĤ;+;N2 JM%,9#1RanqH,lիvR~.Zy@K#ه ZJ$M,!7GlݷkBhO﹡@@#;ZƟe(%P}Bi5LƉ ҇Q3PAR5L%Ϥ Emuyp 69FfuH$ZW|H8$< 8YL5}`}tSy %+Zb=S ఩q\v>O)I*{'$.>mBaxY@6;7m?mػ*rԊW-dC&d'jH[W;컚P?i鋎r>+,,6X̪Y}S`P;P.NUn)G5w@&HCuF%] YJp*JzAQw!&KI7rKΕH.mv?o%!I 60ͧ S~Ȉ (Ϝ0D h{'X08_Jގ!n!,꛼a2!0 ^2f`Ͳ}PsvEL~b< H)53w`m4YeFq1=R}'hmOK74x]D;˯FNa^Uؒڭ~K,D; "y櫥8F>iBٯ9FM12L >m=q hA+4wrnpa4oQpfO/kЪV!vc=q= +T宮 a0f7Z5ίAQNc=Ǫr> "z1;d]8wFc.']cF\;㹣1 IOp0>5E}d g Qhp:?нS8/8m/CPi\HUj xY T?n;7*r> Zr w/ԑ{K}t[^mP|RYQw 1&Z G Ό3޲_vɔx c/).RJaO܆zZ9W]zDkʹ_CXe;:܂?5zg_0hny:ԃ\ j@c2+8.sX!bqM &zA>(kU?FT bm\ܝœl{Ihq29_Om`N]"U$mvd,>BDژ&X.]W" yZ+=kWfr;贯vE42Y}d)_|r[mnz;bhl!&UإG8mmDp1^6y)i%w_$ܬ5^;DfPxvTݤ[أ{CI2ktVYdE*`NxW.ae)w[OJ4/rZ~}:bW}eLJlz`[뉝8Ň2QUyDY3Pԩ*x.Y1FӺ܅Z]#G{%pǓhW}â}lXy0WjX&Ը3k [K9E.NĶEv/ɋ ]x o 
N)8?[Phԥ!\&Om?ff$uV=.BJ-QԞM6ſB_*:عݯYxYxB$Hhq;x(n!v*RZ3&M>vWF cjdɰӽ'/xEhpџ1/VbK[(BR9A{x EPsbiBSURƟ߅GGu 0Pɕ%HĪ~ )O6)^k8oSXUZj`>ۈY![ofAJs (:oj|6-MyE}i&H%So0ZG]Rk`϶AiB"\w )ucS\^d$@D &UmAWHc<\o Kݓc=7\{i\6&oVu@C04ZPO0tڷTZL2MvbmtoKJ WѣmRf3b Obk7IՋcPfqQ a餏`f5 =bP-E0d0TYS2)ٹ}ܹĢu$]\-8ARٌ{>b}O3*Ƞ Q 'fY&ndopH4uqNw+!o}x|NCopH)V?SuIM-N HΣzV&X4Rq/NOa^g蝎cpX0b iE- FI4a--6 ڋb!L tn?(%nG;x"7BW(S2(^хPqg0>d&"O8_Ȃc)?g9A/ N ! W̵"a”ėή<|=X&?Q@;R;Ys .Liؠ啳^^uy[_@39 QihK2Ȱ#i`m/hl'c%1ޑ*<|>:*Du௷ /?^n4[׎6a l[E)5ޙ#ó@Zn:e3'W(å$Vl (o X>֜ҒJ>^m] OiKyi2+̄Q;N ňCt fgX#&`]V3t_N¤h/zD`~$?*9w2TB1JbҾ~uiCo,(&g[*@#ؾ%تFe8wnF>gBc' f]&}6T5nU]jꡚ=!'"o@f+f\iq$VG g9Qljs UM٪lB~kDHaU X~h"E)vʕLx@@8fYX qn{oK Gl3#UYb:<٢VO2CBi6L̙)/{zk(:2$Cq/M9 n\w_e8t硏:LNlM/83/;R.S/v+Jm@ӳxn}DoCN/H/Dk8_^1 iE NKZ0%lt EP[u'iv'n\Eݡċr2,' s.XA Τ,tWrF1+5 '@SR{=`!~gp;!Hw C]`P*JWͶo ܵ@i17D<<(;zCS؍(se߮~Ҳ7jɁ,@OM'4@bW,S#SaN~%AÿrjVR~Fn ^#{!0e˭['}nw5=cQJ\#hQR,ROD8|7dny{w1QV``}T!Y~"N>/(NAQ 2M^omfteeo/ˎh@>]O`#n)#<՟gZH\)KvR' I.u;ԑUz(TmBR)P˗=v=ol8~S:C;='Th? KT }X!_—zkV#j9JH44%Ns/"3k_Ɖ-2~T;tO.ݦR@VfhD&K%( ^*S&d8;R1Px}~bJOի~cE#K TwDrĺx^`sa80K|H*~|tAO+v|`>ߥ>EzlMT a>*3eCmpa0 < +1iR7=4ɦJ~hA[)U=:1/vsի}]Q޿֢\Q mU0/o^CY9"&ٯ&>&4ܳUO6إ"+]޺Pl/եp|$M͏NA]( buACBFDe'S wG`- .;kW+M dr$t14>^TA64y ˟ ډ+w-XS-?S0_bDjNc͹Ptyu? 7DI^9S&^dQkPK[g!C { W-V$sCU1 w|?!|!*71w!!;G/D.6+\uPf*H'9}AN$5vAtD{YS<˗&2[F`F, sMUBC,Qk!8KitRynLR!S>E9qԁ}z URՒ ^AˋoEQ!̭%g6_Q'|=M ߴtG tY3k{?W.9yz$&SZ iq՝Ǽ̱ gXgA.-:{r^9&YkHGDڧHd'?$vӬqN#V7WF7hsi$i7W,n "!P% kpVuܮ@Ԁ.y*Fx=?}Q6)dpk< t%n0w9<6R-A[  ۠03/V ߀SQ%: Qg4[,ԫ _%aYSꢱ\-mNYH-zuSLlTRJ9[wϲ2--Xu/ 0i3$n\%OO ƅk)JY&ڰq xx0n p&%RcxԈ0dbR(;OFez'.{ db"ݡnv )s/fx ,CJ(!ރt~Qmд>{ޛaǽĹA˫@]Xi $j%(Vq}֦g݇f[C4gmRKt=X{ì|(Y1&}q8OP;yѨL3*`;/i;6PK%({+ Y$)ɚ&{t޳"OBկ5"|,fXc:.4?^¨C}*i6C7@jpY=TJrl ) 0s)p{*5)H}u|Z4vO8@ Ԍi}6EIEMWW|jh'3N*'.+mgSL0֖;̟bDrFFf0]ipj&T #=@ P' 2w1`?Io_!  
|фiixjΠ*n/$7۳cuf*G.crߝkW-5rwz[ B_`W0QD~௤Qr᠓'?LсFY"anCy ߷Ԯ-(ff 2B5ųE_FƒnNw ׋QO}٦+HNAEe60_%3뱼Zy'T8*4@Fj2qP殗~>68䟿(V9D&6;9T (8usN̫cC&c( =c1:78NS fnڐz"okUfM>AȻ8>1#sFѡ.Pk1gm{O԰-iV\ M86]Y~u&ɇ;j5 N1Aа~p~&,7smgյ{5uv֍Ef ZG ,MI0$P.=`X o<#8xc皧5ˍr'Ý p\(Kؤ>4p x3Y"4z< i˜~ޛ pl>[!h+*4,@)FDJ@V`:Bػ'h *iK#-Vݦ TA^w, J5~\2@!ɒ.[lfeØ15.q>ȴDHq2l҂EWNVEuPZ΋18Lv k˿X!H\j%zPB;Cug݇z'("_T۴$43*fx@@;Vm+g@;{*]w[]eNKLRVq٤EU=B2({ S0ҍ#Wz64 ! Uһk@6(JF dXyy :ݣILJRU^v?R@ G!^SC^k9dɈ7H.DR}xxv Mt X*]v[Bh+(7? &%xQ^ǎEK C2X_]ʩMꌍJ6qtb1 L28”p} (M_~hDYB˺ޟȧR$ԍPMMsiqPU$9u^MLnrȒR֣Kt,RyRB{p=bR !o |\{IJ];>_ir"qJ_}<ެJ^?f41u=0v40aEP759NѮ*#8,![2"ߝ =>ǨrX?qxGжk -<)yG%ECb2Śb>%E*OVc]5d}- {k8y9v\-uЎ|77 ' xoQLʋgٻ-pf;r>0bփ+9Nd 5Z8T*#`H Enz¸o][Pt8zG -F1fR9y9@SA>B|e(2 V1-q (C'6ʳB2n%W-_/ԀG X};v =jlR>?#{C|-ݰu?&/6&ÑY25p-jX4c AHI6f/uGbZ (@s@Z)~T y/ q6kA~` 8P=?U7DtԾrpw-I6rQOhjwO6'rgy4ܫ0sJ%`f&I Ehe,PDjPGy} O %;>5jk#/K]Z!qq_Z+ц%-8H2%R&{v?-Agc:T H!񮑖 xTIuwe*+M-nH+]V -9J&xb+4XsT/)=ś{:0zB)~BQNc1aZgS7[̈-Yvʞy/KC1JX6e.h~he{>2o}U?"nH; ͥRoy,:`a"4N&(̝.vӬqN#V9?lKע.IJC=P>]G?v\B#D ͞EF* cz'xuCu>*zROhZYH;V[c:kުt\ҫw\F6Xlg *}1ZBdk2P M @%9啶.QM~he v.Xlc62DFN֨N;fuBAQֆtARͨ48X0"{(rQG-V d:8=_@ KBľr`9Wϣj."|IuMOL޳.S֐c!G{ԑMSGF}EE1ѻ#u*]eOJw^ R =Yq9W23չ8:35bܳ :A;^Aͷ:Rm$ѷK#o𞯌/ ąÊZg5CgUNm5 ֤7A}ZӸ']MǰԤi|(<1a՛>ah\\B$a[&\DYU@ogOuW=W8M#':Zh$$~xJXHStd. 
z|HVyfbT86ZC%Ʀ*Zr$^  +ɑBؤ*4"9J[Q?ó5Z_cHVcRoR[ظ{[~6i*tU}w?"|{jh5l@Q%'+hN06N>,m꟩"454x (':S]\74D 89\ej4cAQPhbN\' tL($b<0Xin9jO L -C"G&qP:e>z7?mBTaM/^h>6;# 7*Aei9[J L^f@Kww$6Zٽ$OJϻJ8|ѭ2Pa0Lp_4ٻNC* #u3;f^p ^V !v( k\kucC@!C/!z1P5bss@YEj4fwx̎O* JYS y} ` l"چt6[uZ\IK?0Zʤ<2A[Rqr_{ar?8הu(>faNƭƕ΢4/M0b+vNoˬNwfn,xOj:S(%M{`G *T^zW-z8݅:P7 yUoO gC<^tJUa*NvgVhQ=`8yPb`^]8u =l4R2fJ@p^܅,q"c_8i= <2?˺֯58ϸ~'kw 6mspHRv vgNݍ&; @SV]/9!9 ԁm&ro A:6!q1t{;ΈIi@UNe~4hEZ5*Xi etIK'.veUmΩgJ)_n D!y1 Ntc!kC' kӲPnKvNNLMg,eF C4[MR.mS< `gd^'ݍ.ey|]p l+*='9;]xzF9<[Y3Pϧ !z=yD3~h] [PL-ъتʴFmx Wy[y]LX,$Z):@02dS0J9q>;Wtu}+յߗ͛?_HUEX/Zf+kY*/FQɐnA$(ik!'!diĈ6Ux&/^FٸNKI&km`b[ຣfC.v&vZXM/ܞKEҳ<1?M5>uEe]гn]SnƾPg gNyh᡺|DiBx1R a\Yq3v# P7ˆ-(ft7RIcuo֐ww'?j=fx>}eҒhYELGԾKbWrTPeckV4 `ԂkB_ "|+ْ0:mۻ( PJge Ģ6ϰ 5!t _QjS:x/YƳMf&W-ڻkɷoqJF B2tOz%/U|Mޑx ̷NZ« sEATY`(lm])/ƁL.Y៊ u 9~o} L[.cvS&K#oo*Mx|hw"%C,@z)Rh\nk]8VA]?S;tvgm[@'Uٖ?N',XQ/PLo" \7V6BIrZ!ͰwBj|JbCD2(WRBn0 XSX(&_Ai:㿬NJL`V?lӰ#@\/fbLʿE~K u"d/ Km<AbxM"8 B G^$ Kv8= |`qMaRcfKɌ֢Vk1m-d`p_{ /y 3 u?mwǤ^OFyUV a#j8ժRYYFWۻu (s䞺Z!6|J֕k=! _-,N`DUzԧv!3~Ahmr5Fqoq9Cz =+iwu܌8`UHZ fS?xthԁ*ʈ9< O&T8˂#]dЂgk{#2B hBt=kWL,A;j4J4wWFP,_ %|)u_ >ăH]wn,Oz}5 ʹK FGSbC defCQ<נ߸!ژ씁a`zYQnUwΧ|ܬ7Y(Mc* ]?%jpS&c%^ؾ‰9;rlg&VxH!L.o<1cZ\Vݔ'8(|Xo@XcbxyѤj5aUscgpUrݢIA Oq*/w1fhC;>l ,ʦpO0=-4:Ԥ%1ݱ®2~ꥋF@hc_jtݡr*O83Vcӫ-C@G@7`nanǭgk(y(]`fq1Y3R_ @1P@W*HlqyPn)F,њtaaEs+L|#%JtG!W㶅'2nrto-GJŝD~+Q*k,W¾<KY } iϠ_k Lt==zW"s @k[TBal#| y/+KU1O ߻hYqM,iJm X TJ8Ky横d|wbGE >\r&Όㄏ(7bQPd&!e#Qy"έǕ :fCz{{J0n-)Hh+ȇPK4 qo2BO'XGs ߊ ùJIR"S jq~xcdB uBGj! e|bob^G.ÝX/e9icqq@9whf/ Ab{'oQWg3d`t8 paJknjqY2̬F[YLڬM{e_-_3Vsj[jЃO`ȴҧA")_nȬo?KP#|+jyQHbP1CiaT%4OăCԕjȸC y  }<,I$)V ,lG+ \'Kݙpo[d 'ܟ:16!{Xr5(ޕ?4˙KyƟ-֘+e!!%N0dpiw|*q:&9ËN5X*\rDYrviUn1EvE"fnYz=|޾VZˇQѓg륾70~{8-7\ L & 6Zc<3s`{߿yZQ`:sZ_=ml,|)CVVR9#1TkA3 X2gSws"`V5M{r\H: ^74z"~ypH<1j ǠUS:DbqGt(ĔuJDeYC,M(9dV*cxC$Wq \#* . 
TQ1br[!OKOT[M Ԗ qL!s[ aug$=OJy.2yڴj& (ت.0#:[;dVRD/vw-ur`4jʪ"L3G 2cB IC&!쿾CG2\uj>珸׵Ɠ J:LvoRT9D^Ȑ{aE'qGã^rDH=1~D-rY ۠Ckwq+Skq[P`s|‰к,Uu@6r+O̐iLP ́Pv:D8μye9r1%H1p_~#hi)x2H);X>M-Lа'ދ R2<  T?5jq QINJH\"ԋ*Dl8+_ҌwI L?^:ïo.L.B1W>O6vbvOz , SN`5&EqiTO*VGQJn endstream endobj 1436 0 obj << /Length1 2624 /Length2 19987 /Length3 0 /Length 21517 /Filter /FlateDecode >> stream xڴeX5 ָ;!Kp=-$ ݂ׯ3sɜs߇Yev]  Pfbcf+(؃XdƶVvfVVN**1'1$n %S0 ʋ@N{8e{g03D YXt1{'+ KLL#eػ9XAfYffDhLƶ{s:P&RUPVcVsqpwO.bjRqEu P :߂vWPQQ`c] trM?QC2I jdo vcaqsscpq3;Y0;3`K~ @rv[ii% "1m68Eci< AC1` 4;A @7E!}1v38{7]=wD ;{gfK (#)$<=; f;/D!`ggBTd&fogwĭ };yXۀ@^w\X4@V.@CDd@0tM-Y~5+lŐ&x9;̍m>V@//gcW 7 sUA+ 3]S:5z̀,`8sKVH?_3c;+[2 -TimGg,i4SZĿ3,l&6NfV.5o-dt!Ϗ d*Mm@@gg_* 6~' `TVWf߹Jdjof;9{ B;d̀M d\>s{''"[7A?"X$Aܬ$ 6`8,2'EAv?®AؕA<v?§A ]iA>/Dg0C '+g?&?'L@s9?o? HEw`Ol!?9#PC_>J!&N?@e` " k6l!+6H5757לlw9!I:@O8!u;B6ҟ N$ lkl/73_bg 1ΦN5A_Rǿ K~Y/׿6럧?:j`'{׿L!S y r_T6ֿEEݽ8!W C_6I?{@w)¬)uJ]7_bX*^R!mٸ ̈́x9?Ɂ ~.M }Ѧ Ķ}^mH,4S4U%DdYa޿n_6;O+ƸFЁos] E2\Xiz,'[ y4(VLqQ^0ΐ64]XHÊ>JGlpPv6PEhP2^Q:ihg?,UPYqéݎՆR;z|P>=sBcex{T쟮? 5ᆈr#͗ݔ$>$Df_da`¬,I,֚Qh] U]^ɜ_\rOf~))_">̓ /+<]9?zb>4oA4G{R6p|nބ]|&:Og|A{1_KXR:&A>NzXN7'oE'gi^M7QQU"eڹ[;ۆlY1ySO#ˣb|6x{sjpd}&R"']0>&^&&+{֬c&@ ] &RiC]d9KfBr4M9?) ![&( _ / ¤R)@L3O$?TVIIΐYLe4P?7~>~)<4U1) 3`je7H鐚}Fyە a@dOZ tM3jeִ1,}@o6[ӫIiZUcEX>: Phldm)Xg0jl (!L迪" !&t4/9l[&rɴ1uY=49i'ѓb~t]cW-ȥupnP75OՓjӈ7Qc|AEhl5%N`IAik:lm{ָ&૕>U%I}O-\L+43xodb{8eD_,T79;1ϡ¨w||RJX#a }xqU!_\6'* ]D_lF«HȒ* DQ{~=QV6HgP`,1[F1abխ\gty+clGD-%X39o05a@_8*1vFŔJ@WMg*6M G11p?q7PV?3-\bAU^Qn"S{u4CsM0EX$֭cDwwQ~ =ǒfmͪ{ʢM :41Zy muTu 2H~l7 g*$"k8өS5fuĬ'5?_FZnK"!>Pmϛ 6ZS-q= ZgL~a2 Е(ΌLTç R=#LxOR5?}zBhhh8r!sWQ";,Q Pn0Em#ۻԞCjB9Vv 4ɍ .w1iӖ~a\ZgҘRUn9ҭ"CItT=[ g%诱WU5:~^z(.\.gl&qfܦWƷ$zuQVʏpk9 势bijAnO5'T/}Њ45RL9࿨wt"͢Bl3$݁sp<'cZBB}zaEvc# ʖT͉gcL6%Jlx;_Ogb)67k,I_s| jP=&44c. s})Ғh5sT d^ݢ"JوjDD“W)25H;=Qv9#Qrl1kEeS#jUQ =??IExAt_;6cw-%KgտJ|v;y|\xnFRq#.S? 
U.RDnYr(qmG}i8iFx /+.&x˯L Y=DkN_V'ta.3 E›rVDR6t.:Cud<|mَILK]Cɤ6>lAՖ[QŚ y\,U0uvĈ'cT~n#_!`6] J,6`zUA.?3]bޏ$E&`47ti3dV9&وhAM6<^ə}r]T.#RȔLH23|3}m'GmJ|3` -eX|)Ə~lmd%=*CdYӢ}rͥO Eby՛y 9O}Koz~J ~Hg((b(2ՔNdmN2KXɧ\Hl)EeNc΀dEA\< skϻ/)#6fa3ƪ'ko&V_[u[+E#j]a>ZGp~r 7ULHκ]C0>*^`&Z{i7cNƎx~Z~M+T~-PQUW-XLc8 *Yc! cvva e vRqf5U`mnd*F=ON!e-3 cSCGp ^w/ rXw[VA8)_#+2|T}GFDtJ ѶLHx*N~H}-1 ZQr顺yѽda5)]{IݤPvnBa0W)fMu,O5ۜG(siI\uн=?u T/\l ɈU*ZRgy 6}2=Y+,~m6Г& `uB#n ٰy rT,v{EhB^+0)PEDtՙr  ]K)wyF:k4ejhGd[0fp0f .qE̔ajL{9u|cfsXS֒G3ߌm$Ig@yY%S&.@Yo-OIoK[ӛ|Kή[&q(_#ww*hR5#)& ;10ƇTCY Gt?t_hΊ%XI43&Z\1yk‚izIJbu2N,b_͍ QJ){Ң}_ [w>E$_8=xk_#ҖxA/aE5oWW ﷑ -T,~ޛ폡 DyE*.#:zPTW& MHAIkUz73Vkz=MRV0r4J&fc.ME8#eIqdp.q0kֻČϗ+wǯ#~r #ſsm#I zP$2T>ї &_[nR,9gߒ4?-v@Y~>*ir?0@( rHT{': |;ZSg굪0}pJF)]o/̉ f%xg=aC޻OF8RX3t jHAQ_m:W>mh*}NPeZt\29o CS0mv`/rq'IĚ鞢K taV>n+l\kfZC<&+;ɍW `F#&.DKr/v2lzI Pxb[*1Tj3VFEy(XXcraQ^~ۦ}ђP 3!a}:][u1}>mFIa}w+D+h> {^ŤX;#$sGPEzEF@Vg+%=4ׅ4x}8`6j#ÎagWd/Lu:eiBToˤcC$?lYV&g? kcus5+%m5 2K~˘ʩ f1*&r'po"JfX@fߝ0]l+=A[GfvUC].J۝=NA؟\>ǡI[/. Zݘ cj-(JK4t1-_s$hM׳9B(YL<tDL=VFs* 6˧9z2t#8|H6s+PxEfQjp.=;LR/ϫPCU߇/'Ïm{y 6VwAΚmNHmdG0?a.|zxNňVQ^ ]A&.xb@6P-w"~2<獎@bI4i9Zƙ9U KL}}>?ŜlOiȱy֦ [co9{p{OMUExeٍF82=ejEnۑm<gW'rp+)v=6܃UgS8n6Rxq78޶gNٚ;˲Խ:@AVyaxb˨X ,$K ?KWO ֖PC$~Ot.lz ɣHc dp O&2h?5 Rm^U=3>5&bm5x7MU"M1T9i,@txǖHG !#n$ۗ\'GKޚwu`7ӣ :Ŕ+Un%T{.A0߻ r?h}*^ɚl EW)ba!'uP 9qm|z"'ovE.?[t|?u0a| 9ۻFxm͖ll`n. 
",)tN U C{ATgcsXˠ^&} >V;_,BGKb5(NYO_,%i(tST-yTLN[4׋p*!; ս If<2IceΜxZK 8p,O= ya2'<F DUߓޗzء;zsĈԀKO:Q\fſj䚥'kljLlgg{J(e';j)d=@+<^{U.UCl,wad;>e2p9?;ʰvV7 ,3 KTcushF֜ 쎺}[HDmH/("u>JHk+hʛ` c 4`1ϣ=LMl"GLy,.$ʛrz}璋EʢZhIKeJ7ZڲOV{(}[@G3ѿ~۠pcOF RJ(2'p]]H.¼Pa%T1X˕BTSZ(X .$a\q9f );M#<,2"pxbԎo^)900G':]שZX>3֫`5ˆR0@ubx*;pWBp#HQ݆F( w6^ᵢb$ hG9&jS_%],68@d?2t~@e~\H&u6Z kz?)E(B4]>0Gn @O#\}`\C4,z쪌kHڜ[Ť#@#*kDgh߯"~;-^lj]%MZ G MΉS.уL!M5jcX=2āteKun%O~J)Wz1z Lئv"[6Yɻ۰hsKكuV6֯oYK Ѱ6lHGbCcטl7ꆲWE^sx) #%R쉒JUxSv|Eg>'%1L/j៾ƽ)F9fąSjIxy5YǑR{y%; !TIyj/pOL2hV[LFehדCRWk{QO.Y@խ UcR%I"ep3 \8eQP0昋OƸZlIm5.?V:4B?JRDuMX10b}\Xyx[ZH)ǿ:dTۈH iܑgJQX/c,CHo+k)RGG#]鴦ρ!-elMHjaP<=xl8O =S]؏IJpebBpE&obdAi[GG FУwRnCeNIBTeZ53( QorJ[ |$9?8#¨=梳 \Kg,2Oe&pj߮i32I) i򄩠ʕ62%釬f5êk ~ _^R lFM Yi5phg#a01j.PMTZ&޻ϳ{}m~w' 6ٙwD !MUC _k?BMT[ߝ&NGHYzu',棦ݱHA6 q-D|t&]'hoK(e yCd,f0GәSN+~)}m"9f9˻L?k?\3ѧ콋͚QNv-HdBUF%PDq5r}\r('ӡ!>Wֈw[B\'|H CU0 >O-.Ĵoc!"4gfLfniCIjR ػpsb)-}  `;v[Scxg!w\2CAt Ae\[YʝoxR٬|gWڦMoLJ-;)-MP .]Կ9L/P!ȧ$*zዌ?՗r~\iS^ PHHvIv̑:TX5y^ 0i$gҡRm_4?֒S<+p nd]U/:4.2H fu^Co"G $ͬɲyAB75w&=J}5Yd@9bw} WuYt^AG'I4a>>,1Ȼs[|HNQ>{77/oƷB#o3E=٫2L`% _0yڎgaUDqڹ&p52mDbfpW8Znu*IbOl ; Y#8Uz8M;*fupZ\JV_};hV|I߮T,}O :n( M_u\ >jK*KTBX1&2&mO)f?8R pL1`0PN5r{ߩ=*~[0d g\'|s.|>~⨛LY;#RRu2᧚8W Lx/db3~)kjiSY}m֋D %$Fȧd6iY\q#}L*ڱd 1D;܄Z㞅@Bp8 zx\%5X9j;^5d<#[6r׭ /9TwygÈv|% cY߄R_Ǎ>a Nv^5[G`&Nuf1(ٴ}w 7%C h<$i^U†m%O3pԾk+59RXq?}ƚjySEWXs^1U vKr*5kzfq&Gi0F#̂FK}oSH ~Rw񥱆U@b&҆ KCЛY?ZƘ=@B ۫ѤXilTe\!HD4.N ]eBxPU^kɍL \>Y2XI"L/_NL ɫYS/e" oE{vO3Cޯ)9Ӄ"w9Q'_#gyLyĦvY6sCkMjmtkR\ زadExY%;:б)3nbML^1UN԰2~屖<7H:$,fA+yhc.}Z0Q6<#Χ{"'^xs(rW_Y0Ğ.?U c )w_,p{:J%H ?Lur-?I_Ƞj/7Rx [Aw|L7;_f.wVSB){e>gmIGTN kvTS[^bY&k¨ LY3YpIb,54k;+ ^o;f/>mf2f}+Oڨ RhǵaT٘8m(z }%/jcCgwac*AbA" xAѧ8#3, t@t~)Â$5j`H : a6\ +X4 u'&24ݮ9jEC/ 5-!ڢQˍ4:fC`*OB˜iǩjd1). 
R3M>.$,7 ֹe)a]AD\R{;oI}<걥=ﵦ]ץ;SԦ ݚm!7w ĺ|>oz~}K;R(^g"oLdX/ K7R0JcI[iilԲBǪ ߯Nհ:;nBQuDKJR 9->V}2QoHFCP&uؖZ?"R=v wA<F'uJy ~XNvvҫ_E7IqNnM{;z\O p&]a-ߚo綡IQbu;":2`#bnٖmOA6tדVTݕH)%ҳW]Zw(}K n-.i/rO:硏s0ZBMY/N<7TM^B~//64cb&=ӧՒ_«81vM>'(>c8ĨզZmN ,d ͨ%.</GP z4kĦC),y0V* A;0'FypֱB8$ u͡LחQ~- Esk_)1ii ?J #vjL=/>_LuD$RLtzGؒ{Drwd,-Yˈ^bd/Z,QÛWK iRoAr{ij&j,$4bsC?&R6aKߵJ%xS|Kd-j4apn`IY?uNrG49yk LZ[j88 W>*aR׃1i=aھf +zm` ct@p^vs_()DIU:#(tP.kv1.1A={+'-Dp.4x^ >> C^b<[-` W6Xe^|IxZU{&5*eqqӐAFTq]YgUi"^ax0%rT\> eZcCsMX̊TQBaVdme /h _C@ySQ,t|5H3B|z"ͮ9l9쀚zlc?},j>^˜vEyD. yZ@oNC=(+0Y҈)~Ey~Î0w9 u ;g77[QH 2 [_C1|1Gu/=uf[֪5{L%(ʇj}ERY9Kf{hsG8$7+}zG;ujJmP++jY󨈝mPRB}GY9dr%A² f]:G}Ҍ,3)|5wF#kjcњ PA8@;-ԟd Cj7 _Er,CM]#ru$4P#8s> ]խ\00!|hzQz^b6Z}滏#΍9F?[BMU/i"R~tTFb2.S.7eyk92yRرYsf+VB\)0bkoJZlD pTp09%SZ'x,,(φ9LY![KWhY?~H91po 3/n,/}1-m)~ri `;ױ bSfLSž1Sc.M=x@ʿu c9}.ΙH Ҳx@>Sۤ /NR1śK(ąlB{σ/9ZO? f屖( Hh2eCFa#Q-ʒ'lMY<ɞVL( .БO|3IZ L^ ƣ*:` @* PPЩiކ?ZB?kO"?|_J'V8)d㋁9 (zmvghƯ7F=AO&~r+;I%.@eubJn̼A}ׄX?V,7b\e\NT 'q1(Jj] Xg}'9)W_ rjxD@HcRzR!nЙCFHwcyʎ@Y&cj~=vkHҌЖ˖ Kz^Ϝx|H$dmZ]%8dUߨ Dj_L1β`ҷ;u_>J26gWMn/.w8_uDTRɊ 0ƣHVw{ N*F۽l6 #EC9cu)L5M&b;0vԆ=L$i٧,B:\E s Wzi&96XjxFi bm2,t$0INn$Q@Cϕ8XmyN CNltCcjGV8+|_q\!z@gXKupxP`59NShHz$ IoDaҺjCR<3G#4Gz䝛ṵ>u] GZl=iKjƇY~BrS:Հ4LD(h@@K# 4$3Fk79JۃߩBXL~IJN2eIbSTD0S>B^Py @ /rNglEfq^x"w? 0UF?{s3oyQzLFP2v7-9;1w4#,-v 5*J33`t[IuoGRl.~OGEbA'c$۴B'mtǛ!]nVxϑT#v? ϤwH${;w⊳lo,$n ,߉bȑܒIc#Iٚc87E mdK[ RgVgBJԧ% n MsFFKPG“;1~;pRW=e6r3p} b]9Zc^U*mZG5J~k jGPzuN2EQ%JjVfaYcY[G""'j?Q&KrD Brg{'P]+j E}OțdWu OTs{pU/GxxGyDpuZ57*Ԉ~FJuy@\~X䝲y)e Ԁ f$"~TהdkFz?T LU3/W++/M Q%oCd@qL6]B{ ¼|*b쫗M|!4[cwz-R#Zp2ɘS~7tW9rRK΂"O˯,TƖs(X5T)M/WuHWuEL4ch_"i_ړUgRY$%QIGzeFpHL3$1 d;7O?1> >z٭ᡜBN e..j$W7>%E<|% Hsܰܦlv)5M{zJEނȿ9Ж'⁸ꐃE|'ls=kz^U7WČΛAԻAhaф}LWLQZ l K:aEr_CL~wѬ[tF. 
cC=K6o9Ϧ9fNp,~`Mi8Q@cwXX}wu2y ]w61%)fEn9Rf `g*I~w8I/tr!46PRT$.a14D^bŗΞOc<zI؀44&F<#]C%ІҩvcØI徦:JK<řOoqc=#Lѐ0 )l )We $nx.MǼ3oJRL|W 8"Eѣ{8Ucbh_U"fʋ>A:}Cե>PA$xRܟLj(z 5zQZ_9"apmgJgRu"G=f"ۺ|හflc,*2)MpeI67J.x׏)1c52wydp[Pꄧ%g_ҡo0.;h\YaŢ{FbqLhHKn8uQl"7|I~+u/K64^$=O"eJڵ^PЪJ_ߝ$b=w qf &*ڷ-bg3NM۵r`m @ëR|![И}PNw .9o~TF|'HӴk @Jf,;:[yb@xŸr[\R87\.K8$dߩw1Aps$ yR#lP͑d bR!n۔㹞Amƒ:/tr* gEGdd{`wwxhQYB-|htmq䯎1o^Q&Y F,ty!y]U_"0 "JM { ž|UfMq)K(^'*{H5HKyzTV!*<ea#04B'l8rNװP hl*p`7O(E ELU7hw\ra̅1zlQ@&Tiw-(8 ߩ^.~m>b=fw/8'!4nĤQ-FOiz>weg2<0[H'ь!!_xfm(;n82 nسãbk_{ KQޥ^!쭿;Tq s9A[ŋ*hM'D|I{Ey;EE'vM 1ncaT({{f[(}us1M)G]FVOZR-,'m/*j Xצ=K h&q$Ԛ{_tU- [h1XrLIQm>o0``>u.vQr~]˷" _g0L:ub%dO|L§-K @}w&@PZ v?f)3~= _ϑw4k?oL>Ë4!E/ 7)9eh kz@ n5sC}^h H9Tbaj'>Z6kp¯4@|Sʁ ZTa 㻬-ݢPfUA3z33<^46Y0ϗ%R~Ǒ{ZgեϾwFmF& cuAиl_Lݠ+3wuTd-K x93{c˹$CWc֖_Sk\t,9RU)OTsax ,}氶NNɽ:SK3h}g|U{\ӧ0Ԫ 5C :L |Z) dsv[4SYȻ}z3.9RBq]1; @^CYA!HsYbCB'PPKu;> stream x[[s۶~ L&g8M6y%V#(_vHL9np:cA XB*I.I,0%#x"2iJtщ46љNgibɓr)LY$sJI\%a8D lS+؉gD\I{)[GV hd*59jGƄgj k MgZ2Yjbp-`w0{߽\˾O6Wؔ,)b3lavHfݽ=.N%ߕWgbgO݋w<]axdz#lާ*LBSb׋< Ҝq07&aCm~ QѠ)Qغ bפ$eqD[$mpf [ڂ𮑽Ĺ-:}A}B⨇P5 V|Pa+h.zjz!i5̯wFRçze!:@xԬ1hlS6$H7s,ߗkP6@L;Klٯ7m *mSv`rCڔXE*Yw/WǷH^ƑU_vTvH2jA4Y(ٝud׋}<7k84wcYbښ6O? fNٷ): VӘmz:1]=W_0SxE 4iRho!mN@: UL5Z۵c͢">(@TGzX}n}u4kqXK&nG#;<ΙUYG^ PnB7jسxlb︆XvgE?z?Xx1>684 ȸEw1؛,?KDu+䋢WjI")vST+5[>pJ5:wAN_{kg֝ƄME|֍vf?(X`۱|pFi'e {:/j,2WnP`?[eѢMGqٷXjGWE eJt3*eϼ#_yE= ksw0#KDMLˀy>f笠DvwbT@0@Pr1"׊o-lb8aI\4rEkفINأ_Uv` - Z6 K?Ⱦg έ(WJgxRkm-7wJĦtb5:Q4t2C ߶5@$kp"~}p*Fe91vKV=]lZI^Se0ִ TyjZ %%Qjch K5*VGlH~K+y| 6[r-Ѐ [U4B猓AJQLB18@>DSq&U?ʥY[K-OrABHS4g=ce܍ ݠqaW)CUeLD)}9 zo2wmZޫ QpJ!b#ҥ 4w "iZ Q6L$ 6CB,%&R?E?"qA{Ո+Whe 5b 84PZ@=ʭ/.$.X$x1k[:>_ö*ԊV=PMИOw;O5Ee@?*jsT1A$!Yb%/VʕѴ7#ڋ̓/g`b2L<<,$7oZJ|C||qY=80:^&$O=v̧o{] <.||#=bsd ;eoـ!2v.%l¦l 6g_%~d [">/{ADݻb/~>=>:_HɧcEQ3+,Q>,>pt5Tb{HG(ְ%vO? 
[5{Gҍxxţ5FN# PLNjzN^M*gU4|y P/EU߸hEA#"}[.\]tr|5.(s+XZQP#Gkh\S3Eͷ_<g_pThY(:[{3W<<], 16'ZL 9 Jd {o1Y<+fvi K:P<$LpY-u*[|rsr~`5t~ǕuG yb--%KWJ,=*&`ɚJ;~٩hYU)Er=XoͲK"w:=~eNmGhM< v+c=}v^VVH;a1`9a^& K8PtR[#UBjS`bVeB,pX#P}> stream xڭZko_6 Chbhg@죿RLRO d8uB]BX!+YH -X2 cd(j d>LsSp)CdBЈָDВ՚ ^I0Ā ~Ű78b0 *ˊF(68)VHEoh} hw8½UΠc(a!<Ԗ`P$ *'S4@Fiin Y0(|EWvӺ"8x~W7_4SKO/ӷ}7|>f}N;HUxL8_>5#ig&\=}w|$k;翁ǫ^9rMr+ $'r?o ^.k"Pψ3l/+7V!~Ob/ryl/32,jGO6ehK|jl-dI@Zt,jWtPйA yK-tOGObXZng\ >=w *veq7P~Pk[EZZ)| gs ,aϥ]_]n[*N绿؝~԰,cnB ߲_o?KKubqPŀ< @yB@O8 1g@cr @:) u?VˀU`e@}Pƀ, @qGp hfKNYk:?:%łk04) \Sxp+[:2WbMT`63TM\Y_~9#2Vq[JOg+cā,ͬ)Y,BnMYf.tOԖÛrцYywm?x=- Oah't8OXph|: luylQ;afy9 o(Pg]Y%oQuC>5}Onkqɺ^ٷc9{JT9WOSO d <iv^p{+ȵ7|ޗthCwl1-X/wxaP'BULYPm!@Sb퐟ś%:E[ ys9ExV-v'xKddkw;і:zpel֥{8Bv < TƮEDqCCty 0p‛Ҡ/g t73\zP69w{름?I/6e+U&&2"3M1NVV-9h9ֹآѕtAC恢74i_q endstream endobj 1456 0 obj << /Type /ObjStm /N 100 /First 1046 /Length 4544 /Filter /FlateDecode >> stream xڅ\˒+QJx8|աÊR\Y!gI1jTVllaEtKAö+~԰YJį$y_Z@h_?ܢ??5GmnѶ/ [jWr@5m9ymhٲ*boCVz#[k'Qgm[1&$M$~M4%*VކխfœMaZt۶iȰmaT`⦠Mn-}e>ApF M7Slk!ֶr"j'%tBL?=!{Ajyv(}b ?1!b{7{b'{ӱB"sn,tCuۜKBEeBCkKN^m!om ~Z0_ kMbEBSdޚ[@~CoMsNEL cz|0Pc]- rRzk{[CZNZX?{kzo@k?{\RpU:@RY<0=c)nGB+}*H1_{pzz]Özav XzS3[6և|9~B+w]:)Fˇ!@9`!tw( μ"nmքz0΍4`N|3 <`|&%\0z2AP3项{[KX6kB=I|ba HFKσ&EKσd͒O~ksc=$Ay5ҽLY8!='=fy0kژDGɚN7ʜ0cc7+-S{A" i:C&`mnmքzSV՞f@E0.dȃ2'#n+*PH$ފ91Ua͠υuFkRe!4GH9BfalʴĖ=OohKxhcxF ,`{[b 74 _"^NV+b>(Ș!AFDB tsMai B6Qncgz[%jx9Y-M$i8C|%2u9$x Y 4Mn$c}K qb'F"ErX1s7g%2RmAJ 2v C2vMX&f;\(k ! @#$Q:8)˚ٍ|r%1 ge 1AΔDtWG&RLqVwDG,5HaAjsG ;rƯí\&R;Dyw2Eb eYөSW%68L`e "UѸv;Ur H !H * A 2PW_iTnC ^J=7`ոRIMځ-ٹ-F}aj!H)CPQDBײMHۛD'F'W'jyفOmz1x /D$ev-DRv 7!-9PMȬEM,ބfؚ'6b`b\3Ϯq4 ' D6f?LQ, ʓw%1FB. 
3B@!Z'%4{p_+w1a"kمd.Y>jITc Jw+N,Gj XLlIzR!6BHqj<(H-lGمIj4I8$"& g)"%7Wg~8."f F,Ȉ4%f +*+pjҚ`*bCSAq%!f HjYYx[3]z$HkJW]iAn$ JDW JU%c%b=⫾JO# ˌi=b_TR͊ Ok $a94lgˠv1X\9U)eel!c '4qS&VKj^=TȪWb^5{Frkx9YN"Jόū% 8\tƭV$W54 Jo#UZY>/Wjx9Y->'Iw_H:HkBrZgdCB^؉8#[]JWTeoQY/=Fx1RR!`yIFZ#V|H/dFB'6l2Cf9Ʀ];%>?%QHQHٚTƺLtkd:4Rw8"™zR<)ހp~F>Af{%9JMMNNECO=D4҈"DCU?"i@Hp,wnC $j6h)u|Avmj62[ n$&S1lY>ٻ:2Dm dj;jx9Y阝a$:RIu CZel:{D1@CYA٪Q}VBD'!fKzܹsi{5H$6>f m9V,=9J$UF#ycH^YEgo+DY 8lo&2%pYa_V݋wǻLJÖ/Uj91|rpϟ_Oڹח<>=߿0U,;|8vޭz<>>uw|6x{<ߝ?>wg ?/Eʝf2<#w~:^?iϛa_/y]_^Hvޢ/zptkozrz/?-gZ"kJt3+^w;BK8ݣ_Oqf}yp/m&q3N׿.u{{sr|F ||zz<ɯ/-[1St- ˕#׷ON_ve|71z~y``A t<_]\?VS&_ev2QwcdzN}F48o\\wwϧ.iͦnD:ag'I%> endobj 1557 0 obj << /Type /ObjStm /N 54 /First 532 /Length 2290 /Filter /FlateDecode >> stream xڭZK#71@@ HnnrZPmWwWGU_lvl="G(Q*pB  ((8)HH"HJIKC ghHrBymPB?$AhemxRh+KcZ(7HZ+ GNXi^XamL(OP0!M8" )ƤR4 QSTr ) =R8(.w s¹+ ! , \t/:{X^4/ o%pR ғ!I #,d.YA\F(LLQ (dۻafe=N X+gG1(ШGв8rc=m܌s|/w}WGmy7wtG 62m5i6j&nsg[E@a IZsk,44am@)U1E2 HFv"<!-F"w(A\d#./]Hr'#n",6iGjX Â᧗;iu߮k~Mk s6{3æ/8LgD=?.0qi1LfSwӦ_m>E> xgeoSϹf.Pũ5I#Aꫲ{}>  ka<>_qM4nWOc,Z!J $(nfsi881ڵ:6~ѩE,yf㙍g63x" 73|80`DD6Q`Fr`Ts>hBU12Ȕ"SȰ"\ IeL&TdR$ ZL `pt@,8v>aT Y)fuj(0.u()bI)&>Rzpڅ-#jfJSV@.#^TҪfFiNaTZ,Ng"aJ)d8цakC2j9dH Z΂yXJp_` I> p}_ !nsn R pKwI<꿂EwVʿZOЅ?\pk'A3d;,pϬKwIkҷn=B`b'$7[wW̆$pP\+D=B 뛏bA`n[K渥:n[+[5=Hw_S9-{׻织 {4t<3#L퇟>QW endstream endobj 1612 0 obj << /Type /XRef /Index [0 1613] /Size 1613 /W [1 3 1] /Root 1610 0 R /Info 1611 0 R /ID [ ] /Length 3375 /Filter /FlateDecode >> stream xGd_^^s9s>FW w6w3OPA9 8 YL,#9/Wu}ĠF&d(2%2#sd̓PbY"o2Y.+d`jE>3;pQ=JVZN(dlMF]vN%eB>/4ރwXQ9&儜SrZY9'\KrYky#| oM*ܐrKn+JݒẸsVQtF֑ӭl\PBSGSW5uTTTTTgWWWWCCCCJ'.PHHHHHg~Y N~.999999994]EsM3Όo>=444444\99999999]}I"z73V3XO0ٽl;l:Pv;v;v;h:h44LP&Y+" 4u_>+d,HRY&eU/gmMYV&#.;dݲG>/rD19.'䤜rF99/\rEu!7'"_Ex@@@@[ۚokۚokb1֐[Cn ]%i3~5[o EE uq}omS?ׯTKDKDKDKDKDk歙fޚykҭI&RR=h?8^/PܤgMQ(D5" BD!Q(D" BD!Q(fY̲Q( , žubQ P*n ApP8( ApP8( ApP8( ApP8( ApP8(wk{Q nf\7RQ(D" CP`(0 C|jidB2)S2-32G|2db'O >|2d'O >|2d'OrVr3Of/rD19.'䤜aqvkB\rI.*ܐrKn+',_+ 
L? 4?o?@004kD"@D "_;T"`X,C0_W_^6Fan,BV*Y-kdA6&,[dllSvn#{erD19.'䤜2o%3r֯ɟrA.%,W\rCn-RܓREG$'nuF?OBНKH_`8~s[އڳlJٵQ[ڰ~M7[؞~G3ﰼ~%=M['fs44%=u1K6z,Xr՟s,A.YOsYNj>bROGP:/Y/B_we=,g,A=TY+^XznkKeSl _f?[z_-˃fS?n|,GR3hR, ~4-O<kIh̷GHN$'BD^"*1^7xYUq;ŝ0"i4E"MHS)i4ETsq >|4hG >|4hq<7GLP&eJeF\'e,EXRY&eUZZY'elMYV&#.;dݲG>/rD19.'xK~P endstream endobj startxref 507341 %%EOF libxsmm-1.17/documentation/libxsmm_aux.md000066400000000000000000000341611415223013700206430ustar00rootroot00000000000000## Service Functions ### Target Architecture This functionality is available for the C and Fortran interface. There are [ID based](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_cpuid.h#L47) (same for C and Fortran) and string based functions to query the code path (as determined by the CPUID), or to set the code path regardless of the presented CPUID features. The latter may degrade performance if a lower set of instruction set extensions is requested, which can be still useful for studying the performance impact of different instruction set extensions. **Note**: There is no additional check performed if an unsupported instruction set extension is requested, and incompatible JIT-generated code may be executed (unknown instruction signaled). 
```C int libxsmm_get_target_archid(void); void libxsmm_set_target_archid(int id); const char* libxsmm_get_target_arch(void); void libxsmm_set_target_arch(const char* arch); ``` Available code paths (IDs and corresponding strings): * LIBXSMM_TARGET_ARCH_GENERIC: "**generic**", "none", "0" * LIBXSMM_X86_GENERIC: "**x86**", "x64", "sse2" * LIBXSMM_X86_SSE3: "**sse3**" * LIBXSMM_X86_SSE42: "**wsm**", "nhm", "sse4", "sse4_2", "sse4.2" * LIBXSMM_X86_AVX: "**snb**", "avx" * LIBXSMM_X86_AVX2: "**hsw**", "avx2" * LIBXSMM_X86_AVX512_MIC: "**knl**", "mic" * LIBXSMM_X86_AVX512_KNM: "**knm**" * LIBXSMM_X86_AVX512_CORE: "**skx**", "skl", "avx3", "avx512" * LIBXSMM_X86_AVX512_CLX: "**clx**" * LIBXSMM_X86_AVX512_CPX: "**cpx**" * LIBXSMM_X86_AVX512_SPR: "**spr**" The **bold** names are returned by `libxsmm_get_target_arch` whereas `libxsmm_set_target_arch` accepts all of the above strings (similar to the environment variable LIBXSMM_TARGET). ### Verbosity Level The [verbose mode](index.md#verbose-mode) (level of verbosity) can be controlled using the C or Fortran API, and there is an environment variable which corresponds to `libxsmm_set_verbosity` (LIBXSMM_VERBOSE). ```C int libxsmm_get_verbosity(void); void libxsmm_set_verbosity(int level); ``` ### Timer Facility Due to the performance oriented nature of LIBXSMM, timer-related functionality is available for the C and Fortran interface ([libxsmm_timer.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_timer.h#L37) and [libxsmm.f](https://github.com/hfp/libxsmm/blob/master/include/libxsmm.f#L32)). The timer is used in many of the [code samples](https://github.com/hfp/libxsmm/tree/master/samples) to measure the duration of executing a region of the code. The timer is based on a monotonic clock tick, which uses a platform-specific resolution. The counter may rely on the time stamp counter instruction (RDTSC), which is not necessarily counting CPU cycles (reasons are out of scope in this context). 
However, `libxsmm_timer_ncycles` delivers raw clock ticks (RDTSC). ```C typedef unsigned long long libxsmm_timer_tickint; libxsmm_timer_tickint libxsmm_timer_tick(void); double libxsmm_timer_duration( libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1); libxsmm_timer_tickint libxsmm_timer_ncycles( libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1); ``` ### User-Data Dispatch To register a user-defined key-value pair with LIBXSMM's fast key-value store, the key must be binary reproducible. Structured key-data (`struct` or `class` type which can be padded in a compiler-specific fashion) must be completely cleared, i.e., all gaps may be zero-filled before initializing data members (`memset(&mykey, 0, sizeof(mykey))`). This is because some compilers can leave padded data uninitialized, which breaks binary reproducible keys, hence the flow is: claring heterogeneous keys (struct), initialization (members), and registration. The size of the key is arbitrary but limited to LIBXSMM_DESCRIPTOR_MAXSIZE (96 Byte), and the size of the value can be of an arbitrary size. The given value is copied and may be initialized at registration-time or when dispatched. Registered data is released at program termination but can be manually unregistered and released (`libxsmm_xrelease`), e.g., to register a larger value for an existing key. ```C void* libxsmm_xregister(const void* key, size_t key_size, size_t value_size, const void* value_init); void* libxsmm_xdispatch(const void* key, size_t key_size); ``` The Fortran interface is designed to follow the same flow as the C language: (1) `libxsmm_xdispatch` is used to query the value, and (2) if the value is a NULL-pointer, it is registered per `libxsmm_xregister`. Similar to C (`memset`), structured key-data must be zero-filled (`libxsmm_xclear`) even when followed by an element-wise initialization. A key based on a contiguous array has no gaps by definition and it is enough to initialize the array elements. 
A [Fortran example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/dispatch/dispatch_udt.f) is given as part of the [Dispatch Microbenchmark](https://github.com/hfp/libxsmm/tree/master/samples/utilities/dispatch). ```Fortran FUNCTION libxsmm_xregister(key, keysize, valsize, valinit) TYPE(C_PTR), INTENT(IN), VALUE :: key TYPE(C_PTR), INTENT(IN), VALUE, OPTIONAL :: valinit INTEGER(C_INT), INTENT(IN) :: keysize, valsize TYPE(C_PTR) :: libxsmm_xregister END FUNCTION FUNCTION libxsmm_xdispatch(key, keysize) TYPE(C_PTR), INTENT(IN), VALUE :: key INTEGER(C_INT), INTENT(IN) :: keysize TYPE(C_PTR) :: libxsmm_xdispatch END FUNCTION ``` **Note**: This functionality can be used to, e.g., dispatch multiple kernels in one step if a code location relies on multiple kernels. This way, one can pay the cost of dispatch one time per task rather than according to the number of JIT-kernels used by this task. However, the functionality is not limited to multiple kernels but any data can be registered and queried. User-data dispatch uses the same implementation as regular code-dispatch. ### Memory Allocation The C interface ([libxsmm_malloc.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_malloc.h)) provides functions for aligned memory one of which allows to specify the alignment (or to request an automatically selected alignment). The automatic alignment is also available with a `malloc` compatible signature. The size of the automatic alignment depends on a heuristic, which uses the size of the requested buffer. **Note**: The function `libxsmm_free` must be used to deallocate buffers allocated by LIBXSMM's allocation functions. 
```C void* libxsmm_malloc(size_t size); void* libxsmm_aligned_malloc(size_t size, size_t alignment); void* libxsmm_aligned_scratch(size_t size, size_t alignment); void libxsmm_free(const volatile void* memory); int libxsmm_get_malloc_info(const void* m, libxsmm_malloc_info* i); int libxsmm_get_scratch_info(libxsmm_scratch_info* info); ``` The library exposes two memory allocation domains: (1) default memory allocation, and (2) scratch memory allocation. There are similar service functions for both domains that allow to customize the allocation and deallocation function. The "context form" even supports a user-defined "object", which may represent an allocator or any other external facility. To set the allocator of the default domain is analogous to setting the allocator of the scratch memory domain (shown below). ```C int libxsmm_set_scratch_allocator(void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn); int libxsmm_get_scratch_allocator(void** context, libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn); ``` The scratch memory allocation is very effective and delivers a decent speedup over subsequent regular memory allocations. In contrast to the default allocator, a watermark for repeatedly allocated and deallocated buffers is established. The scratch memory domain is (arbitrarily) limited to 4 GB of memory which can be adjusted to a different number of Bytes (available per [libxsmm_malloc.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_malloc.h), and also per environment variable LIBXSMM_SCRATCH_LIMIT with optional "k|K", "m|M", "g|G" units, unlimited per "-1"). ```C void libxsmm_set_scratch_limit(size_t nbytes); size_t libxsmm_get_scratch_limit(void); ``` By establishing a pool of "temporary" memory, the cost of repeated allocation and deallocation cycles is avoided when the watermark is reached. 
The scratch memory is scope-oriented with a limited number of pools for buffers of different life-time or held for different threads. The [verbose mode](index.md#verbose-mode) with a verbosity level of at least two (LIBXSMM_VERBOSE=2) shows some statistics about the populated scratch memory. ```bash Scratch: 173 MB (mallocs=5, pools=1) ``` To improve thread-scalability and to avoid frequent memory allocation/deallocation, the scratch memory allocator can be leveraged by [intercepting existing malloc/free calls](libxsmm_tune.md#intercepted-allocations). **Note**: be careful with scratch memory as it only grows during execution (in between `libxsmm_init` and `libxsmm_finalize` unless `libxsmm_release_scratch` is called). This is true even when `libxsmm_free` is (and should be) used! ### Meta Image File I/O Loading and storing data (I/O) is normally out of LIBXSMM's scope. However, comparing results (correctness) or writing files for visual inspection is clearly desired. This is particularly useful for the DNN domain. The MHD library domain provides support for the Meta Image File format (MHD). Tools such as [ITK-SNAP](http://itksnap.org/) or [ParaView](https://www.paraview.org/) can be used to inspect, compare, and modify images (even beyond two-dimensional images). Writing an image is per `libxsmm_mhd_write`, and loading an image is split in two stages: (1) `libxsmm_mhd_read_header`, and (2) `libxsmm_mhd_read`. The first step allows to allocate a properly sized buffer, which is then used to obtain the data per `libxsmm_mhd_read`. When reading data, an on-the-fly type conversion is supported. Further, data that is already in memory can be compared against file-data without allocating memory or reading this file into memory. To load an image from a familiar format (JPG, PNG, etc.), one may save the raw data using for instance [IrfanView](http://www.irfanview.com/) and rely on a "header-only" MHD-file (plain text). 
This may look like: ```ini NDims = 2 DimSize = 202 134 ElementType = MET_UCHAR ElementNumberOfChannels = 1 ElementDataFile = mhd_image.raw ``` In the above case, a single channel (gray-scale) 202x134-image is described with pixel data stored separately (`mhd_image.raw`). Multi-channel images are expected to interleave the pixel data. The pixel type is per `libxsmm_mhd_elemtype` ([libxsmm_mhd.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_mhd.h#L38)). ### Thread Synchronization LIBXSMM comes with a number of light-weight abstraction layers (macro and API-based), which are distinct from the internal API (include files in [src](https://github.com/hfp/libxsmm/tree/master/src) directory) and that are exposed for general use (and hence part of the [include](https://github.com/hfp/libxsmm/tree/master/include) directory). The synchronization layer is mainly based on macros: LIBXSMM_LOCK_\* provide spin-locks, mutexes, and reader-writer locks (LIBXSMM_LOCK_SPINLOCK, LIBXSMM_LOCK_MUTEX, and LIBXSMM_LOCK_RWLOCK respectively). Usually the spin-lock is also named LIBXSMM_LOCK_DEFAULT. The implementation is intentionally based on OS-native primitives unless LIBXSMM is reconfigured (per LIBXSMM_LOCK_SYSTEM) or built using `make OMP=1` (using OpenMP inside of the library is not recommended). 
The life-cycle of a lock looks like: ```C /* attribute variable and lock variable */ LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_LOCK_DEFAULT) attr; LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_DEFAULT) lock; /* attribute initialization */ LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_LOCK_DEFAULT, &attr); /* lock initialization per initialized attribute */ LIBXSMM_LOCK_INIT(LIBXSMM_LOCK_DEFAULT, &lock, &attr); /* the attribute can be destroyed */ LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_LOCK_DEFAULT, &attr); /* lock destruction (usage: see below/next code block) */ LIBXSMM_LOCK_DESTROY(LIBXSMM_LOCK_DEFAULT, &lock); ``` Once the lock is initialized (or an array of locks), it can be exclusively locked or try-locked, and released at the end of the locked section (LIBXSMM_LOCK_ACQUIRE, LIBXSMM_LOCK_TRYLOCK, and LIBXSMM_LOCK_RELEASE respectively): ```C LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK_DEFAULT, &lock); /* locked code section */ LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK_DEFAULT, &lock); ``` If the lock-kind is LIBXSMM_LOCK_RWLOCK, non-exclusive a.k.a. shared locking allows to permit multiple readers (LIBXSMM_LOCK_ACQREAD, LIBXSMM_LOCK_TRYREAD, and LIBXSMM_LOCK_RELREAD) if the lock is not acquired exclusively (see above). An attempt to only read-lock anything else but an RW-lock is an exclusive lock (see above). 
```C if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) == LIBXSMM_LOCK_TRYREAD(LIBXSMM_LOCK_RWLOCK, &rwlock)) { /* locked code section */ LIBXSMM_LOCK_RELREAD(LIBXSMM_LOCK_RWLOCK, &rwlock); } ``` Locking different sections for read (LIBXSMM_LOCK_ACQREAD, LIBXSMM_LOCK_RELREAD) and write (LIBXSMM_LOCK_ACQUIRE, LIBXSMM_LOCK_RELEASE) may look like: ```C LIBXSMM_LOCK_ACQREAD(LIBXSMM_LOCK_RWLOCK, &rwlock); /* locked code section: only reads are performed */ LIBXSMM_LOCK_RELREAD(LIBXSMM_LOCK_RWLOCK, &rwlock); LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK_RWLOCK, &rwlock); /* locked code section: exclusive write (no R/W) */ LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK_RWLOCK, &rwlock); ``` For a lock not backed by an OS level primitive (fully featured lock), the synchronization layer also provides a simple lock based on atomic operations: ```C static union { char pad[LIBXSMM_CACHELINE]; volatile LIBXSMM_ATOMIC_LOCKTYPE state; } lock; LIBXSMM_ATOMIC_ACQUIRE(&lock.state, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED); /* locked code section */ LIBXSMM_ATOMIC_RELEASE(&lock.state, LIBXSMM_ATOMIC_RELAXED); ``` In addition to the LIBXSMM_LOCK_\* macros or LIBXSMM_ATOMIC_LOCKTYPE, API-based lock primitives are also available (libxsmm_mutex_\*, and libxsmm_rwlock_\*). However, the underlying implementation of the latter is experimental. libxsmm-1.17/documentation/libxsmm_be.md000066400000000000000000000152331415223013700204330ustar00rootroot00000000000000## Backend ### Code Generator (JIT) There can be situations in which it is up-front not clear which problem-sizes will be needed when running an application. To leverage LIBXSMM's high-performance kernels, the library implements a JIT (Just-In-Time) code generation backend which generates the requested kernels on the fly (in-memory). This is accomplished by emitting the corresponding byte-code directly into an executable buffer. 
The actual JIT code is generated per the CPUID flags, and therefore does not rely on the code path selected when building the library. In the current implementation, some limitations apply to the JIT backend specifically: 1. To stay agnostic to any threading model used, Pthread mutexes are guarding the updates of the JIT'ted code cache (link line with `-lpthread` is required); building with OMP=1 employs an OpenMP critical section as an alternative locking mechanism. 2. There is limited support for the Windows calling convention (only kernels without prefetch signature). The JIT backend can also be disabled at build time (`make JIT=0`) as well as at runtime (`LIBXSMM_TARGET=0`, or anything prior to Intel AVX). The latter is an environment variable which allows to set a code path independent of the CPUID (LIBXSMM_TARGET=0|1|sse|snb|hsw|knl|knm|skx|clx|cpx|spr). Please note that LIBXSMM_TARGET cannot enable the JIT backend if it was disabled at build time (JIT=0). One can use the afore mentioned THRESHOLD parameter to control the matrix sizes for which the JIT compilation will be automatically performed. However, explicitly requested kernels (by calling `libxsmm_?mmdispatch`) fall not under a threshold for the problem-size. In any case, JIT code generation can be used for accompanying statically generated code. ### Generator Driver In rare situations, it might be useful to directly incorporate generated C code (with inline assembly regions). This is accomplished by invoking a driver program (with certain command line arguments). **Note**: The stand-alone generator-driver is considered legacy (deprecated). Associated functionality may be removed and future instruction set extensions may not be addressed with printed assembly code. The cost of dispatching JIT-code for every code region of an application, and for every visit of such region, can be amortized in several ways and without dispensing JIT-generated code. 
Dispatching [multiple kernels at once](libxsmm_aux.md#user-data-dispatch) or (most effectively) tabulating JIT'ted function pointers manually, can elleviate or remove first-time code generation and (more important) the cost of subsequently dispatching kernels (when code was already JIT-generated). The generator driver program is usually built as part of LIBXSMM's build process, but also available as a separate build target: ```bash make generator bin/libxsmm_gemm_generator ``` The code generator driver program accepts the following arguments: 1. Select: dense, dense_asm, sparse, sparse_csr, or sparse_csr_reg 2. Filename of a file to append to 3. Routine name to be created 4. M parameter 5. N parameter 6. K parameter 7. LDA (0 indicates A is sparse if 1st arg. is "sparse*") 8. LDB (0 indicates B is sparse if 1st arg. is "sparse*") 9. LDC parameter 10. Alpha (1) 11. Beta: (0 or 1) 12. Alignment override for A (1 auto, 0 unalignment) 13. Alignment override for C (1 auto, 0 unalignment) 14. Architecture (noarch, wsm, snb, hsw, knl, knm, skx, clx, cpx) 15. Prefetch strategy, see below (only nopf or pfsigonly for "sparse*") 16. SP (single-precision), DP (double-recision), or I16 (only "dense*") 17. CSC file in Matrix market format (only if 1st arg. is "sparse*"). The prefetch strategy can be: 1. "nopf": data is not prefetched, just three arguments: A, B, and C 2. "pfsigonly": no prefetches, kernel signature: A, B, C, A', B', and C' 3. "BL2viaC": uses accesses to C to prefetch B' 4. "AL2": uses accesses to A to prefetch A 5. "curAL2": prefetches current A ahead in the kernel 6. "AL2_BL2viaC": combines AL2 and BL2viaC 7. 
"curAL2_BL2viaC": combines curAL2 and BL2viaC Here are some examples of invoking the driver program: ```bash bin/libxsmm_gemm_generator dense foo.c foo 16 16 16 32 32 32 1 1 1 1 hsw nopf DP bin/libxsmm_gemm_generator dense_asm foo.c foo 16 16 16 32 32 32 1 1 1 1 knl AL2_BL2viaC DP bin/libxsmm_gemm_generator sparse foo.c foo 16 16 16 32 0 32 1 1 1 1 hsw nopf DP bar.csc ``` Please note, there are additional examples given in samples/generator and samples/seissol. ### Development Concepts The low-level code generator is hosted by a single translation unit ([src/generator_x86_instructions.c](https://github.com/hfp/libxsmm/blob/master/src/generator_x86_instructions.h)). The code generator emits instructions as enumerated in [src/generator_common.h](https://github.com/hfp/libxsmm/blob/master/src/generator_common.h). A kernel then is a buffered stream of instructions in either binary/encoded or textual form. The latter is leveraged by stand-alone generator drivers that can print C functions with an assembly section (inline). A [generator driver](#generator-driver) may exists for some of LIBXSMM's function domains. Please note that emitting the textual form is not needed to inspect the emitted code since the binary encoded form can be easily disassembled ([objdump](index.md#objdump)). The binary encoded form is directly suitable for execution by casting the code-buffer into a function-pointer of the corresponding signature. It is advised to rely on LIBXSMM's internal memory allocation routines to acquire an executable buffer (see libxsmm_malloc_flags, libxsmm_xmalloc, and libxsmm_malloc_attrib in [src/libxsmm_main.h](https://github.com/hfp/libxsmm/blob/master/src/libxsmm_main.h)). This ensures correct behavior in security-hardened environments. As a bonus, [profiler support](libxsmm_prof.md) for the emitted code is enabled transparently. To debug the JIT'ted code, GNU GDB can be used to disassemble a given memory address (`disas address,+length`). 
Having the code disassembled side-by-side (while debugging) helps to look ahead and to have some orientation. For the latter, [objdump](index.md#objdump) can be used to acquire the source code (assembly) along with hexadecimal line numbers (length). The offset position (for GDB's disas) directly corresponds to objectdump's line numbers. The kernel development is much like assembly programming, except that an API is used to emit instructions. For further reference, some existing source code for building kernels can be inspected (e.g., matcopy). This may help to capture the concept of mapping registers (basically a table to avoid hard-coding register names). libxsmm-1.17/documentation/libxsmm_compat.md000066400000000000000000000000001415223013700213120ustar00rootroot00000000000000libxsmm-1.17/documentation/libxsmm_dl.md000066400000000000000000000157231415223013700204500ustar00rootroot00000000000000## Deep Neural Networks To achieve best performance with small convolutions for CNN on SIMD architectures, a specific data layout must be used. As this layout depends on several architectural parameters, the goal of LIBXSMM's interface is to hide this complexity from the user by providing copy-in and copy-out routines. This happens using opaque data types, which themselves are later bound to a convolution operation. The interface is available for C. There is a collection of code samples ([samples/deeplearning](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning)) available including a light-weight [framework for deep learning (GXM)](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/gxm), and samples with focus on [Convolutional Deep Neural Networks (DNNs)](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/cnnlayer), or [LSTM cells](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/lstmdriver), etc. 
The general concept of the CNN interface is circled around a few types: `libxsmm_dnn_layer`, `libxsmm_dnn_buffer`, `libxsmm_dnn_bias`, and `libxsmm_dnn_filter`. A handle of such a type is always setup by calling a create-function. ```C /** Simplified LIBXSMM types which are needed to create a handle. */ /** Structure which describes the input and output of data (DNN). */ typedef struct libxsmm_dnn_conv_desc { int N; /* number of images in mini-batch */ int C; /* number of input feature maps */ int H; /* height of input image */ int W; /* width of input image */ int K; /* number of output feature maps */ int R; /* height of filter kernel */ int S; /* width of filter kernel */ int u; /* vertical stride */ int v; /* horizontal stride */ int pad_h; /* height of logical rim padding to input for adjusting output height */ int pad_w; /* width of logical rim padding to input for adjusting output width */ int pad_h_in; /* height of zero-padding in input buffer, must equal to pad_h for direct conv */ int pad_w_in; /* width of zero-padding in input buffer, must equal to pad_w for direct conv */ int pad_h_out; /* height of zero-padding in output buffer */ int pad_w_out; /* width of zero-padding in output buffer */ int threads; /* number of threads to use when running convolution */ libxsmm_dnn_datatype datatype; /* datatypes use for all input and outputs */ libxsmm_dnn_tensor_format buffer_format; /* format which is for buffer buffers */ libxsmm_dnn_tensor_format filter_format; /* format which is for filter buffers */ libxsmm_dnn_conv_algo algo; /* convolution algorithm used */ libxsmm_dnn_conv_option options; /* additional options */ libxsmm_dnn_conv_fuse_op fuse_ops; /* used ops into convolutions */ } libxsmm_dnn_conv_desc; /** Type of algorithm used for convolutions. */ typedef enum libxsmm_dnn_conv_algo { /** let the library decide */ LIBXSMM_DNN_CONV_ALGO_AUTO, /* ignored for now */ /** direct convolution. 
*/ LIBXSMM_DNN_CONV_ALGO_DIRECT } libxsmm_dnn_conv_algo; /** Denotes the element/pixel type of an image/channel. */ typedef enum libxsmm_dnn_datatype { LIBXSMM_DNN_DATATYPE_F32, LIBXSMM_DNN_DATATYPE_I32, LIBXSMM_DNN_DATATYPE_I16, LIBXSMM_DNN_DATATYPE_I8 } libxsmm_dnn_datatype; libxsmm_dnn_layer* libxsmm_dnn_create_conv_layer( libxsmm_dnn_conv_desc conv_desc, libxsmm_dnn_err_t* status); libxsmm_dnn_err_t libxsmm_dnn_destroy_conv_layer( const libxsmm_dnn_layer* handle); ``` A sample call looks like (without error checks): ```C /* declare LIBXSMM variables */ libxsmm_dnn_conv_desc conv_desc; libxsmm_dnn_err_t status; libxsmm_dnn_layer* handle; /* setting conv_desc values.... */ conv_desc.N = ... /* create handle */ handle = libxsmm_dnn_create_conv_layer(conv_desc, &status); ``` Next activation and filter buffers need to be linked, initialized and bound to the handle. Afterwards the convolution can be executed in a threading environment of choice (error checks are omitted for brevity): ```C float *input, *output, *filter; libxsmm_dnn_buffer* libxsmm_reg_input; libxsmm_dnn_buffer* libxsmm_reg_output; libxsmm_dnn_filter* libxsmm_reg_filter; /* allocate data */ input = (float*)libxsmm_aligned_malloc(...); output = ...; /* link data to buffers */ libxsmm_reg_input = libxsmm_dnn_link_buffer( libxsmm_handle, LIBXSMM_DNN_INPUT, input, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status); libxsmm_reg_output = libxsmm_dnn_link_buffer( libxsmm_handle, LIBXSMM_DNN_OUTPUT, output, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status); libxsmm_reg_filter = libxsmm_dnn_link_filter( libxsmm_handle, LIBXSMM_DNN_FILTER, filter, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status); /* copy in data to LIBXSMM format: naive format is: */ /* (mini-batch)(number-featuremaps)(featuremap-height)(featuremap-width) for layers, */ /* and the naive format for filters is: */ /* (number-output-featuremaps)(number-input-featuremaps)(kernel-height)(kernel-width) */ libxsmm_dnn_copyin_buffer(libxsmm_reg_input, 
(void*)naive_input, LIBXSMM_DNN_TENSOR_FORMAT_NCHW); libxsmm_dnn_zero_buffer(libxsmm_reg_output); libxsmm_dnn_copyin_filter(libxsmm_reg_filter, (void*)naive_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS); /* bind layer to handle */ libxsmm_dnn_bind_input_buffer(libxsmm_handle, libxsmm_reg_input, LIBXSMM_DNN_REGULAR_INPUT); libxsmm_dnn_bind_output_buffer(libxsmm_handle, libxsmm_reg_output, LIBXSMM_DNN_REGULAR_OUTPUT); libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_reg_filter, LIBXSMM_DNN_REGULAR_FILTER); /* allocate and bind scratch */ scratch = libxsmm_aligned_scratch(libxsmm_dnn_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status), 2097152); libxsmm_dnn_bind_scratch(libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch); /* run the convolution */ #pragma omp parallel { libxsmm_dnn_convolve_st(libxsmm_handle, LIBXSMM_DNN_CONV_KIND_FWD, 0, omp_get_thread_num(), omp_get_num_threads()); } /* copy out data */ libxsmm_dnn_copyout_buffer(libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW); /* clean up */ libxsmm_dnn_release_scratch(...); libxsmm_dnn_release_buffer(...); ... libxsmm_dnn_destroy_buffer(...); ... libxsmm_dnn_destroy_conv_layer(...); ``` libxsmm-1.17/documentation/libxsmm_fortran.md000066400000000000000000000007731415223013700215230ustar00rootroot00000000000000Title: LIBXSMM project: LIBXSMM author: Intel Corporation summary: Library targeting Intel Architecture for specialized matrix operations. project_github: https://github.com/hfp/libxsmm project_download: https://github.com/hfp/libxsmm/releases/latest favicon: ../.theme/img/favicon.png css: ../.theme/ford.css output_dir: ../html src_dir: ../include search: true page_dir: . 
Library targeting Intel Architecture for specialized matrix operations: [libxsmm.readthedocs.io/](https://libxsmm.readthedocs.io/) libxsmm-1.17/documentation/libxsmm_mm.md000066400000000000000000000411001415223013700204460ustar00rootroot00000000000000## Matrix Multiplication ### Overview To perform the dense matrix-matrix multiplication Cm x n = alpha · Am x k · Bk x n + beta · Cm x n, the full-blown GEMM interface can be treated with "default arguments" (which is deviating from the BLAS standard, however without compromising the binary compatibility). Default arguments are derived from compile-time constants (configurable) for historic reasons (LIBXSMM's "pre-JIT era"). ```C libxsmm_?gemm(NULL/*transa*/, NULL/*transb*/, &m/*required*/, &n/*required*/, &k/*required*/, NULL/*alpha*/, a/*required*/, NULL/*lda*/, b/*required*/, NULL/*ldb*/, NULL/*beta*/, c/*required*/, NULL/*ldc*/); ``` For the C interface (with type prefix `s` or `d`), all arguments including m, n, and k are passed by pointer. This is needed for binary compatibility with the original GEMM/BLAS interface. ```C libxsmm_gemm(NULL/*transa*/, NULL/*transb*/, m/*required*/, n/*required*/, k/*required*/, NULL/*alpha*/, a/*required*/, NULL/*lda*/, b/*required*/, NULL/*ldb*/, NULL/*beta*/, c/*required*/, NULL/*ldc*/); ``` The C++ interface is also supplying overloaded versions where m, n, and k can be passed by‑value (making it clearer that m, n, and k are non-optional arguments). ```FORTRAN ! Dense matrix multiplication (single/double-precision). CALL libxsmm_?gemm(m=m, n=n, k=k, a=a, b=b, c=c) ! Dense matrix multiplication (generic interface). CALL libxsmm_gemm(m=m, n=n, k=k, a=a, b=b, c=c) ``` The FORTRAN interface supports optional arguments (without affecting the binary compatibility with the original BLAS interface) by allowing to omit arguments where the C/C++ interface allows for NULL to be passed. ```C /** Dense matrix multiplication (single/double-precision). 
*/ libxsmm_blas_?gemm(NULL/*transa*/, NULL/*transb*/, &m/*required*/, &n/*required*/, &k/*required*/, NULL/*alpha*/, a/*required*/, NULL/*lda*/, b/*required*/, NULL/*ldb*/, NULL/*beta*/, c/*required*/, NULL/*ldc*/); ``` For convenience, a BLAS-based dense matrix multiplication (`libxsmm_blas_gemm`) is provided for all supported languages. This only re-exposes the underlying GEMM/BLAS implementation, but the interface accepts optional arguments (or NULL pointers in C) where the regular GEMM expects a value. To remove any BLAS-dependency, please follow the [Link Instructions](index.md#link-instructions). A BLAS-based GEMM can be useful for validation/benchmark purposes, and more important as a fallback when building an application-specific dispatch mechanism. ```C /** OpenMP parallelized dense matrix multiplication. */ libxsmm_?gemm_omp(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); ``` A more recently added variant of matrix multiplication is parallelized based on the OpenMP standard. These routines will open an internal parallel region and rely on "classic" thread based OpenMP. If these routines are called from inside of a parallel region, the parallelism will be based on tasks (OpenMP 3.0). Please note that all OpenMP-based routines are hosted by the extension library (libxsmmext), which keeps the main library agnostic with respect to a threading runtime. ### Manual Code Dispatch Successively calling a kernel (i.e., multiple times) allows for amortizing the cost of the code dispatch. Moreover, to customize the dispatch mechanism, one can rely on the following interface. ```C /** Call dispatched (*function_ptr)(a, b, c [, pa, pb, pc]). 
*/ libxsmm_[s|d]mmfunction libxsmm_[type-prefix]mmdispatch( libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, /** NULL: tight fit (m) */ const libxsmm_blasint* lda, /** NULL: tight fit (k) */ const libxsmm_blasint* ldb, /** NULL: tight fit (m) */ const libxsmm_blasint* ldc, /** NULL: LIBXSMM_ALPHA */ const type* alpha, /** NULL: LIBXSMM_BETA */ const type* beta, /** NULL: LIBXSMM_FLAGS */ const int* flags, /** NULL: LIBXSMM_PREFETCH_NONE (not LIBXSMM_PREFETCH!) */ const int* prefetch); ``` Overloaded function signatures are provided and allow to omit arguments (C++ and FORTRAN), which are then derived from the [configurable defaults](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_config.h). In C++, `libxsmm_mmfunction` can be used to instantiate a functor rather than making a distinction between numeric types per type-prefix. For lower precision GEMMs, `libxsmm_mmfunction` optionally takes a second type (output type). ```C /* generates or dispatches the code specialization */ libxsmm_mmfunction xmm(m, n, k); if (xmm) { /* JIT'ted code */ /* can be parallelized per, e.g., OpenMP */ for (int i = 0; i < n; ++i) { xmm(a+i*asize, b+i*bsize, c+i*csize); } } ``` Similarly in FORTRAN (see [samples/smm/smm.f](https://github.com/hfp/libxsmm/blob/master/samples/smm/smm.f)), a generic interface (`libxsmm_mmdispatch`) can be used to dispatch a `LIBXSMM_?MMFUNCTION`. The handle encapsulated by such a `LIBXSMM_?MMFUNCTION` can be called per `libxsmm_call`. Beside of dispatching code, one can also call statically generated kernels (e.g., `libxsmm_dmm_4_4_4`) by using the prototype functions included with the FORTRAN and C/C++ interface. Prototypes are present whenever static code was requested at compile-time of the library (e.g. per `make MNK="1 2 3 4 5"`). ```FORTRAN TYPE(LIBXSMM_DMMFUNCTION) :: xmm CALL libxsmm_dispatch(xmm, m, n, k) IF (libxsmm_available(xmm)) THEN DO i = LBOUND(c, 3), UBOUND(c, 3) ! 
consider OpenMP CALL libxsmm_dmmcall(xmm, a(:,:,i), b(:,:,i), c(:,:,i)) END DO END IF ``` ### Batched Multiplication In case of batched SMMs, it can be beneficial to supply "next locations" such that the upcoming operands are prefetched ahead of time. Such a location would be the address of the next matrix to be multiplied (and not any of the floating-point elements within the "current" matrix-operand). The "prefetch strategy" is requested at dispatch-time of a kernel. A [strategy](libxsmm_be.md#prefetch-strategy) other than `LIBXSMM_PREFETCH_NONE` turns the signature of a JIT'ted kernel into a function with six arguments (`a,b,c, pa,pb,pc` instead of `a,b,c`). To defer the decision about the strategy to a CPUID-based mechanism, one can choose `LIBXSMM_PREFETCH_AUTO`. ```C int prefetch = LIBXSMM_PREFETCH_AUTO; int flags = 0; /* LIBXSMM_FLAGS */ libxsmm_dmmfunction xmm = NULL; double alpha = 1, beta = 0; xmm = libxsmm_dmmdispatch(23/*m*/, 23/*n*/, 23/*k*/, NULL/*lda*/, NULL/*ldb*/, NULL/*ldc*/, &alpha, &beta, &flags, &prefetch); ``` Above, pointer-arguments of `libxsmm_dmmdispatch` can be NULL (or OPTIONAL in FORTRAN): for LDx this means a "tight" leading dimension, alpha, beta, and flags are given by a [default value](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_config.h) (which is selected at compile-time), and for the prefetch strategy a NULL-argument refers to "no prefetch" (which is equivalent to an explicit `LIBXSMM_PREFETCH_NONE`). By design, the prefetch strategy can be changed at runtime (as soon as valid next-locations are used) without changing the call-site (kernel-signature with six arguments). 
```C if (0 < n) { /* check that n is at least 1 */ # pragma parallel omp private(i) for (i = 0; i < (n - 1); ++i) { const double *const ai = a + i * asize; const double *const bi = b + i * bsize; double *const ci = c + i * csize; xmm(ai, bi, ci, ai + asize, bi + bsize, ci + csize); } xmm(a + (n - 1) * asize, b + (n - 1) * bsize, c + (n - 1) * csize, /* pseudo prefetch for last element of batch (avoids page fault) */ a + (n - 1) * asize, b + (n - 1) * bsize, c + (n - 1) * csize); } ``` To process a batch of matrix multiplications and to prefetch the operands of the next multiplication ahead of time, the code presented in the [Overview](#overview) section may be modified as shown above. The last multiplication is peeled from the main batch to avoid prefetching out-of-bounds (OOB). Prefetching from an invalid address does not trap an exception, but an (unnecessary) page fault can be avoided. ```C /** Batched matrix multiplications (explicit data representation). */ int libxsmm_mmbatch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize, int tid, int ntasks); ``` To further simplify the multiplication of matrices in a batch, LIBXSMM's batch interface can help to extract the necessary input from a variety of existing structures (integer indexes, array of pointers both with Byte sized strides). An expert interface (see above) can employ a user-defined threading runtime (`tid` and `ntasks`). In case of OpenMP, `libxsmm_mmbatch_omp` is ready-to-use and hosted by the extension library (libxsmmext). 
Of course, `libxsmm_mmbatch_omp` does not take `tid` and `ntasks` since both arguments are given by OpenMP. Similarly, a sequential version (shown below) is available per `libxsmm_gemm_batch` (libxsmm). Please note that an explicit data representation should exist and reused rather than created only to call the explicit batch-interface. Creating such a data structure only for this matter can introduce an overhead which is hard to amortize (speedup). If no explicit data structure exists, a "chain" of multiplications can be often algorithmically described (see [self-hosted batch loop](#implicit-batches)). ```C void libxsmm_gemm_batch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize); ``` In recent BLAS library implementations, `dgemm_batch` and `sgemm_batch` have been introduced. This BLAS(-like) interface allows for groups of homogeneous batches, which is like an additional loop around the interface as introduced above. On the other hand, the BLAS(-like) interface only supports arrays of pointers for the matrices. In contrast, above interface supports arrays of pointers as well as arrays of indexes plus a flexible way to extract data from arrays of structures (AoS). LIBXSMM also supports this (new) BLAS(-like) interface with `libxsmm_?gemm_batch` and `libxsmm_?gemm_batch_omp` (the latter of which relies on LIBXSMM/ext). Further, existing calls to `dgemm_batch` and `sgemm_batch` can be intercepted and replaced with [LIBXSMM's call wrapper](#call-wrapper). 
The signatures of `libxsmm_dgemm_batch` and `libxsmm_sgemm_batch` are equal except for the element type (`double` and `float` respectively). ```C void libxsmm_dgemm_batch(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]); ``` **Note**: the multi-threaded implementation (`ntasks > 1` or "omp" form of the functions) avoids data races if indexes or pointers for the destination (C-)matrix are duplicated. This synchronization occurs automatically (`beta != 0`), but can be avoided by passing a negative `batchsize`, `group_size` and/or a negative `group_count`. ### User-Data Dispatch It can be desired to dispatch user-defined data, i.e., to query a value based on a key. This functionality can be used to, e.g., dispatch multiple kernels in one step if a code location relies on multiple kernels. This way, one can pay the cost of dispatch one time per task rather than according to the number of JIT-kernels used by this task. This functionality is detailed in the section about [Service Functions](libxsmm_aux.md#user-data-dispatch). ### Call Wrapper #### Overview Since the library is binary compatible with existing GEMM calls (BLAS), such calls can be replaced at link-time or intercepted at runtime of an application such that LIBXSMM is used instead of the original BLAS library. There are two cases to consider: (1) static linkage, and (2) dynamic linkage of the application against the original BLAS library. When calls are intercepted, one can select a sequential (default) or an OpenMP-parallelized implementation (`make WRAP=2`). 
```bash LIBXSMM STATISTIC: 1000 multiplications dgemm(trans=NN mnk=32,32,21 ldx=32,21,32 a,b=1,0): 8% [main$omp$1] dgemm(trans=NN mnk=32,21,32 ldx=32,32,32 a,b=1,0): 8% [main$omp$1] dgemm(trans=NN mnk=10,21,32 ldx=10,32,10 a,b=1,0): 5% [main$omp$1] dgemm(trans=NN mnk=32,10,32 ldx=32,32,32 a,b=1,0): 5% [main$omp$1] dgemm(trans=NN mnk=32,32,10 ldx=32,10,32 a,b=1,0): 5% [main$omp$1] ``` Intercepted GEMMs can also build a sophisticated statistic (histogram) with LIBXSMM_VERBOSE=4 (or higher). The histogram displays the call sites (debug symbol name) of all intercepted GEMMs ([example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/wrap/autobatch.c) above depicts an OpenMP region hosted by the main function). With level 5 (or higher), the histogram yields the entire content, and eventually less relevant entries are not pruned. An application must be built with symbols (`-g`) and export symbols similar to shared libraries (`-Wl,--export-dynamic` even when linked statically) in order to display the symbol names of where the GEMMs originated (call site). **Note**: Intercepting GEMM calls is low effort but implies overhead, which can be relatively high for small-sized problems. LIBXSMM's native programming interface has lower overhead and allows to amortize this overhead when using the same multiplication kernel in a consecutive fashion along with sophisticated data prefetch. #### Static Linkage An application which is linked statically against BLAS requires to wrap the `sgemm_` and the `dgemm_` symbol (an alternative is to wrap only `dgemm_`). To relink the application (without editing the build system) can often be accomplished by copying and pasting the linker command as it appeared in the console output of the build system, and then re-invoking a modified link step (please also consider `-Wl,--export-dynamic`). ```bash gcc [...] 
-Wl,--wrap=dgemm_,--wrap=sgemm_ \ /path/to/libxsmmext.a /path/to/libxsmm.a \ /path/to/your_regular_blas.a ``` In addition, existing [BLAS(-like) batch-calls](#blas-batch-interface) can be intercepted as well: ```bash gcc [...] -Wl,--wrap=dgemm_batch_,--wrap=sgemm_batch_ \ -Wl,--wrap=dgemm_batch,--wrap=sgemm_batch \ -Wl,--wrap=dgemm_,--wrap=sgemm_ \ /path/to/libxsmmext.a /path/to/libxsmm.a \ /path/to/your_regular_blas.a ``` Above, GEMM and GEMM_BATCH are intercepted both, however this can be chosen independently. For GEMM_BATCH the Fortran and C-form of the symbol may be intercepted both (regular GEMM can always be intercepted per `?gemm_` even when `?gemm` is used in C-code). **Note**: The static link-time wrapper technique may only work with a GCC tool chain (GNU Binutils: `ld`, or `ld` via compiler-driver), and it has been tested with GNU GCC, Intel Compiler, and Clang. However, this does not work under Microsoft Windows (even when using the GNU tool chain or Cygwin). #### Dynamic Linkage An application that is dynamically linked against BLAS allows to intercept the GEMM calls at startup time (runtime) of the unmodified executable by using the LD_PRELOAD mechanism. 
The shared library of LIBXSMMext (`make STATIC=0`) can be used to intercept GEMM calls: ```bash LD_LIBRARY_PATH=/path/to/libxsmm/lib:${LD_LIBRARY_PATH} \ LD_PRELOAD=libxsmmext.so \ ./myapplication ``` libxsmm-1.17/documentation/libxsmm_prof-vtune.png000066400000000000000000003540101415223013700223350ustar00rootroot00000000000000PNG  IHDR:ѧsBITO IDATxweGu/VUx䤑F,P@9 0`c_wI $ Y%Ga4LOOt:'cU9F{ګ^~+Ԯƾ3@#(4DF\#@D@$*BMR C$"J+D82$ H#!!p" hF!bLf A3ZhHn=4dm&jFvl21!Wfr-4#MBҨ5*@5Dnk`#Xk6ʅ.a}}~k@ lAGԚVZ<-}+i?b BZhݏ7nŁ/ޞz xS"|oJHC#-, p@hBpƹ0"4ok;-rk%x- o ٢RGުV>h~+tЯ; ސ 0V!L j+@Du.|ɪ' @mbI@$TF j=p%ZRky0 RЀ HjV5R(zX$ +bDZ iS3)|I HVګؼ]+d%iK.V5 4Z-PH/Q1jZX>blZVVhy+py+A!pRQiȴV2+,I$dn{/ dl %+Wij Pl.Lͷ&2:-j iO[)bH@@"c#"O(N[@* RT Z;Zn!E_p^1м)`WK^R -7@KIC ÉR[l5nXFHZ^ݿKV@b \% Dgp:DC)=<Ò+K\7ib)Vyz✿Z䚽 g0 < ߦ: o udi""JH(P I q..pH@.z;fDKԂ8ϥ&)X|eMa8!ޒ A 4yWN&߁!1٢Aɔi8 0II&t3mHD(tE[ Z ت1%TZ".X 7["mRpqݵ"q1j~1҂p5G 4B_㋞\Y| K@mjٛhF8to W5>RZL!-&%c H/*%|Z.(/j׹$eкohA/KFL/QjE@$J)`-_5pӿJtfob,86+VmЪ Պp,$ yd-vG$ЪVˇ%;kM>BR RJH bIĉ!2 U+%1"T ! LWN%&`@j I 1G v:.Z0}}_\χ-fl8wmVK%[{^Jy.<ڨ#95 2@ tit ՖwոV%|ݟoϟ7zg 0N`K6t,ގy<\z+7I#vEՅ0-~kG *Υ`]m/C$ϲH+""D`Fڃ"?L{~߁ lD L'0Q JIY8CB%I0 dsa:.@gB4 kyFra lE1jPKhLhS5mCЈglRVp$Kk'hl[~||A/#gK< qo'5|xC#檕2gR/A x钻ϳ_ϡɛCFįZL7’Z9XR%kzzXҾBPt4x X7_[r/@DP@Z#.`qs*1|Cg蔿ރ|]} ~lMaQZML#(@ZyCެ?Я-$"mfr*+rQf2T)ũJ,iJ籢sį9Q}7~Q/ t*Lï6nS'(#9@I --zNYNLif<ϯͩ$䂛iԡ4I@!"% liv&25"c#0Cl4 Jb@@I#hBE10lBDD) 2u-C%1)ȸfZm qH%HWhzb6A& 4lJd -$60KjA$sghs+UH%aRI f cH5  ju,a;*sL+@3m(%yn*hYNLS!`*04R)91pn;^j%~r@E $1XoY&J, M:$S ߱!Rh64m;7*b>;J^6WR)l(^bE1M GmrһN*8:'g5haj;C\,;uɀ͒ÎZ7xSr1p˙\4v6%%OdbRe2.c" AHs(m :`IHB`pLuR L@ @JP&&`B.8Yϴ8 v6)I!88elu"ǖoCZhH"`c%I ļ|In9 I\t[sOXZiF-ZAIH)pMS!LZ)S`*c: t]7RIN! 
D&@Er-mBZ)!T Z-%' T>@&"-e;-VΛs0d&ATIif(g\RZ͎BqR-e0IQгlF}W^Ɵ R) |Iq9iXb Lp٘8l(2l˩WːH2M $UgqY!Q°H (PI ]\') x.R&K -r)SUՖ L 2jQZ(FQbFE":P YǭWo<寖Xg׮| q뮗^n-۶^vy16fa;K\,;N1OGaT_gYi@G+ξF&Cj#ޮΌm RJ*PJ nH@W+8wٟ\=89>zXXd 8 4a>h-QZZ: RHHx5LMծM28JF:Ae2WNd eDiI N>?mݼezrrzz:6JAfl#pϤ4RJkb!:*  cgm$ X} -U8WFRXfNMKieF6zU@au#EI` ֶת~8i~^~Š^Ř!qVta(2m;43hږmiF! :cp.JR I`h>m[yDКB؆BDQ$MT,Vse<빧N8 u>/(@jO/[vw }=_d]'Ƿl۾wo|k|PqRu!:Pc@*Me._XjuszvDyIe;[?▛~|±V~~zfSiвu3z9ԕs:=#z/43^Y)G~5_ʙj?P%\.g 38@ q]kH:΁qaDM"g;S{2@ rTj == ] c\FaJ7+S9  a&* 1M: tP2 & Sl"w.&9!fM3tD-#;VK>2Rie7;'Zs=U$SFlf\c;묳Wk4)esRձ10\IcB33&8iq2al&Qp7EQ,FCiDlc͕g\6TʠN癆ô95a X"%H͐( à^닣X+u3IQ|%qQu\H\ΟrOo.>u?z3>~_=p@tt8eTZ~qisϦ BTEpooLH\ e0af,SH%pLVAc+8r L6uqfQZn[FvNN!#+Vjj:c=|םݝ]|][i@qMxgtȾ}_-dsȇ nK_3z|[_w% R8S Rٙ_":7Vg+V,3+S>rW)@O,7TJb"d\VS˻\T/<kW.3y$48ɸ+JYG Mm8Lg Xצ@^6kVmzz^WJRGa Ng&)Qkwup%2E5 R `iTg V&)ScX|TJJ4MҊVZ{MF7Aƍ4h$~#֩BLiԜ98N=K()H?(g|~:e4(KכQg ޞcZ(a3ob!7W t7p\(;&GQ/uwu07mUI\%ev]*0̸5vCaP h"g:NDex6'WDYkz5 F2N`EielV!?8Ԫsڸq](,xefrUPq=VZ\/ZΆq-R^l ?M IDATmnQ)5Y.lk`o8Sё^ +V!g3i0alˤ40|&aچa:1PT(9/hL5~es^7F9?P嚾G5k9W.wILvuX^o4df7# وFgg'21A3iwJVfL׍,: tLAo2uW[d^FwW+Ba֭=ԤgaxՈ~sf@-f<+W BֈD6 sq#M(4caQ87Swl+ </uWXzN(T~+ M[rmZ+~9]FDZrҍ"2AgLeģd<<=y'>VfJGTlI2739Y(f7o @2@K5"Qu g?7> ;nucmޢr]o4/ݻyE~HP ;w䶟=qcG_v+::r?N4<{n&_{z(M7qm[/}U 3ٶaX^73y#8M*z3Å>h9YnX0L]'z3Uff{o.yO( >+F~^6|ޅXLwjk-߿n㹧 }'x~o?޹sCCێv Ǭ_U2>w|>KN9]s?*=O8]2NMvwy?RA8t7CdHJصiX: B>k|.l SN=k}CA=%J^t%YfxUT_Wl.2]t I]xŖmL7y@}?>`R;|O:];w_o5$]~%_~x3{{9]Ztݻw_sͷTJ2'x 5+ږ}`x[nK;_~29_plnϮ]?W_|q歇_?[Jq+W_n633Ϛ}gN=ݝ]Zme]#~~أ8443:#bEyʋ;ŗ\p}$_bW8<8<16ͯ|Q;.m{;W-W ~$a}}BvdzO];#ױ9SOw@ɠy؎%!Fp[2Lss]94t\(N4g0G^Ua#̙(cY"UV.[|ȫ;棏?rӊRa7W>9$ԑVt:;Ft*÷b`{^!;;tc_zn`GtDDbu9\{mov'o]>Zt'e+0>L# 3=;l(pk{׉'طoի J&Ǫ֣ S u 'G=Q~ nBGz651aO>pܙs&~Y G6o9bYo7ȯ[?]9\'Ha%-ޞ6lx.:93Po!g @1ZP?Ms+j?Wov [cZ߹ZQmg~yI'tک֭N[tdYRB0BZVkT+N$dtuuE۶o;n\>|Źx3.X6xK/?ͯq-۶SflrKG?~±Ǿҳ&w`wtvwnp} 7or590:'xrQO;S0d jʀ V+ZmdΓN8slށ3:k뺫wϞc?ޑbikm C֖#6iU+^IR*7l{DV;ӎ<∻;n۰\pEqЌZW.U;;< Z}[X_곿}|7pp^ݗg+ײ R/ut^nd0MӪ}Ysw_{7@s>s|r]mGo{}ͷi~?:jyf6/<|x9|}27W,m>~}{? 
/_[ۿFz?;c3K/yߊ/׿$;+>Ig~sƣ1 Z1f2^:fs݌'j40 Q}^!WZw"$0gFk0Q dwMs{ /sӃ7|MM Г9j=YIPdM1QK.|wM>kˇ7wOzeiۚ+U}-˻v>7bf\s0?{Wd,}wߝܫ>yi'mݻc ϝ|wo|c8ZkLq5 EoM6-^t7mT=#iZq;ѷ~}߶އͯt> s+OMzi+W|yk{Gg+7ٖK_u׍NuĕW^}W6~f䕳9e@EأOꣿmOY{n7Ṡ?4g>sQƣW9ZF2EJAhk1w&kQHLJ>)7QȚaXv_/x2X6;j[o6M՗|c39>~6_xunӶfZZV~-jO>~?K(ͷtWXQm;n'̓O7粹{w7?aSPaeŧ&;n$ƸKU 'HSiٹzgL}_w9g^yʼn1˴n\fw|ͷ$R [ /LnܨCڜT]>5N髻j'tNC~'ƧG&gܠ/Nܰy';;dlnkBjӲ?lo}m=89ıMШ4Z̲l\b`pr|2\XJ|``hd|TXҀVYw "t Z'3pb=c< B_jy:z0ݝ߼vMrgdZ V uGy2+ƒ?D];̐ pwz{?渮bqzjgθo<ׯ?" lX[;.:qPwG_}ZDQ`fO>֭?gG{;Ri#F^zjY-:rRG0}`_>5::иgTuzL3 U׫a G'~OVv{qrαE?V(K)@SDG~?8(M<; f b8ooT辽mzts|lo7rgm?ffz.ը ?27[CβV}%RrGO\e_ۿgzfez_|ṮɩK.u?Ӷ4ivS`mnW_y1Bdt\, }=a^mtwwz|U}}Cwuؙgc-*O u*s~3zK.޴娉c/\woJ0F?j;Fw a)q%cRA \9Ƀp3_"YP%ȱ]&39Lks릩n`lUӤzV󅂗5,c?S,E Ђ!iYˑ 03[ݭ+VM_Xgs 8u7'FFqLyfj5\pj#macFeZUq 4SktwvU+\xj  ْc(i3w7%hi0K MTuu4xAض V,[m{ű֧f0^1~@wZLa~k{.i$axغ+W;51գM0 t\\Z\}#l۾otnζN<˷s?yo_?0Ia/C`$aS:E;"ImprDjg QΞZR5U?IoQeB -uwٹށsΤ[fF޹'v{7P̫4@.3\k,58HҹjJ l7Mf:no^}uZ[bʵ{Va>cQ(Tq % Iq$ ,U߄\>%0M7\޳/<ۏY~~|y8`leSe4knvwW\611gR rtEnZA:}/O?_j$i߁jm՚k^&) fF4TTdܴM?_( ׼CW|wG$("XO:nQ[l՚j[(uB߿ƽf/ΙϾzqwܬa9{W*ޞf2nh@\r_ޙD;WiN!_t\vZGia$Il/'iJu|922*7t[Q-u(4yJ$I8ZLS3dPQ,n´fju++SSŌwէ?'v = ?Jd4$A%elG0!ȴ{{XWպY'Uꐌk7+hFid$/5&"kTkIit\ E5u='xBo_=Fx̱'^k?'D+׬[fC<;=1<ߡl%I@ C H|e8r޸Z vMMa[''vCDiw0&F94ԯu:5Y3M'B8tuwww9NXus󥮁U_zAvZVrWxcu?/zǝ t2^$e3Ѵ bfA,Uw`W}WZ$e$`Թwܲ|6ۘ8kp7IX֨ 4wg3YMޛ[a_V6`` @l"@h1D9 A4r8%lS! -[&-n"(+@bp@̂ f}y[zs{$=;:deV~9_ n<{$NDkQځG]yg_ZX^\W:??5s߸ٷ㾝{y%v 6җlMC~׿豓iO_#o-gW;ЪOWK W#wyn9|z}ֺdA N{naYHGs;hf.]T @kbɝE[!4ۓQ\u:EaޝƧ?OȲr~D=d$O?(KKp/o=?3?7Cߞڻo﷾NhDCܾ>pgq<v;(xnwNPo6WW 8k_7|?r5|z\WΝ9vؾCK u b5,]8{qT? 
Μ;wswkqؘ߽jφ&,=umrKg|zkp>\T0,˴(=zHdn#D37l(nCxVp0ց]+쎻^wl8p,) DFp{vJj5_~[K&/qT˝s cQ 陙əK ~weiu Ns\}F2,OFwg{ωDԜte>;9})M(%":} 0{~/_K+Pꮻ[kԟz{g^8'-]UIr['&=-n.O]..pH"&i5vLwYGa:n^s++ȣpI^ϾF3zk'k׿sO>u悎Opm_GE0 =-ZwcG J2.-=jSӭ~o{;~F#}/o|c?'l`O?-,g~mnki-s}ϭK ] O͉ɹݻgvVη Va&&W&ܱg;ߚg_{sjϾO>i駝AsıǏ{=C<{ʚ ;~fjA75m"zgxG>C?>|15;Qs_>™:0'O}{{f-/'=sFg"{QYZkS3zS`emdzZSDǑ%~ozӡ;8w]}܅AY8{Çoy?K'&gJϵf59 ٻ|-TG>sg";޳?'_7_Z[ȊV۳,owz&hMĵq IDATMe6A,RAlt:{?1n|GQ33Lf9WԩI >O?VȜ:uG>igzm7I7_~51q?r'z9{;8xmG{vw}üI=i40\WZ/w~k3w߁wޏ}gϞ?yGȪIƹ_-{P9`"zӳ%<8lNLR'(Ofuˎ lkvӍo}+N`P z=F3=s+{WTDRmvg$:}ci$D{<+'>w=ҥ_ ƥK~3gμ-oӟ܎/կLvGwg_[qp'Mv>;_ww;g>Wk4?w\-D{Ҿ/|k,qRsc-//-ro}??x;<N61]k?…t~Ga;o}}a:w?G;뿚fC!zчk'3w]Z^WyiΝ hߎz{vyzJ&&w5{ge[uKVR" RzR;{䑍t?sԎw뾕KϝM⤴'-VF~_>矝{}}ٳgJxꩴ;wuVz{>vⅅ8x{ٝ{ɓ+y:E'νۮys, N,Zi;QyَΜ8yҬЇ?яȭww>??vtUcbڞ7lĹRUIDw~:tbmQ:Dޙڝd~Кݹ(n&\FFrq ,tm2W R|2m\dЅ[30Hb-ڰ׿eA:zqyijrbߎ]O>5ɄA~?wy,xv.Ek&Ʀ+9Ҩ[vy}}u5 tM"fW[$kI~|YyYD`0hu333Svzz*j$ AA)Jˠk׎FFgUذWI4B'64?a(frH7R6͆^^\\[_mzZ;k3_ ]K_Zfd̙sg&mD{kk;f'8fYXAn0n.N6z>MLjhŊG}Ү%~FlV1K1 \]]" 9To41.,[nG\"͕&b7/LMM8sx:޳>8_Zr=/}uDh6={-^i9sڭ =A}f}y@0IAZfL ^W@)̠WJ1sߏ0 vO<8}g0hE-(9[~s;8Q /˳jzY ;n=&^g}Sza8% ̄aF=K[J[Jǵzԧ{Wv=z}{h6뫃n&nF#%Z( 6޻KK'/.^̳b+֌iˋ]?kK뾴p}]F`(}C?Owɩi)W}{}G??| hE-cQY6V[M絸n!,Kw'S~7j6Dtdqy) ݷ'|[l18&qԳC M=15qR8eY#=SٽsװӁQ:PNo$y61lj O<յr81ɚ< 3JZ0¥<ϓ@ג~g|aiY:=PAԮ׻ޅA97ß42_[ݘ2/㯬KEoo5A㢬J6'V}{v9_dH-{N#pbL`k-RJ%I9 SJ(i@$Q JUoH<@4.(L𢼰fj𣩡<1U"F1qIjS2V1ŤO=gC&5959u<:)1 eތ7v]5ۭMb=N6& 0Zc  ׏Gr*[j o륊\@R}uAe.l7Ʒ#TۅMHEt0a E2h$ )ap Q:e) ` cq=KmHHvc"_ AجZe@ƀQrCxiTQ7q-llqTk( bt@QW hJ%+;O `@P*Zl}|Fe#i"/ Q -C< V+ qx -1-=9L($M^0ѶUxU#q"F T8^$Q"zbcÅ̭G)a`0iaJ10`72g;Xn9ƫ?bR#eVBmI@Ѻ{WRif\@ WBJH0b"(!'jmA UqhRW| FeHġ9VHh8P^UVm QzmJUfM?cűs 6-WH䝃Rڢt(:7~~XS  W`(R OqWi2mEHMZ+j| Y@BP T>zf%;]Lmsig&`qdsj*]B^lIxؾ8C9J}uX٣˨1%W`O77t<!ϝ$F+{u9Xi㹩[oc+ ̆͑ߒmjT.f[/BԼ3LdFF~RcuX (l(T}D9x~^e};[UMB'eۮl1 @olb٣+O=+HhRnQf)&&a_+RD( +"BFJlApնRETv\M+;!4Z5C`ͥJWLC^r4A7>1| MdyL x/EHI{Q)G4#!X 5IX+oe D܈g槠J\Hޜ)2 G&-"eI$tH@"@T[q jWۭ5[뗗h(^̤R]]%^AR( AITDڈLvq&,^Zh!0d/R ƍ a7" JąHX e@P";Ʀ̍æA8"dDt6VEI=0P *]* j&B` '}. 
IDAT^bʘBmʛBWtFbW@5B+(yksvQ PpX0O\%mš~ :!*ޙkL"G\a3mk횫 ϼ6;!8WJADPTlAq9W@mT#QA{ /E8.jUWht(IėZK-^VJCkC!&Ԍ/ؗA`PhRe5F)5X׊x ^/nGB^ (qR(bGU텲w9bFS#dkr ыQWol-e F6$,J ,E\K0͇Rd"`{v&SDjLd"HJ!X( 0XtF#]\\ 3 h-Vʂ=AF#'8HqzWfqm[\)T%p.0`( WJt431mz7\iq=v9]j_V v΅J,Ply""#440ye"3?Z'ER^Ė 6!HǢ.@PJBl^i*ၾKmũ9*n|o_Jsӈ .pda[*J)ηLl!2xW7Ÿ7w p(3`$5!PzHbrxnI,YwL&|Ǥ=)# nmE`-Œ`1RH@d WޗXrDr dpyiF^a%F&֪+I8,jk$翖z Ce*nn1Ƙ̮t{Y o쑮4^9Ϭ02 4L "AkrOY@4R/ fRTB!EJk"2Ǯ9_W@7Qc1!w^T- g*IW$)b=J N8lW`4O䧸jC֊H ⟿^ 1IoxmHDc I +5Vycі_(a7w2t,6+<5 "Zk"-)EZ(7:db$ ZآLj=5(hD s34kv%p *v ED$RE_nxSY>FH [mdӏ<Ha)0C ' ;-m+i_8L03;Fo؞#*C{Zs!Vema$Ff]h뛑$$( 2`A@@@H^cn:BeL왙{?hz.0<Xr-*qa@"|C; "4A"o(?kp.*9rY-g@)ZZ6ƘTHUz!$>)?My^1uS'Qyi VjcND(-jv)uYʤIҕEJimҼD1ٗ,PAht`W^oxs"Qw{aÖy$D٨á',fo+$1pժ0Ӛ&40X+unxcBqNe 0pH"4N{*4#E58pM/ ҬL`F (i @ xOUv8P&xTYy3u(Xay2 k5@oѕ7(@'+$BcM֚F6+ىT~Bڹ<:CBPFXa-$Ĥ KJ .(?ܣ`9EU$oTOdofjv{^U}D1@HqD+tqA$n6i0 ("!! C 2An(`.)a"acGNQ)!@FƐs(9˳9ր"(wH #+w)Mz9@ H1yLHH8M䟿FG!}:,CQyRDU]b3Spe$=ucO׌F=dJâE(4 "k4P@R"Bᐖj!% 01L/4MD @M![߼:P?!z9(r ՚yQ8,*HE-:f(S#ue[V"B+(E $z y6ds׃!6$$"#QREa` ¾7V׺ m^t:KK,zۻxaq0HW/..I-IQ%0 "[8ie )@IRW驙, jԛZ}J׼cg} !MdK V{ L2yVDQhe3,8 cX6:( awEIu/gRVVd8鲰t zq\+(,&[VF(K(PxtW⏿?wⳗ>o.`NbrnSq\^hԛFAZ]LPIR/{$4ͫs8 "g"=ў, ;Ga$(󬘙+򲺩>k1#0\ɀ MA8AEI-idYAГeEk+ȭ[aFhb,-^Kanqh`q):Xqˌ-mF` ÀHL`waRsU6*_s>Ϭwʳm4;Ti5>0aIR!Efg@4AU,ר7,l(au"/a֨0Mfw\=O+^*.,$ cfT̥i^epWQi g6!JmAy/:(aa"Wt[#rI덐luw|F-:퉉K/SJ)XuntVJYQr\׽>ruN1393H09bl -K ]ٲ-Q "%QD `BDTU]>4wܫWЫVs&i0.2㴜$M7%Ah4rBQeU7 `жn4uq3LEyz:YXp@H r eYҨbUꙚL1"צTVTPui^q$vf1t6g0 C7T-0^9.lƅ X1*ڣXv_WA1D ]ȚTd0Lll:b99d͌A@Dcs@_.i֪~&@$N0D A %K) $Y݉GrͿݹ:48g>𫯾n9sG\!aAa(eي \>/IJŞq1AaG&fòZMZ*øV{^j2cfE *K峣cHLNM"cLKǪjxW*v1eY9i^"T5MU4. 
0Ab,!!D`T釙9wd"ULE ,$a]hᦫڴ~k/?޻oW7p(f0Ac&P@l.o)k??e 0%Sı51㚮[V+ӔLL$UUeVaiϞ3.~o<7-*KB 4 C8&7p9'8!g_;##狶d3ǏtMB%*D^ rWW\X-XW ܄r)jWOWL%BAx ΀0H:Xf&&ݲ Y"c7/׭dݺlYE1YQ(V\`W\.\غutaj!ej߲E?ˇyҮ.,O%BJ%-˖UP*jryjÆ\p9˲d c=Mͦe7 iaeӅj61s NofӡCN:ht\*)Vk49gJaq0t2X#LZ8 N_]6w%i^šWzz޴dɅK\ٞR^w T5j}_7_OygQ$ àn+*BD"%RFG8]yر}|!h6m36-rB~e^W_ 0DrU']hݸjժ8TJ7Vg(ܶhP"D:c cy^XeVL }wFQt5]T;zߑ1E-!Xp1AR5r?UHu ';aB04o4_x)/TZQՌPVL&E%qqdY5s5{gUk,Sl:NXS*ZK֮]v3_($IIR1!W\yEKTU^dx[_۾}k9ݽ=4ry*/pMf"f"X$ɪQ*{n{nE%dw{MCD$Lc$-Y?ȱާ w r"+Қ5k6^qgϞ78nHi6ɤ3CE>庺0%aZ-B9 A`Jo<3.4CTG,dFӚH|&0V$]'(2ŃBqCb"1{W;e=|5 p"5u܆/{w; $%5JH8~l>lm( }`fv:tw@H(DFJB/tL!cp#f`&3!$F²|R- @%?(HG@ʚS Bt2t6|AE"4q w53NaN8oNckƱD@ՐP:YC+HaTWnꪫ6Ιӏ1moI]v,wY,_X4< N1 B@p޺@F)w~A^}@1CW\/Q%YթbnM hu_~iZ"S IDATp]W5bmu0U*RG( PLVnBfYD"kjVbW]uɓ')œO?TV\emg`7_(Tk3t!0v cE"i,[dll!(dwwwr]WU5YA#@1 l9FBE!g!A18c!DF`ߍ֭v"HlֆV.ݹWG'X{W,{v+Qɀ 0B}CsM,SҪi$(Bjm&LA[pE\W 4)Bмûvj4AP&?JD"Ez8~c8ΖM3/x,&''_>W^}yѢEwqݻwy~_eYDQjZϞ=S*sbQoQ4o\q&m;DKOO~swSML7V9A;,%E! Q[*l-A1_4|"ײy2 KBp\宮5kVsժ㴢(LFå %iƁaD S gU'O̔}?8]jZuθPj`IV];Ξǡ뺄"bЀ#? ʓ[^yiǎw̭~ڣﷹ(<^\՚E/d%2TEвt&d80 ]cX`Js"aH$3'B]'Nf(lQ, @2iҩkoɟ_K4M # cƺKc#10֒DڈH ۶TUVI&4Mjz!mgE,h4秳)EI3(%1a$!% "Dguڕ%đ|xYӲݞ:yzKy ]^% a_Ow_/Zbrú5kX" G$ "?|ͪՒy/1EWfv6d! 
n$i 0|VkJ8̦~N))Bdh{m*WU8f03S-0>1ČR:ʙ떸cEK #`~g̴?x.$S3}\&A#`SӕZF]=ޜ?qJ۱!j"v˺6nzUtCUIVǼ߃0 ?g@ fShb DG_fs@Pg+d2kwxC<y뮻q` }[7nDAdT",EQMW[o^|Vڶdҙ %HT2Q(YNϔ=_h>ikc *ffiC BwB?ÉL7bX4_H9B ļy +&&&lˎ(bM JlRw1i& `f"ZWUEZdqu8D@P=WE8r",@IVtU1䘈l/ZRLO Y㾁~]5o8, PJj2Tul||H[i;eRlIfk3?tÃsXindSmT&iJu3r۾"\UCԎ%…9)M fJbQA6FC L `b"pG.x,xH+ &.'>wؑw-Fh9 wE&y"BbXzF(""8xa0PĪgG9q ,8{ t9/,)2B`ZFBd"ږGN;4i"ɵݴ.һﺻ' w_ m5:v*iFQZ{*v+SO?/[λc$U+#$K, {wYw7_rk[_}7W4 ze3om{m1Nˊj}sZᆛn9ګV.YS4i9 O?(@*fb s D?{,}ɉĴ^S歶#8%09}흭U|.vw'?峹Rw^# lzuBtMT906mЬ[6.Yt 7oEK~:~a^$T|c)u(3{wbf5kBbȒIӌXw߹r**sg?OTIWЇ?Yw"hEK;b2\H,bTF,䝰H$McܡGU W\RŮ[^v饵FWU._le;qpņGo$+/߰h-/k s9gM嶛/]_'%IΗ:B\l㋎H ʚ)&cԦ'{z'vٯBUbZAluÕK={wλs}Cعș{+Y*N%B_zoo?QOwzl>套+4C=tss*?T.Svv;zk7sERkZBWC=) @@Ȁ=g{jNxG>c;#[-<}z֚40n7om}E UUݺ&'Lꟿ/T+W{oZN|3?cx>~tnWWśZb[^C>qo__Cso*RXFOU+V՛|_K_3O!{?C=&"&Q@ P]]!A9SO?z'S *k /?yp>9*BR ZzWZHƜB>[gz B&IN~ SXS f:[34lX-P) 8LL_t 蘗tDD!l9o}ұ# /]D{ʥ3Vh6b$1Aniͦ v3{|5w}gm}s[z789>uUUSg$"T*]r٦M=pԩK axv|ζm˲,Y2wxpǎm;op`#?,Yr'.]R)>r={QL(Ƙij-[q{ ,GFFK[oO~؍ziG, BXpXp!b8pLI CXP%w:ӧ'RUvZz·dBo\mKD%`"Kr*,sNF ͛7olْɤ9::7ٶc ,vӵ{^#ڻwwxզ%͔OriUU{{{L@_o'lvWbf aaݕH$B?mK7zc;/+| 9u/\p/|NRݻv޽nݺnW^]d 7\w;wnذA|7TܹgJI$i $.Zu WBD1}S1 xuezW҈#=SYs奧zSK&c`noa$qtT?se۷~k6Hi_/| ]wŋۻw[ڷgiK=Kt%wuw)|ժB1'?l6]]=ÃCᄈsxh(J?ɓ#*BP$UU˲<].qf8JʓӮ^3gG?I]w݊+e3Ol…+W.<ܹ%HtLŧTV1E{n!"S%ɟzup9` i=5*A e,O^qٕ=˃m?Hԫ0 SD:m{Պwl/ݷvۑ#G_Y~wqpѡ!UUСC׮Ys͵oѣ{zko~eW  vgs۲,]RYc&"su8Eo0-$I;eIWw^gxtb̼{w~9eJiLTzuQ}(|!Ne_Ly=wo|o'_]82B/LO38W7#ǏĀ\p$@cN0bqIjh?Q\.>(B6[zh֐$ QA) +ض!b @Ms(g^{f\Rd0FM4sBșLԊK͛o$СC՚54<s]ײT*5>18& 4LŌ+K[dY>p+5LҞMWf 7\(vMLVT|+Lƌa"_Ѵ`L6֕>z66^>xXo,'4-~rB`if|~=s޸v:T^vl~LHC>r a\"Tv_(fR[>?T<ㄎgΌr+[oQUUQ={p˪L4 wNS%CmMj.h;m*ޞA[W_t^t6v=9©H0&/}{?w=wFrǎnJ$jiKJg&?xБ\| 7wdr>k2bom?u\"tc# }WxFVfZcxv-],/J7 #N@S&8# pl{շYǽOTܿ[Ƨk` 0fB1Av6kAyR#*g߾?CGp=pgϝVF!m`oB-rzQ*c_GLV>?z!L.]JQMU7ŌABiH.v,@XӾmD1Ԫ~?g(2zd3Q};ߎYLXQƘJr]w=SBPǜt:mfooTyjr:/'Ol=M !e7 UUmnf3Q͛wСB4YE<\ו$i``7( B)ϟ:u^뺩*B$"B$M:\j5eILOZ8b)QnWW7d|b뮫T`!$01pX#cLdIZ_;F-mء7W_rd%ZN{㝽EU#8BY,7oի/_zVf+Wec|#MV ڰa+GQ1H0 ];uԊ掎9wĎ7?wޣǎZ\⋿*OY泟fJ% }u 
0F/_z'|1FQ^ify͛7sUUuRlݺuݾy9U몪9R،a0q =Lsa[V$9l*\0M4˲z{{t:JE$xEQfrttX,aLzz֮]_jtMln60B##|C˱<\TOO>[o9g1gqAH"pΏ;jժ(Z:==<7c"E*{Aoo oo{a#$l L_iBF)IuVNf2z%N)3F3qȚ"#NN,]6g z2fH1iXQ([-'L J0(QAusE1!4]<@ƭV+1&&6o|h ƹ`asL: fPԀg'Z03ٜKfAJ+MJi4^v۪ { IDATTRw^-W*]Hu"aXt]pq:SBRuDdr],qj-t]H8r۶ E,Ng,5q9,&UHjܹsTʹlk7WW""XzN@@jfu]4օkyZbTD@3DLA)%'R0593iG]V,-d՝H֛]mPȄ$汌հ8q%˦ZV ]Ihs$x<1Tu]wfJ<jqTbw*:B@t%yr~t'K޴4] 2*xL NcVOuUUm?e]裏?4nApСl6ׯԶAh!1*\QT˲r\WV9jΝ?w\"`ɲj(LZs]z饵Zmɒ%ZÜkYVcBQJUU]ds.[=֮](~P./\t0 .YJC ` 1(BeX|C53s#D|~99U7婪 cyu/}Ke?_ C$yn;Ku.]wJ1NRR)ݻ_?_* VmݺUӴT:Hq9s.'}=/b:|8O$xF_4\.YrqNi0:;>)|3.N&K.=wl>uQmzbA%n& 4sDٶƘcU(05I"&kVkf2: \ YGaF_ϐDI&4- C0l^x?"8;WXlZ$)auML$̑+J1\Lު!ϟ_dɖ-[f*zWu=Qʹ@EE%;47#N7rjLO;@])ڞV˭5'FZ0$]iluzzzz5kjۗZW}n!J Zڞl8z삅 ,[X}ɚ>~B`,!$-[*1Ν@M\v&9s4nBx6,[$bў} oh%D#;? a^CHR5UUc~i L,DLB!: .Z1g =CT4 ˶m93=Yf,kT'& c,>!$LAu]W\qEG}Ξ=[(|Te7N<>cL4t8yO?}UW=ce=z'غu?w}wVw)q>я+ccc333ry֭{__6]v?~xxnqlf"0 0(l\j*ꪫ֭$J?[m۶nk'x ̙3>͛;?}ű^JvMlo붝ض`i+S*H(xp١3CCvR*I2R1޾}{__߇>!,˪j$۷ϲ0 mn4Fq- jm*i-[iӦR$nј?oafB(%Q={jZ!{キiӦ}c>}zttX,afsnoܿRU{/_/~m۶W^3qM\oTmL1ܶ,*k ?O8041ƶm^,ŢQ)b,˒$uTUu?sT*H$.\w83 hffds߽{wNJj|0HR'OT*|Rcٳqu}+_rGyw,J$[78揢c-sw 5Wy<) hbyDr.j W#$/8\8~!AO>g5k vOcw>kXj=NLLZ,LvvL _G,zǞxj| 7k_j5wq?l4%abƌMviXeYWG?sAw5;uۑH!!FgzիeY? 
70555<<gq ؿN\XZZ2M4M!ZF5M^BKKKj+ZijlAzfj*f4mJ51f61+7۹' rֻ~|d~zJQZ]|_38m|o6W]u_~#R_J/ov]قAݸ˟~Xiǯ_~ժ5ܗ}fVYe:Hh[WY O  ږT"1|r7oկ}KP zżnQiݳgNZmo9!/ؤ/QHAH R86s,Lep"X&$yTb<=y?tzbj~hq4aT1A }Sg{eqOL_o ~kB2( @`1bDRr.$ ҀY ~<=2 v9::,Hu݌HJu/a 繝Nwڵa:eYVJDiNcaÆv8L$aT*ՅBpTt:BXTcSSS6 e.ItTih4FFF(J1]ץ jNWT:a R,z}ݺuF`Z.WjE%Ɠ}?0 cffZuBiECf͚-Oڽ;-f6r/D$DDG¦E'o};,017A4'=mۖu$<[X;PcLFeRdq0 ]ejժNj5MӤm^ k B!$9M"HMӒ$Qnh4!###q 4M1bV닋VՖ0Ҋ"c, (,*Wʁp\mښFa !N}]q~ztQM$i b] !BeU`7m3 8```}}0)1>ɫo_|,chJF~ۭVeמ| $`  ٧|`)4Nn覕t;qs,1KY/?{vaɴ,!"kX1@$Y,ˢ(ErZ0gy1622r1ض$Ҕ异i6l077e2 Y]VJŧv6M4l]7=99149tB85' !dӦMz=E58prɺixn,W,iñ nHD@T]P XfYHJCeJYCav$V+RdfEB\\0˲!E|~ii @fY65um)a00ʭ4<<8իr\(&''UOSOpk׮Mdff&+XѠJ)1jПt:3900p駫Μ Nn=JTDb!8ut5՟-n(0d@G30_V.;~94Bc8Db$9#}?4M]׭5Mq8;vF0 5jF-l49/W( X |j8Mas.nذavrP,q!_EI10-@Љaݺ?ƋV;C?5P_raf(r$r0,Ǖ5D1J=ߐ˥ gَF%'%6P0l\] qk5 I]oꥂn,nѩZN!W.\-%$`[ѽ H`) TH̤&qʍ3]0tcƏq#~HǞms&H ա$íNlYf%VR)UdH$[@隖f/C&@V $401$Br*cFGGOeC0RT3 2M!DF*8LaHi4NT/WX,)J4Mq]2<}jQJՠ^-۶ 25V庞Yh6[QX(Xz^m/޼y뺝Nګ?~˜C5LB Rg ̸IIƒ&%X[ߒkVn6dqh$n 2ic՗&Ķ (˝NX,j-8a#D>,VB5V(~k׾\jխ޺k׮\.W,cjmql@rp=3iw]7e۶`nn.8u]/.6![N ICJ]Vmƙ"$q u+sGa GiZC$I.iau$Li"Zcw-4jv..!/USrQnnwxx1V4M[ZZR̪ ;MӔ rv{۶mgq0֮] вVtl۶mw:@cl2 CBOoif r-o|Kӳr13cI1ΐDf`xxX5lQiLrŅeYw:]EfZ휗㜧i"@Rg4%Ͽo~c1.lU^g,F A&YB5cj^* ,'ߝ'4k.{,V,SPιo׻MRqP]7[Uhmq20dih~zqhl4 jQ:4cfϗ.e`xKfjIʘd4C'Bʁ@n P0/LD4 2t|{OfBPCI3ɩ'nq!}FGGv۶mpSTk֬Q2JEQ>W!-7,cme,BhǺY1-{>1$ #`KF^Ef!QOzі5 7 IDATyV u ,|أa@XX/kK^|OaMbgqߗ` ~G؆ [<] 5ӲA$G[{4j2%K@fUcoy9= =+M@O<}ASw;bh7lخ= _srQ"$ ~?,HS Jh Re(mIJQ(K(@H˷Y{} їSZ[ //iD.3@Zp4Qk=P;M {֣ۮ VFjX JK@`) , Hl2J@oW(?:Gz!QB@hn jR`1!+:(Kn{)B\+qdU(?ZuB0IA&kSBs9@hТ\"E&00*gԧѶϡ'8һkUG<,;Os"BuP@+l ;maI@fs|a ճڂ#MQp `t$@yr&t"3j%@X#uЕ-GW@dO[(+BuLhT9c&!ZQαl*nӷ fJ2P2Lcy1 41ġSFA \=),Coa>"H$)EE ;>׮rNJX[:4h:yO̲RJ/ @B2ԻB3hE)w?_f˃4A%RDYc,'j}.SszdXōoWMN :Tuxc@DWsdaa1u8%6e@=i5 I$)Sf!5j:C GsIGj?A@NFx0Q-Vfjaq=$ ,\"cp٥\a"4 i"\]]M̐#̑&#APݴғ|B$H>$(mSbbM'ft`֖@Yh @`S p,3/Zee[Ts@2&S@IS"!)T#HTKD@ @p {T"0@%DsD7 Eb]fXYeHџH@B#k(!8!Hp23z6V)Px4 C1LR񣙼0XrNB8ҰYJ5MTd18E XʉGk v%]1Q`HFtN "[3C`9Px1˾K Xe:d{9m!]j} ij a$T`}} 
DR+{)la@ | =4@HiB 4i@P@(NH-( (9% 4ظd%G $8t @~t)@r@kR*`/E)8s:@$$~J @BA@4(qBUj[Ƅ~q$+l)@¦RS aA> }lਠLz.]%ݎ{ܣRH@Bjx.j= H~!<S]9QL@BJlwĽ:WPI硟/p<:HAӴ~ OL"`,pB QA *C%DXJ a q$?У+$(`@d $)N)A ,0K"zp@ P!&6:@@@ :T)` !岻KR#1@2ճG[F'P~,c,Έq&0R>ڒOTD}Ts&HK|0Xʸ@CVaeZ_UCС?@pQ(pޟL:  0$ % 1"^w?n+Ac9/ "#A`xuqEZKa)%BpL H @u4B@"]jP?@o%0Y@@`BѼDb )+cDTʁ\ z$@`! Ā0B{ u<Un65} D"P`h2c:@B ]@$^f W_5#>?TS ;A,F@($=# g\E))Y=9?8<= 4Mày4IM݈NS q/2@KQw[ %8: Qϲul,uz*]葌XB" @"Ft 8pyF'n~9QiX:hsI@dHr L'0G`XC@p"U |d,uXX>zlO eD BfInP,"%XFPQ$p%  1]`1Hj H r(#XjI@)H\F2@p`6p%Dr K`q`6F(a'|&QIףQrޥ)H ;1p7tI,΁ eߟ8A10i zJj iz9dR[ y2ͶP5<ۧ ˶!Gxz@0???99x+Rωm;|']eٓ){|,0B7ha /)[:j40XkYqDHՕ,{Z@^"+23⹍l |7ˍsÂ&ՊIа3\=L(ܜ$ OZkFpͥv-6ljtppV1f,Tf=ɲ NBt`z*W(MSժg3ƘRJ430!!"%G!,J]jӴ3|x7/xu?S,;:j)@˳Y8ĥ ~&SuXAb9,!z>E??_Y:HAbH:V~F8tDu` Hb"t`$,t,1ǩH2l@@eHdu].s8EO--#U^\64Uj7ϝEI$eYJS8Rq^>.}}pLGrZȤ! Q^6CW~9 \9/T'$u%B )1іO!TJ JJINN(qBJs iֿH$qKKK^3 j*AB8jwUV#)("A(84M4!LJy̌ltx,hBKs\LRqi ! bXr*61j>B ] }`+X2hmA)Ub0tt]7L!(!ա>a t@r9,W,@uzy9Zj@*eM#(BX('5-+] c IA"4,KeZr03 \v-@*9хf3 5F5\I28cqЯVF^rU:NTo۶&B=2dQ :`dP."=n}nB0V7j6,4: 1*;}/W"K X>rz! CRK7LUA8̊_ơu6 c2MC08u];+sF;zhZoaU( "fT* aD \"$ i@ƈP&8B<ðZ߿px(" U!ebp0)-̃k/--UZ,1^(lj( clێRSqRʨ!؞-XÆ \a)[@W`@RҒ8(J/Wq1N2L _Zٳq 6 02`"1RϚ 8HlgSfᜁwEmUJu@"4M1&(Aƹ*D/_czSKٻw5kTC˲(bOY *纮SO 1!c\pMה|,cBJ!dד;;>SF1!, l"?znW %cA|^\8s>zj]K1֩Si? Ų>G I Oxw;W.reYbQ7ѩR;f%E1u cL01ƘpUʊ3p+X h$c'|.;v޽kNа iRBƜ1U&\"B$RJ,Wo=clL1?;W**R0"˒$eIF05-4$I=PT6c7[5??9B"JH"YZZRTȊ1ݻw/,,^p[6o 8!JE:MR&2"1J1ńjА41 .o7x>O>yaa;4N 2 P(q˥U (%>׾K/gpÍ;wAB(I8rJ[AS*T @"sbL0c|DgoW`+XorB$cKO~?{9iϾ{W=׻,/\vرm۶b]wu\rɆw=u]w^aڵKȕ5nwn.{[ǯ'|`؆|wꫯ> 2 Vסi-..麕e<ɧYT.ggg]qG6o|w~_{㟽?)MzRo|O>yWݻW۷޽ھ}gn޹+> ObꗾoͶ?~cO`fCx͟\848?;b1˹Bau\s߱͝o@2~a1UVe۶7l`U׾ǟxrַ[_~ɧM/M_z;|d˖-\? !?Onwgyf&L + VP@(qJ1VQ۷vݺǗ_~{^JW\׾nַ|߰a˲Fj'&&>o۶;̧?~IGޱɧw׋.b_|^69uGnt뭯|+/$Io3ifi:;;h`/1qi{kv]ygyE]T*ny {wOoظ\.m֭[7f͟uVsժUsk_}?FHEJtAN:  ȲL-U,)eG9Ӷo-kZXܳgW|p:6ꬳڼysVO~"l4֬Yt8. 
ivi?xq O->Egggv|Ֆ[{m@)v۹so5,;Hb.y(r\t*Y޳S.J\@\.3b{ի^Bo'p իWw7W]uϽn=~i~f71 -D7|ӽoB5ׇa/5XIX V(4 yrƍ/^uUぁN3;7m6q/nddIJݻw[500T(JA([bff6}( Bwޅ$^y^5,#)Y ]45QeBL4 P(/?я8p`umذ!M4GG>}}_>y_$qyi622^p+пJRۡB^^}z31>:LA06A\,@q:==}9}^~[nR]׍Z %/ksίOyf\6xb{?;5knڴ ӟ>wk׮=3l}}{ )## kwH dv] prrZi%`Ηk۶=<22?7W.vǣ0Oo~akp0vlSKy[Փeah8?F{={׮]ULyJbqqq```r!4;c,;6nEFZ-Z88BFP()mɵkזJ;vt:˲ժbrr\.nfyoюot|t2t=#X{%ی#\翸s^קYciR,o{v 2\aY&W5jԨAjS8&ݗx[;NE?-QF;mi w-${3JŐO5jԨ}v_GTӆ5j@e jԨ"CڶFKR"m;' idy&H?ISJ(bYN) ,4*0Z\SB뺊BȻ\.{WWW(JXu=Y@;dqPGkFm6 0p8\,STR1MӲ,EQ81[˶qBDi(mAJED(RdBJ뺶m3d7sҏ1fƸLJٓUc5Ҷ\.F2L,˾K8gΜιsaB8i0;z?jˍ+4P(@&}_D0d BȈlW*BR)N;#[!"cLQJy^$m[4ƘE)W5i֨QDVr(JTkaq!i\}`1kN:}ߗ_iZHD b1FLLӔ|R8 Ð8{N(|5j|ȁi7~#qް, !c5l,-`JBiB!}=BimuD"B! e4E)˲xE7AuUUHCˏGdkyڱa#RTd})QPqr a iajX+lۖO1χB!mfH o#5Qc0>kP*8eE"˲%.R-h 0UA˲|4ugҤIbNUU'MT,cX___ss0];;;b|wpps>22Ťx„ i24M)@)}δAFuSZӴH$B4MUUq8P(4M9>D! @DT k뮻.[,|vumDC2PO}3 cWtasQj}w*W( L&3NمYߠ!APB),@Ga[ >Rj5vI VU*H$R,e\.G"4mۖw~$ *#kX%UfxgO8=mi[W?CCx]d}O6j08}uhSƾy?|={W,h]y?hI'ѨU,Of!4u'/]W_6|@~ഷO|P* #4ib ,hhȴf2恁 S\H-ԞL(ˤ'~ t6084lX<P<+˔P(yގ޹jU5jXVM3J(?qf( :Zu@K;bM7|zykCg?s.h8mŊWo~: edʕgN:SO9eo !n}X,Lp]7V*!'PjԨQc D! frPHf4ftc}1mYP.W'xssqhThUZt]U5|E PDBhX, Kr /r;S00`/4c6w|ro桇c=}==[^z˗oٲ1:=}JƦ?wuon֥ U)W,Ep8jj, b:8Sh3 pmcF={6clpppԩi'6uu(G}t[[ /^Se'}?wHm|A P((EV\Fj(J%X)Q4M CW5EA<8zWǐekU"dk 0Ƨbݴk'!0~gGDhllBH#8ABb !RPj@U H P( Py| H(cTa00ИPJJa /t{x5PƇl.JV~Μ=[Uor UBR@p2B B5#lيPDBdgcD 4Q5D) "BH1b綢^_5>DDX,L{';3g'a;y 0HpD焍7 \"YP"] @ 5PLh")TF!SJPJv-l3V5>Drb(+"{@Y:`\(rdL@ TU  >H`xLq$@ !#'RJAaBc [.w5vRd yԭR7 \`0͙1!BBe@ 9A(R" \S?S;]!! 8 F]ϧR  Qa;ʭU.\\zK b13(TPZB/B  a@t@"D* U @(PƮ8rNs$@ )!q@H! @)cHMvEi֖3)]  iA) TlzƟ/>W"z Y o?,۶ ܣ4$3fbp8 ߒLPu  @u,W"/ t/^=WKQ#@b A ! !@pPA  2^PT `D` @* !p&@*e2anψه;]]]Ps;A%Ķ-#SiPTc:U(*@S R$"W9(W)f P)+f(.Xe#Cx4*8(S纺+{yLdRjPJ CwkByT%>Ba#(JG5U 刌Sxy^$rɒ)r(! HUqP k岫iLw1Jyp8<9# D B!I@P@T X5alr @F(@TUQCZmT8#r9#' ("[' A% HTA ':Eй $D>W@TmV(%B!U /P9ĔliP&9UdT*UThm5z7(rMCE#,1)E (V{ERiB !@5CzPBj(( Tڷ`\Mo˦mCNl6NJ=/*۱d$t x<0}c `* ӣ&*q##DLg2 !d[}(aC8bJ "QH PuJU;Ռk@P! 
D"'8"A@VM!@ _Lƭٱ$?nD>mJq DKx\ Jz>@Hq,B "[b5"pD$j:" $@ A (@b<Ʋa+Qd d04M4=4 mˊ)%=e@$ޏM"'UA3J)XTJ*D h\ ol@h`;pD42P$i<\ejBz(˺+ -*tEd, ah+X DAF!„ *14]V6;ؔa2>id4TCOt]SUC;(VE$q|!(5DVֈDLm֫Ęȍ9ɥ,)8uf//pL.Ӂ\!! " p@~+)GAUgݶ>s@Um'orדDdx4cBuQM\u Be$DLn P9'QQ]s]15N0:Zù빩xA`|][c~5hi(J>O&p\.Sa#Td+oC*`sCS=ucQ;<:FCp[{{TbTD"b-VM^tmoQx]>T޹J$QT2[QB) |,L5n* Be@PKd/ .[}Eq2mdi7jUqn#W^rDB@ځDRLASXjcmmJi r%Fe?Or>BZH$r?ӧO=0ՌGcшA  8R HJŢUHƒT 뱨TxVY]464fcB>몓5ݔuTU5 Cֵ2Wk *c*cјE#Ba4{I uјoL ci`U*J4w]4ͦzUݞu~*JO+ IDAT`A<%NHuXƠ[E2rG>@ ҵ IfX.:vOp2 re3@J A Te8R0e(#RcaߪUE]DTiO)`nvÇ u?OA0sB"HSSSwwo~D"1`=aF[:J],r|Ez D]Dep вnj)i c֬-Vsmlh]5ݐAf UG 2(arL)%.gMrL*m,0P( h>XltVURN m-PH B]BkJEӨa(vT HH(R JB\>pJ r7d;eDABa2 c7bYc.wR\qk6y'ޑw?H(0A FpU0 @B0B(T@upJplUﲝB,˒(i+of*zgSkֽqg=dr͕r)ON=Evaҕ\eۗ9%lܼwÏ_Xh8~^X0uQ'Cfbu9rӟ94J%أ~uǟx|9Ѿ>0D:'d򗃀OZjM<w-kɡP/~ /pwnr?@ ē `;cqu}\. i"Ղ!gHxaQƨRa@ a@CE26'VgejLH.EsiN aH!ƥ2PcLe1#Da[ e2 HI@H". Ȫ XU)rc*.`c{O @iV۹+(OУ|~p5\a庳>of׿a~G} ?>GuԵ߿OŷrK\qJ%[U_|q-m9UĆ ֿ_9X, (iW^~ۗ 4A R@(7o^\UUN83<;\dGq98u'.YfZ B,[viz9|8k+PTXZ')z!?g_cӫzT-d* nITJ􍽛Uy`$ 4LnZs{ᇊve`$*hW^ŎDL Jluv'-2yG}}߿+_r6kinˮk!$JKž^J)bVH7† G#y<9G}TմUihhFGG+ Sp_.Dƍ\yWU%}:tgAP&d /՗_TN|藏twvyW?кDP5i @(.4MAAyk'o~[Һŗ!PXӴxS7ڥKO=ӎ8 } 'g˾lU--7]Þ{y544Y >SG{|cc{" Ŭ A?Xj~>$GF.=3CӯC_ppލi0bY!b%-TU?8Rʜq 4yknT:6u8vRK " d:38}hh脓O_Enf]__a3TCP(*GurkZ&aQM{BR|P.PH PSV+V*4JqV*bݽg/_wÜf,Zh9B|n8J57yRuW*i(X"7E"aA0\Dֿq_pӦM{H{% JVYg A]Ǜ| !e) Ww\Y t]W_f\rɟl6{衇yM*JCCj{' pX$v%zo|q@XX,s1^AQJ~e ʔ V\h*Q8:;Y0i.{L"x"<#Jѩ Mutys~_&LK&8<744ڶR)HضauԌ3N8ᄗ_~YZWneW?=p˲6oL(0a뀨j${yO?@Gqܒ%z#=zWl|s7:]׻aw.h3?fÚ})S|s|ss4bN(Fz3|uk\z]y/|ᩧ?s=hL3_~+e׹spQk׮IDF"tSO=uG/\ۗ^ڵ .xcdW^o4 9Q59x w~eUW]yUWZ|s E`͝W^^}W^yeWWw}9c.kׯ;̯zuwum~%5fg~QG}ǟ'k"MЖtd&fT g?KK iYk`P֋Аkv޳}Ko~w6iu ֮G}%([6p¤إ]wܒeW^ln`7榦:Jiwwa뎌m;Bc/bTY`ATZb.R*LӜ0aB_='ׯ]8IڦMv=vɩTJ׵Q]׏8∞3N&c;hp8݇ iP,/.|iO}c;"|م~/|q^7]S'֥|>u_\8gi-J$4]&&pЧ?_r',wuY裏!x >S߾g^z駟n|114JGYSWcX>?\.[x8{Ol^< |p[7|͋8;S;~{mB2F׿e7smǟp~ۏ'5k'O44R *DI'3 /Nߴ#au\ͤ2A-:|?w?N;}'x~r~Bpd;c>5 
ʃ!KZƺ\紴w}\.weuQLFA*%cZ;r9w;mڴ'xBaYLIRb=mtbY$ڗ,Yr'n&c}DFiO'NL~|)9}K:CML&mzҥWn'n6JpiEdF.{WJiR_f(\__mwEr;1Y{ͻ[k;<=yd}zʔ)XTf͜E/}Ji2o ['+P vm .Z+60J Ә9qgϞ"r8orF/tsIutC!šҍznH3 Z [?8KsRin\}mū^wXDa(;Ect=pˌ?~Co3Ha޼2Eӌd$qmC*&k ~3J@v]w:}rYוxG67(m1 3fr:Gw(1u]֭dj PM|;P"-,bi͓IPckKMS 7ƞ3?PaK}m=O: MQGŲ5olν m5%SU!Jא3B!\.8"b1UUKRKOL"߁VRd2;N-˚1cR* xۚL8D |5H2ƤB)B=$z S<: @$" iu(S T%Ҏ@8D6=i: 7@*]Qp.$S W}ztbm<%ORt}Ϛ9Z> IDATAj€N]8v04+ъQ (=W T)?˗-k4mh02k)s'g5w#wcuWZ=jZTJn"*OӍBd_T( P((&8'k@dU:U֖_\԰)hz(n-2~ |cKGǬS6ӳ̼#8-YU-&鏧5&*pC{"0Y}o?(E.QlVѵGw{9]]=uu-Yۻ xJ3^׳c^ +%džɭ_~nnk+^9ē Ð I ( Z%}*ZA(| uuuiɹ2~o8O"@ Nx}>>JPAqq()1pA`(zB@Q( [ dKvSݶa]Ug>UuzN'!"$HԨBq|T@?1h0ɄtNCuUWխ;{=|N#~G>zsϹkZkm6 @(jcei4As#AP|wR1UV̠xQ|Z" ͛~]^t'?Iwai'-lr+= .ؾqu׾g/}};y3niL='^ᏼ5{.IiMa8q=7]u3ڱk_?z{~"X`m$I|p|ЖMsTԈҢTgjZ^o4ds+^pɓg]{^sO쪋/8w]jm ˀq ,_xM%/k睓txѥOwoy`0'J^LDzw׿+ٟ}%cixs_[oFgw_{WOJ>|r/{?~㍏{7ꛮy6mڔeg=33SeכRpg*3("g"˝3ϙ]? s^m~<ۿ[Ӟ[nO}i7JQ;=k  TT'ͣ&:Ss͛uYk±Z8NQHn[$i;YOOOFx4Njwwכm۶; `ϏZ=rzM#! %leӳcNee߿^GQVR}Dن ~s< kI]\]07a/yKTyeYVR;vm߾}aasY\\BZ={t:Nsэ7ؿ:tVaxQ"j4w fuzcyرrТX]PܷU9t1 4{>lEQj8eY2ƪp8:#I|\WM_sl JU&I516$'RjmmTa\ Y6qh ! EZV:NMr= 0FRW*Yj,K!DY˲%0 =9~E 哶գp،jd ÊlV-V?R}/[ ٶ]DZ֚;vaTZ3@b4D,GE)yJ{d2FAȶ F,Ne?_ֶe,ũvFpmTkEN;ZY)]QsmV.Avl6zVkh8h4Z+۱T| VvEշZ+*D5DDD.+éxe 0BJrB[؅,VV{eq(pl(~)U%nTUsQ_/c }D2;;[ 4M_B9@UzJӴ*dYeYUT;SjbDfv#YWe;6mEd1y^IZps(gz_ף֜T'3!|<?r:SyyTkF+]4bKзmv0JVigVWjPX|ƹr]h9#!%fg덚֤*4m 3UŽZpUq8e麮yU[1J]yEQyvJ85wp!aE0a}2OGjYV0 }߯d.u*1Z^^֬,ǩaV{դgۭ$8EQU*p-2ǝֺy[1_,Y68;Lš]YgfdY\C`t$'5p43l[Ho #',K<#"uqM2n+ f1!(F`"χ~'OhJ zA`CL羻 $fjd ObXV7^=]ך-۱>"~2YyWWZ&Ҭjc?c*v}s9pnn[Vŏ_WV(]wݵe˖( 62Zu8*RjW7췦7ܷ v~wy<=>@߿|7M"jy9rT̙U)TU|wNNN6M[5xrPKEEo?zsk HK33`0njX WG;:8ܱc6nܸgϞ4vկ~u׮]{?ٛ:SS,Y׾/pkV}{~kkx@)C%㇁9e@ܽͿp2{. 
+ӳ[sy^nRn ߽閯oyϽE/zZ,Rx0B f*Sנ \ "҆`8I}_<87;}tg<&ʫ^mɍ,Lk_/N0uyekɅעଳRr0o{= uTR -UQppz9s<J6ݣ+?M^ g=ѫ ^qtl3mO{ߝw5;;=E/zюۿjUOC i2YӃ8gCIL ֱGʲm;㙙Vumn8sSPe?y7t/ 0't {) 32ʶ$- d|,%")TnN:.gOWǻ3:vtD9.qg{O[oz/?xGq6hܶ\ rJ0gr6_J#f qZA5Pv^nnt/&/j,lWR_95(F'qɝZK~=7oz>}ӟ>KGOp)TA`E]Kia3y)Jّw%Z`-u8^[:zZTSC taf&a`ҡV[ q1 m !S}>/=Z1c=ױGN|!5=cm[۷o^seyO,.nܴ3=eL~}_Fk ?K=\1fT-8)R6=z5xռÝ(&Zw~7-{oWWյ` 7~ozBٺ/yΏ<[Ҫ/m޺V@k5(5J@p_rW—VA&2ɻ>wzG?♋K.N_}ή(m37h2A9Qp#-H04rrHWh4"l]H_ M6̠̘c-ݻ~Þ}7l~{Yn_\~O}{Jn$+u-`9rC~- IDATP`&0 JTp-A!upeiP& ~/p~O}{}?9?gnW_K'N,;~{N4~ϵg8KǏ[2WZM*7ׇqP;МQ(uD ٕ!8TP$: ?nj3yW˞/o飷Ž{w=YWM|;7˾-=rӟ]ݏ5x<$H24APF1hn@&R}DwqFsa3ǏS]+Wzd" PmD1DaZCi0X][Ѩ?g>i`_^f'x/~>x<+??G>tȡC)A `4c** cr@CK,+3܁匲,ϓZFvt}ws(x^~ӧ>5Ѭ=_=sJ=z~/ݷ?~3y%)lĵ! 04z0bB 7 # 2:(Z=unYޚ:\y3[6ݵm]oχFk沧=#jΰgFC2F9BpVuZ6hu~I_guO?[;ٱЋ~x~vӜW U|Y;/ap׭xW>߾n=5qeOs/m2. C HciÕ_!٩ʀU@ajx/9J)sUQVLic˫+] ??1ՙؾc{TwwYѪ8`.HBն"b&8x  3ױG ؃F{Z0iir__sݟԧnVQvQ |s7ͩ)b?Q@26g=֚m+`[Ikn{̦Fwc;w{ss]s6?‰o◾_tIEްd` 6`b, SRA0b[iMV̛h-v[f!ѵ~YDhH 1I ȨP__AUIpDF27l-[n+r}8q,g4 j6w߿$Oh4'jQ&X*ϹȘȹ9ˁ1g#N1c Sad:O޳?뗿B~eϝolܶS~xɋza;n/|3%\ǩG(ր#b` ʠ,#L R'ŨP {au<&VÐuo| Y:y=5'_?䪟yu{޺a5j( Κ4 lhX8 )z 6K$ ]Wgo_Dž_Fk~uoy}?{/Wv8e85 ! ÁzI))@4qK>ўm;~ǡ~? 0L^if] yȹGNhQ佡Ⴕc95!30|3a޴ h>yK_rB`44+h4J ¶&v*hfc19YªLbC 2=Gim\!,mg4|FNpoZ\1eY'>!|_| 7/Fiy??j8pyuz~oݺY20Frf,I)jJh@@qЧ4v=Ȓ(a|%] 5{޲%_7Yϸl  0cNߌ-8`F35 ibT ss\HKR!. 
e`Y+cA©翴걢u0d _+;#\0 P `6t`y8X>1P }ә`dE@ZfZ0oh0'~a=\[AYhw`qλnysx\۷obbuHj-,,lݺ^ӟ޴i@3 '#`Z RCY6'-XSeá"NdR&iƍXB8i'I!`|@*M0Yx5f`d H\ðBp>IJR+ RFZ'a+k΍"J +V&jg0rk=ZZ'<v{[M|VHNd3b7J\JHpѥԌ1*x Ȓ4==q+.qGWNm۶Vb*؆E^n4:sPܐШ׈YFc̀FRgRe*-=&\cI2#P^210,,@2 (0mwWZИ޴F1Ҿ -ɰGxmBgڃ5b2d0?UioS_[tdT&EɬH !qHLOcO׹ױuU%,>:ro;woYmAVoM9Vƌ`1# PP2`dH,KL J ] Ø%81ܲ1Jz8gD QfR*RN{breu4qcF`Co0!4nQ Fja4A1M0)}f$x+&sMEkÒj5޵Q7lrDӬ0x-va8ݪJAh>+]/8`EΥJ3S)*/@3Vza-(x.:Zi4RK2<^5&V }Uߵ36Bi \(s $51TAIPTLپMSb TM# X-iJmAEn Ǐo>疃'3ngaq~f|sm= ^/@k]Je31V@S%Af nj x 83cddŌ 6c.ok4;dIhqʢr#aJGBc.`rL 3)FWl0@1dhCư֡G;8(.ұ6;K'RVZZ1SSiYRKGkZRqfDRZ "cܠ 2:]pb㍏?P-;I7/jos<׺5¶8wheZ=zѱ 1y :(xTT@NĔ-'&Y^ #Hz'ۺs~(X;(ʵO8rbܹS)uر[q\9`ei9yd `aaq1P gq2\Dx5qI e)iqP9ceYY^2XY׺SA9o6kP02m[9y_x·c4fwr"N8`3pHH2B(A2pr0Jy.ix3.bƖ198F]IwF5 #DHG+@8|0I)e%T="P€FPt f$aljuCv49(VZmNZ 2ЬRI2zY\ecgG @'X1RDBdY5i xJM[;ɓf[f,rm6 JTRa&FPM<\Rv-__a&Ib;Q䠿\U9&a3 0$ /Y$A`P "rQɊprg9V8z]Z{yWgNwuyzpgr3wz1@_ž<"{@21gM'}o ON @Lk  c@}:@&̔R)i$K,?~.W*T*5n6mirASN E35hBQ4O0!.C ,$$3= "[( *e:Ps︖p6}׋Gc:>P2.,/vAvG:j:!d| 0AG@ ER(0ZCD\`%NٶC(Lsy`!EXL,B"Ox5 @ jf4 MaBn<7[<[[&Lِ ~! IDATH8*x>%`^$**t:mDŽrAT$0RB =ߒ#j9*MM#4* .X7%Fѐ@ϬC @ =)0&X3ca:0DM06bB! F?I&`IW{L:LK @`A [v BQB3x.  K/ 3?b > "(@À04= xx>" M2\H@ЧS>F}bo&3m?/[_aDPǰ|vk=;v4&fY7kT9u]0Dß;sa-mo9RRwu7-=㎹̝7׶Lx8 2mFqh,J8N1L|u]<aZ(*NOO @ND}昦L |7 hRڵ7tş?x|)OwFNұEd޵y~˯tfmW5= Wf&љw뮼fI*Jv[rJDSm[mmY ^V[m00X>ؓO?,3Gye]FQ{W9 .Vٙu/?UHFƛ/iyqqV˸e _<.$UE9VZ!N&_kF;::{z?dsι 3'W]k.Dz8𚭆%\WDsw0ݭO~c?x !t%Jt<9Ï?rULn/Ƈ÷+ S뺶YhVo$)%* S,@+oG?NrW4=QfTu ?qYgy,_,Kd0LmT5*P(c< xDFnGx.7{w?C|鲯~^v߼ƺufV6/?o$Q{\.w?,h{ŗ^ñ.#ʶ)mdag_xՏFn#eal`@yA c 8رGS~Zow{}E"(z`3H  `]ph%wj:T__|zhޡ1ITݽ|͊2-OZ25kMEQ, "ov\mBLK2|$FY@Z-T˕4MRY<52Vj0B, h4SUkz\Gk8LƠru.hY {Lf=|q@yeE Bm;r8)B6NwU*/fNL::P%zVJuBH:W>PxQ!˹f?T믿njj/Z_ݻoKWzٮfuq]w%|$%_;<ɧq_teoɸLF1+\jՙgy޽+ϙ3^۰aw}Gf24ɤPi**O=Uo8߾{;a%K.uy7 ̟=?W( Gx|>?00_~晝;wΞ3O袋*740a&o>޲-OŃ>O2Mxq'twtA7x{~~/_pjń(0['2P0M\ub(8! 
0M5P޸qkb'ɜ1o:idmŞ,F3kSkl[h"t0(n~d6)Hl-  7F_߭o(?PI^q]w]oOw9w!"8r\gt/o"$ROϞ7 4_o$w /^}`,y-竛Lv ܻNζ@OO_|Yga8NU#8₋߹s'_%G!Br_`R\;7%v!N}GLSbxf"BS3ފ|FgH ` a0` (LbBcB€"E1NjJuXMuݿ޻n[^PQ $@@Dfpjτ7d999Q"r68fd}}}ݙlԲ,]O:8pG&dE4mycx?]8;VE=tt<5XlΜ9T\(T;: d\塡DZ,~OOw|+pWØ(rزP(fAx2ַn;; ܴg{Ο?/XWTb?/ј@,X?sQFq_ճޡᵷ[nQlwKU-Y4f͚788N3jLLLA*bq`^Xӹv,'Juj׮][TYccc>CݝVMaL>>d_Sg d_m1OV?HH0S@H@B0 BHD"Qv}oj26e5/͝3W.h/p7 5\oo;sνGq]ǵp "!6u-x_Z_ת?o޽{ Gju]^~>vК5 G}ey&Վ?kF״]sM&۵}_K.ΥSFFbF4x^FGG Ϥ&LF;sO>u׮=?yǷqpJFǢxOoo8:-[,XP,Gҋ{6l~K/H$F~2, Phppp޽]]]tmf0l<-]{7ZpBE=;?#E=4%HXh@$ jwmg9?־{\2TUEX8H& @vQT̙39(1o>whTLN(JXZsϽͦrʋ/+x$uݴm4X8"p5C\r磏>z(뮻\ ɮ.e-q E1t3LJfϚh_ _.wZ,閻x/҆ =/,>s= 7|cp/0.] tM7-Uxa DRbZPUS;'6y$Ä{_) -75kqdrtt?ܺcE=s=]{lۺ?=922 rc?,˥qO\zλm*Yld Cb L0!]}f0 a3a!4&t: +Rf[KuD2(^5 U*BhLwy덏vz>轟<ĢyKV.^051AI/7}¦]KCY$5-3B(<#1,nTyޢ6[]O<#9$#Tq|BJ%>_,Z̳z񥗦 ӂ ꫯtQsϖMgwڰ" e3{ghd޻9餓:@H=WVY(_uEQV^m8~'Ml@(JTCP("<]s/Ib@ͦ~s n]ȉ ϭͦTsT?^-7zࡩ7oH|brƛn^߼,Zp@4ݾe+biZ&12:l0jK.uQ'9I"HiAVòH$QVu g|ϩTjm{Wjؖ# [no/\^կ~uSX,~7(~L&or\?z㎻MP"es~wq]w7˲˲S)۶)Ő{>r0 }GtC?zx… WkDb*'Ȳӳm֮d2911s [tZx0jTM@HP(DOQT8D",'Skvvtٽc|g<~#`,@0ѾB1b  &At h>DL&/UK8 j2~8$es90( R0 ;뺪c²lV86JZBN'"eOMMzz٪ OQ/r=ˏ=ioΞ5$%*EQTlvE’$er&0ڈb ((3PkzYRl4,NJ|p(ǰSӅ%KW24h4Caa'uR6K&z]RL&1jѨi#sg-qMWk;,4t8])o=qǟd~Tq]7<_|yGG;322ʹuvv6Vٙ޳{'Z}Axp:U~jvuuA~ءccck[ 1MaYmv}}_\hBi۶c(gy|055Etoo$_|߿{|N9s3B+XYn^z||RU*XD"{|qmzWe{=D| ,G6n}E͛7gr Ҝ9},.Yjʕ"B(|kZ4;dY^xbP8ôh 4(Gi2,0@4@/ |?pDZi;zޒ^ #=/Qe]ɌnX("q86]_VFGB+ή~Yi63Lh4pBX,p]jZdVfYv,fA,t\wW*|i^ <'K7__>vg~hO#!|HA (HDB "E  a@@}@"@LQE!BP0 :::_ MSz6B@[q,]bcS;_oܽk|dd,<\o? iZs˫Պ\.Cyo-0dY(R.#L$A0$8/˶}?pGF2hHC s$x`rr2Hh- {~\!ir1.TJ(Z(y}Is{ϟ֟瑑۶aVkbbbjjjǎ;wܳgOPd2a0b6}ub"| p6mh42wj5:sM;Ͱ$ɮ޵wdd$۷bQקMZsdxOȮb1qP(6lX`An޳{GG'Jx.9!L+ՊT2z4YwaDѰ\)Bp8U]DžzƘ[oNM&&;00FfiwX I#,/ɩ7}\.W^yزKk699ي IDAT_}>pEJԓOa6  (JjZT!MDc`iyçg!DIhv0L)bg޵q; x,!3\(!Hl^{7nOYuSO=uxxXRTw}7L  SZ'xkzŊ]. 
c9}/OqU研\7Řea@XXH bQlwД4֖¶$ ULQҞ (Ix|dt||BhX X:D drbtRq Kۺ}Yg]U7[oNOOϱ71gww ^89TʵZtRbXl E˅b䠰ϨLJ+h)6j1ݙiUtQ(rGW%+ 7|+׮@t]-\Hj\yXX8 åbHD+W?~/YSBӳf}_F7{e 50@ IZL&X3R)іAxdddǎ_ި%/| T_~ڵkWZGØs1JXߢ\pc/_zOyWCCC~U.Z-ߺ3OZ?EߴiW|>H$8+V{ÉDSN9S]q=P߼y˚57w~'rz* ^zj%ƔS1ʥB?cf1k׮|~2x`͚5b% XQ҂ ~ao޽4MBy=#/Kc-2>]߾溫N94Uӯ暛nT>c?w 6rl;|d2tH7 BA40¾/~^{oTJ<aO~4M@!g$!$뺮>BB"*B4M1H qwi.`q_?|+YXu3W 5{߾RqU{lٔeP(T*uؾak?߾꫷ڶߢe<DX/^;{ڵ:3_`KiY[UɊkVhBc[L jv,8@&߱m,˱X%-}csmHIJX,VT,^e6YhzN4ŰT*twϞ=W455U5l۝3{>M8A@5D*!ƸQV$tcA*wt|N-˷47w BvI|'SKM;ᄛnGa:؊C[wBZE>/Jؽ#ECq߼o޸difI8(twONb9VY AZ,wt-[7b 2c@!`p!q,Dctsvh."˖;c#Sn~tҵ|NQT(RBajގ1޴iӒŋvܹ߂V%BZ1A. q " h$14n4Tê(qrZ Мe헲/ZJ`¡prD(Ǜ#Q8@d /(ު*:Tku0DZ<ЧZ JR~lxlT xM%VH1 r?91DYx.m;˖R$' `Ja2G*a1Lx}[j2hh4Dcfxr|W B zzۿw ,mNc?9I7[0뙞f!L0!6*υ$6U8\6Rv~4H^wxEnlN(! [<´a[P,+;v+''jj͞oy5b,h6YgXBP֖.Zxgh$0>Z4lkxOضNQj 3a(g\<^ȥ駟^*x!Ҭގ;j\& P ҁY:]{v 5X"d֯_k>C1lP _s5{Gͦ(Js/]Xm$1 כ bWr{&{/_f8$&& D¦i3 Z#~ oq4HXTQU59j#Osn(D8& ht!h5t;j-ףY\Ut5-R  g152}juHh# "/q3yq2r;:Z鍆{\ 0}ah*V >Č+|^ɒl[jbb NVBR0rJ#xn  EZ)?% l!eY%pSTRr@{$kf+PL]L@,ڎ38k|Q&R$T,$F>L16dbR!xcF~ ]ϟMerb"EqZf,  !fsf6mN6ӆ1be9:n /@lAMN !je/ dO wu,FOLL̬ɲ |. `xx8JMOOwvvjWT4ry2$HD`#ȈHhHy%h,U"$IjTyui uc*Jvޫ(aXz-˲ f7KE&M3b1q,26 &GGDߧiR]XxvNtg(=k޹1 ޥkH$bV$P[wyǟs JiBaa#|T: HY)ceWsҗ>o)rL9U3)4MD4Oy3VEZd֭[XJEX! 
~bS( N" إ =S7LZ$sdg<͆( $6b.Pz>@8($)u!ffGl%BHG2,pFFQ3ІFE*XEJ\*՚f%Y&v\e r4CӘaX]79MKeHH^G#* ) HdhYQ\ r@`2Nga4-zvHKu EF&@/NO;-6OHI=< qG1c+ !HӴT2ia蝝9U \G |"`/[0 !x<1& `.@,ܢ3aKq-&`)l6Z㚚wY1`RU C{eYfY6 u83=Y(Si{YVVy;O\Չ&4ig1b@(T$cD1 ̠0*249vӹxîj@Q|}]ֿg0,Y:HTt[9u92C !rCR&*JԋAM*- Ѩ+&M WF&k'0ƴaq8,+ b۶=wgzb]OĠLfF镋cS4H"n}uD"Z::McB4l&FJi0SAiX6Bt8ŵ~o$ [O9{=Ԫv͠>,j{qL:L vRܙr!mzf ϕzT触NDI7rnFE pxxVa,]lIYGiv$rNc4 uB9 #!d_oԶ0Vgso15iZi:nm !-* F V|gq\7B!jwKBF6͒4lxhvgšT$C٨ZBk6JrovATMɤM6YQ.."ʙC Rll5S 5Z(f-X $L`~\v!=+Vs.ZQ֊RJVJh5 (n=(AYPǴD(,GZLM7ǓmFf;쩔F$ev0Re'IhPѬV ^I0(v(lˮ֪rju]JeZ\.@#D6NzX]t:aDQP(RHEv#G}nb qBjV=Zi  @PzQRnwG \,Jq4;bӴ=tBDl2@.$l DGUYo'CkNReJ'N%R$6dCR#5撰Ү Aa\EEܗfTk>_?Z:˻>I\n7 @C(fNPJCCSΓQ^t,DZ3<*Bبٕ>z9EƲrfY(lǃVj &sgopK蘶 j-B8 hò,PZAJ$yf;-%TZ ЊVIf73 ˶-KI9y0VR/mC<{O.kqG=-K&~e8VyB &G__l^k)9 !$B[Y ڞ[lض  HeLMn]xgj庎LAou4I9,$hk'3PHcyV⺮R0Ơu$Z.O(4L($V Ա!&vbCkh8 !;!㹰T*jby^ZFoO/F!Wv\wjǎ7Jt\&v40 IDATHL s9?Mir c0fsi|_9jiZj"H!V/(*B]ϜsV%%xE%$\)EEJU`i"dD3Ih̔Y*􁩩ϜsԳjVwS)Ʈs3]&'f8 *H twLUe#!D$i[It 0ilz@Ƕ±dX]QнZUP۝FEHJۆ0MH[XNLN.X4d4 cDZZNZ]d!0M[mnٲ-Q(%b>WGF rQJ9T+ehP=3Oh}LMƝVtLfPPCR HHZpIl-ZIҰNBϓ8nTh wFiQ;׻no~w 'F)鸎@Ġ jjr}e˖]z饝Nnyyam;0{ }n @\}՟6nJpu]uU۷ ap2n>گyScSwzۻ=7q?W7M>ի׬~(p4.msAT:Yn5DAurc뎩;|dR D^&9*8T(;XbѲL™ISrtp53r'i&2fuGz>{ltдiubeD8j);B@TRCm:@Tqɧ*⒑L0 S]s$t6dcnY_F(:hL>ᏼ/ΐ p} Nh*@۵aJ%MSy\6MidPaД'9?MAAv=44Rz}'tR.oHg?J!}}'4 OhZ{ƲA׿|gɒ%sO~0aۖ[)pܱsa˚׮:f{' xܷh-5zؠ_Uv $hYb$AfK~!4d&MdZHvc{](^?W}7|2OMM]|ůzի}=;vqGqěN=?uE(3y.Gqǟpܩitf=sUO>g>իW|oo{Vc- ?N;ZwqJ8>ܾ}8Js)3KNO'Bh;NV źk33cccSSSsL.8Mƍwܹf͚\LDQ ~%`RJb NcA:h??G?Y iI<˷(dt0% M4IŤf HHh@eH@ln`s5}"YU}K|yIጲfEST$ki_sٱGa\ 33-[~\q U&6sWM<'>}٪K-ZıRn+ZK/C?W^t=.k&a000`i+[Qz>uo|5oz?|kK.nvŕWrakN{_w7yGߡ*ulmW_}~wY399y%{q^+rxұ+lsʩ?5'ӟOovVv#o޼=X !N3@*iprh|\J#Z_{uyls=3111X.v .`hhǟ{wka~o3 3 B:n'yG>\.y5_Z ncT*<80NtѢEwL5xŊ?Rrzz@oo\JyZ222 (FHH !rҥK)y~+VJ`*2!\ щڑV&au,5PJ@S : ;9ӕ=!d(_ڣ=KgNgzfT.'"6-W*jwZN+W P==} j$1A LV,I۽BӬB߰¾>uZnmb1Nؾc\p~JV֬W y)WrGz (Ԩ7Z-LOOgČjӽ=l&I22<,gZB!Gs!W*9I؎N;LK/؉'׼R:66rܼFYCRk 5Z[ֶDNLNnܼirrr &'' ü袋 iwumI" 2}I ~_6])AW'q099 
;V===oO~?nE(%avw.LL|gZ>죔ʔsDx <ıRJ5337t>/?}׮_߮⊣: .m?M6lٲ婧ڰaï~  9JfY?0/sONm8-^ߊ_f1ݘj1 ڍ.XsPa-n2`6iiseK]y`幓 :M ĩj32-ӻi1ӟwɧzAӡK n?γn_\μ?>xч{G~dnѱ{ӵGmz~{;swn_zի|e˖-^s~mbJ!>'IλN>?sΊesΕV/3θk>5kr-Go+V\~'xȂf,ߪ_o p.;t:0~>#Gy𩧞/~j y*HE *|g}|ߏEK 8hѢvmժ}{{+˗/VVkjjj}?׽N[zA$"݅-ͨ;Yv.7 8\hn#<ZwطЭědE;4m)jlRM Cbwff&-fP'oSie&mg@QCE"86R읟z9CZrt:Q09nP( aQunTMt{.I)w&BMf^d8S9kNRr'SSKCB)mj63m alۺ ,R'fᅥBO'Z;3u++ፎNp֭vɉ{ѡ!VŹe˖LZ v ,`^]_SF+7 4M{l8)z8 e.ݺqnZhg^Gy裏~~["JAk-i JW|0H{b||P*j=לVffjeUA !r9KΉ’0ߴyhbAю%+7u{{T{Eu/~r8% ӄIT&Lc)$HvrU}n,1߄eH֭#Zj6zP f6@)4&:ф`HuHSF)47[ks#L59/)bd+s}v@it$2h lG圛Z-,xwr|w94`q>7L80-T*tӍ ,VJZ]OH) \V+[*$rQjm)ēWznY04t?ܼl鞠$㜇a$IP;O/e4fo㹧RqD,j f](0+v jR7"B"qZ* ]"MLJе=PA*v?~sZxɃ[ܼeTTЄ44-HU3{5_ȫڕrtWγ*`4( JJ7)48:QqsթZ/"PM5`D4^?jWbe-Ԍf],^pQIx&aY8̙9F&n(K' VB1}[XA֫4!BFE&''1'DZee(عsrx_ de#1h86lR,N}7/0BR*MlmϮvy\R PJ+BH@aקK`Z\i 46' `, v7ki6W $@ E9A $@NӔQbfRqxj*y;C)`^B2˔ 5ф)hC (F#fSl#hy%Ϛ+)fYyPs%e&,2n RhŘ %JѰTL5bQ+ h| GMU@PM S 4"B@SM(~GhNi2{ @Qg@!beۜl$I%%:+ 6 (Z͋sUGe4qEw߽jժ~|8,c<3i/ŴDʠqMTa1S/ꚎP4[3|1S cI̙jHmH3 BC*b!P$,#MB"j&D+p0/(Ϩ u$L@w:(LB͉&ːF}vccNOi1ЄqMˡum 5ՠL "$6-S3P P̪1h+TZPdv ,99FE̕D( ( )mPb[ n,"nHf M]#ى{i? ιR*kbYVǛ7o^tiժNGJPk$5(֙$4m7JJ8"q\&TaoʚLufff04{Aqeͩ0Ic0 !n1([QJb{n7+a>SN SR*a"鉎4zI$1BAW2I̵$hj NT+MoIwߝ1 ZK) !ܠ`, J)58!HR%0 0%0TL ¨\z2?\iBs%9LOf]ɋ? DAffH*l0RIF0L֖eQJ3Bj\Ga]0 GGG}===.sh.sE@ 4ބQ[BK ~un9k&\(19*Rp…թXw\WJB)JS֔$IUG &;ej*-,  癞=Q(0,+-MuBPh By& 1o^4 4id~ g NaERJI) CD)%%&ZkrB RP4+30Wd3sl[Գ2tyc9 $ppH kP Gb 3/.!$3'(/ !!mg⥻x6]=I6`,-00\a R'"|``H014Xaͮ9W%LbsJHmK v<;ikO`PdNܠ#B 4%j>87.N>;%9 efA|ei#/Yd*3g$I4k@) Բe~=<͹ I%j0u|c01g[6\(c9 2 %%istRK)Mk((%R _pFeٱ1vL, J2 !R#×} >L6CsP0@Q:2$Q%)Fks8͢~R(F)ᶚ <@)rfHBa[2\eҼ!n?[[lCd.xYxQFsHE Y}$IR/OriJI4MTE0 4H]uEHf"BَeFP$IX%H Uh^. 
u8$B[s,9gB8i%irt1HA.{mWnvڠ&Ԓ+))Se[ۭڧ>m$_vs GE= h@lXeֺzQJe8lT 59*Q9T 5afG;LqԲ sYnYIDATܺ썀َР,IYOҮq| B`hh4XDa7ᄊmLr=B`OWxj̋YLS E @ZPhH^7of?&& &2ߚ1刣|M6q , Z-[C%7 .a; s QɔVZ39=fDw5wֵ毫]zn?2ts3 ^qݺOoxs@tfy!vwv8N6dPF٠g?Pi,?>}yy=3Ktۯ~{],K+%U9濿o4jů/-_7Sr!=-p 33S{ʫ>r,(N,4ʢ 'H4TÐiJPs14TRE׆D 1&|Kp@CbÆ 'pbJ8U}SW* IJ\(Ps1@ַBf}jR%XU&f(r\&I$3Ӷƍ[.#wk=a˳Al9Qm;* #jpyP$sXufӓJApX˗󟇑88 _!LQ.麮: ì&(zntݵ_?#:F~OL옚фNhVʚӢ38?5ǟ8O\}ɌBޏtbbs: ^J)NJResnxnm{`:R!њAC>+n$)XL5qy jgԼ[2F)EAF{T$R+48Q IuVɐ[dzrlrr\9::s疭55nXDT*A( ;wR&Ij$DZ8۶mK33%I2=367 =Z>66UcیBF6Ҕ[vZkL)Rv~{o^.#id PRm۾sbrpLUD@^c^hѢEJiݵm@zINlQP2w5<-J'ZSB^Y=}bpjԧx{-xUߴ#?I^={zxco|^u;}w{N8aݺuW_}ŋ7w}z뷾-%pGMoꪫ6m .8.SN:Q*t| J9 X \ >OFFF>zsWi7or~[9ӿut{t\?S4aY@.@z0 {0X̿gmy Zks^i Lv=55l$Y9oG/[oޱj_.HjK/KMo|> O{7l,#1f4&D%d6IRE4eJ3ȮgYdzYAvN;ظqm㙙/}A~訔)s]϶{ѝcO:?W/?7>^cN_}ߝsЅ|N}ӛ߼<xgu׾+V|K_{LOOLLL|G$yx1MM^y啜/~kV袏孧5kw]=~}B4R$iظ~C KReƯ|5'>5'<߾Gqo''>q;N^

.'9C-B[7mW\IF'vg*Ps tѢEO:İ/G.&r9ߴ]miFݖ뿖$Ɏ]zqLڰa?{wy쑇ش~~gwaZ~ח{\v GdT+ HJ~A)iZf-9fM2l\9>>t8751mwV`~&=eS{W8|ө~^{nݺמ@.c߮ZO~SSS{O{{'fBnJ\ WHDjB5eTϵ'D\2Uf愇y9RjphhTX\h DQh4y[mND*M y'&Z6AS!'-[zŗ,?U<]̄a822wرf͚LX,nٲo}篺zo+(O?z?EȣaTnĢ8?p:ˎMKR@H~|Lg,.1L&Є1@bT_Lɓ'fͺy``;w2^^]ׇɓdrls܏Ϟm5 dMȑ#Q0Ra@6͝{…osiy<omm9|G>lٲ:HuPdQB v:yܠg`>xpIgԩzdU,:VE׺HQ__&5Lc9>bzǷ >teYidFs{Vu`,<{XOҮN( ]_ٳW Xb:)J #`a# @8"#A@ CbP2aZT,HR3Zؿ8N!LĜX*n@ Wh&ϞUDDգN)<㖎?߾{7g7<;vʦtnܸ'\zo>sL4c!--]ޚ;onیTmYmw{w/?#[/?ݮ1"L6ԃx| <㕷n{oH6:sp`H7DM{dwtKssSs%KdsܦBٶHx,L$Yr5kǙhWEj_“E4v#GOXm;sq$/ǀ%(zOXv4ioڪw9C:EZu_Ȍ}ϊLf$=̚5 o?w8V,vlWc#5c3!KD`cp)H2$[uuJ?q.㲔cLbR}n`)#"Bs0!HѸeEKbKK \p!;7o޵kW2m0J7i۶nh<|=˲"ݱr QQ1`ьȴM)JE !ahFsa(BwrT HE@<0-"/{i>?BB8{N8|rEq<ZXgW {R8i) p t'dBqX c狅f4j1[]s{^ʞ+뺮>0cɬ!eWn4#e2L^>wχ?5 4?^>)6U`(˧OBn0T`L(րL=co0aFBj&dk뺮+1uѣd&)զOM(@!LJ) 0M=x裝F8SɘJNuj)bNF4bZBzرch4N_<ſ1D"*1aƩNV@#iӕ/9, `a!bZ ϼbqccJ VL# P4I74PM 51pDoۿhiTg4(OlO@O0|miM`f\GYV C |B3\%!X,[nϞ='-[tuu)t]!'ZB +8a(~AEdM٨MӤ(T;aSa>VP}?JAy^"Fa %Pb:l5g۷oѢEǏ_|xqm9Ы#4;*J 4*ꃺzP/fb_7{Nqz&ƗVhBMgrB[\ۺ8:K ua(wW )e? utBT1:0j[)R c3g"xhhhav2(" %XdT*8X=bvtTw&j\ζm5.1Ckh dCR0$`$IШ, G#mq)":BappEHŅӱLeSjVSI$>|1$?TNھblNvG |֔ ) T]]]}}}?8Hd2+VE׍p@I0ƄsC7L6Q!~)ע_*g|n"qzLlrDhPcV,-h4! piuE&9e %HIENDB`libxsmm-1.17/documentation/libxsmm_prof.md000066400000000000000000000116111415223013700210070ustar00rootroot00000000000000## Performance Analysis ### Intel VTune Profiler To analyze which kind of kernels have been called, and from where these kernels have been invoked (call stack), the library allows profiling its JIT code using Intel VTune Profiler. To enable this support, VTune's root directory needs to be set at build-time of the library. Enabling symbols (SYM=1 or DBG=1) incorporates VTune's JIT Profiling API: ```bash source /opt/intel/vtune_profiler/vtune-vars.sh make SYM=1 ``` Above, the root directory is automatically determined from the environment (VTUNE_PROFILER_\*_DIR or VTUNE_AMPLIFIER_\*_DIR with older versions). 
This variable is present after source'ing the Intel VTune environment (`source /path/to/vtune_amplifier/amplxe-vars.sh` with older version), but it can be manually provided as well (`make VTUNEROOT=/path/to/vtune_amplifier`). Symbols are not really required to display kernel names for the dynamically generated code, however enabling symbols makes the analysis much more useful for the rest of the (static) code, and hence it has been made a prerequisite. For example, when "call stacks" are collected it is possible to find out where the JIT code has been invoked by the application: ```bash vtune -r resultdir -data-limit 0 -collect hotspots \ -knob enable-stack-collection=true \ -knob sampling-mode=hw \ -knob stack-size=0 \ -- ./myapplication ``` In case of an MPI-parallelized application, it can be useful to only collect results from a "representative" rank, and to also avoid running the event collector in every rank of the application. With Intel MPI both of which can be achieved by: ```bash mpirun -gtool 'vtune -r resultdir -data-limit 0 -collect hotspots \ -knob sampling-mode=hw -knob enable-stack-collection=true \ -knob stack-size=0:4=exclusive' \ [...] ./myapplication ``` The `:4=exclusive` is related to Intel MPI or mpirun's gtool arguments and unrelated to VTune's command line syntax (see `vtune --help` or `amplxe-cl --help` with older versions); such argument(s) need to appear at the end of the gtool-string. For instance, the shown command line selects the 5th rank (zero-based) along with exclusive usage of the performance monitoring unit (PMU) such that only one event-collector runs for all ranks (without rank-number, all ranks are sampled). Intel VTune Profiler presents invoked JIT code like functions, which belong to a module named "libxsmm.jit". The function name as well as the module name are supplied by LIBXSMM using VTune's JIT-Profiling API. 
Below, the shown "function name" (`libxsmm_knl_dnn_23x23x23_23_23_23_a1_b1_p6::mxm`) encodes an AVX-512 ("knl") double-precision kernel ("d") for small dense matrix multiplication, which performs no transposes ("nn"). The name further encodes M=N=K=LDA=LDB=LDC=23, Alpha=Beta=1.0, and a prefetch strategy ("p6"). ![The shown "function name" (`libxsmm_knl_dnn_23x23x23_23_23_23_a1_b1_p6::mxm`) encodes an Intel AVX-512 ("knl") double-precision kernel ("d") for small dense matrix multiplication, which performs no transposes ("nn"). The name further encodes M=N=K=LDA=LDB=LDC=23, Alpha=Beta=1.0, and some prefetch strategy ("p6").](libxsmm_prof-vtune.png) An application that cannot rely on LIBXSMM's build system can apply `-DLIBXSMM_VTUNE=2` during compilation, and link against `${VTUNE_AMPLIFIER_XE_2017_DIR}/lib64/libjitprofiling.a`. For example, TensorFlow with LIBXSMM and Intel VTune Profiler may use this way to gain insight into LIBXSMM's JIT-code (see [here](tensorflow.md#performance-profiling)). ### Linux perf With LIBXSMM, there is both basic (`perf map`) and extended support (`jitdump`) when profiling an application. To enable perf support at runtime, the environment LIBXSMM_VERBOSE needs to be set to a negative value. * The basic support can be enabled at compile-time with PERF=1 (implies SYM=1) using `make PERF=1`. At runtime of the application, a map-file ('jit-*pid*.map') is generated ('/tmp' directory). This file is automatically read by Linux perf, and enriches the information about unknown code such as JIT'ted kernels. * The support for "jitdump" can be enabled by supplying JITDUMP=1 (implies PERF=1) or PERF=2 (implies JITDUMP=1) when making the library: `make JITDUMP=1` or `make PERF=2`. At runtime of the application, a dump-file ('jit-*pid*.dump') is generated (in perf's debug directory, usually `$HOME/.debug/jit/`) which includes information about JIT'ted kernels (such as addresses, symbol names, code size, and the code itself). 
The dump file can be injected into `perf.data` (using `perf inject -j`), and it enables an annotated view of the assembly in perf's report (requires a reasonably recent version of Linux perf).

<!-- documentation/libxsmm_qna.md -->
In the last years, new workloads such as deep learning and more specifically convolutional neural networks (CNN) emerged, and are pushing the limits of today's hardware. One of the expensive kernels is a small convolution with certain kernel sizes (3, 5, or 7) such that calculations in the frequency space is not the most efficient method when compared with direct convolutions. LIBXSMM's current support for convolutions aims for an easy to use invocation of small (direct) convolutions, which are intended for CNN training and classification. The [Interface](#interface-for-convolutions) is currently ramping up, and the functionality increases quickly towards a broader set of use cases. ## What about "medium-sized" and big(ger) matrix multiplications? A more recent addition are GEMM routines, which are parallelized using OpenMP (`libxsmm_?gemm_omp`). These routines leverage the same specialized kernel routines as the small matrix multiplications, in-memory code generation (JIT), and automatic code/parameter dispatch but they implement a tile-based multiplication scheme i.e., a scheme that is suitable for larger problem-sizes. For *Alpha*, *Beta*, *TransA*, and *TransB*, the limitations of the small matrix multiplication kernels apply. More details can be found in the [description of the xgemm sample code](https://github.com/hfp/libxsmm/tree/master/samples/xgemm#xgemm-tiled-gemm-routines). ## How to determine whether an application can benefit from using LIBXSMM or not? Given the application uses BLAS to carry out matrix multiplications, one may use the [Call Wrapper](#call-wrapper), and measure the application performance e.g., time to solution. However, the latter can significantly improve when using LIBXSMM's API directly. To check whether there are applicable GEMM-calls, the [Verbose Mode](#verbose-mode) can help to collect an insight. 
Further, when an application uses [Intel MKL 11.2](https://registrationcenter.intel.com/en/forms/?productid=2558) (or higher), then running the application with the environment variable MKL_VERBOSE=1 (`env MKL_VERBOSE=1 ./workload > verbose.txt`) can collect a similar insight (`grep -a "MKL_VERBOSE DGEMM(N,N" verbose.txt | cut -d'(' -f2 | cut -d, -f3-5`).
It is not even required to use an input or workload since the information in question is presented when the program terminates. For example: ``` LIBXSMM_VERBOSE=1 exe/Linux-x86-64-intelx/cp2k.psmp [...] LIBXSMM_VERSION: release-1.11 LIBXSMM_TARGET: clx ``` ## I am relying on a prebuilt version of an application, and I am concerned about optimal compiler flags. LIBXSMM uses JIT-generated code according to the CPUID of the system. This is independent of the compiler flags used to build the library. If LIBXSMM was incorporated per [classic ABI](https://libxsmm.readthedocs.io/#classic-library-abi), `LIBXSMM_DUMP_BUILD=1` environment variable allows to print build flags used for LIBXSMM at termination of the application. This output of `LIBXSMM_DUMP_BUILD=1` can yield hints about the flags used to build the application (if similar). For concerns regarding the code of an application that cannot benefit from LIBXSMM, one may have a look at the build recipes of the [XCONFIGURE](http://xconfigure.readthedocs.io/) project. ## What Operating Systems are covered by LIBXSMM, and what about Microsoft Windows? The answer here focuses on the actual runtime support rather than the supported compiler tool chains used to build the library. All flavors of Linux are supported (if the library was successfully built), which includes installations running a security-hardened Linux kernel (SELinux). The Apple OS (OSX) is supported, which also includes more recent SIP-enabled versions (System Integrity Protection). The BSD OS is likely supported, but building the library is only occasionally validated. Microsoft Windows is supported for non-JIT operation, and for most (e.g., GEMM and MATCOPY) of the JIT-kernels (prefetch signature is not supported). There is currently no support for JIT in the DNN domain (no further check is performed i.e., crash at runtime). See also [issue #71](https://github.com/hfp/libxsmm/issues/71). ## Does LIBXSMM has some support for GEMV? 
The library generates acceptable code when using `M=1` or `N=1`. For example, building with `make M=16 N=1 K=16 AVX=2` and inspecting the assembly (build directory) or dumping/disassembling the JIT code (see reference documentation) shows the minimum number of load/store instructions. Given that GEMV is a memory bound operation, this suggests reasonable code quality. LIBXSMM selects from multiple microkernels (specific for each ISA extension) by using a fixed scheme/heuristic, which should be acceptable for GEMV. The sample code under [samples/smm](https://github.com/hfp/libxsmm/blob/master/samples/smm) provides ready-to-use benchmark drivers that can help to compare the performance with LAPACK/BLAS. Afore mentioned benchmarks exercise streaming all possible combinations of operands. ## What about complex and mixed types? This question refers to the following kind of element type of the GEMM interface of LIBXSMM: * Complex types: complex numbers in single and double-precision, * Mixed types: e.g. real double-precision and complex double-precision There are no (immediate) plans to support more types for the GEMM part. Please note, that LIBXSMM indeed supports lower precision GEMM (wgemm). ## What about voting for features? All feedback and [issue reports](https://github.com/hfp/libxsmm/issues) are handled openly, are welcome and considered ([answered](https://github.com/hfp/libxsmm/issues?q=is%3Aissue+is%3Aclosed), and [collected](https://github.com/hfp/libxsmm/wiki/Development#longer-term-issues)). However, we do not seek for "feature votes" since the development of the library is not a democratic process. ## \ What is the purpose of ROW_MAJOR vs. COL_MAJOR? This build configuration is deprecated ([issue 85](https://github.com/hfp/libxsmm/issues/85)), otherwise there is nothing one cannot achieve with row-major as opposed to column-major storage order. In particular the choice is not about whether a program is written in C/C++ or in FORTRAN. 
The ROW_MAJOR setting is just offered for existing code, which calls into function(s) that assume row-major storage order and where these calls are to be replaced by LIBXSMM in a "1:1 fashion". It is encouraged to avoid the ROW_MAJOR setting since BLAS implies COL_MAJOR (and LIBXSMM is supposed to be compatible with BLAS). [More...](https://github.com/hfp/libxsmm/issues/80) libxsmm-1.17/documentation/libxsmm_samples.md000066400000000000000000000750341415223013700215160ustar00rootroot00000000000000# [LIBXSMM Samples](https://github.com/hfp/libxsmm/raw/master/documentation/libxsmm_samples.pdf) ## CP2K Artificial Benchmark The first code sample given for LIBXSMM was a performance reproducer exercising the same set of kernels usually generated for CP2K's SMM library. The code sample attempted to model the way "matrix stacks" are processed in CP2K, however there are two different code paths in CP2K: (1) the "main" code path used when processing stacks on the host-side, and (2) a code path targeting offload devices. Beside of the host-sided parallelization via MPI (and perhaps OpenMP), the secondly mentioned code path relies on an additional level of parallelization (which is obviously necessary to drive a potentially highly parallel offload device). Also, the additional level of parallelism is not exactly "nested" in the sense that it participates on sharing the same resources as the host-side. In fact, this "artificial benchmark" (cp2k code sample) is modeling a code path as utilized in the secondly mentioned case (offload device). ## Hello LIBXSMM This example is focused on a specific functionality but may be considered as "Hello LIBXSMM". 
Copy and paste the example code and build it either manually and as described in our [main documentation](https://libxsmm.readthedocs.io/#hello-libxsmm) (see underneath the source code), or use GNU Make: ```bash cd /path/to/libxsmm make cd /path/to/libxsmm/samples/hello make ./hello ``` Alternatively, one can use the Bazel build system. To further simplify, [Bazelisk](https://github.com/bazelbuild/bazelisk) is used to boot-strap [Bazel](https://bazel.build/): ```bash cd /path/to/libxsmm/samples/hello bazelisk build //... ./bazel-bin/hello ``` The [C/C++ code](https://github.com/hfp/libxsmm/blob/master/samples/hello/hello.cpp) given here uses LIBXSMM in header-only form (`#include `), which is in contrast to the code shown in the [main documentation](https://libxsmm.readthedocs.io/#hello-libxsmm). The [Fortran code](https://github.com/hfp/libxsmm/blob/master/samples/hello/hello.f) (`hello.f`) can be manually compiled like `gfortran -I/path/to/libxsmm/include hello.f -L/path/to/libxsmm/lib -libxsmmf -lxsmm -lxsmmnoblas -o hello` or as part of the above described invocation of GNU Make. ## Magazine ### Overview This collection of code samples accompany an article written for [issue #34](https://software.intel.com/sites/default/files/parallel-universe-issue-34.pdf) of the magazine [The Parallel Universe](https://software.intel.com/en-us/download/parallel-universe-magazine-issue-34-october-2018), an Intel publication. The articles focuses on Blaze-, Eigen-, and LIBXSMM-variants of Small Matrix Multiplications (SMMs). The set of sample codes now also includes a variant relying on BLAS and a variant that showcases LIBXSMM's explicit batch-interface. The baseline requirements are libraries that can operate on column-major storage order, "zero copy" when using existing memory buffers, and an API that is powerful enough to describe leading dimensions. Typically a library-internal parallelization of matrix multiplication is desired. 
However, for the magazine sample collection there is no performance gain expected since the matrices are small, and nested parallelism may only add overhead. Hence library-internal parallelism is disabled (BLAZE_USE_SHARED_MEMORY_PARALLELIZATION=0, EIGEN_DONT_PARALLELIZE). LIBXSMM provides parallelization on a per-functions basis and no global toggle is needed. The sample codes rely on the minimum programming language supported by the library in question (API): C++ in case of Blaze and Eigen, and C in case of LIBXSMM (both C++ and Fortran interfaces are available as well). For Blaze and Eigen, the build-system ensures to not map implementation into a BLAS library (normally desired but this would not test the library-native implementation). ### Results To reproduce or repeat the performance measurements on a system of choice, all matrix operands are streamed by default. The file [magazine.h](https://github.com/hfp/libxsmm/blob/master/samples/magazine/magazine.h) can be edited to reproduce the desired combination (STREAM_A, STREAM_B, and STREAM_C). Whether or not matrix operands are streamed is motivated in publication. To reduce dependency on the compiler's OpenMP implementation, the benchmarks run single-threaded by default (`make OMP=1` can parallelize the batch of matrix multiplications). The outer/batch-level parallelization is also disabled to avoid accounting for proper first-touch memory population on multi-socket systems (NUMA). For the latter, the init-function (located in magazine.h) is not parallelized for simplicity. ```bash cd libxsmm; make cd samples/magazine; make ``` To run the benchmark kernels presented by the article: ```bash ./benchmark.sh ``` Please note that if multiple threads are enabled and used, an appropriate pin-strategy should be used (OMP_PLACES=threads, OMP_PROC_BIND=TRUE). 
To finally produce the benchmark charts: ```bash ./benchmark-plot.sh blaze ./benchmark-plot.sh eigen ./benchmark-plot.sh xsmm ``` The plot script relies at least on Gnuplot. ImageMagick (mogrify) can be also useful if PNGs are created, e.g., `./benchmark-plot.sh xsmm png 0` (the last argument disables single-file charts in contrast to multi-page PDFs created by default, the option also disables chart titles). The set of kernels executed during the benchmark can be larger than the kernels presented by the plots: [benchmark.set](https://github.com/hfp/libxsmm/blob/master/samples/magazine/benchmark.set) selects the kernels independent of the kernels executed (union). ## NEK Sample Collection This directory contains kernels taken from Nek{Box,5000}. They aim to represent most of the matrix-matrix workloads. Please note that the [mxm_std.f](https://github.com/hfp/libxsmm/blob/master/samples/nek/mxm_std.f) source code is protected by an (US) GOVERNMENT LICENSE, and under the copyright of the University of Chicago. ### stpm Small tensor-product multiple (stpm) replicates the axhelm kernel, which computes the Laplacian with spectral elements. Usage: ```bash ./stpm m n k size1 size ``` The elements are m-by-n-by-k, mode picks the LIBXSMM interface used, and size scales the number of spectral elements. ### rstr Restriction operator transforms elements from one size to another. This occurs in multi-grid, the convection operator, and, when the sizes are the same, the local Schwarz solves. Usage: ```bash ./rstr m n k mm nn kk size1 size ``` The input elements are m-by-n-by-k and the output elements are mm-by-nn-by-kk. When m=mm, n=nn, k=kk, this half of a Schwarz solve. 
## SMM Sample Collection This collection of code samples exercises different memory streaming cases when performing the matrix multiplication *C~m x n~ = alpha · A~m x k~ · B~k x n~ + beta · C~m x n~*: (1) streaming the matrices A, B, and C which is usually referred as batched matrix multiplication, (2) streaming the inputs A and B but accumulating C within cache, (3) streaming the A and C matrices while B is kept in cache, (4) streaming the B and C matrices while A is kept in cache, and (4) not streaming any of the operands but repeating the very same multiplication until the requested number of matrix multiplications has been completed. Beside of measuring the duration of a test case, the performance is presented in GFLOPS/s. As an alternative metric, the memory bandwidth is given (the artificial "cached" case omits to present the cache-memory bandwidth). The "pseudo-performance" given in FLOPS/cycle is an artificial scoring, it not only uses a non-standard formula for calculating the FLOPS (*2 \* M \* N \* K - M \* N* rather than *2 \* M \* N \* K*) but also relies on (pseudo-)clock cycles: ``` $ ./specialized.sh 0 m=32 n=32 k=32 size=87381 memory=2048.0 MB (DP) Batched (A,B,C)... pseudo-perf.: 10.7 FLOPS/cycle performance: 23.9 GFLOPS/s bandwidth: 11.1 GB/s duration: 239 ms Finished ``` There are two sub collections of samples codes: (1) a collection of C++ code samples showing either BLAS, Compiler-generated code (inlined code), LIBXSMM/dispatched, LIBXSMM/specialized functions to carry out the multiplication, and (2) a Fortran sample code showing BLAS versus LIBXSMM including some result validation. **C/C++ Code Samples: Command Line Interface (CLI)** * Takes an optional number (1st arg.) 
to select the streaming-case (0...8) * Optionally takes the M, N, and K parameter of the GEMM in this order * If only M is supplied, the N and K "inherit" the M-value * Example I (A,B,C): ./specialized.sh 0 16 8 9 * Example II (A,B): ./specialized.sh 6 16 **Fortran Code Sample: Command Line Interface (CLI)** * Optionally takes the M, N, and K parameter of the GEMM in this order * Optional problem size (in MB) of the workload; M/N/K must have been supplied * Optional total problem size (in MB) implying the number of repeated run * If only M is supplied, the N and K are "inheriting" the M-value * Shows the performance of each of the streaming cases * Example I: ./smm.sh 16 8 9 1024 16384 * Example II: ./smm.sh 16 ## SPECFEM Sample This sample contains a dummy example from a spectral-element stiffness kernel taken from [SPECFEM3D_GLOBE](https://github.com/geodynamics/specfem3d_globe). It is based on a 4th-order, spectral-element stiffness kernel for simulations of elastic wave propagation through the Earth. Matrix sizes used are (25,5), (5,25) and (5,5) determined by different cut-planes through a three dimensional (5,5,5)-element with a total of 125 GLL points. ### Usage Step-by-Step This example needs the LIBXSMM library to be built with static kernels, using MNK="5 25" (for matrix size (5,25), (25,5) and (5,5)). 
#### Build LIBXSMM ##### General Default Compilation In LIBXSMM root directory, compile the library with: ```bash make MNK="5 25" ALPHA=1 BETA=0 ``` ##### Additional Compilation Examples Compilation using only single precision version and aggressive optimization: ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 ``` For Sandy Bridge CPUs: ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 ``` For Haswell CPUs: ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=2 ``` For Knights Corner (KNC) (and thereby creating a Sandy Bridge version): ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 \ OFFLOAD=1 KNC=1 ``` Installing libraries into a sub-directory workstation/: ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 \ OFFLOAD=1 KNC=1 \ PREFIX=workstation/ install-minimal ``` #### Build SpecFEM example code For default CPU host: ```bash cd sample/specfem make ``` For Knights Corner (KNC): ```bash cd sample/specfem make KNC=1 ``` Additionally, adding some specific Fortran compiler flags, for example: ```bash cd sample/specfem make FCFLAGS="-O3 -fopenmp" [...] ``` Note that steps 1 and 2 could be shortened by specifying a "specfem" make target in the LIBXSMM root directory: ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 specfem ``` For Knights Corner, this would need two steps: ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 OFFLOAD=1 KNC=1 make OPT=3 specfem_mic ``` ### Run the Performance Test For default CPU host: ```bash ./specfem.sh ``` For Knights Corner (KNC): ```bash ./specfem.sh -mic ``` ### Results Using Intel Compiler suite: icpc 15.0.2, icc 15.0.2, and ifort 15.0.2. 
#### Sandy Bridge - Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz Library compilation by (root directory): ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 ``` Single threaded example run: ```bash cd sample/specfem make; OMP_NUM_THREADS=1 ./specfem.sh ``` Output: ```bash =============================================================== average over 15 repetitions timing with Deville loops = 0.1269 timing with unrolled loops = 0.1737 / speedup = -36.87 % timing with LIBXSMM dispatch = 0.1697 / speedup = -33.77 % timing with LIBXSMM prefetch = 0.1611 / speedup = -26.98 % timing with LIBXSMM static = 0.1392 / speedup = -9.70 % =============================================================== ``` #### Haswell - Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz Library compilation by (root directory): ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=2 ``` Single threaded example run: ```bash cd sample/specfem make; OMP_NUM_THREADS=1 ./specfem.sh ``` Output: ```bash =============================================================== average over 15 repetitions timing with Deville loops = 0.1028 timing with unrolled loops = 0.1385 / speedup = -34.73 % timing with LIBXSMM dispatch = 0.1408 / speedup = -37.02 % timing with LIBXSMM prefetch = 0.1327 / speedup = -29.07 % timing with LIBXSMM static = 0.1151 / speedup = -11.93 % =============================================================== ``` Multi-threaded example run: ```bash cd sample/specfem make OPT=3; OMP_NUM_THREADS=24 ./specfem.sh ``` Output: ```bash OpenMP information: number of threads = 24 [...] 
=============================================================== average over 15 repetitions timing with Deville loops = 0.0064 timing with unrolled loops = 0.0349 / speedup = -446.71 % timing with LIBXSMM dispatch = 0.0082 / speedup = -28.34 % timing with LIBXSMM prefetch = 0.0076 / speedup = -19.59 % timing with LIBXSMM static = 0.0068 / speedup = -5.78 % =============================================================== ``` #### Knights Corner - Intel Xeon Phi B1PRQ-5110P/5120D Library compilation by (root directory): ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 OFFLOAD=1 KNC=1 ``` Multi-threaded example run: ```bash cd sample/specfem make FCFLAGS="-O3 -fopenmp -warn" OPT=3 KNC=1; ./specfem.sh -mic ``` Output: ```bash OpenMP information: number of threads = 236 [...] =============================================================== average over 15 repetitions timing with Deville loops = 0.0164 timing with unrolled loops = 0.6982 / speedup = -4162.10 % timing with LIBXSMM dispatch = 0.0170 / speedup = -3.89 % timing with LIBXSMM static = 0.0149 / speedup = 9.22 % =============================================================== ``` ## Matrix Transpose (TCOPY) ### Overview This code sample aims to benchmark the performance of matrix transposes. The C/C++ and [FORTRAN sample code](https://github.com/hfp/libxsmm/blob/master/samples/transpose/transpose.f) differ slightly with the C/C++ code sample offering a richer set of command line options as well as build settings available inside of the [translation unit](https://github.com/hfp/libxsmm/blob/master/samples/transpose/transpose.c). The available command line options of the sample code may be reviewed by looking into the source code. Generally, the idea is to support the following: > transpose [<kind> [<m> [<n> [<ldi> [<ldo>]]]]] transposef [<m> [<n> [<ldi> [<ldo>]]]] Above, `m` and `n` specify the matrix shape, and `ldi` the leading dimension of the matrix. 
The argument `ldo` allows to specify an output dimension, which may differ from `ldi`. The transpose kind shall be either out-of-place (`o`) or in-place (`i`). Running the C sample code may look like: ```bash $ ./transpose.sh o 20000 m=20000 n=20000 ldi=20000 ldo=20000 size=3052MB (double, out-of-place) bandwidth: 18.8 GB/s duration: 159 ms ``` Instead of executing a wrapper script, one may affinitize the multi-threaded execution manually (OpenMP runtime). In case of an executable built using the Intel Compiler this may look like: ```bash LIBXSMM_VERBOSE=2 KMP_AFFINITY=balanced,granularity=fine,1 \ ./transpose o 20000 m=20000 n=20000 ldi=20000 ldo=20000 size=3052MB (double, out-of-place) bandwidth: 21.1 GB/s duration: 141 ms Registry: 20 MB (gemm=0 mcopy=0 tcopy=1) ``` In the above case one can see from the verbose output (`LIBXSMM_VERBOSE=2`) that one kernel (tcopy) served transposing the entire matrix. To avoid duplicating JIT-kernels under contention (code registry), one may also consider `LIBXSMM_TRYLOCK=1`, which is available per API-call as well. ### OpenTuner To tune the tile sizes ("block sizes") internal to LIBXSMM's transpose routine, the [OpenTuner](http://opentuner.org/) extensible framework for program autotuning can be used. In case of issues during the tuning phase ("no value has been set for this column"), please install the latest 1.2.x revision of SQLAlchemy (`pip install sqlalchemy==1.2.19`). A tuning script (`transpose_opentuner.py`) is provided, which accepts a range of matrix sizes as command line arguments. > transpose_opentuner.py <begin> <end> [*nexperiments-per-epoch*] [*tile-size-m*] [*tile-size-n*] To start a tuning experiment for a new set of arguments, it is highly recommended to start from scratch. Otherwise the population of previously generated tuning results is fetched from a database and used to tune an eventually unrelated range of matrix shapes. 
To get reliable timings, the total time for all experiments per epoch is minimized (hence a different number of experiments per epoch also asks for an own database). Optionally, the initial block size can be seeded (`tile-size-m` and `tile-size-n`). ```bash rm -rf opentuner.db ``` The script tunes matrices with randomized shape according to the specified range. The leading dimension is chosen tightly for the experiments. The optimizer not only maximizes the performance but also minimizes the value of *M \* N* (which also helps to prune duplicated results due to an additional preference). ```bash rm -rf opentuner.db ./transpose_opentuner.py --no-dups 1 1024 1000 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 1024 2048 100 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 2048 3072 20 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 3072 4096 20 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 4096 5120 16 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 5120 6144 12 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 6144 7168 8 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 7168 8192 6 ``` The tuning script uses the environment variables `LIBXSMM_TCOPY_M` and `LIBXSMM_TCOPY_N`, which are internal to LIBXSMM. These variables are used to adjust certain thresholds in `libxsmm_otrans` or to request a specific tiling-scheme inside of the `libxsmm_otrans_omp` routine. ## XGEMM: Tiled GEMM Routines ### Overview This sample code calls the `libxsmm_?gemm_omp` routines provided by the LIBXSMM extension library (`libxsmmext`). These routines are meant for big(ger) xGEMM routines, and thereby provide an OpenMP-based parallelization. The driver program (`xgemm.c`) currently accepts all typical GEMM arguments (except for the transposition specifier): `m`, `n`, `k`, `lda`, `ldb`, `ldc`, `alpha`, and `beta`. All arguments are optional (or will inherit defaults from previously specified arguments). 
Matrix transposition as part of the `libxsmm_?gemm_omp` routines will become available in an upcoming release of LIBXSMM. Please also note that unsupported Alpha or Beta values will cause a fall back to the related BLAS routine. The single-precision matrix multiplications require to change the `ITYPE` in `xgemm.c`. ```bash ./xgemm.sh 2000 ``` ### OpenTuner To tune the tile sizes ("block sizes") internal to LIBXSMM, the [OpenTuner](http://opentuner.org/) extensible framework for program autotuning can be used. In case of issues during the tuning phase ("no value has been set for this column"), please install the latest 1.2.x revision of SQLAlchemy (`pip install sqlalchemy==1.2.19`). A tuning script (`xgemm_opentuner.py`) is provided, which optionally accepts a list of grouped parameters as command line arguments. The syntax of the arguments is per LIBXSMM's `MNK` build-option, and expands to "triplets" specifying the matrix shapes. For instance, four matrix multiplications of square-matrices can be benchmarked and tuned using the following command. ```bash ./xgemm_opentuner.py 1024,1280,1536,1792 ``` To start a tuning experiment for a new set of arguments, it is highly recommended to start from scratch. Otherwise the population of previously generated tuning results is fetched from a database and used to tune an unrelated range of matrix shapes. Optionally, the initial block size can be seeded (`tile-size-m`, `tile-size-n`, and `tile-size-k`). ```bash rm -rf opentuner.db ``` The script tunes the geometric mean of the performance for each of the requested triplets. However, the optimizer not only maximizes the performance but also minimizes the value of *M \* N \* K* (which also helps to prune duplicated results due to an additional preference). As a limitation of the current implementation, the multiplication kernels are not accompanied by copy-kernels (and not accompanied by transpose kernels). 
This negatively impacts performance on power-of-two matrix shapes (POT) due to thrashing the LLC.
ResNet-50 can run with "dummy data", raw JPEG image data or with LMDB. Filenames indicate the data source along with the minibatch size. Inception v3 runs only with compressed LMDB data. The hyperparameter definitions for each topology are also in the corresponding directory under "model_zoo" in a .prototxt file with the suffix "solver". For a single-node, this file is called solver.prototxt. For multi-node the filename also contains the global minibatch size (=single node minibatch size x number of nodes), e.g., solver_896.prototxt contains hyperparameters for MB=56 per node and 16 nodes. The "solver*" file also contains a flag that specifies whether to start execution from a checkpoint (and thus load weights from the "./weights" directory) or from scratch; by default execution starts from scratch. Optimal parallelization of Convolutional layers in LIBXSMM happens when the number of OpenMP threads = MiniBatch. Therefore, on Xeon ```bash export OMP_NUM_THREADS= export KMP_AFFINITY=compact,granularity=fine,1,0 ``` The command line for a training run is: ```bash ./build/bin/gxm train ``` For example: ```bash ./build/bin/gxm train model_zoo/resnet/1_resnet50_dummy_56.prototxt model_zoo/resnet/solver.prototxt ``` ### Prepping on RHEL 8.0 / CentOS 8.0 ```bash dnf install protobuf wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/protobuf-compiler-3.5.0-7.el8.x86_64.rpm dnf install protobuf-compiler-3.5.0-7.el8.x86_64.rpm wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/protobuf-devel-3.5.0-7.el8.x86_64.rpm dnf install protobuf-devel-3.5.0-7.el8.x86_64.rpm dnf install lmdb dnf install lmdb-devel wget http://repo.okay.com.mx/centos/8/x86_64/release/opencv-devel-3.4.1-9.el8.x86_64.rpm wget http://repo.okay.com.mx/centos/8/x86_64/release/opencv-3.4.1-9.el8.x86_64.rpm dnf install opencv-3.4.1-9.el8.x86_64.rpm dnf install opencv-devel-3.4.1-9.el8.x86_64.rpm wget 
http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/gflags-2.1.2-6.el8.x86_64.rpm wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/gflags-devel-2.1.2-6.el8.x86_64.rpm dnf install gflags-2.1.2-6.el8.x86_64.rpm dnf install gflags-devel-2.1.2-6.el8.x86_64.rpm wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/glog-devel-0.3.5-3.el8.x86_64.rpm wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/glog-0.3.5-3.el8.x86_64.rpm dnf install glog-0.3.5-3.el8.x86_64.rpm dnf install glog-devel-0.3.5-3.el8.x86_64.rpm ``` Make sure that the makefile follows the OpenCV Ver 3 path! ## Xsmm LSTM This code may be integrated with Tensorflow to make use of LIBXSMM's LSTM. Support for creating a Python wheel and a pip package can be found in the [directory](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/tf_lstm_ops) as well. ## Dispatch ### Microbenchmark This code sample benchmarks the performance of (1) the dispatch mechanism, and (2) the time needed to JIT-generate code for the first time. Both mechanisms are relevant when replacing GEMM calls (see [Call Wrapper](https://libxsmm.readthedocs.io/libxsmm_mm/#call-wrapper) section of the reference documentation), or in any case of calling LIBXSMM's native [GEMM functionality](https://libxsmm.readthedocs.io/libxsmm_mm/). **Command Line Interface (CLI)** * Optionally takes the number of dispatches/code-generations (default: 10000). * Optionally takes the number of threads (default: 1). **Measurements (Benchmark)** * Duration of an empty function call (serves as a reference timing). * Duration to find an already generated kernel (cached/non-cached). * Duration to JIT-generate a GEMM kernel. In case of a multi-threaded benchmark, the timings represent a highly contended request (worst case). 
For thread-scaling, it can be observed that read-only accesses (code dispatch) stay roughly with a constant duration whereas write-accesses (code generation) are serialized and hence the duration scales linearly with the number of threads. The [Fortran example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/dispatch/dispatch.f) (`dispatch.f`) could use `libxsmm_dmmdispatch` (or similar) like the C code (`dispatch.c`) but intentionally shows the lower-level dispatch interface `libxsmm_xmmdispatch` and also omits using the LIBXSMM module. Not using the module confirms: the same task can be achieved by relying only on FORTRAN 77 language level. ### User-Data Dispatch Further, another [Fortran example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/dispatch/dispatch_udt.f) about [user-data dispatch](https://libxsmm.readthedocs.io/libxsmm_aux/#user-data-dispatch) is not exactly a benchmark. Dispatching user-data containing multiple kernels can obviously save multiple singular dispatches. The C interface for dispatching user-data is designed to follow the same flow as the Fortran interface. ## MHD Image I/O This code sample aims to provide a simple piece of code, which takes an image and produces a visual result using LIBXSMM's MHD image file I/O. Performing a single convolution is *not* a showcase of LIBXSMM's Deeplearning as the code only runs over a single image with one channel. LIBXSMM's CNNs are vectorized over image channels (multiple images) according to the native vector-width of the processor and otherwise fall back to a high-level implementation. **Note**: For high-performance deep learning, please refer to the collection of [CNN layer samples](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/cnnlayer). 
The executable can run with the following arguments (all arguments are optional): > mhd [<filename-in> [<nrepeat> [<kw> [<kh>] [<filename-out>]]]] For stable timing (benchmark), the key operation (convolution) may be repeated (`nrepeat`). Further, `kw` and `kh` can specify the kernel-size of the convolution. The `filename-in` and `filename-out` name MHD-files used as input and output respectively. The `filename-in` may be a pseudo-file (that does not exist) but specifies the image resolution of generated input (`w`[x`h`] where the file `wxh.mhd` stores the generated image data). To load an image from a familiar format (JPG, PNG, etc.), please have a look at [Meta Image File I/O](https://libxsmm.readthedocs.io/libxsmm_aux/#meta-image-file-io). ## Scratch Memory Allocation (Microbenchmark) This code sample aims to benchmark the performance of the scratch memory allocation. This facility is a viable option to satisfy the need for temporary memory when using the DNN domain of LIBXSMM (small convolutions). Although any kind of readable/writable buffer can be bound to a convolution handle, LIBXSMM's `libxsmm_aligned_scratch` features a thread-safe linear allocator mechanism which can help to lower allocation overhead. ## Wrapped DGEMM This code sample is calling DGEMM and there is no dependency on the LIBXSMM API as it only relies on the LAPACK/BLAS interface. Two variants are linked when building the source code: (1) code which is dynamically linked against LAPACK/BLAS, (2) code which is linked using `--wrap=`*symbol* as possible when using a GNU GCC compatible tool chain. For more information, see the [Call Wrapper](https://libxsmm.readthedocs.io/libxsmm_mm/#call-wrapper) section of the reference documentation. 
The same (source-)code will execute in three flavors when running `dgemm-test.sh`: (1) code variant which is dynamically linked against the originally supplied LAPACK/BLAS library, (2) code variant which is linked using the wrapper mechanism of the GNU GCC tool chain, and (3) the first code but using the LD_PRELOAD mechanism (available under Linux). **Command Line Interface (CLI)** * Optionally takes the number of repeated DGEMM calls * Shows the performance of the workload (wall time) libxsmm-1.17/documentation/libxsmm_samples.pdf000066400000000000000000006442661415223013700217000ustar00rootroot00000000000000%PDF-1.5 % 15 0 obj << /Length 3351 /Filter /FlateDecode >> stream xkܶ{~!ZīH=9=l#mjwJQ:;/j=)P _$rH'mf쇯/|uBg0uz~Y*_)fx|{WUiۓxiqLHB~b[wFvv@GKSTIen gaAI `⡣x*`m u!v e3_[ɘU5N), ʲ^ ܷY,WlƚzK Ζ5愽.+ˀEU= mhOӜ)-rŸ;|#^enۢ}g(  l-r+;TLAvEך+8Ξ V15SoYZ63@ *f"eXm(vp<N6 X!jXic_t[MSq'vgSE荐r%S9 :B:# '"+ bʵ6[X*vQ/QyDPr W0tLʑa+M`M!V彁s%:x؟$<+C bPVSAݛG^ g~$$T ]/7S I2ĶP}oɻ% ˦^U ciJ8dxCeJm:< _ "!JxUDBvL}>l le@qfSH؏3%&.͑M P%ɦayq_h ;ZE[ z$DBO&`*Q(ċ7, vy?!!q8QX{7s`s}2XZ|$ػSc#iAZخ-Ob?NI(oK0e]< !vIߊ ږ!s^3- b< Sۗ@o|b~oI_C[ i~Z) 4GBg] E ~;!\91o .Gr%1٢"%bwep}s_-&:WA c۪*z\KdWUI0Pɢ麲voutj;λqRDuy/%UN@"_l/]rn8_M DYFT0.e^m`) x'(B_/:*E r1"'cOiE\T{qja3ud}p1 ]!I Ǩז- 6"ByZr.F @E'x>QŮ FökbS>>\3B)3r[A8;p Jjxy赔:%2h?y(fP6r-yN>Z %C+ .北KIj[l8Rj^^crs"OG'h0rp*P$AU<Q|Xh8 =ؠ ^4t&qڜB|IyB,t8> +Y㜈k22Db,tzGj)J׮*ᤪJg[r:ϗzwˋ/͏,-.~xػz5ӗW?\~7o^yc5bĨoV&i^cKZSK\HT3[S5Rfĕ3.KNsa8'x5*Ch!Z⎇F7ّ7cbT@]'Jɜ~/%ErLQSMh|z;Yrc:} c raCR%D CiqM90C5#Rgf۸1t @ c*ɡ^,'Bِq ^U΍AQ:#E !h T׹(dCrʼn*`W칁=6 z>F<Qk԰ 旳!/ݨv$ Ig\(ܙ*m7iF;EE[`qjp6_`#8 Oν?+/ endstream endobj 56 0 obj << /Length 2777 /Filter /FlateDecode >> stream xYYۺ~o}i1I63dmhddf8CѾP!yH#. 
wjˋA\'s3o,I`zb^f}Pzz}՟ǩ0ϛs\/w!0Z$iQXbuV{a~buAOP(= dлMyc*Cj\@ sS?^ҵowmY%yߕ̠5Zx,NZȇ (PD$ ( gxN, OCZ y6."oTuޠޣa|b} 3JU5llȽ_}~y咹JՒGH `Yr"(q(b)ȴ+985Sn dbLD ,Rbz\jm_~b+ǮCUtVRMF(,1Z5œz0t|7c =ـދ57w<hشK< :b f[)߁,l8 'Y %Qx#w:Yr0du}E~dfaќ)7ށ|VF׉Cz{7;ֱN^ed8uUήg*>nи WںʺBF ҙ#gږk!E; .!= 4^xnBoC%_)tvC 9 Z aF9@5pBPH%bJ/?8l#rO:%[Q(?)MfLʊ6qS/hSY`m̉L N0Ll/uǐlcx a*ba2=;Rǖ5+9tJcD lC?uw}YT礊"-:ZCgD QAPQ:E 'ݏޓC|Alw@+GRZ 'D`rIќm4O|T {j n$Hƭ{y(I96dtmwP `YCADq2߉`C M}{wu})^*+j}Y$adqOny[>o7o TT6TTD"ÁzG%,Cav/Q**[*{ʰ$MiĆ393_&mml2ĸ}.[8'y KTհ[AVkfH5_]< )([\omD-Px2fq g}ٕ{LʻPEI@F})q 'rZuNa>d/3WZY5s`| "<t@$O@73h5'P0+|W!@=BWmٍQ9.g0 XN~sԒQ&/_oȑ i|^]i,O08_BuwskG|L=:N~DI"C9Ke(#(C0hcAEq09F1Idh0\-a}rBC7Ŀmp?V2g@vϰ<'}/ @Ts ڼμVرb GDes$]|}d6t_F%j.~/ qj3O97GIf9+uH*) /> stream x]o=B0@5Wr@(hSBR9K]K_ܝ]nM=d"D#'׷VDPJ,'?Wәmp7e882_tE]Mu%i DL4[Mt5^U>>׷>A=4X i ZӟfQ,~ʛ$u<[mWV};nJr)>>-bS,V{bHCN4_Q0#zJa'~=D: U[YYenw3|oF_dld3x %BƊP)CʉR_D6zL5 Sa9 8o= ^= 0VC m{bMu&jowә6%2FMoi-miC~39 ( p {{n|AMyWƆ&&/h~ q*9Y$110]Tm2A%t vnbغmaʡ:ɇ[_d RH$I C,DD:cpb0*J~Ώ#EQ DuT;Jaz&Fhq-9Z"%H?eC"$ɾMnr61xhqrR.Ҡ; YaFQ|!!شe>tMIqx!b:8K` i_yۑ#:P\LzU.)w&T:d=8,y[`Ơ(ӑ, k I %vWj8h @ g`]UWFl؀DZf8V~# ǜ'*yuZO.2T 2 5NŲ[:̖wkNCTb0ҋ"+r!("sHA^.+66[#ǞR-?H2cTb|Fl|gTNHbĬb'qQ:L@$ vQZg@]qQr?p5K lYxpܱ89EV.v A:v5 B[ir\2)e.#|`ģ.G}x!cH! #3,sE8ʪ7eR/zu 5c육mͽ&/ :s>yۈSh>9%9 t}ThiqJJ! NLg(t+6 ` ׀)3fԖgJBOH AĪ9L7`Pa4PXՋ3Ϥ_*h/am Z5$"BZ`SچGLe%a_aM =ܦ=̉#?@B*tgkaY6sBQQvE$y[kb{`DD/SAM8js"sF!Ab-KjQ Ar3F"0ک;D='D28#/MD+)Nx,hf_R?wb`sPښچjW.1@Vϩ]n^Axo$ 0~>noej@I+@axTjg=2߻5+eSVW*\ꐻ'B%^alH *N1my)1av|B^]^-vC/˽eX{۪3 )l'*iI0jBI(pWjTn+ӬitZIN@8Eu\g=ěv9'])- Gk`meY,9ΧcqDj6#-e*np |;* ڀf 8mWSL!;8Ak`x6R!SH&WT0E˯2.}\$ J'T%ƍxYbD.Z2Cm&+:_ e̗(+ 0ycxwGx@}CG!S h9Ȭu(;rNS+f7ߜq٣}]8^nٙ v5TsE~xz'3m|?8ߩUOO%sZOV} 99Lx^`|Еt\'#KͳwgwKk:9'Sc'1h1\Ԋ0 1R2P O'g_5aTW(9JJΊfXnI䑇KM-,rzp-*oSR%;BWCN'? 
D{K:]J$I>1xހVKיnXՌ!=9vĨ 1`0ƫo^Y>trM)>q2d Sɰco1&߼NSoV~Wg kB8N[\p\H3s~|O(͡Ci| "XKL֮j "Ŷmʬڥ)]%Y%rgΦ7*ۦEhM _EGw@wNHÀC[ m0Komv; r3j U!o|َZW&(&YQ |`zѡmdݙE{ȳ۶WO?Ks@> stream xYn7}WK`./!YrA-)0KZAj|}Ϭ$vԆv ƹ2+o) )V!U/'5᢬㨬U؎,n1!+2QFaCgrN9kc%aM s4r+ z `:(A Ѐxލ*06C2"-B%b"_"1% :p(#FwH| l$ ~1xE,f% ;b[hktKk=LF0+!GǪ:QVUԩQyL=zǣUE3˾YAeӵ$L;4YVFbALNl>k0ee vjVWMᚆkl]ܰ90UOY")g"W6iB2IK`tBh"RM;ٌ{~-I?kI˟}꾯YI.W|6j]u7jN6 (_nz7^MϿ&) N88867uSOwzV\6y{tyC^rNZR^[9Q\kl>~uWˉҭ[MBsHo_*8eBd@: `;Ƚj'؈B9hrm=/\ѨѷzH+&?$A8RIK!(0_Bמo a>z>h}7y[Ow/wIߎr oUb$4k|~7ȼEnkF[Ԕ#cN: $T7#Cҕ h $@Գ"$;-q Oe)@B#vwY# ΖͲvh?(\n,c)Az *o`TҠ"c"T mz`%HO9 %scԁ݅8#nՋu;9iP0<{Wu 7?U3l Qi'1v0ax՘\j{q[Зt8ukDR6ɤGq\.a,ahXސ`.Z\c2wZ>ۈ;;(ʨ~:x 2_ue:veP}hH^'eP(&E' @qЊ.vBI)գ˛G[\# "Z%eP9E!0'A-C}:X)N-uyZ>Zk%-?(_4^&]խyݼz*;1Y1 b*"P7C"$> stream xXo6 _aIz$C&m P-ǧoY׏dǾ]M  E>HJ[x{m'{AqF3qo:xF0R.›μ䅬d$\Mٚ֫'ys WWo0M4JB'ׯGHڟԭT#n>iH.,mvR]69. .͚~1K;,C M&Y L#B C'Y@`iIJޘr c SD0E`T |@ y4%ȿDEW6/ilUsFUk#қM6us_?Si<%8xRc6ahS3; 8Q+%-9R+Wv=X4)7T(ɑ/;*xL&иl_!z;yoF3[N_eDGD~5gr;G]=JRf=Zݡ㣲8f4[a@g;)O(q7F0s+k%Ri,h5rT"-de݉o90C.1o_'~aaڂ%;bp|)|+nPISIS~(O (33ikԮ5]!h]i*1xTM(9Ǣ$2rۤLd:ZD@E~L43y)nLZ`LktMCFxqP1|*.Q'"r=z uZԆfqL`S zg0J&jh!Aňb9#)+ A,F"u87^Xoؚh}=E#EDF!,3 w4lo5JdC[N4 !QzT|cJ1RU ӭ]6Ss>=Ȉ};됔۞tM}'}*pCVw>*We?:1Pӝ=k}&ƪآԞx(=t<0}]l2w{v;* ߮ ?~a]C%grC[l86 KcpP2( ᛾P}`g[~XP-P Ikx^N0~Ev% j᳻G} endstream endobj 167 0 obj << /Type /ObjStm /N 100 /First 876 /Length 1156 /Filter /FlateDecode >> stream xڭWˊ\G ߯/PW]0xċ$ c7!`fx y ^tϕT*Z>X+ǚ {,6"!Xxl,59]8a/Y|#pNt, >p9v&^N>T9zrK3BQ86f|6^.= s  %y/.yN(]_ԇ\W}0FyI) 3e+nhX2mɫ&THW/Lj=uvv7UT[O24qVob+4/6HTC˓usdͮ΀+rC ְ+t9I֬^݅f7$kC1q \cgl[|1Fa_35#gacFz)mM8iQp-)CԖMމ>v&X*zb+hzh{ZO{XUdm9]B= Bg6J{0l[pݬ^lXЄ2BSs6pa=Iv RI+GU),> stream xkoF{(4brAR %۸9F" Z,"#8yR,;vhy$|U~}UMl&-'2 xb ]p2Q!|3Y|$4`%k 5gSUB&B254NoKpM|@ lap6Md"ĹH` SM<D qe !Rf@a* k!iE X^EGvaʟC]>er'/[օI8`N4_Ar9M6%AaDg&(C2;Rb.@m/@\Q)x|ԂWR)"DAM AdDP)DqHۂ`BP~g0d*4UbEDr0DzH> Sф<kU;"F q A?af]&>'ZAp%9EiD6—"S4:mnNR_9MA/XÝ`0QKև|^)AW@{KCxYAf@2ΦQnsO_8:{qgh;N>J(M\`u)o>h),%"$ 
DMCOT58[JFS,rltXI]Ϳ47SY_%,fUs[/w홃Kc`:0<^|#ZN:yGݯөFsV؊Pcs[<>n LKL-&wu+`PrRh׌.;㟦1G^#,騁"芺j#ky%or=>%&F12i89ˆ -׫.JMRUMJQ$RMKv\%o9Ȫī2(W?RHb練,GgbOwmýQjPͫJ9IiK$(qvanw[4K . `Trq`LzμwxdV(rQgA_-źGpu2%ҁLt LyبhN>!ȉ{ɾg}Fo2'#CHoXa[@"+{K cq68K2nPШћlhM~, k72Bve-=-9X&4(4KqPT ::4?P}J1XH Mp:IP6:+[oͶvϖ8+t8croRoCSocei̘hҤJ1X!YUFnTi'z9۔[]IحWd1b?&>626! vbM: [λsd̄ ȫB*xe[UΥ Nj}?~ThHFcy3 S0üTQa2'J/ӘjOǔrk5,U+ǜsA*X-o=$Qj#FBXP o\rە|&Hʡ)C흔, !2 DQv@YO}lۮ^]Hyd.洝7Ŧ{XuH3p& ůXA)b֭ !wD}=MH&@:~Ý͖#lpt;;M9> ɹm{s;/(98h6ƙ?dUgg)U,UR*B#_<:z 6}XlJT<C|5G/UXXQΒY,|P),.e [୬{0p-Q Ɇ*ThhVHK(VXo^'vڐ1j?š-*I݌^/57n$x[ڻ7^rI#VWlE["bbqjC+$ĂJ7.O҄F[H|4`38I%YVN%yDYc>ߥG8'8 LXٯÏ/O}Tޗ*VrgԟaSWdwzC)_̆ٶQrHzW7%$"u ҠA@?d!pqb],KLwGg3>TM-o؜CG J<2P)PE50g2vS l;h[\ _3w~v?F]#U$v 0ٛټO؍+$!R`ĭ'\W~<5s Խ3PI!֗w56(Z]%Ě9|@2hLVk_?C哦8?xkıbuYkL}l5棯ʱ "K !/Иlٶu`MPG΃6_&n40a5AUs?#8rpPp{QJp'@Tr8]|s&*YE(ć `e9%g%k/x uӼ)@JbÏeVӧHHZ,іmgxDܽ8-=TFKMi}* hˇzWM"<1,2SGr2n`.3|ov+|NK^w#<]lG~ zd>8& }⢨ܣ͓=?QC(=ϛi v,F=XqbD{7 9/oeu? 9[BS: z>(bؘ&> stream x]=¸j"JD" fsf$Aksjd^~}狲r({(gʼnwhg_]׉Z(̣\-oв-oj*8^\o?2TEsW?e}mX6pmudm&A9ʀnyw/JSvޔw[j0aւSLof+?Wmѯ&XjƵed]oD'"mf ӄ솪˦ɉ={׺ ܗp5 $ \`yBFuCwAۄ8!!6Qu7mzͽC7.ՌK:)({$j(*CݺpEj0R5 UmQ!`E4ݦ@zeTL"w$Ѐ@}-a5$n|o6-,eЀ}p22 8'djsG)÷e N_"m\gd.ĵcOaKO`la nݯ݌Rنj vaIޡ|.@&8*ޕ_wŔ;aL9>IBd m 4ը P]aB_Vn .8v{} l<=OWU i+0f>]c4La&4sQ]S@ڎ0Y,8m|GLdJά l빍&iz({$Fn-l U!`F y6a/xA\% ։6ko>KM2kґzMLI5O9f%Y/Z 4̭^xt Zuh( k3CC5xLnYVhQ _3iDIEj09Os)G-h7(v7 $X\y+L-N7f0 d`oQQsUFcU Iir^9az\99Z&kG]inUS@fpu!&'j?y2Ȕ7KDxZY _)_DϘIjg)+5kH?I<ؑ3s'6},6= $s1Wt(tIGNy}:A: }x)E릟qtБN:hF# :&hﳟaq@tY֤'% u1\$AUTPl'y-c(cݷ~__չ{KbLa!y4mBCs*utItN4Ph%\Q41g5d桍ґխ\1^/J 웦Z ~OmFNUymo-ܖeb@ &eq&y ~sO+PV Nr[wՕkis0 diLJ|/qHK7I,ηNںa@fڅ0.hbcbNUFYLX&/ ("CrxKz9v&`2@R{ޗ±K֦$zWlwMsz=gBi;_s/wyr"3&hqTah։!3^nϳͶhe3uO "}s7h [E9qhRdptux o,Z\<R^m\F:SnO쳊EWoxleтd֗C쵭-#5,J[_ ,b`#<#Ji:YWRs`qZ71ݎ\!Qr8btg_@4?Unź_N4!dQN/dSisC3sH 'MϓIHzچɋi:_sMs2'/e07>\es6@lf7ˮ] iR YrfQ6gY$?,uvpn\>@$:MK:rozf7 y(uO(v&g$cx >RQ4.R4||NP#9CWlaG/u,z>s/=!%N028h&@'۱`Րs;!{JDYII2DyC>C\eJp:Re74W睪BUv†/ %P['eB5eHXrSL)fV3'3,3\- Yd ɫ-X̃{V#=`Gʃy:Z}4VH|{ӛN8 Tb 
l,8IbxFlh&N%dPfX`[jo1\NGS6cz؝| QŽZ:L; c]ϊ z8Z(+iu\]MT/@U/Qǡ/qyrʊr\/N7~7d2JB-xVȳ`G1dk0v ×5>_HXv ,ZTݬϘڵѾ#(Rndn6*CqRPX5հKh5'0 gu׋[ogfN" CcHP^8|&Iue-ð*Oɑ+w+5{*嗈Oll1|ʽ}ԇ8sπ?޹OdޝQ:Lz(ur, j#6:B6*)U]?'%Zj01:P~k` PMԳko?wQb?(3`Y0+?/M"p+S(WN e^2.%rҠ-{W{)w"juIѯ_8J|8A# Vk|uF( g~_NkT#5्r:Im!5{tDU)yE(9b{)yyDnhK-RncI]O M:rDWD#%O$û:p0tsCUPmhhBiwF\ksrúЏg5 endstream endobj 321 0 obj << /Length 3202 /Filter /FlateDecode >> stream xk6{~_N.֊$J$uEqH$MMtq=,6GɯPۛK"伇a.{{p~fA`ԏ#?eջ\j˕vYfeU^mH{KS-M&U}!h۞S_0yK߻|+SYY%٘ u4uɝUaVue\ű^v;Ӽ[ 0gBڗJSخxDqvJ6Tbu/Ex7ƢHR$136Y]g-U@[7xMW&ĉC.@t ̺&kwk {ˠfBL*5 @dzCmAׄy$E$Xh?2.B"r.7VyaVvUiLNW:tOl })g-d>WbUH?ZgZ VO d$ĺ46m {lW6؆aQd$"g>K!,,RkMG'Q[f`ui@y_-ͪ=@smʬZs/{6Lݘ{Ӳy*Ua֚F"j;2J=vۜ Z)uyY}0 WuGF:0Hq+tA`Mev k|@FDvvYC'H뾣YּX밈b$27\@ i,^$hţ6 ?p6[w= Xkڙba Jhn}9rl)̉sY䁕,lrr4N xZ& qFi%0D^% 8*oH+S0%Y3C1FY;|m־_)w-Ŕ[pr$@Mw p Q-PKFYXne -.Ub5Z*f&~6Mm E X'%Pw!yϟ?e ol݋E&S̙*& vsOw#TFޮLdhΎ~8ZnC;vhWz6SAb]| lppaa YɛiT |); VK&vq>I֥H%]c{?h7EL4bb[b=nRU:/!Dv08onp=݇b]E]ZŅ@p0A_8q"3R D+{-Qʋaݒi@;P@+NT`[V_E"&"dFӰXNdDdr|kKE&&"SN #Dd(Q'21>Q1Y=Ld>CdL7wbXW3R$1INa'0Xa5|_e2 .'0.5sm2b["ގLWpIt!BM>/t_}:GE". rʹ4p1Q‰kFp2#@k+aʥG. cS:(U&-˫m /Ph+kpӹ0G/B fk~HAs7RyMF =$j=̓AQ=1ZRz ɏm0.组s4V~n4RϪ ? k6,P>GZWy_7J+84xOkۂzEJTxMMOqZS mdۖ?' Isrp!}Ò%`J#pL6uW_7m6NJDcYuzw.=᜛0hYUYZ"{ Xk8寿+a9坸*rqk[2+` A2ϩ1d8]euiM$gvA]Z%{@sbXQGDcȘs"yi ޟsI~6ua2KգrB}Jv;ҷq9lkL֭1%巯rWDŽBi1)&'mqYuRÅ><\ȱ*o aEmxDr=}5C_yR8/.S8L*PJW7Tc ݁v8] "tac@gX G\#"<4XMsk*d@Z 4-'D,Cs~ OtyyP{M {m\' AeER&G|2 _a~侖[VpGnaQ0 ICTk=g1vtb˲H14v/jhmMYPzsvS2R7|+Eժquɧ'ya4z(6X>UpNgİK<~'Z>:1W?Sm\𱲯\Yrqm[kL;F12,sRR^JӡΌo:8k2=htѹ iPbp# ?}1푫s6_=Z":G*]j;V@.!UcrZ0 o xO׷& 7 V~Bmfx\SJX,|+?hNQvo~G젡'v20im c"¤XAW ߞӈ>GW:ND. #|!a9> stream xY[k\G ~_1˜]F3r4BHRhk`;5kH}?'E7IѬTQI"x;*5W3hM`hR{{I:@~G:=۬O^,w MϞ.K|qfcjE7_/r;7WGA =tCv $jƒeE޲{kcAhy;mm'p U]&{GVkUZ\-zqyýyt^L<vq>YOo__Lgx:?i{t~qNj{.{yrU3ڛ I.+' uU\7>7l1itbo}z a JAkHk?ZCXQ!l2a*z }.aYk$YYG C5Q:e)1Z3tU bofsMPu>ㅰT9Dh90`Ee$<|c; sK3gd> stream x[s6B/GT4 MZvit47D%Аo:J$k p. 
iL`ݓ<fOp~dv: @`œ,b2&z+ݯt7q >K_/mSHG杭h[=˽ @ 7Z,T6H\.$wUq#/pWbQԥCmZg~c &"9IOR6I`;\>Uk:~e6 t@aFm/b*Q(jUYj,~S"7 n4ȓQ&GVH߱pnH >y mffX.| pO5ۛ.8qyN2?OYNa201l?Kk іho& Id'a /@.Ry$-?/OzO%A&b2@TΉ" SB!GKIՔ ))zxGD D-BC&zst~eޒfii E&g{pȦZոR+A#Da2CIw!T$('(y\MUqNˢ6ݧQoAvc^tOwx>waf)&Hd JҜ !B)%7dɌ( т@1G-I8Gi>pEgdMEB@5њ"Z~!B4Oq#1 jӣ+GSt#~"1Heҳ@ЗZz{p4R5x:Ur2%o͑ip') w8+uDkڵGxpm1kXN'm#sFOh] /nHZI%sGeOPc`g~tlY$?;ǡ{8_ "" & p) |l*:UrE)Yl?LѵSI=n)4ō{J[ާ-GfNp<{(kAJ:lS S7%=^64"_A(x6M fmdDo>b 큎Psљӳb0ym0));Rq&Dc Ծ'{A#Lf5Q(4݁>2!S_ڔ.G_ Tgn}/m۴Ydӭ.r#K-QpF5|/eͤvgɒY("胵+rFg{(]kMu;>5L"E~ n| ɛ!4BC`q@O0p𐎎Xs]`/;SRr熦l{\%z}G?6ƲfBfxX[rWvz<6j4VyO7cW&%G ~$j)|9?)ályp>N{E&#B8*s1*>쎊d(y7wCR[fJScN=ݍNDl.lfQ'B۰ULCLySlqPiP:*0TV\Hn̘G.U6gB]{'l`lC9+k[ՍN]{H1 sB2x8lOpG0J,KhIĞF> stream xڝZYsF~ϯۂU"+*EJ+V*[R Sత({-n|ë7Z(MT- z>y (MDDryW-W9ҏ̪5y7\~\AE^~Wf Wt;3nZ~i}y;s+D0Ot ;lQ7b.W:7ćZ+ JZ~4q8`=iq \Kd?˂nOS\u hMPy-W 5 :uҶ˰Dj^4Uw\~q̾:SwkY"tUf=5W`9q69UΣvfgЎJv`1҆ &ϬؒP=`O]f}VmQOXM-7"M'NnIewv6?>pj."aH8 |"3 BK>?ӃalUNr0Ǔ&1zW"ݫ/8I0Ѯg))@{e9|D@L)H/oj<^f7Zj M@M-:FKEQFМ[5m\t̐GS0@;8bg2BA.7 Ӆe7©m:)tbiqFS& LZ4 Wy^V,t){y7 e?PyfM%SdذcJv?ߵ=G(in0,  zVRUz)@hlpnhѭyo9r5P C',kb` c1弛A!#W)=70x# uueׅ,h<0yCZa>yэ57}c>J( aA/ 9##hNteX?VM.[u?٦w/GFi=)˗/OZ J'm떂R>S2\ 7M{ 1X*ٞd0HL ӟ:hc'@7̚T0kua u BX6㥅a.}! 
K>yFJ6v=00o̬C΃w/k P>c4jM[͵>0m[J&{Ղ7ZN%@ ʓK"o$S0 5Q]=R9y*r !#7ـۊeBhZ HJY+=pZO]?橀c$)?^8؏Q$xe6$i9DK6)Wb1ID.A!ߛؘsR؛Bełlg9<">LJ7X`҃P!ȘST0p罿}]|L!ƘL|eS}栆;p*w.: Pm烵@Sx( .Ĺ#׆G _F:Jm䧌l!8_?=9-XpnT@#\rDlbnw,GU1IZ/s)_loFJ~o&Ӻe.^~?E޸.ͺA-P"Cԋ(ѳ9GnYBl\{ZIC(P"`Vq8LwHDՄ,f[q҆D@ &̅:V,Cg,=HW#g'K# !)h3SH*LbPi_ow5 g4=74#؏L5%#ͣ-%|8Oh&{obK)k/rn zi|BIP5=^ٰI{HԽv :XP`FK$ 3hB{ = Cܨ+~ lzj _oJm c)Q$7Phg{  Ӎ F;EQ3Dm/`/ʑ|p#W N:?6Uq^xf'fei nn,^ñӏd| ?nta|pqS܅_ϯQ4@;M 4J/ @A1o)$Faںoðy>jfah H!A}m``1ni`DvnHHAS*dH2tpש0d>O\e^P ,'"j8ڇzzGu)0c86b;\jD.}<h*Մf.(ז*uB$e #DTLN7%<Ά8$CCޑRR29R>~k?N>W*@1W%R ;k:O+yvwbI 1LڼE0~M:8ΔUtzQX1`\?i+ endstream endobj 364 0 obj << /Type /ObjStm /N 100 /First 873 /Length 1662 /Filter /FlateDecode >> stream xYYkG~_A=}T"@!h5,/eT g꺺Q d dJc1>3j&FgB.c1$F2iEpp\6K,&ylLbCdM1\hJIJ% M K&'眼MŤB;=^X9b$2L~x$kȨb@pKQoĖl '$D1Rn\ `?!'"OِGz !" V^(K=&x ` SG Ў<ḬC"ȁ\a X^ǝ"kp2It>>4)\4;2zʞlv6y)%`[ "TPJVFQ d h>Va]t!6!Tl"](8U.8D6\Ɣm)TacR8T.8T](E[ikkօJBeBAYwpI A 6WwqZf-٪whL~h^7z!]jCӼ텝.ժ7]ЭM;_ͺMsu_mf/Wa!h}/+$#{kvu^r0Ѵ͎\U6 O 3y L݂R?)+CTCTs}vQb!} PB+dSH*lBpB:,|&͉ tJے}{/6XǢ`Q:up]:0ʺ*,GMJu)E(RR}ECâZ 1UP.{$+- r&UEh僁 Ka,*lDb](Cu8b9&e~ogK҇#*Nl@N,uZ88?y= kN*nh?<2vt$_A0>(D%)c܍y7x"(C(?c]Y?ͪ';•i $e2|{9bV(Uѧ۾U@#MhG}\HZ{F?J߮.!|*|vɧVz@qwA?G9gQ< *(˿?lAe>ej==Us%vG#KE?^TXڂ cV  3 }=^ endstream endobj 538 0 obj << /Length1 2073 /Length2 23919 /Length3 0 /Length 25164 /Filter /FlateDecode >> stream xڴzeT˶-ni$;Nơqww],}9qf.5k֪H d2lXyr C[fa GV r2(;':Xy>$@w  t2Tu  G'#Cw7H"sw03wßd 3 @VC[4#@nPlF@sCkS )$UhVq9U5 zNPSQShߌ S=O ӟ=X.@G?e;3:l*6wrebruue4svtb91YO r: lk.9 @h$i.{һ_ޅpG ?ʘ:+( 1u::9;e{PMqvpSC]*ԅA;ӵ6t3uv6mc+?_69!y)q1UƳecWDey̜v{ٚllY;"O]'';u-󿭦&T7qcRwJO 63݌͙SY%L OGC O".{_V5x63_?_l&@S&y{3PcUKZHwWU#Q hhdlۥ ߛ^~"#}:f|hle ttp| -IQX^CH 1[c``OV6$&F[{ ` r@s&?I_$o`7zϓ7zϓ70 q G8Az?  `2d0r,+[|gbo^4X *kӿ,ce~W}{we'hmm+Q35 y; 89&o3trpa~,M? 
|0G0͓`|W?rM_g@71"SejSHXt9GƳJMx\]2B B$Om l֤[%=C91uF e.2c|ٌ6b؉OGS4u nyzk0"^fD0*1T|]F$ +yn_r Ŧ1iqnd"~>i|@*}) GGaM=z Lp"DM ]IЎ*n'FdQ*,wV ?G1Vk@I?.́+Rnqq P=qn 0RYuO՛˾ 8q{EF*RsbչVF.5:<伨{^" 9eҗ#,ž滔nܣ] s?)R.HfQXݧ*.9==aoLM&#W,A_vYgv^%Q7zXvjѨ4N.{saD>xAAN=_bWss KK 85U'~T(Av9430Qo*t|/1s6KRs}waDqmBl`]%z23}f$_ !ѶF3e"c,l9bٰ[px,l/H2)n>"F%jI NK?E |ƾ_AgȒYKVߍMJd 2WtfmN+x'ڨm:˜F%1";乮XB4hi7k!*ˉ!;"Jϥ¦bq='}ɫ翁D҉co;Zv, n/;C"t߃,/&KC);$u}bw t?sX ePb LhpXϰq~k[mxH' EKVj0Y&8rҭ\g \$Hz4DLvE 0; y)/A8IP^MCeZ JY3a'VlŹxл,%zBnjL|!k؆#XϜA֟2g˚5ѺފL 4ka)9zK HE oj޸nh<[GXD*B`Bg3|Rs?#,EmԸM%(eUkӬ9/ǀ[< AF̩?F)`F߄]Z"b+OH!O\uebhn8Sj qU:R "_v axzi u2w4i pYJ5[2 C;ZЃR d津WC:Kk$7mQ*'f5jH7i& f%IsK׭%N܈GKS;> :Gpq`Oҧ"HHoX[x$WKoմMHҡ]ڗz}o}s4 i˦aP(3 O0sg7!qrH)Ds~sǤT4п^ho"j?H 9좬I6b:i|ިb)d]HV)|I mKގBʈBxӣrJ< vx撡O>iD=$=8Ai|`g5Y0U~@RIE܀R{"^t|ؕ~##FۺgN"&5n^2jtIkhhrcO9qkTU < l glc- Xl׃Ǧ9{Л'X1}Q);Q) $W )2mmzbqv>hdQL[2WB᳭/]}IJeY/S1Tk2?d{g!ARTטi\[֗*["^Bl0v\a2Ћq!b}Hs1g4lp$SL[?UqxQ?D/7g||YZ/~ m|̅* (7S+,^ :8]p}Է. iGpg6sJ (w*deKc"c9Z&4u|so#E#8#`Y֯Ĭ{ο93"z=Дmꂙ ct1zlA c,8P&id>gc)qFZaozY7$\%ϛ,ԭ)[8GE_ʡ:?bWYR] N{QQ@:eexeMwxŶy'v@ħZCpǀSB\ Zpw|3NԆF 5+F7oDraNR)<~parhϰeKz*41F ښ2Ejsם9% ; #.pv*&)jCP!+Qh'"`3IwSݬJۈR OMEYJnpk.xtHǴ5l㊜x٥#÷[PuP(r.-U7K2Hee A0d*p r4Jqkve1u(o P>Q xׯR~i[s]U۸FtHgziTLMty0Hlx*#[,&Mn2w̕ۋN +CgU S-}iAsdѾ&%)L"\u W+ hyQTqW[)62z@\d9 {6U"O|IE=19kg9텼UCaYgb 9=GRtNJh.ߑFFrLFOKD5L>Ԝ&Yz=#5~{27nT]~0 q`A (l@C1uӇ{~aohL9S?aՐ:yag F,IR,Or⌜hw9q%qӜ<~?$E!_Z|WTzY\iĘ҆Y-=~n=!H!^xDȒBcه@+CN~>V, sA< v'KZOn1,2d.zzSĕ nssAj*W;m[R}4*E'L])ë^zm (wQ̷780f|-=1iN$p׃P _H -9n\לE@PQdv+hqxntGG [)B׬' k55AI{kC|´B4m`Gg4O -@ >TVDzNJڤLʪ:>K?V= ށ'(ބ[Lqg'b4Xͣb eӋ,3wpPu몰;[ 64J3/X?˄HOEԛagHaY]l4t t >d6a;ßw YU }w#5

|}4CΣCoȧl@Py29o] ΕUW>C :ս*OoJ gihm |tG-NX: Jz (p{NDbK0h镬*]Q7L-[2 ~V>XkVYZ3s$g-Нʢۯch #]ܞS͹gx$ ٱRe@(kh_]\@ g4V:Wo]=vrxjdxGp R~Q%+U%6 gM^#Tf>iCw"+n71wD7Q/oq NNlOߘlMEU8آht!E@"h0!0>dv{*dg0U ī,8A9VF'W0Η.^6ҩ'_q:wѢ¹MM6`GP+:' 0ذa0Jf{֎<|dj1QLK<4PjXz.s QIvыXLI^X$!đyޝ_wTv.1A/RA Y? ᘡpZ.THVVm&3,eO>%u(<9( GmRO^Uo'?5 ^/3jgvw>6F'xJ|$P\\GygJ;Kr\M'Zi&GIE{uBl +(f"G{ 0R[.i5cVѮ@)?*&DO2O^|gci&  {_at%ܻSIt4(/$ %-ٶ&x Np(I&چ< %HSAjz.ԃ cF3xĀ!}Nvwz0aƛ G=\꧂zAO$񧘿"Uꑧe쐫vw]{xI֥_Ҵ|eTuUO!wU9tֵ){?DF:ATە@Gz0Yv[rao>t?p Y8WIP˝'fgKP^ֿu`R4'!K!Mh >ṊjHK|y4;]:kfE2n=&wm 3a*zھ:0{^,6C>p~:d /K(DB۾C*0-q`ad*T?V`>R1tz7i'\Cc3uS32SV5&I&T7Y6G LdYy 6'@+,~ Soͪ1IvQ:6n D0<>0)oL[#` 7.:CemLM;?J|F/U+dQ\٦x\pGa崪XD\!ī߭͌ab~1h$7qQ&*m 8w}B12~{_j oИ9*O NʙKQUDi&+{P^˭ rLL/~lR)^^d- ˬ-lX4j[ֱEMj#&@.D, '^?ex;cqGǴIb $h[aKvR3ȶu93eWHWZ}ʄ)%?:G4OXJ3z 3iVh "фwg\q=P]gL֛NAFO[ŸI) )>u @b;6WL ܥ\2-EʤX;Vj,ҶΡ@DJ eaЭӺ_, 7&Z;&y?떧 p=lňܺAf5)|hg"Wo=vk[j?k']":Am5uN%tmwkC d}A}À[77ّ8TCvs:g[KSrtOz#86rř,, c:P! ^۬7KX,e^Gẕ$;Q{X3Z7ܷ>PnI4=_l2{̽MIEVKD7UT@QDL- :e,a؍5ds(:_B +̖2J-MMwu*Expu[9*R,IkUI;#0kO(홟h~Qʰ7CD N7U#C$ɸf |V#̨/=WVFz$`?GSd&ڮzn@~Ekt%Hx*uZWukȁLV\[ng% s*VFk_B<X7&{2x ؕ0U YGL95dp.,{Ip26'FOPjb*]/N2*t)5Je C~Ӎ# \OtQ:1*Ax6L[6z]3cdITRJKa*H|da$rMtABs>ց[s2pŹۙ=1>T-ĵ{%# eFcn=% ~}?K_|i}`h'VTO51ߖ2F@]܃6Kǻ(j;c}7AZ)Vl,zB 96\V/X_XlC7 | w4`pQyS}+N@G7B5h/9)9\2z[C}z r=^= |{&%&b#%W$QscM$`l J膎px )9~T `O+u"+ f]!ly*h-|S"7yJ_-a8nL-=AE[5JpU>8fM`c=fKG ВOKRogz@Io{|[A=z0V(Gۮ{]< ۈJW &6`V-DuJ촡UREg_3ھAh>>ӽV E{y(IecEf=gG)]JzTbV- Cҍ)(kOea!6mѩjz:d+np. ][S.=$&A; LŰN!%qB"^WґE r0h%E+}B$K>&k3!!s}aؙ"̄g Q-vMFJ.SM %j3AJM6ُ`6%FȪ-V мO 5s#R/ɧI!fh+-8wlr!ʫz0$Yhs2c%t 3 o>/j?! 
@vy.؎FTu?=wpZI>I,S'SrN.e6Zy<)Amצ^G9HXj C52Ѐ]%Yr}{eLm,Cf뎓H6[j{Vphl:`O;xZ\N(b.ɴi|,2 ^YA4FHrBKh?Oyu$!$uW^mߩ{%ݸ 깊v &܈v7AbmgFzNȆHA|/nJ{Yઑ󛟿y0qj0e^g?@~.F6L{ؗ!0ħάJrb5OZSmlY[hXGBAZgrM-kwGn^HLhꘃ&Uz:Ig"jNȋ ߞ ( 17gh.MRq/!-~!,(cPѮV\d(uS}2}"Ģ8֚m")kh3t%~kz 4xFY?+|h)f"vq$9q@Y$ƴO۫!<5cke`oZCD}~;n`){먉0\-@V1O\]CxyRb;K }U7oJ\d^K GMR-k{X/g\W^AjZ7]ٙ.CE1l^ύ:owy>ʜB3];% /EG`p ʼMSh15`>"Wyq!RYRl,k2:0ejµ.٧~L:K1%9)i O3nLەNKڎ\=y_X@} EH0V j -V]tN B pr"f_l=e7 -Ru(%88w<&\eZu[7ȇ‚!'.wl'rkEcl-_3.nj]X/0GVkǺr}eN$ؘur+e<yGѵ5S~O? 2k:sruNq؎>\N ;YR"v#Է4=0k0s8II)-"TovHX&hOhIRl#ykߏ-Kǖ`R;*':v=*UCgݗoQqߨ{45>w2}I$M ;Cn+4Z~ (r(Wn)-w sسMbl\f ̗ve $XF}k2|yq&]ۜLfU{ʝGaڬ ;q<=hΰJ_z Þre׃="[8^:2?^$wC*s KNfJϔLʤO'x]DjLF'OɄo$)urJ[ԾpoZ;F,Q&ScQlr|-?+{3,9m ϬXRal_b<V@m5L lPL܏RktYhcC~L _% 끇H@* ]$#:7s?!é9iLDK1JPQf(0D,AwSkzKqD~ HWI  g1ExKߖ2ҏJB-fKصj y2D*%3Ҍ=f7iYhNN0՗ӻvG\\XNq!] yL)Ltt%<-VbdH༬L4M§[=0k$U *}Q"2"zZ=UTh0zyA@lSGfUNWWYO?0j)B&[7,C?68|!~Ә-vvR-?!jP5<\֢cu~^i}`F2PfGLe|˧Z4#C \vo50v~K'aJ'Jtد3v 8ATIfmH#Ez2Odu@& BqFQS0V,mܸ-E7*!Nfà=9IZ%-ŐC˛K ~sppȖ sئv'>hL^g49+Fd%)g AFT!oGcgKm 91NJKq.9xE"p1UwR"vl\,L>t%M ,OT$U hwq?ߖEVxn`tei}`r~eeyLE^;-nU(vY{Y֢|mu4G1DFa!.uFܼc[eqpPQqCGj|;J2Q #@HUʡЦikjΔ?k]BaZ(T5l md S=l* :qs,4 z%эpcO$r܏㤸P9!Yi#(4m)E Jyw (f88zUO`q0^O:>% yQR:Baľ%d2z#j靹D״5~1OXr7R@'\ 4˞Gya]5>8EW2cLM6D~ 9L%5nڊ݅0&Wq~Ovhۊ Ԃd'?G¦ '5~-QݻW5ڸ)MZF&- ǞAK^-\URFEy!x)ztj[W 'EN(ۼh wV: y,1z-a8D+kx +"7U#XD Ǝ?( f+ʃ\M1{[W~e*Hn.}fJ~.uռUU–0oCl`A- y#ĹPir"qJ_}<ެIn?\KV\Hf/k'+$ Dtc 'C%eB[y][MgUnb:H|]aՅuF~fc+Vto``_]f^iǦql_V/1(`%àDwɞ[!v9t7dyh%yG@6ڕy)U2YUj鞏::nS)R\@FWCSE{(AwzaNS䞯ue Wu"mcɽp۞숸]yC 3J⨤B0Ib [,牁tC?&d;?vL7gHOTm!^'̌JSBɁ]ǻ_N)wEtJ3SW,@u@3bGb=JA.,;b)cWȵ8e%5j&Be'+y+-Y`)mY8k, ?VV`M~iaX޵-(Hi0ߕdiBe+IT&} SUP3MoĵW^kWn\a^0 g /L7*'nd[Jr*)_C_X6N)aQa bhq24EװH=5M<Ǵ0U>?,!wDHcILiG/Uu6l+1}s#^ށ͚}}A5G~-̳{cCD X8WeVXNJoZ5H47hg?{w lMB1d*@H,_M=u?& X]U,ܞԋDg: <{:^$gJMonu`vj->3EEd&U' u b,G47 cDgFG@pNaܗ5T<]BX2xW3xR0<@,}v3#$c`Ұ[Q=V4y%bmM K.:GTLT ·7{ FD_dma0-f[~ h<Δ|C.17\delPF[QR.3zw ;4=m YlZ2~BH,_M=u?& X]U,ܞjW6oA`> r?#ݎlC"Vw*t )dLLxh6gh VSOPn#"eMF$laF}ӌI#.Xݒma}jnTszd{HG(A{|Bs]F|yO5S 
2d6(/;Ga;̶"#ŖsJXR7yLc)tB.-Ÿ 'R.D^jq kxֹjÛkrw/QGpѴ2ر676yex5T5Hs(C e \urՁ&KLw~^6K6G2zp9?9nd`f3(øP$76ryڇ歐-oW(UÓJjvm͚T(|TepV_7(]WIg(uH;IAZت>ۀhS1řZRMI!+Kd; `,0GW q!X@|x(80$&VL B,}йy!Sh=69ܹ1bfuVQD%mpH}j9ml"ջxHs$woI"!<$MEL=b90[ibQ.8!i,ј Lǐp*Xf76IlvyPWO}Xֺ뱭,lEU2I h'ժa9U 4z)t-]dX'~W鑛]G!6ӝ_IC3?5ꭑ-oZN9Z1?71_q/q#hD  %=g AGVao5R+-5[l+*<*Pй ļ|U%PU{$DQ#5WՕ;It-fk(FLM 8Vۿ֌$}| mX,zX*@Tdd{>0e`OoǷIC!˒!1|Bp%KuBل1(4޿HܷFj4fh C6!fH_#D0E-:?:|/q~*ϰk{; 6@nmEK6qFAYADor;_+ Ng6._e4SI㭌%S䰬6pEI 1&%:ZgaPclM*D(HHGfZ*<5giErv&2\)n*)}s ˔CY2|<K&Z.CR%*zd_vvX[ozQ~L\Eڳ^G,3fH{ē” .?.DhOJIĂN<x+bk71Xϸ&#s1؞"N+<]_)8,Z+0K]T ,zw)'qA3a&ʩmt+-p`(2`]%/|0-.chCt1V1\Û; V>?m&L`e"GkP=7~^kD!&p`|O/ {0M3r0PDU/8IT=4*5妿 M@ ߺ+)h6Z)F!eE>J+)|pAZF 5xK)KoY􂨉O[B<Rk&#SJZZX C70q/WпMՙ9f6FAP1D -P:Nӷ]@ sbmJV Cb#JI^*g I[%Ԝw:{'?..F"M7x3AdSֈ =uyr|>/䢤5,`, ![MKbqT:KΥTR@ͤ޵ьԔqԆ-Iy Tz%L3{H/8qbpw4Wt9t|Ch/`OVbu HGMۆm|ndlaXqzqZȄ;[}pxWX4`ӃӀc 4ϯSDZy%MFĭ !é=N*H|)DTG3 iv-Dȏsޤ'O +hJ\% [콮=| DCo0vCD}9TKdw X ɞ{t{ }_ 6E^XH6rQO 77YJb7Vekzٰ 3 $giӀ!@\)S֕"2i)>?6yE0ZFmh]m{un\U.?b%BiuIXlohI>{`IySOy7Va˝RmhGX PZn 2n93h1(⑾皓ZQWT=L/֎)|*@x!(G-iS% w49QgC[j͕8Z*aV]NըPܨ$Md8+QL}Us 7ndV8'IHIecm5|L&=ai%ɛN/1$<nңh;u9l4Wrj1(q͚jpCF6LxN4ufNBm2,[aztU5 .asmٮɢIj^BgdF@`CݱG;32Bgy Acq:7q ~;͘:uĘ>m<afV 05ъuk)@IzŰHTK a&|nzg^ϦxiP83US6L5R7NhI.v+ ^ԕ'WVFL XZ2ŔEB%E]6npBi&NNEpK߆KyFN`y@>ֿ% <{hW4XƓD&?vq*=x 4㎶'x@q(tϠT ͈BtB0XG T.CruTVx9 Ɉ%;51Be 3PK[aw2{bFp~2C܂->y^8섇>~wD2dcd?V @7 oprw[g_RFUeޯY?gbj}l9G4CI<$ڀm0a n])C-G3%:0@ۺ:ҶZk11*?H٬8]cps^Qc])_` _5fŌRMR{O6 E2ZNDjXIuԫ?)\m.\.0/f￴Q}`LX-n%wq4y>ؓԶ=}gr`+4 `]7M8uOh5N6o%S䲑Mw^pMZX խ &QiΗ~:SZR&ڈl, endstream endobj 540 0 obj << /Length1 3058 /Length2 35344 /Length3 0 /Length 36990 /Filter /FlateDecode >> stream xڴeT6LHw7,;͢RE:E:SZZR[}}7f^sTLbN@i';3+?@QI ʤp0s3r"RQIm@f@~ @ `eCA@W`Pi8fU'7w&s37.N>6bp01[ ofafo 0Y䙕N^`- 0ژ9X@]@F]EKUX?\$44dbʚR6#@FKCOM ߚ 6$)*f+p3vrur+ݙˋ͝՚/~6n/'W{1 Kp;m*E[ Io#`'bF9 W3|UUf w d6t7sp%-i&Hxʡ_?iK] \_l '+[/nLILYNZJCII[pܝ\}X`ۃ@~-W-=Y@.@9Ee@w+z[ذJ׼ 2spZ~nf@0Oſ"*~7 hNIc:. |k@8#x-g9^\Np<( ,qMA'jǘqG?l GWtqz&\ n޿)`fG`piպ۸QpWܽp%9z)Y8ZzXpP? 
8,9#]f{@[KG_Xck ~ ad ,7%[\ۏ'#Q ﵿ^c z-,?DTJOWP1 'Zʚ$—,~ ) M](o6TKޖV;um&cH*%6ͬ55ݡ|~^9vRؑ_g}< ׆U0^slخ_ NwAc'ƙ-ϚE:`BƕÒwY_:3vB" $Ф"ze (ὅBL.q){sDЂ1_OTs%4]d ݞ?QW=;ºC[K- )XAw"Sま6]˒)|L)(j8EQJx&-514>_>E͙AX>+@ ^fVUj>YM=޾Mn 6P)*|Z=YXӪ;4 5w*v=ZG%~nQ5 y9C' J >ֶ|FWL9<[q&Ly47|E-&%M1(oHxDܢs6zY9?g?CxR”REٺ>υzwSoRs-m,ɺvϷ3aj2 VYG (_8+[x 2{HVsǡcJCF L\ ݅ٓ@۬Hwܛ8 գw=v^:ܻek5еgl vv2 #&w+Ն9܀hP53+9"ꉱ;oL¹6?u8&҂7$=1xdlC nuYfJ,y/W G̰cwȏZ+htvH]zo[ŏfBc' I@u2eۼda 5W*v ї%T*R&a<q-W&G w$vJ 3Fm׍&C>oEY'\*U+%jG)º x#E>f @~X-v!y߻>P뿄v,*eZF mls9r4HQখAuO^痏}识`˜BC3!smex+=F6Z[фrE*_ ],PrÃd J$pTT!}"ֺ}1d!룃2zfXo$6E2BG#]dx>瞛ѕ)'s69Yw}i=Bk(֧\׆CKJ/o\<1菂϶Y^ k &ṁP @"$h jwn]E)rDC:[KB!  p <s6|@AXڽy8XuJe$2h1uEjdJd:בm(,8 x296ۃJsBY:M@ԴժwN/LL19Ư)`jᑙeۥymw!1Y;V@,d,dž]5ʄ%~쇎!!<7huFtcX9O:U^P*b=G%rC~Fs3(T_JPI~26w2;i| ѣnIįYaCV%?9{Ygƀ20hH`%REe髕㋴cU\űnw1EiG1yO(`ޓu΁4=!3 Ѭ>h9ݧEK(>Q=7W YH#, "j~-#ħub|Pa6^B,, ?uc#q5KuMTI\)^ 7QZ6IrjwV`MHґ6ˌv@ex5_% ]'Z&usVR8}MHl^N_Jތm!d2u"HSlYq4 0%Di-@MRfR0!޴c^)Y=]dDH#hЃΰՊ<ݙ8=O7IY nFd8fkdɼ,զAvI ㏓~ƋL<|w&B+GoUAV'1?仼L+1tόc+3 );7{?U4c/6SXf;m/f#HI|цE:8WѠBco kmh7qjaRDeG , "G@LKmq0ᆏp^뷕yi`HmrPy se%yDYD(V+޲,Wvzu7N5$3 r@(y.(u`d`UED@I*-;5hAU;|gÐB/r;(EB7y [ǺM$w2wC#Dܴe}?qέ,'Ͽ@1RW)ʵ/9c0{h Wd+Y㾶:8UHٙ7Ŵ;^ӷkϾJ@]cAОjD͝^pNPj>j)*=|%CM1Jo؋M4$˓r~\Yd1[V=RMbRBB_(7B NOB(2:%YfJsh%+EC#}P Z7Sq;>ײFAlɂo{}^6dW#C2w%3&Bp4?N0|VCtuRJopA"Wylbo#5Z_~BV Y̹niiW~Y޲ybeZþ kyW4)%oI#w#W>UdK|a4+eF!y0DTk:ka IkvrLDq,"{dxK*{' Ο%v]- `厱 s-[dN8)Mj䯔v8@/ӓ#87FB.(qfaȼI,:`I7̘ @cF6,뺁"T{0McwCIPo]w>  Su^Lup1ꜞͪQbzhqBaE#߶IҺTOHdik[(RMzK9FNڔjZ7Nm-x:{1Y Nicv,$HlHw;Z״ \0KJe X( ҩ!v}aW&~2}k$oq#DM]5^*~TGm<0z,@K{RޖF5 vP`Y%(v" ۻrd]syX=0(躯R av/ĜK3Z{[aYdaڠ gt}ATۜ#^s.X"W}lW׫Xcy҆fRivFKI'P()xA>4{$)&fb&iתjI֜!fG$]ugᝓ(z8Uh6%u-Er^v%dڶe>3Q&A2#;W8ql(2usUkdV98N@x)$NhNE^At t5<._fh+~/]* H@@j6%^yu[U'Ír=#F"ryMݧzQ6m%:fxq!BD{SF}c׆[*>}6L 5)qm_K| @tky.;m`iD*y`ajTtʷ#"JA7WUk;>ŸI.1\6_L3B)zbkއ|,RdPI(o߬CV:Ivj-~nbH`&Ufk^l8Y[O:wan(KЈhA7 ~7>8/"ǻV}kG+ ܍3ȫ jhpEV S!6w "pʪ))F n~~t4ޤMpbw19*лYSj gQ{둗˸[ļ5 ~>"߽[4kt avg`CR+ǧ, #sN5WL4y(^j*W^ !{xsw[Qڅ4XKSSS+㝨&`Ƽӥ/j$jHg1v:&E9D@60QfiI-ۻmOMAw݊ѩG寓:pkuQ0+h}ϣHxn}rUʫAnJgn 
ޞ79O<>7kLMlHa*?1pYف }R`t=ӞR2Bئ ݬ"+Q71=DžP)ںqIJuVצxDpg/zl:Zh$mQ< mcq. RDzx&YXi!$z:9-26AIRFgtV7B&taKZ|Oex40;&ؑŌW(@q79"&%s!rrwi2gR-AQ~c jtq6稃 /(hSEN ^K֐J%͊U{S71dZL]˭pԯ.3);Tħ5ܩEgON5 exK'QUje5I ؋X,8rH_wH>vߔexk|k>=>ތ]J"¶}q>l G*0j9D&XAumxYMGe8Dsߌ`P$[ɠ*%h@(߅4Poj$j Wb\YKMK~s1+V5} &\Wqܱx_z"ƝaAc[|NŠ)N+w$o5 =Lpa(9~f!mk1W$~>.<8*Q-QR*NE!+7Lzmwc *anۃm)W-)N*e^'}9q"1%M{ov%l 76;a(5{ Ž'<5 1$jyO*.S\^4ߊ`7NGg俁0d1/p)ÏuӹcB^5Q$R.)fE5wޡ--5ۜ{q*EGb9o,woCZc#mК\pqVϚCӶYz$7z@Xiݮ0Ւ5͖фo s8_}^'pQZ`8ߠU&(jUF̲ˡ1Nj9*qD8C9Jgr*IӃ%(- B܋'-=~B:,lc ť%8 5+tNb94׳dOl ıG_d8m{I9YnkX)VlNyaX`v1H.8QY|0 -.I#=u_9SO#<4zۯ5M0`ny'4υJ &,bAKu$m턃A t$b!xF1U~Bxr hhZttoLUqN>J~FM,iޥ7zʅI([1-r"--dd"T$izޓc‡/ɵ>]]&.fqAzr+ھzf*gU W H5am {^ZleRYgiy˹NVWvڞZJ3z g-3`b澓(o$5j^y# 맟1?yL",A>©ՇwʲP0X|nx su.$D;4]΂-HLb{#-oj@VF8)l\>i$8j} &|G_i%К {lr泧M>*;` 'AC*g8VcȮD> [ְf6!.dC6!W<ݑVڟ.^B ԩ/3Dlѧ3]Ǚ"Rr .#ǃG+q3/d*V02`RI8Z ?v܈ƣ d~uw,Y& Y˗g7cz+Hhk] _F |Exc#'C5m՛~ =44ǃ KB}ݬ ,@kx}=P}nv2uW®_6b~Bމ%{ͲBvzs YjL̵8&iPiP?ﹷ+MDƌd:~+btNMM؜;<2wԈ1t$ѦM8:f)iɾڊډeI?c1#^|2@! ͞֌z?𦣌bP[1Uj]AIٷE*ut"'Q~U.^vȉ]dnݐ1&VַNӹu~ v^@ׇHVC \lv͠78{rR Iw\bȦk?v2[:b (` .~K\,kP q~_3VBl; =E  Srgȭԋ<Av9Dn < +uO҂Phƽ$kɇI) JQa6wI+4i}R<ލhccn/"ζ}f<y(6^[wIG^Cdb¸p>IbsVLs];%}bZR0}zs_k#٪,-O|vfmAq$]M?x[X @u # ! {2 TLI>}<׳_GQ'p"Icv>52&(Ny쫆W)MRt 팪D_:Iv˶g_Ц^A7@4uOx՗k)TpM%[%iPHDlZ)m'Á rO~6wa(VkO1Spmź8Jk,"'Lj&MJl:NotۛiLG8Ent!FgDc2}:hO2KUcw#`^g.3 ݇pOgfqyF9iK i0[+]t?B+(DY0k2h.q Jeo_7>{ 'zCvy6Y0 F[".ȫDuGwAaW~帖oID  .*Y5oTM^&SL1rѵ,V$M!3$Xpc J"żB]Xt{3"?XrW=Pc clU#r;rǎ=HywVEV`In<Vuvt%=.MZ(;ZԎk ]U,Ǵ O 3]>}BSh:tLP\BuW-|*&ߡ9[>uwt@=r-R:qH[ISjȸrg ZE\̵FTQ,f槴C_0|8HlһG{i"QL8-gw?(q7Vq? 
{ּLS}VܾTvjw,*޼kiE9]Fq Jq~V |r b[FTp)BQ;5/F EOj4R*ÖK.IF\+/x 67uG']<REId7Zȟp8Tv dҪ%a>ڜNI#߱@ssoE `1ڦ7Mx ѓ7zA}˽["SlP#u߄eS| F٠J>5  iZ-*% <$7i;}CђZM|a+o+=X wrğ#e1O#Wfa,ԃB0.s26F-,^nW-ߧP*mUjRdsE_T^0Ab05(_SɅػJ&,?o[kd=@/)]1Mɷ^)SMϾ(SCGo:VwG"R"D+ȎOoMEb zѭt {¼j+;Z#|OTr5?%-{Y14rJ&Jin Fzj23LVP~?`F.̒لG|қ7R3C7 DHPsQ`6%c?O6d:c4*>D\/kQ-"pV8߼j9j`x]vneGWQu?aCXN [`~3 k'r,*ܭȁS}-<EpDPg-s֠[В сKJ5\8,"ݲ|6fZGrBKڌLN8x"xԆXhMXmkC~(mޭhQ H Fjqn49'Ӱk=5)c1͸|dzcc> sE"yfPgs OnO{ kj.tǘl 4"k\;c=-CdbW:Vb5L`{ϡd`}m lJ Qgј Z+]u, #^a) 4*TUlg7|V椸|՜𙓡#>4M,ZaCjϬ(N]3Я_t#39?] v"/q#M/Te5=rug ̟ٝO0wl$[$HD婉lU׈Fmm4* *']U]U={.'GA(g liSsd\k"RkaCر(=eCR&M*Zk/Œ!;~=9g*,3$\zά̒+xx-Ke:p~D&Uʥ.l+B tEyF<ݧ 4Se-֐C#ϓF@\G8Ml!QepN,j؍ '*fv߷@ ?#w9NǾht7Wgϲ J;]P0p]R&?p)[z\wϳX &.F;|ny!j(-U^pt}Ft?ao='MSXM4g =뽙ţ0}ȪǍuh$Zm]sot_L=VvҸA|rMZ0A ,˰y+3>瑞fmk3ɉg_ƖKLe9K񬱨Bo;~>Y:!RS2ldZu $;}79 # =:(oB~̸psX[HPOE\IɃ0сBz8LE[K󤝻y<Э/EF;Y*g< 1-B8|A74O&щ|D=ʻ=Q Fl Nu E:؍|S 1[ƾ'9-Lo ,X.>h{=YvKuoOOrڵpPtXIbp(/Dv}TM8, ~;I+9U0hWYjJЅE.ue?;N}l  鯇7 I\+w?^ٯ࢔G |A5W=5n!BiN2wcA^FE R$X)5 TMU UgI3H,qBxV"fE`Ilnf {_Q\!~,0Ө}c[lDΣFxX'w~U9f^lFsrmճU\R߅LU>5HC UX\PXӗ|VՐs,QSX*9|(-UbP/=ⷀ~t؎|Ec7[,pPDE5Fg"U>b"tpLWRO^~M<~5] E}ܤ*<Mzx+#0|C/Up&3R9l>0n-s?"  u*Шf@|`AXC#nX[ѶX<􅡓dsBq'b+߭7GcQ( Pǔ /TrvpE2)r(Gej"B&}[r"IzQ[9 О\ǯk?14+w]| >jK?&Z w}eT( =F&ǥQJ3؟䇄%W WSf Lσ4VrC&͡ 鐈!H:aCIX7։c7Uu@B}U5;x7ͱfivlva=]馕pM# R|G:x(7ʭ9}*rwE6z0{Q 4 7e?(Q<-+`[LAXaӧF:}+7 ]<EcYڻr7l C%a~_RY7`$W<ړz7!;~?ǗHD;8a-Heu,@m[& .IMFHŞcά"GRCW\X׬pEnM#Ȕˈ7CsȜt4kT5VqQ*#[񼾷s.",~P}RNtżs:ahM[Dn2SWBQ K=L|ctWpK[fRܩ{\E1Ð hKo2]x)Q&"0Uꊞ+@2kat  MPG~ (8-Q$ V+۶m۶m۶m۶m۶mszoEޓ7 O@R;̽L|eTAMaAS:&ktM&{!0`P(dԖP37lBq"U"v(Ȃ\{e0RR; Vr1ok;k@Z E&`@**߯ t\@D(UiHjof)GAzag?uH (hõ>ԃU-jc?(A{P"/6§}bVAʨuB;`/5XsE%F]OE v1 &bLa0"BQ ?n[ZW#0ߍ]Aa5;So5lxg4E0W'| ȉ&!A$e|A?~30=Whm+lp?LdVЄC +́0̷Ml5as&L1l$CѹM 0.gJaƠIjƑRFZbdNKVUۄ\ŠOs gbBLElI)b` `Rq+P-ǣ[]Y$.$SS`YYW.l{4„VD24˂`#x]MCGQv30`+6 Ę/My]D7~`S%^[)lD;W%Ff#҅9.cBt&yo5:/i^,K7UXI^Q8!XmiMB#IĶZ܇F\鰾kse&f`P2M[ ׌yr;cC% Tpݿ/f'?i 啺cdVn+c &{.,7"Y@:LV@ݒ1$F#)辙ONDy/xg;R?=G&YqrR];T@%KB[G姌@:\wMaGْ qMw)0$Eom4PdF\3! 
0ģU#֌9D&ǂ\!!GdT]Gv3nq\ӑ"syLnQSUF [(9ٗ(;%6e7^JgKtC|؝rHC7Skuqn-oG}-.\S,B(&(|U=XMDVad~%1,lD}ә FDTZ| sO dR1qN: I v‵|g41o[RAw^P/^ݽ%/}jYrgp,)q ͨ;]ި [L `Q0wD>"M 6,:8ˆ[ vSQC WےGth$wY%m(Vw7 C50@FNfgeߗ2dFF΄g^OJX=gFo!󸱒#҂O[A7p?0&<5TRDX5d#`a*d#h!i8 5պxqAwI8gڛ! O1&0{`B+H>|?\Hǣ!]1 ī\!j$V.'yTuЀ@@@mp8l\d=ӟ/_rܜ~^r[ )`eKaŋN|N܃Ymǟ[LܗZ!Ql8~,v,9AF JU0 4eSZ|h| X#[*[!Jej͛-d@ ^4=1c4OUeDd+F+ڀ1KIr4~ 21=BMZjE /&tf{F+ 4)i$&xFL @e-[na>(,64ogXzh1`b5k"Px]=y#UxkQذ2Gf0vDr3MOoJ3ܬ `;;?+5xRksyM8βzm5{#ڴsQ3wRxL,dVWbfƻ逡?T4pgr΂Ћ'}Y%' OPNA_󇄳#T,i>P[K6>B"@]O'Jx6|+>MۧvM]J3Xpl~Ga',ĩ qr-eF R$:0,Jir;p4OU{tz͏q˴؆p <$wҥ^db%kHh\Qʧw L.Z yAx:Qb& n]#(XkmTKsn}5LDڮyDA#sxΌ,|y ޿l k1!wN3s\ClGѺY 7٧jЂq+k.tv4QDyNEf$ގOdIbV~= Ts fI X-dJ6IEzemٌC}ya{yVd8r$ \Q(v2X:?>v`lyֲM/21SV mj L,T8EF(?Tdh&IFrv~#6nq_&շHԱ%';y2E p"8Xez-z7`-nJlo:}s%RKB3QfYKvЊI{<I{Q4'9"i ٫?۷|~r,=y:^ⰏH jґsSgDeශ;v}%B >ei=]!G%Xۗ %1 o{3jcvшŸQ ee0C\1@mWeFol){&/xd1xv&b3P,ڮYIOꛣAX,5 'U 3\|ؓTl0̟*WPDf|aVotUG3}GNx˰Z7ܙe#bܽcb@oA =#.)[qIḷ%o;қhu?OI| L [7dt7T+ p]f,bRQ'[x[q3ă>)Q1~n)pn' +eShw7hHM@&fv++%7|@%' QdfêKD_QHu;ªTctXϙ&o}ʁdJ"Ona\|v(,)apRRTՎ"X@*I2qr%II s/V.hij]G, lKj|O`!/7eATRxFW )::j_xdGlZy yBbr{+-9+lѠXi{($Μ VaF. L/G=ӹ lhHG_EGϽmb|Z̽ĂS;Td4U(4yOJ@k #+eX7-H RE# ź|Ms RͶ޶goE8X?M_ 0W$V(@e#hUsk;~*W:c78q4G0XI']pFbvXѠ}ej3Fۓ 9PBrz>1$@$?Rj:/p۠t id:}&WB D_OwmPY73|IZQC}OB>!4FY]dQfkogKx`7B0GKv˞ JRɱ` pǛgBBf?ekGM,уjMrPm\8~x kY ܥSgכW=G<(2ЗY3q׎#SNA}"?D TH zU#~vݨ՜=яPV}M*, pyӿ3 g8# z)^ffG:&gv,umkV@/O_m(aB>Y)mUûk.+H#yIS Yu L*g],i< ^H~a _pyjFo&h ]R뙀5-{Ty͘'SqTΦM@>C!'kO`ܔ&X-"klD^#t6X p,*8R\b$Y5TQbjJ~ZlQZ&+QJT,|=-Gpwf1[ :}!ٰN иPRG͈{2<}ߎdi O!U| ]QRpК)5T(ukg4C7LoYteuhpM\e$]N\1U wT(F4jROP!{Rdf$>wO.2`_^9g*KuN[*:"x -o haeӆO%U#N_W?ej9n2\ ARsT0cC "`qz#;]ߧ!v6!ԗ3d5F,A{4";=ҎNajmc#@J6k'`1*'D`iDISImˊ] TuGpT8t!y# ??>R*e ʚ|DUa`Mx\`i{cS񭺘Y4!@QB2?{\#Ha_GL>Yn6QTLjj$? B}$zdHZޙٗN(&k *bcρ*ifA* Sa.4rT;DŽMpO Ѫju^jh~6jk5謩8! 
g~ /,]z1 d؜t$%V%א(9G-)^269gX1R^)>a8;3Z,!,0Rq/BVjI7!{LC!$k?E% o_*͙ yd#7g a/M)RVg:^}Lw9z+E`\y|cT~!u7۴>.eܗɹ4S[tSi%I %4;V[hZb6ގ.SȢS(,])KM:8-DEEsDKuC"aD St ,N؜u#18Czd$q#Nq .t>)2W0Ƕ{{$^etƓ&%>z~:2\xOR`\Gc2v<&;$@->,Uw^!V14[xMLB qhq;TAf_| |bVI?ިDM;;s^;P1 dhY۶rѱSpy/)GFjTh޾*6qFKaFN\@*VIRvw@?!HӍTVngoWey4Y80ez3 2獀9\q8 +UxN, ͣ_O;R얮:s`VGLhaౡcI;ZVW굚9V2D{]Y71'O2jv˜br"3xQ;ISH$P?$ jB֝DԔ" 5j[id`5{~O&xvU0)E$LY\r:l+|wS7Eߣe%<]?(rˆO"A f3ѪM|ũv-I3p P/.?䶘 rs.}A#kp$hL' aD'D)ғ'A8c솱q ї+kx/tf~M.mf%"ٙ(1Hrb$ tZ mWIy[ VCH 4mf#G[Ňf_T_~_MJv 4MeIngؤ*$_dIEqm٧c^}<8z.Z1a&O˖yVOhbͺ2Y6>g A}vZ gLil > \9G#rT-WqRXmZwT::Ӑ*]Q2 } C`㒎ˑcz$̜Pạ548! WLLg1)֩k8g.,ԯu؉f9;Emf"S`>hu6uo7 V:=>{y@y*,bU69vGؼ.w}űCb;Ê}Ʈ)`d.?HYH{#e.p5 ni§g+U" '—8 ګL+'$qbQ4bZE"z݁zbiL12XYC]V@ K%8\iE5O.y7̪D"IS @ 6\bCqF从߳*'PB9fXޖ弡ZO˱Pa=Qϳ˽  hZ`M1/Y)+ZHڲY˒:=d^s ڼp$.LԧCӂGMjGej@'i?ڻ'P1Q.9Ks_~ekY+yg[Vonivm 1¿xWߨCOz5g3O܈r^ rgrWCo+J8up0}9~kToMxDeYh*mGMsE}X D 5tt~|غ3M~Jwue^xvt{U&8'ƳTfGNY\OZqs@b#kٯH.>m2ʭ]2hS >cJ9ZOD5F T-7(tԣUm1wwE>ܲoIfKXTLRW_:APf^wCmu~/k֡ߓHoJ8L|KD$Pe;ޛIN+dl,=1K3[myu?V9@SSG4-[~keݓέKMu* ; `}{W\8Z'@Ye&i͞{&W8$-+z1$kru?2S%M샫s'#aoAoZ&8>'e4K(!`x:jB 9VNB7 2\$h #>l]DpY:: x`i$4OW3긩,6N8eD1>^;Ua:k+lH%#U#Gz=rL.Cwg\MZstC#o4q2+ρrktU 6G#Ƚ4Ҟ!Kf._%LNYw v)ZT91&dN5F4Z2zj)> r<=a!how1`54 (dD \+o~kl+zج͝ º5A!^u93S->߾OBRrs Yd^ ܭM_[ya0?;7Z,^_ r~3sz7y`>G:Ǩ}MΎ2.Hpԇ@/4~\("GѵDa¥ g Qd{GI?}灒rr>[c_\'C$@_܀ K:-3ʕ13^AsPNjɀB_h9d'Vg/[Q4eYߪuܴ~iJeP/טw"3%V#EK fy散EhJtx~F \[_\0*TOÖ~̱y:'͈nB!oJy4|P >02KĞ\Xk)K*MAIU|}gDƠ/HR/dy3 bMBk. \8U-wqj=`hQ:JW fm@DzGYHcxITt$Jz?i 84b~k >*l':Yfߚ3@e Xe+•B%ݗ +3F0Qcx!W}Z+2?-e) RjBlˮ ]蘟nIҰrtMP~dA?y [‘kD-,NW98g%*)p rR^a~rx>o. 
n8gF=5T!3 =%WiF*y@-~iAJrK {c֔z^pѮD3Lpz|]u/nA%5էI!Kk~V㭅BKг~N18K қ ÍE4L.2S~E:l."DC!Ѩv%VP+R~ض@xd$qo'ē9X/O .Yx+xa2d|N:rW<;iΨ:3 )P)E8X>[~uk7n_c?yi6Ań+g7,ӳ4ШY<@I,OozӃϛ)-?jv\pO*[F}|*pq#8Q;C56$(ɶ?;+#U+"AsKmCW/ԏw@SAguNpH-rЍp\sW؝v!jKA3ଌQuP1jz"h䠪?m}DW8?֧QXX1vhX,B-)Yk3] 6dVŷ f>iU[H55 X9J5<ұ.FЄ'Zh\_3㉥quGFw֐BLJ50"6>bw]0)(Nj58pf>f)R;cciR'be RDp`!sdXt[@`Jx'lXyyRˢ^spn%>*>Ԗd ro?_Wf)y.%GR|Cѭ=CaM=)ff(, &^ \(L/S#UҔR@32R}ӑuVBPx~h5ABd򧊬2elmev[b:]:gUDǦW)jĝed kpDwGg/]]dyӶ=,ws=1P'Ze3DN=1(KaK_,&?{-4t*=)O娥f7 .;*!ffL@rT5 s2U/œ9]( q?=bnK§6Y ;01u)O-'rnMW!XھN 5ӴR7T+KX-3sۆq}AYx9j;Dv钂r;J1OZ&:2*0^1e2b>Wb<_;;F 8(@ T1 gXU2˾28vu'`3M0RYLI |x(9t UtոRJŵHđN'~4\d(}թf7B x=YoDK@g2M;PtάH-˒`4S5X;xHxTB,1{|I+m:7Ů~(:GW>{Ŝ=V<UI=AO'Xz%g7 eRHcXoRqG]~K55_=bsmISx=QEJ%Fq7O=d%x>Q;WS{*8$ n_ͿzIzlqq-Cjy=@%pp bbXvqP9f9CӐ cx *em[H;u4Y9CgHGkx ZߺG$z%F9hqSVc9+8Ekl z6nv[9#jc %'CfΑ̙ǟ79RGR;0&yȇxbVYcJý3Tݢv;Ωw~BE7., ]f8yKyDaì/ uwH HjYI~3NB> ݱEJ<\BRoD.ðZF ~"Ii> G[/p҄joxeb6І`zp m̩ #jû52jeU)HR`ti]m 3E-wJ'MR3l]]?"™{<Gf "8qt"CKʈY%>u!/]|"jt䝭8ySZp2VX(SkG,[!B&ASrߎe%fw(15q!Q)`#=;]&Nw|o:\.x Wxn2 ղo>4iVw"WzG%CPxX)Qŵ Od{"WBֱ&?(7!۲ lIeh|њdͪp Xg }ѵN紽DŽ;x<骰ü\= 8S /B,y&q6pH/ذxl2~Ł ZYט ྶy9Wau ^k8sOIA[s1̓-2HPY,ۼϐ04ngB1ҮbR*[m 66zoɩt6"#̪%7"hoi-y s4o+?K5V`5`_Ñ82dQQCo=Nn9o9B3.#ttw]ok W2CaI\r@_&J(J߇z_:2q.[֝K2–V'@qZYyPPq&ѹ-X`LiXH__݈Ƿ@)pO T|={JC[۶Rɷ֠'CY Z;JRT㯕@&R1]ǩkȑ z`/cQ]@@O&jT wkOv!7 {h'L= t$]h?xf.ع$gxGj4UCBȡݴ 3IE<*~R۠Yǖh#kvV&C\QHFn dB!5#<̺z[߮#<d6T Ă _&{ǘ%Л)0 hNl{ىNfr:eF6/,Ȫ KxLڮs9Gd;_I"JwB#Y|gewO/բ#(ڋ'˶R&xgĥ'ؿ 56.xǓ毱178v/;,G u@J,q;C];j1dc;z(f2J{W~wxduۏx:ұ/䭀TVs/lh%k?'I3aʆkÅ1>`#~ae|~[a&{HE"n!6?ldDlܜa@M0 S+0%iScN||86\B2Lu(\H}񲁌we ّ泲Qh4TqL1t.|&Е ﮽,IWq'w9%WA)Ex/>& -_$}u/tN4%:d>ZM+{%(#!GDH,sa#;(:hx[FU=۱jŒTNng6-!ruOGxLB͜e**!m}gT+ aRr9!f_K>/9_l-@.MrM;ؗtX?Nԍ^G%-9 =]CޒB)dku10!H^= /޸:Zbڞ2b᜶)r'x8/ASz=o-Gi vhܬKpwL/_o󙹡6O^[$"]x& |8mEeqE6S (Ѯ.Ey/8IOkoXIa:{'՗ ) ^[W^ہj&_97]|X &[ {:9F1 l-eE//E*2zbs! 
NNJFϰ7?s7Fȉ>ǯUԊDMzl' Ej-Gx̽:ZbEӳ1Owuu|%XJF!QIϟE#`/Ч Ld&M X2 Ug3%:y,3uKHz)+´znuUI1K5r`$~ԑ@@9Z՜TpIrdѺj$bb-Z;EkX:"<5򵣫14Nl6՚M="iF7oA`cDF~M}mIN8uzAX]o,(Յr ] ^5jkOLY"6+zAesҞ^b݌%A{Ę̜K\Tx)0yx|1g!gbbbv `*(\wDWBA2q yB<!?< טZ. `R-P]_8ז|4DQQTME~)$0Գ .}>5 a3`{*ԥc:_Z[ IƬ'0su]"[**&'܋7}EoJ;49$~Sqr­az ~ ^:Ҧ)?%|md\%>RKGDT  ,3 Ou Z=aYXzfAZuz%_(x\$'c`;:z舞 6jX2XOZy?1`lOH]@y^yK`HtK[U1pb-IyM}j7~fv՚NN;0Fރ(Z6rEe_J OXC['8pÆFs&RE('b? a/]m,?@LIQ *0Gw b&@ ĭ2Aܭy-\wB9g˦N{r{o'd!~spa3 Iɭ1}T #?EFB!X/{*m(NqyCZ6i"`S<牁1!Nr!P xnK,07 KI k/PQBMr&ٳox kj)e#j|nC% 1s> 05|42Ê!tQO^El 8SE=67(ReҪȬmq{=NujLQj0](0%?(< EOiY/b%/>o-$HLvHy" *_̭T?!R7OO7J@F)elu^SɄx"{f~\bt",i`2= wҰ7pWMJ./|m KWou:SQ)T-M_pV}Wb9y~NS3Q%f('S;gAv8ӭdCxBٜ|o);Sàs;3|ඉfֽf^ԍ.ѯ5Q';*UߪAbO$mKfyd۬ƹL<6iO Fy@0L˛AO\[`# |DIxWO{jA=}]:mힼߐJ ʟ۹ >sb]z|]ϝ5S3Ƴ?x~ywufV/k@;زrD[K74Z8;"] U,b؇.5['[<ؕ9 I1vmp B^u.z#`R.IK%> stream xڴUX\۶5'[ApwwwNp)) =%s>`ҺFmYENB/lbg330l mA&&6xrrQG!VpƠTG 7<9@h t|w@=@ePs:f@Q;{wG 3sП5X'[ chlede0503\ߍ*;[`g PjTĕUʊjJ* 89QU5I:8NTSQWhߌS=O83=.@G'?e;3SMl*2y]]]̜@ vf S5p9Zޟ@k_q5yo' @h$a/{+ߓ{oϚ 8Q\9%%9-hkhk29; M(EuvtSC].Ewcm'fh綍l,@NZ0a,l +HK˽ ϖ^; 7_p1qL"5yg}b}93][ڹz_Λ83Z88' 09r叙 ޞvSCk')dޞt'gXޅ~YZ]/;u`4gT {_$ mT3v !K`hch_> ' 7__mͬzf6&Vy\+w ?#쏟|4:9ؘr_m)q )umL,l,CGGCwxwA<ߵmtK1F[;{  0ssF?!߈(o)7z F>#6w~-yO0qYAΆ`0]96rg7df0#?;!;|ߌɟt| &e>}'n Y\QN_?eg|5W* G;+;! G 7mknt""vnl\zwrqk5R|n@c;c@RȹNʰ4ebӦqrvHM~vrR<>E䁘֯ 7&v }}? f3/} >*fIom!ru>DL^%|oYZ8܈h涄׎4z{4^5 D,|Kg:@aL-jy~4זUOE]u!vcnUilIĶMzG=5L#>bܸPcT}@H\DVi d{Q]BGꛎZW(1O9$4VȐMJfL!vU%ߌJ(;32:Kd,sV%D/ͯӪWdz9WĨᗅ+,š̖4:[^\_'Ud jz5164@K'@}|ٍkghzzStZ\,r\gQ|6EiN)1%S*T=qO+=; 0IHTHlNfyŷ :l%FOt\5rOkm>[~h8oR92T>}ԭ-+$«ڂno)!Z= <~p* EI9#v3x.'kWKrҥj5D;P!q]%ʒ8x ,mˣ`R|&N_([A'gKX##AJ~zI0EM~Xi#! ؤ/^OU̶_1Q;b|h1Pcʅb(3DN>U{4X=Y8oN@kDcgYm1l},Fk>h .Mzj֔N]ilq*CMkW WD˙-@|̙FiѨ(/gflT5Nivc%Aޥtչֆl-:ԗ!|ZoL<~~X¸{"Q< F;U.9Om7z&Px՗0$٩mg9|UaIJ&PWR$h&E#[ꨓv<]cI$M9%JG$fAꝆQ"n[&!3c3эo2|R/U3$`wD|rc$[kxC! &rfoB]lIksؗ,S7M` ? 
Fr!Fp˅![`!<|ꙉG'X򲊅$k̓wf9uH8<V^)sԅ#Dѓ%aFmdCI^pS&}k-)D*GG9Qo`iW IB7ozP` O.:o1'gfuӈ=3Ouoa.'!=7JO%\4~ܰlaZt}Ӕ@c)RbET^MFvyMg?ze+JYu1>[rl*$N.AwsE=E.UdtfBp ,s|hi)k0ڧRl,5G *9<|czG&y ;!tHb)ĆW&/ZP6cf*#paP+^q 3=/-.|aK+iȴSru9l%Lށ}1 ۊ'v{Rݫ2 %@W6p[ϑ*y聘wl\ĴͩBC-{S\㓆$(jlo l[`UK<{ar#$g T \=B-fd%Br,`)QJ%RMd/q6i¯l*A@iMݱQls^.s=VΠDMLwwDA#ps8gri)q&*aYEЃ@'ls_-`n8frT:ZXq~f7MPKaZɳop-P25SN^>Rbb6ѮW_m岦7:`#kGTQ:* ?ޞ98(Ύ#G ~z\Y_Z .θ>t=+fyڥ}gZgM;+ ҵ5({{% u-\gKL]B9ږBKb+qVu}r,уɹO\Zdž_V:H HDGDws3:?Bżh:A-QLq%ض3P~', (XāYiJRZ/Pl##0%Kzp~^=p0U.%y/Ou<ɹzDh=E|B7YIS|D[5\g/aPr) $_fOuջ;hV(~jqoVTu,ykw-\6|jiQCV4V/2"/8TCL dܮ91P-ARk$LL-;cWptmVUo5M9oBkEmQ^)jS|Yܰ꡿8h=ULKD]K7g7-|I-7ꨴj~윞d|-C{Q̤pR415A*e;QPlXs?ZC _b]j]pT_Ȅ &q:.#a߭\úGODyWY~nښI/ .@ lE( Kgv e^r T}p E:Շ2`)rCms˗ ) N~Z-ԛp6EﺷٯA]J L."@uY;FvLH$+isQΆvlUYLJNggێr=xpbP:|eG{K١2bn*]SyE}0xB :gQ8?~Ğ.<E+i,0B ftwݤrUK$Mo+uCLAh8HI9y$:-4sBTT+k@afƗ)L\gMSΪt*wm=/7#G}34{?9wR:~e,Bi6Yqw`?|:]Q1jV1FD(J`Yb3wk IcSf9<0{tv_f8IH}URD&`Dp7SkMnçzݽ{10* &xڥ=#큚kClgX3? l2Dg'Ō +O{wg +iSMQu CC/ %i|L-4BB:U}4W)I,~VdzJa 8Aˊ~̐ÃeX[JL'Lqc|$="y6ݽP#J9UǑ[&Fz9>+52V/diLRX[ѣoB8 ȢN5F=%O3UI2Ғ.wZ'dN p*'˙ g+PM>,Oq>&DCFSCӫ|'ćmcy|3[܀3r]"Nrt(w&>iJWIk'@ST2L] 3vi/ioQEf'*%xxjFX:lL.nwY|L KKoeiu YDR%̺1<ƆX)/;JgmIO>^Cз r[GŜ&[6}pC_#-)O6Q;V ^m&Z㷦9pf |<2 B魶zPV6">6~ O˝+Ǎ$KR"q3Wni:܏#rm4/nxXz: ~ j昨4-YH-CRc3/`x2= B3;eTjU W&_7k@3gm%O 7H=;qi Qu*'{ozJc ~=C-yriSBD cy# tb4˫ >rsS3ʎa2|9[cS"*`I/oW{T;tyHNw$2'I0愰Jp3Goq#k~Q!-E1̪kjcMbYZ2[/BwQ3sva?o!y՘r;q'9%_u%!w4"eTE;cT"_bʼ>/ ԀLF O}ZwA~錪rBQߠ|qmiR2O'rʴu-hY55$O {|` =xuvkeZdxR= YgɥD߲R,x1A 9=Vl-arAp}z9ZCO5Bo'v)+> E8]e3RAѠ 5! 
-{?ڿMj^V>nE|r|!WˏR#Vvf{:i8,E So╓NNܩM#~7V`m"dB>L"3#)6n3$3\#ݷhVDйݪ%.h,ш 7.Li7W5>/L8D\a+/T NU!Btdk]@*mZ6\)cQ 05̹ in+h;uU·5!d\ۄ2@_o;ɨ2۾3]عm̒je஻_0tOȬxk ډiwgcA:C UdbiA\lI4ܶxTX*Ԟ$HW 3ӛrD;U WZW5Y~J207a^T9R8hW4eW<˯_#l_T '9*<~G6Jm!2qGBC~_q W$1CwaSXnL87-=}~N.:PzhWgS#?23g |V#▕wB2ukcbGS3&ǢIƙ ]aq`e)SYvZz$7pk %ED+ 3I[s|И,ybV {]k/ކKS#Rwݭ|8U$Mb}p;9A߅ N]nnjx~#%> YQqgy@L$L@ק Kz%?7=7`ouRX N!(Xw_y]7,DP+A{6@a3o >QlJe Kd+dbVT`G\?jeyX"6)~X([idS>}Yhf`eV1׾sH믪zrꭝ >|futH%I>CBwV],QWjbvh$uq'^8:Ͼn4O> A@ ze$Tj)<:~n↩-84#\PݤLslUdjɳfY|&gbe:5ʹsPou?KF)̬t\*\xkp}N:'-"_>#%@W`Ɗ7 Ĩ 5/讀d.b>z$sM33:GS@k~ZqCѼK;XTEUDi㍟tXf>DH ;QgnmRMp ʯ|I},g;wLoڍ\pZjnC/4ߛ[' z1G#j^\hAج2}9G?5*6HP:zF& m$J;2ȅGTA(sDQ= B"|2u˯\aIR{V:sFzҲ>P}!ٶ?.}\\ %l)=yipB^|R["c>/#ub~cqK E8aij3 m U 5ulmT#.E~5 9 x["O V &/(uyc{nM;p'b Q+=`~qh ʋ'-t𭿺K2tN1&m|s.v-W*I(:iS8wE.I:\'"}J1t2 ђeJʢRثnh_Ɣq)MFO#[Cc9c{*4 N?Ƚ 88u82(x w"3^wUf~`G" D>Bp='sQOۮQJ-Nh70{g5v"]|7!!'7c]Gp1mSZ T3 NLiW{|/ |cˆ!ikӖ9h c`"2cЫqra8 jL3:x*/|@IENd_Áv)lwTfFOS&Y`pStM9%̮uEcuع0BgΧq>P~QUȐH3Z(ül^يՂ?u&sV0:IdtIU,]V&?@qheA}.SpT2#X%H0δ3]$¬- k9͐7[3Va Mnާ/cYZ,?z/ "Jxi/T r8Bl5w5ݼw{JC셏H`CACz="c^ o b*ޡ\g^;լ0[4v}~ l0U_0k6 &z2BxbqAΑ+鱈>·9:H)1@͈EO#k)L.8Jj6m洅 ;4PWBw9;'@"!%v^U+ ކSԙJb\AN]2nVÈ)exwJ`?:?bc 2)[L] p;5H54+ N"]A 4H)eEݜmk*Qp?H}q%UwkB1}]}3~!.fN_X6->#ZU~ܚ-FSn1.{uʖh߸F`s`-AA:g) hq *ϴ$0lqqWFimwAau$৺:!O\2"dG"i{Jȋ6Bso% %+r9نZ(7U[z?|o^ݤ$%j|!2ACo~E]W]O!DbTfʠIv'&\{7ꨔk۩t=!KS1}*K'AxL=i3O#EBHKXPC11տ2gaKƛ N U4cK#݋F"K6>.y$. Ja^4+97js[BƕX1[=̔n(nَNZbE46n㕘w,r20Z= &Ѫ hG'6Db(j.#;g L} a"j8'>,^Af .n<՚<.2FdG AD٣|3fRİH Nda8!맖7ԬU:' ߓWp g!t: +fL\șXX2M:>su/qVn[٘]PўڃIq}"R;*acZ֔nY7D(4 H:y+ypain"WǂH89WߌZ&EZ^ ߜ~;ztCr((H$.VŻ: &zq~Y#+2\ [ 7H^C|p㣂G"94;ܧJ7 VA y2{ޘ[ݰgbGVF[lOa[|6 3\! OF퉧T6:~fY^1+AV3ADרJNUT)/mq?B4# ZG1'l(:et?]<הx\u\wC_x$"rrXaG멄w-{B-c3'cbD?ˠ35Yʫӡ'z.} bmYF0nfCX܉_J7h꜇|kVEЃXMCq!lp9ݷ}8ޫ:0h6.^ƭW ݊t**#-unR慨(Y2zrI,еTZDjʆw#'|s}9FFWVr 6]lp~)B)9VfQGZ c1V8që)EXpc؛s $u*UԞ#ˤ.wF-Iv%E7~[sFfq#Dԅ0սSg7d3. 
S]2D 6aJ `C NH6_FGcjF 1(-?-|ap֙gK)]s۰GIM}\ نL"Š+^^^ QfK(H@s-KdIXpXض'<'рwjppEI"pp̺咔pà  }Փ2ȅ:v;A>a(c˽ (#~Ny dhl+nSpUJKn}7|ulm6=DF 4,a4jcv Zh $UD!.u5`,E'MC XIb[UvkL^0j` X*QIWMB$rz \KOeЮ "$sk8w`˱}ngn1 gvRIΪ{^tqcejR۷ePP7[ m"_iF 'sSTw4\_}л8Sbzo'BuNYOOgc "c@(-?zm@02^Tax81 FR-dF#2Ot\tꦹ)a1Ct/s0!qj( C CR"4!`H㳥Z+L3uDq^w*IC^Y<6& >nS"aUv6'Cߧ$X2$y4/tOm}ˋaQLpUZt#Y Zla %$9ST>/j|xvF^cnʄ4б?Adπ&7muT/A w%0(cyy6q!!r;] t|=G`$ .@ D|*XUwB_x͹/ϵЎ lG\5^UKY_Ǣ 6_GN_,-*7Cl+8J endstream endobj 544 0 obj << /Length1 1683 /Length2 5459 /Length3 0 /Length 6562 /Filter /FlateDecode >> stream xڵTw<Bdds&{q883P*D알UVEW}~k??n`,p#hAa@G K h0#TABh( FCdIU"4)'C3t!h "Z…!pW(ƒ+QAxaPW7YQNgʂmpgච 9Mpa.0XjFƀ1 1 ?g@ Sc_ߕ3uLL, D| HlظpȀJ]M74KFHWB "^s|&nP@z';DA~78;@ gEAO"pDz~(?cZ !p0 1(܇B*$l!2f6 'cP濯턀(4wGAУ ?*iE؁ gTud)a @DZ ƉT ġFѧ F B rƻ3K@TJƹHBЀ0 ~NnBgεr9sH Bx.` u(@#1kHP'4NU!=wAҿ8$53R!='?[,u ?P?dCΠC =?bP:lE;(%+  J~GLV .]_dPy#ܸ8C [),5 w@cIqj"8Y;C h\ A.$ّKBgLi@SDcp߹_gzEuh$buJP?kaDp~ZY &% A$G$ZWʹq$m3AHGNj/rJ )XN"x̠3Qɕє N ? ;O.v65  fPS64 SgvS̆VgYEuA5x%-81a]Ynb50 qZ3$W: 1'1(uHёU}f }QqaP~!e!/q;qUW5ѳ< Xin ׶̱_:?~rҸL*o:' 43X)V;ZGlZj!ܹIq Eki'yNa˨ƛzLĺ524 lL'{XL yd2ai5fXtUG;M}`wN|3VˁiEcH)Ŭ'ϗȡ_5J-y)Ђ:jW_@mI,ש`-d5Xrx5*w|݋oWI`@IqMַ NuN 5R!;>;~عR%=x=?_P'ΘѬv]p !)*^_7.{ht0M󡔤WfTM!N٣ڧjZr[tA{Y5f#]`;<& _GDKK>f]JBi~8^P[MiF7#GʛߞDnCz ƶ+E,G?%,> nvNe+Fd|2 )6N+bhi5n{wrDdf,0IñoZ-b6 k' JK_u^bHʆ1 #݋)zS>*PїSLyyBIe:wE۩7| exWZ KE@?3Ӊ]@) $( qyŤ`$Ue Y`_ZIǾ6S'+nEIN~MѾs;?ĨshjsR=~%;b%s9:TNhVYJ) zVFIzb\$lfR[2 JBwW?(;0@\t;rsSku bioc,yMq|[6-e[n*僦r(M);DC!#]%{2?FC.wt㽕l)ar/ez>߂_wFW>£J8ԾВ:гTRiy1LsAG'd%&~7>c) [>6JmCy6&:=Klrb@RO-`A~5 *-&0rݏ;_hk۱J6m_XiÊ'n6zv>u組SLDVV}QY -ey~\ݭ7/~_$A(;al^a2#`c#理r-hzi= Zއsu#vfaU~- cK&IAC'+泦rdvIR%8Y`)7*O]->v@enXv=Z xbYzť> BI}HbQ(/MR[H9u.Kf)Ʌ7L3Xzط3/V^M=ܚd Vo"*_ . 
sM=he,_ _5N׬N< ;ίjk~<תS.ߙhnv ܸ돆 uH+#~1mjNo7ADi]dw쾼J!v$'L((e+\ƚ0Wi}+dnmO$ "0BC#3ANX/`ю mx}eO()U,xJe6owvUL݃£EhMA-S?>H?GAw+=4<>Ȭny#*{%tDkw-Zœ2pZʞvaT^K+YJض P!&3Ix"7pVu$?Amn<[5v_PasH$;) ,ƶIԳ9[8'xk]HQUubk"no ?k\;1SN˖dj09⊈{rYBmZ3 \GO1 v -p}!E6/j%~DcR3Dof2ק'zcsչI>'^9J/S]K$O "XzwzXR-:9H}s>'7xSQݬJ Y?J~+  ɹHuL؜Iꏇfnj$ŶOk8Ҏ%X3S^?Q0g^S 0}9)Ln^ٮ5vtaĆ$Yۺi+`// Ts;;i}qr77xQ}ˤygw!wf{=B>֕H[:Wi{rNWX|ewek`'ߝ-$3l+ , r-/C5(Sa'|RH;e}R82:%[?"0qU6CrEV+L37\#5 80Tjھo$g<"2qm]i(~osPB&ߒ \Д80+[`YWaߟ.v_Օ= iCtlƾ\9)%+L V|A |ko]mOeH=)u3NUm6TLLOuW-/|ow.% |z$[!g1[O{|VdK9؞L:8L+$6y<΍hYrßaOv}ޞJT~% 2Mm# MmƑ"iFo:r{|;<7!#i:&QFec6m3䇵  :a+ܵLVk3 W)`͡3c0 U*@>.P*?'+`@I0P>'b3\2kX=y\Ds`SfGdБN'}5{ 10WuYe :[,_eBH:i}|﹖tm[gnw@69ͤUM<9@,k>uފ` ]gqiMܱvfIӠ;hȔ5y:k+db,k.1V HYr!i= LKJGΞv?3,o_`x{mA>&<3G ՚{ EgIhT' u |4?@V ׽LįG)Үj^zM|"T5OCAۍhNK1/׎h6> /񩇲}:۰綯R^̍5K'VVs tٔ$}TMWH}0ӭJAyPҜp1si߽vv$u%V7C,L`׷M|KEa׉oy `87jX/•Fbh=CԠKNWc_ljl+#$KF&TF{v6XX?тD $)}{fWQz?~jL!R@v0lA8žbPfSOXB }fw$Ÿ ckvr)ZlB6o^ݪ/6x"}Q@8`L2n!&I#F){?b,GHrp5Ea=@pn&!QV APny}.t wvOsLPsj݇7I7SG.ѳk6_?=46ڼ!bn50x ]zbg]ozzf-[!16ieCx% ȧ]_~ZM NCmZo8~3ԍfȠfP[gzϹig!²~:nk̽x=^OxFE?eoJ^cň[ZD|qw{r3YE&qMOݶ{`3ݍ5m[S3<ό(d\ >2Gξd_1BYI Y3d9tzʤO"2AwRz] ^KGSgp&% 9se1M񨴾659MRC#]ϧtRӪ+3E6OIH0oVU L(A<{e.k‣j$rMG5|9봓roXD|hǥOşw<վTPqwĚ 9ۥДlM yZܗ_pqLVirH˗~.Rkk)$O!aPde\7TK-!{6'"S1vN-FjhXR*\#BYc? 
g#x\ԩ3{S2mN5BMWJ2/D[c>lV6pZ J -1QrWe4㖱HPǼ@E{#J.g"x`9Aԓ5m~@ -pzJt=:;`^%z]e3,[k~FͩKr타/=(f-8rbZ\r 'RH 'iҬp<4yR~bitaYO#>BkXO]mL@rJ~ufra7uZmUݸ烝N uew@F:Y*&͓>ДidѻݫS :wT4m[ 927/wu}> stream xڴeTN5[pwwwww2k5Kpww ,8y'& n$UQg57J۹02ՍYYmLlL,,N@#K{; # ls"Pv@'Т)t1th*.FΠe"ndin;;#LŘrF&֖#;S"@dF6f{3P.VSTQe%VwupwZ54J@ZS]_ ~sh7w$+7o TOiP3'{4..|L.LNL6ԧaa pw>6q3bW;P49Ik$%(dwOa !\~;/ #bTTFv.@;#;3?6/Д_NN9.f샍_w&vΖ.YW{,)*JIk0*ώQ?޿J(xXАJڙڂvF-%H'{'Ojk;{w;1YڙՁY(+ 9:&̿ff3#gl8}^o 04q8 ]/3/sDiA` 4C`Vw?'\mll4:ZxxhJtXXKe]@/jgn?&g4 }`d5X<,A O  `TYG0rr2D` ''4Ŧ@&dg 8~"'Y_ ,q  `f0KA 6`@]+A v?ĮbW@]Af?Tb@:]?iGPN#?@?h Z3Mǿ-,NLVA393@,nA X@ ,^l z?_XA!U+?ˠX#o4/YKbv:Y%+5 ?F@;[zI j ?R\,mdw@9\ :P_/OqL^@Q`s5P/E#'K,Kd?"-&f`dVn.VPkt~G@aeބU2?/ДL+tV2;p%vɀ…-T 2|~Ɂv:l6[nMUE%Er42ʻhOr tK8f3ۈcݏlЮS>Cͳ6c:٠{,w,OwzČ2]X=&ۍU$D0z^' 0"<6ʚ c-rȄZ- X8;S*yzO>m50Vl3moE'/3=&[:6m0^>dI#i!"J䱗K#{IWfe('hOsgT,--4i&?(S>>ė ̚i>Kqb(Ym9|EQ U;DZ[̧'\: kཔ=/1 dP3>S˪Z(#j2NJP0 IkXùK, bХ f$61GO0TnL]gll xbwHE^6 3%K yDot%D3lC&6fJ;?y]]c+f8ZR3&+E4;ծXsWabH\cB<"amm> A/a,$C4z'ϚÖN$"^R6\/ހoФϏ؆zRb+qng,Ҿ|)#]bƎ%(m> aOsϹ(<tz>"k*_edu[:?^ۺl!5rz&DOWjJ]/WXVY0έU XrWN-N|scDvz4M g}}aXrR` &\#m{MTvdԛhȷˉqjh7 8\a8[R4J׋h\2 DW8I=H=<?kli,no=Hnj~REv5\Dd!|`Ð3@%q;{B`֞'r+i+[uFgB?@',:oqŋHqsg4!l)Ik"P/G.e";q+dC&#&$؟VUEI[3.+݇~d "QjW,KsG^3U3CހAxfqf쌠YE3>eBt\>x=2Ćc3(F LSwڡM3ٰ8B:GX~:r|E s,/p}דHT^9hqK5YJ0, !ufB$y޴o( kIّ=ƹ*GˇD 9 >c0j梛BTBYFqu[]u&/9? 8YxⱹMc1 3Vs[HEnu{eϚ+X;vzeX=O="Ck%ث?'k 6 o~$ D&3PS( rA{_I 9A{;h7P-'F*'K2Q8N6\X!ĸd C{[ӋBG'v* ʧ k/jU\N݇↓tBdʍpA^C::$;rVTٲH=ZHTMoHSx y31\Y&l>xl;p#FVfPMD`BbԌƴ=ا7'3B_?%q́c Es 'l b&-;+}粁:|'&<=Zgˬpy+5 a/1ON_z4 }:k @CUy+j*nx0f"ྑ 4#.;|DR5aH*_{4F @#jZQS!>s~W! pTu*ʜ>`0Ds1sV]yolcwliw$ YdL8:uu'.\РH ;#C|q6s! 
r=fνk\ ^Hxa#NhE#woFh]] 4ZH ^PwcLJ.> ?ci'YWJ>ԱliTn*M^Hj[ urvK0Ocnnkh( 履-l+y/1M&}q l}i`"Hc?2ֹ~q f^e星y^NHq 5=-E;b.Yo(>e  iȾ+^v$Fi5Lfjwq$հUFq7CwiOWdKЗ ei.F 13V$|G"fosTӮŲП;'|7$.bfEFiNg+yob@ʁ,@#ع$C#dʗV]_7Zm=׷/$HQxu?biutt{w(/v:k aywUT*Zx)Z!~濴j&r)UŞ2 Bk:jզzn9œY).i}$ɟ->G^aڐ0=!)Ҫ%yIE֬ma aȧ5sOk!~l U#8HI ՜?> u_J(SPޫtE9 ry{>X{܍Fߜd5 µ,bsQJb ̫ *RVţu#v.LAc(mpZ#oG3-m<7q6RلQhnoq^.4\hxo~''ʹ;uZ8kYܨzw3CQX܏LHU}bCUD.JgaebVA݃m|[vCh>+ڈY$3Q/4 %s`?$Wx}>Vม2cBaH ٤&Wxzjx0,3vFA"*F(qxakQ Xc(6$6G|ol08&us8Ⓡ#)U]!: R&btX[ƕmB2OV߮_ws0~2e-dX]gɈEZ~n@ū^ ^lUF?\y @ic%@mW{pﮖ<9j[>@S}f`6Sr,=\%2 $uGFFNFzuvk,)t?&q<CᔑC$!zٖx̬Adb&O٨n;B Dn)>ɒͳi+6?6 :FHZڛYK^ae5 >tM-=|o X#S#UJRߋ#푪r8z%hܒo-loL3x^an2WWg#Y]u)vD.X(+ iŖ(J"|j9m_u En/Tj!97 :NQH9J/ޛ% m(>QZǖ5nĵ]g; +JGvG$BX # ^!Gԣ2dh|i?9dq @)_|}{e ZŎ}x\dz69PP~ <|a˫'[W^!TdcUmܓd%iSjlQ)]gn;}ςyS01 E+ܫy858߽v?DG0:!_Rjħz*֧PruO\s3x5lBr*w7~/g6X?nP@_rntSŘ@({ qPEA^0N&OJe)iFu%'%i&M7LA0^gg\EmnpqwMN熜9"b_LL%?<0/ҊY1YZ/X_Lqb! . yLJW0i>g(CPA}ioNN'*2v;}.(;Ç :Dx2sv#'tJlÆ"xy?w6EhwQIhUnWuԖ"MMNtyGnY0a8lsvTax/҇wіolv}TX5w?d%*TzMR`Bw)Ryu+W` n B)"䰄cH^|fJ*{g#Κj NZ=os_e[ԸqY$Bk O=3ު*曼aInűE_"}]w$2hCZ!g E0.PA~E'!9yk5Ӈ}uKǠLq BpB.mnFea:l{ 5缿ݮ'acN9OKKBl'!TSDNdɼƍ}:yI^%qƵ7sDFNU-uI͗?$WDPZk"ͱTxtb8;CtgjdVvOIdMV\TsrA%%+Fin) ].Oԥ<JT919*dFj=Us0Nz~`OD`Ot3#X=@9v˲Pf2ޢ49r]6O۷vEݣ "a'I&dk"c^&&'LdS06;`q1C]NNNYDI yJFP'8 W Ӣ5~GEi*mڃτ2.S,HTbz{r8oOJ"0Ŏ=:<%d+7YbE:yzO֙+a b )h)V9%rCܾ<\DKf'kW޸) {!(b/@6~cPmqKk Ei+~Ԕ{+i兺Wgfʬ-6҇īO&9;˔njVt@F.P.܋swoLTO_"e D78H*ߜ%Rov=c:B.nE72(ݟ G%KXM@F]Nǔ((BlʂGf3c/Hǚëqd:DzAgU!<@m%wa ;"sEw4#,kpǶ •#BGZ=`Qe}Ѽ6PlߪWDh2ǽy']Jת ye}K4XSy_H)X%)q1B bWA4Btj2:f`T&yP*;ʗ9&L=K" a>UީkȺQnKb x\3H5eUBooS~풜޸m|-Xm م`D΁f{1枞G6J@ƈd_wvmBlJ/%{A$YNsjeNkWH{ђ} 佖2s+d#HJ4>.֩àzH^22 =̴bzXQlҫ"OɠR͛k[ ?U md%ړƚKݍ@3Jqρ O= J0GZ88ou{q^Cಗ:ĽMy 1-jjy R3Nv5.0x~?RqЄ_j(9|NAϛi5|x]M7v6>l߳Q8H"4jc+&[ِ!P! 
bd {myl-GfW槉e yKHuXPR|ٴ ^G\mo<` y.pBQ4>8ޗNrx6,|`W2`ˉPj6GFoͦ.eIt`uA<-cܙ4//xb (Q~xb/seҮoOzЌp AIynHC},vA#t;nh:3 u6ZMAex2B_n:֮sxcf8)|}2.+D "|΁zj<n#Z~uQ 9V:[LTfAP5AG= : -f㆗IYkR`,YbC\'CG6 /9 d$X6 Q%JuOj1jQZH<8o,cYltv[o봕pSsA=YPyheJ,X~yrxN$f~xW`* c"QoEјx5=^fǢ][%MmN:t~ƹX[Y]ZAUa"Awr y N )ó/IȎ\-I1,g{:oO[Tlyi_pNc w|oڠ ?tu8|7MD>j[9A6 9xb!.4g^ITa~)(QWcD⩑0%w^z#Qej:$i_y5JSY ~?;Pv)+] 5va`wj[C[5r`1_"Grݑ~Y(|GĂ0o?m;-nhҋ+&!`v&Gz^Uu@0| OV5V/Tht !m@*mvM<MTVT&]UZ;>#+zBŊƽn xmI#>WܩE0'9GuuU@R+O6mX8uwX׵sR8!. I9!YIr:p[kX`t[]⢿ьÿgC "';W31܎v 0;0L@J7|R(\hH]4i$yCΓV9ianP%mh`nBӑBG2]u,JS9}E>C=fi5c)Z96[x r+ !뷹Nm 8o&pX5%Dևt6J>C#AJv{>ԞPz9j4`v3~Olo{" ߓ /ok@ .,T󱓓ˇdPbzyAos@  2N8B[YhwL$/*pr~;MUTUMjERBo"O=}go[1LܒAYa-7̕*ޕfa R\/FQ8yabM##C\[[~"sa-)5(Y"6{¼C->8Р#ɀt-a\+?zSX?+xUs1!umrL-QGpT"7jD O<.įFz|$ev2+Y1$$nF_2!M7 W&٫IXUʂНLh)LC]gtp_ 5u–?om!VP}{N%>Y_tJN}~b٦0Nk^ΰ$bꎛuzco̹Nd(=OfXfxL-~Ә?>Olz҃F~Y \TۧDQ(RLI_>d' 6;E{rN:2Vo#aMRjm:/cO4I. w* {YzzٖΓzˤa(-z7Ry= pQAಉ/-4Dcb䞁ln99By1Fv)?r)5$]ZŌbČ&V=h}qGq_&?X4T  Jү7?z\Sɾfϧ@P6'{eZLH[",71 ;n'd5R=$QДmpFJh4QLd0: ˸ݹT!KDžO3Gx R~#;ްAT͗3X1hX`Խ4%ZTcTI{_(~ofp:H8#bӛ=}w!k]XX w%_=xֆXra]ڍiks9[}mTEzVSQ+UH%;D w >UK%P^1j[$go@ F"0CgЗ)ɪF.©/*{ ƍ a~;&?4QEoCSsXh4〕DdxvJ},UQC{(J,A3C..j aX7~L.Str1"s`ȒEMvF$:h5TNM/\VM^gd!dÜVJ#TQط8}'Y˽`- 5WcQٮ;o~kYYߢ#F/r\Un$ 54sYizWfR-+hVZ]w ~Z.XOsF:u(kתъ .ܞl t"Vv㞽mN/Q3rwɫ41;e|dGjIw^ޙN,[gU ;)E).D :Dl䴃g)UM<@ ӽp"_e1XyTZ~(ONKY2M` [ɭHd?L?QfFX. A kWn4zIȠl> 3Ïw1&·Z$+= 8b^͗@7I QN!stq1r֮_{ؗmQH^j_1I|j˨eD:=EUaaoi{$' XϾVKU•6Uj2=o0'}|kVpNw|1 rovL;vO& f=?uN9f:jzpi,#H X]eOc_aWEvg7[}ڑ |fƍd~qG,㓲Vy|.px)yOAIZ'Uh٤Q7u.)P)ܧfd!s"s˓K\Elgn4I4 Q;Qm:dRkSH .Z:`-iuwO)Nxr'׷T* %6D/mCb-jČTۦ{H!$7@~D8\p6`~reF ۯ3Bw 1p{[FJ Odٌx,M}_'½4p ]m9RYK7|45+~R)n<ͨ"OKbM0(7u9Ye68StdS J)L&37M!E^.Q8%,>"5h6kb!kJm&k2İf(ǟW4U _ɀaV)oF_? 
O@t!N(McMv90%4O+81`k@}nzK[F3NyEt?LBH^@n5|Jnj#fa8^RBŔf^_jh?G)]Vq9IXlS3V!T B%oJQjhi 7f"ocRs`Đ5U  VU„.:_k-Z(J+ε۟-"&FET̪ԖriaEkg#T= *6g FՃaWW `yQh[]^A1'U._#A)B;;P8wu3{v&EqO))sIurC&Ґfd>a.2sR/_3̡BxYzJ<&D~Ŋ<΅ѯ2Lҧ\K䚷G'ހU̮kO1YcPqZNy-3*Inf|P]Wh͌`NIz8[iqd6ۜxbY~}΋ZL SOqwqJ0(hb {'an7DףIet dj~ R5?8iP)+cI [)x}eaijԆ uכa5=?n$0+C($ہ]]zkP?YOa3]mj#u \Lvŀ.T&XL9TVf'i ZѐzyRR6A瑏rjl)^Z9w֞AN$T!آ,cgX Um*Yݹzꐽ@ [aSꈧdIa6f?,\C}{biO_6y1Y'V%暡bܐQ:TGmNk ձT589Z F\?i9ao;0KȑnY,îSݷҨL2N=z) Oѱnƽ4@ ;mL90r8&QĀŬZI \"bOC!W z CX@@1"[K擃!pevĒcoO,؜;Z64PE ?*,MoqJ0) 9R_͞5ոXKXҩk.o7BpC }ns^&Í\+kh!&nL|S5@[7pFT$/f"䍦OO,) -O65k@26wdYvKg31xK+ܓ&_[Jγx VضX-ضIkX'щUe#]: /Z'l w4>n(gbV PfsJBH2,ʡY3|` ߋK$Q]7=G7pB3 IX1ݏu5/Cdro9+@JD)DhJIi#DRWH*9S.I!ga lGBug@rM>3.Ң4tQ!+m2Tۏ[<%<$[Z"E4t|[^_M,F\X( \XV0OAwAM䫄0VvXRjE7ztm[١~ ;ګ?Rk^iGHws/W=1"t_rc&6?gª?uG;ji:IFZn%[5CTEѲXi78̙K?3,\OC`Lٔ~Q} s iM< I&z.ᒥEHr0p$(ˁ_X(/8g(ODhgb9YBr?SU8ܹ/1Qb>[->lΰ WЛ6vd e3u_O&𙮂$')ly:.M|U?]gMe7>xD,Z:WqY0p YO.hȋ0Z6l65W?2jg/%}֥̘1lpA~( P> wfG2.z0}}I K)bZcjdXoN,KbEBU=!OxZ{sp^ cꌖ%p7'GO ׷_90=ѿ[_vJia>Gǹ%rt8[b?@J2ԩO.n ?]wɵ͂!k93_+wIK4d3>~ %l}Nj7!\QC]:}m'{6tNa) {fa+&gj@Ku3o˄t{8c%c!`Б\E'qTj''r x-ޝ*M#uEYGLAɉv#&#ą6+uțIbl3O`W HZ}"f̘E<#\o ~gƑfJs`p}#Y==M#gl-sd #* xrMTo"J," {"gR!!?> ]u}kkHYU:o;lٷK凮*:BZanHNi6Fn#Zb\it|zH^;#f0)޶;jt2fCɸM@{_44_:m*0ۖ{V N)QӼp]A]ОyQ0cv]f٢'!lQVA$,>ji; 0/9h:S#[4O37el#|Ү:l~uHydo25a*HM|c11;t rcHW a:jje'YH6@Vd: <nmfG݀{=󩵅(Lw8-p4tՙ.ğ4P/a; U-C;EQe.q?Ӱ\?Ty*Ix=O綢ߘ'G[k qD"З@O!1$x}_=>}MX|]rڛq/Xȅ/|f Ai>6< `e|1قx"+N.WCzͼVe&a4c2cϝ]_gOF?^&B@ģ.H")Åb>̧)aQ1X:V F!Zw8ӂ M1.gvk31Ac 5Ϋ_,B윣C*{EZ 8XeIHc (B~(UTRa&^~b3jcqDC+֬Nܺ1AgBJˣAHaHf &1ԑ'ʫpMg#np-4,h.JnV[26?̏p8u;9c.| X7/DQ.m'A'((>Q ۷fbiM;/OMILTOESpW77& pD3ܘ!r<ʯw6Bvz\'u-L ea2 ª(J[kH@ yǡDPdr_īMiļ=,g0"QqdŚl?1,E[ːg:s'豦d~tKR ƁRgnR;rxIP~޻}!.bPC8u<_;ks p V S7AtZF#nZbyV2DRJ~ډHaDl27qa|#VMIV\? 
LD<|Hey$6~P5 kzך]v ,Lۯc !IuzQ٤ƴ˥Mvu8 wMь$ԏ Tb8j 2>nLͫWK?_F7]y 5q^U6dE'c\D!wFJiiH@xO \Y%VVa{(@m0TmFЄ2`{sF%!{o[ h!;+ۇ WP<{WZﱦ\eLlt"}աfTX0D]y< ~Lȅ"*nD<łȈvS$-?͌O)xe9ё6W)>S :L(vTwCvb6t)Gg{{TO&X}z5zD "p|\}҇o΄eqp*t?oEV>0`*,F/Bk\B)|M50=+ܦt饍D\, OQ_xO"tg<(gd yoKʷ¤uM@=8X+ØxKPUsP/t#~2kPJCeQ\ '̛cmEI朓SukM~#GqvjԂ 9mrMVmVAĭ?B@8]kRܾ1<)tsBK+Lه nҎ&GmlI,ĨUe i6DNW]#|mK{/*"d *UWAeҭQ`V>Ľm f ݪk;竘sՂ\b =1-#@dz< -)fߌ11Q k!gVu kXL[m~+ I (A1&[K2Lc7ZCݼZMѫIhS' k'H{^e jam`z,|`Ke(&]]^\~.d.o_ӝ>=$Tڃ{/`Aj.4D2uZ?tl@T"ΜlpO1Jkt 6T5{K3θ 5Jخz9Nqہ(,fW5¨]d> PE-:3ܥ?[!t'\s-xEW'H Vl&KW$ i|V8^go/cz*ev9jߩW":nI؊L$9kk|tK90/+2W&䨆A+4-z؛[6)YOUg S{rm'LkFeaNL8$G@xEIfOmYLZ` z٢^n (lU㫵9R"م3^fhsv<# ❱ַ7-]M%U^ZD,"]Vu5a{Κr%qZD  mPa:p%fJ^Iu9ѩ3p?fæ/z;řabE+! j v(XC ,-(= i2Pi[ěkZ0ήN$K{!XjhkG<酯t —yKa6{͚1iLJg_EIAV2=Wc6OFE]`fAPlm>8j;7ϏfH]ӛ6~,?[??gSVjg6@ߚ:ëƛ}B 9 @ۉXsXRL1iKW@,N"L 3Kv諘(QJu}4~ivхEWU>KjӕEzjkQ1Z4$Jm|5-C\U@*ELJ$hq l;a8+A^{9ZǾzd˱rVg &]*c{O;sK2֒l|}B̧>TvFVCU2iZ2vjJ{&&\$x%A -PS|hO2$(.;&K+# QY4VR*r饎=̒5D ##|<}*4\P }DNChVf#zvJpq90U/V]f-S=R6o}lYXH,R&DUs)Ǐ6O&\1 )v}>VRLBBu!\67DƼ!JHMcqbd4рIcn^yc | P7 ] Hcf`Y'R5I>TSZ&s5Լ:Y1}ajq{UCbǵx@G4vB* Ҟ?%m e*nS!%{v9"m* AҌSq$6 ~R4L\5@-t[īh ~؅Qt`R5l3rB'!~|O=I/ꟶYepUw￷@Mk)i.R-_UC\m  qVP)rhj*~L*@ՀúԑdߏMJ٤րՌmH% jNL(ƈ獃)$ {Q8m-N m<ۍKOJ"҃$jpMDCPp{bۛSnug,QdD4dŶF.RB,p.9MN'H/Zh@YqJVCF]g!Y.o>A؛ҡ)mq-P{=#^R:&4 |*rU ڕӆjC}ݤ$DZ'mÓYv H3[ċy4K:'RqQMǞ iu[Р|gیy  r:JIoimV8bA@~8|MP\q|ijwHjZZ-b}J:f<[H2oF;QU ~MHB endstream endobj 548 0 obj << /Length1 1727 /Length2 21289 /Length3 0 /Length 22336 /Filter /FlateDecode >> stream xڴstd߶۝tIWlff;wl۶m;mÎҿsϻw}F\s}k"#RP25ڀ262 &F:a[+c3=##+dnk#jr9@l 0tA*nv@&?#C 15R}ڹ91XF-L60uq4er.9`432ؚTUe1%e2G`e';;[EDYEU *$"$TUm 9<eTT4Ęp:8MT>\MlI4\\\MAvVԧbfpu|Vdc t̀ w62F@G_'q)?Z>_G 13pWFAA`m`n} @N/И_"Ns[w¶#ӱ4p3qrt6q4w9+"`bn[93SV GwlApp1oD?upcOB[غx`sc}7vcP1wJߘ)`@W#3_/O;[;#q@p4p@N@OA󏥂Ot)[׿J/P@>Vh g ??r;YYX)if`mna[,m(n 4V0R  ٘Z?HRǾcw1F6@GG'?*Gޏ *NJ0pp0pC``3@h`}@[S`0 #112`!`?DV ?"#Q*]З_[?2nn 2& sWm1}dl++1X.jMLϿ;t!,/X$7zOAqџU|אZN! 
Hrz%i`Ym&TN+ xzᣈ fѫ.uSKgiΤŶTGOD:#'ѯur] 瘚10\::A}BԳyAGazѴhT4Ӗb5j!*Ċ$o]w?C:utD͕'\ ?(vJH$Ex$+S,#.iU&nuF݂}FmaV1 o= L֊G,7רWcwtN}*!- хvTzMIz& ˖WBTz{Ao]j,8/|ՃHȹOR Dw {ɰB|>޼i[B(wΙdc*3pAh @t5`=Nݏ>q}#fCn$;f%(pIe ;ˮd|M{R_MT!:x(ΦzD3y1%GxMOQ"]bL4r̨0WSBLtVߞ:@)ɴO" #Xݧb#k5Sj\?eS\*5W[L̢~iA >/^:t'cP* p<ᡳG"/rv)zB . 0z*kEJ]o:#DSJaR!ݬ~=|c S-Z|Y fL].R>R/r}6)/Tπ^":'&y4-!Q~ ZT@IoY7dH[a=HLZmu#LJe[Oc&m>## ؿBP 1cclܔvR!QHqs#X:ShewZj*/Y S⢓+%M)y4UT7usn*$n*~.U$Ac'm$%S9=T]koAK! %m NK"mPd6*sP8bɁlu\%%G)MWI"܆Z!\X+d#n|D3<%pQp;NH*;{0hK5Ҽl gP%8.;Y.ͩל VsI~Oٲci26u{?Jݨ|8ҥyZp#KPv,9eCI!*FIXu2NtFzHscn7o0}dea~j% Rvj'jbU`ɋ}P+x%鏗{ڠ!E׬uwfHwVJt6sxn3צ,\/felUc YbvI: jo((2] 5-?&#B{9k?Ӎ*dtWU1CwxASwz >`J+lIUIwbnb`+lpNC8gkEkXʎHwA>UI7sGg"ʦǢXc}{ˢv2es!G H!fAN#%T#/ sCpw3>[K"zx1`0EQnyh\PEçM;26Rq TU1(ɒ +PZ֮R"5WbXVvZ/Z7 vFݚVy`Dsکd0],8X@]$Y vY@_MA?EЍ%97^QC0m彀㷼sO\jµ"۬Ha3+^P¶գ10Osz6tّ8w&^9(ߴр%̪t4`q†YU+s? ?\Fbw R©G;r%OvGvGeOI,p[T8ΕeW{].vB LSu;8u-}vڨcĮݲ+ b:@{J?1 5׌o)]R{AC< ]y׷!/_϶6s8 Epɣu $jq3I,4gJ 凧bT`7^Na)6Ba+X.7E[lˈa,?=]ܧkzeXB@oVT_LwB|jkYqɏ1 'Ef8KS&MldbWTZ\},V: QEQz w)h7@YNm6c"eDkF`RfIB,_3[3Es> #p OoŔbd[pa]IQ~*A?~ҺwtH!D{/ճv~m.s%\j{?22[|9ehua|S撌wABX\j+Rc@_ NS")m(,MX^T !||enڑ} 9emG66fkI{e}>gQxu, /qW?$8^gdCu^7A(}@ɭXf> 뮌:ǡ)!'{)^b$]T/t4j i:(WKx*`|{7ỵ;xkBRo= ӷ;hwQ>"#Eg : IJ53PKwE(dңJ#7N@l˯:n"g(jъt tj.K~Gy's+ȓl?QL%ΔS->0{<P {ufT7)NC:'Gc"߹fcli&<͖2:Rm's4z҉Y鉰,hÌ~)C-d"i{O*eboŁs_n7mu1awύ5$)r@jvJ`#)W~<佗WVhd1nٲb|6K*-ܴV&57xqb,hc /lp*Ok>Z_m}Bg=*V'= .['D9&lFv7se\b_!cݎ[d_c?II|B5a!VxFuy - 2lQL=tJ?#?kH ߜx,9yyU棂|:>ii +NlNxZ/R؉V<Ӳ\JkRzd]kKsn˻:pa ԁ!L'lUwLZwBuq+_ϒJu}2O 'MԾ$q+HJՉ :tژ@x]ށoj@$ߕxM3I[>:s ] @q%eVt6hۧ}zAVr<ʨipK]iļR^ƹ;[Ԫ`yκ/Xl FTY΋c{Ɲ$lAJ-aAg8ȓ==pl+lE08m$Eƾܶzm9T꧂, (]jJbI- Ќg,K J%3WÆPщuv< ӹˇ'eŏn:79 V+ t8QIf6OP\18#Vǩ9E~;:nPuB FhFpxn#1=9>N'$aVt7"Ϻ):]w:cWܗ)kҕ18KMx Nei!Զ(k^D*i>Y⌴xn/bM4݋HƁJ"#.y9_<0)]= JAG,urmLǩ-č\8ra[36.Qb~-58˓#SLuI>~czʮ<0+ zHjUY-M!=z@2 MTwu@vp8n`A&|wIEOqBۍF?e9? 
ݘv(R|m+|PSQۂT6X{ql` Kqfz7[ο׼@J4YpƐ\%N~$% c)<=g&2jA܆]߼]~3mUPsE <^-9=Gƹ;juHDHӚCr+ x((J}m>iIJl M:QԷ|l!!WŻyʹ t~X(0M؃F"^WY `cIŽ'e bm0"s.fg >gr,ŷj~̦3>TJ$ m&;< p.ч_yF~#z03 `n@'n<_$kΫʬy_Go\~cAfRVZt-k/䠰)%~5^\/z$+RJ%: + Xp~0Z)k9~kU乊0F4gg>.W.ẏO7])`ⵃBg%@&YԎ&,"by3|פg1Ub^:nHqȳ9)P([E#cMP--0$ä́_Ju0onzc(~Z+ pb7þ_y퀫)XьWHS`]a}Qr)*25\z]xyͷԏ!>_i(Brhr49Wu=x3C1.ɽZxI]Ix&le˞ }bܢă+[-7zr&rfrP7ГA.8|1 1Y[Z3=;SXGp` c'̟"?5\)Wt)!cP5A!ݕmģ70^FPU٭gx^˕IdC.w%6/1n.V"_go_$k"0F"K!JZ樳D5E$vzb䃪bn*1Z\ 3 {šaY pg5m}^Y2Ϡ LCNפY9}OW)CB@Bq5w޳mEJym%~PǷp.+7ܲ~n@TOȀZ6K`K75sNgxwWm* @g$UTAaD+$ͺ|D$A4bҒVNV]Wv,H~ԂKsB⫩?Ίڋjmp.~%E Y|RR85)V )Tx?u"dL2z[۟>׸d㉥޶7PSq4E\/VȗsU2c'4PYk Md3Wg$CM"$FiI &BjZqd%PǙG{5t;AVX@g{O#"yFf~aUhq[/Ҏ6%PyhW#5 R׆I%9 6$GUӋ(RO;e< ٟaƱ6cԳҩknHa1R[ɼsQ6nJ6k);~f+&OI*nuZO*4DTZ/e%9nA!LkA1'Q #MP}k/Iƪж 3-kXe犘O_+^`MK}G~%Օjcv}\O}[~ G,KL{xךhSp?×Y"̤'J|~;irg:Nň5pk޴)|j4 &'GljD `;y.ܨsìۻ8zBI&0ЄX |;<Ɍܒ[X12P r#o]Q?Q8ȋ4Z ' %pF~Wdt~3hlc@yQ8PD7KUMP25$E;2cLqu1, ՠ?F4I O!̘[ZWC CqY`L3i?ϰVXeeSS34g~0LfQA#my]iHN#Tfs6|Fc-uj{H l ĭqP~9)%UතTbb o0|}zf8ϲ,N5}Xϣ/ BQyF=U5/?l P={q{߼2311I!4zxXcx jϱ>eǬlA3Fx޲!y'D`?&u2\b=tȌ;93mf51ttH%~.(S4yr^`Fr\F,wѦ&KCiU-893E%HQ8.n/ x3k*/4n&],ZƵfQo6t8>?wmL12FdBb8Bbd3𛔇tM(28%h*;7&gS.[E/뛊9c!`7U@c"r˻"'qaP<>n_ 4?Q+YN[\$VJC<&Y?|R^MdC[ $Irr]D5J#ܵgi&ٰVٶw{,wMoχɛ0)5gMn%G}>Tۤ,7￳GFa_)UZ 0Ң3Y\ΔZ3fqm6q%=ѫV,҅ToĽxj`g'3 q3mQ3p3Mǔ~"ͣ5<5;U]an\U gCwdFh`H|?Z&!<,z (Y'sU>S#v 6 Ynq &3m7nƼlrN0w -C.nw83Հah<5,}.y0-8?3*g . BFުbUn$jb}̖z#أxFMCH ٥sDME ]Oj!^dFP1#&Ew Yl{@+Oi=cQ+d_~u !Z(r¼= Bi0M74E7 R#,[:+Zv 3xlo48=dF~.[`bm]A)Ƣ<FSU!+|池Ͳ<_ \~\qʽ;UvwpѶ3Cz?J zE#6ÿUcu :2!}#ҫ+Xߧb SuP8UĈݩ׆%EI(⥝ϡ$%@uKv<oG;&\ ?X;\.yB w6)k9`n^vX~s0nnV&0i-38{*'`~@:tT߭qlEg+p )S߱ `"LviI|X՜ЪYZNz訦GU6̈́hP-L+jjtZ_cQPu[@;YhYiI?&c43ňYe'AfO:6^]JChhut)(hQz>xj&s-bW.%Ȓ4ꊊq:O[t`!xnw څ2'؃STvr޴|$\i#%?嵬R`f ٴnDYH0^٠# 7B']Wi!gND Gy+2K zqFYQe~mF$` Gg0 ϮKexUWTߝ~_=Z7 9rW[^Ӽw"(K&aAԮnJpB? ?@J r)nCi]4W&]O׵%7i$f7ɦ@3+ńt=i%d-̒9&N{,H!"+8.>HQ_dG]H=(LNqf2ZQs^]F霁`6L-fDnU G}rSN{)+/ x˿߈(8\&XGSw%; #hdvMB-DWyp'f@BxN݃|)G!Y fHMmAl16=LCj@ݝR+)+ͷ5Z\P[o]W83IZ94aU5-*c)Fީ=vI۞;6B B? 
Ubљ=-?ku\9nkU3OI~GE:Eg@cŪKbEO~]TWƵdк'k+{V/+ԯ94^E-!θmN, {Ɓ?f۹xDrQH<&Zx;YBQj&1l6h1Bumּ(4E 㮴#wjMTr62#_xY~]N;/CE8𵠰ܺAlΠƮ8ΥR+AǖmMm&鈔,>"s~7AL ›!MRJus<7_d);Su/4Ios_a~JO9|&$xK/Rq> ?Q`F\iEkwdeMi,b;ٿ.#I~0%1cc#kQL >*$}۟NcaLݬV]gҩ6XNWqzz`"ij/GsTe[$<F.ȼ_n3Δ4EPH_=³ۤGdŶX|&pQk:}y- O+,|}9% -+I)<PiI-:ZvR3R63fcρ5fOʈF?T8eثؼl)>"%!ri򒩰Ia0_0Afpgl;V(}~ר%|jqහ8_,˙5pA#P>#>dWgQs~zjj4S_1I 5+S2j`FyB=a~}Z:6#=g[Kը8 MDH7"9`Dh i$ObD7NzA4 iɼvGR~(a-T:e06, [J)sYܜmC\=\L}=?J|$DVR˗4_&SZjE3"kZ 15xw+‰'m]̲=ꝡ>Zf,[u~ >w0 {9HSY^}C6%_5i0s2$l5`4PJK;ƕ)R3\V7*VPg&VOχU ^%#.ȣu3DA3DE0Jֶ l8^P<'GT?SpzHf㙕PnT1(!#"cj -R4r"XH\Ts78S\>*C 'f[97n#lp̼gXY.UEf' j;8+l;T, 힪[BR xr4% ί0@kI%2qH UaLyA`k d@׽{Z?!@VHVB8uG4mY=oq|{Y}fk\> WF wGm;iDQkY"L q/nr22nf_uSZ:O&z" ,llJ52C^JldM2,[MNb3Dy7T2̓VoV`n17ϑ^(^9vu&`.`=BWFHDbk4؇ߖ_?@&y+ }=ZP}|X-X<&LQCou_$D S ?WQDZ ͌Mkw yau}mx]zx_#pyQKPxCN~oH2Bt:89棂 Qr}Pă|>/Iꨯ%s t@2i![˹;`nQ}ӤLQѣ&g̡BMſ0|![ e) 65qZ50qo|D&0jz|UOZ{4-R}Lyo ˭gqX~Xcm' mB-Ot;Ϋ޸> j\ByHP;DS29ppK"@L_hjd8d<(o翏xAVI+,Mڳ断Mk1c+[|![NVإL7YKt,b6S~;v+u?u^ $S󂾍},Q^97?W8JwG[?s1L8}Oo|Ԓ,R˹~lHlR),VG&{KrLJbES'79{8]h$b)6HŠMҿ/h=w1PT6ϴզHǢU ,S{kRPQ7E+ -@ tŝ5mS׏2``q»;ڿ(lV|^*-m4.VmZ~j^cZR򥲝oa9;pZDĹAm apsOl8ϺDMݯ+E7߯NwA'_n*vY_Ufy8=H7sU(jap: 2Ã'"n00zw2k(c Z_䳅-90v\h;qEoY-`Dt,Yg]"$Ć;U ]ݭVVKzӍ|$yQXcZlFc![Sc %SE2r 8O ҩq3l_vL*Wd%8D>MHE]3mX7`駳R~Sd݆A( $ړ*MyV~Ъ}Ŋ  56L}{u]Sg2ZϏddBQM-HKGKR' *l]FB#~"% 6K&MiQ~ci =13]qٴRSsTVhO/q*Kb/μ7CoZh|qo:TOقNj"d(%LgL!*{R!jlr`zx4dZd/猚vIy? 
彷.L#\FqO-hǴ/xOMk3Ld5;ujG;.rZh0p`(&E)ɨfX!|ɵu.pWq婰 ZwFa,z:x?^-][=:f'C VHn$9Poy# r˦JJ̏d_,=fqzv+^ ( ͭ!6Tq7%vcY*ݙ)34=cZg]7tgl!Ԩd~?kGi}d|c y} Z8rp9Rգy,=+%Z5*g`0`;z/ok5{U̝%Y8¿ɋR J2̅CcNaͷm}8d8>WePvzć9iPXS,ɀGN`qA+"cRa`0"uAڢ5t}][.yV7"4׃X6&#Xu%* 8zJBZ¯K.aBi 3HTέsSJ:6/"U/$zHE%a!VolR8;kKNxWY+H_Lj."mĺ /bkpЕ儁Fm]c}Dz//}NV1-o%Sƍo x;]1C_gixsInMƨ^J 8onuyW?{l _˥"M?B]~ Y]&7EMRH:1F~6*dw[oM8yƫ F&TB lƲ+ v<WwZ)poL {0昸m+ 0fT?_@ҌWmH.#fY&JyE1wwHZMlb;%bI>OU{!nupt9لe7M,B|I_yAjl``C27/ Mctȃ;J?X4Hy]ʔ=ulk.Z Dzg7?9@=2^R~Sg-c:yA_Q:^ L3rm@× endstream endobj 550 0 obj << /Length1 2875 /Length2 28267 /Length3 0 /Length 29901 /Filter /FlateDecode >> stream xڴuTJHwC!ҍtKwwJ# !)-}á8W͹^JFMY d9YJ g? ƅBM-4ہ@/j dB"Q@gi(Z>.@vr3[C@g;g =$Efgc ]w,sK PyAv:3hkh YzmMi M&= Hjji2U@&?6L-7$w4;5@7wߴ W$ @g zyyxY@n6,.iڹ@nO7#Oc< U,I29 $A4@ؚURSS89Ζ@9` @ @7]nt deF~^c/nvWE [=scSWbV 32g7OzRJQppp C*l% rrvG>);H 7kglmleV.v@y Pl` z[ڲ&3+͐&\;k 'v`eg 9䨠.l  Qo ݟcJ9V gGU?d<U̝t 3wsg+S9;]hf搱wqBvIIr,c`f_d-?. #bZR;/-AVv6n d8~iz+3 Ix 7; `mJEVɿ*Xx2;U/EV®Aؕ"_aW A|v§A4".H3"_a |zO?/DZEv ;B:b1@k_;:$q@-! G$W; VX ߢz;#)v '$oE gDpA6o@H7J,9/d 9rp)vH3@UBjY_7rar" O(V]otAO)F޲ǦC7;d Y;w  )nW!D5;;D%pCV uc: ]{w!^ ^!? 
-5C*?j.M_{p`7P G9zAne|ҩBş-Z"n )2{y}i|`Ȣ`5z/zg~x6"_ *~-"Vw T ~$?{1T[ּF~ag9)"k5ג亙3cxdxj![KK>]b#34LVls<+=*!3'9aV }1-]nmq# #V&-dUyFPƘhIKN I &_s䋇|G$Dudܦ@aoA'bnMS0Ӳj/T E>v~أFէ/o'n0?'i8,=1֎~إl;3=fkHS*r/Ǜ;f5+eU"XZ_,u˃ȷR5j6 yBw"NKQUuH- nJ/ῇ.M| _C ;>fx{f%ʯ=Ye:#R_4UM+~bŢP(۷&Иp)!.ۧz#_/Ӡ]$c,V~*ΉؕLh/xch\8']VR^1\Pz!Ǎ6X#~ų%qEڛר];-DQcKb^||Ojok FhtdP>UJ& bpǣT j%DgX˜(heaR|^,٦Rx__r)EZɴ~DJnHzezH'oTYƺ7^kasH6,yt9Ϋ/TD#؊<f.M5wʽpOז͚U+YvkÍt'fUy9,1+xYbd3 ~?6z/#ptC+%j_kB)?U0i4/4p!/*L3rn(VP 9$5#]z;c@rcP6bhxJ@u բ/uJP"U}dцM/"^`i WRXp̦KI)>tZnZ_d^ޠd ^OґAg6 t=yf?Wn!9Z2NgN7IWlc -TPqe[w2]'Ӕ\ء'wL;Kmh&taPdYK^~Z1,q;2B$K5u)]g: /hbD4Z.Jշq\HRvDjI> Sb{SCE:o=մuFpkU<  co&_j/)!;W!=8'o,ݓZ2$킍n #h6B;g+_WoMJGsJ_3u#V8aHC2be3fBj*mA}ؑdK5n\]zqr8"}:t9m%X]fY怄&R'\+5"Ag,\;/`j+ϩ?hG~z-pFv9ׄפʈI>Vnfi{ 83)FVVB$OcvU3')" Oɯ揥ω D=bǔR Ca1Ƌ,H߃Mfx&nR~rtpStowL5{013er)4=28nйWTvhr#xEgyưS9}z?4Kr*ǝ|$*9i+H$SP8Ezۡϒ$/ ڋ3nSwH"> *4@Y]&AV*B>C}Dx Fo}HKY ɘgwqB68C'=P}k^~g|do ^wՄ [,t9T*LHsDC> Gjf,MϑkzMc<[R>Nl5o[pbND4v{JG^W}Z*}"ޤ:֮`9Ifͪr])0'=,.(GqpyL??usIRu1>I=TL#Xy"tqr, RLjwƏ՜6GdqpYt9 dGĿwH2kmr-)mR UF|NzdλsyO>x5 G~H_8oڡ=n:ͨ4C,ux^7*uQqi&]aL&p/kg_)!ɠ>M+_'!y!fKO_ {CLKϏ RtrErݲG{m:ך@-/R=Gh\de5{p8w]W*9jeApN'Mbac \?ZZx);블m#j_?VWzeLSMgV1>$ hۉ?pɍ\\F2M(g!',:nKUşO,)=T|rZw"z>[[kC +Xwsj1Zn+'}o)?PǃEݤS UWO;MXT8v6~^$`}[O]j-(zQDѐs9 ֛`8]Sz,[B[C I' |IH/\VCXĉi9BZkH_1Yɹ N>KSb<¡];'waLBЃ5PӶ|;9|YSĻERkcس~9^4Uv$Vв-}Db;9=mEHҞV ^DDE+~AaAddFZ엀"nò;h:eD}6hg|g~6q7/JgU͚ݲ3PҟG~mδi~V#]/Td\axP_d=b X:}z|w2r*M`eP@Oz vT}Bei$h,Ne#=IT߅.#K4jKD1>UU)F M7]#K|/<;:p?qMI_7-}No1Hg$Pʩ'\8oGJ8>Њ<M,Ujgx;&(GtYM;Ԭza󅖶3e!X v ~ΞG'r,XVà vf*e+<!Ο>2ee^v!,CtrүU,G咠VMDNǼ}  |azd)^YnV \͐LhYfjaN}uV2K{G.źͬiZ'+Z(Q*VD+։qq^I)^o3GydڳXZ.M ؖ(˫~W'7OK[rY]rFI?C#BCLk־Hxm}8dmTwBIp (GF1ᬦgX4b]#uje()̾cl3mO|CNb 62c-~a<(}p!vaw.o;p ̈ +إǫo57߆,"RC#/Gno31 ZD=ί!_&Q,eUY_ew>DOȣ<-]Ua; ^fQסK-.-)8?^ s ~7<̏[y|pY7^+S: 1Zim>[nϐUVyLRيSB 㕰(kF\r5& A\yFuKϪdڍvvAˍd2>_u9~Ҟ$aiUfֹ[`dt-$"g]d"r󤱼d2͑V=QFgy& .8Zijh]3Qk6 ;b< w8xv_aցz$v{w@~vZ$p(cNuXJ9+'``}5amX:vgph1\'r܎Y.UC▸&+Cf%ny x+o/W 3z>`5Nrx81J*JHE lE9p0,~5ͫ2^[b^lvT^Eg33QWK 
)j5s"d>P9_HGKq>Ya铢BO;(Vl&bR:!8tb e/POS.:PM*ͼ#3ֵ7vNhF&0ͤU_ o~l`%"^S2vMQE!EzI8Yj>|eGQmR鲥b5ЯF@r;+%?>='H(qJ!nFZx6g1xա1Xy.tgnSȓҹDȦʅR=gZ".NezrɐNCkoaDcD%N nIzk4/3ܜat<n ܪz6spge –+~{"A-.!d4)<3vD޳hqe>PCo Jl6Mяe"EOqxlJ/hʩm?ǯA>F2*!j:cf~r%П^VYUؔ뮷42omjEt)~>XD<$D⠱F# COs;Eƒ[CwეݷEiaBl~<oEMPɣ]|޸ H %jwmV$_ S=|`Tߔ Ko+8 (^i =Qa7%wVMy*>zb6͠%B=95-V(}`8A&8xKyIdñ(z3^)}ǎ7|mWv[04f=EBOjr,:|i cœ`.쯅Jʷ1C$58#kсjYUSAf| Roo0l$^di6sƹs@94R(]TVF>m'ZuD ݈0׼YC'v++mR&imy/gJ3Ǒin7pU4c{Q{x{(F`#aQ<=m! Y蹀²|ЂXYZ_lH9qcsNߖ'T]ьWv$-Tf|e]r"it]GŐO6v*z1ŨѴ:,%gt .9U:O Zk@/RtGR6t{971ac#9K0WT^@D˃#NU,L~ Gm`5s*j(3ns@+H1]#4Ej ,T{K_N=䠂f$dvE3jL΍^"뚱 5-#x1/D`R w^kW.W^Caee}м_7xyu9ҽ\˦vjYB>N9Qs[OuV1Զ_o6^]? .8K|w\!G0$D C8qzQO#r#@m0^/MkQ :=@4xWPwj_LE7rtxf g$#N%))(vxč+2!ܿ m2"X OF)oP_L=x${fWLh[Is5f! Es\cW05m7{qZ$gF:-Fr֬TaWuII࿈7=J@Ru@ 骀O&Uilg$`WZ<&oI#ƴ&e8,`-{~$4l,!?~0ffGT#%8:6INw"ݾJP71݈#mFҺ9VB6r2I7xJʑoozg Q,^,s#{XgJm[K3>q v }i/ CIx:mT+ =?VkbF\ 5r3rg`Co<nlh`4>odX4Sy=b]/Nk x'CaIBts tdg:k)+p"p($ίtdG㗀z&njn^!;U@˖ iӲA@==ƛfv¨2qJvjtS;'W k\YD(OZ 8*MLqC#U_޴y~-i|)_$1(e$ʳaif=QGCEvݺS*:Y֡m%iͽv>pp+/>1973[)l򲟏iLbYQ`I9C]5pCCp+P%!>|E9RT8y9lJ"/"x(_,޽?{DM0m!mJ5(Y+׵&=E(_Tz: 6-G.þy\fޚZl3sSԽw;K GYj>`Gn^7Ἒn#E`/̄1oꆌ:`PAlYȬ2KSiAh&59ZsZ INO:j {Ŷx,NߘҪ;b×AuV9!ZMjDgÑ\L+͕e aXr -1[ě$5" V~ qACB5BW-3%΁K O7+YVȈ (i[ QJ>8lW;1-hcS[EY e9}0"miRɪKΨFًg-HqKCuڻ $ ׁbt~"]A|TiUĄp ^Ay?] RJZw> l˷KJ9L= 4<6S?gr5⸸/`Kw~Z=~K :P Ə^^rͯw[LIc%_Mм#HytMEɫt`s$2meV5s}XV1>5֦v^d]'!3A) L tjj>nA,^Y3"Dɝ+M{^P)Ln8";JMmmOM՞gD<4h)[ 5sӾ/E&#.)OIy?n }wkn $Ԫ,Dc]Ch˲ʬ#Ehȧ}UIz'l> q4NMTBw Ԇ: V'U>a P[N;o/`oMw$J~J8əp* ?{mޫpԍ@]&睳U*X>8'B5Hv;2Yϖ?.v1vo (0*3Ut( ѱݪ8Y*D,qP\S> } @-rOcEag9Y-xv&h?|8~i&0_:%n?gMMRZ ߏITiROjLβޏRŸ 괙Q+U:b:&; Vv=~rO 2qN,"qDoCh4K85ztsӹO0Ks_6^d2Z2#u>6zх< Bqjq|O[[˰2S2{tTO)DcځjZ&DchG-IoEw"+:,bWׁJWrc6{`Gs`Ty[c+x[>݂BX/#]aCpל7>ڭov@>yzJVfUS:'~??Dn1bA)7Fhe W# #fצ( K &VœTlPw_h0k\63Tʁhs27b0[cb֗I[~(xрkG~!]:߅b&l_x{~[1 )[!jՉ1 \//-$פ:rB,Hͮ %R;)xS9pi8:AuE3M j~jEJno'kKh_N쿥z+*E 2|IJd4Aq![w? 
Dfbao*bMlqIXLrPlK/޳g:ZufTnMk<.2.Gs;.'\a0y9 " j.j u?ܿoF7GðTe ʋSFh :xuFK:g{4VFŊLCexr9ʱ9}-ts\[{k4LbmbQ MZ $Ge:vjbpѱ.L}Oo DΒ~uV"7TARrSȮyQTђ N6;q :c>9k8L(v wr9 qW>: [,}\=Kʟ4D:$ܷ&e)cM$UD*R7,ar&kAv׮?} H<Ƴ*Tsǽ z st(cM'[*s%)o0yk0, |{JB˃>"۱M" sWn1Lqo]25s%'wSw9?6uS7;r· 1L>̤Z.%89;g#rHڏa(E802>JPv'55NKBTW\*J73Nar|6=o ɸe|cav=wT5$̛ugTug1p,RJI O2ebqL$`J"~οS1:&' .q/y~pE/h`g4qawPzu$^BZ(^[GIV}46cȡrJ},Itg3m_o#EJDPu>Z-ݍ5+ Sy|mӸxoƦ`q#n &5i' ,Fs#Dshj 1G%p:"q^0Q l^\ .  *lXvPT,ƤYT5ٱ7:60/;@Ym9p\˨ t}aOxL,Ka=ekW!`YSO:pڝf>~w MĹ@+%۝ 0DB>2n Okb}G]lc c3r{>]hmիI|'gC@AnOyf @8d3s(-0*{_谙gX1ϣz)r: hO$ZZ Ꭶ*I=4!τZ<xݒ<߇|䢒CU[>>cf {h.7QkdTpFUK](JPy;/&%0 ]o_) u2$\SN~Rp$;/HA{B6@d~֎d'w7 ~_v]1xXzexU'|X@D"wR2 OQ"$[9}lOQԊPʋX B #J-m06`ƻo>(`g2)m8kc}u|}od|eXנgr)9˾Hȉzz#)7¶m4Va㮖vV]R/ޜk՟Ǒ}53ŕW|ٺ0bjfm y;$8 mx#,1-k/{X^a})2k0[#IU'lfagyʡFw6Yz`Hye[72b.<0{d:4-^cgʬF#"K I>_%?̂O!^s[4 Exp|,5z²(yUO(nok]]#nh!R1c&(~,2I(W'n:bguG(l-Rj:s5D(Z벼NO:nxÜWtERLDbwH9ڜM#cV&sly*B @ @y#'7#< : A@%5 707]Pӆԩ~EIK񅟄<^[HlbqG~~eNE&w3X~KZ?_篙7i<<ݗEũP^6ǁ&dԊ+!d&!ه;.0e zF- ߄_^9n=Ei$DNbi݉$WZ;%OE: H}f4!'v{\^r+| t؃{y$w1B4n8{5R_ o5.?6_&DuKZ,WΰvĦf0wh ؽVnԯ {gNуwAHi@Ǽ$/ s8%ɲBKgAІu"Ә6Uq%o5^1VdzH[;~W0%L}*lgClAcd`]mcEx!uBLMYpn0ѱj1 ORHQ%(x/jegGSNeD`՜%_߶)~Mn!k~&.ɂ&wy)jkiaŪbIΒoMMV/!K5e0AQ'uX! dpW/}7!- ZuDya D7zX?i$J6웥(Ք[[n7q8&q-IGsh7]dl{pzu%L }E\0 z*{r yk %cCY!&}?|QE.rtP%@RN'D탓]#vy!Dd@rIa LAisg{WgvOpIkBcő6mM%ǝe+%ܳS[Y1l9O';SDRzEa T v"3ҦEH)(`S.p'4&iWvf:} M3]qdu6ew60`ŨdZ3U`BzT0)  /ZZ(uHxp<~9FCD7ShhPJ^A{73UTxw`y؞\YcݚH$ I謮SYp]T.rEWj@s֩$˭ i?N9l>P9Twiѷ|ez[s}DYɭ*Khe*@rPLW:;T0 ?Ą eÔH1deεI6Qx%D;ڋ|9A!\7.`02NHh` a29ػ4B<|߱CN 0}dX)P;nѲTH=9Os~v]. 
n#M*ÿ+3 G薬"VFI %Ee'1\l7 X2$G !0Lڌݡ -3s>Gɉ+BTټMJ2i?kucH2i';$J|2H]SZu1cHDdN ~!ϐ9#Ga扗D ֈ7.e8l2:cI-(4w(i7ߚlȫ.be.1чˀ]7Lh+L9N B2"[\eeTZ>H " Nb5f353"L:q[iLsv|pzk3@ &5vXâSEʂƌSqk A5G`v3n-9$ۈXɺ D~h;D`l*Œ[So=@/1Ҕl{ Io}S`s]Ea&B0q v@GvFiD !u'&OJ%j 1t(PӠ@(:2ϺW䟈\L<ܟ()FV@ȓ hk6gS|ťs-;b,D9#d*>ZMZNJ)=oYvE F?z KK2}}S,ze|$n8p ( ?<c!0&5OU;L,G2 iF.6S0&MP!>]踔Qז6meLnؒ:Am䭟u F^8y ߠD񐄡 )SeK9'Cx~ C2W}PjGd=tmFx Cp2:űA{0l?-V>M 4P ,m؆d 9,j40)1lQ1Qa\كD- HurV8^$֤ȷ0"ED{|d }"KE;</mhH}՗rw56xKHs1G_YQQM6H_NZ(BdYyl="+V,MI0$P.=`X i5R}gv 'f xw *ŤXc |J׵ Ri4cԽ>KSx!cZ!%jbeej.ĶNYhKCow< k>!cx  5%9h߱@`E?['$)w];)oH?C2 u;6RO)إqW>0\!OV/eӯ*t{g[jv'ԛO\{.]GA5eJny$A@L k`DN5C tȉzA4FZ9yj2bh<&3I }%M>28^)cL8aئN2aȞ%Kl'^%x@ܮ'ou@ƤDb~Fz*] f$.+CAqSwwXgk\DDyk?* :IEg}'ć}z"a\CxOP1 pj JM ,SD>,[(8^ʤhs޿G>r;€}p]m3fEwf,~TI*}|Gw-2̒8[i)@q&ӗ?,ӬOo ׹k-ȴ(*r8L//ؾl >B͍-o&6YdvL@ʗǚ2؈g*9sVG!͜sPy^8H8Wơ B!6DŽ׈p7q#ِN 7S!< 8}>RhO8uDz¥N=0C72KO>6)HRZW/fMѱ1 u!y5gybFxEN#u@FsRh`WfzUmdͩ-|OlW]^ԥMMa+ v3i*#&,RMqmpQiLs DPd/w 8̔ ש+'|cx'/NO6̡?T]4XTp$(\C@uyLb #0[Ycy}=N/$3e TJ&脜>Fe XN (d&Uݓf3pK:?4;AR,[/꒿$s2i歮<2ndt+&|霢tZ0f-/4 H}7n ‚c0W@Zr{?1+-@xUCZ82bﯜ:LprjNY*=Q '!a^4(@d-@kA$XI+@> "f0W3|4Nb0#cCNԀ5A|I.D<oѯ唍:AtGD* lڕ֪G1(]TF.SD=!&bi`b˥Ʀet5Y %0jT^!Gd']g-F!_+gu+#hWzPPA}&;Mgv W֐CAKʒ8rcI|-i2PomX|r-e-h))}ׅ57\sJXr|X"rc H|j'~_)IAUsOi1}L ip0&] յ9oJz?#ڪT=@e4@( HkG7яi %X:;rg O<5o7WH+RLá_n"d@ Uv?cb:AHH#:Z yU)Pq3"w&|e</I-6Qe/ߺ W_N_LjǷϠ;!]2yS eA'Bd! <`;8'|L,?,H\]C5R}d~ONعL@WAjdzR被؆S<}l-¾.U\ NU,<PJ DOEਲ਼к=-h "噔6ְEpeQA]wLfX?,{9AUyUH1&MGX,X]Yi_yP3}膀u9"cjLI*u_\!cï]DN<  H֠^]i[/tq7q e?3</:Ev: b)$ouv6]=2|.EFHpam%pFN Рt2w hh/ 9BNY kSh! 
m \у&=] er#T;2=!&i#V>f3)'&L]^Cq=?G(gU{?CZRf̼z@*Db}# )Sb9Q14>zvC J}3ǃ;ӆzEF7<ϫU37惯 _R9ٲY'jګk+N(Uf~"ՋHyLVn])6E\/̆ZXmttxx $<8][Fx!X)c[z:^tU&wDFb~8'.XySJ5iQC-qx:@]7RKGe|h,~~g]~rwrg=@K1\+@UTx$)*R nuw6oH%&i<%ХB`8Y9i[4vEJNCgҍ"U-$M8-FY()լ r!#^OV9;ዧ h/ۚ ?8C|L& ʁ0B,-]j>s虓9` )oVtJsNiשa06)5DU$MS=륒-c+[bՓ)Z2= m(aiia@ XJL^02–DP+ȇcȿUuV/i>cӮ/gTm)h udM4Z3rjtUQ0!Ml!#Jqq bʢUڦY >%mV?)ĥmiELǻ?$b'V-BT'ZUԝRZv{p4=i6V5 'zcl^,RnW~™ɪU%%%]jSW\_eV#+, C`)^)g*PSu<Tt\|Љb쟃b ./[5N^ зI,t G*DSK,{̅w9'h*M*)ZǙ40*7Z-ϸ,P=L0 3}84,,xxsGe֌f(=U@H9x)5,N+7cnA=S/keYB'))c47j>/ܘh=DO<#ATꆂpQgh|HX'3AA׈ !l6[X`2n˅)SnX h Х 3#JUvXV@ʪE۫;ww4 PsIDn~,ٌ}qCP*'98]Bs7K?]6s4R;I(fKtp  hc>M!\ѵ:g3 ,Rh3W諍O 1ypj(ӝzf UJd-M/]ST.}Eyɵ)gL*TQqRQϙG<]xV,πKes>2q]!&|J"[L/n+F%W(VCݲsJvwQz܎R^x-{[[2WD=-yf_ܵsY~z~䔌&@WrI,,$tmύq[y]%LM`BKelδLT&X6_:` Ehve5GpWe`4סPr9f=t.᳭ꀱ d!,ʐʲ B6`hs#L7_r[0+[4OdT|kGca,}۹=MwYUK:9{e8f䎹.ʙ]`^[&M* Tm:6(R Zj%q{IP~ W:y.EX4gNrtʜә(?)^XlP?n,^J4| >ȩHkr$Q1B_h@:|] D{Fhl 4XSGg,Dp)S}GuNƁ)urSn8,}H`h'ߓL\ZكݒOSU 𦐦]oBNu&wmW n::^&+UZ4`}ugIܬ7%!"묈_>:#")?,%JTx[gbWg+ yϢ 薧G~5+{; FOVW;]W H\:))f,m׬ͧݍ`*hyʄQ~,)x+qRJ%8F8`U@)gZKu[cYq㚺 `r*::t.,64+iѾr)Uٱ>r~+jIW(>$򧿤dH]h+?!s\>X2"F DCpbt OLsc]Jj-?BAAs1cykaSC8rkߏ/&{W~A0HuS4r #:9 R38Wt, dS*e!`j2Jz0nV~䱕.9J,W9lJ[ѵAhQ,p4% 9Yj>4vW,oUﺘw X-DzcΖHFʊ-c "aU& x֠=,rfw2Ȃh+)#4"lʢn+է/xy5UyU9ҵl]zػ\$>K-TN۱Z2mj)Y *,>Z=Ta݋𢷛8pB{WH g]HB6^>"_"JҚ!RQt6KY$j E't .֍g`fK?<;(XсDo Y[ⷥX9"{~1[A":ĔCX6RVpOQ5rMY)ϟFLN1d19̭b* ȦD!+ꧺ7KUsbG)wxkEB Ta3aFi20d84mIJNyڎ~袩*,(bvo(ٲQAznƀV_xD ᵬNwQgEdpv,\R Xs͓)IIs[ }a</C.Qg&#{7CC5nðĩ*Є-Ph?AMxHnMҕh/=ZfƪkcW6Ϙ-nŎr\H$ VU' u 6$hÇaKrԆN2 (Ѓl0ݑ:dFt2 -MD^ ?`T toˤ!Fm2 axj qV=FXw'idmA vӘ깍T'u3r??Nٴ ˤmidkYL(2OQ,9f`uK0nటM^[0* FJj*': HAVe4삦#EؗcK{xz-bI_Swg],؏CnA+8?pYWQmkM<;3O7F,PO*.Zz`HK.x!̻An fL tV|NC+s~_{~D@. 
r)UӁIԉY3m_&րBg47EBIHp##-]5  Z68NUgwr<a_gRLM/Kܗ-L0pJ<;1HĩGNdR2ͣgSxд.݁B~02}_@>+X@;(UÏ;/#ǨZnNMQ5܆d4%q]؜ 0Ӱ,>\ KKpb.yb76[k2*U+UEHF᷅t::Ą,^~hEI:yw&]spCK7 J@HéF\ Q.)Lp#/h*G@֧;IJd"nڞ i%m; O Ox5R4*EbV1Bdk;T|vr(Y8tTUj1W.-:(1^XxDk?ްXP] gGts[KpcW;eh-q1@O 4#C qX|xn֮CCnC!wGo4 Ӄ5׸WgM9G7$~zF3z?+#Y.^) KXv[XUI1,e>Tս磜+;1s\>:e4iY;2҆ϧnޏ z2\ޓ*#3 5ZM1[mMy,[[Ln"7ӥ&\Te?j19 |}YfqЯ*b|kL(:P MH3TN/N.|ˏngM8Dh$d{q~ A |0*.ē)OXvX[NsԵhk%¥Pe;77+8t%`~M[A|C+;LIOw bĆ#!̂0A|;\Ǐ'GʗPN6½ EXV8eTͳl|n B ӁBHAW38p_D.9;Jӓ-5M{+#֡X&Z|6f~Of%77 |4+<%jΩVi&E3F D].yX#S .{v?pQR5 &y;@AŸ3) 69H0}.+bEJ*{CA> stream xZ[W9~d`Ҟ9'$1@.Li&v;!뷪-wn&deZ*T%ckXƴLkx ;UƄ.p%"0-Vf0 U`FΘuZ2Vy0/%[< с 2Ph$tiA&dk,Y ,,W86 8 ueL 6@j`%.+` :d&A1H`'% {Iz^3$xրD4 E2@2[Tt^p'Σ ,c `cGB\6$`@,SysB $FrD(a,w t,ෑi)Ba f@#;VlR>qc *Dh@`Q8{)PNhq̂ zogU UF [|/ 0;?b!f A1tY/ 5K`C SHL- 0M C YmD<a?a5e=Lޯn÷l }+YgВ/&%{}EqE}9 >jQST@ق@1?d?|I~SjOƏZ eQ1G_>I^HB/y.69ej6W\c3&f`uiZnCl$6ϛ{=fM QMMe2z~l] ihn _&/R:W+UůܙQ]6-[ A5S"`U_U}ZMmlIճ% WY2E?0gL0'L]Sl_0CB&A {}o%c%ٸ ԺL}ߪGDz$TJY՗8ZaA6ss0%YֆZ A} NQu&QFh89'DD?h2j j8#mZb^W7ww92 &^(%:J8b`njΟQx)-tcn<|5Fk+.T5_1qZ8ZB5!1u>UFHqT.BaQBQRƀ4fʊaQ4@D0 S&A D<RϩDe~UjlZ`U4Ƽ@ 2ĂY@2U-m@}Qilb (tulkxQumY5"tW&*Nxf&>K;ן9ytt]ϫzh/^Wvv3nq>hoَ ԧ4.]e܎Y9&e=~\WoK_-N;|?/Ka1.&_Ox`Oy{SɯK^0' ۆ՛{o^4:/.!\pg![5Pe8ЀI&L'|zh!x' w|:}t:"ni|<aqu5q<2y~3 F7ט+|/4ח'Etfh4FB g1_yQg؉ăjM_a&q9/|3tXLs7;Otv|8?xOu}n d='´1ĨR<j7ͧ⤕ JW{x\c7CB%OHԍۉoLL l7b w.={qp;Lf, YyC74t풮jfl>Lt)(y:M,s#\ڻDɛ}bR(7h2`OXB#Y\c #^umHaKS:yŰp.Eao뢔.:}$ܢtEAZYkY1_JyU:K"&2֢|[DfM%ڛ b4:?,=3Q:#*ű:.Ɔc1YUb *AXk !lb( 5d5x: oNzb2?L#.a9nt7u;RfJ7/սqt0BncLxT?&8ӒqN`/:? 
IGYg>A 0ǽ3Pekk3&6\lb6隶IӰއm׹e lelôӲef2Q0l[݇]eV~@܇e{2rLO'BSm+ƹx'[؆O 8q 9/M=xb_#3&7!l|&#>mЯKu)*σ7Y$~RCQUb> stream xڕ[K#ϯem0$$0$LI4:>WiXՃ,VdQ xӐNhmc翪Wqd/L⿺b}xr.&:׸gA%XS)GMyQP`J"x<D DxMИ5&؈Yl!+&*Ix蟈"51,v 0&YF@y&2:rU-ǁ˰6eX+D|`{ymbcxFf\^ gmF\XqDS@XYGV\2{,^fl `ӹ~x_xǿo x3n;r7}$ͥns1\j6[6qFn#s[fsw;_7ž2 5 *"}E$ϩH%hJsmFnUƊ>D\.a.u!qxVy3{_7uwU͊ aB;dU}qxb/xa"V5T@G ; (_,Dft;b % V=\ Gج8$#=C$E^GqDd\F$p"eLSɳZֶXDK$JHc WrUFY'OY'{/Q2~yD0ZWXɬ$JVeٰN 0r).!2ZQ/h_}< QrmQ@̬TEBs!j\FV(N*TV`ilL>璓(0S\\߰lg*>aS/)I8JIT8JK)HZŦ`㜬FyZO L=O)()uIҩ-n*6~%DY*&/FL"\Dum+(I($ |DB=WFq؛:*Y0ټ%Jv:l^)EE\-{=lUlERLPHA {͛qF =)j*3h8GDP41'M6G$Q`'ag ԇ}dE?sUldyk4I$I(YN(p+*$eli#,#RhI܇-!k*6'ZL@((@$8!yC `Tu_g[{Y%T 7D'*Q :nVN2T-]7[GkжJE E{Q^ ΃Ah\=0 J {.ìY)[q-\N%4E@r&$Ć [۹jCG؟ABGؠABGءABkQآAB &(& @اAB$(T J{G#zkJ@ VU`(%wE$1<7k`(37DA^%+PLjQU҃5`^F _!--Du!!;XfKx "$Pj8>ˍqyO M6emz~ww8}0r=ᄊp9/]{}9~kwG48oKz߾nkKDV SXQ6z,_/W.Gzqb}|DJ~\^j2 nqMj=Uc@9nW{7$+ϭ鱤 ymV6Qqx0a|UPFƇa=*~>۷Q ;xG *^ j6f?Η+{66)տRqho]{Ο3&pQ_9e{x}>^9]E ehƗH~y9S޽Ͽ[eig׷}3z9ƃ;j8W&CpC6r N98r0rXj﫜o}=T:UY=XmfjۚڸO$(+UvQ`\n1/w)lEqFĐ(h^*Ysd,C(5U޾6[&rAmb9bF7TQ|^Zƣ6K3߿j4^e3]foW . 
ʍ:m_0p=<}ݖ.8@>Ո7<A~8"Ʊ t0q2k檼Y䩂!lH@%øA@p⶞N h bL 8Nt E_I(pc$k%"MDw)4w$[ơ;Igr$ G@ {-+L(IGxYa JƅlؐP!>.8]ݩ}-KK}\8nk8זM\^wC+~ݞi| |z*4aa_,T6}*AG%Q|3qrRl*ΤRb*Ey\B "%JL{7EVY1TmQPEY1ك> endobj 625 0 obj << /Type /ObjStm /N 6 /First 46 /Length 259 /Filter /FlateDecode >> stream x}M0sTM2IS ",zOjBӊIaoڝ¢ay?` j`Ѐ22:rXs*(*7l`:x2 )DE7[Ɵ*Wn>Tm3K;>-CdF&D*2dPCMv&ZR^ uG_XH1~m}<udPh=dP.U3lt=Xxe'< 9퇖 endstream endobj 632 0 obj << /Type /XRef /Index [0 633] /Size 633 /W [1 3 1] /Root 630 0 R /Info 631 0 R /ID [ ] /Length 1312 /Filter /FlateDecode >> stream x7s\e{W$p-眳sC +jZ|3 3T  t4ܣVyWW]u nUF031 9ݘna=<`/h/\-b,񵧮bcVz%^Wa5`-a=l&ll8a;v`j:vاz3p8888888󸀋˸}zU03<_xxx~ ^9j[/Q G4Hb:yi!O' |B>!O%z BH%KPI$T+Fh#m6BFh#m6B؉ C%E"_1wu @B !H$ @B !H$ @B !Pl܀TB 1B !C!b1OcH $ B!@H *ufܾ1O;O;ȌR)@J %H R)@J %H R)@.u:HRA u:HҟYHY呾S R)@J %H R)@J %H R)@J %H R)@J %H R)@J %o~?zwf{C~ʰ#Q<c1\eXpjZz| ؈M،-؊ql<va7ag G=؋}؏8IaQqIiYy\E\e\U\uM&wppCwdu=O՛~-@}MMMMmf?zt:49Fjbhƌd-i-6Z|+TҖ? ƺoYmR[k_(Vt~&0c 5qc#X~vvʬ1@4q}:F&6iCZ;lkkG6T׎}IcmF%vXTb;oʱ]4.5C͇M7~;}uu]UWuu<○+7Ⱦ WUڈfz6 }LVGAԱBaC BCq^e\U\uMn ^^ c=#`?gom?ܯC endstream endobj startxref 213654 %%EOF libxsmm-1.17/documentation/libxsmm_tune.md000066400000000000000000000305721415223013700210230ustar00rootroot00000000000000## Customization ### Intercepted Allocations To improve thread-scalability and to avoid frequent memory allocation/deallocation, the [scratch memory allocator](libxsmm_aux.md#memory-allocation) can be leveraged by intercepting existing malloc/free calls. This facility is built into LIBXSMM's main library, but disabled at compile-time (by default); build with `make MALLOC=1` to permanently enable, or build with `make MALLOC=-1` to even require an environment variable `LIBXSMM_MALLOC=1` or an API-call (`libxsmm_set_malloc`). Both runtime settings allow an optional lower and/or an upper bound to select malloc-calls based on the size of the allocation. For the environment option, an extra variable is introduced, e.g., use `LIBXSMM_MALLOC=1 LIBXSMM_MALLOC_LIMIT=4m:1g`. 
```C void libxsmm_set_malloc(int enabled, const size_t* lo, const size_t* hi); int libxsmm_get_malloc(size_t* lo, size_t* hi); ``` Querying the status may return zero even if there was an attempt to enable this facility (limitation/experimental implementation). Please note, the regular [Scratch Memory API](libxsmm_aux.md#memory-allocation) (e.g., `libxsmm_[get|set]_scratch_limit`) and the related environment variables can apply as well (`LIBXSMM_SCRATCH_LIMIT`, `LIBXSMM_SCRATCH_POOLS`, `LIBXSMM_SCRATCH_SCALE`). If intercepted memory allocations are enabled, the scratch limit is adjusted by default to allow unlimited growth of the scratch domain. Further, an increased verbosity level can help to gain some insight (`LIBXSMM_VERBOSE=3`). Intercepting malloc/free is supported by linking LIBXSMM's static or shared main library. The latter of which can be used to intercept calls of an existing and unchanged binary (LD_PRELOAD mechanism). To statically link with LIBXSMM and to intercept existing malloc/free calls, the following changes to the application's link stage are recommended: ```bash gcc [...] -Wl,--export-dynamic \ -Wl,--wrap=malloc,--wrap=calloc,--wrap=realloc \ -Wl,--wrap=memalign,--wrap=free \ /path/to/libxsmm.a ``` The main library causes a BLAS-dependency which may be already fulfilled for the application in question. However, if this is not the case (unresolved symbols), `libxsmmnoblas.a` must be linked in addition. Depending on the dependencies of the application, the link order may also need to be adjusted. Other i.e. a GNU-compatible compiler (as shown above), can induce additional requirements (compiler runtime libraries). **Note**: The Intel Compiler may need "libirc", i.e., `-lirc` in front of `libxsmm.a`. 
Linking LIBXSMM's static library may require above mentioned linker flags (`--wrap`) in particular when using Intel Fortran (IFORT) as a linker driver unless `CALL libxsmm_init()` is issued (or at least one symbol of LIBXSMM's main library is referenced; check with `nm application | grep libxsmm`). Linking the static library by using the GNU compiler does not strictly need special flags when linking the application. Linking the shared library form of LIBXSMM (`make STATIC=0`) has similar requirements with respect to the application but does not require `-Wl,--wrap` although `-Wl,--export-dynamic` is necessary if the application is statically linked (beside of LIBXSMM linked in a shared fashion). The LD_PRELOAD based mechanism does not need any changes to the link step of an application. However, `libxsmmnoblas` may be required if the application does not already link against BLAS. ```bash LD_PRELOAD="libxsmm.so libxsmmnoblas.so" LD_LIBRARY_PATH=/path/to/libxsmm/lib:${LD_LIBRARY_PATH} LIBXSMM_MALLOC=1 ``` **Note**: If the application already uses BLAS, of course `libxsmmnoblas` must not be used! The following code can be compiled and linked with `gfortran example.f -o example`: ```fortran PROGRAM allocate_test DOUBLE PRECISION, ALLOCATABLE :: a(:), b(:), c(:) INTEGER :: i, repeat = 100000 DOUBLE PRECISION :: t0, t1, d ALLOCATE(b(16*1024)) ALLOCATE(c(16*1024)) CALL CPU_TIME(t0) DO i = 1, repeat ALLOCATE(a(16*1024*1024)) DEALLOCATE(a) END DO CALL CPU_TIME(t1) DEALLOCATE(b) DEALLOCATE(c) d = t1 - t0 WRITE(*, "(A,F10.1,A)") "duration:", (1D3 * d), " ms" END PROGRAM ``` Running with `LIBXSMM_VERBOSE=3 LIBXSMM_MALLOC=1 LD_PRELOAD=... LD_LIBRARY_PATH=... ./example` displays: `Scratch: 132 MB (mallocs=1, pools=1)` which shows the innermost allocation/deallocation was served by the scratch memory allocator. ### Static Specialization By default, LIBXSMM uses the [JIT backend](index.md#jit-backend) which is automatically building optimized code (JIT=1). 
Matrix multiplication kernels can be also statically specialized at compile-time of the library (M, N, and K values). This mechanism also extends the interface of the library because function prototypes are included into both the C and FORTRAN interface. ```bash make M="2 4" N="1" K="$(echo $(seq 2 5))" ``` The above example is generating the following set of (M,N,K) triplets: ```bash (2,1,2), (2,1,3), (2,1,4), (2,1,5), (4,1,2), (4,1,3), (4,1,4), (4,1,5) ``` The index sets are in a loop-nest relationship (M(N(K))) when generating the indexes. Moreover, an empty index set resolves to the next non-empty outer index set of the loop nest (including to wrap around from the M to K set). An empty index set does not participate in the loop-nest relationship. Here is an example of generating multiplication routines which are "squares" with respect to M and N (N inherits the current value of the "M loop"): ```bash make M="$(echo $(seq 2 5))" K="$(echo $(seq 2 5))" ``` An even more flexible specialization is possible by using the MNK variable when building the library. It takes a list of indexes which are eventually grouped (using commas): ```bash make MNK="2 3, 23" ``` Each group of the above indexes is combined into all possible triplets generating the following set of (M,N,K) values: ```bash (2,2,2), (2,2,3), (2,3,2), (2,3,3), (3,2,2), (3,2,3), (3,3,2), (3,3,3), (23,23,23) ``` Of course, both mechanisms (M/N/K and MNK based) can be combined by using the same command line (make). Static optimization and JIT can also be combined (no need to turn off the JIT backend). ### User-Data Dispatch It can be desired to dispatch user-defined data, i.e., to query a value based on a key. This functionality can be used to, e.g., dispatch multiple kernels in one step if a code location relies on multiple kernels. This way, one can pay the cost of dispatch one time per task rather than according to the number of JIT-kernels used by this task. 
This functionality is detailed in the section about [Service Functions](libxsmm_aux.md#user-data-dispatch). ### Targeted Compilation Specifying a code path is not necessary if the JIT backend is not disabled. However, disabling JIT compilation, statically generating a collection of kernels, and targeting a specific instruction set extension for the entire library looks like: ```bash make JIT=0 AVX=3 MNK="1 2 3 4 5" ``` The above example builds a library which cannot be deployed to anything else but the Intel Knights Landing processor family ("KNL") or future Intel Xeon processors supporting foundational Intel AVX‑512 instructions (AVX‑512F). The latter might be even more adjusted by supplying MIC=1 (along with AVX=3), however this does not matter since critical code is in inline assembly (and not affected). Similarly, SSE=0 (or JIT=0 without SSE or AVX build flag) employs an "arch-native" approach whereas AVX=1, AVX=2 (with FMA), and AVX=3 are specifically selecting the kind of Intel AVX code. Moreover, controlling the target flags manually or adjusting the code optimizations is also possible. The following example is GCC-specific and corresponds to OPT=3, AVX=3, and MIC=1: ```bash make OPT=3 TARGET="-mavx512f -mavx512cd -mavx512er -mavx512pf" ``` An extended interface can be generated which allows to perform software prefetches. Prefetching data might be helpful when processing batches of matrix multiplications where the next operands are farther away or otherwise unpredictable in their memory location. The prefetch strategy can be specified similar as shown in the section [Generator Driver](libxsmm_be.md#generator-driver), i.e., by either using the number of the shown enumeration, or by exactly using the name of the prefetch strategy. The only exception is PREFETCH=1 which is automatically selecting a strategy per an internal table (navigated by CPUID flags). 
The following example is requesting the "AL2jpst" strategy: ```bash make PREFETCH=8 ``` The prefetch interface is extending the signature of all kernels by three arguments (pa, pb, and pc). These additional arguments are specifying the locations of the operands of the next multiplication (the next a, b, and c matrices). Providing unnecessary arguments in case of the three-argument kernels is not big a problem (beside of some additional call-overhead), however running a 3-argument kernel with more than three arguments and thereby picking up garbage data is misleading or disabling the hardware prefetcher (due to software prefetches). In this case, a misleading prefetch location is given plus an eventual page fault due to an out-of-bounds (garbage-)location. Further, a generated configuration ([template](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_config.h)) of the library encodes the parameters for which the library was built for (static information). This helps optimizing client code related to the library's functionality. For example, the LIBXSMM_MAX_\* and LIBXSMM_AVG_\* information can be used with the LIBXSMM_PRAGMA_LOOP_COUNT macro to hint loop trip counts when handling matrices related to the problem domain of LIBXSMM. ### Auto-dispatch The function `libxsmm_?mmdispatch` helps amortizing the cost of the dispatch when multiple calls with the same M, N, and K are needed. The automatic code dispatch is orchestrating two levels: 1. Specialized routine (implemented in assembly code), 2. BLAS library call (fallback). Both levels are accessible directly, which allows to customize the code dispatch. The fallback level may be supplied by the Intel Math Kernel Library (Intel MKL) 11.2 DIRECT CALL feature. Further, a preprocessor symbol denotes the largest problem-size (*M* x *N* x *K*) that belongs to the first level, and therefore determines if a matrix multiplication falls back to BLAS. 
The problem-size threshold can be configured by using for example: ```bash make THRESHOLD=$((60 * 60 * 60)) ``` The maximum of the given threshold and the largest requested specialization refines the value of the threshold. Please note that explicitly JIT'ting and executing a kernel is possible and independent of the threshold. If a problem-size is below the threshold, dispatching the code requires to figure out whether a specialized routine exists or not. For statically generated code, the precision can be selected: ```bash make PRECISION=2 ``` The default preference is to generate and register both single and double-precision code (PRECISION=0). Specifying PRECISION=1|2 is generating and registering single-precision or double-precision code respectively. The automatic dispatch is highly convenient because existing GEMM calls can serve specialized kernels (even in a binary compatible fashion), however there is (and always will be) an overhead associated with looking up the code-registry and checking whether the code determined by the GEMM call is already JIT'ted or not. This lookup has been optimized with various techniques such as specialized CPU instructions to calculate CRC32 checksums, to avoid costly synchronization (needed for thread-safety) until it is ultimately known that the requested kernel is not yet JIT'ted, and by implementing a small thread-local cache of recently dispatched kernels. The latter of which can be adjusted in size (only power-of-two sizes) but also disabled: ```bash make CACHE=0 ``` Please note that measuring the relative cost of automatically dispatching a requested kernel depends on the kernel size (obviously smaller matrices are multiplied faster on an absolute basis), however smaller matrix multiplications are bottlenecked by memory bandwidth rather than arithmetic intensity. The latter implies the highest relative overhead when (artificially) benchmarking the very same multiplication out of the CPU-cache. 
libxsmm-1.17/documentation/libxsmm_valid.md000066400000000000000000000104161415223013700211420ustar00rootroot00000000000000## Basic Tests To run basic [tests](http://libxsmm.readthedocs.io/#classic-library-abi): ```bash make tests ``` Remember: a set of key-value pairs represents a single unique (re-)build (and test): ```bash make STATIC=0 tests ``` There is a whole collection of test targets available (`test-cp2k`, `test-cpp`, `test-nek`). However, it is then better to rely on test-suites. ## Test Suites It is possible to run tests like LIBXSMM's continuous integration ([https://travis-ci.org/hfp/libxsmm](https://travis-ci.org/hfp/libxsmm)): ```bash scripts/tool_test.sh ``` The above command runs the entire collection ("scripts/tool_test.sh 0"). However, one test (of currently 11 tests) can be selected by number (1-11): ```bash scripts/tool_test.sh 1 ``` The suite itself can be also selected. For example, some DNN tests are described in `.test-dnn.yml`: ```bash TESTSET=test-dnn scripts/tool_test.sh ``` In general, all key-value pairs valid for LIBXSMM's `make` can be given as part of the environment: ```bash AVX=3 MIC=0 TESTSET=test-dnn scripts/tool_test.sh ``` Please note, the suite/test itself may be comprised of key-value pairs that take precedence. ## CI Tests The `tool_test.sh` script is included in repository archives and releases i.e., it works for non-repository folders. In contrast, the Continuous Integration (CI) use case relies on the Git command being present and the folder being a Git-clone. Functionality * `[skip ci]` as part of a commit message will not trigger the CI agents, and tests are skipped for such a commit. * `[full ci]` as part of a commit message will trigger a full test even if the setup uses the "Fast CI" option. 
The "Fast CI" option is enabled per filename given as 2nd command line argument: ```bash scripts/tool_test.sh 1 .fullci ``` In the above example, a file named `.fullci` may contain path/file patterns (wildcard format) triggering a full test if the files changed by the commit match any of the patterns. ## Portability It is desirable to exercise portability and reliability of LIBXSMM's source code even on Non-Intel Architecture by the means of compilation, linkage, and generic tests. This section is *not* about Intel Architecture (or compatible). Successful compilation (or even running some of the tests successfully) does not mean LIBXSMM is valuable on that platform. Make sure to rely on `PLATFORM=1`, otherwise a compilation error should occur _Intel Architecture or compatible CPU required!_ This error avoids (automated) attempts to upstream LIBXSMM to an unsupported platform. LIBXSMM is upstreamed for Intel Architecture on all major Linux distributions, FreeBSD, and others. If compilation fails with _LIBXSMM is only supported on a 64-bit platform!_, `make PLATFORM=1 DBG=1` can be used to exercise compilation. If platform support is forced (`PLATFORM=1`), runtime code generation is disabled at compile-time (`JIT=0`). Runtime code generation can be also enabled (`PLATFORM=1 JIT=1`) but code-dispatch will still return NULL-kernels. However, some tests will start failing as missing JIT-support it is not signaled at compile-time as with `JIT=0`. **Note**: JIT-support normally guarantees a non-NULL code pointer ("kernel") if the request is according to the [limitations](https://github.com/hfp/libxsmm/wiki/Q&A#what-is-a-small-matrix-multiplication) (user-code is not asked to check for a NULL-kernel), which does not hold true if JIT is enabled on a platform that does not implement it. ### TinyCC The Tiny C Compiler (TinyCC) supports Intel Architecture, but lacks at least support for thread-local storage (TLS). 
```bash make CC=tcc THREADS=0 INTRINSICS=0 VLA=0 ASNEEDED=0 BLAS=0 FORCE_CXX=0 ``` ### IBM XL Compiler for Linux (POWER) The POWER platform requires aforementioned `PLATFORM=1` to unlock compilation. ```bash make PLATFORM=1 CC=xlc CXX=xlc++ FC=xlf ``` ### Cross-compilation for ARM ARM AArch64 is regularly [supported](https://github.com/hfp/libxsmm/wiki/Compatibility#arm-aarch64). However, 32-bit ARM requires aforementioned `PLATFORM=1` to unlock compilation (similar to 32-bit Intel Architecture). Unlocking compilation for 32-bit ARM is not be confused with supporting 32-bit ARM architectures. ```bash make PLATFORM=1 AR=arm-linux-gnueabi-ar \ FC=arm-linux-gnueabi-gfortran \ CXX=arm-linux-gnueabi-g++ \ CC=arm-linux-gnueabi-gcc ``` libxsmm-1.17/documentation/tensorflow.md000066400000000000000000000430421415223013700205130ustar00rootroot00000000000000# TensorFlow™ with LIBXSMM ## Getting Started Previously, this document covered building TensorFlow with LIBXSMM's API for Deep Learning (direct convolutions and Winograd). LIBXSMM's Deep Learning domain (DL) is under active research and quickly evolving, and hence reintegration with TensorFlow may be needed. This document focuses on building TensorFlow from source with Intel MKL and MKL-DNN plus LIBXSMM's code for sparse Matrix Dense-Matrix multiplication (SpMDM). LIBXSMM SpMDM is rather stable and integrated with TensorFlow since TF 1.1 (`--define tensorflow_xsmm=1`). To start building TensorFlow, one may clone the source from the official Git-repository: ```bash git clone https://github.com/tensorflow/tensorflow.git ``` MKL, MKL-DNN, and LIBXSMM do not impose to build for a specific code path or target flags and attempt to exploit the most recent instruction set extensions. For most other code it is recommended to use a recent GNU Compiler Collection to build TensorFlow. 
If the static code path does not match the highest possible CPU target (march=native), TensorFlow emits a warning at runtime which is reasonable given that libraries such as Eigen may contribute performance critical code paths. With any [recent Bazel version](https://github.com/bazelbuild/bazel/releases), a non-default GNU Compiler Collection can be source'd, i.e., it can be added to the environment just normally as shown below (the second block of exports may be safely omitted). ```bash export PATH=/path/to/gcc/bin:${PATH} export LD_LIBRARY_PATH=/path/to/gcc/lib64:/path/to/gcc/lib:${LD_LIBRARY_PATH} export LIBRARY_PATH=/path/to/gcc/lib64:${LIBRARY_PATH} export MANPATH=/path/to/gcc/share/man:${MANPATH} export CXX=/path/to/gcc/bin/g++ export CC=/path/to/gcc/bin/gcc export FC=/path/to/gcc/bin/gfortran ``` TensorFlow may be configured for the first time. In the past, Python 3 was problematic since it was not the primary development vehicle (and Python 2.7 was the de-facto prerequisite). It is recommended to use the default Python version available on the system (Linux distribution's default). For the configuration, all questions may be (interactively) answered with the suggested defaults. In earlier revisions of TensorFlow some frameworks could be disabled at configure-time in a non-interactive fashion using environment variables (`TF_NEED_GCP=0`, `TF_NEED_HDFS=0`, `TF_NEED_S3=0`, `TF_NEED_KAFKA=0`). However, the current mechanism to disable certain frameworks is per Bazel's build-line (`--config=noaws`, `--config=nogcp`, `--config=nohdfs`, `--config=noignite`, `--config=nokafka`, `--config=nonccl`). ```bash cd /path/to/tensorflow git pull TF_NEED_GCP=0 TF_NEED_HDFS=0 TF_NEED_S3=0 TF_NEED_KAFKA=0 \ ./configure ``` Bazel is downloading dependencies by default during the initial build stage and hence Internet access on the build system is highly desirable. 
When behind an HTTP-proxy, the environment variables `https_proxy` and `http_proxy` are considered by the Python package installer (pip) but they should carry `https://` and `http://` respectively (in the past `pip --proxy` was necessary despite of the environment variables being present, e.g., `pip --proxy proxy.domain.com:912`). ```bash export https_proxy=https://proxy.domain.com:912 export http_proxy=http://proxy.domain.com:911 ``` If the build step of any of the Bazel commands goes wrong, `-s --verbose_failures` can be used (`-s` shows the full command of each of the build steps). To start over completely, one may wipe directory caching the downloaded dependencies which is located by default in user's home and called ".cache" (`rm -rf $HOME/.cache`). For non-production code such as for debug purpose, TensorFlow can be built with `-c dbg` (or at least `--copt=-O0`). For further reference, please consult the [official guide](https://www.tensorflow.org/install/install_sources) to build TensorFlow from sources. In case of production code, it is recommended to rely on a moderate optimization level (`-c opt --copt=-O2`), and to better focus on a reasonable set of target-flags (`-mfma -mavx2`). MKL, MKL-DNN, and LIBXSMM make use of CPUID-dispatch, and it is not too critical to pick for instance AVX-512 (even if AVX-512 is available on the intended production target). However, if the desired workload is bottlenecked by Eigen code paths that are not covered by the aforementioned libraries, one may be sufficiently served with Intel AVX2 instructions (`-mfma -mavx2`). 
```bash bazel build --config=mkl -c opt --copt=-O2 \ --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0 --copt=-fopenmp-simd \ --define tensorflow_xsmm=1 --copt=-mfma --copt=-mavx2 \ //tensorflow/tools/pip_package:build_pip_package ``` If specific target flags are desired, one may select depending on the system capabilities: * AVX2/HSW/BDW: `--copt=-mfma --copt=-mavx2` (as shown above, and typically sufficient) * AVX-512/CORE/SKX: `--copt=-mfma --copt=-mavx512f --copt=-mavx512cd --copt=-mavx512bw --copt=-mavx512vl --copt=-mavx512dq` * AVX-512/MIC/KNL/KNM: `--copt=-mfma --copt=-mavx512f --copt=-mavx512cd --copt=-mavx512pf --copt=-mavx512er` **Note**: In the past, TensorFlow or specifically Eigen's packed math abstraction asserted an unmet condition in case of AVX-512. Therefore, one should either (1) limit the code to Intel AVX2 instructions, or (2) supply `-c opt` which implies `--copt=-DNDEBUG` and thereby **disables** the assertions (at own risk). As a side-note (this is often missed in AVX2 vs. AVX-512 comparisons), AVX2 code can utilize twice as many registers (32) on an AVX-512 capable system (if instructions are EVEX encoded). To finally build the TensorFlow (pip-)package ("wheel"), please invoke the following command (in the past the zip-stage ran into problems with Python wheels containing debug code because of exceeding 2 GB for the size of the wheel). ```bash bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg ``` The new Python TensorFlow wheel can be installed by the following command (use `sudo -H` in front to elevate your permissions, or add `--user` (this flag does not require a user name argument but implicitly specifies the current user) to install locally for the current user rather than installing it in a system-wide fashion): ```bash pip install -I /tmp/tensorflow_pkg/ ``` The `-I` flag may be sufficient to reinstall the wheel even when the name of the wheel suggests that the same version is already installed. 
To make sure that no other bits are left, it is perhaps even better to remove all TensorFlow wheels (system-wide and user-local). In rare cases it can help to start over and to remove all locally installed Python packages (`rm -rf ~/.local`). ```bash pip uninstall tensorflow pip install /tmp/tensorflow_pkg/ ``` **Note**: Unless a workload is symlinked and built underneath of the TensorFlow directory (for quicker development turnaround time; out of scope in this document), a wheel must be installed before it can be used to run any TensorFlow Python-code (the desired workload). ## Performance Tuning To use MKL and MKL-DNN effectively, the environment shall be setup with at least `KMP_BLOCKTIME=1` (perhaps more environment settings such as `KMP_AFFINITY=compact,1,granularity=fine`, `KMP_HW_SUBSET=1T`, and `OMP_NUM_THREADS=` are beneficial). The `KMP_BLOCKTIME` shall be set to a "low number of Milliseconds" (if not zero) to allow OpenMP workers to quickly transition between MKL's and TF's (Eigen) thread-pool. Please note that LIBXSMM uses the native TensorFlow (Eigen) thread-pool. It can be very beneficial to scale TensorFlow even on a per-socket basis (in case of multi-socket systems). Generally, this may involve (1) real MPI-based communication, or (2) just trivially running multiple instances of TensorFlow separately (without tight communication). For example, [Horovod](https://github.com/uber/horovod) can be used to perform an almost "trivial" instancing of TensorFlow, and to add an intermittent averaging scheme for exchanging weights between independently learning instances (Horovod is out of scope for this document). Similarly, for inference all incoming requests may be dispatched (in batches) to independent instances of TensorFlow. For the latter, the web-based client/server infrastructure TensorFlow Serving may be used to serve inference-requests. 
However, to quickly check the benefits of scaling TensorFlow, one may simply use `numactl` to run on a single socket only; multiplying the achieved performance according to the number of sockets yields a quick estimate of scaling performance. Here is an example for a single dual-socket Skylake server system with HT enabled and sub-NUMA clustering disabled (2x24 cores, 96 threads in two memory-domains/sockets). ```bash numactl -H available: 2 nodes (0-1) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 0 size: 96972 MB node 0 free: 91935 MB node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 node 1 size: 98304 MB node 1 free: 95136 MB node distances: node 0 1 0: 10 21 1: 21 10 ``` To run a workload on a single socket (of the afore mentioned system), one may execute the following command: ```bash $ numactl -C 0-23,48-71 ./my_tf_workload.py ``` It can be assumed that running on two sockets independently is twice as fast as the performance measured in the previous step. For any benchmarks, a freshly booted system shall be used (alternatively, a root/sudo user can drop filesystem caches and defragment memory pages): ```bash echo 3 > /proc/sys/vm/drop_caches echo 1 > /proc/sys/vm/compact_memory ``` To gain insight into performance bottlenecks, one can source the Intel VTune Amplifier and run: ```bash amplxe-cl -r result -data-limit 0 \ -collect advanced-hotspots -knob collection-detail=stack-sampling -- \ python my_tf_workload.py ``` ## Validation and Benchmarks ### TensorFlow Model Repository This section may help to quickly setup models from the TensorFlow repository. Care must be taken to ensure that the model in question uses a suitable memory layout for the tensors. In general, the "channel-last" format may perform with best support (NHWC-format). 
If NHWC is not the default, the model (benchmark) should be adjusted. ```bash git clone https://github.com/tensorflow/models.git tensorflow-models cd /path/to/tensorflow ln -s /path/to/tensorflow-models tensorflow/models bazel build //tensorflow/models/tutorials/image/alexnet:alexnet_benchmark ``` The above command may be combined with `//tensorflow/tools/pip_package:build_pip_package` to build TF as well. Please remember, the TF wheel needs to be only installed if the model runs outside of TF's source tree. To run the "Alexnet" benchmark: ```bash LIBXSMM_VERBOSE=2 \ bazel-bin/tensorflow/models/tutorials/image/alexnet/alexnet_benchmark \ --batch_size=256 2>&1 \ | tee output_alexnet.log ``` ### Convnet Benchmarks The section may be outdated due to helps to the Convnet Benchmarks being superseded (Alexnet, Overfeat, VGG, and Googlenet v1). Recently, the original Convnet benchmark **stopped working with current TensorFlow**: please rely on TensorFlow model repository (previous section). ```bash git clone https://github.com/soumith/convnet-benchmarks.git cd /path/to/tensorflow mkdir -p tensorflow/models ln -s /path/to/convnet-benchmarks/tensorflow tensorflow/models/convnetbenchmarks bazel build \ //tensorflow/models/convnetbenchmarks:benchmark_alexnet \ //tensorflow/models/convnetbenchmarks:benchmark_overfeat \ //tensorflow/models/convnetbenchmarks:benchmark_vgg \ //tensorflow/models/convnetbenchmarks:benchmark_googlenet ``` The above command may be combined with `//tensorflow/tools/pip_package:build_pip_package` to build TF as well. Please note, the wheel needs to be only installed if the model runs outside of TF's source tree. To run the "Alexnet" benchmark: ```bash bazel-bin/tensorflow/models/convnetbenchmarks/benchmark_alexnet \ --data_format=NHWC --forward_only=true --batch_size=256 2>&1 \ | tee output_alexnet.log ``` ### Running Inception-v3 This section may be outdated, or data source may have moved to a different location! 
To run Inception-v3 inference on the ImageNet dataset, please follow the [instructions](https://github.com/tensorflow/models/tree/master/research/inception#getting-started) to download and preprocess the Inception-v3 dataset. The relevant part of the instructions are duplicated below for convenience. ```bash # location of where to place the ImageNet data DATA_DIR=$HOME/imagenet-data # build the preprocessing script. cd tensorflow-models/inception bazel build //inception:download_and_preprocess_imagenet # run it bazel-bin/inception/download_and_preprocess_imagenet "${DATA_DIR}" ``` The final line of the output script should read something like this, note the number of images: ```bash 2016-02-17 14:30:17.287989: Finished writing all 1281167 images in data set. ``` Please [download](https://github.com/tensorflow/models/tree/master/research/slim) models/slim as well as the [pretrained weights for Inception-v3](http://download.tensorflow.org/models/inception_v3_2016_08_28.tar.gz). Please setup the environment variables as follows: ```bash export CHECKPOINT_FILE= location of downloaded inception-v3 pretrained weights export DATASET_DIR=$DATA_DIR ``` Please modify the file eval_image_classifier.py in models/slim so that inter_op_parallelism_threads is set to 1 since TensorFlow/libxsmm does not support concurrent evaluations of subgraphs currently. 
```Python slim.evaluation.evaluate_once( master=FLAGS.master, checkpoint_path=checkpoint_path, logdir=FLAGS.eval_dir, num_evals=num_batches, eval_op=list(names_to_updates.values()), variables_to_restore=variables_to_restore, session_config= tf.ConfigProto(inter_op_parallelism_threads=1)) ``` Run inference on ImageNet as follows: ```bash python eval_image_classifier.py \ --alsologtostderr \ --checkpoint_path=${CHECKPOINT_FILE} \ --dataset_dir=${DATASET_DIR} \ --dataset_name=imagenet \ --dataset_split_name=validation \ --model_name=inception_v3 ``` Please verify recall and accuracy as follows: ```bash 2017-07-13 21:21:27.438050: I tensorflow/core/kernels/logging_ops.cc:79] eval/Recall_5[0.93945813] 2017-07-13 21:21:27.438104: I tensorflow/core/kernels/logging_ops.cc:79] eval/Accuracy[0.77981138] ``` ## Development and Tests This section focuses on LIBXSMM's integration with TensorFlow, which has two aspects: (1) sparse CNN using SpMDM routines, and (2) CNN using direct convolutions. To build and run the regression tests for the sparse routines (SpMDM): ```bash bazel build //tensorflow/core/kernels:sparse_matmul_op_test bazel-bin/tensorflow/core/kernels/sparse_matmul_op_test --benchmarks=all bazel-bin/tensorflow/core/kernels/sparse_matmul_op_test bazel run //tensorflow/python/kernel_tests:sparse_matmul_op_test ``` As suggested in the overview, it is still possible to exercise TensorFlow with LIBXSMM as a compute engine for a very limited set of operators (2d forward/backward direct convolutions), which may be desired for testing and development purpose. To enable LIBXSMM's convolutions, the flags `--define tensorflow_xsmm_convolutions=1` and/or `--define tensorflow_xsmm_backward_convolutions=1` are supplied in addition to `--define tensorflow_xsmm=1`. It might be even possible to `--define eigen_xsmm=1` if not implied by the afore mentioned flags. Configuring MKL-DNN (`--config=mkl`) may take precedence over LIBXSMM, hence it is omitted. 
```bash bazel build --config=v2 -c opt --copt=-O2 \ --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0 --copt=-fopenmp-simd \ --define tensorflow_xsmm_convolutions=1 \ --define tensorflow_xsmm_backward_convolutions=1 \ --define tensorflow_xsmm=1 --copt=-mfma --copt=-mavx2 \ //tensorflow/tools/pip_package:build_pip_package \ //tensorflow/core/kernels:sparse_matmul_op_test \ //tensorflow/core/kernels:conv_ops_test ``` To build and test the CNN routines: ```bash bazel build //tensorflow/core/kernels:conv_ops_test bazel-bin/tensorflow/core/kernels/conv_ops_test bazel run //tensorflow/python/kernel_tests:conv_ops_test ``` For development and experiments, one may clone a [fork](https://github.com/hfp/tensorflow) of the [original](https://github.com/tensorflow/tensorflow/) TensorFlow repository: ```bash git clone https://github.com/hfp/tensorflow.git ``` To get nicely named JIT-kernels when profiling a workload, LIBXSMM's support for [JIT-profiling](libxsmm_prof.md) can be leveraged. In case of TensorFlow, the following flags can be added to Bazel's build line (Intel VTune Amplifier 2018): ```bash --copt=-DLIBXSMM_VTUNE=2 --linkopt=${VTUNE_AMPLIFIER_2018_DIR}/lib64/libjitprofiling.a ``` For Intel VTune Amplifier 2017 this looks like: ```bash --copt=-DLIBXSMM_VTUNE=2 --linkopt=${VTUNE_AMPLIFIER_XE_2017_DIR}/lib64/libjitprofiling.a ``` libxsmm-1.17/documentation/tensorflow.pdf000066400000000000000000005471651415223013700207030ustar00rootroot00000000000000%PDF-1.5 % 7 0 obj << /Length 4928 /Filter /FlateDecode >> stream x;ks8Wjڈ!A|q8xngk*EK EjH*j (Yd FrL^ytӳ(ڏt&I&Y|f>wy+]s^{V;O?`tN#?#M~S*^sYn{]d%O~8?!2`Ls*Sikedhl{3AH$ lOυ'MAi,rD"|{+f2\/(f΍ N6ʚ]θ*CUzyhx~ rst:7 qQ5oyxLK::S 9YJ_=c> u$9gO=iN0皫D*T^PgC5<{ wCP&Z[@ {ĩ/G0) us\oHf6([ĭ\$D03++g,{fS! 
aidVnӢzx}$*<՛MՔ$3o`uI\ Yo2h=ApΝ;Έ\PE?h.U\N:YJ7pfr'$ B?Ak< R`?Q YpO}f]:,[m {nh=p$ jiM5ubɫR0 ̫ʫhK"!ૐWG췤Igj!\m{ Y̛kv-aBG wGwRʰ?DtN|ThDmogN~Mv Wш-I=ڏsfE͋b|_8JOGEɎEdQ$lz;tK{PY6Y?QL'> Ws{noA.ݾBJ~ZlG.@(3?4ġ4~,JCJz6l?'x瘻e.[v|4#j;x1ct-q5U\ v_n[#:6 z%y9::7E?ẋ~H{ĭ׈붉i0bZpcU{oD Xenڨ'b1BJԧgQjް)?%is쎦B kmZӚ߷eW:{j\7&Ͷ3bKCt GI!Jǎ`@KY9繰z2R{s7o^vlY`^gEϮ 7vA= t#8BÐscآe}. RRlk6I:L=%V)% d{"$NM{`KLCb/G= VYh!5J@45G`OšDŜKBۆsQ x НȠm'! q {K40cyW_x<փd =(h}(9=xb`0>D_@Hc`Eq4Fg?x  RXMs޸L"E$T Dg۶:V \rrJdi{=k5+m]eEN\ӊbJFt\>{,qc!{;,ga2i5a'pfX[d53͑pLX|*؏v~QfIu?F H՝1'Oٵr|2[O)3jCHҳ'CaMKqo+y:8 5=_u{d3=_9SƩQ5v m@刣e~_ ~?+/r`v;Voay6ŕf(x&;)H&@vJAM1h J=ťzUSH*B]N|3 à$AʍME)Y@p4]R"kٺ:N[^^h+`zUoCd !l w[;s( (C5CC:OyknhBT9[ҟ5gyM"HwczQ) 8p@>VtÓ `y*jgvpksdVqՆHyrɿjWC*2@Ϊ,,W'%]H7S #Kءm,['_ʷ>g-w(pp _0Nz_8ig1eV*3PĆ+e3@+mD Üfq%(܀k.dBΊϦj:qQdΣ]B*$tǶ3B/: } R?aClV-Uu&p3FCRmi5L*d&VPD6S R7]3z&ȞUy7YȻ.7C qf~JLptn{B˜)S?4K OFwA4yɢ>h$4cXv\){U#Jk+bIbVɟ2ݤ /]i\󛿞zW%NA.ɯ9W2Jsf nGYaQ MrxɄa*2ܮW6vHd}Uh-$K+ǥNb t-c:czϯ,Y8h.vՅ VUK=Ħf?5x4^t(? (ض# Y&cIE~I s bM8z-`!Տ)-`{W8W)YЃs1?v6I2&}wH/nq' P86}9◷ymlB7B "qs Wc`,?EZ+~9҆6%El>v  _S4tj|" eq\6 8bgLN+{K 0\ZEg.E{N@&|~BFgr{5}_ڰӇ@mǐ0~>:LHg)?vons.0WHFzQXsT9)#l.uȆkSKKe<_>o}&2[ JzJnלOd lJb? rrXZ9IJRqi?d #T~F|uÿV endstream endobj 45 0 obj << /Length 4733 /Filter /FlateDecode >> stream x[[s:~?BCWY4g'Sx'3'[sRI\S8۷oEV|هCnl=f/g`iL{nY]糿;y8%^({*(_R>f'TYSϩ)(]QZPNvb_lOqP #3a8R8 T(0GvBMh|7uhHC߯uh4LYGVmvw^bsr^;s.O's.I >F==}BxJ u, 73MQ7MqPCV?9- VP4ݍ؄m)ReEB )5ʯ(_sK5_(mԖ("Yj 59&W<6LsE Vr IbI,$6A|CQ-"1:0&PpH|Ɲj;JyZJJҌv~*H?RB/{Δt7҂;5d.)N\Ԡz촿g|5CQzBt(^'z}iQA,˚08Pfp&7mјԕTok[S,eȫܿ#ܵr2e,´ @UߵJog_:~أ4q$z=oWyá&M?j@8w/2wMm܊Ibbd*aeVv+Ѳ0u @A00|D!fjĮR,VؑC^|DdQ8 :Z̏V/nV-V8?"{s]sͺ E5ٲ+4b.2}e")IP2k ASME"⥢-.4fU7%߰4Φޗ97D ^ED&EAŲgYY$\]Ϳ;Is@L{Sk4z 9uv;؋zB78KNPǁH!Tr2Xog7#;] VIRL?~Ԭ8=mnYB$qb]'}8XQ`|pDZu6;hhu7g=чru(2MB[(6tr swݦ'hcYYQ 3aR5T,7dVj4&G9(p7,{}tjȟqArr]CAj(=]rbbrс7#7ysÉnFUQ@|㢨-Ê/E8Q#E8{D";Jo(]?b4 O,E-g㠓A! y. 
Ր0P(617H*.!NٷMt&`& ky-O *tBVjj4ݔ́[$pQ;0@ )Af[%|8 `!7=`gfk 8g}Y |V65VoY%CpYt냘 QYPr4u֥?4!c fk,F䅍}X?|eЯ$g7􁞢B$Atc~[VYsg=70a8"=EF!^H[J;J3JqS H0k Pkg239HQxzIQچmZ:c p)̬AƲ)SZX#i>˔X}+ lW`Zb>uhd Ap~ 4}b~ 7 3𔂗)tPR>䊜 {,C3Q3 }-RaB~zmڮ/ʺm? i%Kde0 J d,\m%m3ˊ929-L΢yBiVA.kBXv-Ѯ[ܠ@il''ND}jY򡟒a͂Hg Fcd*vBP9'MkbP EwZ$1(7bcht\E.7vK:,HCNaDj%?rک18 #;puۨwH?ܲy=ǞQ0b=X2  /|Kyv 23GRx Gn&-+ObAj㋴Bz?_j?r>Tml:P-BVܔus "A@F!X;i_BUnH0:plqvmv!O/9eW7w\s\w0ږ.C~:7U"&`A\ Dlkg,sd%rcMeO]hcb=cؠ=0x.!rjm/ GWGn싧h<}_qKZ{J%Vzcj_a:~{&< iV kdjу׭*@jp=h}$4\}[j/߿R~lj>'x.(k\^vC tg9rF02=k{{w^<EqCO /J(X .͒g= (4ZYhիBTڗYStwOWEe~vuUQj/?7}I:><l*(tu,L/oz?Wa6w-^ÉnLwn,['@ijaW5*?pw=?3۷r5Fngw]<|uyɅ=2>Y]`dxf,Ӳ!G`E*vP,lz+5 M1ݣN<;-h[!(}I.[)DJU o)pDak{w^r@G.;I~`|/3}M+ԲUh|'IR|6s;q eo"?ak0#"*bЮ4v@耼܁Yq4"}0` qE.kd<$Ϯ76 dXtCyEW ?ge[Ygb8ofp LNin|Jrr;DVnVDv 7h  PqLfzBAl;@5Ƿ{mB׍xX7hgkC%clT^B /-%lghmIc }(a(WHKR^A0gf{9Ͻě,>Rj}}$RԹ*79W=B+v`"Q=hubD”EYn#f*\~ 0NK▶ fvl qa:+V35؊“o`CXaDPڳ@sU˲ןu2m2F%sKJ0W0o0<ߞ>V;tboiUz*.`7Z:s<}NRjj] &@s, ƌ#ػ2V [v]t)}Fnt(:Sk:c%!v\ܪÐJ 0"sռ!Pk-w?%ƨ"}ߓD τZn+LK>!DEbбF8.$u.z"s d B69Ah1Mi@a r iĜ9d߄CW7wevco(@DpTQVK$ oeN~ۉ2 uCײeoZ/.SΧ%ZC,ٚmzTOe4׋<OQ_؟__K2E endstream endobj 66 0 obj << /Length 3646 /Filter /FlateDecode >> stream x[o~p(Z }4%WZWDI)R%);wPZp;;;;YIM65yrߗW_\IdqN8'61fr}Z晙D@t5'i:f?L, 17:tCapz 4rsJ/(]QuL_6$ ItE`?HA*ՑB B&*PY4kdQ:3 iܐ q$Np$qP'z 3j;'Oğ? RdȡƂg2L$3$2L#90#90P"9Z$Gh-z"H'DXrKbA,9%2Id&A"9Hi`4 쨬${ t&8PWw4̼<23[JGϟY?y1r#wȝ2r)+wJ]RW+9+9+9PrJBA2L,3TC#3VfBd&h̸r=\O*9H% ȁvTdn9$ dq }ډMXJ+J?W)T;H ?dG@;.cxKpCe!sL8!}MtI4rNoD3+>:tz!R2i$QKb jd*e =ќhD˨# ͉. u{B׈"НX`E5\FB1b`哭O-ц( H,H3&2DxhY2ڀ'Pq'$T#($Kױ5njU*"sn̛)Pܢdմ} ƺߡUR5+ ];^]Tzta4YXt路U'D4՞S:|k]\ 9wSf7 tD+Uu,o,l-XiKSCWp$ΙCu&,k0Cj,&iB q*Ow(|.%_'et/%at')8K$F_BpFm^]6񗌽.ya];{3q,bܕv$l/{ګÞgӊ;䟾+Nj ӷkn[8@c'NttP'-Fc9|Y,)pϾzW*VlIdl5+5)a)raqe`G0>[RQjCpaj zxF<Ɠ/%QAprLƒr Zn[)^?+~3Z_1Zc\G8Da:ӈۗ31⸢9j:PObH&-3~ ?" 
d-ה~9h%0a-8 @XǑUI0o= &1  u Z([E ó N4 I,6r#BϘo#h!he^jq/S ;Iӝ7J[yݜHNWz @.n.SJi]ٍ8HWZ)Q*&Fh$%#s27ψ6[:p1& rbLD At5C5NBXBt0&o9ͺj.}˛|SXju-J>Wa3MƓ"ΫN()nYTU0`hM ?ȻCVu3>\u_n[.W=8C>.X]W#h=h 8 +W~7 EdU1xb @^*i( BT}U \(]dddq xIQ/~GwÍ1 &5Z /p|k9ۅN"_A 0Xc8b-a"];\3SZ.~D&d"x?94Dji"#tƬF*NL$6l$Po"AfFI A m:YxGvzRNH`$C4"JϜT0xΣk g7V4,KB$?(JREI|>a󄀉 L(\bio2&@@Px- +|`B82K XU&S&|"/V؅߈"wٹ׌$yss芮'?},i&2B}\?DE/ndb|[F'P,΄aa~0kO,ϊ@f=3šEqckqtF->0]&A`vzk endstream endobj 2 0 obj << /Type /ObjStm /N 100 /First 788 /Length 1381 /Filter /FlateDecode >> stream xX[oU~_1r\f會J(P E(Zoe;I$8T!}f(R)I&I$\)E)%RR&PITJwʑ'ʉZ+3, u#0d"CjqD=JT"ę, 81 >IO>J >CEbזA#+V|Ћ'vL a3L 9* !'g@W@dʪehc0_!橎C<QXacEbd&xf\!P@6Tיcy9CF |97U>W_E"1tAb֑*p3L/. &=u5'dRW|P V.PYrbSԒɐL фl`:R@[ӂ+Mx9͖HD>밷GN訠F^Ҹ?ӣG0fF㋓Wa|:-7fMpr.Vg5n"H Jhx pr9ixu3Olew)clCG`fK`eTkJ#Pϗ6i9?|n_lެWp61=[^Wjg믡i(n TK@~s]ˀȬo2LW|ޜ,﮿^GϿPj HCZ^, 2,1y~I$2+({NW!<iH}HCS F:=琺KR c 6>CI̓ {b!KhcUh걨zXQ:Fb;6RM*2IH|I2diVwɦG'm{]0_z-׳}go7w˝rwqy`i6lIh3m#h>h]Vi59 p @h\EZeK1reEG}1{j/FXC샖k7./)򾡼o.,Tg46@=u zOWw؊ʎU}.`+e -֚9J@9hqE@{ Z] A|% V>VE;I.l9U_i UAZh5v>WZ ~j@闆ņ P}]4_ðm^|{:RTE_qЌB|i(Ư&lWquW[뵻+^5|*%W'jJߛSh@ endstream endobj 125 0 obj << /Length 3862 /Filter /FlateDecode >> stream xZmܶ_uVt^).v&ud~HnW+X+-$흝63$596_F#!n,>$0Ͽ,?y i/E$, )B? rλ Exuz;[5kW4;=,k^06Zz0 /[Tnzx,hPwI=ǭ;pM/#_V|w j E_j$[ D$d "5w_ӛV'T(qNH?&"vءvz zϛבXhnGA:j8YdiAX{E+SP3oBu{8F$ޡ>nP}50s |:lжusc~WgެX˿q̦?ZMֹ|m6{첮}HS"K1.^I^S*%؛}Y֦,7=z zR6ꮚ~(Z,^+-^!|P$6zwj 4 kD|OoO]Yjq-d䵎2M9D! N'M/dPF~n)erj#dLؤ Sc _{d@I"νRf3P$/ieVjF?3|NBnRm.?O$UCp9.muk{E d4CY ۢ( `&ޟf(D#ۮChJNO5*1MPR1'5e&x 25~fM63ݒkeYu,$Mi \)448Tms}ME,#3fC f0rЋs@/(Gژn<5=U[a8sR Z͜톾&Gzئ>+=hƚ,1֑W&hM6#fiKp?rRЎkMѿsA޷@z}9<g5x? 
Z|fݺviOVSņZ]yZi]|0Y:eQƏ4P@d]YwWFİXtk_.>ݜu֒4 _ڶ$] ҜLyT,ܡ^\Fǰ?8Zǰ<iTS ܓH$@3 0#kn, %3YKu8f+ .tx^7ھpK ߀r`ap50 xp9^ϕEN'a%Mn-OeX.xd9_x^Q/PP2 Ň?l_$ٔGkf.o|1[il[-G8GDz&̌wn%F>γf iYCNXbl(]+EՄ~^G)%O_0+,yk) -=Уf}"~G>g}Gzt(Yɐb=hd9yP _*P )8jg9+aNee-$[8e0GИ[v  9.9.s\.s\s\ HAL'-FX?A%3Klh#SS}QkԏaEU@7I) YǓ uWnSm~W5aO1-sqYOǒGe=P MU鐭KÅ/YԠ3B[%w4DjCrD2#Qyd 0X }c$kåVlBpT(s Dl9Ch@TB\@Mk u@() .#Rcg|` _řƭi_aB["%}rֲ) ȄGi2XeW2BP\ /y1Jmn8=tZBw` *Kaq_|H[~0ĖlA N#} }mU]4P ~.وEG!]PV{B(5 2{,4|B>ڽ)/ +oti{Wu:UuHS:yI .vK@JtKL͘֎ C(syD4#+**Wܶ-EyHct0Vhgi\\^M_d /1KD SEƿf੘XKh%:V_?SUR<5Yjk+պ.^ZBfJuGo዁*Քݪ.UX {{C<{Q<[z '%)QO"S?6צpSWyQ]Habenէmݩpe؅vǽi^JTkR%UMW_HNYV [׊}*ړA V?]]+fا*%%wv̔;,\zzZ$eቪР :JLM5/oWp:x^nMdOFEms;6@nVZ^M1pGfcwG.h\8&.V2TVu=YK_ͩ)ܱra<b:7XOG*{X>rlD'ӊH$=g;K27%pX`' ض# 1dh92)\Al;TJMI9Лj.&ĦLG!v*.JΩx>r8oQq3g=;7\5Ҟ-oi&Mo)z3.SgSmFQ=z[vƶѸ+`tB)YQD Kj*MM=)jR';L~4xHoL1$ȣ%GjIhtMi$;E"6ߑcsV~w(Ym/C(wl֭b3aG'LUE H-!iOoYLx,ź;[6r`tg2)5x@v7Ƭ&\1MEY01+E6FS7_2~i[M'(yIJ?r١wmTiMR ;)]K}W}Li el m wғx`86` DT>@mǪp ; =Q6-8vչrXé eLuRQ3۰5qgxwd.n\“y|Q,%1tgn뢮ioLXlX!g-~0>]֧#O՞[c"Vzvpԧݩ7$+ 5,FjY촽(]hn IԿ5gG1mhUK4DfvZ1iXJGk4 vM3L. 
ԆsIϥ7t fjcs$T VQ@ (ؘ(ς|``OM$u?+/0Z<[d`~!\(ަ{|3I,7<%jɿiy endstream endobj 179 0 obj << /Length 2941 /Filter /FlateDecode >> stream xZmoܸ~b?8-I I| UNRw^H-'uȗ9|8C %&}zыHO $QD桎*T 8oIy]vS6tYP4 L4Ao?Br"E\RPP &IP'|zQЗjTw}imӧo_hU3zC : naͩ)6}۽۩#(Ữ[7HZf]x:Kb.!HQ-^Į2$~o01z{vrmTtn)xʎTeQu0Af׸ܶ!,}80s#A]U/ fkpXȖ}B~:˶3<JT])ö"8BIN$ $a,p.1V?zi,Pr2iGt59Y%k"1;bU) ,iLe|?h%hE& ͕P))NzZ^d0Uؾ$ݑI[DKo7T{Mth(8)M#OZ {J($Z4 $A0hGT'Q4E*Uq('D&P(ǔ*Ι6\\4܆;M$i;5[Fq}JY}z;+{1 H]C )=`?n%c/a/=쉩eH$% ikJoWYD= s8[_Z.i]G|; >ߏ?71{5і({Ō˨̌ʜ;Ȟ4 j4v[V+0g+4@٨F(sew]lʠL} \CUUs[2[0.k|Wv7\6!`\li.fLeKLE2詳(ƛPYNOl‚.?l*6Lr;g,َK՜3e7n sc9+k+*sk3޵4#쬑LP;(fg${+eʃ,?Q4r5ngN2I7T q sW'Q Whud>,9{|șgGƮALs)HP2s\4ߎ"D9IFRO~#a9UCmH6ٷ14Xegqc_YP?@F7o͊@~UbžtP=C&|x r0G7&e#4^2{gD*[7ᐶ17bO7oTlC| ㍷.wժjP]6,ym_ m;VE>˶p3* ִZy;i[Y~u\d9 SCluğ56vcKbqwԡ^Mnf=ߞDu%*L3:*X'j^MP?8T5e3fZLmÓ5fDR zGh]Yư)eͅohtP@ /|#P=%SO%CP}CD_}ʱƮȶh)%:ma?|nԹ86,q:QNi/EGx!LU9ze/:\R:t$`{0Q6l!Sg߃ 1,}dy}VͲAY{άVFjw{/\􍊽^zbdW{uXzetAY]>ets8$XpaG,/$P4 endstream endobj 119 0 obj << /Type /ObjStm /N 100 /First 885 /Length 1650 /Filter /FlateDecode >> stream xڽXn7}WKR\ކ H vaZ*iպN=7-$Hzyx876(%r@u}V6fr @9B(譊ѣw*K 1M_6lq{acA#=p00y`&AA=:, L,oDL_*X2|.8 .E9Ú ,돥BzjbqiS| ߬:c5z)|p=yTǶԝ:R7:vjY=^czխӽSn.Iüǯ8EHsVr-(ժQX"rAzzմӺw0oջ5,8W4!u5! ڤi|Wroߡz9ͪ:~λbkU婞4 FX7٢ʹ^j9^wu[jӬj˺]=}YlB >%i82 :LJ|p鑺ojhSMzݴ O'.k8ARgMrcD8ߊIa <#C&o4'Ǣ/1˰h-F?eaPf!6WeFHp,r("ʢ 5߿",3J%ra{aA2FYQ"i+3C`c=G&"f|0X\ˬbY%Nt8?)eFȋ@*YrPG"9cFaQmü2WI&:[+ Yz>C's^(^dZ#lt(y\UWrKUWquuL:iN'\W/Ҡ*B. чיxe8/2~v>[eᒆBH_1DF\zss͇y~_Ap_ڷg[ N7,~xڞ춡F$|#No%%ɽ$?L endstream endobj 245 0 obj << /Length1 1919 /Length2 23011 /Length3 0 /Length 24208 /Filter /FlateDecode >> stream xڴeT[6ӸKpwи n!Xpw's93^w_]wh %US$ޅ '3gafښX((Ĝ̌\F.f.Kˇ efoh 0țy:J@gc#e3{ +{31l <eݝmFFyFCh,l@s@]UBE JXKE$f)uU?jf[>|(1PQV`a F?};"P[8213Z:0,lOt|Ra!ww`pouX9e+$3w177Pt1rqu%xR@ME;ӳ1rߊٻ:{#7mVfwS3+d" $%T>ϞA{Fp3sXxM*ao*Oĭ>td{OM]]>?2 33`abN#f#H`ndlcenv6r38xs ?/́kOא|L)`jfǤthf$]mm̨;edgezi'TY[E-l>*H~4ǡc0sG/؛9;863?UEUE]R7Z[X98FNNFp=fheS3ht08́Np `#qLjF<\&M ɿ S@? 
djAkAdAd09~9~9r\?x]?6)9=L.ܰ|?^_X=ع9 <Ώ,_3\/ "Є/:)WpKPK&b%cG<PL"4o}EFkRԭ/>Xz@.R#o쳙mmDc1hwR,Nh(xӝ.OQF}"+s!Xc2P(:tj/BEU۩R#{Wcf.W}"bOpEd$u ` .͔{շ]/-pRdq2O8Y]5ۛ˘8`Z3B8-BTb:-֍l&ln6B7@/+mHyW)Vdؠhw[1ї .9J.__VxkBI <.|M)ѝU c⦡Qf.؝-6HS]OhHJ-et2t5 !>77QC$5ja]8lr$5HDz[;+=*e_pxQΚsA^zOv]߱+sܹFE9kMR;Wrjf (eGD1T,klB8Rf(T_60 /z}{;:!.XOnS5ſ6]0+D}"L Z;H nS\d YXE` Gf]<,DMT̎u/gDžCI~:i 0wsW%ЍRSRIb"YHbJw9}DT"ݦ8=~2 ]~ Ûa-ֳv1zUQ;U U| W/e29MN{S{Y44i} һrņWi:ğGV *g2"M ړqXw~wdƗ+BE=Cbvv]{c_uX?p$_7.7ֵP;rtt4(b/T0o?OV hzb+q/%.BFYWS:נ"f(s&> †! SH^\䱉m h7Į3y(7Tp\`IQx'Vj洅.ՄnyMxizZC䋡`x*4Z, 9u#WFI(qJ;<)B<ԩύ`]ר-T&XDW?; oDf@ @k3F!$>8=y?Oֱr)P?I#żϲo}1'۫YJLhxJk%? P_&kq_z?`~PmX@̴ X|^C&=87.]md{ ZkM@JR(vO㸎iA򢎠%[%4i2*^,8ǀ@AB7Nka4"TGuH`({@`i8^uTيcnʸ_%ӐEGfy[zVj/&kڨ Izv}YӣԄ7)9 ]ӤmP.RIFώĈY:{D*D!J6ݔI%pEďA֧;%XZ/ 2;P *ʵc5aA @Xg)ۈL ;JjF-7+) 3i-_ۨÃ@NxF}.n0aF,5JuXNLÔHA;/Uc{6$Qk]fA?(W-Zp<4!"%f ^kz^PI!?u+pXfI,8U b o+u#mVVJ\0`I =]`aioh R֪x vL> *BL<_YQIlvFeSgŭH+Óб=kn!߈$c$C_]51U~uc.Vh-ѳc8Srwn8$(q^ rȶ-`Mg[V=X+0jWH-8ݿ D+l =C擓@p0_~؂+1ӴV.r ZT1 X\S0f(@gKb fICPFhq9!x;PL Qվ@+J؀Dz?VƢ " 3n;pX@:߆62FY6Kli(hBU5цZCrQ\0uM 2x~ښ=uX֎LJAsh-/Ie,0E2>K/f[+)q,NA ~rM *W#W?1F  j|O˿4/ Kye'<1> K& E(>zBbc[%)M%B^8xi-iG3@$r%gl=`պ Cud3[}MMI[D *iJN:y ምK6E~RvD=9/I;G< '2+Ą').9z=ڲb"C}Q&m+ȶ/bY;(5 4c4qەWV;XDjP҃!@iPSڑo7"QQs`}Y 2nE9|?X5=g e?FY`lZ͗=֙h]!D!jq/|uնV~1Wnﲌmq YsP"~¼4Nrò0ɄOqyqS8p"fZPhR(+Ggs5zpei[pHwHi+qOb(YV!n yq+Zoo:p?aK8XGyr$B[`DK;ײe)B:0©gX^\e֨~B4!n}o^f w/[YF/K:v9jD-Uνքԣ|Sܭ CҜ(@+/OLw|-!T4 I$I1l-t;OeH< ր{xGꆺX$vGW+ɪ2-ިzaO(Hx.]r7vSP睡9cn Y`>6HE83ԈMJ@RM+͙']O7SP>ios6: 6ׇt-ӊ6Fi'q&3 di!f0*^`P I F+)>yO˷Hez;~N}tpŽڻrE!ląk\kVe.4FkSnsjsoBOK]h^o&]m`[*)v o |ǯ`q2864D4ĊY>z9 #H*wjlEI& V#1$HG@%/6 N="ïW%[#Zֱ^8- 5[qzE lcGQ9T){]bpQnguf8S]p"7$xw'NNW2J-l7<l&i% Bҫ[b,6pvJ۪VQG!ͩ Иp*ǣ`S\ƚ1pέ.g(^%DB x*Hj&(3B"Fs=PLaTI=k, HH۬oSnz$u'xަ?1ʶS'%Q$]0\jzV:GU<)o90FT ۴+* '|0Se}7nW̘xN"jNZ<֗5Ţ[eWe_&}˷]AjQ84]Vn1CE\H3ԥ0' jTk,q1̤ wֹ‰Kص<6\|Omu%+ƾ F)$riEsyB̙C 1|uE7C̎@ZY!&ˢ}yL!MVOAhW^MČ*LXR.#$҂SGno}5]YkmQ3i1$ɜGjkB8<_!14@ ـZT˒jyB=FsLUehz;tsIbGO3 sim}; 
_|1G4~`pɝQ3Nij㉡aHw,!;GJ95ĸ^leR|:ZJ6`ۨ;@H4qzFj6-eq`%|bM/[:~#bS {bN73wC+KŃ)?vފkq0BbLN$WF,;p 2*}HYW7RіS7*4(ܜE/K|͐kE^@/Y#lJsTӓEbؕx!27uDJ\'ͯPtKO @<1 ii,%uƿ#>hm*Tqc} Q6,,EVѶl) ˆ!Ub0G68>xkU/@Wgq߿h8~>NMC®*itrJ*k87Yivw0f9C  8'_+EUqï"UwJNe|MZ:cw F楄Fҫk:%58 6=d~?*"OR@0R2bw'0r5' 4ƆL{CWcG8bOd\7" NG8Dc/.}Fz(s|+Pc50+ԯ,k{;gE A.ih }M~2 `/'HIim ) BA~|[ɵp)^M_E(q+lpB[(^k/FGGI6s`.һyEm|ZoMP2\PlhȀw\ |B.&[ MMjCF 5E*/evBHtz IFtnK`oh0]qsPϷN wCa6ot ]>ga⩊eZ #.1 :Dfܙ-;?FYXOBq?䢶oW dK%ˮ[~,+yJ\:CnT[!n皹O$40).`Oj{etTl"c;~߈̟(]?sIsqD~@_9E}(zPh(:m\E>FGaOyXӫͤ:wxmQioYSo~{CUvS{#D^\tWO܅RxnTMΏDT?zwJ9E@Ʃd|(\]*jB=e ˎsqE9b"lZu\{(P泥aOG63aBi71I7mPogx҈ɺʳ1}nuBI9)oT翌iqY<\=[O͗Ϩ8 e`E'Qo$l1.0$zހpw?|BYUoR""iJe'0h 8AT¼Me?a{m8E.0K}#KaS@.fޢxxsבŭcaBe@-Q7/J?r> 2,!09siH-.r ?>䥆[L B^\*Krr+V?F>BSCsKzwX=es4E'Ϋ OYΌvkaPV* z}ceH#5PNRzoSv28 *)̙#Zrsm HC'bs+ՙa۟aB0eR{ƹX&(w{`(Lʡ}F!acj:CUk3RLMvJWMqxX6=plpx`z;ݾ HDw'c&ڻlr-P4]/WG9P(}z8hUΚa[Wƽz #(gNUo5wGMoĥ9>z gͽb?NN2jD9kz2 f !y}ʦ+X/ ׹'WpsM|UeU%&Khc[5i#OCNXvq^3k~0Vۮ^&r(' q%^ !l # 1~!sCtW[d]|E77i|ӛ 0|n9M֑9N`+Qiv3t.ĸ y:H_2 & ;N?]06}\W)e"yI[hͱZ.X{`}FPxSp~K) C9 0e` +.d!Uz ˠx'}]wO5j<+Y3.5MM!>0ߟN pKlo*4(>9b$^e1yίqar ܧĀkUsٕpD_⛦䷢ ۴`@gj:}f^ aT31@[(V("`Ev\iۨgTZk7!izOkƅȉv&F6r;b|cIvbMXW7KUG0m=qr Ӆr3ԆO,Me 7%w3^0by"mi$i- Qz  j ʾJމh)SH)p\;"_m8Q /vlИ4ѓi@98ZuINвD5a"KixF-NM%ͼn]?p[,vGJۏz.Fb1n#EuDfױ>:7$gPZ~ny9j8Lj?c2SnZ{PfFdYڗ/mvgY,= .Ȳ>?~L.)NLhFCqCaQgQ*;u_Oe@0nufOJtu"dF6~ ڢtk+f>+[.*eU:wK?]$lP:oђ!XSvOm {$fQej;Cs2]5Pwb^9'/ӡɯ)vIG[2<Ϻ;~˼1"|XHt:T[_>RQC{#?Rе*yC ȵ/ea,MuFGɍ_Hr)u1_>%n gQ {~ZВT92Z_UTo#lZ̾~ŧdmB;h bSx#|sT0P 2Kj,` %C15Eǫtt.4MG'R[̭Uad:هܲ6A C$lC8L%|f=W|LORaT5iR@0띦وh<|MgKܷB u>TϬ=8]~r0:W[iPz~@mIwxF4b|$,zI&#0%W`Hi*ꖿ_|&ڣdǵ}E%L*Y%QӾ 3/7ah^;rT:9@3T1]"%NqzRCw馷n;1g-)5AçTCA[ocp#h-@izRz_.|}``f8Nʣv8:Γ8̟w~z\K6-Ͷ O'7F:)O [ZdF?xLG\Y:N^L.НIXu%cz/97'Սq}>蓴ɬe5} JsdS٠`se$O F*Y-.,* YE89&o 3劙w-Sog3Z @ҤVkgY@/3sŋ͐ |6VG=Es܁[Rv͹G;zۿ;}x^$6J4X~abOB^jm&lw(09Ca:X yp[{(W\q8-x1p8s;`.be/ 5UDaY .7D9 D%S%3K<9#V4g;7o4(b/(hz153µX(KjЌiT}7qqL m̒، 7X};AS5"(ݶ!u&чur$| ,Q͂_MMe\:߳P]ZEPq >7x4\1Jʡ Xy" `fJJ}KYxcnAB&Nm&E̬F 
w8$zCdZE8T:.~n8_gSFcoZo-!ҙmKmi?DIowSeܙމ{(7jOЊC+ bf۷ ,M / _ K^[0齆מzJ=}#/yB*LmLj3o罘00$O,R3(ƇtXc;ҡX^\yw+7tPDr.tGݛ)7=hml?~zbyt|pHJ,+CC<мWb5@RqpszԨ#,.wx!.MDɎiztD̀KxS_{7XఐMtBaDw7*CӹH+1gpX.T0K8ؤ"I2sK ;B)l@"XdLnޖ'36gD e}mʭOs0(P?Iu:6Wz@< "7`X":Z{CJ|ȋM@}|2D$KSNet3ij{J-t-lLقcdO~BW|lUqJCӀ6GӬsl6=hR}6Nt D5oHBx~B&2$PTumpo#i5͛yr7¶7 jcQ6iZ]DE$7|MD0P=9_ގ+گmEWY7uJdҾ94K "ngSK69̌3g1`g8e-@^0ݫ+j~wF+oY^COrmTP_$|W%7!%fg0Qz_m+GaB{̷pc.e7Bӈ9A e/W,^Ɉ"-:*H"GqjZNjH.03tirzRWHhe5]q.o<ꥻlQZovƝgw< @|܆QV?k4"Cٴp(Ӝ;C}z )ezxL/V)f%- ,bS\iOUrk7Tl8z5B: Р.9{7_ʥ aDV; 1[7g=ݭ[9dC÷e!QSDj<_eշU,`'KBw  s9V)ooM*=wP&بjo y[ !p _!o:mhUPE6<ߠDyqiC@W?L,QƼMd֑ N*,EK-Pos\ڰTgWEWqf_OlJZ#LۄN֑DaZ2?h!|kgU([;G?;B /9,HĻR . :ʍBEiu*8qw)2pXIh6?]EEBD_a:tTLFSASt+'.y7eS`,)$ `gؠx3QjwT'|96Hu1*h+|5N^(:=5$@Lf _Ϸe)7xm2jԮF%b`vpx-=×X𤴉(; fszC-,4HAozɬß۹t5úhI*;p ?лcPK`:$-'ԟW5 p)|n|/C!ܲaGڒ. P2~ n}˾c*bbcN~F9 5{k$x/hiX83Cڃ b+BCqK{mOǫsv$X{%.ZORu0;pZ21v|UJrCA W#(ygyz[u,*yʱ\-NKb瘎sRn_na_R+ɆF,ϰCALg\<+}p4_FB^r49l&7Gh !P+`\~1&}bXjjeB̿u\!SRٹ4>SM_@^?`٦PmAmIE^;SYt>p,>v7& U_*MC}xRv&!|43N{Mν ^z0&7\G4GBևqbbHn^*dL)8U^OX^$/m%_4 N_! ,**?ZCF{7 *y =]:佃&ǃ;|TeaT4cfE0M—KH\Qvj'rvfĀ([Ɨƛ0' }Q-g͌hD ^D|V 54jm>.o!b >H! GwݯI/攷72C&3w sa67P+UvNFzݿ0!qWкD$0Bΐ#yc ij`w~@ۥ:)gY.Lw*ۋT)Q=l( i_th?N\nki`J4[;7LQ6oP"Nj Z0Q4 z݅ˉAR qc\| blNjWӶ*#C3jVE7ƙ샷Շ^ozi@dMsNl;WOX-Jo0ܪ=]2JT!*q8WHE\S<%q9O(Lya1| dO4_LR)L`x+p:4/.橨}8kic'_P(dmmr^ vx=![“ H&|\MX} Rr2ؚ/&"YÄbmwfc1 ) ws+xr[%cۮ'ݞFq73+ ۄ#~pR:!n5Q{nATDj^ څxlY_'*^T%waZҰirdF'ڒ>[wԫ,07 {ioTb薀9W,̣)\U'跬;BCY{YӥM+0HզQ|'UdyU`́cÅu֠EZ'۳X& vA43`pHW1L-@|rY9ڷ+⑃\cG[>g8VFK Ȅ?yS72xH{UPߚYI#k;à @ɵ̘yVսz׃f6o|tRU<_15Cq$hB!qEx') 1H"t}^kkFԥ=%*d!DCEn$&8`Tq]図L{07Gb+=_vKDk۴;/Acc~P hKX(P8sV}&S&Z2߱VThѧI9|^Toj(wV#DъThJNzvFg9rs&$F3CsW1 tMŸS2RWz n ]B] 1ݭxVJQ\I_Xj`B~3xgO|Ǧ O dz ejO,$?qцߧҠNBd ;t!d<&'ejRKVrq<zK⥇t%j㘶Rr5%@B9㸵n|n < PjXx$_;eU~On+ϱ]4 nǂ"И[T.?8i!㶰') Fw5 ~h4aoıK{ׁN71jy#Ε!V.%_(i;C`ZBeKyIJ!?*A Z*6Ni?!W;v#ksI$] RFS2ۮd?6Uv&bIvObV\ (̖t93l"%_ٗu _};9D.Wͼ4|+YOG/[`I# { $imLz6E9.[QJVC.<(Cjld*&$ 5\R~Qg`x {4tZL,׽bi s%.Z +?a%o[5[0%6NJm?TPc1׾@#Qv*АƎY,?/f/=խ0ùkv4KlP3àHm_/&6m b!? 
J6)dZMF61LpW_ÍKT>3M kڗ"9e.ZgIn1 V$9-bw)diJZז0qY~IΑgFoI4K^F#9+QDu93s &2ZZR&sd6~*ԉ~s:8%my옚e|drz.NqEG~xX[- V?npC e=vhv9E '-l 2,UJ!!4Lf;[vˊ1 l'`ꜷ<5s cFC乵_Y&:QrE<'FądcѻDؚ.? a6!$jaQm4$#HYuFf)5O^݁Q7m,1; 6+5G̶߄-tƎHU9=+J?%<أk'0q{!)7x?_#h+rKt\ R07 9 hg,c"_P{M lLPA@sd~E\Lq|E0D>.>.>p:fmwgeTE?)Kh($V(6߻ s.8u%ZEl=܃6 Η?lwúFF’3K?Y *?&n9?i5yJI.^!\3jtef-_5&W<МJ$yd:MGEno}E8gjCZw\tbD R{(X vsA.m?OEM\D|F}#ȾV1 *afKXZ<ͭpS<_# MG@`gDnVFkj9C (9`nT)U)=sT]Fi#WKM^8o;=c(:h?jHd9u4$Sg<O1xž7 6)g6&;4C0]m)X˖= '/(eQT 7ߓ&AڸHO)–A!y$}Pn0udJ-r9X<ijH>(rm @˙xRgUAxG95zHMo@+ k\6Q+KUy 2A:̾,8ik2%C]߈l П0PXщ@hݤJ]L_*"[>Q=aJ=BSZGo;}Ԝ&#o~PÏ2:-Ab0b yivs' ygx#_hP'`ORxX.̰oE>8I6{uEP$oh4ק_!^P U_ C})]-šBEKrENm0xOKjLԃLaKie/'*_ h m@U ڰ=_X}\`IOkPbR79Z ;KPٔҕGS7~On(oN#u0c&&|:D,atȥt"}W{ E io A^渽xl3ϑa<&?:* 7O1p|G}ޣJ(6'G'NO.Ru8hvb^Y!iѵL\V%p_ }gJ " Q?Ґ|zY[*.0KPĂbwTJtqf4Lg.Q';r@MF7mpFe߼_k7&Jˈ%!&`=' qE8,M\]عj)8s`/eCa.ia hf"]Q]#c/ +ň{Ub|leȄˋ\`W>$`bLZ's]W = ?B˗Xv4ī"Q8G) DeSz?P1|&gu/~.Q^S c]M; OLǛlK85 Guةe$Yz˝Mw19:;֖{ ƭsTCI Ae2+j?RyڸU&ł$Y?ZoMx_i+^=smkVj,Qr9y.D< 5AR_dȐ >b?#S}a]M7[݆ofp m9G%Y)31oޔwxYŸ &<q,LJfjE~'A9wdV{xc{Rr6$d3]@6A3au9_,O.0iOuNj_^|ǜAŽ. 7GRsqF-Du7&E%RNd8:"n%Mj ݖ o0](ۊYDj"qAEc ֡͜je[K~ #_#g "+\Ymg>G$˒VwGkDeͮVG,e7|o+ϞL{g,ЪZ F2>&2|q",6>FNΛ.* ^ڨzi__x GH(- &KDSؔ7^m,두E":}vNuSw [9cTfCW?H2S,694$(pIG O endstream endobj 247 0 obj << /Length1 2822 /Length2 33231 /Length3 0 /Length 34817 /Filter /FlateDecode >> stream xڴeX> -RHwwww PC % ]JHwtww ~{?\ֽCE$f` vCؘYJ@0+: `gfeDp!`I XT \h++ @9CSOthU)- :hw &ߕ~g3f.  Ϭ Pvp`) hgphtZRu-U :fha WGG"%S֔2ZjPeMwht%)M1M=U)6k@.ֿ7j(3jT gh G~wwwfKW%?4]ζ30`s+п $/=TJh715p\EUUE= `3h qu  ;@Wfh t]]?mvv"`amY))IKih2)B̤U "? `\|P Ξ,wm`5殎,Z`k'WCM(l r<̬X~7g^~~Bz;:8,v. _k @]A;̭ Qˁ-|2CJ=`;O9E NvSڃh fKl/\bf/ie/A#e] _>XقA..BcXtԴ%'6s0[ع@gg' +tعl6y3,fpt,Q~o(7E_"X$ ^`7aHAl?"qX NA(AP.Jo AAAkA:>#6N(5SB#&#.htLbo&+tIA?L?{ nUdҲS7tӆJOIX*VV _P_ TWۿ T( _aS PI&;?nh1G &YQ!6l8 6Nć gLZY lP\, Jdo;\".HBqXUO[. 
VΠ *h ?S ۃJB/%אAz]=]Z /lT~a?/N` -HWla ؠvg \e;xx3qB; U:9*֕}96LL(A)N](o.ҥ Ƴ{ZiH\MtG.%6ͬ>]i.n_>;O3Dzcl#@k@6}"°q)޽pcxnS$Sl~]^\w\c(!Pc=+[VQ8QovϹ>UІʒh9wyn S̉(X|gG2̭l|UW\\G!旡eEE:|?dԻdd &IGY*53yڦ~+-d\zf~^FȽzA^_W1v,Fe:CO8 -z5^WtQ0Y^{vbzL.C>r0RRBaQh,8#!U0?jlB}Km.=Sp!b3Qv9Ƿ J㷏6~, -8iF57(JOGpyv#0m3g7Pg&ڣd4w9*?_kL>oaq'"F# jK'w^b+@ŏWHk2nEo`DpE9j/oœO iSS6Gҩ؊~82'3+|w Oя4vJ:˯YΫ&]"B#N%%O6N(aiHh<'%>fS뼀/ D`]V҉UZA݇JlF5dz'87"n3G׍׳<1#õ5QLX o^,J%s |J+[}6GJo(`HeYuٸ_0텄iQ &?⣘ag@CypoEJ\[C(Mtd6 ;^1.(ᯣpeXzM˜E6 z Xu`$6,5jwFfLKnd:ӑ8}2?p%؁YaF^D7T7͌f}e#{DN!*LXà^#3F=EDd[=jh{@n ӫCoLT!1wAi r@ӝ-l!LSb!\p$&glרr4Y.psg#MXZs."~>G\²ժǜSTu:qXX23~o{JzuLQ`I xS*َ"0D{;ݓ==U_glK:[k{d7{eGSF".{+,ӫ~vh4H ~?lO.Z{'x_^MLݦvjnc 2>sJLjqOk*^R!5w2V_2y5ҏR/}92i0F8qq2bF#e<*>?C8EhP'EY:r+KثeBJ<Lh^p5).Kǣqӛ;'W *;W`kq,-V!dI6v74pօH4 {7jqjvf z#s&DKؙp#u\A'贖bfߥl i\.z@G$)drgk4Salڸ]x疀C'B ZR4g$[}ьϗ]n):w گn`iO\yR \o+ׄrc(-0A+m0j' oq 5G(ݎvQiG_OmmN|Yh3TrE_ۻT޲%E-N`pIvߓm.L(_FIœTA` m̋Ưoӭj?2*F;Ye"Qȇ$cAèxD[(}CrI#|=MZxN/b>IaRM#42 .[ðAiwŪdik7& 2,3Oh҆r-eͼ'}w7h7.-gbhڼBDegtGLiې]^~" ZkL&mjO)` Nunaw\ uEhS2y)xQa4֛v>TP8I^2z_74T%jr҂az%Tf#oړN8ab޸9P{n)4ф:1nX T_SK{-ŠUk2;7;={"=*3e_S4ɗ+aIWlRjR HOѷSF51]5G2kEe_vuIENWD篪pkfJe5tjY4{,bǞp//iO$_?Z']E5e -Q[guGzuw&rwn鞭Ƶ /ۼj&%̆Ke`$766]b;=/u4R6$Ѥ7O5vkz'gG,ZVSүC0FsEAv$pC~|+"g^aCam ב9SN)A.l_tN2=5H5xf縗xw>( >bR{{IZ !n('bpidr\ZU=7gk\ Knl+.a$kyDI11hpwRצ艪́Λs#c(MDXD%byo\ M'3WM&T6)^oql brt:R;g ⚟ $v,Pd:+ p~Aci}Cx`K)vvz ?z,{[\o;-q=zπ8_ SL98shqڏJ@oYa&; Eϱ6}oZA%8O.d?N \X)Z4Ot3iISʪ"즅5N5%(fDUm\EB@b/[8⒮4Yl~ñk6f&fiO1,p$`cSfB„#%E[?@N7¼"ë>ƕYu,^iyݣ5i.Mem^x.ґ]H[tޞ K4@Ff[(3.ck_nƿ%!)TK?2o]말ۆ,Gu8+21i QQv[m+Gڍ1p,x<Ԧ7SHS#_#9h.;s{̆U,!]Q1 :m@ڑ؏}.!dѓxDhӣ~8MF6>6)ˎɼOB,ac5Mq7/Wҗa3hإ= E횿B^|`mg^l"Վ.*T5Q(.`YΠW]#6dIxCp̲qKvF(5P-ͦ_>-uorzzSrec*L>6IB#!E>+~ٳvjc]8/%l``~97aMRzғ=mw(+jCf /iܚȘsC]=SA<;ˠ: ZQPsݓV S?Z1ܭ/H H/̈'']{dꟑz Eu([:;,Ѥ%Vn/{h!*=_JCaX{#@2eF>fp/W,-ih_2PZKwuO r!])z<I]V)qALد =vs&{F`ͼ"W&9=ykAnc^w,c$v {@9f~XB\ٲ ?kӛg0iDxH%(-7ѳ1D]QM&;>O_O(uM 4EMО2>7Tv: 9Ӗ1~ro)x8NJb2l NG Q ZYT~ @w_ ~9ѱ%AnQgul& p8ѝRRXx]%֖4ȶ=6 NH&MLSRb\۠BKSP <f2k^ ,Ɇ>\]}:R:!٣p#2;1Oy,S|+i!G[ 
c\eiMʢk`܌Aw^,b=f\WgIӗ M/c\ll}Y2R*_{4c% SK'L,Oӽ_r._R^cԀTu+;eH77CvqTóDޫI>LmG|=U Z-3w°ensC^%mCl*2QUx% rpv}gNZ]neɏ!wqZ=[‚<@,F*ܔ#?|'q9ԡXSIԒ>혷3 /b7:DO %`h\&%nI N}/&`KdZijT4;Dn]i[nѬe"ub† Pж4L|\B59v ~c3?譍 ډch,(͐%2C2=+|>]'ר0M*te`4zZ9 "-q-j|(P9pH؈)>MگH+o?"OTbh:KxIL\a|&K!YmʤQ"5C_=koEoׄ5oI=8_ehxsގJEvt#]0k&ճ}l-Un!]kT%aWEwI.ҝΒ$)d rQ+$53Y8#T(AXS\F_$ΪKV Qz{ |F.92ip_:*]vs58όV辬h3xB=;.~kI X sD\C*ggsS?@*x}f / =>֕қaGe:ֿ0[ *YܹGܠkW_ݤ>bJIj'ok=h2Sw v6Uw|hI:bW-=LM}jo - -+K,*6æ@F -T09 ASJ\H/QpMQܟg MPyr?k\7Djte(AC3b J[7 X v7p~~]gqc<*\1"[R2a"J/B w^}XֆS@Ȋ }ڇBЭGC׫+gFT/!(nu5h/ T:U80;wPr\1`,:'t|R &]6%ךƞT)[Gmq) !󦚉cӛ:iqUX\鏧sh桗Tj;+Xl\5)K(u6*>(b+,NbsdxT=/'lMٔ$7F'Y3t}KȬ&5PWaB1F@Ĺ)< K&3J:[\P+'JĘ nc?/ʣ&`!yy͂p-NԼ@4~ޓ| 3; Ïi\ɁI_mŏ#^T5^/M\3}ZC8כKLt YiS-ӍIx-X1l($4EC#"3n!A4,}5zZXo;@ܘxGa/*Գmct6}[L\?U-pc %ep4ż,;R!۟JtiJB0FxwjEO:ćD O$8[&4u:%ڒ+sk]fCUX1w|~ϭrVrzO, k$(=5yF7}0 %/Y01x=뤗T1*(ެ["3swgf9hJ EeyHmݺ*+3PMʘk.ؖ8U[C М<춙 9+q5['޵q?yAk7Ĩ~Dh/eF+:VeFZl8[0 (X҈g"WSOv@tͩESkӎ}.ww"Qg4kU>$*Oە}{p2UX`m]>5dznG9leBxLҭP9E ?䝼9x0,Xr]O'@~MjA_m]_q]؋i&6Mw=_B2! Ʉammr>?2 [gƔjc,ÊF6:R+H8$CN&an V=l"a9[t oOޒs8 E~YE\jiU'>Vx)O&ͭ>i ě+ȯ8m673Cb||Y=mlUgٚ|-m*<+t%TLmr_E+RV3L1sEͽht 4Qzݑ|Sɬoك­`;i"f A ! &|܃|)IohQN3EglzqUyREG;ll r~~Xh$u3ɅeEkI!,XqW?&-:7Fz̟q)SPYθ8>\Gfhofzh*.Fޕ> j A,"D^'+u j>djtȘk@"g7oF`HҏdiʾKf{ح}1G]碚O&HK>TMM(֩#Gd˖^4ҁ \ܷtT/ y N9?/龵'YP jX|е'?׉eP!n;^ق<`qu`<ŧ/|GYBU(]! :!Ao=V60Pn#5ڨ_t@W?ɮx[`82 g Ж <,ezۿ Ï J?NYJB#S.~qv^,q)0uUxzYS[Yυ7h;ȱ(8_ydZrMŐ % i:$6 Z]b \ {OՙW&j>#@! gpF >_õ\#̒_ahL^xKzT[%S[&c`(*@9ObwWFzWU+D%/La ,86U:X tpIYhּ\c|,o$g$ɟ#8b^{zEvő lg,Oa _d8,ޯq$ #(cv,8ܪd9u;wYGx-ym ^pgEۭ>9!jf }c yvl~u-3#yCm(h+ jMM"H7MF!V!T0&mok C҄'R:\v5?mkxGt*Zf >Tb'OHn_߆֞/s?ҍ]fM8!S?!d:Yb|# ujׯSH|CVɁ1ZE;NLAQb\KD0vrZװ8݁qtr>]W'6 {s:=Rxİt4guC Bb<ߝn8رcѹ׻m|اnA;DcCj?NdkHE" $2͆G;džKEQH#B"{l~q Jҋ-m HJެxI\~Ğ&_JAiUd^ߨzV66.TiH&0+@#0\RFGR3[A2n"rB'kd0Uvv*ߩn)Qbƴd3f2\I٠FkE6ޡGP>;Z/:e}4w*aOEa;-j\zH6en%pDM&I*`QY]Jz sNYYD.$vʼT-tsty8Y6~G58VaXif. ! Dת]qw+.2ȷLVBevc}r? 
KWZX(cʔlC#L019aPQ^R) s9](ZAӃ {sR|9 6\T.Ϫ/phGtagj ~r1u N\ 1'gD?*aU"_ẟNq6pFΟ<dL-^[fѵU)1=AWRuu Ik#%W2[Et̏o`Bx2h1=+BtJT^W-s'|b)@.f>.L5?j}h[R&Qӛ< U;s!$+t6#L>y)n(Hů[6ΡXy 5Útf"z p" #.7OJ}6:2} Wؕt=`˓Tq`jCFDA>e@yXZl)yq "ת%_ԻS'6+/oE@5׻UvKNtnG\k[H̊*WFWҡ3=97c#PZpZJ=/1;iEMK[Goͬ/YzoNNJw B y\r 'RZɢd)6M$ffpG`O,uF}PgD{HV$d;%W䳡p_&\O|9ڢ8~_% ̈́+FYO48|G >5T+ArG((7{RQHFސ_f؎^[x+j>$䊆;ESr L+x~XQ[eKz+$*aMw^Evl[X_$Q婩P7>z0_<;t#K+Z_"mu"wKs0eg׀W:Ü? BHLvb1MWDO9u|ɪ_9h,|9u!= )O?Mf;sEbё(C 7~߲o8/~0)z]}*LZ)p-)W9@u}9Yx_]&O7϶m#6 ){F2kQF@xX,Nw327< v xLG"~x9Lu3wzS\§<35!V:Z+>dӓ2^baַt%uF` IйF΅kJB?gDmp%]gÅh~ƥҒ¤)*@4DN,0s7+ﱟ3#f4ij띤_*:RY.k(W5_/:j| ?)GTg6:#c%闒i -teiqz.A;!ƹ^Sӏe`GĜ9/Zw<3VJˡ!mxf-sϭO6٨~k/5 #kQwB-$936EyvJ!pA| 4v!_C 9'}rSM, T\Ӗkr&yS"5r@U/}P[ysQIbnHߌb`^9L}KY+qD8#S|J}CtB׬#5NW(uT} ˦UGrcoM<g=' 3ıNjoẍ́ś/*+ m۶m۶m۶m۶mٶ;/p+m({F^hrx mxްtshc/ vrykvˍ2vx)7~Sܓ {؂J +6zg<|pIǿ^go6Fq0~[p!cRWWѓ =R{M?F;9Tԁ:p|(S'M:oZ}9?0A|Z+?#EoTdiҧdA'u+fğ/R ?DN.xu{_6oh"P㖢ST230O0({T iw@bBƾOD(O jN {י)%`8 &q ?:G2ǃ%aNRL'϶H85,Y&X:g+5fR1BPm†rXE5˨YR1xJ]TNKW|lnY9lG6wJxqX̡[5eP3*{CK梾<(]j5B6#_ $#7q۔YR*ېx8<-`%Z-ekpy-Eز)%fpbJVG6dT"0{+{ }|-G V_U]n8 }9 ŅOn!DK+3w r.qC/#fU.Yg;rGi֩uX׾rGkl80P0DdA/~R "Ǫ& W94o$xAę1A`ͥ[!B@[mZf' A.a`VtdGqHׯ3٘lZh=p?l6,5S c@!~0[$6βexx:G V2=xZTp%F<E),ғ,]a7R*0 g"6i;d(\4EŲzEEmkk!)fI($ [w' T+H*\neM07]!0'6z|llIb1$?",F%1(֖@N<;j.P3 R=KH!BFUs cy``p8NC31@* ޻dn]+3GT79U팲kduj ]ͺqX >T9Qw6qUu h{^RJebBD#FZDT= iNr+?QC&SI.`е'FZIQTCӤL㈛$^iu>E~ޯ?)D"'p n4*0 u0t;71oa7k_?O/(+Lb1QiWU9~ڍC$85u̲̕J^$B7yB$8$0rfK[=Y's.:e7x&㯘b+%mvV1ӄQ"E]4L1 .$P֞[fe0KbsHAO6NJRB5+n1*OOEkqVY!axsaf1PwtWҹP1k;k7Mp.F$(ɧFStCd~Px=gz9|FQU,:uvj! 
jST`+XG'NW&﬏)d/`܉'( Q rEjZl2OK-n A>m~nbAAD}> z`8ZVsP"K䧒@3VN1wҘϿZBQԪGgJmFtR̄ boۮHLOe@yjJ-4ǐFdd.t$6HAVXЫ ڪ3JImu=Xg]eu*ƪ)~jY: WDFoe"1M<O=b~xݩWawnfHBPղzu p?~`?%z*1(eAqz:A'3R' HG¿,vCp;@y]&3:9D{nzuYwsNx |5б$(; ^4ݰ^qVМcƦd##Z\2{a'c ĈI= mɊ!žxp+ T"7#E&M/QH9\%g"%eq 8۳,UnHޮf82 mSZ*:Gݳ Ԣm8aXׁ8`߶;BX" lS/D,gUA_yؙh]f)L#&r"+*Bύ ptK-lfxBjAۙ~@ȧ}v0^q&Aeˎ}i/bAc&J:?پW(Cs`$I3E+݁1Wm* ЬBq?1tmc۹sW*s3+hֿWeg.U׻JRPuԍ՜NhI( Rf;@?=pS:η_hQ[%yee+0}!["zI W (@FDҸB6}w1Aݡy89 eCI^8w㦥]4e:r!AM`pu]NPJRd0I< "Xϑnl7EΉ&QzTAk>yx,ۭ}F[vgӚFr|>h ߪ'lZ!;ސfZu<^-eLK˿͊Ba%MUվ 5ľܺdןgaW@%[.dv^%.:z0DW~ pZ>"l?{s[*EcTvCySá34AuBfl _MN1Mi``7h|)do4҃/ ]7V)b-c=`d4ZH7!6:@(lTOp^@ޘHO}melq̍Ex8ژAd'l*ݲ+M|.51H ?OT3 =TbWIR޸EӤIh^P!)`Wg:.".bQF$+o4rJ&nWjx94{YZp1}[psSd 7dO:9C!_̈$DɻSd-̘Z}Q+QPۘ ;]tDq7KؒL#XSжIc=taoCud C4<@szu0 \2yZLDQI@ E9+'/7Zс6GlmJZ<`ln _P ZGq‰H>~}/b<71=^;`u$J#.Ga5/24!Bj+UJ|ř2-: rf!8 (h4q*L-wD. l]ʁ{*v7To5|-4&, ?ⱨT0c:Jڼ{=GCϭf > xބJ_uT4``MmYlGGUW3~E>C"`ن0ZÓcʻ2*]>|bW!kgɸ(vZ 5 S Fɷ q#9M1hs2YdgD>;1Ajڠ z2օlv$/D7 ؅\`H- ty)# k5UULgC0!?ɐ9%5Ɖ'MaT!2>rdA<9.R; V֎G]ѳ;P[( *H 3tYچq{T ?҄8|(-Qr0/[ , [4RSGZTgD8Ӽ EN 8uƅ `S{l)ٝ*Mj* سSBuir;M,$s!7Ѧ;#H9SeokM@?y3TQs&+P3|Ho)"‘~$yVUV$je6[0m+WjB/1//F=XLS2nkPuT~9>d'N$ 1-`OOR7˟)I;,?>亂M^hفN1s ؿۺ5;6x2X3$Ys5ȉnj_E֝+M$5O'z/f9itlc9^ouG󁎐?E=Vzzg>"|C݇S_85}Aj,jiyQleAxI^У;\ ڰ[F o7ϒCkl# /Za5OŌ!hWҟK4x{n|>*+o!tbiDpwB._ᓝШjzVzمa3Nb]LTx4b֙&OC ߰,:åmUm >5w 잹Q*ba/\jG~nz`fzHi1@h##I> aLBS}7*8a^/w?~I<;v²gj7-C/tYvuNg_oA>]L m 8SǔB(&y7||C?V:Ln\$(1;:,f>Om%A)VUd~!6AI& yPM<+rLN^XMWOD$0 -ؑl %N[GKz CM7m\=Va}9ܒÓ{i}2tjFˢǎ|D\y(~f k0+>RRaלY {ALr,#I"!7S}j z}% 85rM}{7k =9]A;3 5@/:r(ə(4DWT8DSx%Td+013qD7g͏|[a9(ArIDY{91&ONI4ScWMm) $=yEՐV:w ;m|*5 %a30wef;A&{iDcT^bo*qk|]sd][ZwWb0"頃RdGWJ+g`WVl%)o}r*><#j̤[% ,IuBQFECՐ8 J&P>z%GչlPY2|+N]a.*.%@A=J魛?2ww WIO\@?$*5rA0.;E>T7L瀚1 LP!D{ٖ'ټtt#6GHyY-2aςa1C2\+ćѡi<+؏CbsoX݂h?"'z#ttW@d`ЧЅ8W}%Ps)yUj18ƨmf|^V)%8r[y#'co|'k);Lv,(+KP ,iג]{WO D͇\me,POBrӴYﻴ|3'y{=0˾ EI2&t4 ( gҹ8D荵 .y>kj %/NJU1)d+JN-SCk`I@ŚL+ tfH8Gf,nLmll&YX̿ȸ?2ERa`L͖tQ"P~"fFBpK5'C5.5"+T\M΋Ɣ `ٿQŷQ;ZNifNL\mPÈ蚂stj֛+o^Kk`RU0~7͢|~]cJdr!/BABKO5 (k_f=Cnwe`H8gE3#? 
9Om=b }6!rƼ{;ebrt{GHL(+e>UކǦ/)>TIqb\aVڤk~fD#݆zW2*]C?lV:٩蟱mBAFLm;qK΋d^GD~97s[2`\}<'(db?bq 5QygHƢ ^UHbh:og pH D$wv ^5NAn<+x!ϷCX e_Da-$%8}$j4Lܡ;Y+w c}24lGʈj|$" mz,\Dv, Tl1TG3K_*cDStaQRr"ҞC,,gyDѹJ H$/TkwR+ y^Yb)Fkpʓ)f @ |,R( VPAEٸ}&1@R;zqY/'ޔߘ FDf!(;UG4޺r ?qRu'(D#LՆ,ig͚ Kӧ/] ~$Jҽ6)8JdsZIbZ,.pR[׫qb$7nA?*H'f䇵%b݅ya@ Cef d$xьk+UvIj+[ܚ7.i cF:-aLjs:> aY%4s/0$CuȓL]X#i)j:׫bD"8$F[˿u' tzhQHRS{Wn,Lֲ嶲>Ԯ؁}W|]~aAvfm^6qJN7?J)c|o#K"J1+9P/Iv*<&}<\twkR-\QxbLMnE-Qێvp &4z9z|/>eGԱAl+ 5o_QՏeq3bbj jrh,|q&}"RY`9P}XyNǁ["ۓi{65ӰYsUֶݒħJ];6kh qOӽoZTGc8FIcFmMmMQ=̫rjQ#t8/ōu 8yl.]O6ha ͛Mݔ5NXrD:33Kt,|6ZzQ"պo;N}*SIGzYrG?Hŗ=1C,3~VnYwY}9 Rp|܆S `eRcn@0R0*kP3 |x?HD "5uP֙j<$=&a(}czzj('Z2 ΗlOl~ ăU,͋qOr^qWj`lN@U;^ 1K80+R,Aϱr7׺A@ HSD$2V xbvj^Hhԓc:[=r]2 tKk:=QZtI+"wKi ,g:6׊90<_8S8'-az_ f%g]-&àcuMl qcH/'g LK$,t*yoD}:Z7Y$:&/!`ǜJ感ls526W;ɿ%|vғ[~r\C!Xhެc֖.I^d2itːRP2Nnfeyv'?&@r+:LW^qk{OXO"#a,ksq٧6H@\dy1WCh //I48w#?uPsV( T |+?/K:]F)UbA̐nx[%Iz{߄,Za`WTHE3hQqIxH9sY%ǝ(]^WRF>@! 6xsz>Wjn+U}P#N̐ȃ uM}إSGO`.Pؖc#BSNo'<~_ն9 ˎ8ޚDTY ,fn]ƒuĚT ^;n\~KK#MHL|I7b%KמjuvlM tfse''?ܨc$jبj9=ljX}Ě;I8Uʢ3el2{_% ])PJ8ۀ: L1L4`;RӜ8wbO!3r18)rEŚ4ǨRU̲f!;t^\DC q\)2^?$@ VMs:zM4ZwOOO]$>e 0cSç,鮨 CD_&)!̫>$(fj2*UH@JțN4u0Q-Yua9V%q*c >x|e<9GN*㑞D:¸U_[>&R<4Yq*5Z_ʃsDd!5J1 3fsbLOQ'W 1ّuz8QrpLW,EytOqK8>G<СzQձ=.5`YEJ@ ZO@ph,VP%or0oee?Cnv_kl/Ч U}AiحaP a%)2ĝ>r-so,\[SPTхьgq.Yv=ݚKIHi Ӵ@BKS&6+"ì]BCo=n*DJƑQ1c{ X\.Z"f_oFSYQ~nyjJrH&xYk7liԳ&jGqxPo/6PbVV.~zd6(D^, 1_TR`Q[[(:FfNA:1*Y3cѦ>Pj#/Wbc:!˭OҗK t2xJZoLɻֻT2<: -!,KhٻǟjFmF~lXPTү5$Tf>x`#ƢD"5i %,"o7M}?!E &PE@@ǭVR)zD}1INb[(#f9G)y<滻H/(C%Z]q=Eź+;PbٖI{$RK+ccbW; ) xKߺ9zr5.mM>Om2 0f,c)46.}G*\W{G]Ѥ>Eh,ȸ;ԧٌ,dy07)зqFhX^Zd$U䜐:4z6: `KW%0%Y MZEԦN NVdNWoݓ;y5G(n0v0/Cm0'I9{H/Ay)Z$.5F($zXPyx7LOi 1D^\dB;Azl fP$- #4kĦV ȡB& "fbNiB c$LuSIA._ :Nrd` jHW5"]CoAIPc_xdC&% lW:kd9FyU+ < 2 g^m&W̓ʾ ϟ1٠M5Op ͈ wH+s]bz֛TXR`,/4Ip-'E'/FFjGUzV}I}`%|`J\(h.0ÄX˒wNRv!e3h.Q9֪!J.kIƹH(FcNxVCUv nbl@i22l%]~ŋTOKxH2Ѫ_;P1c=obڌ7~L$=(ɾM:i?&nNNB/Ae)M|9R:J{͔3}PxsI8\aળ'F1ݕd$xmkofIX9krÿTVE<J/imͤZϯVu[ݯj.&XQ>n/J˜:"$ )Ca1y%߯TzRD;/O[jy WVIuaF 
\(\x5xȔ*ycZaH~%ٽ*ih=TUBfcԇ9kgt#|y|^^R:L%o7 ]emQ .;8UO&@fG yO]{?[7Z#¼RgsUg[6+a#L{98iCs=j[e*ISΓ*L.~6,]r*ψΈ{xohʏgP~W!Z&q%/sN#PUƶn/JǣN 5\U+0IYW.Q` ;@.ZWBZ> )zwUUL?PGzCsPS\U";;7-y}fʻKη I gh^yYdJ6c3ЀżvR(IMjc$V%M'v,Qnlg0 䐹!Pb$:ADqB&6:g!V"Bcc`!d)*Vhf1_s NU{] 3k "ͦ+.(WLZWRkM8ILTf!z#{' Y [lpq5>DU-&]?i"YNJsZ6)y+B&X:>I8ʣ5&yc.Ko"#]VTeMhɱ}be_bkm˺IP!H t o~2h=K=<`?T,Rsah.=|%wݭu,fAڝ)m CfLt.A_p]QTB[$2GTYbOeӾ4PT"@7=fIB:[̳sCQg@K`cL@^>cCBO(".ay--`ifJh$-x9,TV#x7Β'RQCdHh3 X6\2ă3bZCbN}O/:|h̿vq g'3ۥWeOGUtf\.f8sQ2b oh1 ۆ(Cg9J9s6>n"<ٰa坐֓DX7ոz~W{žR}P%#XIRh"Сt50&HfKuwc);5QO\UeT&u 35:iHج="3or")!tt{%/iS(= endstream endobj 249 0 obj << /Length1 2150 /Length2 24510 /Length3 0 /Length 25740 /Filter /FlateDecode >> stream xڴctm۶m۶ &6'L26>s>g>++^}JH iML\h2ʆvΌ B6&&:RRa'SCK{;CS.@ C 73u4ydM] U<L 簩)始˟̴"H[ۻ;[[ LRtt9{O%`djahc7jTEJ ʔt]O8 @DPNE`FWUVSS9 @NsOωeEUU4D<fl'h#T[ڧ_ ..\t.tNt6St;Y>?LmL*g9],Lϊd,MM8kNv,˟6p654((l -\L ?': }~K)@O9'Ϳ >?jߏmlol3K?_6YA9I1QeZƳ_p09Y M*jg"lokOD,?bIMmmgn?F3K;?57quWtt5&m.#؂O䏙3CgS_K3ogC7S?`&.-M`.igfSɿo)ڢ`bjC/g %jc#ghkJ_I65fJt05Qt1WMet1lxA;sˤg|6c2g[ۙ:;82,zQuI)ۛXڙXNN0 fbS:@Oggppu;YE66ӿ@/7qEC z@/71%Ϙ2gLg @7}FQX*gLS3 712|&4~6~ 0~*0~J~?f|,J2mlFOe)Y g$ '<~w~ s~ s>=nG?g_ /XT).N '߿Wҿx {xӲ1h>WS13=i1꒽1wUZsXhL8)'iT"jL'.H6)q`k@Y_J]ifӷ&;~~yjtAY+DGRye,sY_œ=OLSשD:? K[МlPc W 0ƥ z{PwcJI涱q/r:b&-9ʨ%>KH{܇`n`!2=ƞiCpg%jOZeH1/(RކJ/waWj!;[;AU+Q աiHd;7jg# ; -lOo>@zVN+VBW[pUyA9Y>?s:ٝ=2sLh"{%A\wzO1ĀLʂC-ۏbxpܜFϫZDkZ#rx _]V+#k0ݜܵ*[9!6Z`uE()=.8eXz"$]0*Cc/ԎI~F ]G5C =+ ӥs!ΟyQq;cQ ]J iE' dXGXfu9 J@2k Su[ ; BiFPx(|W]M7\x;q݇SC'geo6Z죳K\sQD>ѣWg3kv/NYZex5N\+ȁͩUVʨA*O f0w\=Nx7A苶GĕTk.d=ekxf#CRyq2{@6O!:$wj&l̄a?[\8f\) a!=r(6_tgU~ҖI,@OGY ; }B%쮨2ap#QCF6QCLfE#N&T_~entlKOOp_t64*w,qrjY#<^GUp6Ʋ#mQ3t;e~m)s:n:y0Z/VYwn O-&bc+Hc3^ #mg(-˦=#R 43:ݥ9IOA$7S:è"Tϻmom&*ڲŷ8zlr^zJ\Ɉ+@O[E@e-#_5m$S 4GGra-|.=m-n-Uj^*@EEY?uNZIb@ \]1SㅛY{oLMJ{f]#z;UK?U&\ه.xӼl匂fHZ5v:ɞy)0/䷮A¾^ЯN_4N0=xGi[#0}r郅%HPOX[ֱiJ&;UԧںcWI?"xr%5t-Sq%tTP%߸kIb֓iE3o~4LD*#7rȟPאs m? 
חR8S?lmaVqԬzqr3/ {Z}ʳE+?.usRXpH~ q F4det7~0-`/2>Ϧ_VvXñD'HveׇV[=-BnPzq!C'4i~H;Ix)[/PR8/N$MUZ`o R}hPǶ.>i)[hH]4^A#'6 xGɊa~ݹAΞ+ֆT'coG ۺl$`t\™dt?VNЃY;Ww#5F>(j]~;!wC<24j?Wϳ)>%%!p͒k_!GZ(48FD't#<^]7QVlPqISp1Z+;ҮܐdNc ;ЖTqw@cA6y970Rh`QT8Vg" g{[A] `>DW*ޫμ:R}Dܙ@'{9ξ:՜%˸xO-S0uPZ(jM Z  LdPԨETuWuc\X=X 6tz,Q"p _k)Lp ֳ"! )s4{SDsQ <ouCl 2vKLJ\ yz76m'-6 O;cv>ZoWdF`@R+SL}[.Ǟ7=eK\bQq*m$Y@(x/IJ3G\ >e]RHB iD<砀djBbtoreڄ.; 8y8nƸmxzVMmh[ϦwMb`}rm{`5wU;UyK2IB-ZQxҠr??ڎ\S:Mx *)1:•rOaP#fl+%]"O]s]AuXb_Ok/|ox}ݕtZ7j.\#%-w8&),`1솫tQe1ל4qi#lak=fb^AW[r+ij]#Ҽ M-TKƪ;ܜukdqͺj]5pL6B=lRuIGv};+%+a /p/ ]Giه(c3dj7z?).}ϳܗ .dFCuw[ۧj*Aq{KUYmr$m|+J,ǾxXg O:gΐ' Q0r Zʛ wEA b{]7Ac_3 d) x?} h4f4|.ngI j!Ur[vdj\z@TE[,w|G< '()Ia/ \@= 'IĘs&DGďgSJUK!Uv|5| ܕ"wב _m+Mi޻v,"}ڑkZjO!XY-VoA g%űXk%s؆ |ddS} ^ w| B!뙇YNqwc >q?C6{hi _4)z˱u{iT*L(i&DI23X5W*ofN:6ЋUɀ7=gE ˟W>8VCˬz^Y(AGe9Y7v@2:([P&aROor+BRq:yQurvx7/$V(s|\.>=̳Lpȝɜ<zlOjhM4U=@BK\T1%&?(:WHy=cpM y-Ew= =%rѷW!fp,\A$Jl;x鱶ݲ 4 3Dy.ʞK&Pwլ:Tez-T`2k0#V<|fKog9zBPXư_ʄjHV+TɸL>Fiw—y% 3y`|<5H\>̄ek>hE2J}d`R_  w.پ ?|KM|iT7bbxo.H*<0)o*0f8EV} 4vttя$2S-@Atuݱ;69hGdԧW$X.QW }+`x7jևpbJZO,n:"jIH'1B(S5a4%7j},f3MsXJ<ͬߤ^RxE|syEvvP; `mM\V\Iw\J>& po8'擮e +5r,"v' #HylЖ VqJr&rcz.0넗t/.* , L@؍* P$2yÝBQ3Q>^^NA$Qۉid0-W:Z_7/,ebG=CG |˻)Wrhc_>|(=Uv ~|`~ay43VJk<`,!OzϊdQc%1D+ɝzo>E(|3ckeDWɯ]׆3345DT_ E5,Ae}~aFҽw`t痡^d(GSfQ\ۺ!*\c4^ NhWFDZuQnIxd7x7É*6{ǀ{hx+a/-̸Q0ƘaG~v195 >97kJY K`\?@ =[:(WW]F [,P=]}>m[WN6w/WǙ1 Z'e ui'olb5(#3/g‘L]SuFzy_b{AaYL,xY>RH4N#P;mHًzzx&\dD|QIwA2]A 'hIhrsɺvSyW4OD'(u @* B8 \a,' `Oݗo@pw%9XV۴T NiTDZ̪:yN⪠GX?A&Ɉ0gwX^,߬f\9tLw)0)82 ab3 qK*Á/T0ť1_͏\!ȕ `ݧlz6GEgb~TMU[*>bb#=6bDi-Pj՞UE Yb* m[L%]gC$p!(LvMqȰ ۯvYq51_hp" AN47(ʼn3Fb;4h\?# ~- Һ Ӊo&V|G+_QC2Yc\`e@#b!:j i/4DqȒy dZxu+Kb6}79.gVn% #$?XS8n;3xph98+Tp9O5 6 S%8_]I>2Ű]c3sA٫ܓ77Hu̱o`'z>` *N&s&00-% H)rTZ֮6֩|Kˆ^QBJEΥ(ˣ{?ǜM:]MYn{32읉t󳍷nfNF [~B ,eVwČ*n3P#J$ 12c9OțVi8tfh3 툢m"uiTr Ģwr$*Ę ph=J W1zRnrV(1k-\M3we?wZC3™BujTFYAq4f'B?Ev |ݘI1q{-GYDK؁ܵM4ږ̟6QۙMt.ڶgf : `wyCTYEAHN EaVYۮT0VLqKytq dyC7G PmFm2ܟௐ8ɻ%Zį#wnY}%XD<2GMS8`pؚvbt^*=(HSQv|X8?TnOLѭ`ݾS5VmrrsdӐ#œ{zj煭)[au(: Z`Ĥz$kZ( 
cc%.GrrXc,c֘k4ADCwY u'HMxܶZX-& cHk0M%ᔬx')h:&ߦ- ȗCA)"n\k?p 6_F#/䑨B~Xpb^f#ȴׂ_eT]O]*zhE9Kߝy9QVI΍vLNG ZgZg^W$Ps k#_,8R-^hskIRQwa.9hEQh[vr3l`R  Hn ,Ǫ#;wEN2ΦiHEQu8w<|./ gAP?4jwݾ8T´S8Cc);LvP2d)po[QƔ0U^H1.`aW'&w\DwM55 h\Po!a]N^ O+jFFxh-F6*65_r9/YD_ţ_Q$"r4 P[8rC }(tFUư/@>J9jz«8,Z9J @Ohw$],vaJ4y|}qيpS&TԆiSmVo6s#;>>.|l CT ?el(({6sJA`cHR:㔬&jJ߇EzGg;n/1-??ں2 "+;`»J?bU,O6}ډ ѕ:6vJ[㈟wh{r){9L&M ]-)mC較 rY,2"3}٬q׳F-jq÷^M6`uHwoq.9|:m"qcA$ϑ2qe߿2|dԻ &2Xi|+nhɈХkcG:FbXe*oh&S#yayIvxhwZ^ׂ,K <)L'^G+!s4 "pt#oK$_Y+TZHjW{HlN0gKB K?bη*r͌ kw% H_652cV5ty\&lM_Imxr f(50S`gAp g"}^!QR!J(yM3v3[ ͍sWedG(io84I-YQKahXTD#;1Hvm04:0AKzag~h]Q/\Cbn- xiM~:eOT}6[9͆7 T{*rc/ JvFun^ZEXr{`X, w-$ӯ& A.\n"pFЛJ"jPjզüeZTD1*bA+o=uHNKإY00ɠ"F"=5J[ amNrT:GOObpA Yw? h Yۑ h-}\ 9хH.ugǹg[eah-*]CB-`ʢw[o^W2nN.F@nޒ 66t,{-S6 ضzxB3 Ud>l<{x7z)YXx]H [98\n(3HW8%uO7ٶ'/K͟4> I)_T840*03~F7AK9'70Ҙ&&* [?+ \yX#$b}*Qq؉g z(OBjES@ABtGpѱ!B3J +iLT>ԛ0 MT1F5SvtUg%>UfБO:6yM"ɵ܄mcwRP?KAQ&N/fEg7yt1$zo_47R|obcAsZ-HEV KЉq'2+BB͸iCt2s]a(vC7ĵ|0y(7X 7 vdy(b\B,W{.CA% 6/~Ө4F]O |_4ioWk:=dp`v8ejĕ.C]Z7J* 0/h(hV *լucLI& W{5Mdb4HNQL3iFFߋB"6t}C^}Ԣ^-b:pg*h3eEvD>Vr;CR [V,=rٵHLEu4h,vz*kS Z5&m{-Ƽog-zL|5L{I+׸l&9߳>G JJʌ9Y+$5YffEzF{uL2Q6$:6N?)4,"%4-gPX k[v_u׊S4hya#J͸u/U K9/3BͩD{ԝ5 o^pmnDZHշ/^q0ƫҡչE[)S 10+eZ.ͭڐd[Mؗ kP_h=3ZS qh/u5.TNՊu)X -ʭ7E$Ǹ~!a0݅l=P~if WSJjdP3Q^z Ջ]Q=bk]M~B3NyzԛVGC17YX zi5jjMnp(69YI`X^!8sqE_1IJ,q3[Q rx1OǍ"I25(:QA;Ge&NJ X%l}6>uY 7E O;HV΀ pY=xegDq":ƺ7VM_0Zڈe3=;aIׅӛI"mA x(dZ O.I%q2ܵϘ%GNr3jVx9$Lj$wfcRn.@D 3dw4ut(FT9^LJ:^EIyF+a+fJj $q̾AԡR踺z9Z7c K'D1_sf~Q  F G\y sX}AszVJbk7DﱄuwѼBw}>D] DaCH~繑Ҵøé}XWEIQIwslzaQ0[zEZY^Nx|'1Z!#K6^VI h$Q{OwBnv͹䶢f2qg>y>T:PHZoc_6K r/ǁ"SOO8wFYӏmf)4BRL.blɑ?9PtU?^ Ȕ{o*2rI1ثQ lA~]n 6:+s ^*!҂wop"B,P9ozk,n H! 
}$[v jd/Y-UI6 C7tqG{bxиzu(2gPjmo.ނFи%Y!7ߟd$tYOrO5jΉ'當j9ܲ41hjAKz AjQcT~՞Z["ءlM褪e:]|wv׿EϙI4IZǖ?|UZZ'ndnQEj ?O'7~C˕ oQ J,ݍM ocȢ7=,ɜ=2xJZk~+]/$:#&&!}84b$@[ldǣ:ÏcS8#6aJD#'FAh@V\f AD?3' ƀz"էF &r֨of}^Fp |\:Zc?.c ܁R9% |@w݃վe:hBH}Jg3cyK߁]f* 9-;FGr/;wQtUܿd+&t~j VK&"UoOK2=˛6D;i$s\"CDez& "$(/IOu(>_V:(y^R:sE:w'N厖 ۬PQR񘣡+-BXp2;O '^<lcBǿ Fϑ*#2~*2IwZDC3ɠ=:&{('VpF/k}혆} D3{+ނmIy"9igse0n r/C~׶)%²qm^Ļӹj믎 s?ՔKц"5hLu*&UHtΗ)l,)B]f^:1y= r6bIn@54bq[-G5\1pzb` q~YU{Gҷݤ U * Qy?C¬1r()%% ܌mVu~x\V*-16YGO#[W:mԻaGbw’/aߣ~JxR X S"Jd*> &L) B#8W_: K? ȡ wZ&)uN 'R+$bhߍ?/0BK #q LJYV5 $G,{Ǐ|2 #B%AH2(@4RZ犞h4p.`f)YT|k@x sQly6eܣ61T²Lmhp4:PkK7>9H|?5 G,|Pw_r? (Ȁ1 wg;W n{L\ %I{Pxjt`*?~Qp9%GCFHt$yf 4Pʘ2q>nTZl=S7}CbI NҺW :E;zAO;.5J3Q20N7Ok+fri[w!g;Vl 'XzJœzq"AB^^ 4Y4wb?( ;HP% ,@J*9xo [O_A~hoB*WB;Tc%7Ym  ^c%?aش;EtFYaВug+^YLsl_>{넿u;vS{u3zUӓԐrI{86Y5 K F6iȲ IC0ɓJ԰qG;b("Jfy*|Tצߝ nWP!_Z5q8|VŠH:o) C$=h*,@[BYn RMՀ1LSzd UMw|ƋNɼF}8;}ҐEH w,v_T˧0j񒮲!0.M%b/2ދZ!)R5VQ- 7x3 =!#p-3 vDEӅ]GւtaqL,W1zeQ͔Z&'K]f=ޔ؛Z}c0~*4Zѿ)<$,< x~":+.P=VY^Xr'Lae:xV/p'\b-A&uܪɆ 4%q 2m۶L2&ĶĶm7m^_TwCÚ &|Y\>Ip(3, 靖tҝ[1b9w!dҔl=~#cp LԠ=h\*90P_ =v9"k̂vr!7N ?3dv-3Aט]>'mHIQjk^`,sEU -Ub ,4Ü2xM5/3?vX@4tسVP(i:ؤ}7Nf#N3FT܂wUcེ}q8fwb/6պ 8XuoS_b$~]ա>]Ѯ+˒ +bg*~M#v=q8ku* iwo4*-e6zMM" ]F HSYj~SVd">%Vk:? [ :m퟽֓:4,$ ]FAMՔ&n:\DA͑<_]!F>5B'lHou(:}_oWa H' 5Id聠]FF*%Kg´o!~eN(JEg 1$r{9c qiO+>>`P#NɀI8BoˀixqRG30>w6/RN`%;>|vGvqNo_'"a W,u )x} 6=J@*JIiKƨLJ֘FB._S-)Jf.<$p@גCtpTeZ:Gܓ#jTM,%YoI͟"zFsH[Ze>_ ) 6 7/TX}8`IYS8yG8z\T5M5gS_t#Sk  ~2{KDcg[Қ}^zH. 
FVcDnOW}ZLf}r$ IDs] \`9DfEhsdNWuu7A̬V4@ļ j3KJޒ6ruGlV8]q͸"kNsotT%'vGu|댚J+8tVn cU{6(9 V0 B1jõa_w&Ey~9zu3تuB Sx[^v*yS4S/MF`-[4?əpʙhQq~eo‘2Y-C)S TiU}s*!@6.KV(1x8:UK\TUԅĔ+R0*aHNTWs…/zS$Y%C5A6!i9N6|~N:8Iiң֜,j5͘jJ O&5~12oޞwqL53TbHǬ#DrP2~&;-ZGssk4/!5>DsXUb[N'óuc.tEgb":œMyb45E#5X?QfPБQ?ʮWx ev4߬|d?ȯnmcʘ(R:gL$?&E/"~"X]~7 (U4+`e`p$.lB\9 RNG=@㻂wӥo ܊ǡ%¨m\]G).Ne3Q eXEY&Ғp漞#ZzHnʰi?Tj$Q%:vO% MzJH Liz8~ȏunx† 119k&$`P밒㸽SZIJIQn@| wfY#Ų3۸ /C%be>(oCbbЪ5sjI֚B@[l,޻'qhZ4\uEetSUjaD0_kBk*cN Ĺu%?Yu?1u]F'\mebU}ގ2HVEiSu5)/$,ٴ4"GڒQ)OuA5yK^NvmnuѐZuwףc :[Zlw/"P#8-_{ci^cr#6YuHrMqI(.HOrdA yৄ[ts^3"(ኻKjH%Q<(:&u"b!Sf*PoR;BN>! lmI 20e뾍30ZFd(2/@%?6 |Q!Q=||yt{J@*csH1p5{[Bbb`Joa~EJAbɸA~xuU;8T|)4_ r牶^'Cՙԯ*N5[6sʕ-}YS &0x EXJJT =4x F ؊j=j0:qju Nؑ6;$ n={- }e'8uQr#qOU2GDo!zjx\瞎Ng6LD&(i0ωt_7qZq `=Vl*<~0w#/ Rr-[I/$jG님kbJ|3#Ȝ}>C%u^^1iywc:qg*~r!R喺P;|I[$qS Qr*7[#79XRxS[ύXC -42Gz 7F:1TͧNwT'= zH?̏:=2j kx8s-* @GcBtt Sw:\xe.ukQ{'3b=g'7k%ey=gxP9mʔn"gMڽ&)}FgPs-!IK?}6v_y/Ŝ_,7 Kxn=g^OC fX4bS#2Ǝ,gNI`U徨.sHSV2J>M>ٮrA\8.og_7nގ}=̅Y};7t<Ƣ=wVֱ6U_7YhŰ*4bnPK1XM/|l!p*Q`J]SG|赒S{<ȍe4 lO$1\7:UK^A$̬AkQ#6R 4(k*hpѱvsK~6odnѢ;wTE!Xf]͏OfSj> jLsV+ IۆeD➉)@krqIIjzdOx%0.VkWi05(5%7/6;Q( ^RB}!˷SDQ]W7jEPQ,OySxNC¡NavqJ@ǹ-&˂W/IC_kv@6C=9ݦjNDMRh}nVu3zƁo [PBT7 \ Q~T+ogI@{꓾!‘ԄzBKҹKAĹ5{;uymZ-*.'8lv$2fRmd;K8ڊ5qGΆ]}sHʂS~h׀蘲0?u@%" e˜ 975}ש(On!}x.21asX5vX?2ѦN08wJlFq~[ٹ|tWP: Ϣw\ 3>vMD·Y2#|+{ï qfQo]g 1HXUyRx!`%F1](a7[C$<GMRIG}ˡʄ+-Zc{uHn{l:f(\QڋoYxt˶Ҍv#,{fa#>tdN93p5A8z<'O<4+d6Cbzg() It6r,*a ` c,LR(V;e ӟb((^AS6b ghs/EhyiT|B6I?֙n&>T ֶ˞r*2_#*EFq]4 /joXiUo2 *v(w_.` v4`Al?Ϛ`ZHw׬)zt1QM^J8o`x RB~ 򻻻J?+E!>'e.R[[GL}-!$S9"(=fl2VCKzQQDtEC:.Ɨ:<ԑ s>{#XN૏Ϻp`]-dZԀǻ[L#v,CJBN+Z}TYuv/0SH:% ,tA7؂ hW WITbW=,U9^7<wχ֠r(葏sGH#NV48y rhs٨@#ʌ9طvb2^I%WTm]M:ߢzgu k>̲VM4Nj9ۥۨp #Vz{.P\Z-9k*R iz;hg? 
zoټG SpͤzdwSP?(Si kh~t>ZR "oi[}% J}Z<čx{i^y³x<:+Mi~U1K5Ou y$A dT2=u$#@':{Xw*~=0 BJѯbDrP",.OdpTMC]U2ѐ&ۇT?x׷|\VhCRnXE[G^5+ACňN\1`hwLs UCtJ298:ϼWKf"["HSvGVjM jIM&"fX^!IݠPS|`rm$D88dVU6Nʳ9̀"WKa%ؖ'thYiW^U5a6vB7gWO.n™ ϰUQAmFd,2c_-֡O{rsMѰ84_vK7%6b h +m7 tqMhhix xb52Fy|,}dn<.7{τۡ)Tyk2 Bh=v tFxw/ؔ,$HcaL2 s/\23hp6w<}oe:#ukΞ n"`n In8z4ro9ωt2מG%R>ޛ>ߛ6(;2ZMµAf"{i?Lj# x¾33x$pY_  ' ӗdΉ9 L+XG7n"=^oٳ^qكݱl?.zg{FRiO2[ﴶ]]j扉)z(ݹ yOO0SS.:I& b-|誶C$Sߠ1 yrMڠŇ= ~ {܌ϗC1-=qvbnq"1,~F4~ /fSފUǔ#nv% :24/1>e7.Tu=r!9E ٚ9D.ĵScCP9Hst?9a1籶A832ٿM;m+m zsD`fcض j ocf @L/"}sgf^jNR d3p(e)8^9l7ix}hHS :lR2aW m2\MAÌ_ I%Q& I wى>>4,Q`gJ ߏH$>K<ALH/xŝjOP`ENY{ט"rHjIL1ͳiZgLß^}GV cDH=;QgѵxYm0i<\>Br0 P/ﮏdcPYMFZHbcg tՐHf6΅Aodic3#6S;fBڌ0gECkd쇱(2߁_&/τ1)ș`~ hsQ`NBZ, x*^f'->eߩe!eH]_PP!~ɪ}۫KKjq\_5\^M_ [e\eMKz endstream endobj 251 0 obj << /Length1 1846 /Length2 22813 /Length3 0 /Length 23913 /Filter /FlateDecode >> stream xڴuT]> V݊CAwwwwRݵ;w(3<3ε\g^$UPf6$V ,̼Y99X֞Ala`edffGVb@{/ پ.9=Pv @75 D" v531`c`鏷#@hhv37Ҍr/`7l0-`c H, TWUPa| `m ZDUT%b_T 5zʟWքEM'ϛw9qaMq?kAvfWm*7Wc[_ ԦּLLNNN&v`[FkS158moW[/b@{S? 
!IoNorF0؁@h,hfeZޞ #:!Ott,V/kMeFB@O9OBTq,5FU4d4YEi1$Ց#Q֎WD2+9 fYl-0& _‹38#Ұ]hZtI* 1w%]Zp;&:1IKPv.mWG⳥ϧ(E`apCeڽ.}"CQitVYxaI5Fw*aMGX*1Su껕X]m>%=<hR8ut}I[XOQeMFBs( ;{~B#v۸vA>MBL"=xδO_D]5/a\XexD"{h>[k0/C$9S:cv\ (ptHW,rS+bNAu"{4KۃJglQBK d|X!6h0ڪbX-gr)UgaCQ(#r"<=eQI*u)rx%;j"C〖#K\D dȀrHsJ رֲn֥f?̔҈v=Pne_e'd|9IK ď%DM󚾉=GmM)Db|_ӝE0Y؝AW%c (.}\$;":PIznn1M@{!pVHCΞV|z`7%u*o;B nϬ]_nY(nݟj7FУ|5e#'|Kc9Kw ^S K$!JZt9HP p?tS=cx+ނ;3M{3>>rWFFT*k{6 N\~ ΤiA f~_vMN' kjPo01!14f^@ \i6abܥ0J/O-Ȕb|+o]/syr\I#cj})L~ʷYu?9uT vQDTeJ)jU<)}Q,Vc0a 2]VX~"N::##-yN^^XAdn5fJ}lVo.ҋV-8֓(chϖCkɱg-y(SMgx4woQ\DL 䱢ٜzt^_D~2)Sfux#fKy֪m7S]Ua>]!7|yUTܠ'vDՂuĄ"kg,)(n|K]afˎgƻ>x}PyFb/h,+joE[ >1O.4/zsgUўŌqDW~ ?1JY/h|v0=ZSxj%;>oNGp lwO&m)HTiv #4j k8o*Fy;%e""UtuлLv| 1 K(MabRGw/VirB nc0ϷZ/}0Z.!AW3Ah_4WPmލMgFixPZ0%~n%-Mn$ *izO[& \u)b:7s>GoVkiR'}r2 ?y6<#vQJ'2{iAVKGLN"o.>뵊W`򺻔vBy N*5fAƅ۝xXZscD*dSHp@1vW:d P'[D£(~ogkyrX-lSY<ѯYbI3-jځDXrsToqȒ2:}<$f 껎Z꤈xICtth>-Qv.d JI> !mVB!3C 8+ ; AF*%鉘Zqu3>"{ MFKlVt?axs-K'N$ŝ^W BV*]8+,[=[#5p+UTV%=Fy6f7!P-u 2Fh84zQVeP4dЇ!LZ*|4lp1ZF䢼xp 5b19t]ȅ6YIjߊa 9|8{i89+w1ţuM/~g(l\;iY.9SL`CmV_D.F;O` 4&|dL? 
9UT]^ U7  ʅyW 2l눼O<[+e 5PHL\üӃls\2;`.iΓ; N2kgq?=Fʷ-}6Mm$.c%/Q}!8fаc˩YWu]_ώxo€x, 7!|ugwlb#߮tiK-@e:Ri BCr(J?WR >a,'r==aYy5jL 3\)6o"rRh9<; YcqԖ701KxrK/Yeo&IIMʶԩoC~dӱe8:7dL0h{k'*tu@XV_{ĨU9vǡ_6 'jQW47*qVRjl6))ݡ/P-@ѡ ܚToaμv%dFr5BAS{N/ Iєu詴^Caf!P4$M"E`S,{t>ZV1A^8Ar KLo+|?Z2#XT׻gGPaDbȀJ@a~揹r҃Ulʜ" y*tĽ/?yҋ$6gTlj ~ |U`퉟\jaHITS^i凄N}03m]/、@' ߌ*[[OhWZ%$x;mS#31/åmBgnW>[9ˡsz=5gN0?f]]z~ls so!+Eg>0_CSh762ܧB+ |"c=L⑜dIZ8H Pڙ-׆\?#Q(E*!/Z-*G5E#u!W_klYd.ðnVt|=m2`BNxld1Wrw5t\3wj֍Y:Fbj ̃/k}„O1ɸ| ,?~b2EQVt%7*nsR[XXSOhix䧱bY3 }۳Xxt !MfdSuU`C@W [ఒX+oV$J`4ғWm|;BVBm}\;j+=4dNjlF=OڍζYᣯu9Ob0JM]%Jpˉ /1-԰y(3X4KSEtzV f^pk/G݅eSt*hWy wOY nH\OW?T5L8ҺhxHn|USoVe ȩuAsy>.%cWW}oRux;/ᲈMXXkz;H(+״a+ N}S4ɟvGk{Q4luz)>T0X֥QyXq{Ʀ˯G׊u4M yD:^;A^K ԇDPq)X)Wp#CC+$&$٫;XQ(`/L; ߊ_{5Xvg{WA¢֓۩I}{A{%NfNjW1'mJJq%KHdLwQ/SSq)D<+.5zH*VL!@P#UݿG dA\9/;ݪ2璗ك_̹KZNf"*nm(WYL&N侕xDŽXj\5y(Q.L1V3l{Yu CZxF֣7qj.UFP=iR $ F$pBu67>U*shȪa.RrQux@M b`BhQ֟j.%6b債%ļFf4h7P 7OUm%D tkWO~hI ˜V%ir]6jrbT3M5-9`X2)p-nr̿5Cv].lmb 7Ƚ b=9sw 8F,Qڌ(ǝ3]Ԍ$Q&O#OԳcսR\()s{k#"Mۉ٦˝ %1Cv-,3Ko' oFcG 픡]x?zd)Ust6ewfz0i|T6d FZqf,fluڠ]P=CpC(T18!=&P_߂_mE 9hBbw>N5"hȣ#4~̻=lS¥iv4"JD5*xp*\-H qVjC _3\;#c|,+*DEZϓ}vJ9GNg 7lhȃ#GJ;fCʼ4J4}` PMB5qL /1#{&?m_ eJ~bgc/ALusG (CK,`BV([@mTOoV$,DhX՟L$ .&>w]MN_O BVod(z+R3ukÒQWu0J@-zCSrYkDoG(r4W(?">G芚T[+ܺc޻8֝pE5ZTrf&_.]Z'e-IA_piCw+| cX6H(g}Sݡ !Ax<4!eaöݑ4uG0\X?B>7:@_ɟ[Rs¡BjK2IKT@\pvDð*K}`UzGY@_ga.;xA67@1MHDLu1wҹi #D$)kL?iDZyͷ^ \ s'5ʇ5܆H5a:1_yNY@6&'NTKOA,tI}oG!ۼbh#i^Z rǯvi Eldnʋ'O'v5!Rr GŎ~]V8_.7>ۚ1t)+|#2&+c /<0MWo<Y63~k5eB ,{Uxl5m  fZwOyD> "zo\NBfT@v5NYxrk/U#KhʍXİ@d*vL@Yv$.FxZ\T<4zq<,/lm9 ?n [z R>!ҠVW䃰'OHpH6Ѵ?L-tF !toO_wIS\bC-X,>/6|tza1Rdݥ"]EY|~U8 _ f%#iߗՏG8rp׭Mp!+dsaϦLjυFG΍\ n;ۛ}; xk״5AY7b'@|d $M0S~JKb#OJk*),r4Tm gMH{@E8CM=% ~+8{e+1\AVʞmE +X.~/$!AC>)u^eh)F8OH$ V}t!q#Y~m\)Hj Xy x')RMWZxL|RGȗz|T@/{anjAчjThyX_xD$ȕ3@".P:VȖU leI0Ȑ.`4)+Cfa'JѴ J8ײ,ppp2L il'{V7x m㹆P0ty#BuK)aFk6p&'q@5%yu5==-, =5ߧ8Q+>&F`D=!mWvyJn\̖؝rIqDԣ%G6O8n#iĎ@>9EM.>SJtcsFhY[ -rqP"vOc i$xwAmΩ?)س%Kp3>b=K#t>{{v0=`r5M4T:Q+ RY^- 45~DC|d!OCS_jDΥA"tŁXk6 f ߪDxуjL?_ZFC= _>Gל,9K6֝"ӕ RYݱ˃D#]T-0 F. 
%mck[$,BGפi7~ϯL3H؎C R`Qk <」/ƿO\xڡT+#OԂL?kVIIͼ{?4L*:"q6|K ]T4 \ib^ͺ*s `^i+DgzZ &VӵN&Z'0Zsj 9‘%+0:Vҧ*CEϵjTR՞ Z!BJ-c߯] !s!B&Ua6}![R;@ڗ*ʍ!.W! REDbo,:n>I/`[xi:,*B̡j˽O@(1 IzLHzˀWa58vP)\ZgUIrl!vmYW;o7ٮ)ϖƭs8_3?S5n8ҝ陻u rZFyJMHzgԓ*SYUIL#UdHM0JIb!珩?RMTG F9 rh\?r6hkd{Z2){ /*N",Tt1SAgarf [Vlt?;G zQh<=^KoQ9$H1zQ_Ld1s 99ac{1"x@,0.1Ky ba7I՝"~,<pJtCXI5xjwiݓh.a NSt !:k,\ LA!I`1DKc@讲U|%u3%ˑ9KFX9]BJdGMKˢJJf8vDE2S+Jю6ɥ!ɩQ6L]u!7#fwv觋C<=aU2^Ε=؅r}6+d$Z97ġy@"3EMݚsJ|rZ a)^!L[_\AM`nFtox<20o3OuF`˛@L2*Ʉ n]4K\LZfK-r̰sdFPSjsj[KX֨g9ATAM|Ut7HcymU2zۛ} ^|5C ? )t)Lmš]ބmU Oc7 Dй;k^o`_n·7d=v9qdrvT]i~t;Gt4N-I;DSLoa}[cUAK$1U/?µQWabٙr[Ge/^)S(TXҿ{<%y)c9N┇h\W7bn `(AǭL'&{ܼADڱ6)7 FvyR|GYmȗ "d4%M r: \]pEF8Dyt|WK}F;p9r^^0E%$㑺kF AZ UFPp‹aTb6!7 FTMS/b(֎+3'QF3-J{j3pܒQ94FixP8p3*ZA3U\H*M yTiS5+D:IȖ.MF,f!$}҇ 6;Ơ0wUR{G' ŲxЬ;[' s[IT""[O`&f+hlke6\OôSϸO|~Ț吙ksZxo|4,Th$ +jʬH嬙ֽ%4 U6׶bI̼ꘛ=*`x3 tL (`Z}bT=,àj/}ƀ@bR'M(*IIŊ?%dʤXfiyPtaLrjwӜ;sA/1,pc8}z)eΫֺ/sv[(}e3>ݗ{gBF~P+LBx 1q0raXstF+,ktt9:l_9SۺVc+=ifJsU:MSq.}!{[UKl'??&{+fp2BR#!c_L04VZdϹ)h§$&'pe-S5@ xcRx0O$͆niep,Kzגg %}<LX-QC`JP1> gʓ聬-yErf~0˗*Cm:'͌xK b Q(hdԞ$ėnWx΢^yl3u/>d[UXtt~cj=F3<f n3Lb5+{镈y: dY(ǐv`RԸDǝ49[}q􃿊6?f"ڈ6fs~j4~u nV Dj:9#CEPkcP`neAOzټmdc=4no `73cd;N"}&vB3.|:KQܳS`Zi1C?o^#RP6i1Q*}Fϔo7 .#eU_3aS倓u98J:mp]?tacĘ 4fSdJ ȥNͬ ڒ6 ib}U"'?LR>Eܵa&p SRY$_+?w>_o;\ aSSBmf.Z̎VΑSe" &+D-sNF!vK)J?>ϾЪQŨB\?aMjc͹@PAdCnKQ?oPJs|W2@brSWd9Q.6##@//Z+YҫKIA3$4_cv _?DŽ#~C xaNd43 #AP|t[PΏQÉ8\VճE1c~raLCwgm.ذ7ў?6g=SM|J$l2;N>y5~(+b3`wV-[Ei0t?icڃm[^#)`` \s4M im|#ҋ`@gOCzXF|׼g}dj ܸa5s_j4Ɓ2X Ea5;nZߡk/:Qs˚b d6tvf ;֪{dHtd0 ʰτxn5h,'gl5l"L+Eׅ$mliQrwFgq ۵tdGe* WKO)>u03Ѿ_kNnİv|#g"P t.e X1Dr~Eb,w}@ѳV$gsJK(+n $ԭOeuy6u(NxVyI ̜f{ 7,~n 9dYcYJ {mb-Z`-0E=)۰O%{CFp*آLoN~v#JA"DΠZQO |+^笻XՓ /k=A.mMjO@1'p2IYKۜPƬjBZU`v9& >nE^Z|Y-\; ]dK,KnP]9:ò{n;RBS"[[OlnHb<׆*Uf_P{i"6W">šN\ 'ɟMg p >33 䯝zYZ^6sZg] Z p'/MJ[41] r\0ֿ}D.՞e12alża:dDĀ I=bրƇâ|Fq,^LmPq#[[SIJ Ϥp3яq:T2Izq^5n;S*F[5NEA*<8w@xwPs??sR+^fz%u;tY8l'8DPtLiZiq_?kc_)c24"V{KaZFkQ0((lN,8,ZDgF5v)`>uAbn)ΊG9 cEzWTq;lR Vp?mhɝb0 ZZF&M̮Y*x?e&Kc5woZV7de3\{ҿ Caָ{FTr-Ш!oqFgBlZx5O 
u.?h~Ohs_bp/2w)-L0蒌ulXUTx$)*R nuw6A?l ˡ3v7MÙ?ĩ'[GrGy\9~Ǖk|=|8%$+"r|ȷMB׷Bgl|Ɣv-ՎJ~k`wzjW6$teUu-.fnDSH*鄦8װP(8kt{=]'x3Aʒ,Iބ)g \DWj6aڑCFo0C2@{מHG6bMRVHTO( pwd,6K5?Tiv n8в @ ~Q,09Eҿ}rDҬ:*K7'/BopHx`K{44 ?xC K^b@_,a"c-p%PngԴw4-dݭ/inQc%}IBVSfP3T+'5۬Z ȁQҕ (dmnwI -|X7:PfuM'h栋A+%]B~] ET.r;"UbFŌN8!FT*5 ו2WHݐxjĬwyTCy^* 0=OHr= G"[*ol]Xi1@lkmxze:Ou4G"& *DCh,pZX#j4FSJ0,qp*qC"wb#Vō ٣\Jz[y`/! 9q²K4{W./t1Yj}޵p]CiugyUO4tz,' pٵN$!#kBg:Kd>#>t^4_=`RX$r"ər¸[`+SX3bM32 :r\lxZ*,z/ŝ|w袉1HF5PËh!f~kw6'i_+A @f}D4"Pj}&o(J70KGau8i M+'sDad^܅W"X%JFx[G`Dk k :=i׏:?v4mB ]HLAuWFҊ-WA˦ ]= oȉ;|VDZvF6]2¹!_%WIq7_nbgv:2HW/pF[p z>5r-lv_Â3Ѿ1צǏĥ? шsCW6veYhEX$PY&xNVYeI./`5;~'9sKSA V{DDfuD|'\T{n@{6-|(!OO.ŲK%bX+~߱R\tcAn٫j٨ZIVB|`HɄEV%}tiIr ¾g7jc A }oku6J̀<BWtÌ^=#fQr+lۣ&MI^(j|U_ES Ɇc/}sfyq/gO+kTprD'8]>\kQ*n+!gM Ivi](Qut-Wi|֕R)U)AKmΙ N#}3"a3jG*/HAN[X?\rHd3MSS3a8o< 6 3C$Jm HvO8@U7urd,Ē9y8czDSӵ1#|(v#F:LȤ8(MK'9+ h=5WL`h+~$҉}+8lW#ND H-G`߾쯇\4ϯ!٦C$៫ͩ#De R'o~xy1$X:U?sHs1 tjln[}[͐Q(`2k]ݗGޛ[J3 ۡUWW#\;GhS"@LJȏۭ + /o><@2z]JI }yzx LFhG஽1ZNl[N~՜CoKb*H1|CV8" "2]|翚PG<( Q3~gPڊ!owL JQ>cf }Mt]"C*~O`XUn1J r@+_h5]DQSvXP^VPgɥsShj\̬Pr9/ ;Prrp'cHJtK&،3^AIOKmHLԪr^=E o._,Cp6NSu~kgU(ݚV dlv<`^Տ5vЗ1iD0~;ԗQ Ho-ߧrR.'k@'(:]A/B˙U1Wp[A6aC/&HC0Lת*vnUWAa\gtv˪f蒲:.&|G#PLtmkL (H'3r`Z_\^cǍ 86bX#{nDTfଶS ߥ&Mkd7rz2^'nsZuofroJ;o|چKpGOWMP.;pd.?W_Lڞ-H ~If/2l_)vH]?_O2Axs&{ \ R8^GG(!'>}#׈Ν=:mjyG 0G鷲# xf*ÍF&A>Fn AxO/\3LAᱰ-)e" "y)vI|?جfhZ"B<: 8؟,T\5t3E6j!䓢hP8zrP,|6o;LF't-. 
?$rjіe|< $H^n ^QT>%߁Xg;˃@u 扅Cpg1Aо|DǏRtDA*2kRnD¨:l]EHDԬH65D %ʺ;-0M؃^aH@:pt/QZb"k;MItf0HH~˚Ww)ߔeyLEt+f⤤-RP*@cqVudHђ2DZFfxc U B Mk~!!`ƿ|kA1+|l~@ C(b9;On# Y_Ʋ,ӕPBK;`*%ֽY)`H,Q^-].j'iO-XEZtpxlOa?zSALC|riN.5z^NԠz\$ZL˝C Z;КȚ,9K5\՘p/~k{rs)+xJ ?_ǐGI @ Z!etwI ,GX2ᄼ2Ӯɒ 8biWw/|m-"܂E>I(M/D7E_~l{ T ̭6DƳE,Am2Au ^Ҁ.2 ̣XɣwrZOAp KL~Z]PG ϨS 0EZ,gcؼ $kknU s l(@،sCT^n 퓌Z~;GUlcYcs lY16r39b,ϓaY:aAƌEq{J#NZ[U$МbH#RL] qvo 8z˧6r.8,%L/c)JPZ)y&qiٿ!'64?'Y񣕀 !O =em{¨Ĩzn{koEaAk7>׼:wM-t_2=m;Q e}RkxɴimfC󽃲;B5P DƆ{K<]t؂lAdnKj]Q.X.Z#so8PDى$,[f!doV)7夳U <-b殐$R%r3'0@GNo3txz>{۟>?J-{Pml=igk-;ly֮Wz7dQK΀&a11ojr$j⽪[I6bD.Y_8ej}4<]a(ũ!Y%#hi(Hwym%@hAD mYr[~\.4kȸي+]w2 >k&ĠFJEgc6wS̳EWZ[^@^>eHA #Wh6/q_ˣ.j{u;ԫv>]~EDMZS{m({#}T 7voLd fih3]"Z,VpjqF"YU p2lP':D `\C.RG"QHI~J'Oz !ǏQXqG3 StK' "`pg7dqKxf;jO3?N^?C ^3~#/&^f-l(JRB"j80)Y˂Z5I+f޾S 9 pRX GZzEUN'w4tO ) 0s)p Um\y*Ƃ)|g PLVVq7#zeV6zу0  .Nȝlgyw1LlT_O v6/VpVcaoJ, ]>&yVOpf_5^Yhݗ>DxnLQ(ynF (ܒ Aµi^A TڇE*E?y&ѻH94CLmmKdb_uI\\P)'7bbbF h# zuҩ"$zה/?3A؀oX `{Abp_yv:|^l긠 <4C@䌶>sb/A~};ӌ}4 !Gەm$/HLMe Yo(LOvj`:hl;KXo/u&PS6g4(w7%7f!J)Bs =+Q<ڥ(+0~DJtvPWv>vE.h6f JԠSW•5PōѮN 3@ˇ`|%p7f(= f!Lanh3n g&B殔UqUL5|\\ 4èb+z_{"$UH'rS[WAi95y9Jj0W e&K:+X-N`8fb耸ҨAώ|ӧ+bFNk6Lˍ8^&4o&rU۴tAXC4 5k\妑}b2;bBzLi]q[P ,WA.ÊܗP@~ZO3ʠ,F'o &H۵syZē@oxd-BǓ1 SB[,[5E̅G ee &M4+]\xK^3UsIJ٥-Xݵir> stream xڴuTl HwwS!qAR.o~3:sw~`AM,n 2ʀݘYJ G?ƅHM-4s9Jn6U 7'؂ tso2L 3\ݘ\j#" rvqw,3 ;-Py:#hcfoYmMi M&= 8jҖeHhI:LYmM?*Z`<`ZZzj쬿`x]\mhvr9@g$brfq>-'t!L _~dkttvK00nc W3|Ԕfn@G3G +hIIw9rO.;f7l "Vջ?2eqyiM-f%92+8yc;xy6J;ZJU"O̓ś[:ZZ݉U(/`5 :^63+`}@N+3{Wj}V7BdXZU.hK ߪ)=xZ@+DVxeK^H?߆cR:l]eljn6p/x## >vlZfvՁ Gs?傉],UY[^YF+iG 5`bnn/;x-^L v8@.; `-JAV?*Xx2;UAV?]gWٕ pv?]?]4 .ήAo pv?]?li 4spoCo);8-x-|rsڛ[&s3 mo/9#,%ZSo oßE;gZ \şuG_Iy~ra\_1~AX[~76Z.0c6N6@ǿ,2ۿ wAp`rTfS r$Tv"?jp0'3%_]b? 
Pڂt ̖_ղ%j ;X`D 6wovpi fuyrp3]\-@.n_L_{/fp$˿*#Z䟫ϙ϶ dD &;XoFϕ˗ L3'xNZϕ [^@"B0]ZKħ颙 j~U" 2g)mAY4 %9`R]0/,vP?hd)/UP)q}H h>1qJaXѱZY2ފb鵌NM< z1l@ÜiagX^t}ƴB->q #&kt2#6=Z/XI]t)TS9,to #*0Q+Pc8ߏ{(q  x9K' MI}1o2" Y]%.4A̍4ڟ4oџGUqYkZ wFhWɸ܄%bYpUgEg2:eZg]'+YƉ+X*),W=C JZ Ըn@l[|oH貢7 wu- O;'9qW<,CshIVՐ E=yfGFIbqIzg'7n~M CyL!c#x`Ԣ@nϷ+1H76u;9i]~+؍iI/&#mmcLWߛzowCO awo/FKWA+O^fl6wn3$(qxk?ٽ{ vXԚ,/c-<ʎ<ۛUCKּ_5 ڋ8-BTP1%1ؠ Yfz>x֝!."mbok,|!d* z2:htE>7Ru߇U$~G"]pKcA|H }#褐x5Vc!R $͗I^ "%?6^|*-.+X;_r&]CD5=e\ɦG%4Qi**9 r^[$AVI1t1o!#*KOŹ*=|T:}34" %Zbi=ЇT-3VހR=] )9ZYWd.fvF)A)' dC3.9=r2o STE`0n67*G_Fڱ]}JKW`sOY 1\VEغR F8׃;> b*Q܂ZJ)5.XbkIؒF_|ȹ1 ĉ'$=>` BJ]|Wwj iݯV6ߏGw;xmXf| %+ .ʛ;NN^>NDgfp^oOp2˫2$hF*T1ϟ*$gؤ ;b0|+̲q6^a+=I"yWtBiɫ4VGw,Jރ]Bm#ln#:Ds%W+:e:6+DG0aK5Dma%ax9BlzT8‚wzLBQ,?HFyg(WzoP_J' \"r?o2K;99^M9̕+_e ̭{MGGy,=ۛѦr*La6Uԟ F4%4hH)')硧z[7OyR mF"+{m,5ZuO->ƻ3up/aoɢc"nZ%@!G.oAûhh3.[SF \pARPER*4(Ef*R NhE1xvAez NK]ɉFw^\"Fdn][U Cڏ=:'r6mnX(w2vlC1z5=Ao Oʩ a~p6^ _]|C~%[P !|iΙ*U!5fBZS^Aac'Lk/gPLA1Qγxhrv/q~ v>!-cLDW}[J_3eD)|Qgu×A*뻬a]3^Ǯ0ئ󯛑mX&+~ jU [k~4[ivT>I>uӐ`3ei0V:2z ӜʉsBeS_:BW:K<:>CK,X!srۧzh& d=(!i+j _cStsRopH);x5R'xm6/Ͽ&~Nz/`QiŐ3c5 Sf,N;~@v`[}9l~> ڗ䑐 z!l۝*ȵG)7iY@?;ϸ1=y&x5ɓh@~JE9OMމpZl&tNy,?];R_h*8;\i_ nzjt6KGɧHoѕSh qHSиBq@D\C~VJ?sF&qF4`ugSA0mId^PwOMG~XO2_?֭Gp=hs%VGuZab#/ w]D17w 7BeSbMhL:zʄ2&|.67#lhPT-EJ86xMh'U:k2uXHk'i 茣:cICt/*ʋE kRQj%ǃ5SPew S2:gpȦGzR~(chIoK]8]?|ұ R3ʢ3_o7:gʬt },2,~vP#@%+, w1!OZ8 oL'wR0&3s˜W_mDr7z0ًa"]qGHF%!+{c"X?eB&_BquO:\&&?KMi1ERƶ+ *~ ~YzywpEq3C#as4=sHR/|fX`p-=(l d)׋#<_i>6T\+|/6.!1=&5 VϺtk+h1, ⩘=u$Rȓ ` qҮ^hڻbm@^Fs]BCjg^k᫘;+|˜uEy?"gԞXM uۼGGM,(0Yr'MD/짏1_?s"3Q E H '>#o"$^gc,X@\St$,$)Zz崐i"fbB)\g#Ϥu-ŏW}g55h'¨ aψ|{|%1 ѱ<7pR^2x})85ίw %9VQ޴*!Z }2$I͝Ŀ ^ I3ZH2'v=,4 V)BL,,zfvwT.:UWdٮ[b#XA~8qjK99lֆx{3p0fj"& QW% 9vžo+a^<?ccX*i"Id8QI/W[_טv3-^lf5-_5cwcuE}'۷iq& fN*l5 4+<]k|z]˭;%F~bpC~HmJC/V[eGVV"|q)Tʏ7O̟ih["9|= 3_tبc<ݶJS"D<4H<HܯLB6r^h'QUtdk9N.6XxqLflz{巩mi:[l:n#O <|-Er"Zo&/!ٔy UqCz6 .sjk&f+W\:"GfogH>+Uvҽ!Tm'56oeܡs]Tտ$5(t{@&igYF&ڜKO+sOԥM9zs]y_֐ Os`Љ^z!(S*ϭ_,iu1,o^ppBB;?? 
tІ7>ӏf6,~)QZ)HVD~}6ILD = F( κHTØE-^{e|IP |5s@DM|f7DaDGx:1o7CWd7[RМ5gk[˽6rBԞaiR l5w|U4qLq*A#{+(݉ NdڛyM :lGȔRѰ|pgEN|/dh}&=ӵ7k ?rh^˳WF y LEjNKwi0{Bz݊U >%*)F4<* Wu5Px C̭rPQy1%#j@ϱ FRfdϵw0`"8gô#vFW#Pz3}Z3&NxQܳ|3NgRMVޙ1pºlX_Sx ϟe]oeo3=K=k^P\fiO `nhsl.A6\i0#qUC 9jؗC9zSA6W=Xu}"Sm]BGzs!Rpm|I8߻.c[#gڤo"OtHHY+]Ðߵ= [rP6]xa` ZQoT_1R@ŌWK9Kϓ|)֡DŽ=;IsaʦM˛Jh%rh(m_?/j n',2X6#EIsTQ΋w)]ݠ@dt]j`u"caǹ^[$ld >Sve̩~fFtqQ*|v@ *;bJ*>t`F+=t4g]0r|>ݷ.wBA%6Γ׽)aZڳ8%!P9¬n;؝?kC.*-3Va4=%TyNqF,1w(.F& !򤌱l<A4ϹSSgRƬlHq]jB7/_/)|r}6嫥I\a #{b;4H&)@}? fElNtȂKۓI % k7$D&QM &bՈ] Z\]SA;VZD9l% w_L_ 8*(0K̶iyh{vOM+H{a~g®[~db+C=p0}d>̃AFlݳ\ Ue^']5iu>"QY&'4V;n[!DDk9_G-L U-ޞ{Tnf#zH#4-?n?# |#&.jG%*J}{{?J)~y6ɡw#_B^sO(FqC4]) %~.W$iIl2Z='L zH!*ԟ?{58;Z=bdBn_5xnCW(1OM9m蜿~f!x,i*R60a~ML%vЃ<Mo:\(.NCQ{. iŏ~Sh6bwv[L'_*F*?vu<1ɿj!PIo)o,4BsN6>^ͳs9Pb gM63~fM^]B_9N ;}ocJiRjuB_y A eA@.oSeL+esR1P 7M_:7aWU>qf{ =D&wL3X~[JBLjOZD8)zB2gv4/Ř2.L~#'e᫒m7:QGeTehY )8%Wu'F"l_`~2ᵮٽ FiҲJ%l,@i7F˾SY-nb^ܬ,70#ԁ@: ׌s`VagdC^*@Om#*xb$3jmE>I-]^;໓3Mc|%l2ٿغc{x>  O iOb.6PkV6h/'dZ/rM@t9g ]T 7+4N[5bȤiS:*ĐW(bu7<W on I9wF ܟew6Kygg1+h|ZR:E.R lۉ/ dP>V] {4KeFҚ13U|M2%@A©$2yKS[o܄EDL:0:hzølH))C4US j=0+7ZZ=%=JjOi}11n%;K޴m L\^4ჩe3uP:$ !#Uthb Oo.]t@1i/*C:< ׷jr1m|?FJ/[˼qt;WҾW 0_25կV^'(Vw^dq9⢬«Vj\3 x9#?=etP 4hQb|y5gd$5OZr31J>)!5[>5fޫ'@oii[}(▩oA2(|fٹȏ~SSnFa(=u+ b3#'pz2V s]j4rK34w7| m/y/㠋r IԖWXˎ(H|;)@RhQm/,GH~UcEIMC-.@]u5|5)0qA);=kH[Pť5QEv`rPGh\v`&!aX*2ε@PS~jqT7e:#+F:M1$JI#*9'5VeiqL(lկvVy!Vy@G #OZZu,glݏ_H\^>sAAƬyֵLe$(%dP©5̗& q2Qs0\!Ru edEmb9z?Ya|@34sԤHZCj+>v1ȍH'ߙmuzEn(ʉn~x*b2\(8l?n?4g-%-17&vG脤%O6H<7{,j}>i}vhk|-X^\/ber(;8)7YCֺՄ"_/W$ @lȣXqa)嬺˞~uOvuk4[kM (JP>Ƀ@x*PjdBz=â{cL8KלK.HmC8_7FC"d<Ӑ)ί3'#؁Ͼ0qD 滼'KIۑB5Ye]SlM ׍BD~̼%b2v)LכJV;Y#βNb'Zc3VM c4_Áiga9mk'l4빢j g牟KCfM v ѓг?+ ʰ\o!m?#oɿ*/u5d4b5M49 ^A4[ GCn:} vno O <+ꮢ~:x+”X3~SV& I^0b;#2^/Q!cxP e8?DW S ,׺ pWseeL Y?W}xKf ~tjЩF9܀nl,= _p+XqAe/p4%B =]ܾY4>6z5z{*s)Fukc->4 }_(PESgGB+-EeLhkd$0n1c]?xӽٝhLgAd azy[ z VoCM;bo&~vshc3ȉkx.YL L9Df@Map³`"HR} 7Z.Mv>bZB`N񒉧~>ZUoدoGGRqLI2J(hY&~OUGA %*5' Cr5g2](j3؉E;}.k}>.Yχm$z#Lꔼ<1":[Z,^/jZKʻxt;푴Uu+C捇 |o>(Lţ 
yؓeq;V/Hǻ>mkP/:.l׶!B_Ek#jǧޓ~Lx[0نtcx< "r{;+H,&ZW u-ͧu ^ wtrl5iGG="Ǥ`uRg0{hi}Y~-#xE(Ԑ`]H&UV`lFNtyl71 ^pk(;u'AV_laGL?n\Y~aSM`fk2t_:O\[]+A1'mN_EKaU9ks-6׹aB \>p^5boF Ⰻ7O=ƫMpyX7h2W83 r1:Cڍ@K}"iHflh u QC?ۤITW(kz3y2Q&M+iЅ9)yoi4sG]>qqIklrA{ǶVx|Pe{} Y*HZnM\+wd Ĉ< V- )ݼAy3RBmaԚ=]a*vOmTV}D&)a*#L4>}lZ<$DYmv!9Udu?b?,w)goBT(]VGФ4*Ƶ Psh|M:Y^Ls8@Gir8th+vwng}8uy|ni1ˋ(Z; ^jSZ'J =N;茌܍1> T -Hfhu E. +gy;uwD_4ӊ,*xO`ˈYCMU%v~Ɋw<ߣJh$vU, {`A0]N~zj"PEc/l)\uڮhLAel] KT1K:8t/B v Ҿ2ĕơ ؿnf3nG;YigI"+!Atӷ]$zxcXW!h]ԩ/%288v>PARr9~Vol=kqqW'e1諟kX~1K/?)dmEHR,GB6QQΠoaieꓙwIaq6 qcj{?zb?P#RS`_ L A EÈ埋Vh6bNkB{4*S'g`?# RXw.xS3:c)O~\T\رOSBYl@} l;lF Y~,%qG|4O̦'7>tF rTO [ 0t9ސXVy3v rcÅj|PJfL'/)$QrW2K ֺpiń )VGUVCFY#H7ݨ&ݧE/968\Icb[uMLS2;4јOzL_O} pQ!$fM:QIՊ.TlM~4wj.Adr˭}oVZ[#UqlXcU5-.\8z2X1b箩ЦACTF2 AՅAo6罞+V_?szhvYX]5 Z^%ceP-aQ>p Ur|}l5Tƅa20 "Dz+uZ[䚑H5NUder"ƴZ#dvTރgXS?B{M5[0\?OGF&) X9RQ [D ;p9W6k&3b5{ځdG2*0hCs#SR"BlXO{B$.S\[E`ԉ&@Kc|Z,NjB@vɪR%1Ђ]%bه#J^7|9$nfVĻl}%{ww7H_={>nrqsNU-}Avт.$MP-xS(pr2{-"!IBk56T*GFEOA},:5kGL>`,b$-pű7y@ *vG̷+B;l~_JsAN/zrϵ5&<2޻unC,][:y_xMx}Cm%D=ZbxGm0aEfRY#4]+W+C$%b'2Ji[``*)FH01Y3`Ѓ3)1M:JIR)I)H%DLR9?*/VooS| n$_ADCImɠ__RV ~T]PrF\3x~ߕs0^df73txKfc\>43B~IX5g(+Fݕ/oI(|UZf(w΀)3,pFJDzt@u BWw،͐J.+ge:w0(~<Ğ0b(}iK, ֽc)])#KORp!sDTnyվCa/@ Ml7|j~,ʓQLa38Hڶ.]\:Kq2a~="@(VOxĮ HEx Hp'5"UwnlE7dZ^YZwhMPvqÝ>((MqOc³E5avƳ4,T e_K_u8Q3bM\+TE=> g.i}wY>/t `Mcϥmy>hvb^Rvquy\i oX;ۢo+qzʗ2.yW;*qvl:"Dg7хyvdTE8Sh9fB}d uo<9E6(y5y+ڎ1C0%kETnTn1qfp晀nn@NX7'|r`^) ΁Fmm3mK_F9"jAAׂN^ٹZ-D̒@ƒ@r]ciY,3_K̐JRv%Cj?=s$g{[6]i/:&s'jZvMpO.}&g 9i%Ō0L2kOؐ'욞Nec6$q5+eE^GvJM*abҡ}]m&}8zedՓ iJ`( ^.*]V#Ǹ3%Qjw+.ɬb̹X/^Jx{׺l1Lٱ`Лj7t 'N2o_| ND `hU`4ɕz!K/qk:Rz.ӡ,+yjF)}tX0˵.m.Qy"3sz}2j&D=Ny˚.Î0ƫګ򺽈/÷!4n\$7+&2\*MY$VO<`iв)h81d%bK il@.R22Tb⾡rue~6ӕtlcT8|2EO9SٺG1]:x 'E} Ol0!['>(d%(i%^j6UgP88q@䥶Nh_3y~t_ђ,~qkN| ;s'yYEzC }rEl˃ ~ ftJl!OV/*e49&@d)Fg/)kJejpڣUyJjD1 m*jWaMS +?zyWsxRF{P#T ܃sf?~J; hǬL4G0_mWk+U0λ}&TBiC_@ m}7n.eNqUs8CJ @ Z A{.+2eڧb$F6z/#63@ۣ|Gފ84~; ^ Y1Z̙c9Β*xYHd=ɔ&PBhޗ yߠM6%%(ʺ eBAPA͹)žʚ᣺Wz^Ń\,R)\:9%c3.G;EX6N^lU_(S dV%AYټ%^h̄Om>,2XLAQ3wE00lwimz^68í)J U+rx~Sit6 
/]>6@߫kqTI).s:i^e˷b%򵔪X/ `/(_,_.M'>ު~Yؼ.g ܣKB%T(u ѹKx>GOb%IUb"h"ՙ`-65&$He 85H̐%zhD- ,`@.Țogkф:!e uty(r1LA,].6`FcVP/8C?>a^3!GYU Ebs&h~ ODFho!om<n0E^"zSQWb}&qdt"['OXxu8ʻ9! L/Kg")9oa thbm73Pq7OD7tLA;>j1ȧ!j h %%]50:me&B2Boks?SaD k ygaa<'ǒ"hJ78B:\i x  H8AM^pvpMvN6طp*va>lXNlu%jhf?#@ɺ|lE F =e'X]z^6p8:c6q3g Y0$2Vm~C chHa*JZWyX_/g*ˑmF86L4. vZ38܅Yd0D)h{|Isͷtx6Py0Ǵ-J|`A U_ u_+@q'P0;3WN64XٽU-鸰&虣:kcf$=Q u5eH_qt}F z[>'Oz$ $˴鏷vmbЏ%*J/Z${YK\Homb1fƹo<2u`"/7~}fd,oI| -4VL`{U3 D$[׳1'2Nc[c8!ޣIVşmBE2j.LQ?[1^E vV)g!/(Q7'c)+DydQ·~~[\<,IEؐr |Al/q}帒MZ<xAFm ℍJPn+ջ4&=] M(cLkgi `<94C?E&UaΓ/;RF'mL)$"KK KP(,@VS7`j_z7~`9mOLUDh6w{5r!䔇PoWXϓ)!Lhɗ(o0E{]z9vScB\LW'<:w'W(hkO1:wk+oOFneU2tBc\ NBF21_yWQ] p&0=S˟Dt24v\W8L@^( d*,ꮳU02 \(m^NZ}.qi6}T])'q~(_ fYi'M#.x ۘuw!'bcm*Nq|o`=Z[*ZDkQ̋^B k:*yikDR+@b{nrmTOA5\ f2a9[+"#)bHUdO5SqwII/q' 2~4jW=Lϕ3ć^KCeћviP@PPp1Nl-ŹBĖaduiqgndÓ|`]fws'݃D =‚M8=v[4oTlzYeH+P"K֢ǟך"j1@pIcFsQ:6{Mvrv_&DN[g|sMmNp亮 {r}ėHZgټn%# %Ukc< SEPfHl<>I+or^9&]ʾZ cfhw:̸^RG~j&u[Tddy\(}r˯A{;ad޻U2Nl,mWQI_FQQ#AT&ڧfɰD I6/9-,*qrӇ:L}aW%P@`Gko0d#3HY!Fp%^gVp/%׽'{Tͣ¬:F׺*bo/ƌY.[7`ǜ&X5%*'w={q )Q7['{|8{ի&jީC+U#'Pp\iR`SXRdI=:Ju!U$AM_SoS<-ebV=o[x{6C|%"3ǭՋ]O)J/&@qg o1>k!ai; LsKc@QjLO8rý(]8-XW1m2xFzN Lu\@OWJMEs.XTfnHw1׭/J{ }Zh1u\g̋oGO:-ۭzE913f|_4MN{~%MMx>Vd/Ef1zІ.'}7&ȵpQd0pЈ&B2`!âk.HZy'T8*4Ck+x)9" 5PpjrJkj1,IVşm8pw%;AdeP׺К?Lƚ5Ƃ#}F""mb8!L-#b֣Of[ LǠzюCl,$ĤtɧΤtQdŤ @޳(W `E<\1LbG t*OfmWYOқ)I6L/t@yFgo!Ofb 0¬[Q{O4 qSkmo ْ~H|d@53>%[^s _ȘS%Ap4~]n{7rs &4roG]u8lxBu )%,[{*V`t|mI.ߧq?92"ʃlpv(0qVf1oZ n$,aiR=^Đ!GPOK ÜJdk[dPqQǝm!9!ĥYէ-_/ukm5܇di[ӻO>&\/7A0Fk gM>l,Ið u!(J;˘CĽ ! 
4$/\anuL*WA8.G&?`NhE*/-JE佯\=62pP߭;JvE?S!Hz֔ixNyio %eĞFc|n28 tG9GˋZB?Wk*jaf̐>"FG e:FCa~H( $;ɓZ6m,iё+ƝjѡrZ}cxB0yRZ|Xߎ G,Ye<-$\/Ix;&LbXxCny[ij%v1ẁ]>?bb=B <#8Y\(L"" [PC&0!׭WQs7Q>r)я /Բt:H6}[[Á=lTx"CEHv#|H$YUಢLrkxnamز=`l65z'Uܵs^&&O3ї91.)S&]`!¦E6"eՉvKJ5 &iYOR^55|5ǮssE%ݩe e>ق#MP 40w[B{0#lȺT!W LDt1N\,$;5`} ASA44TILU}=ډoN Ib J}\*[]_nA͹ڤeSܞ |i@J5 8V?e~u+hR_&i*ӹЙ?E6Lն-DGsEe[p t5jՖX3W{9S=ȑpSnI+!Ѧ\#8\s__H^>j7RV'rCq 9܅߽R^rg4+,׷tg>/ᵢ$}rJp$^`~29ucco$zIΌ'Q,B@ޥ~TӧvL {Z0>OvN[]+1Nh}jZ}iԜEQq>;=<>Qc k!jS ᬚmC,qF֤({u4l8d!n LJUUc0%>AXEFIQأSZ\jËf'~}rDwgӥdG_c!#;6 -/*5oi6n2+۴݆Uf^$r*CEj," /Zva%VMO$ȦÚ!S#(K a9X@dn|7~?Y6|Q Fpz;MwO%R&7H(#q< H& +2V/s S<'0$YjĸY2S-R5:;͐g/Y02)edP*u>J26ews{-Eb؎sj֢́L kڡSā@[6Elُ T:묪42*U-GGS},۴;Š ^WBF+15a5Nu_gP>]d ^ǣaCԽSbeyC۷ !=턢ȇ!߁mS1|'{ z6?1o Ɇc/,-+c&B& (J,(1,CWٺ{d Zn/)i݇f[C4*o}d]W( ד>\mtb2Nk=k1Z%\{>t&{H¾ςq0q8+R7oH_}3{Rs뭹 2Bl5w? sYa-{-Ei&ΪuS#`:QP,-v7K2dǥ)kO$ T6ogP2v=[jA$- Ro=ڸǍmN@ۋ))2mR."^g 1AŤRyL$zMZc\>F MQ}P8Bbad$$ú0_L8X>Oc%FU*F} M#|t1G0t-kGPdLWJXQ7+%Acѻџyɕם&,wE*V\Ei5#k2ȪceJϏ͍kq45[T f;ؔ5- ; [ѦKk(+Ibqzi76q%7z I4|A4A E,LUE=OXFOjzg x3C>;EDrRn5yl<3%i"+yAK Do_A֏ #? /v[hHf].7H_DD#"%GQbu/V(. =]Pbf7s!%K@LmaaCkýbWZ$b(E̖tr?3XЁKE7ru :*qGvJ/nP1I*ߵJSX$Eb.l?ҔCUŏz 5/XaMRJAU`rWܬ蒁MAD ^L=NӃ,5 ^Dѧ,tA(i4zLp҉sVW'>^ ɦCp^Č tEw: endstream endobj 255 0 obj << /Length1 1973 /Length2 13604 /Length3 0 /Length 14844 /Filter /FlateDecode >> stream xڵeTkCV,8݋[ݽPP9tof9J\^-OBE(ljg 932@v,F6&6&**QGH r(8{:XXx@]i 0<쁬3ӻ2i]D=--`gd鏷@`203ޅ;hadc3*IEu%UZ.vETUM] &&j0$UA j?y ˋ i+2+d'p jfv%X8;23189393O fh x:md^Ng  gi98I}/ӻ '' X9+$59AF wCg#g'_'Дo@R9;Ϳؽߙvm؁,Yw3K_2yai qU5F1۽Wxbr `cc8TŸYуd,AfnbϬtpJ?2s3tM,$kVYߋeog03qX/^NF@ 럊F\SK1_K<ߙK@ךҾ﨩` 4C`Vs~rI(ikfdki_c CFtN@S%KgL s #?e>Ǐ]tSib :9>h7i _V ;SK9#}89^Sm tkZL ;wOGD#dF<\fw߈l `sr-9̖im\l9Vtr||G@GKg}L. 
N?'_QVuvjZD]}XT9&-"b `d;}C5?'t X3dR+?]Mt\#%#mO,9P ?CobH*u%rTYhWޗE\x,[IKY'9Lvv1lzk\+ @}P!m 2\g.[b}fGtQnqC̯ӐsOm/e{I@堒6 0>Fn> ?s^3j7)cѨ“ $kٜb$W2Ӆ+EkYu͈+;?cȗAOl!Pr ^Vٞc)8D JHd'e ǂOYR6Im؆b$CCvPrev|@31D&Ct: |1g8Q`Eo@R(g7oϨ Q|#\i ]cHV<)gT_&fǼH}6x9oѱtח<@-y񔰉OAǭ{X^Iפ`bW)XŪ_REed7ڵM3>Vi1"@B*n) O"'zIHlʾ槴`!niXA$y>Ž= (h2C4'Oi֪_B`WQ8YM:15Wy86LP2bE2?87Z-1 q8"+JgB="56JֆSuqVk#GΔSSOm5xkW>m 3+ܓ"apNШ"aaB~Q‹aǹ uz|(?SRg1;7K*Rr*K`J\6 ZG$C7zM) {W3Ji6zT EuD ? gYqEO "4aP7E%eqQrESXR۫Ҧx O*=x] UP]4}+ XN ҳ*R=1B)wVn3ڏWc']BVx%?Ȩ*`ӜM q{ c `Z1DkU*yQ=y=) ʣ dsafyXt;L)E;)=O]:<)YrA n$5JEF`2awzl-ōܕij[Yb;Ua<'gͅF`"H5%i}Ͱr$`ž 9!Yo]/45ԲW _$:_46ywT4n<a8=v7r'eR&1Xk omy=<J(ˌ9l ֹ &W\%Y;ޖzJrNw0cTMҦo袞ˉ ž*5}Hʛ{hJc\\k4 %Ivbh>ɩzܘ8 Cg}*gE_ԾI˻-19aǀ W 2^ʘz* A jqGvH5IƬ^јs+9Ky6VIc?J H.zbNS6WWY+WvO\rO xE(,Z_ kl89AU]#,Ȑ#]Az#4L@s0\le^ѱ4n7"?1.攝=BZ:m^ 9B.#o[&@]K`r"sNJZ RJlH$J Z 6&L5wΙj>TbK3tvLÕX?(HV֟ )<`FVË|\Q#s%PGP,ᜳaC jy8G[<4Dv}di&%x cʰBUkX|Wz48z$ KĹl$6"3Oi{Q2cIC(@(!cO#䅬6%4;mԡ 'L=n{֣T687ZAJ4,#̘cU5a?Sgp/QVD`T)mӯiBEYR-I Ob鮶x~dȨl$ɦ5;ػe5O 7=vx~Ðe39fd`D i+4G~޾y>R0Ppnw˰ |9OA~@-#tvъE\T':ݣfVɗmU>D8Nt, $dV k'~ВoÆS?vE"?ւP4pB2|.M>$+#z8~w/7;(hS|8%Qb [Gh 裄VC3'u@JjvEȐw"|;9S<\Y"6]- Vt4B/%-uM?;9h0Tik`)L7l\R+CD!bv$|ЋYQnτbhOUp&nUC G*0_c{jd(?V趃י|q˄eӵ+ &*_|@ټ\x>~(dzTaդ-fۉHuj5ƑK.'Y%=C,13F$]IdZi>ם+ྼ[a ߼qyF'˝1-4)[FWZ|Kgbs!y)N13ع:س/=,gB!g\U8Tx wDi-ɭЄD-0[BҌvp`7ű!`{4C=/eX;#dVҁb=4klp4<qmeբ`⡈°V/:Mn oC\8Y2?{ܪ Ԅ.ŸvkB96%˯ s?ڸ`E/%'A;M5؛DܨdSK_l,|#Qvo}^ϛum<A.z)o&s剌CgjRd_BTΚ3dUVFB`;*t#4mpe7 -RMT[_`RToDJ85:EVaӄyAn-;]ڃ90X'Gi^{[*T2䔙1-#v XBA|`q4]݇U t1Yg$A~'=O⢁4Bdx8qB$@>-W,$`zwNTZL+UMS)vgq> pֆ$i2C8Nh.]O Gv8=?opڠ!6Zޕ˅)Ц4 U8 U3<E 91 WmV PqcljA8GtO+7f[Dr|2+AT6[?ӊJX"SC/bF7]e٦oW юC i4==h0w17W]EF%^rQP}Ekll|:^ lj)|:@ɑ7J2?XgzkFTHW$:;q,?4uBИ8>;P'WQI.׎ gCf7G2_^NGZ4G;`+sl'+הvJSo6#&I!wCqeCR ؁, 7g˳tsdy"zJRqk`e6pԾZKRrۅ c۠u'  .HW oZxM;sh@+FifF=g[L|,ީJ!ʅ\ZB M#9Y=,'B^FA-`G!ij.n{6$|֓]#0> >+{rɫ Z-Mw)ؔ-i}ܮS0]P">a먿tGV)Kkc=A:ljst*,L%G%X~B>b |"Yωy8=$}aK"7dytdl \u9KgvI=YbZ0 :@x(Usnxn:@*+P6EJʃ16Y9VF妜!t#i#s%#ī@WR?а9oKfpSt]\kBV󣶃>Z@ z T1B6syr[]=PK 
A~r*8거}Pb*yГ&‹L"hsPGJt7Tvd3a[>;_B#2uT]+~RtTIAƩ%ܡ斩zw^f"[Rl &ӔI*|dKHy Bm+dh5cLQר }~ Gb>õOa0`J7y}ͩgaVk*J5!IM(Z7x-0F.cG]/9uX4>4ɟBs;9sPdx{zx=A_ŋsX@>[US,kͰaWv$, nɯN/>~g;s(F^HMw%ٚH'8M(s5aLJW7FBIV;^R)mޜSLr(J[wyx?Bx36>NW5zrL+yO@= ūGzF":>NyW\SRB(oN @{kYV*B :.ybQUTA OI.T&z(s΀F+}c<{yl )KVN wTײ:dcjF4PІT ",9L C =dkyݭO_v9Ǝ7i\920zYD^f}O6Տc/\IPl祭pv^t{u@8J!h!MFvAS]3;AXF QȬWrk5O1{N΍iaʓ9apM_)CzkŇ[6m._GN~`aYb=Vf۴ItY{ &N;? nwWx'C|C|sk3GKѨ~IҪ}hmk+GpKM eS 7V3i%FpE'@FMJSZ!o dWRv4I͋|n?O'}%5㚀mIY(H{E1J\o1SQB Ji}8u `G0x'6Q2'h㋉fט"ԭI(=E rk=Zd ǏHhEٺvYR}A_3H/MaC| gft:{|K%ʐ,K$4L}! ݼ̥aFTE(-8%ةɀ|rP83Loe6WB|$&/YL >h!$I}wy(٣&E,tњN܃`bPt5Q=S)4A<:zAvHG,̠}UͦL&0jY=>+Ww(,+> zQ_Kye ڟ+#4~ҁ_mmS7DRl;¯ Ўe \UZo%?#=)C292'Bi48f~G,pR xpP#6?$ȡi&VZ6t`W#4>W3K';S2H_[pR1ɐ%C)j!#VҰU_ܔй+tflPMa\,)YPHdI/c\h4) Yư;pDA\rw`%k}M'V{>JIƭN)o- u>æ1X\VLؗiueߥBp /tiRH=[V-\a j.u]Txnct 3@{,1ڜ!Y)CPMѡ,c) -b)Y"]5ZBE_|{{;xzuW(!q#=);x]SvZ 3 C2r9V_!݀jуI GFg {: ʅ V'XH>EHfvW _nj*gƠh?$XB`[csj@0QY㠄S͒ Q|oul£cpT<۸®urX16m!J 7&GKDEmnfWHs(`cT[wVb'acTbؼ%+<K 겆elq% Ko&ymقKT.|TsKVC-bBB1R y={RHT+\E_I (mғaƙ1BFh^1A]΄[v+8n^C3Z<$⏽V$#R.DEAg,Lj#tvmh3+35no?@ ` w1MMxk]X=vl:K qtb7jS%L{]KO%A{/HF8}jɸrtÚH0c7t0.yPq3E_LL@AxG_n#϶JRy8dN&M0ŦWC,gF,8f0A| ̝=FErqΖ7yvcM()a*[OG%/%12_ p]jZίELm1W+v 4^ƥvAsNĕWt%ekE_~=Smwb?ҵ?O~S4zl#j6z>Q4hvo]~4*UIͦD>" +y쭮 7ǒ!y ybli bEX_4#K쁆xx$>\4|ֱ8>p@Tq̫eqNmLW:}Ff=WRR[/s)"L>h>%zctg@mUɛLẻ3m͊!Fأtڢ-$ M/Zn#qSSqVvo oiՊh[TvF\1Ӻ? 8ԴPײ7 BwnѺG!z]~RѓC`JCSSvV(J0}RZ [o40YG>m%NНEB%{EY6 T\p?U`Vh=*'dP$BF5 U.q1wTʝN,`@ gf:2Kfeq(~ 3NݰT:c\[=SW> endobj 222 0 obj << /Type /ObjStm /N 80 /First 724 /Length 3811 /Filter /FlateDecode >> stream xr6]_xwdw`әiZIINZm5芴t qpp<3!dƕ:M&%>LZ|*g: Repm2LHqgB0&<p$qNUG:`tytILhI瀔V0ɔ1YLy\LK/`ȴ}M,5x~=}frg4(;n Hrx23$ !Q89L ryVXJL*b̒ M=GGˆ̡Ĩτљs77 &Fmy `8 9(y48r8̈́FX3hG •J̠.ΠT 3!O8h8vt@C| )GH.А`; ȄҢy! 
ބG͑Wi=  8pXkMV\m-pa"pxC}4⋌fUك E:Q}_I-tVZ{eYlXw7lڭuO܋1_yʖ_K5nIl[YivAK)/4_-Sf榯ypE!!_ǘձ1ϔ4P:Va4CfnqFCw1!k=)tHLa8ʈs :δCŽj΁X sy#b ^ck"\Y[i=AX{{|_; .zĕ0jpdfa=s#:gѨ+Z6I*q9XO$) ۀ><772']JK> hDwU@kJVa.s2E#*0:)tw!)OIJ` h H@LA]D:'`yMkG6xNw\h=o.%Z#@dC{9-{Ms]FC.% IY5{+\HFwQFG6:IS<*8b(ZEE)B1x ʌxUfImbj݌r?}[U%lXuGٯs\xť|;cEm\]\G$9pISWۋu3vڔ7x㘱#qYN&{+[%+d+ٖUXv.dW<{9F>?ؑeyq.v(sEM+~~>Er+C}A߰o }Ϟg b^7'3)z.ӢZW[7tFbuaV~nJvf{܂'Eo@ն}W0^bWnyf;:ܭ%q]5l&V*pV7p|uQ_]Y}.]v%a,wUS,O'{ >-{.꿷.h;ծk`\4 uo7d}kmqp],7(5():yTԳj[=mUU69V>NAN"{6noQ^'ؿaz"k)"?{}(#{B"Un_@Ab.KVԋժY%b.YcgbQR,'NߕM e Ry !}\~Fe]Z.Z_݄;xӓ@@3~mԽ89;k ¡43 tU.:Q..j{\K͢_ʱqff_RZ3u(㜝2> |.:Qd? JF'#7$ ' >J޶; D7+\*è A?x #^w<ф~?:mƝq!˄Pb_qD^5|D=c/K~-dt]hgi8il. '=e2Th熼!ovyèFd4;|׭ușr;埃[35LgZ"@[#@M9Q#/ WuaEqQP>k,3j ِPEPpߖ$5:2G<2wx txj*M_` ؕ7z+A-}m$vdyw,>+`99q";Nq`3=> <7${'s>+`\>=y_@P6j1?m]sNj{-TK,t.$ C_6oʺA lyGKxkQ44XΏ[ Þ6fpۡ0=8Pq*AEMK@j]2 2dn3-3S9(xT~B '+7bR{4™Gs`R\| }` ,̔@gZ g&ȄA Ab!Ƥ]'O2pScz [a8`PN%顤di7"`[1` GyQާy*aM$)A \! Nt NA}vJ0r5)&&y}M…O.S䟫:|n$#!a56Es:w)&u.84(y jbZ0s4ƚ̐eB6RdE)x S"A66֤`,T ZLʰIAb*aKGjR  _H&Y &}=$e"!+:(U:* %ܔ8gJ'@<JȃU B)zbhB('O6E=]U):I 47SaM F"4:Ae2MAPVx#O G.XQp$Xf; N?p&A5z,FaM ډDІÅF"ALAWS#I u M m u "SLTr\&G$@A-:6Ʉ2l$Dv2?LzX"B-V:ޫLb)ҐJXj{d09{ C ]<"q-,èwX\ Qr(8Z޺0lnSqCigJvvH*nƋش gLB6\Ħ[į( p: WESYx}b^:X}uQ>crB#k?JHC endstream endobj 296 0 obj << /Type /XRef /Index [0 297] /Size 297 /W [1 3 1] /Root 294 0 R /Info 295 0 R /ID [<4F8F78473320EDB01AB25D61B78D3816> <4F8F78473320EDB01AB25D61B78D3816>] /Length 654 /Filter /FlateDecode >> stream x;sUU6(rAQ0^*wF@` Cv~ k{>7p9tVg5IYZkOM.=-ㅵNˌlXM=()or*/jVd^dl!;M'o9ۿ-twcŸ'w_;n#| GCHdbo7D>s9!KrR/+9%匜sr^.rQ.eY+rUoeEkr]n(7ܖg#wܗ_<_}r7𒵽 $Xd}oKƫ6F"X`,E debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true 
true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true 
true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true libxsmm {01CE57B8-270B-48B6-8BD5-CFD30B8EFCDC} 10.0 StaticLibrary Disabled Disabled v142 true StaticLibrary true true Disabled Disabled v142 StaticLibrary true Disabled Disabled v142 true StaticLibrary Disabled Disabled v142 true true StaticLibrary true Disabled Disabled v142 StaticLibrary true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 $(SolutionDir)..\lib\ia32\ $(SolutionDir)..\lib\ia32\ $(SolutionDir)..\lib\ia32\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\lib\intel64\ $(SolutionDir)..\lib\intel64\ $(SolutionDir)..\lib\intel64\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ Full 
$(SolutionDir)..\include;$(SolutionDir)..\obj;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Fast NoTraps true true false 4710;4711;4752;4820 SingleFile 177,3948,10373,10382 SSE42 EnableAllWarnings None true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) dbghelp.lib;mkl_rt.lib;%(AdditionalDependencies) MaxSpeed $(SolutionDir)..\include;$(SolutionDir)..\obj;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Fast NoTraps true true false 4710;4711;4752;4820 None SingleFile 177,3948,10373,10382 SSE42 EnableAllWarnings true $(OutDir)$(TargetName).pdb 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) dbghelp.lib;mkl_rt.lib;%(AdditionalDependencies) true X64 Full $(SolutionDir)..\include;$(SolutionDir)..\obj;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Fast NoTraps true true false 4710;4711;4752;4820 SingleFile 177,3948,10373,10382 SSE42 EnableAllWarnings None true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
dbghelp.lib;mkl_rt.lib;%(AdditionalDependencies) X64 MaxSpeed $(SolutionDir)..\include;$(SolutionDir)..\obj;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Fast NoTraps true true false 4710;4711;4752;4820 None SingleFile 177,3948,10373,10382 SSE42 EnableAllWarnings true $(OutDir)$(TargetName).pdb 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) dbghelp.lib;mkl_rt.lib;%(AdditionalDependencies) true Disabled $(SolutionDir)..\include;$(SolutionDir)..\obj;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL false 4710;4711;4752;4820;6011 None 177,3948,10373,10382 SSE42 Level4 true $(OutDir)$(TargetName).pdb 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) dbghelp.lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT X64 Disabled $(SolutionDir)..\include;$(SolutionDir)..\obj;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL false 4710;4711;4752;4820;6011 None 177,3948,10373,10382 SSE42 Level4 true $(OutDir)$(TargetName).pdb 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
dbghelp.lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT libxsmm-1.17/ide/libxsmm.vcxproj.filters000066400000000000000000001074661415223013700204310ustar00rootroot00000000000000 {9c7d8219-7adb-4410-8ceb-c7422106af6a} {df2aad30-e992-4052-8f3a-f1343f4e3f33} {13af66c8-51ff-4b9e-94fe-53a686556d47} {c4a6fa16-23e9-4fb6-9148-c47da5e490e6} {23f4992b-2a02-4f5a-9287-9d00762dfcef} include src include include include include src src include src src src include src src src src include include include include include include include include include src src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src src src include src src src include include src src src\generator src\generator src\generator src\generator src src src src include src src\generator src\generator include include include include include include include src src src\generator src src src src include src\generator src src src src src src src src src src src src src src src src src src src src src src src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src\generator src src src\template src\template src\template src\template src src src 
src\template src\template src\template src\template src\template src\template src\template src src src src src src\generator src\generator src\generator src\template src\template src\template src\template src\template src\template src\generator src\template src src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\generator src\generator src src src src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src\template src src src src\generator src\template src\template src\template src src\template src\template src src src src src src\template src\template src\template src\generator src\generator scripts scripts scripts scripts scripts scripts scripts scripts scripts scripts scripts scripts scripts include libxsmm-1.17/ide/libxsmm_generator_gemm_driver.vcxproj000066400000000000000000000546431415223013700234060ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 libxsmm_generator_gemm_driver {47EDE325-4516-48DA-862B-F689F12DDBD3} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 true Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 ProgramDatabase None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 ProgramDatabase None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/ide/libxsmmext.vcxproj000066400000000000000000000574001415223013700174730ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 libxsmmext {FFFB45C1-DFE2-477E-ACA1-EF5906463BEA} 10.0 StaticLibrary Disabled Disabled v142 true StaticLibrary true true Disabled Disabled v142 StaticLibrary true Disabled Disabled v142 true StaticLibrary Disabled Disabled v142 true true StaticLibrary true Disabled Disabled v142 StaticLibrary true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 $(SolutionDir)..\lib\ia32\ $(SolutionDir)..\lib\ia32\ $(SolutionDir)..\lib\ia32\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\lib\intel64\ $(SolutionDir)..\lib\intel64\ $(SolutionDir)..\lib\intel64\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
$(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;LIBXSMM_BUILD_EXT;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Fast NoTraps true true false 4710;4711;4752;4820 SingleFile 177,3948,10373,10382 SSE42 EnableAllWarnings None GenerateParallelCode true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;LIBXSMM_BUILD_EXT;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Fast NoTraps true true false 4710;4711;4752;4820 None SingleFile 177,3948,10373,10382 SSE42 EnableAllWarnings GenerateParallelCode true true $(OutDir)$(TargetName).pdb 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;LIBXSMM_BUILD_EXT;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Fast NoTraps 
true true false 4710;4711;4752;4820 SingleFile 177,3948,10373,10382 SSE42 EnableAllWarnings None GenerateParallelCode true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;LIBXSMM_BUILD_EXT;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Fast NoTraps true true false 4710;4711;4752;4820 None SingleFile 177,3948,10373,10382 SSE42 EnableAllWarnings GenerateParallelCode true true $(OutDir)$(TargetName).pdb 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;LIBXSMM_BUILD_EXT;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL false 4710;4711;4752;4820;6011 None 177,3948,10373,10382 SSE42 Level4 GenerateParallelCode true true $(OutDir)$(TargetName).pdb 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT X64 Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;LIBXSMM_BUILD_EXT;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL false 4710;4711;4752;4820;6011 None 177,3948,10373,10382 SSE42 Level4 GenerateParallelCode true true $(OutDir)$(TargetName).pdb 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT libxsmm-1.17/ide/libxsmmext.vcxproj.filters000066400000000000000000000020201415223013700211260ustar00rootroot00000000000000 {87aa2ea7-b77a-44e0-85e3-6dad5fa5b41a} src src src src src libxsmm-1.17/ide/libxsmmnoblas.vcxproj000066400000000000000000000526741415223013700201610ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 libxsmmnoblas {7C7D5D58-A367-414E-87B4-C23B8A578EC5} 10.0 StaticLibrary Disabled Disabled v142 true StaticLibrary true true Disabled Disabled v142 StaticLibrary true Disabled Disabled v142 true StaticLibrary Disabled Disabled v142 true true StaticLibrary true Disabled Disabled v142 StaticLibrary true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 $(SolutionDir)..\lib\ia32\ $(SolutionDir)..\lib\ia32\ $(SolutionDir)..\lib\ia32\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\lib\intel64\ $(SolutionDir)..\lib\intel64\ $(SolutionDir)..\lib\intel64\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
$(SolutionDir)..\obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ Full $(SolutionDir)..\include;$(SolutionDir)..\obj;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Fast NoTraps true true false 4710;4711;4752;4820 SingleFile 177,3948,10373,10382 SSE42 EnableAllWarnings None true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MaxSpeed $(SolutionDir)..\include;$(SolutionDir)..\obj;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Fast NoTraps true true false 4710;4711;4752;4820 None SingleFile 177,3948,10373,10382 SSE42 EnableAllWarnings true $(OutDir)$(TargetName).pdb 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true X64 Full $(SolutionDir)..\include;$(SolutionDir)..\obj;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Fast NoTraps true true false 4710;4711;4752;4820 SingleFile 177,3948,10373,10382 SSE42 EnableAllWarnings None true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console X64 MaxSpeed $(SolutionDir)..\include;$(SolutionDir)..\obj;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Fast NoTraps true true false 4710;4711;4752;4820 None SingleFile 177,3948,10373,10382 SSE42 EnableAllWarnings true $(OutDir)$(TargetName).pdb 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true Disabled $(SolutionDir)..\include;$(SolutionDir)..\obj;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;__BLAS=0;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL false 4710;4711;4752;4820;6011 None 177,3948,10373,10382 SSE42 Level4 true $(OutDir)$(TargetName).pdb 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT X64 Disabled $(SolutionDir)..\include;$(SolutionDir)..\obj;$(LIBXSMMROOT)\include;$(LIBXSMMROOT)\obj;$(LIBXSMMROOT)\src;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_BUILD;__BLAS=0;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL false 
4710;4711;4752;4820;6011 None 177,3948,10373,10382 SSE42 Level4 true $(OutDir)$(TargetName).pdb 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmm-1.17/ide/libxsmmnoblas.vcxproj.filters000066400000000000000000000013341415223013700216130ustar00rootroot00000000000000 {c90de36f-1f7f-43c9-943d-2983d99302be} src src libxsmm-1.17/include/000077500000000000000000000000001415223013700145365ustar00rootroot00000000000000libxsmm-1.17/include/libxsmm.h000066400000000000000000002167531415223013700164000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_H #define LIBXSMM_H #include "libxsmm_config.h" /** * Strings to denote the version of LIBXSMM (libxsmm_config.h). * LIBXSMM_VERSION: Name of the version (stringized version numbers). * LIBXSMM_BRANCH: Name of the branch this version is derived from. */ #define LIBXSMM_VERSION LIBXSMM_CONFIG_VERSION #define LIBXSMM_BRANCH LIBXSMM_CONFIG_BRANCH /** * Semantic version according to https://semver.org/ (see also libxsmm_config.h). * LIBXSMM_VERSION_MAJOR: Major version derived from the most recent RCS-tag. * LIBXSMM_VERSION_MINOR: Minor version derived from the most recent RCS-tag. * LIBXSMM_VERSION_UPDATE: Update number derived from the most recent RCS-tag. * LIBXSMM_VERSION_PATCH: Patch number based on distance to most recent RCS-tag. 
*/ #define LIBXSMM_VERSION_MAJOR LIBXSMM_CONFIG_VERSION_MAJOR #define LIBXSMM_VERSION_MINOR LIBXSMM_CONFIG_VERSION_MINOR #define LIBXSMM_VERSION_UPDATE LIBXSMM_CONFIG_VERSION_UPDATE #define LIBXSMM_VERSION_PATCH LIBXSMM_CONFIG_VERSION_PATCH /** * The following interfaces shall be explicitly included, * i.e., separate from libxsmm.h: * - libxsmm_intrinsics_x86.h * - libxsmm_cpuid.h * - libxsmm_sync.h * - libxsmm_mhd.h */ #include "libxsmm_dnn_convolution.h" #include "libxsmm_dnn_fullyconnected.h" #include "libxsmm_dnn_fusedbatchnorm.h" #include "libxsmm_dnn_fusedgroupnorm.h" #include "libxsmm_dnn_pooling.h" #include "libxsmm_dnn_rnncell.h" #include "libxsmm_dnn_softmaxloss.h" #include "libxsmm_dnn_optimizer.h" #include "libxsmm_blocked_gemm.h" #include "libxsmm_generator.h" #include "libxsmm_frontend.h" #include "libxsmm_fsspmdm.h" #include "libxsmm_malloc.h" #include "libxsmm_spmdm.h" #include "libxsmm_cpuid.h" #include "libxsmm_timer.h" #include "libxsmm_math.h" #include "libxsmm_rng.h" /** Initialize the library; pay for setup cost at a specific point. */ LIBXSMM_API void libxsmm_init(void); /** De-initialize the library and free internal memory (optional). */ LIBXSMM_API void libxsmm_finalize(void); /** * Returns the architecture and instruction set extension as determined by the CPUID flags, as set * by the libxsmm_get_target_arch* functions, or as set by the LIBXSMM_TARGET environment variable. */ LIBXSMM_API int libxsmm_get_target_archid(void); /** Set target architecture (id: see libxsmm_typedefs.h) for subsequent code generation (JIT). */ LIBXSMM_API void libxsmm_set_target_archid(int id); /** * Returns the name of the target architecture as determined by the CPUID flags, as set by the * libxsmm_get_target_arch* functions, or as set by the LIBXSMM_TARGET environment variable. */ LIBXSMM_API const char* libxsmm_get_target_arch(void); /** Set target architecture (arch="0|sse|snb|hsw|knl|knm|skx|clx|cpx", NULL/"0": CPUID). 
*/ LIBXSMM_API void libxsmm_set_target_arch(const char* arch); /** Get the level of verbosity. */ LIBXSMM_API int libxsmm_get_verbosity(void); /** * Set the level of verbosity (0: off, positive value: verbosity level, * negative value: maximum verbosity, which also dumps JIT-code) */ LIBXSMM_API void libxsmm_set_verbosity(int level); /** Get the default prefetch strategy. */ LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_auto_prefetch(void); /** Set the default prefetch strategy. */ LIBXSMM_API void libxsmm_set_gemm_auto_prefetch(libxsmm_gemm_prefetch_type strategy); /** Receive information about JIT-generated code. */ LIBXSMM_API int libxsmm_get_kernel_info(const void* kernel, libxsmm_kernel_info* info); /** Get information about the matrix multiplication kernel. */ LIBXSMM_API int libxsmm_get_mmkernel_info(libxsmm_xmmfunction kernel, libxsmm_mmkernel_info* info); /** Get information about the matrix transpose kernel. */ LIBXSMM_API int libxsmm_get_transkernel_info(libxsmm_xtransfunction kernel, libxsmm_transkernel_info* info); /** Get information about the matrix copy kernel. */ LIBXSMM_API int libxsmm_get_mcopykernel_info(libxsmm_xmcopyfunction kernel, libxsmm_mcopykernel_info* info); /** Get information about the matrix eltwise kernel. */ LIBXSMM_API int libxsmm_get_meltwkernel_info(libxsmm_xmeltwfunction kernel, libxsmm_meltwkernel_info* info); /** Get information about the code registry. */ LIBXSMM_API int libxsmm_get_registry_info(libxsmm_registry_info* info); /** * Register user-defined key-value. * Since the key-type is unknown to LIBXSMM, the key must be binary reproducible, * i.e., if it is a structured type (padded data may be uninitialized), it must * be initially zero-filled (memset) followed by an element-wise initialization. * The size of the key is limited (see documentation). The given value is copied * by LIBXSMM and may be initialized at registration-time or whenever queried. 
* Registered data is released at program termination but can be also released * if needed (libxsmm_xrelease), .e.g., for larger value for the same key. */ LIBXSMM_API void* libxsmm_xregister(const void* key, size_t key_size, size_t value_size, const void* value_init); /** Query user-defined value from LIBXSMM's code registry. */ LIBXSMM_API void* libxsmm_xdispatch(const void* key, size_t key_size); /** Remove key-value pair from code registry and release memory. */ LIBXSMM_API void libxsmm_xrelease(const void* key, size_t key_size); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (descriptor form). */ LIBXSMM_API libxsmm_xmmfunction libxsmm_xmmdispatch(const libxsmm_gemm_descriptor* descriptor); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction libxsmm_dmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (single-precision). 
*/ LIBXSMM_API libxsmm_smmfunction libxsmm_smmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (bf16 inputs, fp32-accumulate) */ LIBXSMM_API libxsmm_bsmmfunction libxsmm_bsmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs) */ LIBXSMM_API libxsmm_bmmfunction libxsmm_bmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (low/short-precision, int-accumulate) */ LIBXSMM_API libxsmm_wimmfunction libxsmm_wimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (low/char-precision, int-accumulate) */ LIBXSMM_API libxsmm_ssbimmfunction libxsmm_ssbimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); LIBXSMM_API libxsmm_usbimmfunction 
libxsmm_usbimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); LIBXSMM_API libxsmm_subimmfunction libxsmm_subimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); LIBXSMM_API libxsmm_uubimmfunction libxsmm_uubimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (low/char-precision, int-accumulate, int8 outputs) */ LIBXSMM_API libxsmm_sububmmfunction libxsmm_sububmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (single-precision). 
*/ LIBXSMM_API libxsmm_smmfunction_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate). */ LIBXSMM_API libxsmm_bsmmfunction_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs). */ LIBXSMM_API libxsmm_bmmfunction_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int16 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_wimmfunction_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_usbimmfunction_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_subimmfunction_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_uubimmfunction_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate, int8 outputs). 
*/ LIBXSMM_API libxsmm_sububmmfunction_reducebatch_addr libxsmm_sububmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (single-precision). */ LIBXSMM_API libxsmm_smmfunction_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /* Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate). */ LIBXSMM_API libxsmm_bsmmfunction_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs). 
*/ LIBXSMM_API libxsmm_bmmfunction_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int16 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_wimmfunction_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_usbimmfunction_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_subimmfunction_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_uubimmfunction_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate, int8 outputs). */ LIBXSMM_API libxsmm_sububmmfunction_reducebatch_addr libxsmm_sububmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (single-precision). 
*/ LIBXSMM_API libxsmm_smmfunction_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate). */ LIBXSMM_API libxsmm_bsmmfunction_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs). */ LIBXSMM_API libxsmm_bmmfunction_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int16 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_wimmfunction_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_usbimmfunction_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_subimmfunction_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_uubimmfunction_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate, int8 outputs). 
*/ LIBXSMM_API libxsmm_sububmmfunction_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (single-precision). */ LIBXSMM_API libxsmm_smmfunction_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate). */ LIBXSMM_API libxsmm_bsmmfunction_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs). 
*/ LIBXSMM_API libxsmm_bmmfunction_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int16 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_wimmfunction_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_usbimmfunction_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_subimmfunction_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_uubimmfunction_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate, int8 outputs). */ LIBXSMM_API libxsmm_sububmmfunction_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (single-precision). 
*/ LIBXSMM_API libxsmm_smmfunction_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate). */ LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs). */ LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int16 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_wimmfunction_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_usbimmfunction_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_subimmfunction_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_uubimmfunction_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate, int8 outputs). 
*/ LIBXSMM_API libxsmm_sububmmfunction_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (single-precision). */ LIBXSMM_API libxsmm_smmfunction_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate). 
*/ LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs). */ LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int16 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_wimmfunction_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_usbimmfunction_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_subimmfunction_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_uubimmfunction_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate, int8 outputs). */ LIBXSMM_API libxsmm_sububmmfunction_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** * Process a series of matrix multiplications (batch). See also libxsmm_gemm_batch/omp. * The kind of matrix operands (a, b, c) depend on index_stride: * index_stride==0: pointers to pointers of elements, e.g., double** for the C matrices. * index_stride!=0: pointer to elements, e.g., const double* for the A and B matrices. */ LIBXSMM_API void libxsmm_mmbatch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, /** Determines index-base (usually 0, 1 for one-based indexes); uses the same unit as the strides. */ libxsmm_blasint index_base, /** * Stride used to walk stride_a, stride_b, and stride_c; zero turns stride_* into scalar values. * The index_stride is measured in Bytes (sizeof(libxsmm_blasint) determines packed indexes). 
*/ libxsmm_blasint index_stride, /** * Depending on index_stride, the meaning of stride_a, stride_b, and stride_c is different. * index_stride==0: stride_a, stride_b, and stride_c are pointers to scalar values. * index_stride!=0: stride_* are indexes determining the position of a, b, and c operands. */ const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], /** * Number of matrix multiplications. If the size is given as a negative value, * then internal synchronization is omitted. */ libxsmm_blasint batchsize, /** Thread-ID (TID), and number of threads. */ /*unsigned*/int tid, /*unsigned*/int nthreads); /** Process a series of matrix multiplications (batch). See also libxsmm_mmbatch. */ LIBXSMM_API void libxsmm_gemm_batch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize); /** Process a series of matrix multiplications (batch) with OpenMP (libxsmmext). See also libxsmm_mmbatch. 
*/ LIBXSMM_APIEXT void libxsmm_gemm_batch_omp(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize); /** Unlike libxsmm_gemm_batch, groups of homogeneous batches are possible (double-precision). */ LIBXSMM_API void libxsmm_dgemm_batch(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]); /** Unlike libxsmm_gemm_batch, groups of homogeneous batches are possible (single-precision). */ LIBXSMM_API void libxsmm_sgemm_batch(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]); /** Unlike libxsmm_gemm_batch, groups of homogeneous batches are possible (double-precision). 
*/ LIBXSMM_APIEXT void libxsmm_dgemm_batch_omp(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]); /** Unlike libxsmm_gemm_batch, groups of homogeneous batches are possible (single-precision). */ LIBXSMM_APIEXT void libxsmm_sgemm_batch_omp(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]); /** * This function is a no-op unless LIBXSMM is built to intercept GEMM calls. * Pointer arguments are used to filter intercepted GEMM calls such that * non-NULL values match. Otherwise (NULL) the respective argument is * considered a "free value", i.e., every value can match; libxsmmext required. */ LIBXSMM_APIEXT void libxsmm_mmbatch_begin(libxsmm_gemm_precision precision, const int* flags, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const void* alpha, const void* beta); /** Processes the batch of previously recorded matrix multiplications (libxsmm_mmbatch_begin); libxsmmext required. */ LIBXSMM_APIEXT void libxsmm_mmbatch_end(void); /** Code generation routine for matrix-copy using a descriptor. 
*/ LIBXSMM_API libxsmm_xmcopyfunction libxsmm_dispatch_mcopy(const libxsmm_mcopy_descriptor* descriptor); /** Code generation routine for matrix-eltwise using a descriptor. */ LIBXSMM_API libxsmm_xmeltwfunction libxsmm_dispatch_meltw(const libxsmm_meltw_descriptor* descriptor); LIBXSMM_API libxsmm_meltwfunction_copy libxsmm_dispatch_meltw_copy(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type); LIBXSMM_API libxsmm_meltwfunction_zero libxsmm_dispatch_meltw_zero(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type); LIBXSMM_API libxsmm_meltwfunction_add libxsmm_dispatch_meltw_add(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type); LIBXSMM_API libxsmm_meltwfunction_mul libxsmm_dispatch_meltw_mul(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type); LIBXSMM_API libxsmm_meltwfunction_relu libxsmm_dispatch_meltw_relu(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type); LIBXSMM_API libxsmm_meltwfunction_cvtfp32bf16 libxsmm_dispatch_meltw_cvtfp32bf16(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type); LIBXSMM_API libxsmm_meltwfunction_cvtfp32bf16_act libxsmm_dispatch_meltw_cvtfp32bf16_act(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_cvta_flags flags); LIBXSMM_API libxsmm_meltwfunction_act_cvtfp32bf16 libxsmm_dispatch_meltw_act_cvtfp32bf16(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* 
ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_acvt_flags flags); LIBXSMM_API libxsmm_meltwfunction_reduce libxsmm_dispatch_meltw_reduce(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_redu_flags flags); LIBXSMM_API libxsmm_meltwfunction_scale libxsmm_dispatch_meltw_scale(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_scal_flags flags); /** Code generation routine for transposes using a descriptor */ LIBXSMM_API libxsmm_xtransfunction libxsmm_dispatch_trans(const libxsmm_trans_descriptor* descriptor); /** Code generation routine for GEMM/packed using a descriptor */ LIBXSMM_API libxsmm_pgemm_xfunction libxsmm_dispatch_pgemm(const libxsmm_pgemm_descriptor* descriptor); /** Code generation routine for GETRF/packed using a descriptor */ LIBXSMM_API libxsmm_getrf_xfunction libxsmm_dispatch_getrf(const libxsmm_getrf_descriptor* descriptor); /** Code generation routine for TRMM/packed using a descriptor */ LIBXSMM_API libxsmm_trmm_xfunction libxsmm_dispatch_trmm(const libxsmm_trmm_descriptor* descriptor); /** Code generation routine for TRSM/packed using a descriptor */ LIBXSMM_API libxsmm_trsm_xfunction libxsmm_dispatch_trsm(const libxsmm_trsm_descriptor* descriptor); /** * Code generation routine for the CSR format which multiplies a dense SOA matrix (each element holds a SIMD-width * wide vector) and a sparse matrix or a sparse matrix with a dense SOA matrix. * The result is always a SOA matrix. There is no code cache, and user code has to manage the code pointers. * Call libxsmm_release_kernel in order to deallocate the JIT'ted code. 
*/ LIBXSMM_API libxsmm_xmmfunction libxsmm_create_xcsr_soa(const libxsmm_gemm_descriptor* descriptor, const unsigned int* row_ptr, const unsigned int* column_idx, const void* values, unsigned int packed_width); /** * Code generation routine for the CSC format which multiplies a dense SOA matrix (each element holds a SIMD-width * wide vector) and a sparse matrix or a sparse matrix with a dense SOA matrix. * The result is always a SOA matrix. There is no code cache, and user code has to manage the code pointers. * Call libxsmm_release_kernel in order to deallocate the JIT'ted code. */ LIBXSMM_API libxsmm_xmmfunction libxsmm_create_xcsc_soa(const libxsmm_gemm_descriptor* descriptor, const unsigned int* column_ptr, const unsigned int* row_idx, const void* values, unsigned int packed_width); /** * Code generation routine for row-major format B matrix which is multiplied by a dense packed matrix (each element holds a SIMD-width * wide vector) and the result is another packed matrix. The memory layout of the SOA matrix is [row][col][packed]. * here is no code cache, and user code has to manage the code pointers. * Call libxsmm_release_kernel in order to deallocate the JIT'ted code. */ LIBXSMM_API libxsmm_xmmfunction libxsmm_create_pgemm_ac_rm(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width); /** * Code generation routine for row-major format A matrix which is multiplied by a dense packed matrix (each element holds a SIMD-width * wide vector) and the result is another packed matrix. The memory layout of the packed matrix is [row][col][packed]. * here is no code cache, and user code has to manage the code pointers. * Call libxsmm_release_kernel in order to deallocate the JIT'ted code. */ LIBXSMM_API libxsmm_xmmfunction libxsmm_create_pgemm_bc_rm(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width); /** * Code generation routine for the CSR format which multiplies a dense matrix B into a dense matrix C. 
* The sparse matrix a is kept in registers. * Call libxsmm_release_kernel in order to deallocate the JIT'ted code. */ LIBXSMM_API libxsmm_dmmfunction libxsmm_create_dcsr_reg(const libxsmm_gemm_descriptor* descriptor, const unsigned int* row_ptr, const unsigned int* column_idx, const double* values); /** * Code generation routine for the CSR format which multiplies a dense matrix B into a dense matrix C. * The sparse matrix a is kept in registers. * Call libxsmm_release_kernel in order to deallocate the JIT'ted code. */ LIBXSMM_API libxsmm_smmfunction libxsmm_create_scsr_reg(const libxsmm_gemm_descriptor* descriptor, const unsigned int* row_ptr, const unsigned int* column_idx, const float* values); /** * Deallocates the JIT'ted code as returned by libxsmm_create_* functions, * unregisters and releases code from the code registry. */ LIBXSMM_API void libxsmm_release_kernel(const void* kernel); /** Matrix copy function ("in" can be NULL to zero the destination). */ LIBXSMM_API void libxsmm_matcopy(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo); /** Matrix copy function ("in" can be NULL to zero the destination, per-thread form). */ LIBXSMM_API void libxsmm_matcopy_thread(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo, /*unsigned*/int tid, /*unsigned*/int nthreads); /** Matrix copy function ("in" can be NULL to zero the destination); MT via libxsmmext. */ LIBXSMM_APIEXT void libxsmm_matcopy_omp(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo); /** Matrix transposition (out-of-place form). */ LIBXSMM_API void libxsmm_otrans(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo); /** Matrix transposition (out-of-place form, per-thread form). 
*/ LIBXSMM_API void libxsmm_otrans_thread(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo, /*unsigned*/int tid, /*unsigned*/int nthreads); /** Matrix transposition; MT via libxsmmext (out-of-place form). */ LIBXSMM_APIEXT void libxsmm_otrans_omp(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo); /** Matrix transposition (in-place form). */ LIBXSMM_API void libxsmm_itrans(void* inout, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld); /** Initialize GEMM-handle; allows to better amortize setup overhead. */ LIBXSMM_API libxsmm_gemm_handle* libxsmm_gemm_handle_init(libxsmm_gemm_blob* blob, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const void* alpha, const void* beta, int flags, /*unsigned*/int ntasks); /** Calculate required scratch buffer size needed to perform libxsmm_gemm_thread. */ LIBXSMM_API size_t libxsmm_gemm_handle_get_scratch_size(const libxsmm_gemm_handle* handle); /** Low-level type-agnostic GEMM suitable for external threads or tasks. */ LIBXSMM_API void libxsmm_gemm_thread(const libxsmm_gemm_handle* handle, void* scratch, const void* a, const void* b, void* c, /*unsigned*/int tid, /*unsigned*/int nthreads); /** General dense matrix multiplication (sequential). 
*/ LIBXSMM_API void libxsmm_xgemm(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc); /** General dense matrix multiplication (libxsmmext); available as xgemm (generic), dgemm (DP), and sgemm (SP). */ LIBXSMM_APIEXT void libxsmm_xgemm_omp(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc); /** Dispatched general dense matrix multiplication (double-precision). */ LIBXSMM_API void libxsmm_dgemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc); /** Dispatched general dense matrix multiplication (single-precision). */ LIBXSMM_API void libxsmm_sgemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc); /** Dispatched general dense matrix multiplication (I16 input, I32 result). 
*/ LIBXSMM_API void libxsmm_wigemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const int* alpha, const short* a, const libxsmm_blasint* lda, const short* b, const libxsmm_blasint* ldb, const int* beta, int* c, const libxsmm_blasint* ldc); /** Dispatched general dense matrix multiplication (BF16 input, F32 result). */ LIBXSMM_API void libxsmm_bsgemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const libxsmm_bfloat16* a, const libxsmm_blasint* lda, const libxsmm_bfloat16* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc); #if !defined(LIBXSMM_DEFAULT_CONFIG) && !defined(LIBXSMM_SOURCE_H) #endif /*!defined(LIBXSMM_DEFAULT_CONFIG)*/ #if defined(__cplusplus) /** Map a built-in type to libxsmm_gemm_precision (libxsmm_gemm_precision_enum). */ template struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = static_cast(LIBXSMM_DATATYPE_UNSUPPORTED); }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_F64; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_F32; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_I32; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_I16; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_BF16; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_BF16; }; template<> struct 
LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_I8; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_I8; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_I8; }; template struct LIBXSMM_RETARGETABLE libxsmm_gemm_default_output { typedef INP_TYPE type; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_default_output { typedef int type; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_default_output { typedef int type; }; /** Construct and execute a specialized function. */ template::type> class LIBXSMM_RETARGETABLE libxsmm_mmfunction { mutable/*retargetable*/ libxsmm_xmmfunction m_function; public: typedef INP_TYPE itype; typedef OUT_TYPE otype; public: libxsmm_mmfunction() { m_function.xmm = 0; } libxsmm_mmfunction(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, int flags = LIBXSMM_FLAGS) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, m, k, m, NULL/*alpha*/, NULL/*beta*/, flags, libxsmm_get_gemm_xprefetch(NULL)); m_function.xmm = (0 != desc ? libxsmm_xmmdispatch(desc).xmm : 0); } libxsmm_mmfunction(int flags, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, int prefetch) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, m, k, m, NULL/*alpha*/, NULL/*beta*/, flags, libxsmm_get_gemm_prefetch(prefetch)); m_function.xmm = (0 != desc ? 
libxsmm_xmmdispatch(desc).xmm : 0); } libxsmm_mmfunction(int flags, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, otype alpha, otype beta) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, m, k, m, &alpha, &beta, flags, libxsmm_get_gemm_xprefetch(NULL)); m_function.xmm = (0 != desc ? libxsmm_xmmdispatch(desc).xmm : 0); } libxsmm_mmfunction(int flags, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, otype alpha, otype beta, int prefetch) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, m, k, m, &alpha, &beta, flags, libxsmm_get_gemm_prefetch(prefetch)); m_function.xmm = (0 != desc ? libxsmm_xmmdispatch(desc).xmm : 0); } libxsmm_mmfunction(int flags, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, int prefetch) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, lda, ldb, ldc, NULL/*alpha*/, NULL/*beta*/, flags, libxsmm_get_gemm_prefetch(prefetch)); m_function.xmm = (0 != desc ? libxsmm_xmmdispatch(desc).xmm : 0); } libxsmm_mmfunction(int flags, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, otype alpha, otype beta) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, lda, ldb, ldc, &alpha, &beta, flags, libxsmm_get_gemm_xprefetch(NULL)); m_function.xmm = (0 != desc ? 
libxsmm_xmmdispatch(desc).xmm : 0); } libxsmm_mmfunction(int flags, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, otype alpha, otype beta, int prefetch) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, lda, ldb, ldc, &alpha, &beta, flags, libxsmm_get_gemm_prefetch(prefetch)); m_function.xmm = (0 != desc ? libxsmm_xmmdispatch(desc).xmm : 0); } public: const libxsmm_xmmfunction& kernel() const { return m_function; } operator const void*() const { return 0 != m_function.xmm ? this : 0; } void operator()(const itype* a, const itype* b, otype* c) const { LIBXSMM_MMCALL_ABC(m_function.xmm, a, b, c); } void operator()(const itype* a, const itype* b, otype* c, const itype* pa, const itype* pb, const otype* pc) const { LIBXSMM_MMCALL_PRF(m_function.xmm, a, b, c, pa, pb, pc); } }; /** Matrix copy function ("in" can be NULL to zero the destination). */ template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) { return libxsmm_matcopy(out, in, sizeof(T), m, n, ldi, ldo); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi) { return libxsmm_matcopy(out, in, m, n, ldi, ldi); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n) { return libxsmm_matcopy(out, in, m, n, m); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy(T* out, const T* in, libxsmm_blasint n) { return libxsmm_matcopy(out, in, n, n); } /** Matrix copy function ("in" can be NULL to zero the destination); MT via libxsmmext. 
*/ template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy_omp(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) { return libxsmm_matcopy_omp(out, in, sizeof(T), m, n, ldi, ldo); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy_omp(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi) { return libxsmm_matcopy_omp(out, in, m, n, ldi, ldi); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy_omp(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n) { return libxsmm_matcopy_omp(out, in, m, n, m); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy_omp(T* out, const T* in, libxsmm_blasint n) { return libxsmm_matcopy_omp(out, in, n, n); } /** Matrix transposition (out-of-place form). */ template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) { return libxsmm_otrans(out, in, sizeof(T), m, n, ldi, ldo); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi) { return libxsmm_trans(out, in, m, n, ldi, ldi); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n) { return libxsmm_trans(out, in, m, n, m); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* out, const T* in, libxsmm_blasint n) { return libxsmm_trans(out, in, n, n); } /** Matrix transposition; MT via libxsmmext (out-of-place form). 
*/ template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans_omp(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) { return libxsmm_otrans_omp(out, in, sizeof(T), m, n, ldi, ldo); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans_omp(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi) { return libxsmm_trans_omp(out, in, m, n, ldi, ldi); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans_omp(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n) { return libxsmm_trans_omp(out, in, m, n, m); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans_omp(T* out, const T* in, libxsmm_blasint n) { return libxsmm_trans_omp(out, in, n, n); } /** Matrix transposition (in-place form). */ template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* inout, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi) { return libxsmm_itrans(inout, sizeof(T), m, n, ldi); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* inout, libxsmm_blasint m, libxsmm_blasint n) { return libxsmm_trans(inout, m, n, m); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* inout, libxsmm_blasint n) { return libxsmm_trans(inout, n, n); } /** Dispatched general dense matrix multiplication (double-precision). 
*/ inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { libxsmm_dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, /* by-value */ libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { libxsmm_dgemm(transa, transb, &m, &n, &k, alpha, a, lda, b, ldb, beta, c, ldc); } /** Dispatched general dense matrix multiplication (single-precision). */ inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, /* by-value */ libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_sgemm(transa, transb, &m, &n, &k, alpha, a, lda, b, ldb, beta, c, ldc); } /** Dispatched general dense matrix multiplication (low-precision). 
*/ inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const int* alpha, const short* a, const libxsmm_blasint* lda, const short* b, const libxsmm_blasint* ldb, const int* beta, int* c, const libxsmm_blasint* ldc) { libxsmm_wigemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, /* by-value */ libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const int* alpha, const short* a, const libxsmm_blasint* lda, const short* b, const libxsmm_blasint* ldb, const int* beta, int* c, const libxsmm_blasint* ldc) { libxsmm_wigemm(transa, transb, &m, &n, &k, alpha, a, lda, b, ldb, beta, c, ldc); } /** Dispatched general dense matrix multiplication (low-precision). */ inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const libxsmm_bfloat16* a, const libxsmm_blasint* lda, const libxsmm_bfloat16* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_bsgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, /* by-value */ libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const float* alpha, const libxsmm_bfloat16* a, const libxsmm_blasint* lda, const libxsmm_bfloat16* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_bsgemm(transa, transb, &m, &n, &k, alpha, a, lda, b, ldb, beta, c, ldc); } /** General dense matrix multiplication based on LAPACK/BLAS (double-precision). 
*/ inline LIBXSMM_RETARGETABLE void libxsmm_blas_gemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { libxsmm_blas_dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline LIBXSMM_RETARGETABLE void libxsmm_blas_gemm(const char* transa, const char* transb, /* by-value */ libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { libxsmm_blas_dgemm(transa, transb, &m, &n, &k, alpha, a, lda, b, ldb, beta, c, ldc); } /** General dense matrix multiplication based on LAPACK/BLAS (single-precision). */ inline LIBXSMM_RETARGETABLE void libxsmm_blas_gemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_blas_sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline LIBXSMM_RETARGETABLE void libxsmm_blas_gemm(const char* transa, const char* transb, /* by-value */ libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_blas_sgemm(transa, transb, &m, &n, &k, alpha, a, lda, b, ldb, beta, c, ldc); } #endif /*__cplusplus*/ #endif /*LIBXSMM_H*/ 
libxsmm-1.17/include/libxsmm_blocked_gemm.h000066400000000000000000000113161415223013700210540ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_BLOCKED_GEMM_H #define LIBXSMM_BLOCKED_GEMM_H #include "libxsmm_typedefs.h" /** Denotes the BGEMM data order. */ typedef enum libxsmm_blocked_gemm_order { LIBXSMM_BLOCKED_GEMM_ORDER_JIK = 0, LIBXSMM_BLOCKED_GEMM_ORDER_IJK = 1, LIBXSMM_BLOCKED_GEMM_ORDER_JKI = 2, LIBXSMM_BLOCKED_GEMM_ORDER_IKJ = 3, LIBXSMM_BLOCKED_GEMM_ORDER_KJI = 4, LIBXSMM_BLOCKED_GEMM_ORDER_KIJ = 5 } libxsmm_blocked_gemm_order; /** Describes the Block-GEMM (BGEMM) operation. */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_blocked_gemm_handle libxsmm_blocked_gemm_handle; LIBXSMM_API libxsmm_blocked_gemm_handle* libxsmm_blocked_gemm_handle_create( /** Number of threads used to run BGEMM. */ /*unsigned*/ int nthreads, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, /** If the block-size (BM, BN, or BK) is not given, a suitable value is chosen internally. */ const libxsmm_blasint* bm, const libxsmm_blasint* bn, const libxsmm_blasint* bk, /** If b_m1, b_n1, b_k1, or b_k2 is not supplied, the respective value defaults to one. */ const libxsmm_blasint* b_m1, const libxsmm_blasint* b_n1, const libxsmm_blasint* b_k1, const libxsmm_blasint* b_k2, /** If alpha is not supplied (NULL), then LIBXSMM_ALPHA is used instead. 
*/ const void* alpha, /** If beta is not supplied (NULL), then LIBXSMM_BETA is used instead. */ const void* beta, /** See libxsmm_gemm_flags (LIBXSMM_FLAGS is used if NULL is given). */ const int* gemm_flags, /** See libxsmm_gemm_prefetch_type; a strategy chosen automatically if NULL is given. */ const libxsmm_gemm_prefetch_type* prefetch, /** See libxsmm_blocked_gemm_order; an order is chosen automatically if NULL is given. */ const libxsmm_blocked_gemm_order* order); LIBXSMM_API void libxsmm_blocked_gemm_handle_destroy(const libxsmm_blocked_gemm_handle* handle); /** Copy-in functions for A, B, and C matrices. A leading dimension for the source buffer is optional and can be NULL. */ LIBXSMM_API int libxsmm_blocked_gemm_copyin_a(const libxsmm_blocked_gemm_handle* handle, const void* src, const libxsmm_blasint* ld, void* dst); LIBXSMM_API int libxsmm_blocked_gemm_copyin_b(const libxsmm_blocked_gemm_handle* handle, const void* src, const libxsmm_blasint* ld, void* dst); LIBXSMM_API int libxsmm_blocked_gemm_copyin_c(const libxsmm_blocked_gemm_handle* handle, const void* src, const libxsmm_blasint* ld, void* dst); /** Copy-out function for the C-matrix. A leading dimension for the destination buffer is optional and can be NULL. */ LIBXSMM_API int libxsmm_blocked_gemm_copyout_c(const libxsmm_blocked_gemm_handle* handle, const void* src, const libxsmm_blasint* ld, void* dst); /** Convert function required to reorganize elements in delta for BWD and UPD passes of RNN, LSTM and GRU */ LIBXSMM_API int libxsmm_blocked_gemm_convert_b_to_a(const libxsmm_blocked_gemm_handle* handle, const void* src, const libxsmm_blasint* ld, void* dst); /** Transpose matrix b for UPD pass of GRU */ LIBXSMM_API int libxsmm_blocked_gemm_transpose_b(const libxsmm_blocked_gemm_handle* handle, const void* src, const libxsmm_blasint* ld, void* dst); /** * Fine grain parallelized block-GEMM (BGEMM), which uses a block structure * layout for the A and B matrices. 
The implementation is parallelized * among M, N, and K using fine-grained on-demand locks when writing C. */ LIBXSMM_API void libxsmm_blocked_gemm_st(const libxsmm_blocked_gemm_handle* handle, const void* a, const void* b, void* c, /*unsigned*/int start_thread, /*unsigned*/int tid); /** * Implementation of libxsmm_blocked_gemm, which is parallelized with OpenMP * and uses an OpenMP or custom barrier implementation. The function * allows to run multiple GEMMs, which is specified by 'count' (RNNs). * This function requires to link against libxsmmext. */ LIBXSMM_APIEXT void libxsmm_blocked_gemm_omp(const libxsmm_blocked_gemm_handle* handle, const void* a, const void* b, void* c, /*unsigned*/int count); #endif /*LIBXSMM_BLOCKED_GEMM_H*/ libxsmm-1.17/include/libxsmm_config.h000066400000000000000000000023131415223013700177060ustar00rootroot00000000000000#ifndef LIBXSMM_CONFIG_H #define LIBXSMM_CONFIG_H #if !defined(LIBXSMM_DEFAULT_CONFIG) && (defined(_WIN32) || (defined(LIBXSMM_SOURCE_H) && !defined(LIBXSMM_CONFIGURED))) # define LIBXSMM_DEFAULT_CONFIG #endif #if !defined(LIBXSMM_DEFAULT_CONFIG) && (!defined(LIBXSMM_SOURCE_H) || defined(LIBXSMM_CONFIGURED)) # include "libxsmm_version.h" #else # define LIBXSMM_CONFIG_VERSION "" # define LIBXSMM_CONFIG_BRANCH "" # define LIBXSMM_CONFIG_VERSION_MAJOR INT_MAX # define LIBXSMM_CONFIG_VERSION_MINOR INT_MAX # define LIBXSMM_CONFIG_VERSION_UPDATE INT_MAX # define LIBXSMM_CONFIG_VERSION_PATCH INT_MAX # define LIBXSMM_CONFIG_BUILD_DATE INT_MAX #endif #define LIBXSMM_CONFIG_CACHELINE 64 #define LIBXSMM_CONFIG_ALIGNMENT 64 #define LIBXSMM_CONFIG_MALLOC 0 #define LIBXSMM_CONFIG_ILP64 0 #define LIBXSMM_CONFIG_SYNC 1 #define LIBXSMM_CONFIG_JIT 1 #define LIBXSMM_CONFIG_PREFETCH -1 #define LIBXSMM_CONFIG_MAX_MNK 262144 #define LIBXSMM_CONFIG_MAX_DIM 64 #define LIBXSMM_CONFIG_AVG_DIM 32 #define LIBXSMM_CONFIG_MAX_M 64 #define LIBXSMM_CONFIG_MAX_N 64 #define LIBXSMM_CONFIG_MAX_K 64 #define LIBXSMM_CONFIG_FLAGS 0 #define 
LIBXSMM_CONFIG_ALPHA 1 #define LIBXSMM_CONFIG_BETA 1 #define LIBXSMM_CONFIG_WRAP 1 #endif libxsmm-1.17/include/libxsmm_cpuid.h000066400000000000000000000052041415223013700175470ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_CPUID_H #define LIBXSMM_CPUID_H #include "libxsmm_macros.h" /** * Enumerates the available target architectures and instruction * set extensions as returned by libxsmm_get_target_archid(). */ #define LIBXSMM_TARGET_ARCH_UNKNOWN 0 #define LIBXSMM_TARGET_ARCH_GENERIC 1 #define LIBXSMM_X86_GENERIC 1002 #define LIBXSMM_X86_SSE3 1003 #define LIBXSMM_X86_SSE4 1004 #define LIBXSMM_X86_AVX 1005 #define LIBXSMM_X86_AVX2 1006 #define LIBXSMM_X86_AVX512 1007 #define LIBXSMM_X86_AVX512_MIC 1010 /* KNL */ #define LIBXSMM_X86_AVX512_KNM 1011 #define LIBXSMM_X86_AVX512_CORE 1020 /* SKX */ #define LIBXSMM_X86_AVX512_CLX 1021 #define LIBXSMM_X86_AVX512_CPX 1022 #define LIBXSMM_X86_ALLFEAT 1999 /* all features supported which are used anywhere in LIBXSMM, this value should never be used to set arch, only for compares */ /** A zero-initialized structure assumes conservative properties. */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_cpuid_x86_info { int constant_tsc; /** Timer stamp counter is monotonic. */ int has_context; /** Context switches are permitted. */ } libxsmm_cpuid_x86_info; /** Returns the target architecture and instruction set extensions. 
*/ #if defined(__cplusplus) /* note: stay compatible with TF */ LIBXSMM_API int libxsmm_cpuid_x86(libxsmm_cpuid_x86_info* info = NULL); #else LIBXSMM_API int libxsmm_cpuid_x86(libxsmm_cpuid_x86_info* info); #endif /** * Similar to libxsmm_cpuid_x86, but conceptually not x86-specific. * The actual code path (as used by LIBXSMM) is determined by * libxsmm_[get|set]_target_archid/libxsmm_[get|set]_target_arch. */ LIBXSMM_API int libxsmm_cpuid(void); /** Names the CPU architecture given by CPUID. */ LIBXSMM_API const char* libxsmm_cpuid_name(int id); /** SIMD vector length (VLEN) in 32-bit elements. */ LIBXSMM_API int libxsmm_cpuid_vlen32(int id); #endif /*LIBXSMM_CPUID_H*/ libxsmm-1.17/include/libxsmm_dnn.h000066400000000000000000000155711415223013700172320ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Hans Pabst (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_H #define LIBXSMM_DNN_H #include "libxsmm_typedefs.h" typedef unsigned int libxsmm_dnn_err_t; /** Define error and warning codes */ #define LIBXSMM_DNN_SUCCESS 0 #define LIBXSMM_DNN_WARN_FALLBACK 90000 #define LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_N_BLOCKING 90001 #define LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_C_BLOCKING 90002 #define LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_K_BLOCKING 90003 #define LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_N_BLOCKING 90004 #define LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_C_BLOCKING 90005 #define LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_K_BLOCKING 90006 #define LIBXSMM_DNN_ERR_GENERAL 100000 #define LIBXSMM_DNN_ERR_CREATE_HANDLE 100001 #define LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE 100002 #define LIBXSMM_DNN_ERR_INVALID_BLOCKING 100003 #define LIBXSMM_DNN_ERR_INVALID_HANDLE 100004 #define LIBXSMM_DNN_ERR_DATA_NOT_BOUND 100005 #define LIBXSMM_DNN_ERR_CREATE_TENSOR 100006 #define LIBXSMM_DNN_ERR_INVALID_TENSOR 100007 #define LIBXSMM_DNN_ERR_MISMATCH_TENSOR 100008 #define LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR 100009 #define LIBXSMM_DNN_ERR_INVALID_KIND 100010 #define LIBXSMM_DNN_ERR_INVALID_FORMAT_NCHW 100011 #define LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT 100012 #define LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT 100013 #define LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE 100014 #define LIBXSMM_DNN_ERR_INVALID_FORMAT_KCRS 100015 #define LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL 100016 #define LIBXSMM_DNN_ERR_CREATE_LAYOUT 100017 #define LIBXSMM_DNN_ERR_INVALID_LAYOUT 100018 #define LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH 100019 #define LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED 100020 #define LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE 100021 #define LIBXSMM_DNN_ERR_INVALID_ALGO 100022 #define LIBXSMM_DNN_ERR_INVALID_PADDING 100023 #define LIBXSMM_DNN_ERR_UNKNOWN_BIAS_TYPE 100024 #define LIBXSMM_DNN_ERR_MISMATCH_BIAS 100025 #define LIBXSMM_DNN_ERR_INVALID_HANDLE_BIAS 100026 #define LIBXSMM_DNN_ERR_TIME_STEPS_TOO_SMALL 100027 
#define LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS 100028 #define LIBXSMM_DNN_ERR_NOT_IMPLEMENTED 100029 #define LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER 100030 #define LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION 100031 #define LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN 100032 #define LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING 100033 #define LIBXSMM_DNN_ERR_INVALID_FORMAT_FC 100034 #define LIBXSMM_DNN_ERR_INVALID_RNN_TYPE 100035 #define LIBXSMM_DNN_ERR_RNN_INVALID_SEQ_LEN 100036 #define LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER 100037 #define LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION 100038 #define LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION 100039 /** Kinds of supported compute flavor operations. */ typedef enum libxsmm_dnn_compute_kind { /** Forward path */ LIBXSMM_DNN_COMPUTE_KIND_FWD, /** Backward path */ LIBXSMM_DNN_COMPUTE_KIND_BWD, /** Updated weights. */ LIBXSMM_DNN_COMPUTE_KIND_UPD, /** Backward and weightupdate combined, useful for RNNs */ LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, /** All routines, need for some init routines. 
*/ LIBXSMM_DNN_COMPUTE_KIND_ALL } libxsmm_dnn_compute_kind; /** these are some quantization definitions, not sure if we want to move them into some main part of LIBXSMM */ /* @TODO check position of these declarations and defines */ typedef union LIBXSMM_RETARGETABLE libxsmm_intfloat { unsigned int ui; float f; } libxsmm_intfloat; /* F32 masking defines */ #define LIBXSNN_DNN_MASK_SIGN_F32 0x80000000 #define LIBXSMM_DNN_MASK_EXP_F32 0x7f800000 #define LIBXSMM_DNN_MASK_MANT_F32 0x007fffff #define LIBXSMM_DNN_MASK_ABS_F32 0x7fffffff #define LIBXSMM_DNN_MASK_FULL_F32 0xffffffff #define LIBXSMM_DNN_MANT_SZ_F32 23 #define LIBXSMM_DNN_SZ_F32 32 /* DFP16 masking defines */ #define LIBXSMM_DNN_MANT_DFP16 15 #define LIXSMMM_DNN_RES_DFP16 libxsmm_sexp2_i8i(-(LIBXSMM_DNN_MANT_DFP16)) /* Quantization Rounding Defines */ #define LIBXSMM_DNN_QUANT_NO_ROUND 80000 #define LIBXSMM_DNN_QUANT_BIAS_ROUND 80001 #define LIBXSMM_DNN_QUANT_STOCH_ROUND 80002 #define LIBXSMM_DNN_QUANT_NEAREST_ROUND 80003 #define LIBXSMM_DNN_QUANT_FPHW_ROUND 80004 /** get string of error code */ LIBXSMM_API const char* libxsmm_dnn_get_error(libxsmm_dnn_err_t code); LIBXSMM_API size_t libxsmm_dnn_typesize(libxsmm_dnn_datatype datatype); LIBXSMM_API size_t libxsmm_dnn_get_simd_width(libxsmm_dnn_datatype datatype); /** some quantization helper functions, @TODO need to be integrated better for all different ways of quantizations */ LIBXSMM_API void libxsmm_dnn_quantize( float* in_buffer, short* out_buffer, int length, unsigned char add_shift, unsigned char* scf, int round_mode ); LIBXSMM_API void libxsmm_dnn_quantize_act( float* in_buffer, short* out_buffer, unsigned int N, unsigned int C, unsigned int H, unsigned int W, unsigned int cblk_f32, unsigned int cblk_i16, unsigned int lp_blk, unsigned char add_shift, unsigned char* scf, int round_mode ); LIBXSMM_API void libxsmm_dnn_quantize_fil( float* in_buffer, short* out_buffer, unsigned int K, unsigned int C, unsigned int R, unsigned int S, unsigned int cblk_f32, 
unsigned int cblk_i16, unsigned int kblk_f32, unsigned int kblk_i16, unsigned int lp_blk, unsigned char add_shift, unsigned char* scf, int round_mode ); LIBXSMM_API void libxsmm_dnn_dequantize( short* in_buffer, float* out_buffer, int length, unsigned char scf ); /** some BF16<->FP32 conversion functions @TODO we need to find a final place for those */ LIBXSMM_API void libxsmm_truncate_convert_f32_bf16(const float* in, libxsmm_bfloat16* out, unsigned int length); LIBXSMM_API void libxsmm_rnaz_convert_fp32_bf16(const float* in, libxsmm_bfloat16* out, unsigned int len); LIBXSMM_API void libxsmm_rne_convert_fp32_bf16(const float* in, libxsmm_bfloat16* out, unsigned int len); LIBXSMM_API void libxsmm_convert_bf16_f32(const libxsmm_bfloat16* in, float* out, unsigned int length); #endif /*LIBXSMM_DNN_H*/ libxsmm-1.17/include/libxsmm_dnn_convolution.h000066400000000000000000000135341415223013700216660ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DNN_CONVOLUTION_H #define LIBXSMM_DNN_CONVOLUTION_H #include "libxsmm_dnn.h" #include "libxsmm_dnn_tensor.h" #include "libxsmm_dnn_fusedbatchnorm.h" /** Opaque handles which represents convolutions and LIBXSMM datatypes */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_layer libxsmm_dnn_layer; typedef enum libxsmm_dnn_conv_fuse_op { /* we fuse nothing into convolution */ LIBXSMM_DNN_CONV_FUSE_NONE = 0 } libxsmm_dnn_conv_fuse_op; /** Type of algorithm used for convolutions. 
*/ typedef enum libxsmm_dnn_conv_algo { /** let the library decide */ LIBXSMM_DNN_CONV_ALGO_AUTO, /** direct convolution. */ LIBXSMM_DNN_CONV_ALGO_DIRECT } libxsmm_dnn_conv_algo; /** Structure which describes the input and output of data (DNN). */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_conv_desc { int N; /* number of images in mini-batch */ int C; /* number of input feature maps */ int H; /* height of input image */ int W; /* width of input image */ int K; /* number of output feature maps */ int R; /* height of filter kernel */ int S; /* width of filter kernel */ int u; /* vertical stride */ int v; /* horizontal stride */ int pad_h; /* height of logical rim padding to input for adjusting output height */ int pad_w; /* width of logical rim padding to input for adjusting output width */ int pad_h_in; /* height of zero-padding in input buffer, must equal to pad_h for direct conv */ int pad_w_in; /* width of zero-padding in input buffer, must equal to pad_w for direct conv */ int pad_h_out; /* height of zero-padding in output buffer */ int pad_w_out; /* width of zero-padding in output buffer */ int threads; /* number of threads to use when running convolution */ libxsmm_dnn_datatype datatype_in; /* datatypes used for all input related buffer */ libxsmm_dnn_datatype datatype_out; /* datatypes used for all output related buffer */ libxsmm_dnn_tensor_format buffer_format; /* format which is for buffer buffers */ libxsmm_dnn_tensor_format filter_format; /* format which is for filter buffers */ libxsmm_dnn_conv_algo algo; /* convolution algorithm used */ libxsmm_dnn_conv_option options; /* additional options */ libxsmm_dnn_conv_fuse_op fuse_ops; /* used ops into convolutions */ } libxsmm_dnn_conv_desc; /** Create a layer handle (non-NULL if successful), and pre-build all JIT-code versions. 
*/ LIBXSMM_API libxsmm_dnn_layer* libxsmm_dnn_create_conv_layer(libxsmm_dnn_conv_desc conv_desc, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_conv_layer(const libxsmm_dnn_layer* handle); /** get layout description of buffers and filters from handle */ LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_create_tensor_datalayout(const libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); /** scratch pad management */ LIBXSMM_API size_t libxsmm_dnn_get_scratch_size(const libxsmm_dnn_layer* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_bind_scratch(libxsmm_dnn_layer* handle, const libxsmm_dnn_compute_kind kind, const void* scratch); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_release_scratch(libxsmm_dnn_layer* handle, const libxsmm_dnn_compute_kind kind); /** Bind/Release buffers, filters and bias to layer operation */ LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_bind_tensor(libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_get_tensor(libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_release_tensor(libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor_type type); /** Run the layer identified by the handle; may use threads internally. 
*/ LIBXSMM_API void libxsmm_dnn_execute(libxsmm_dnn_layer* handle, libxsmm_dnn_compute_kind kind); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_execute_st(libxsmm_dnn_layer* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid); /** some helper functions for framework integration */ LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_trans_reg_filter(const libxsmm_dnn_layer* handle); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_trans_reg_bf16_filter(const libxsmm_dnn_layer* handle); #endif /*LIBXSMM_DNN_CONVOLUTION_H*/ libxsmm-1.17/include/libxsmm_dnn_fullyconnected.h000066400000000000000000000100341415223013700223150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DNN_FULLYCONNECTED_H #define LIBXSMM_DNN_FULLYCONNECTED_H #include "libxsmm_dnn.h" #include "libxsmm_dnn_tensor.h" /** Opaque handles which represents LIBXSMM fullyconnected */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fullyconnected libxsmm_dnn_fullyconnected; typedef enum libxsmm_dnn_fullyconnected_fuse_op { /* the fuse order is: 1. BIAS, 2. 
Actitvation */ LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE = 0, LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS = 1, LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU = 2, LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID = 4, LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU = 3, LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID = 5 } libxsmm_dnn_fullyconnected_fuse_op; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fullyconnected_desc { int N; /* number of images in mini-batch */ int C; /* number of input feature maps */ int K; /* number of output feature maps */ int bn; int bk; int bc; int threads; /* number of threads used */ libxsmm_dnn_datatype datatype_in; /* datatype used for all input related buffers */ libxsmm_dnn_datatype datatype_out; /* datatype used for all output related buffers */ libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */ libxsmm_dnn_tensor_format filter_format; /* format which is for filter buffers */ libxsmm_dnn_fullyconnected_fuse_op fuse_ops; /* fused operations */ } libxsmm_dnn_fullyconnected_desc; LIBXSMM_API libxsmm_dnn_fullyconnected* libxsmm_dnn_create_fullyconnected(libxsmm_dnn_fullyconnected_desc fullyconnected_desc, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fullyconnected(const libxsmm_dnn_fullyconnected* handle); LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fullyconnected_create_tensor_datalayout(const libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API void* libxsmm_dnn_fullyconnected_get_scratch_ptr (const libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_err_t* status); LIBXSMM_API size_t libxsmm_dnn_fullyconnected_get_scratch_size(const libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_bind_scratch(libxsmm_dnn_fullyconnected* handle, const void* scratch); LIBXSMM_API libxsmm_dnn_err_t 
libxsmm_dnn_fullyconnected_release_scratch(libxsmm_dnn_fullyconnected* handle); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_bind_tensor(libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fullyconnected_get_tensor(libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_release_tensor(libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_execute_st(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid); #endif /*LIBXSMM_DNN_FULLYCONNECTED_H*/ libxsmm-1.17/include/libxsmm_dnn_fusedbatchnorm.h000066400000000000000000000054451415223013700223150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_FUSEDBATCHNORM_H #define LIBXSMM_DNN_FUSEDBATCHNORM_H #include "libxsmm_dnn.h" #include "libxsmm_dnn_tensor.h" /** Opaque handles which represents LIBXSMM fusedbatchnorm */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedbatchnorm libxsmm_dnn_fusedbatchnorm; LIBXSMM_API libxsmm_dnn_fusedbatchnorm* libxsmm_dnn_create_fusedbatchnorm(libxsmm_dnn_fusedbatchnorm_desc fusedbatchnorm_desc, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fusedbatchnorm(const libxsmm_dnn_fusedbatchnorm* handle); LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(const libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API size_t libxsmm_dnn_fusedbatchnorm_get_scratch_size(const libxsmm_dnn_fusedbatchnorm* handle, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_bind_scratch(libxsmm_dnn_fusedbatchnorm* handle, const void* scratch); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_release_scratch(libxsmm_dnn_fusedbatchnorm* handle); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fusedbatchnorm_get_tensor(libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_release_tensor(libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_execute_st(libxsmm_dnn_fusedbatchnorm* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_reduce_stats_st(libxsmm_dnn_fusedbatchnorm** 
handles, int num_handles, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid); #endif /*LIBXSMM_DNN_FUSEDBATCHNORM_H*/ libxsmm-1.17/include/libxsmm_dnn_fusedgroupnorm.h000066400000000000000000000054451415223013700223700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DNN_FUSEDGROUPNORM_H #define LIBXSMM_DNN_FUSEDGROUPNORM_H #include "libxsmm_dnn.h" #include "libxsmm_dnn_tensor.h" /** Opaque handles which represents LIBXSMM fusedgroupnorm */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedgroupnorm libxsmm_dnn_fusedgroupnorm; LIBXSMM_API libxsmm_dnn_fusedgroupnorm* libxsmm_dnn_create_fusedgroupnorm(libxsmm_dnn_fusedgroupnorm_desc fusedgroupnorm_desc, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fusedgroupnorm(const libxsmm_dnn_fusedgroupnorm* handle); LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout(const libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API size_t libxsmm_dnn_fusedgroupnorm_get_scratch_size(const libxsmm_dnn_fusedgroupnorm* handle, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_bind_scratch(libxsmm_dnn_fusedgroupnorm* handle, const void* scratch); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_release_scratch(libxsmm_dnn_fusedgroupnorm* handle); LIBXSMM_API 
libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_bind_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fusedgroupnorm_get_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_release_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_execute_st(libxsmm_dnn_fusedgroupnorm* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_reduce_stats_st(libxsmm_dnn_fusedgroupnorm** handles, int num_handles, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid); #endif /*LIBXSMM_DNN_FUSEDGROUPNORM_H*/ libxsmm-1.17/include/libxsmm_dnn_optimizer.h000066400000000000000000000065521415223013700213330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_SGD_H #define LIBXSMM_DNN_SGD_H #include "libxsmm_dnn.h" #include "libxsmm_dnn_tensor.h" /** Opaque handles which represents LIBXSMM optimizer */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_optimizer libxsmm_dnn_optimizer; typedef enum libxsmm_dnn_optimizer_type { LIBXSMM_DNN_OPTIMIZER_SGD = 1 } libxsmm_dnn_optimizer_type; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_optimizer_desc { int C; /* number of feature maps */ int K; /* number of feature maps */ int bc; int bk; float learning_rate; /* learning rate */ int threads; /* number of threads used */ libxsmm_dnn_optimizer_type opt_type; libxsmm_dnn_datatype datatype_master; /* datatype used for all input related buffers */ libxsmm_dnn_datatype datatype; /* datatype used for all input related buffers */ libxsmm_dnn_tensor_format filter_format; /* format which is for filter buffers */ } libxsmm_dnn_optimizer_desc; LIBXSMM_API libxsmm_dnn_optimizer* libxsmm_dnn_create_optimizer(libxsmm_dnn_optimizer_desc optimizer_desc, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_optimizer(const libxsmm_dnn_optimizer* handle); LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_optimizer_create_tensor_datalayout(const libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API void* libxsmm_dnn_optimizer_get_scratch_ptr (const libxsmm_dnn_optimizer* handle, libxsmm_dnn_err_t* status); LIBXSMM_API size_t libxsmm_dnn_optimizer_get_scratch_size(const libxsmm_dnn_optimizer* handle, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_bind_scratch(libxsmm_dnn_optimizer* handle, const void* scratch); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_release_scratch(libxsmm_dnn_optimizer* handle); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_bind_tensor(libxsmm_dnn_optimizer* handle, 
const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_optimizer_get_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_release_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_execute_st(libxsmm_dnn_optimizer* handle, /*unsigned*/int start_thread, /*unsigned*/int tid); #endif /*LIBXSMM_DNN_SGD_H*/ libxsmm-1.17/include/libxsmm_dnn_pooling.h000066400000000000000000000103741415223013700207550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_POOLING_H #define LIBXSMM_DNN_POOLING_H #include "libxsmm_dnn.h" #include "libxsmm_dnn_tensor.h" /** Opaque handles which represents LIBXSMM pooling */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_pooling libxsmm_dnn_pooling; typedef enum libxsmm_dnn_pooling_type { LIBXSMM_DNN_POOLING_MAX = 1, LIBXSMM_DNN_POOLING_AVG = 2 } libxsmm_dnn_pooling_type; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_pooling_desc { int N; /* number of images in mini-batch */ int C; /* number of input feature maps */ int H; /* height of input image */ int W; /* width of input image */ int R; /* kernel height */ int S; /* kernel width */ int u; /* vertical stride */ int v; /* horizontal stride */ int pad_h; /* height of logical padding of input buffer */ int pad_w; /* width of logical padding of input buffer */ int pad_h_in; /* height of physical zero-padding in input buffer */ int pad_w_in; /* width of physical zero-padding in input buffer */ int pad_h_out; /* height of physical zero-padding in output buffer */ int pad_w_out; /* width of physical zero-padding in output buffer */ int threads; /* number of threads used */ libxsmm_dnn_datatype datatype_in; /* datatypes used for all input related buffer */ libxsmm_dnn_datatype datatype_out; /* datatypes used for all output related buffer */ libxsmm_dnn_datatype datatype_mask; /* datatypes used for the masks */ libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */ libxsmm_dnn_pooling_type pooling_type; /* type of pooling operation */ } libxsmm_dnn_pooling_desc; LIBXSMM_API libxsmm_dnn_pooling* libxsmm_dnn_create_pooling(libxsmm_dnn_pooling_desc pooling_desc, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_pooling(const libxsmm_dnn_pooling* handle); LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_pooling_create_tensor_datalayout(const 
libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API size_t libxsmm_dnn_pooling_get_scratch_size(const libxsmm_dnn_pooling* handle, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_bind_scratch(libxsmm_dnn_pooling* handle, const void* scratch); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_release_scratch(libxsmm_dnn_pooling* handle); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_bind_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_pooling_get_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_release_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_execute_st(libxsmm_dnn_pooling* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid); #endif /*LIBXSMM_DNN_POOLING_H*/ libxsmm-1.17/include/libxsmm_dnn_rnncell.h000066400000000000000000000115211415223013700207360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_RNNCELL_H #define LIBXSMM_DNN_RNNCELL_H #include "libxsmm_dnn.h" #include "libxsmm_dnn_tensor.h" LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_rnncell libxsmm_dnn_rnncell; /** Type of algorithm used for convolutions. */ typedef enum libxsmm_dnn_rnncell_type { /** simple RNN cell with ReLU as activation function */ LIBXSMM_DNN_RNNCELL_RNN_RELU, /** simple RNN cell with sigmoid as activation function */ LIBXSMM_DNN_RNNCELL_RNN_SIGMOID, /** simple RNN cell with tanh as activation function */ LIBXSMM_DNN_RNNCELL_RNN_TANH, /** LSTM cell */ LIBXSMM_DNN_RNNCELL_LSTM, /** GRU cell */ LIBXSMM_DNN_RNNCELL_GRU } libxsmm_dnn_rnncell_type; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_rnncell_desc { int threads; libxsmm_blasint K; /* number of outputs */ libxsmm_blasint N; /* size of the minibatch */ libxsmm_blasint C; /* number of inputs */ libxsmm_blasint max_T; /* number of time steps */ libxsmm_blasint bk; libxsmm_blasint bn; libxsmm_blasint bc; libxsmm_dnn_rnncell_type cell_type; /* cell type RNN ReLU, RNN Sigmoid, RNN Tanh, LSTM, GRU */ libxsmm_dnn_datatype datatype_in; /* datatypes used for all input related buffer */ libxsmm_dnn_datatype datatype_out; /* datatypes used for all output related buffer */ libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */ libxsmm_dnn_tensor_format filter_format; /* format which is for filter buffers */ } libxsmm_dnn_rnncell_desc; LIBXSMM_API libxsmm_dnn_rnncell* libxsmm_dnn_create_rnncell(libxsmm_dnn_rnncell_desc rnncell_desc, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_rnncell(const libxsmm_dnn_rnncell* handle); LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_rnncell_create_tensor_datalayout(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API size_t 
libxsmm_dnn_rnncell_get_scratch_size(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status); LIBXSMM_API void* libxsmm_dnn_rnncell_get_scratch_ptr (const libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_scratch(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, const void* scratch); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_scratch(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind); LIBXSMM_API size_t libxsmm_dnn_rnncell_get_internalstate_size(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status); LIBXSMM_API void* libxsmm_dnn_rnncell_get_internalstate_ptr (const libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_internalstate(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, const void* internalstate); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_internalstate(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_allocate_forget_bias(libxsmm_dnn_rnncell* handle, const float forget_bias); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_rnncell_get_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_set_sequence_length( libxsmm_dnn_rnncell* handle, const libxsmm_blasint T ); LIBXSMM_API libxsmm_blasint libxsmm_dnn_rnncell_get_sequence_length( libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status ); LIBXSMM_API libxsmm_dnn_err_t 
libxsmm_dnn_rnncell_execute_st(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid); #endif /*LIBXSMM_DNN_RNNCELL_H*/ libxsmm-1.17/include/libxsmm_dnn_softmaxloss.h000066400000000000000000000070051415223013700216650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DNN_SOFTMAXLOSS_H #define LIBXSMM_DNN_SOFTMAXLOSS_H #include "libxsmm_dnn.h" #include "libxsmm_dnn_tensor.h" /** Opaque handles which represents LIBXSMM softmaxloss */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_softmaxloss libxsmm_dnn_softmaxloss; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_softmaxloss_desc { int N; /* number of images in mini-batch */ int C; /* number of input feature maps */ int bn; /* requested N blocking for NCNC format */ int bc; /* requested C blocking for NCNC format */ float loss_weight; /* loss weight */ int threads; /* number of threads used */ libxsmm_dnn_datatype datatype; /* datatype used for all buffers */ libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */ } libxsmm_dnn_softmaxloss_desc; LIBXSMM_API libxsmm_dnn_softmaxloss* libxsmm_dnn_create_softmaxloss(libxsmm_dnn_softmaxloss_desc softmaxloss_desc, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_softmaxloss(const libxsmm_dnn_softmaxloss* handle); LIBXSMM_API libxsmm_dnn_tensor_datalayout* 
libxsmm_dnn_softmaxloss_create_tensor_datalayout(const libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API void* libxsmm_dnn_softmaxloss_get_scratch_ptr (const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status); LIBXSMM_API size_t libxsmm_dnn_softmaxloss_get_scratch_size(const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_bind_scratch(libxsmm_dnn_softmaxloss* handle, const void* scratch); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_release_scratch(libxsmm_dnn_softmaxloss* handle); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_bind_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_softmaxloss_get_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_release_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_execute_st(libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid); LIBXSMM_API float libxsmm_dnn_softmaxloss_get_loss(const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status); #endif /*LIBXSMM_DNN_SOFTMAXLOSS_H*/ libxsmm-1.17/include/libxsmm_dnn_tensor.h000066400000000000000000000200611415223013700206120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DNN_TENSOR_H #define LIBXSMM_DNN_TENSOR_H #include "libxsmm_typedefs.h" #include "libxsmm_dnn.h" /** Opaque handles which represents convolutions and LIBXSMM datatypes */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_tensor libxsmm_dnn_tensor; typedef enum libxsmm_dnn_tensor_dimtype { /** Mini-batch */ LIBXSMM_DNN_TENSOR_DIMTYPE_N, /** Image Height */ LIBXSMM_DNN_TENSOR_DIMTYPE_H, /** Image Width */ LIBXSMM_DNN_TENSOR_DIMTYPE_W, /** channels or input channels */ LIBXSMM_DNN_TENSOR_DIMTYPE_C, /** output channels */ LIBXSMM_DNN_TENSOR_DIMTYPE_K, /** kernel height */ LIBXSMM_DNN_TENSOR_DIMTYPE_R, /** kernel width */ LIBXSMM_DNN_TENSOR_DIMTYPE_S, /** sequence lenth counter */ LIBXSMM_DNN_TENSOR_DIMTYPE_T, /** channle group counter */ LIBXSMM_DNN_TENSOR_DIMTYPE_G, /** general counter */ LIBXSMM_DNN_TENSOR_DIMTYPE_X } libxsmm_dnn_tensor_dimtype; /** types of different buffers */ typedef enum libxsmm_dnn_tensor_type { /** regular input buffer */ LIBXSMM_DNN_REGULAR_INPUT, /** regular input buffer */ LIBXSMM_DNN_REGULAR_INPUT_ADD, /** regular input buffer, transpose */ LIBXSMM_DNN_REGULAR_INPUT_TRANS, /** gradient input buffer */ LIBXSMM_DNN_GRADIENT_INPUT, /** gradient input buffer */ LIBXSMM_DNN_GRADIENT_INPUT_ADD, /** regular output buffer */ LIBXSMM_DNN_REGULAR_OUTPUT, /** gradient output buffer */ LIBXSMM_DNN_GRADIENT_OUTPUT, /** general input type */ LIBXSMM_DNN_INPUT, /** general output type */ LIBXSMM_DNN_OUTPUT, /** general activation type */ LIBXSMM_DNN_ACTIVATION, /* regular filter */ LIBXSMM_DNN_REGULAR_FILTER, /* regular filter */ LIBXSMM_DNN_REGULAR_FILTER_TRANS, /* gradient filter */ LIBXSMM_DNN_GRADIENT_FILTER, /* master filter */ 
LIBXSMM_DNN_MASTER_FILTER, /** general filter type */ LIBXSMM_DNN_FILTER, /* regular bias */ LIBXSMM_DNN_REGULAR_CHANNEL_BIAS, /* gradient bias */ LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS, /* bias */ LIBXSMM_DNN_CHANNEL_BIAS, /* regular beta */ LIBXSMM_DNN_REGULAR_CHANNEL_BETA, /* gradient beta */ LIBXSMM_DNN_GRADIENT_CHANNEL_BETA, /* beta */ LIBXSMM_DNN_CHANNEL_BETA, /* regular gamma */ LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA, /* gradient gamma */ LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA, /* Gamma */ LIBXSMM_DNN_CHANNEL_GAMMA, /* regular beta */ LIBXSMM_DNN_CHANNEL_EXPECTVAL, /* regular beta */ LIBXSMM_DNN_CHANNEL_RCPSTDDEV, /* variance */ LIBXSMM_DNN_CHANNEL_VARIANCE, /** general bias type */ LIBXSMM_DNN_CHANNEL_SCALAR, /** Labels */ LIBXSMM_DNN_LABEL, /** batch stats */ LIBXSMM_DNN_BATCH_STATS, LIBXSMM_DNN_MAX_STATS_FWD, LIBXSMM_DNN_MAX_STATS_BWD, LIBXSMM_DNN_MAX_STATS_UPD, /** pooling mask */ LIBXSMM_DNN_POOLING_MASK, /** ReLU mask */ LIBXSMM_DNN_RELU_MASK, /** general type, if needed might cause API issues in copy in/out API */ LIBXSMM_DNN_TENSOR, /** regular input buffer */ LIBXSMM_DNN_RNN_REGULAR_INPUT, /** regular previous cell state buffer */ LIBXSMM_DNN_RNN_REGULAR_CS_PREV, /** regular previous hidden state buffer */ LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, /** regular weight (LSTM: wi, wc, wf, wo) */ LIBXSMM_DNN_RNN_REGULAR_WEIGHT, /** regular recurrent weight (LSTM: ri, rc, rf, ro) */ LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, /** regular weight (LSTM: wi, wc, wf, wo) */ LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS, /** regular recurrent weight (LSTM: ri, rc, rf, ro) */ LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS, /** regular bias (LSTM: bi, bc, bf, bo) */ LIBXSMM_DNN_RNN_REGULAR_BIAS, /** regular output cell state buffer */ LIBXSMM_DNN_RNN_REGULAR_CS, /** regular hidden state buffer */ LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, /** gradient input buffer */ LIBXSMM_DNN_RNN_GRADIENT_INPUT, /** gradient previous cell state buffer */ LIBXSMM_DNN_RNN_GRADIENT_CS_PREV, /** gradient previous 
hidden state buffer */ LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV, /** gradient weight */ LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, /** gradient recurrent weight */ LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, /** gradient bias */ LIBXSMM_DNN_RNN_GRADIENT_BIAS, /** gradient output cell state buffer */ LIBXSMM_DNN_RNN_GRADIENT_CS, /** gradient hidden state buffer */ LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, /** internal i buffer */ LIBXSMM_DNN_RNN_INTERNAL_I, /** internal f buffer */ LIBXSMM_DNN_RNN_INTERNAL_F, /** internal o buffer */ LIBXSMM_DNN_RNN_INTERNAL_O, /** internal ci buffer */ LIBXSMM_DNN_RNN_INTERNAL_CI, /** internal co buffer */ LIBXSMM_DNN_RNN_INTERNAL_CO } libxsmm_dnn_tensor_type; /** layout descriptor to allow external data handling outside of LIBXSMM */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_tensor_datalayout { libxsmm_dnn_tensor_dimtype* dim_type; unsigned int* dim_size; unsigned int num_dims; libxsmm_dnn_tensor_format format; /* format of activation buffer */ libxsmm_dnn_datatype datatype; /* data type */ libxsmm_dnn_tensor_type tensor_type; /* tensor type */ } libxsmm_dnn_tensor_datalayout; /** tensorlayout handling */ LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_duplicate_tensor_datalayout(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_tensor_datalayout(libxsmm_dnn_tensor_datalayout* layout); LIBXSMM_API unsigned int libxsmm_dnn_compare_tensor_datalayout(const libxsmm_dnn_tensor_datalayout* layout_a, const libxsmm_dnn_tensor_datalayout* layout_b, libxsmm_dnn_err_t* status); LIBXSMM_API unsigned int libxsmm_dnn_get_tensor_size(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status); LIBXSMM_API unsigned int libxsmm_dnn_get_tensor_elements(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status); /** Create and manage buffers, filters and bias (non-NULL if successful) */ LIBXSMM_API libxsmm_dnn_tensor* 
libxsmm_dnn_link_tensor(const libxsmm_dnn_tensor_datalayout* layout, const void* data, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_link_qtensor(const libxsmm_dnn_tensor_datalayout* layout, const void* data, const unsigned char exp, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_tensor* tensor, const void* data); LIBXSMM_API void* libxsmm_dnn_get_tensor_data_ptr(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_get_tensor_datalayout(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status); LIBXSMM_API unsigned char libxsmm_dnn_get_qtensor_scf(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_set_qtensor_scf(libxsmm_dnn_tensor* tensor, const unsigned char scf); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_tensor(const libxsmm_dnn_tensor* tensor); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_zero_tensor(const libxsmm_dnn_tensor* tensor); /** * Copy-in/out from a plain format such [N][C][H][W] or [N][H][W][C] */ LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_copyin_tensor(const libxsmm_dnn_tensor* tensor, const void* data, const libxsmm_dnn_tensor_format in_format); LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_copyout_tensor(const libxsmm_dnn_tensor* tensor, void* data, const libxsmm_dnn_tensor_format out_format); #endif /*LIBXSMM_DNN_TENSOR_H*/ libxsmm-1.17/include/libxsmm_frontend.h000066400000000000000000001060411415223013700202630ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_FRONTEND_H #define LIBXSMM_FRONTEND_H #include "libxsmm_typedefs.h" /** Helper macros for eliding prefetch address calculations depending on prefetch scheme. */ #if !defined(_WIN32) && !defined(__CYGWIN__) /* TODO: fully support calling convention */ #if 0 != ((LIBXSMM_PREFETCH) & 2/*AL2*/) \ || 0 != ((LIBXSMM_PREFETCH) & 8/*AL2_AHEAD*/) # define LIBXSMM_GEMM_PREFETCH_A(EXPR) (EXPR) #endif #if 0 != ((LIBXSMM_PREFETCH) & 4/*BL2_VIA_C*/) \ || 0 != ((LIBXSMM_PREFETCH) & 16/*BL1*/) # define LIBXSMM_GEMM_PREFETCH_B(EXPR) (EXPR) #endif #endif /** Secondary helper macros derived from the above group. */ #if defined(LIBXSMM_GEMM_PREFETCH_A) # define LIBXSMM_NOPREFETCH_A(EXPR) #else # define LIBXSMM_NOPREFETCH_A(EXPR) EXPR # define LIBXSMM_GEMM_PREFETCH_A(EXPR) 0 #endif #if defined(LIBXSMM_GEMM_PREFETCH_B) # define LIBXSMM_NOPREFETCH_B(EXPR) #else # define LIBXSMM_NOPREFETCH_B(EXPR) EXPR # define LIBXSMM_GEMM_PREFETCH_B(EXPR) 0 #endif #if defined(LIBXSMM_GEMM_PREFETCH_C) # define LIBXSMM_NOPREFETCH_C(EXPR) #else # define LIBXSMM_NOPREFETCH_C(EXPR) EXPR # define LIBXSMM_GEMM_PREFETCH_C(EXPR) 0 #endif /** MKL_DIRECT_CALL requires to include the MKL interface. */ #if (defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL) || \ (defined(__MKL) && !defined(LIBXSMM_BUILD) && \ (!defined(__BLAS) || (0 != __BLAS)))) # if (0 != LIBXSMM_ILP64 && !defined(MKL_ILP64)) # error "Inconsistent ILP64 configuration detected!" # endif # if defined(LIBXSMM_OFFLOAD_BUILD) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) # include # pragma offload_attribute(pop) # else # include # endif #endif /** __INTEL_MKL__ is needed later to fix some NOTHROW issue. 
*/ #if defined(__MKL) && !defined(__INTEL_MKL__) && defined(NOTHROW) # if defined(LIBXSMM_OFFLOAD_BUILD) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) # include # pragma offload_attribute(pop) # else # include # endif #endif /** Unfortunately calculation of INTEL_MKL_VERSION is not stable over time. */ #if defined(__INTEL_MKL__) && defined(__INTEL_MKL_MINOR__) && defined(__INTEL_MKL_UPDATE__) # define LIBXSMM_MKL_VERSION3 LIBXSMM_VERSION3(__INTEL_MKL__, __INTEL_MKL_MINOR__, __INTEL_MKL_UPDATE__) #endif /** Automatically select a prefetch-strategy (libxsmm_get_gemm_xprefetch, etc.). */ #define LIBXSMM_PREFETCH_AUTO -1 /** Append "_omp" postfix to the given symbol. */ #define LIBXSMM_USEOMP(FUNCTION) LIBXSMM_CONCATENATE(FUNCTION, _omp) /** Helper macro for BLAS-style prefixes. */ #define LIBXSMM_TPREFIX_NAME(TYPE) LIBXSMM_CONCATENATE(LIBXSMM_TPREFIX_, TYPE) #define LIBXSMM_TPREFIX(TYPE, FUNCTION) LIBXSMM_CONCATENATE(LIBXSMM_TPREFIX_NAME(TYPE), FUNCTION) #define LIBXSMM_TPREFIX_doubledouble d #define LIBXSMM_TPREFIX_floatfloat s #define LIBXSMM_TPREFIX_shortfloat ws #define LIBXSMM_TPREFIX_shortint wi #define LIBXSMM_TPREFIX_libxsmm_bfloat16float bs /** Defaults if only the input type is specified. */ #define LIBXSMM_TPREFIX_double LIBXSMM_TPREFIX_doubledouble #define LIBXSMM_TPREFIX_float LIBXSMM_TPREFIX_floatfloat #define LIBXSMM_TPREFIX_short LIBXSMM_TPREFIX_shortint #define LIBXSMM_GEMM_XFLAGS(ITYPE, OTYPE) LIBXSMM_CONCATENATE(LIBXSMM_GEMM_XFLAGS_, ITYPE) /* ignore OTYPE for now */ #define LIBXSMM_GEMM_XFLAGS_double 0 #define LIBXSMM_GEMM_XFLAGS_float 0 #define LIBXSMM_GEMM_XFLAGS_libxsmm_bfloat16 LIBXSMM_GEMM_FLAG_VNNI_A #define LIBXSMM_GEMM_XFLAGS_int 0 #define LIBXSMM_GEMM_XFLAGS_short 0 /** Construct symbol name from a given real type name (float, double and short). 
*/ #define LIBXSMM_BLAS_FNTYPE(TYPE, KIND) LIBXSMM_CONCATENATE3(libxsmm_, LIBXSMM_TPREFIX(TYPE, KIND), _function) #define LIBXSMM_MMFUNCTION_TYPE(TYPE) LIBXSMM_CONCATENATE(libxsmm_, LIBXSMM_TPREFIX(TYPE, mmfunction)) #define LIBXSMM_MMDISPATCH_SYMBOL(TYPE) LIBXSMM_CONCATENATE(libxsmm_, LIBXSMM_TPREFIX(TYPE, mmdispatch)) #define LIBXSMM_XBLAS_SYMBOL(TYPE) LIBXSMM_CONCATENATE(libxsmm_blas_, LIBXSMM_TPREFIX(TYPE, gemm)) #define LIBXSMM_XGEMM_SYMBOL(TYPE) LIBXSMM_CONCATENATE(libxsmm_, LIBXSMM_TPREFIX(TYPE, gemm)) #define LIBXSMM_YGEMM_SYMBOL(TYPE) LIBXSMM_USEOMP(LIBXSMM_XGEMM_SYMBOL(TYPE)) #define LIBXSMM_BLAS_SYMBOL(TYPE, KIND) LIBXSMM_FSYMBOL(LIBXSMM_TPREFIX(TYPE, KIND)) #define LIBXSMM_CBLAS_SYMBOL LIBXSMM_TPREFIX #define LIBXSMM_BLAS_DECL(TYPE, KIND, DECL) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_, LIBXSMM_TPREFIX(TYPE, KIND))(DECL) #if !defined(MKL_DIRECT_CALL_SEQ) && !defined(MKL_DIRECT_CALL) # define LIBXSMM_BLAS_dgemm(DECL) DECL; # define LIBXSMM_BLAS_sgemm(DECL) DECL; # define LIBXSMM_BLAS_dgemv(DECL) DECL; # define LIBXSMM_BLAS_sgemv(DECL) DECL; #else # define LIBXSMM_BLAS_dgemm # define LIBXSMM_BLAS_sgemm # define LIBXSMM_BLAS_dgemv # define LIBXSMM_BLAS_sgemv #endif /* Construct prefix names, function type or dispatch function from given input and output types. */ #define LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) LIBXSMM_MMFUNCTION_TYPE(LIBXSMM_CONCATENATE(ITYPE, OTYPE)) #define LIBXSMM_MMDISPATCH_SYMBOL2(ITYPE, OTYPE) LIBXSMM_MMDISPATCH_SYMBOL(LIBXSMM_CONCATENATE(ITYPE, OTYPE)) #define LIBXSMM_TPREFIX_NAME2(ITYPE, OTYPE) LIBXSMM_TPREFIX_NAME(LIBXSMM_CONCATENATE(ITYPE, OTYPE)) #define LIBXSMM_TPREFIX2(ITYPE, OTYPE, FUNCTION) LIBXSMM_TPREFIX(LIBXSMM_CONCATENATE(ITYPE, OTYPE), FUNCTION) /** Helper macro for comparing selected types. 
*/ #define LIBXSMM_EQUAL(T1, T2) LIBXSMM_CONCATENATE3(LIBXSMM_EQUAL_, T1, T2) #define LIBXSMM_EQUAL_floatfloat 1 #define LIBXSMM_EQUAL_doubledouble 1 #define LIBXSMM_EQUAL_floatdouble 0 #define LIBXSMM_EQUAL_doublefloat 0 #define LIBXSMM_EQUAL_shortdouble 0 #define LIBXSMM_EQUAL_shortfloat 0 #if defined(LIBXSMM_BLAS_CONST) # undef LIBXSMM_BLAS_CONST # define LIBXSMM_BLAS_CONST const #elif defined(OPENBLAS_CONST) # define LIBXSMM_BLAS_CONST OPENBLAS_CONST #elif defined(LIBXSMM_BLAS_NONCONST) || defined(__OPENBLAS) || defined(__OPENBLAS77) # define LIBXSMM_BLAS_CONST #else # define LIBXSMM_BLAS_CONST const #endif #if !defined(LIBXSMM_NO_BLAS) # if (!defined(__BLAS) || (0 != __BLAS)) # define LIBXSMM_NO_BLAS 0 # define LIBXSMM_BLAS 1 # else # define LIBXSMM_NO_BLAS 1 # define LIBXSMM_BLAS 0 # endif #endif #if defined(__BLAS) && (1 == __BLAS) # if defined(__OPENBLAS) LIBXSMM_EXTERN void openblas_set_num_threads(int num_threads); # define LIBXSMM_BLAS_INIT openblas_set_num_threads(1); # endif #endif #if !defined(LIBXSMM_BLAS_INIT) # define LIBXSMM_BLAS_INIT #endif #if defined(LIBXSMM_BUILD) # if defined(LIBXSMM_BUILD_EXT) && !defined(__STATIC) # define LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_APIEXT # elif defined(LIBXSMM_NO_BLAS) && (1 == LIBXSMM_NO_BLAS) # define LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_API # endif #endif #if !defined(LIBXSMM_BLAS_SYMBOL_VISIBILITY) # define LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_EXTERN LIBXSMM_VISIBILITY_IMPORT LIBXSMM_RETARGETABLE #endif #if defined(NOTHROW) # define LIBXSMM_BLAS_NOTHROW NOTHROW #else # define LIBXSMM_BLAS_NOTHROW LIBXSMM_NOEXCEPT #endif #define LIBXSMM_BLAS_NOEXCEPT(KIND) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_NOEXCEPT_, KIND) #if defined(LIBXSMM_MKL_VERSION3) && (LIBXSMM_VERSION3(2020, 0, 2) <= LIBXSMM_MKL_VERSION3) # define LIBXSMM_BLAS_NOEXCEPT_gemm_batch LIBXSMM_BLAS_NOTHROW #else # define LIBXSMM_BLAS_NOEXCEPT_gemm_batch #endif #define LIBXSMM_BLAS_NOEXCEPT_gemm LIBXSMM_BLAS_NOTHROW #define LIBXSMM_BLAS_NOEXCEPT_gemv 
LIBXSMM_BLAS_NOTHROW #define LIBXSMM_BLAS_SYMBOL_SIGNATURE_gemm_batch(CONST_STAR, STAR, TYPE) char CONST_STAR, char CONST_STAR, \ libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, \ TYPE CONST_STAR, TYPE CONST_STAR STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR STAR, libxsmm_blasint CONST_STAR, \ TYPE CONST_STAR, TYPE STAR STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR #define LIBXSMM_BLAS_SYMBOL_SIGNATURE_gemm(CONST_STAR, STAR, TYPE) char CONST_STAR, char CONST_STAR, \ libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR, TYPE CONST_STAR, libxsmm_blasint CONST_STAR, \ TYPE CONST_STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR, TYPE STAR, libxsmm_blasint CONST_STAR #define LIBXSMM_BLAS_SYMBOL_SIGNATURE_gemv(CONST_STAR, STAR, TYPE) char CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, \ TYPE CONST_STAR, TYPE CONST_STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR, libxsmm_blasint CONST_STAR, \ TYPE CONST_STAR, TYPE STAR, libxsmm_blasint CONST_STAR #define LIBXSMM_BLAS_SYMBOL_SIGNATURE(CONST_STAR, STAR, TYPE, KIND) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_SYMBOL_SIGNATURE_, KIND)(CONST_STAR, STAR, TYPE) #define LIBXSMM_BLAS_SYMBOL_FDECL(CONST_STAR, STAR, TYPE, KIND) LIBXSMM_BLAS_SYMBOL_VISIBILITY \ void LIBXSMM_BLAS_SYMBOL(TYPE, KIND)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(CONST_STAR, STAR, TYPE, KIND)) LIBXSMM_BLAS_NOEXCEPT(KIND) #define LIBXSMM_BLAS_SYMBOL_CDECL(CONST_STAR, STAR, TYPE, KIND) LIBXSMM_BLAS_SYMBOL_VISIBILITY \ void LIBXSMM_CBLAS_SYMBOL(TYPE, KIND)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(CONST_STAR, STAR, TYPE, KIND)) LIBXSMM_BLAS_NOEXCEPT(KIND) #if (0 != LIBXSMM_BLAS) /* BLAS available */ # define LIBXSMM_BLAS_SYMBOL_DECL(TYPE, KIND) LIBXSMM_BLAS_DECL(TYPE, KIND, LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, TYPE, KIND)) #else # define LIBXSMM_BLAS_SYMBOL_DECL(TYPE, KIND) #endif /** Helper macro consolidating the transpose 
requests into a set of flags. */ #define LIBXSMM_GEMM_FLAGS(TRANSA, TRANSB) /* check for N/n rather than T/t since C/c is also valid! */ \ ((('n' == (TRANSA) || *"N" == (TRANSA)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_A) \ | (('n' == (TRANSB) || *"N" == (TRANSB)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B)) /** Helper macro consolidating CBLAS transpose requests into a set of flags. */ #define LIBXSMM_GEMM_CFLAGS(TRANSA, TRANSB) /* check for N/n rather than T/t since C/c is also valid! */ \ ((CblasNoTrans == (TRANSA) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_A) \ | (CblasNoTrans == (TRANSB) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B)) /** Helper macro consolidating the transpose requests into a set of flags. */ #define LIBXSMM_GEMM_VNNI_FLAGS(TRANSA, TRANSB, VNNIA, VNNIB) /* check for N/n rather than T/t since C/c is also valid! */ \ ((('n' == (TRANSA) || *"N" == (TRANSA)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_A) \ | (('n' == (TRANSB) || *"N" == (TRANSB)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B) \ | (('n' == (VNNIA) || *"N" == (VNNIA)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_VNNI_A) \ | (('n' == (VNNIB) || *"N" == (VNNIB)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_VNNI_B)) /** Helper macro allowing NULL-requests (transposes) supplied by some default. */ #define LIBXSMM_GEMM_PFLAGS(TRANSA, TRANSB, DEFAULT) LIBXSMM_GEMM_FLAGS( \ NULL != ((const void*)(TRANSA)) ? (*(const char*)(TRANSA)) : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & (DEFAULT)) ? 'n' : 't'), \ NULL != ((const void*)(TRANSB)) ? (*(const char*)(TRANSB)) : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & (DEFAULT)) ? 'n' : 't')) \ | (~(LIBXSMM_GEMM_FLAG_TRANS_A | LIBXSMM_GEMM_FLAG_TRANS_B) & (DEFAULT)) /** Inlinable GEMM exercising the compiler's code generation (macro template). TODO: only NN is supported and SP/DP matrices. 
*/ #define LIBXSMM_INLINE_XGEMM(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) { \ /* Use 'n' (instead of 'N') avoids warning about "no macro replacement within a character constant". */ \ const char libxsmm_inline_xgemm_transa_ = (char)(NULL != ((void*)(TRANSA)) ? (*(const char*)(TRANSA)) : \ (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & LIBXSMM_FLAGS) ? 'n' : 't')); \ const char libxsmm_inline_xgemm_transb_ = (char)(NULL != ((void*)(TRANSB)) ? (*(const char*)(TRANSB)) : \ (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & LIBXSMM_FLAGS) ? 'n' : 't')); \ const libxsmm_blasint libxsmm_inline_xgemm_m_ = *(const libxsmm_blasint*)(M); /* must be specified */ \ const libxsmm_blasint libxsmm_inline_xgemm_k_ = (NULL != ((void*)(K)) ? (*(const libxsmm_blasint*)(K)) : libxsmm_inline_xgemm_m_); \ const libxsmm_blasint libxsmm_inline_xgemm_n_ = (NULL != ((void*)(N)) ? (*(const libxsmm_blasint*)(N)) : libxsmm_inline_xgemm_k_); \ const libxsmm_blasint libxsmm_inline_xgemm_lda_ = (NULL != ((void*)(LDA)) ? (*(const libxsmm_blasint*)(LDA)) : \ (('n' == libxsmm_inline_xgemm_transa_ || *"N" == libxsmm_inline_xgemm_transa_) ? libxsmm_inline_xgemm_m_ : libxsmm_inline_xgemm_k_)); \ const libxsmm_blasint libxsmm_inline_xgemm_ldb_ = (NULL != ((void*)(LDB)) ? (*(const libxsmm_blasint*)(LDB)) : \ (('n' == libxsmm_inline_xgemm_transb_ || *"N" == libxsmm_inline_xgemm_transb_) ? libxsmm_inline_xgemm_k_ : libxsmm_inline_xgemm_n_)); \ const libxsmm_blasint libxsmm_inline_xgemm_ldc_ = (NULL != ((void*)(LDC)) ? (*(const libxsmm_blasint*)(LDC)) : libxsmm_inline_xgemm_m_); \ const OTYPE libxsmm_inline_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \ const OTYPE libxsmm_inline_xgemm_beta_ = (NULL != ((void*)(BETA)) ? 
(*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \ libxsmm_blasint libxsmm_inline_xgemm_ni_, libxsmm_inline_xgemm_mi_ = 0, libxsmm_inline_xgemm_ki_; /* loop induction variables */ \ LIBXSMM_ASSERT('n' == libxsmm_inline_xgemm_transa_ || *"N" == libxsmm_inline_xgemm_transa_); \ LIBXSMM_ASSERT('n' == libxsmm_inline_xgemm_transb_ || *"N" == libxsmm_inline_xgemm_transb_); \ LIBXSMM_PRAGMA_SIMD \ for (libxsmm_inline_xgemm_mi_ = 0; libxsmm_inline_xgemm_mi_ < libxsmm_inline_xgemm_m_; ++libxsmm_inline_xgemm_mi_) { \ LIBXSMM_PRAGMA_LOOP_COUNT(1, LIBXSMM_CONFIG_MAX_DIM, LIBXSMM_CONFIG_AVG_DIM) \ for (libxsmm_inline_xgemm_ki_ = 0; libxsmm_inline_xgemm_ki_ < libxsmm_inline_xgemm_k_; ++libxsmm_inline_xgemm_ki_) { \ LIBXSMM_PRAGMA_UNROLL \ for (libxsmm_inline_xgemm_ni_ = 0; libxsmm_inline_xgemm_ni_ < libxsmm_inline_xgemm_n_; ++libxsmm_inline_xgemm_ni_) { \ ((OTYPE*)(C))[libxsmm_inline_xgemm_ni_*libxsmm_inline_xgemm_ldc_+libxsmm_inline_xgemm_mi_] \ = ((const ITYPE*)(B))[libxsmm_inline_xgemm_ni_*libxsmm_inline_xgemm_ldb_+libxsmm_inline_xgemm_ki_] * \ (((const ITYPE*)(A))[libxsmm_inline_xgemm_ki_*libxsmm_inline_xgemm_lda_+libxsmm_inline_xgemm_mi_] * libxsmm_inline_xgemm_alpha_) \ + ((const OTYPE*)(C))[libxsmm_inline_xgemm_ni_*libxsmm_inline_xgemm_ldc_+libxsmm_inline_xgemm_mi_] * libxsmm_inline_xgemm_beta_; \ } \ } \ } \ } #if (defined(LIBXSMM_INIT) || defined(LIBXSMM_CTOR)) # undef LIBXSMM_INIT # define LIBXSMM_INIT LIBXSMM_ASSERT_MSG(1 < libxsmm_ninit, "LIBXSMM is not initialized"); # define LIBXSMM_INIT_COMPLETED #else # define LIBXSMM_INIT if (2 > libxsmm_ninit) libxsmm_init(); #endif /** Map to appropriate BLAS function (or fallback). The mapping is used, e.g., inside of LIBXSMM_BLAS_XGEMM. 
*/ #define LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, FUNCTION) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_FUNCTION_, LIBXSMM_TPREFIX2(ITYPE, OTYPE, FUNCTION)) #if (0 != LIBXSMM_BLAS) /* Helper macro to eventually (if defined) call libxsmm_init */ # if defined(LIBXSMM_INIT_COMPLETED) # define LIBXSMM_BLAS_FUNCTION_dgemm_batch libxsmm_original_dgemm_batch_function # define LIBXSMM_BLAS_FUNCTION_sgemm_batch libxsmm_original_sgemm_batch_function # define LIBXSMM_BLAS_FUNCTION_dgemm libxsmm_original_dgemm_function # define LIBXSMM_BLAS_FUNCTION_sgemm libxsmm_original_sgemm_function # define LIBXSMM_BLAS_FUNCTION_dgemv libxsmm_original_dgemv_function # define LIBXSMM_BLAS_FUNCTION_sgemv libxsmm_original_sgemv_function # else # define LIBXSMM_BLAS_FUNCTION_dgemm_batch libxsmm_original_dgemm_batch() # define LIBXSMM_BLAS_FUNCTION_sgemm_batch libxsmm_original_sgemm_batch() # define LIBXSMM_BLAS_FUNCTION_dgemm libxsmm_original_dgemm() # define LIBXSMM_BLAS_FUNCTION_sgemm libxsmm_original_sgemm() # define LIBXSMM_BLAS_FUNCTION_dgemv libxsmm_original_dgemv() # define LIBXSMM_BLAS_FUNCTION_sgemv libxsmm_original_sgemv() # endif #else /* no BLAS */ # define LIBXSMM_BLAS_FUNCTION_dgemm_batch libxsmm_blas_error("dgemm_batch") # define LIBXSMM_BLAS_FUNCTION_sgemm_batch libxsmm_blas_error("sgemm_batch") # define LIBXSMM_BLAS_FUNCTION_dgemm libxsmm_blas_error("dgemm") # define LIBXSMM_BLAS_FUNCTION_sgemm libxsmm_blas_error("sgemm") # define LIBXSMM_BLAS_FUNCTION_dgemv libxsmm_blas_error("dgemv") # define LIBXSMM_BLAS_FUNCTION_sgemv libxsmm_blas_error("sgemv") #endif /** Low-precision (BLAS-like) function symbols. 
*/ #define LIBXSMM_BLAS_FUNCTION_wigemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ LIBXSMM_INLINE_XGEMM(short, int, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) #define LIBXSMM_BLAS_FUNCTION_bsgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ LIBXSMM_INLINE_XGEMM(libxsmm_bfloat16, float, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) /** Short-cut macros to construct desired BLAS function symbol. */ #define LIBXSMM_BLAS_FUNCTION1(TYPE, FUNCTION) LIBXSMM_BLAS_FUNCTION(TYPE, TYPE, FUNCTION) #define LIBXSMM_GEMM_BATCH_SYMBOL(TYPE) LIBXSMM_BLAS_FUNCTION1(TYPE, gemm_batch) #define LIBXSMM_GEMM_SYMBOL(TYPE) LIBXSMM_BLAS_FUNCTION1(TYPE, gemm) #define LIBXSMM_GEMV_SYMBOL(TYPE) LIBXSMM_BLAS_FUNCTION1(TYPE, gemv) /** BLAS-based GEMM supplied by the linked LAPACK/BLAS library (macro template). */ #define LIBXSMM_BLAS_XGEMM(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) { \ /* Use 'n' (instead of 'N') avoids warning about "no macro replacement within a character constant". */ \ const char libxsmm_blas_xgemm_transa_ = (char)(NULL != ((void*)(TRANSA)) ? (*(const char*)(TRANSA)) : \ (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & LIBXSMM_FLAGS) ? 'n' : 't')); \ const char libxsmm_blas_xgemm_transb_ = (char)(NULL != ((void*)(TRANSB)) ? (*(const char*)(TRANSB)) : \ (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & LIBXSMM_FLAGS) ? 'n' : 't')); \ const libxsmm_blasint *const libxsmm_blas_xgemm_k_ = (NULL != ((void*)(K)) ? (K) : (M)); \ const libxsmm_blasint *const libxsmm_blas_xgemm_n_ = (NULL != ((void*)(N)) ? (N) : libxsmm_blas_xgemm_k_); \ const libxsmm_blasint libxsmm_blas_xgemm_lda_ = LIBXSMM_MAX(NULL != ((void*)(LDA)) ? *(LDA) : \ *(('n' == libxsmm_blas_xgemm_transa_ || *"N" == libxsmm_blas_xgemm_transa_) ? (M) : libxsmm_blas_xgemm_k_), 1); \ const libxsmm_blasint libxsmm_blas_xgemm_ldb_ = LIBXSMM_MAX(NULL != ((void*)(LDB)) ? 
*(LDB) : \ *(('n' == libxsmm_blas_xgemm_transb_ || *"N" == libxsmm_blas_xgemm_transb_) ? libxsmm_blas_xgemm_k_ : libxsmm_blas_xgemm_n_), 1); \ const libxsmm_blasint libxsmm_blas_xgemm_ldc_ = LIBXSMM_MAX(NULL != ((void*)(LDC)) ? *(LDC) : *(M), 1); \ const OTYPE libxsmm_blas_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \ const OTYPE libxsmm_blas_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \ LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, gemm)(&libxsmm_blas_xgemm_transa_, &libxsmm_blas_xgemm_transb_, \ M, libxsmm_blas_xgemm_n_, libxsmm_blas_xgemm_k_, \ &libxsmm_blas_xgemm_alpha_, (const ITYPE*)(A), &libxsmm_blas_xgemm_lda_, \ (const ITYPE*)(B), &libxsmm_blas_xgemm_ldb_, \ &libxsmm_blas_xgemm_beta_, (ITYPE*)(C), &libxsmm_blas_xgemm_ldc_); \ } /** Helper macros for calling a dispatched function in a row/column-major aware fashion. */ #define LIBXSMM_MMCALL_ABC(FN, A, B, C) \ LIBXSMM_ASSERT(FN); FN(A, B, C) #define LIBXSMM_MMCALL_PRF(FN, A, B, C, PA, PB, PC) { \ LIBXSMM_NOPREFETCH_A(LIBXSMM_UNUSED(PA)); \ LIBXSMM_NOPREFETCH_B(LIBXSMM_UNUSED(PB)); \ LIBXSMM_NOPREFETCH_C(LIBXSMM_UNUSED(PC)); \ LIBXSMM_ASSERT(FN); FN(A, B, C, \ LIBXSMM_GEMM_PREFETCH_A(PA), \ LIBXSMM_GEMM_PREFETCH_B(PB), \ LIBXSMM_GEMM_PREFETCH_C(PC)); \ } #if (0/*LIBXSMM_GEMM_PREFETCH_NONE*/ == LIBXSMM_PREFETCH) # define LIBXSMM_MMCALL_LDX(FN, A, B, C, M, N, K, LDA, LDB, LDC) \ LIBXSMM_MMCALL_ABC(FN, A, B, C) #else # define LIBXSMM_MMCALL_LDX(FN, A, B, C, M, N, K, LDA, LDB, LDC) \ LIBXSMM_MMCALL_PRF(FN, A, B, C, (A) + ((size_t)LDA) * (K), (B) + ((size_t)LDB) * (N), (C) + ((size_t)LDC) * (N)) #endif #define LIBXSMM_MMCALL(FN, A, B, C, M, N, K) LIBXSMM_MMCALL_LDX(FN, A, B, C, M, N, K, M, K, M) /** Calculate problem size from M, N, and K using the correct integer type in order to cover the general case. 
*/ #define LIBXSMM_MNK_SIZE(M, N, K) (((size_t)(M)) * ((size_t)(N)) * ((size_t)(K))) /** Calculate total number of matrix-elements; matrices A, B, C are given per M, N, K, and emphasize (S) the C-size. */ #define LIBXSMM_SIZE(M, N, K, S) \ (((size_t)(M) * (size_t)(K)) + ((size_t)(K) * (size_t)(N)) + \ (((size_t)(S) * (size_t)(M) * (size_t)(N)))) /** Condition based on arithmetic intensity (AI) */ #define LIBXSMM_SMM_AI(M, N, K, S, TYPESIZE) \ ((LIBXSMM_MNK_SIZE(M, N, K) * 2) <= ((size_t)(TYPESIZE) * 4/*AI*/ * LIBXSMM_SIZE(M, N, K, S))) /** Determine whether an SMM is suitable, i.e., small enough. */ #if !defined(LIBXSMM_THRESHOLD_AI) /* traditional MNK-threshold */ # define LIBXSMM_SMM(M, N, K, S, TYPESIZE) (LIBXSMM_MNK_SIZE(M, N, K) <= (LIBXSMM_MAX_MNK)) #else /* threshold based on arithmetic intensity */ # define LIBXSMM_SMM LIBXSMM_SMM_AI #endif /** Fall-back code paths: LIBXSMM_XGEMM_FALLBACK0, and LIBXSMM_XGEMM_FALLBACK1 (macro template). */ #if !defined(LIBXSMM_XGEMM_FALLBACK0) # define LIBXSMM_XGEMM_FALLBACK0(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, gemm)(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) #endif #if !defined(LIBXSMM_XGEMM_FALLBACK1) # define LIBXSMM_XGEMM_FALLBACK1(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, gemm)(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) #endif /** * Execute a specialized function, or use a fallback code path depending on threshold (macro template). 
* LIBXSMM_XGEMM_FALLBACK0 or specialized function: below LIBXSMM_MAX_MNK * LIBXSMM_XGEMM_FALLBACK1: above LIBXSMM_MAX_MNK */ #define LIBXSMM_XGEMM(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) { \ const int libxsmm_xgemm_flags_ = LIBXSMM_GEMM_PFLAGS(TRANSA, TRANSB, LIBXSMM_FLAGS) | LIBXSMM_GEMM_XFLAGS(ITYPE, OTYPE); \ const libxsmm_blasint *const libxsmm_xgemm_k_ = (NULL != (K) ? (K) : (M)); \ const libxsmm_blasint *const libxsmm_xgemm_n_ = (NULL != (N) ? (N) : libxsmm_xgemm_k_); \ const libxsmm_blasint libxsmm_xgemm_lda_ = LIBXSMM_MAX(NULL != ((void*)(LDA)) ? *(LDA) : \ *(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & libxsmm_xgemm_flags_) ? (M) : libxsmm_xgemm_k_), 1); \ const libxsmm_blasint libxsmm_xgemm_ldb_ = LIBXSMM_MAX(NULL != ((void*)(LDB)) ? *(LDB) : \ *(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & libxsmm_xgemm_flags_) ? libxsmm_xgemm_k_ : libxsmm_xgemm_n_), 1); \ const libxsmm_blasint libxsmm_xgemm_ldc_ = LIBXSMM_MAX(NULL != (LDC) ? *(LDC) : *(M), 1); \ if (LIBXSMM_SMM(*(M), *libxsmm_xgemm_n_, *libxsmm_xgemm_k_, 2/*RFO*/, sizeof(OTYPE))) { \ const LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) libxsmm_mmfunction_ = LIBXSMM_MMDISPATCH_SYMBOL2(ITYPE, OTYPE)( \ *(M), *libxsmm_xgemm_n_, *libxsmm_xgemm_k_, &libxsmm_xgemm_lda_, &libxsmm_xgemm_ldb_, &libxsmm_xgemm_ldc_, \ (const OTYPE*)(ALPHA), (const OTYPE*)(BETA), &libxsmm_xgemm_flags_, NULL); \ if (NULL != libxsmm_mmfunction_) { \ LIBXSMM_MMCALL_LDX(libxsmm_mmfunction_, (const ITYPE*)(A), (const ITYPE*)(B), (OTYPE*)(C), \ *(M), *libxsmm_xgemm_n_, *libxsmm_xgemm_k_, libxsmm_xgemm_lda_, libxsmm_xgemm_ldb_, libxsmm_xgemm_ldc_); \ } \ else { \ const char libxsmm_xgemm_transa_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & libxsmm_xgemm_flags_) ? 'n' : 't'); \ const char libxsmm_xgemm_transb_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & libxsmm_xgemm_flags_) ? 'n' : 't'); \ const OTYPE libxsmm_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? 
(*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \ const OTYPE libxsmm_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \ LIBXSMM_XGEMM_FALLBACK0(ITYPE, OTYPE, &libxsmm_xgemm_transa_, &libxsmm_xgemm_transb_, \ M, libxsmm_xgemm_n_, libxsmm_xgemm_k_, \ &libxsmm_xgemm_alpha_, A, &libxsmm_xgemm_lda_, \ B, &libxsmm_xgemm_ldb_, \ &libxsmm_xgemm_beta_, C, &libxsmm_xgemm_ldc_); \ } \ } \ else { \ const char libxsmm_xgemm_transa_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & libxsmm_xgemm_flags_) ? 'n' : 't'); \ const char libxsmm_xgemm_transb_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & libxsmm_xgemm_flags_) ? 'n' : 't'); \ const OTYPE libxsmm_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \ const OTYPE libxsmm_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \ LIBXSMM_XGEMM_FALLBACK1(ITYPE, OTYPE, &libxsmm_xgemm_transa_, &libxsmm_xgemm_transb_, \ M, libxsmm_xgemm_n_, libxsmm_xgemm_k_, \ &libxsmm_xgemm_alpha_, A, &libxsmm_xgemm_lda_, \ B, &libxsmm_xgemm_ldb_, \ &libxsmm_xgemm_beta_, C, &libxsmm_xgemm_ldc_); \ } \ } /** Helper macro to setup a matrix with some initial values. 
*/ #define LIBXSMM_MATINIT_AUX(OMP, TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) { \ /*const*/ double libxsmm_matinit_seed_ = (double)(SEED); /* avoid constant conditional */ \ const double libxsmm_matinit_scale_ = (SCALE) * libxsmm_matinit_seed_ + (SCALE); \ const libxsmm_blasint libxsmm_matinit_nrows_ = (libxsmm_blasint)NROWS; \ const libxsmm_blasint libxsmm_matinit_ld_ = (libxsmm_blasint)LD; \ libxsmm_blasint libxsmm_matinit_i_ = 0, libxsmm_matinit_j_ = 0; \ LIBXSMM_OMP_VAR(libxsmm_matinit_i_); LIBXSMM_OMP_VAR(libxsmm_matinit_j_); \ if (0 != libxsmm_matinit_seed_) { \ OMP(parallel for private(libxsmm_matinit_i_, libxsmm_matinit_j_)) \ for (libxsmm_matinit_i_ = 0; libxsmm_matinit_i_ < ((libxsmm_blasint)NCOLS); ++libxsmm_matinit_i_) { \ for (libxsmm_matinit_j_ = 0; libxsmm_matinit_j_ < libxsmm_matinit_nrows_; ++libxsmm_matinit_j_) { \ const libxsmm_blasint libxsmm_matinit_k_ = libxsmm_matinit_i_ * libxsmm_matinit_ld_ + libxsmm_matinit_j_; \ (DST)[libxsmm_matinit_k_] = (TYPE)(libxsmm_matinit_scale_ * (1.0 + \ libxsmm_matinit_i_ * libxsmm_matinit_nrows_ + libxsmm_matinit_j_)); \ } \ for (; libxsmm_matinit_j_ < libxsmm_matinit_ld_; ++libxsmm_matinit_j_) { \ const libxsmm_blasint libxsmm_matinit_k_ = libxsmm_matinit_i_ * libxsmm_matinit_ld_ + libxsmm_matinit_j_; \ (DST)[libxsmm_matinit_k_] = (TYPE)(SEED); \ } \ } \ } \ else { /* shuffle based initialization */ \ const unsigned int libxsmm_matinit_maxval_ = ((unsigned int)NCOLS) * ((unsigned int)libxsmm_matinit_ld_); \ const TYPE libxsmm_matinit_maxval2_ = (TYPE)(libxsmm_matinit_maxval_ / 2), libxsmm_matinit_inv_ = (TYPE)((SCALE) / libxsmm_matinit_maxval2_); \ const size_t libxsmm_matinit_shuffle_ = libxsmm_shuffle(libxsmm_matinit_maxval_); \ OMP(parallel for private(libxsmm_matinit_i_, libxsmm_matinit_j_)) \ for (libxsmm_matinit_i_ = 0; libxsmm_matinit_i_ < ((libxsmm_blasint)NCOLS); ++libxsmm_matinit_i_) { \ for (libxsmm_matinit_j_ = 0; libxsmm_matinit_j_ < libxsmm_matinit_ld_; ++libxsmm_matinit_j_) { \ const 
libxsmm_blasint libxsmm_matinit_k_ = libxsmm_matinit_i_ * libxsmm_matinit_ld_ + libxsmm_matinit_j_; \ (DST)[libxsmm_matinit_k_] = libxsmm_matinit_inv_ * /* normalize values to an interval of [-1, +1] */ \ ((TYPE)(libxsmm_matinit_shuffle_ * libxsmm_matinit_k_ % libxsmm_matinit_maxval_) - libxsmm_matinit_maxval2_); \ } \ } \ } \ } #define LIBXSMM_MATINIT(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) \ LIBXSMM_MATINIT_AUX(LIBXSMM_ELIDE, TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) #define LIBXSMM_MATINIT_SEQ(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) \ LIBXSMM_MATINIT(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) #define LIBXSMM_MATINIT_OMP(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) \ LIBXSMM_MATINIT_AUX(LIBXSMM_PRAGMA_OMP, TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) /** Call libxsmm_gemm_print using LIBXSMM's GEMM-flags. */ #define LIBXSMM_GEMM_PRINT(OSTREAM, PRECISION, FLAGS, M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC) \ LIBXSMM_GEMM_PRINT2(OSTREAM, PRECISION, PRECISION, FLAGS, M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC) #define LIBXSMM_GEMM_PRINT2(OSTREAM, IPREC, OPREC, FLAGS, M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC) \ libxsmm_gemm_dprint2(OSTREAM, (libxsmm_gemm_precision)(IPREC), (libxsmm_gemm_precision)(OPREC), \ /* Use 'n' (instead of 'N') avoids warning about "no macro replacement within a character constant". */ \ (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & (FLAGS)) ? 'n' : 't'), \ (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & (FLAGS)) ? 'n' : 't'), \ M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC) /** * Utility function, which either prints information about the GEMM call * or dumps (FILE/ostream=0) all input and output data into MHD files. * The Meta Image Format (MHD) is suitable for visual inspection using, * e.g., ITK-SNAP or ParaView. 
*/ LIBXSMM_API void libxsmm_gemm_print(void* ostream, libxsmm_gemm_precision precision, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc); LIBXSMM_API void libxsmm_gemm_print2(void* ostream, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc); LIBXSMM_API void libxsmm_gemm_dprint(void* ostream, libxsmm_gemm_precision precision, char transa, char transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, double dalpha, const void* a, libxsmm_blasint lda, const void* b, libxsmm_blasint ldb, double dbeta, void* c, libxsmm_blasint ldc); LIBXSMM_API void libxsmm_gemm_dprint2(void* ostream, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, char transa, char transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, double dalpha, const void* a, libxsmm_blasint lda, const void* b, libxsmm_blasint ldb, double dbeta, void* c, libxsmm_blasint ldc); LIBXSMM_API void libxsmm_gemm_xprint(void* ostream, libxsmm_xmmfunction kernel, const void* a, const void* b, void* c); /** GEMM_BATCH: fallback prototype functions served by any compliant LAPACK/BLAS. */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dgemm_batch_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm_batch)); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sgemm_batch_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm_batch)); /** GEMM: fallback prototype functions served by any compliant LAPACK/BLAS. 
*/ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dgemm_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm)); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sgemm_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm)); /** GEMV: fallback prototype functions served by any compliant LAPACK/BLAS. */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dgemv_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemv)); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sgemv_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemv)); /** Helper function to consume arguments when called. */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sink_function)(LIBXSMM_VARIADIC); /** The original BLAS functions. */ LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch_function); LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch_function); LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_dgemm_function libxsmm_original_dgemm_function); LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_sgemm_function libxsmm_original_sgemm_function); LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_dgemv_function libxsmm_original_dgemv_function); LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_sgemv_function libxsmm_original_sgemv_function); LIBXSMM_API libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch(void); LIBXSMM_API libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch(void); LIBXSMM_API libxsmm_dgemm_function libxsmm_original_dgemm(void); LIBXSMM_API libxsmm_sgemm_function libxsmm_original_sgemm(void); LIBXSMM_API libxsmm_dgemv_function libxsmm_original_dgemv(void); LIBXSMM_API libxsmm_sgemv_function libxsmm_original_sgemv(void); LIBXSMM_API libxsmm_sink_function libxsmm_blas_error(const char* symbol); LIBXSMM_API void libxsmm_sink(LIBXSMM_VARIADIC); /** * General dense matrix multiplication, which re-exposes LAPACK/BLAS * but allows to 
rely on LIBXSMM's defaults (libxsmm_config.h) * when supplying NULL-arguments in certain places. */ LIBXSMM_API void libxsmm_blas_xgemm(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc); #define libxsmm_blas_dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ libxsmm_blas_xgemm(LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64, \ TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) #define libxsmm_blas_sgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ libxsmm_blas_xgemm(LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32, \ TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) #define libxsmm_dgemm_omp(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ libxsmm_xgemm_omp(LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64, \ TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) #define libxsmm_sgemm_omp(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ libxsmm_xgemm_omp(LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32, \ TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) /** Translates GEMM prefetch request into prefetch-enumeration (incl. FE's auto-prefetch). */ LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_xprefetch(const int* prefetch); LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_prefetch(int prefetch); #endif /*LIBXSMM_FRONTEND_H*/ libxsmm-1.17/include/libxsmm_fsspmdm.h000066400000000000000000000043771415223013700201260ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. 
* * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_FSSPMDM_H #define LIBXSMM_FSSPMDM_H #include "libxsmm_typedefs.h" /** Opaque types for fsspmdm */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dfsspmdm libxsmm_dfsspmdm; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_sfsspmdm libxsmm_sfsspmdm; LIBXSMM_API libxsmm_dfsspmdm* libxsmm_dfsspmdm_create( libxsmm_blasint M, libxsmm_blasint N, libxsmm_blasint K, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const double alpha, const double beta, libxsmm_blasint c_is_nt, const double* a_dense ); LIBXSMM_API void libxsmm_dfsspmdm_execute( const libxsmm_dfsspmdm* handle, const double* B, double* C ); LIBXSMM_API void libxsmm_dfsspmdm_destroy( libxsmm_dfsspmdm* handle ); LIBXSMM_API libxsmm_sfsspmdm* libxsmm_sfsspmdm_create( libxsmm_blasint M, libxsmm_blasint N, libxsmm_blasint K, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const float alpha, const float beta, libxsmm_blasint c_is_nt, const float* a_dense ); LIBXSMM_API void libxsmm_sfsspmdm_execute( const libxsmm_sfsspmdm* handle, const float* B, float* C ); LIBXSMM_API void libxsmm_sfsspmdm_destroy( libxsmm_sfsspmdm* handle ); #endif /*LIBXSMM_FSSPMDM_H*/ libxsmm-1.17/include/libxsmm_generator.h000066400000000000000000000414131415223013700204330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_GENERATOR_H #define LIBXSMM_GENERATOR_H #include "libxsmm_typedefs.h" #define LIBXSMM_GEMM_NO_BYPASS(FLAGS, ALPHA, BETA) ( \ 0 == ((FLAGS) & (LIBXSMM_GEMM_FLAG_TRANS_A)) && \ (LIBXSMM_FEQ(1, ALPHA) /*|| LIBXSMM_FEQ(-1, ALPHA)*/) && \ (LIBXSMM_FEQ(1, BETA) || LIBXSMM_FEQ(0, BETA))) /** Initialize GEMM descriptor as used by low-level routines (type-specific). */ LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_dgemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, double alpha, double beta, int flags, int prefetch); LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_sgemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, float alpha, float beta, int flags, int prefetch); LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_wigemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, int alpha, int beta, int flags, int prefetch); LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bigemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, int alpha, int beta, int flags, int prefetch); LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bbgemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, int alpha, int beta, 
int flags, int prefetch); LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bsgemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, float alpha, float beta, int flags, int prefetch); LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bgemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, float alpha, float beta, int flags, int prefetch); /** Initialize GEMM descriptor (generic: double-precision alpha/beta). */ LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_dinit(libxsmm_descriptor_blob* blob, libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, double alpha, double beta, int flags, int prefetch); LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_dinit2(libxsmm_descriptor_blob* blob, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, double alpha, double beta, int flags, int prefetch); /** Initialize GEMM descriptor as used by low-level routines (generic). 
*/ LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta, int flags, int prefetch); LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init2(libxsmm_descriptor_blob* blob, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta, int flags, int prefetch); /** Similar to libxsmm_gemm_descriptor_init2 with optional type-converted alpha/beta (dalpha/dbeta). */ LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init3(libxsmm_descriptor_blob* blob, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta, int flags, int prefetch, double* dalpha, double* dbeta); /** Initialize transpose descriptor as used by low-level routines. */ LIBXSMM_API libxsmm_trans_descriptor* libxsmm_trans_descriptor_init(libxsmm_descriptor_blob* blob, unsigned int typesize, unsigned int m, unsigned int n, unsigned int ldo); /** Initialize transpose descriptor as used by low-level routines. */ LIBXSMM_API libxsmm_mcopy_descriptor* libxsmm_mcopy_descriptor_init(libxsmm_descriptor_blob* blob, unsigned int typesize, unsigned int m, unsigned int n, unsigned int ldo, unsigned int ldi, int flags, int prefetch, const int* unroll); /** Initialize transpose descriptor as used by low-level routines. 
*/ LIBXSMM_API libxsmm_meltw_descriptor* libxsmm_meltw_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldo, libxsmm_blasint ldi, int flags, int operation); LIBXSMM_API libxsmm_meltw_descriptor* libxsmm_meltw_descriptor_init2(libxsmm_descriptor_blob* blob, libxsmm_datatype in_type, libxsmm_datatype in2_type, libxsmm_datatype out_type, libxsmm_datatype out2_type, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldo, libxsmm_blasint ldi, libxsmm_blasint ldx, libxsmm_blasint ldy, int flags, int operation); /** Initialize packed trsm descriptor as used by low-level routines. */ LIBXSMM_API libxsmm_trsm_descriptor* libxsmm_trsm_descriptor_init(libxsmm_descriptor_blob* blob, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint lda, libxsmm_blasint ldb, const void* alpha, char transa, char diag, char side, char uplo, int layout); /** Initialize packed trmm descriptor as used by low-level routines. */ LIBXSMM_API libxsmm_trmm_descriptor* libxsmm_trmm_descriptor_init(libxsmm_descriptor_blob* blob, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint lda, libxsmm_blasint ldb, const void* alpha, char transa, char diag, char side, char uplo, int layout); /** Initialize packed getrf descriptor as used by low-level routines. */ LIBXSMM_API libxsmm_getrf_descriptor* libxsmm_getrf_descriptor_init(libxsmm_descriptor_blob* blob, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint lda, int layout); /** Initialize packed pgemm descriptor as used by low-level routines. 
*/ LIBXSMM_API libxsmm_pgemm_descriptor* libxsmm_pgemm_descriptor_init(libxsmm_descriptor_blob* blob, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, char transa, char transb, int layout); /** Structure referring to the generated code with some attached information. */ LIBXSMM_EXTERN_C typedef struct libxsmm_generated_code { void* generated_code; /** pointer to memory which can contain strings or binary code */ unsigned int buffer_size; /** total size if the buffer generated_code */ unsigned int code_size; /** size of bytes used in generated_code */ unsigned int code_type; /** * 0: generated code contains inline assembly in a C function * which can be dumped into a *.c/cc/cpp file * 1: generated code contains assembly which can be * dumped into an *.s file * >1: generated code contains a function in binary code which can be * called, when the code is copied into executable memory */ unsigned int last_error; /** * 0: no error occurred * >0: error code */ unsigned int arch; /* target arch for the current code generation task */ unsigned int sf_size; /* offset of RSP to the beginning of the stack frame * we track this value to have RBP availbale for general compute */ } libxsmm_generated_code; /** function to translate LIBXSMM Generator error codes to error messages */ LIBXSMM_API const char* libxsmm_strerror(unsigned int i_error_code); /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_gemm_inlineasm(const char* i_file_out, const char* i_routine_name, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch ); /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_gemm_directasm(const char* i_file_out, const char* i_routine_name, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch ); LIBXSMM_API void libxsmm_generator_gemm_kernel(libxsmm_generated_code* io_generated_code, const 
libxsmm_gemm_descriptor* i_xgemm_desc ); /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_spgemm(const char* i_file_out, const char* i_routine_name, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const char* i_file_in, const int i_is_csr); /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_spgemm_csc_kernel(libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values); /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_spgemm_csr_kernel(libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values); /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_spgemm_csr_reg_kernel(libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values); /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_spgemm_csr_soa_kernel(libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ); /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_spgemm_csc_soa_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ); /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_packed_gemm_ac_rm( libxsmm_generated_code* 
io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width, const char* i_arch ); /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_packed_gemm_bc_rm( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width, const char* i_arch ); LIBXSMM_API void libxsmm_generator_pgemm_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_pgemm_descriptor* i_packed_pgemm_desc, int i_arch, ... ); LIBXSMM_API void libxsmm_generator_getrf_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_getrf_descriptor* i_packed_pgemm_desc, int i_arch ); LIBXSMM_API void libxsmm_generator_trmm_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_trmm_descriptor* i_packed_trmm_desc, const char* i_arch ); LIBXSMM_API void libxsmm_generator_trsm_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_trsm_descriptor* i_packed_trsm_desc, const char* i_arch ); /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_matcopy_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_mcopy_descriptor* i_matcopy_desc, const char* i_arch ); LIBXSMM_API void libxsmm_generator_mateltwise_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_meltw_descriptor* i_mateltw_desc ); LIBXSMM_API void libxsmm_generator_transpose_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_trans_descriptor* i_trans_desc, int i_arch ); /** Initialization counter that can be used to check whether the library is initialized (!=0) or not (==0). */ LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_ninit); /** Target architecture (libxsmm_get_target_archid, libxsmm_set_target_archid). */ LIBXSMM_APIVAR_PUBLIC(int libxsmm_target_archid); /** Verbosity level (0: quiet, 1: errors, 2: warnings, 3: info, neg.: all/dump). */ LIBXSMM_APIVAR_PUBLIC(int libxsmm_verbosity); /** Security-enhanced environment. 
*/ LIBXSMM_APIVAR_PUBLIC(int libxsmm_se); #endif /*LIBXSMM_GENERATOR_H*/ libxsmm-1.17/include/libxsmm_intrinsics_x86.h000066400000000000000000001531431415223013700213430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_INTRINSICS_X86_H #define LIBXSMM_INTRINSICS_X86_H #include "libxsmm_cpuid.h" /** https://github.com/intel/Immintrin-debug */ #if !defined(LIBXSMM_INTRINSICS_DEBUG) && 0 # define LIBXSMM_INTRINSICS_DEBUG #endif #if defined(LIBXSMM_INTRINSICS_DEBUG) # include "immintrin_dbg.h" # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX # if !defined(_mm512_undefined_epi32) # define _mm512_undefined_epi32() _mm512_set1_epi32(0) # endif # if !defined(_mm256_movemask_epi8) # define _mm256_movemask_epi8 mm256_movemask_epi8_dbg LIBXSMM_API_INLINE int mm256_movemask_epi8_dbg(__m256i k) { unsigned char mask[32], i; int result = 0; _mm256_storeu_si256((__m256i*)mask, k); for (i = 0; i < 32; ++i) result |= (mask[i] >> 7) << i; return result; } # endif # if !defined(_mm512_and_epi32) # define _mm512_and_epi32 mm512_and_epi32_dbg LIBXSMM_API_INLINE __m512i mm512_and_epi32_dbg(__m512i a, __m512i b) { uint32_t a16[16], b16[16]; signed char i; _mm512_storeu_si512((__m512i*)a16, a); _mm512_storeu_si512((__m512i*)b16, b); for (i = 0; i < 16; ++i) a16[i] &= b16[i]; return _mm512_loadu_si512((const __m512i*)a16); } # endif # if !defined(_mm512_or_epi32) # define _mm512_or_epi32 mm512_or_epi32_dbg LIBXSMM_API_INLINE __m512i mm512_or_epi32_dbg(__m512i a, 
__m512i b) { uint32_t a16[16], b16[16]; signed char i; _mm512_storeu_si512((__m512i*)a16, a); _mm512_storeu_si512((__m512i*)b16, b); for (i = 0; i < 16; ++i) a16[i] |= b16[i]; return _mm512_loadu_si512((const __m512i*)a16); } # endif # if !defined(_mm512_xor_epi32) # define _mm512_xor_epi32 mm512_xor_epi32_dbg LIBXSMM_API_INLINE __m512i mm512_xor_epi32_dbg(__m512i a, __m512i b) { uint32_t a16[16], b16[16]; signed char i; _mm512_storeu_si512((__m512i*)a16, a); _mm512_storeu_si512((__m512i*)b16, b); for (i = 0; i < 16; ++i) a16[i] ^= b16[i]; return _mm512_loadu_si512((const __m512i*)a16); } # endif # if !defined(_mm512_srli_epi32_dbg) /* GCC: avoid conflict w/ built-in */ # undef _mm512_srli_epi32 # define _mm512_srli_epi32 mm512_srli_epi32_dbg LIBXSMM_API_INLINE __m512i mm512_srli_epi32_dbg(__m512i a, unsigned int imm8) { uint32_t a16[16]; signed char i; _mm512_storeu_si512((__m512i*)a16, a); for (i = 0; i < 16; ++i) a16[i] >>= imm8; return _mm512_loadu_si512((const __m512i*)a16); } # endif # if !defined(_mm512_slli_epi32_dbg) /* GCC: avoid conflict w/ built-in */ # undef _mm512_slli_epi32 # define _mm512_slli_epi32 mm512_slli_epi32_dbg LIBXSMM_API_INLINE __m512i mm512_slli_epi32_dbg(__m512i a, unsigned int imm8) { uint32_t a16[16]; signed char i; _mm512_storeu_si512((__m512i*)a16, a); for (i = 0; i < 16; ++i) a16[i] <<= imm8; return _mm512_loadu_si512((const __m512i*)a16); } # endif # if !defined(_mm512_sub_ps) # define _mm512_sub_ps mm512_sub_ps_dbg LIBXSMM_API_INLINE __m512 mm512_sub_ps_dbg(__m512 a, __m512 b) { float a16[16], b16[16]; signed char i; _mm512_storeu_ps((__m512*)a16, a); _mm512_storeu_ps((__m512*)b16, b); for (i = 0; i < 16; ++i) a16[i] -= b16[i]; return _mm512_loadu_ps((const __m512*)a16); } # endif #endif /** Macro evaluates to LIBXSMM_ATTRIBUTE_TARGET_xxx (see below). 
*/ #define LIBXSMM_ATTRIBUTE_TARGET(TARGET) LIBXSMM_CONCATENATE(LIBXSMM_ATTRIBUTE_TARGET_, TARGET) #if /*no intrinsics: tested with 17.x and 18.x*/(defined(__PGI) && \ LIBXSMM_VERSION2(19, 0) > LIBXSMM_VERSION2(__PGIC__, __PGIC_MINOR__)) \ || /*legacy*/(defined(_CRAYC) && !defined(__GNUC__)) # if !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_INTRINSICS_STATIC) # define LIBXSMM_INTRINSICS_NONE # endif #elif !defined(LIBXSMM_INTRINSICS_STATIC) && !defined(LIBXSMM_INTRINSICS_NONE) && ( \ (defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && \ LIBXSMM_VERSION2(4, 4) > LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) /* GCC 4.4 (target-attribute) */ \ || (defined(__clang__) && LIBXSMM_VERSION2(3, 7) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) \ || (defined(__APPLE__) && defined(__MACH__) && !defined(LIBXSMM_INTEL_COMPILER) && defined(__clang__) && \ LIBXSMM_VERSION2(9, 0) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) # define LIBXSMM_INTRINSICS_STATIC #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if defined(__MIC__) && !defined(LIBXSMM_INTRINSICS_NONE) # if !defined(LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_TARGET_ARCH_GENERIC # endif # define LIBXSMM_INTRINSICS(TARGET) # define LIBXSMM_INTRINSICS_INCLUDE #elif !defined(LIBXSMM_INTRINSICS_NONE) /*!defined(__MIC__)*/ # if defined(__AVX512F__) && defined(__AVX512CD__) \ && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__) \ && defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \ && (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) /* TODO: check GCC, Clang, etc. 
*/ \ || (LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \ && (!defined(__clang__) || (LIBXSMM_VERSION2( 9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \ && (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(99, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) # if !defined(LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX # endif # define LIBXSMM_INTRINSICS_INCLUDE # elif defined(__AVX512F__) && defined(__AVX512CD__) \ && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__) \ && defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \ && (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \ || (LIBXSMM_VERSION2(8, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \ && (!defined(__clang__) || (LIBXSMM_VERSION2(6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \ && (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) # if !defined(LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX # endif # define LIBXSMM_INTRINSICS_INCLUDE # elif defined(__AVX512F__) && defined(__AVX512CD__) \ && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) \ && defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \ && (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \ || (LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \ && (!defined(__clang__) || (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \ && (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) # if 
!defined(LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE # endif # define LIBXSMM_INTRINSICS_INCLUDE # elif defined(__AVX512F__) && defined(__AVX512CD__) \ && defined(__AVX512PF__) && defined(__AVX512ER__) \ && defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \ && (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \ || (LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \ && (!defined(__clang__) || (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \ && (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) # if !defined(LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_MIC # endif # define LIBXSMM_INTRINSICS_INCLUDE # elif defined(__AVX512F__) && defined(__AVX512CD__) \ && defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \ && (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \ || (LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \ && (!defined(__clang__) || (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \ && (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) # if !defined(LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512 # endif # define LIBXSMM_INTRINSICS_INCLUDE # elif defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) # if !defined(LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2 # endif # define LIBXSMM_INTRINSICS_INCLUDE # elif defined(__AVX__) && defined(__SSE4_2__) && 
defined(__SSE4_1__) && defined(__SSE3__) # if !defined(LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX # endif # define LIBXSMM_INTRINSICS_INCLUDE # elif defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) # if !defined(LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_SSE4 # endif # define LIBXSMM_INTRINSICS_INCLUDE # elif defined(__SSE3__) # if !defined(LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_SSE3 # endif # define LIBXSMM_INTRINSICS_INCLUDE # elif defined(LIBXSMM_PLATFORM_X86) # if !defined(LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_GENERIC # endif # if defined(__GNUC__) # define LIBXSMM_INTRINSICS_INCLUDE # endif # endif # if defined(LIBXSMM_STATIC_TARGET_ARCH) && !defined(LIBXSMM_INTRINSICS_STATIC) # if defined(__INTEL_COMPILER) /* TODO: compiler version check for LIBXSMM_MAX_STATIC_TARGET_ARCH */ # if 1904 <= (LIBXSMM_INTEL_COMPILER) && !defined(_WIN32) # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX # elif 1801 <= (LIBXSMM_INTEL_COMPILER) # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX # elif 1500 <= (LIBXSMM_INTEL_COMPILER) # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE # elif 1400 <= (LIBXSMM_INTEL_COMPILER) # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_MIC # else # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2 # endif # define LIBXSMM_INTRINSICS(TARGET)/*no need for target flags*/ # define LIBXSMM_INTRINSICS_INCLUDE # elif defined(_CRAYC) && defined(__GNUC__) /* TODO: version check, e.g., LIBXSMM_VERSION2(11, 5) <= LIBXSMM_VERSION2(_RELEASE, _RELEASE_MINOR) */ # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX # define LIBXSMM_INTRINSICS(TARGET)/*no need for target flags*/ # define LIBXSMM_INTRINSICS_INCLUDE # elif defined(_MSC_VER) && !defined(__clang__) /* TODO: compiler version check for LIBXSMM_MAX_STATIC_TARGET_ARCH */ # define 
LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2 # define LIBXSMM_INTRINSICS(TARGET)/*no need for target flags*/ # define LIBXSMM_INTRINSICS_INCLUDE # elif (!defined(__GNUC__) || LIBXSMM_VERSION2(4, 9) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \ && (!defined(__clang__) || LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) \ && (!defined(__APPLE__) || !defined(__MACH__)) && !defined(__PGI) && !defined(_MSC_VER) # if defined(__CYGWIN__) && !defined(LIBXSMM_INTRINSICS_DEBUG) /* Cygwin: invalid register for .seh_savexmm */ # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2 # elif (defined(__clang__) && LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX # elif (defined(__GNUC__) && LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \ || (defined(__clang__) && LIBXSMM_VERSION2( 9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) && !defined(__cray__)) # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX # elif (defined(__GNUC__) && LIBXSMM_VERSION2(8, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \ || (defined(__clang__) && LIBXSMM_VERSION2(6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX # elif (defined(__GNUC__) && LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \ || (defined(__clang__) && LIBXSMM_VERSION2(6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE # else # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2 # endif # define LIBXSMM_INTRINSICS_INCLUDE # else /* GCC/legacy incl. 
Clang */ # if defined(__clang__) && !(defined(__APPLE__) && defined(__MACH__)) && !defined(_WIN32) # if (LIBXSMM_VERSION2(7, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) /* TODO */ /* no limitations */ # elif (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) # if !defined(LIBXSMM_INTRINSICS_STATIC) && (LIBXSMM_STATIC_TARGET_ARCH < LIBXSMM_X86_AVX2/*workaround*/) # define LIBXSMM_INTRINSICS_STATIC # endif # elif !defined(LIBXSMM_INTRINSICS_STATIC) # define LIBXSMM_INTRINSICS_STATIC # endif # if defined(__CYGWIN__) && !defined(LIBXSMM_INTRINSICS_DEBUG) /* Cygwin: invalid register for .seh_savexmm */ # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2 # elif LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX # elif LIBXSMM_VERSION2( 9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) && !defined(__cray__) # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX # elif LIBXSMM_VERSION2( 6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX # else # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE # endif # else /* fall-back */ # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_STATIC_TARGET_ARCH # if !defined(LIBXSMM_INTRINSICS_STATIC) && (LIBXSMM_STATIC_TARGET_ARCH < LIBXSMM_X86_AVX2/*workaround*/) # define LIBXSMM_INTRINSICS_STATIC # endif # endif # if !defined(LIBXSMM_INTRINSICS_INCLUDE) && (!defined(__PGI) || LIBXSMM_VERSION2(19, 0) <= LIBXSMM_VERSION2(__PGIC__, __PGIC_MINOR__)) # define LIBXSMM_INTRINSICS_INCLUDE # endif # endif /* GCC/legacy incl. Clang */ # if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) # error "LIBXSMM_MAX_STATIC_TARGET_ARCH not defined!" 
# endif # if defined(LIBXSMM_INTRINSICS_INCLUDE) && !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_INTRINSICS_DEBUG) # include # endif /*defined(LIBXSMM_INTRINSICS_INCLUDE)*/ # if !defined(LIBXSMM_INTRINSICS) # if (LIBXSMM_MAX_STATIC_TARGET_ARCH > LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_INTRINSICS(TARGET) LIBXSMM_ATTRIBUTE(LIBXSMM_ATTRIBUTE_TARGET(TARGET)) /* LIBXSMM_ATTRIBUTE_TARGET_xxx is required to literally match the CPUID (libxsmm_cpuid.h)! */ # define LIBXSMM_ATTRIBUTE_TARGET_1002 target("sse2") /* LIBXSMM_X86_GENERIC (64-bit ABI) */ # if (LIBXSMM_X86_SSE3 <= LIBXSMM_MAX_STATIC_TARGET_ARCH) # define LIBXSMM_ATTRIBUTE_TARGET_1003 target("sse3") # else # define LIBXSMM_ATTRIBUTE_TARGET_1003 LIBXSMM_ATTRIBUTE_TARGET_1002 # endif # if (LIBXSMM_X86_SSE4 <= LIBXSMM_MAX_STATIC_TARGET_ARCH) # define LIBXSMM_ATTRIBUTE_TARGET_1004 target("sse4.1,sse4.2") # else # define LIBXSMM_ATTRIBUTE_TARGET_1004 LIBXSMM_ATTRIBUTE_TARGET_1003 # endif # if (LIBXSMM_X86_AVX <= LIBXSMM_MAX_STATIC_TARGET_ARCH) # define LIBXSMM_ATTRIBUTE_TARGET_1005 target("avx") # else # define LIBXSMM_ATTRIBUTE_TARGET_1005 LIBXSMM_ATTRIBUTE_TARGET_1004 # endif # if (LIBXSMM_X86_AVX2 <= LIBXSMM_MAX_STATIC_TARGET_ARCH) # define LIBXSMM_ATTRIBUTE_TARGET_1006 target("avx2,fma") # else # define LIBXSMM_ATTRIBUTE_TARGET_1006 LIBXSMM_ATTRIBUTE_TARGET_1005 # endif # if (LIBXSMM_X86_AVX512 <= LIBXSMM_MAX_STATIC_TARGET_ARCH) # define LIBXSMM_ATTRIBUTE_TARGET_1007 target("avx2,fma,avx512f,avx512cd") # else # define LIBXSMM_ATTRIBUTE_TARGET_1007 LIBXSMM_ATTRIBUTE_TARGET_1006 # endif # if (LIBXSMM_X86_AVX512_MIC <= LIBXSMM_MAX_STATIC_TARGET_ARCH) # define LIBXSMM_ATTRIBUTE_TARGET_1010 target("avx2,fma,avx512f,avx512cd,avx512pf,avx512er") # else /* LIBXSMM_X86_AVX512 */ # define LIBXSMM_ATTRIBUTE_TARGET_1010 LIBXSMM_ATTRIBUTE_TARGET_1007 # endif # if (LIBXSMM_X86_AVX512_KNM <= LIBXSMM_MAX_STATIC_TARGET_ARCH) # define LIBXSMM_ATTRIBUTE_TARGET_1011 
target("avx2,fma,avx512f,avx512cd,avx512pf,avx512er,avx5124vnniw,avx5124fmaps") # else /* LIBXSMM_X86_AVX512_MIC */ # define LIBXSMM_ATTRIBUTE_TARGET_1011 LIBXSMM_ATTRIBUTE_TARGET_1010 # endif # if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_MAX_STATIC_TARGET_ARCH) # define LIBXSMM_ATTRIBUTE_TARGET_1020 target("avx2,fma,avx512f,avx512cd,avx512dq,avx512bw,avx512vl") # else /* LIBXSMM_X86_AVX512 */ # define LIBXSMM_ATTRIBUTE_TARGET_1020 LIBXSMM_ATTRIBUTE_TARGET_1007 # endif # if (LIBXSMM_X86_AVX512_CLX <= LIBXSMM_MAX_STATIC_TARGET_ARCH) # define LIBXSMM_ATTRIBUTE_TARGET_1021 target("avx2,fma,avx512f,avx512cd,avx512dq,avx512bw,avx512vl,avx512vnni") # else /* LIBXSMM_X86_AVX512_CORE */ # define LIBXSMM_ATTRIBUTE_TARGET_1021 LIBXSMM_ATTRIBUTE_TARGET_1020 # endif # if (LIBXSMM_X86_AVX512_CPX <= LIBXSMM_MAX_STATIC_TARGET_ARCH) # define LIBXSMM_ATTRIBUTE_TARGET_1022 target("avx2,fma,avx512f,avx512cd,avx512dq,avx512bw,avx512vl,avx512vnni,avx512bf16") # else /* LIBXSMM_X86_AVX512_CORE */ # define LIBXSMM_ATTRIBUTE_TARGET_1022 LIBXSMM_ATTRIBUTE_TARGET_1021 # endif # else # define LIBXSMM_INTRINSICS(TARGET)/*no need for target flags*/ # endif # elif !defined(LIBXSMM_INTRINSICS_TARGET) # define LIBXSMM_INTRINSICS_TARGET # endif /*!defined(LIBXSMM_INTRINSICS)*/ # endif /*defined(LIBXSMM_STATIC_TARGET_ARCH)*/ #endif /*!defined(LIBXSMM_INTRINSICS_NONE)*/ #if !defined(LIBXSMM_STATIC_TARGET_ARCH) # if !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_INTRINSICS_STATIC) # define LIBXSMM_INTRINSICS_NONE # endif # define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_TARGET_ARCH_GENERIC #endif #if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_STATIC_TARGET_ARCH #elif (LIBXSMM_MAX_STATIC_TARGET_ARCH < LIBXSMM_STATIC_TARGET_ARCH) # undef LIBXSMM_MAX_STATIC_TARGET_ARCH # define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_STATIC_TARGET_ARCH #endif #if !defined(LIBXSMM_INTRINSICS) # define LIBXSMM_INTRINSICS(TARGET) #endif /** Include basic x86 intrinsics such as 
__rdtsc. */ #if defined(LIBXSMM_INTRINSICS_INCLUDE) && !defined(LIBXSMM_INTRINSICS_DEBUG) # if defined(_WIN32) # include # elif defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) || defined(__clang__) || defined(__PGI) # include # elif defined(__GNUC__) && (LIBXSMM_VERSION2(4, 4) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) # include # endif # include # if defined(__SSE3__) # include # endif #endif #if !defined(LIBXSMM_INTRINSICS_NONE) # if defined(_WIN32) # include # else # include # endif #endif /** * Intrinsic-specific fix-ups */ #if defined(__clang__) # define LIBXSMM_INTRINSICS_LDDQU_SI128(A) _mm_loadu_si128(A) #else # define LIBXSMM_INTRINSICS_LDDQU_SI128(A) _mm_lddqu_si128(A) #endif #if !defined(LIBXSMM_INTEL_COMPILER) && defined(__clang__) && ( \ (LIBXSMM_VERSION2(3, 9) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) \ || (LIBXSMM_VERSION2(7, 3) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__) && \ defined(__APPLE__) && defined(__MACH__))) /* prototypes with incorrect signature: _mm512_load_ps takes DP*, _mm512_load_pd takes SP* (checked with v3.8.1) */ # define LIBXSMM_INTRINSICS_MM512_LOAD_PS(A) _mm512_loadu_ps((const double*)(A)) # define LIBXSMM_INTRINSICS_MM512_LOAD_PD(A) _mm512_loadu_pd((const float*)(A)) /* Clang misses _mm512_stream_p? (checked with v3.8.1). 
*/ # define LIBXSMM_INTRINSICS_MM512_STREAM_SI512(A, B) _mm512_store_si512(A, B) # define LIBXSMM_INTRINSICS_MM512_STREAM_PS(A, B) _mm512_storeu_ps(A, B) # define LIBXSMM_INTRINSICS_MM512_STREAM_PD(A, B) _mm512_store_pd(A, B) #else # define LIBXSMM_INTRINSICS_MM512_LOAD_PS(A) _mm512_loadu_ps((const float*)(A)) # define LIBXSMM_INTRINSICS_MM512_LOAD_PD(A) _mm512_loadu_pd((const double*)(A)) # define LIBXSMM_INTRINSICS_MM512_STREAM_SI512(A, B) _mm512_stream_si512((__m512i*)(A), (B)) # define LIBXSMM_INTRINSICS_MM512_STREAM_PS(A, B) _mm512_stream_ps(A, B) # define LIBXSMM_INTRINSICS_MM512_STREAM_PD(A, B) _mm512_stream_pd(A, B) #endif #if !defined(LIBXSMM_INTEL_COMPILER) || (defined(__clang__) && ( \ (LIBXSMM_VERSION2(8, 0) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)))) \ || (defined(__APPLE__) && defined(__MACH__)) || defined(__GNUC__) # define LIBXSMM_INTRINSICS_MM256_STORE_EPI32(A, B) _mm256_storeu_si256((__m256i*)(A), B) #else # define LIBXSMM_INTRINSICS_MM256_STORE_EPI32(A, B) _mm256_storeu_epi32(A, B) #endif #if defined(LIBXSMM_INTEL_COMPILER) # if 1600 <= (LIBXSMM_INTEL_COMPILER) # define LIBXSMM_INTRINSICS_MM512_SET_EPI16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \ E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) \ _mm512_set_epi16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \ E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) # else # define LIBXSMM_INTRINSICS_MM512_SET_EPI16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \ E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) \ _mm512_castps_si512(_mm512_set_epi16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \ E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0)) # endif #else # define LIBXSMM_INTRINSICS_MM512_SET_EPI16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, 
E17, E16, \ E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) \ _mm512_set_epi32(((E31) << 16) | (E30), ((E29) << 16) | (E28), ((E27) << 16) | (E26), ((E25) << 16) | (E24), \ ((E23) << 16) | (E22), ((E21) << 16) | (E20), ((E19) << 16) | (E18), ((E17) << 16) | (E16), \ ((E15) << 16) | (E14), ((E13) << 16) | (E12), ((E11) << 16) | (E10), ((E9) << 16) | (E8), \ ((E7) << 16) | (E6), ((E5) << 16) | (E4), ((E3) << 16) | (E2), ((E1) << 16) | (E0)) #endif #if defined(LIBXSMM_INTEL_COMPILER) \ || (defined(__GNUC__) && LIBXSMM_VERSION2(7, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \ || (defined(__clang__) && (!defined(__APPLE__) || !defined(__MACH__)) \ && LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) # define LIBXSMM_INTRINSICS_MM512_MASK_I32GATHER_EPI32(A, B, C, D, E) _mm512_mask_i32gather_epi32(A, B, C, D, E) # define LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(A, B) _mm512_extracti64x4_epi64(A, B) # define LIBXSMM_INTRINSICS_MM512_ABS_PS(A) _mm512_abs_ps(A) # define LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32() _mm512_undefined_epi32() # define LIBXSMM_INTRINSICS_MM512_UNDEFINED() _mm512_undefined() # define LIBXSMM_INTRINSICS_MM_UNDEFINED_PD() _mm_undefined_pd() #else # define LIBXSMM_INTRINSICS_MM512_MASK_I32GATHER_EPI32(A, B, C, D, E) _mm512_castps_si512(_mm512_mask_i32gather_ps( \ _mm512_castsi512_ps(A), B, C, (const float*)(D), E)) # define LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(A, B) _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(A), B)) # define LIBXSMM_INTRINSICS_MM512_ABS_PS(A) _mm512_castsi512_ps(_mm512_and_epi32( \ _mm512_castps_si512(A), _mm512_set1_epi32(0x7FFFFFFF))) # define LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32() _mm512_set1_epi32(0) # define LIBXSMM_INTRINSICS_MM512_UNDEFINED() _mm512_set1_ps(0) # define LIBXSMM_INTRINSICS_MM_UNDEFINED_PD() _mm_set1_pd(0) #endif #if (defined(LIBXSMM_INTEL_COMPILER) && (1800 <= (LIBXSMM_INTEL_COMPILER))) \ || (!defined(LIBXSMM_INTEL_COMPILER) 
&& defined(__GNUC__) \ && LIBXSMM_VERSION2(7, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \ || ((!defined(__APPLE__) || !defined(__MACH__)) && defined(__clang__) \ && LIBXSMM_VERSION2(8, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) # define LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, NBITS) \ LIBXSMM_CONCATENATE(_store_mask, NBITS)((LIBXSMM_CONCATENATE(__mmask, NBITS)*)(DST_PTR), SRC) # define LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, NBITS) \ LIBXSMM_CONCATENATE(_load_mask, NBITS)((/*const*/ LIBXSMM_CONCATENATE(__mmask, NBITS)*)(SRC_PTR)) # define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, NBITS) LIBXSMM_CONCATENATE(_cvtu32_mask, NBITS)((unsigned int)(A)) #elif defined(LIBXSMM_INTEL_COMPILER) # define LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, NBITS) \ (*(LIBXSMM_CONCATENATE(__mmask, NBITS)*)(DST_PTR) = (LIBXSMM_CONCATENATE(__mmask, NBITS))(SRC)) # define LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, NBITS) \ ((LIBXSMM_CONCATENATE(__mmask, NBITS))_mm512_mask2int(*(const __mmask16*)(SRC_PTR))) # define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, NBITS) LIBXSMM_CONCATENATE(LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_, NBITS)(A) # define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_32(A) ((__mmask32)(A)) # define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_16(A) _mm512_int2mask((int)(A)) # define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_8(A) ((__mmask8)(A)) #else # define LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, NBITS) \ (*(LIBXSMM_CONCATENATE(__mmask, NBITS)*)(DST_PTR) = (LIBXSMM_CONCATENATE(__mmask, NBITS))(SRC)) # define LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, NBITS) (*(const LIBXSMM_CONCATENATE(__mmask, NBITS)*)(SRC_PTR)) # define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, NBITS) ((LIBXSMM_CONCATENATE(__mmask, NBITS))(A)) #endif #define LIBXSMM_INTRINSICS_MM512_STORE_MASK64(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 64) #define LIBXSMM_INTRINSICS_MM512_STORE_MASK32(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 32) 
#define LIBXSMM_INTRINSICS_MM512_STORE_MASK16(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 16) #define LIBXSMM_INTRINSICS_MM512_STORE_MASK8(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 8) #define LIBXSMM_INTRINSICS_MM512_LOAD_MASK64(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 64) #define LIBXSMM_INTRINSICS_MM512_LOAD_MASK32(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 32) #define LIBXSMM_INTRINSICS_MM512_LOAD_MASK16(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 16) #define LIBXSMM_INTRINSICS_MM512_LOAD_MASK8(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 8) #define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK32(A) LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, 32) #define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK16(A) LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, 16) #define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK8(A) LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, 8) /** * Pseudo intrinsics for portability */ LIBXSMM_API_INLINE int LIBXSMM_INTRINSICS_BITSCANFWD32_SW(unsigned int n) { unsigned int i, r = 0; if (0 != n) for (i = 1; 0 == (n & i); i <<= 1) { ++r; } return r; } LIBXSMM_API_INLINE int LIBXSMM_INTRINSICS_BITSCANFWD64_SW(unsigned long long n) { unsigned int i, r = 0; if (0 != n) for (i = 1; 0 == (n & i); i <<= 1) { ++r; } return r; } /** Binary Logarithm (based on Stackoverflow's NBITSx macro). */ #define LIBXSMM_INTRINSICS_BITSCANBWD_SW02(N) (0 != ((N) & 0x2/*0b10*/) ? 1 : 0) #define LIBXSMM_INTRINSICS_BITSCANBWD_SW04(N) (0 != ((N) & 0xC/*0b1100*/) ? (2 | LIBXSMM_INTRINSICS_BITSCANBWD_SW02((N) >> 2)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW02(N)) #define LIBXSMM_INTRINSICS_BITSCANBWD_SW08(N) (0 != ((N) & 0xF0/*0b11110000*/) ? (4 | LIBXSMM_INTRINSICS_BITSCANBWD_SW04((N) >> 4)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW04(N)) #define LIBXSMM_INTRINSICS_BITSCANBWD_SW16(N) (0 != ((N) & 0xFF00) ? 
(8 | LIBXSMM_INTRINSICS_BITSCANBWD_SW08((N) >> 8)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW08(N)) #define LIBXSMM_INTRINSICS_BITSCANBWD_SW32(N) (0 != ((N) & 0xFFFF0000) ? (16 | LIBXSMM_INTRINSICS_BITSCANBWD_SW16((N) >> 16)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW16(N)) #define LIBXSMM_INTRINSICS_BITSCANBWD_SW64(N) (0 != ((N) & 0xFFFFFFFF00000000) ? (32 | LIBXSMM_INTRINSICS_BITSCANBWD_SW32((N) >> 32)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW32(N)) #define LIBXSMM_INTRINSICS_BITSCANBWD32_SW(N) LIBXSMM_INTRINSICS_BITSCANBWD_SW32((unsigned int)(N)) #define LIBXSMM_INTRINSICS_BITSCANBWD64_SW(N) LIBXSMM_INTRINSICS_BITSCANBWD_SW64((unsigned long long)(N)) #if defined(_WIN32) && !defined(LIBXSMM_INTRINSICS_NONE) LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANFWD32(unsigned int n) { unsigned long r = 0; _BitScanForward(&r, n); return (0 != n) * r; } LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANBWD32(unsigned int n) { unsigned long r = 0; _BitScanReverse(&r, n); return r; } # if defined(_WIN64) LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANFWD64(unsigned long long n) { unsigned long r = 0; _BitScanForward64(&r, n); return (0 != n) * r; } LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANBWD64(unsigned long long n) { unsigned long r = 0; _BitScanReverse64(&r, n); return r; } # else # define LIBXSMM_INTRINSICS_BITSCANFWD64 LIBXSMM_INTRINSICS_BITSCANFWD64_SW # define LIBXSMM_INTRINSICS_BITSCANBWD64 LIBXSMM_INTRINSICS_BITSCANBWD64_SW # endif #elif defined(__GNUC__) && !defined(LIBXSMM_INTRINSICS_NONE) # define LIBXSMM_INTRINSICS_BITSCANFWD32(N) ((0 != (N)) * __builtin_ctz(N)) # define LIBXSMM_INTRINSICS_BITSCANFWD64(N) ((0 != (N)) * __builtin_ctzll(N)) # define LIBXSMM_INTRINSICS_BITSCANBWD32(N) ((0 != (N)) * (31 - __builtin_clz(N))) # define LIBXSMM_INTRINSICS_BITSCANBWD64(N) ((0 != (N)) * (63 - __builtin_clzll(N))) #else /* fall-back implementation */ # define LIBXSMM_INTRINSICS_BITSCANFWD32 LIBXSMM_INTRINSICS_BITSCANFWD32_SW # define 
LIBXSMM_INTRINSICS_BITSCANFWD64 LIBXSMM_INTRINSICS_BITSCANFWD64_SW # define LIBXSMM_INTRINSICS_BITSCANBWD32 LIBXSMM_INTRINSICS_BITSCANBWD32_SW # define LIBXSMM_INTRINSICS_BITSCANBWD64 LIBXSMM_INTRINSICS_BITSCANBWD64_SW #endif /** LIBXSMM_NBITS determines the minimum number of bits needed to represent N. */ #define LIBXSMM_NBITS(N) (LIBXSMM_INTRINSICS_BITSCANBWD64(N) + LIBXSMM_MIN(1, N)) #define LIBXSMM_ISQRT2(N) ((unsigned int)((1ULL << (LIBXSMM_NBITS(N) >> 1)) /*+ LIBXSMM_MIN(1, N)*/)) /** LIBXSMM_ILOG2 definition matches ceil(log2(N)). */ LIBXSMM_API_INLINE unsigned int LIBXSMM_ILOG2(unsigned long long n) { unsigned int result = 0; if (1 < n) { const unsigned int m = LIBXSMM_INTRINSICS_BITSCANBWD64(n); result = m + ((unsigned int)LIBXSMM_INTRINSICS_BITSCANBWD64(n - 1) == m); } return result; } /** * Target attribution */ #if !defined(LIBXSMM_INTRINSICS_KNC) && !defined(LIBXSMM_INTRINSICS_NONE) && defined(__MIC__) # define LIBXSMM_INTRINSICS_KNC #endif /** LIBXSMM_INTRINSICS_X86 is defined only if the compiler is able to generate this code without special flags. */ #if !defined(LIBXSMM_INTRINSICS_X86) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH || \ (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_GENERIC <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) # define LIBXSMM_INTRINSICS_X86 #endif /** LIBXSMM_INTRINSICS_SSE3 is defined only if the compiler is able to generate this code without special flags. */ #if !defined(LIBXSMM_INTRINSICS_SSE3) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_SSE3 <= LIBXSMM_STATIC_TARGET_ARCH || \ (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_SSE3 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) # define LIBXSMM_INTRINSICS_SSE3 #endif /** LIBXSMM_INTRINSICS_SSE4 is defined only if the compiler is able to generate this code without special flags. 
*/ #if !defined(LIBXSMM_INTRINSICS_SSE4) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_SSE4 <= LIBXSMM_STATIC_TARGET_ARCH || \ (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_SSE4 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) # define LIBXSMM_INTRINSICS_SSE4 #endif /** LIBXSMM_INTRINSICS_AVX is defined only if the compiler is able to generate this code without special flags. */ #if !defined(LIBXSMM_INTRINSICS_AVX) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX <= LIBXSMM_STATIC_TARGET_ARCH || \ (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) # define LIBXSMM_INTRINSICS_AVX #endif /** LIBXSMM_INTRINSICS_AVX2 is defined only if the compiler is able to generate this code without special flags. */ #if !defined(LIBXSMM_INTRINSICS_AVX2) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH || \ (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX2 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) # define LIBXSMM_INTRINSICS_AVX2 #endif /** LIBXSMM_INTRINSICS_AVX512 is defined only if the compiler is able to generate this code without special flags. */ #if !defined(LIBXSMM_INTRINSICS_AVX512) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH || \ (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) # define LIBXSMM_INTRINSICS_AVX512 #endif /** LIBXSMM_INTRINSICS_AVX512_MIC is defined only if the compiler is able to generate this code without special flags. */ #if !defined(LIBXSMM_INTRINSICS_AVX512_MIC) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_MIC <= LIBXSMM_STATIC_TARGET_ARCH || \ (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_MIC <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) # define LIBXSMM_INTRINSICS_AVX512_MIC #endif /** LIBXSMM_INTRINSICS_AVX512_KNM is defined only if the compiler is able to generate this code without special flags. 
*/ #if !defined(LIBXSMM_INTRINSICS_AVX512_KNM) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_KNM <= LIBXSMM_STATIC_TARGET_ARCH || \ (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_KNM <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) # define LIBXSMM_INTRINSICS_AVX512_KNM #endif /** LIBXSMM_INTRINSICS_AVX512_CORE is defined only if the compiler is able to generate this code without special flags. */ #if !defined(LIBXSMM_INTRINSICS_AVX512_CORE) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH || \ (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_CORE <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) # define LIBXSMM_INTRINSICS_AVX512_CORE #endif /** LIBXSMM_INTRINSICS_AVX512_CLX is defined only if the compiler is able to generate this code without special flags. */ #if !defined(LIBXSMM_INTRINSICS_AVX512_CLX) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_CLX <= LIBXSMM_STATIC_TARGET_ARCH || \ (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_CLX <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) # define LIBXSMM_INTRINSICS_AVX512_CLX #endif /** LIBXSMM_INTRINSICS_AVX512_CPX is defined only if the compiler is able to generate this code without special flags. 
*/ #if !defined(LIBXSMM_INTRINSICS_AVX512_CPX) && !defined(LIBXSMM_INTRINSICS_NONE) && defined(LIBXSMM_X86_AVX512_CPX) && \ !defined(LIBXSMM_INTRINSICS_STATIC) && (LIBXSMM_X86_AVX512_CPX <= LIBXSMM_MAX_STATIC_TARGET_ARCH) # define LIBXSMM_INTRINSICS_AVX512_CPX #endif /** * Pseudo intrinsics (AVX-512) */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ # define LIBXSMM_INTRINSICS_MM512_QUANTIZE_NEAR_PS_EPI16( A, B ) _mm512_cvtepi32_epi16(_mm512_cvt_roundps_epi32( \ _mm512_mul_ps(LIBXSMM_INTRINSICS_MM512_LOAD_PS(A), B), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(__m512 a) { const __m512i vnaninf = _mm512_set1_epi32(0x7f800000), vrneadd = _mm512_set1_epi32(0x00007fff); const __m512i vfixup = _mm512_set1_epi32(0x00000001), vfixupmask = _mm512_set1_epi32(0x00010000); const __m512i mm512_roundbf16rne_a_ = _mm512_castps_si512(a); const __mmask16 mm512_roundbf16rne_mask1_ = _mm512_cmp_epi32_mask(_mm512_and_epi32(mm512_roundbf16rne_a_, vnaninf), vnaninf, _MM_CMPINT_NE); const __mmask16 mm512_roundbf16rne_mask2_ = _mm512_cmp_epi32_mask(_mm512_and_epi32(mm512_roundbf16rne_a_, vfixupmask), vfixupmask, _MM_CMPINT_EQ); return _mm512_mask_add_epi32(mm512_roundbf16rne_a_, mm512_roundbf16rne_mask1_, mm512_roundbf16rne_a_, _mm512_mask_add_epi32(vrneadd, mm512_roundbf16rne_mask2_, vrneadd, vfixup)); } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m256i LIBXSMM_INTRINSICS_MM512_CVT_FP32_BF16(__m512 a) { return _mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(a), 16)); } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_CVT2_FP32_BF16(__m512 a, __m512 b) { const __m256i aa = _mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(b), 16)); const __m256i bb = _mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(a), 16)); return 
_mm512_inserti64x4(_mm512_inserti64x4(_mm512_setzero_si512(), aa, 0), bb, 1); } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(__m256i a) { return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(a),16)); } /** SVML-intrinsics */ LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_78( __m512 x ) { const __m512 c0 = _mm512_set1_ps(2027025.0f); const __m512 c1 = _mm512_set1_ps(270270.0f); const __m512 c2 = _mm512_set1_ps(6930.0f); const __m512 c3 = _mm512_set1_ps(36.0f); const __m512 c1_d = _mm512_set1_ps(945945.0f); const __m512 c2_d = _mm512_set1_ps(51975.0f); const __m512 c3_d = _mm512_set1_ps(630.0f); const __m512 hi_bound = _mm512_set1_ps(4.97f); const __m512 lo_bound = _mm512_set1_ps(-4.97f); const __m512 ones = _mm512_set1_ps(1.0f); const __m512 neg_ones = _mm512_set1_ps(-1.0f); const __m512 x2 = _mm512_mul_ps( x, x ); const __m512 t1_nom = _mm512_fmadd_ps( c3, x2, c2 ); const __m512 t2_nom = _mm512_fmadd_ps( t1_nom, x2, c1 ); const __m512 t3_nom = _mm512_fmadd_ps( t2_nom, x2, c0 ); const __m512 nom = _mm512_mul_ps( t3_nom, x ); const __m512 t1_denom = _mm512_add_ps( x2, c3_d ); const __m512 t2_denom = _mm512_fmadd_ps( t1_denom, x2, c2_d ); const __m512 t3_denom = _mm512_fmadd_ps( t2_denom, x2, c1_d ); const __m512 denom = _mm512_fmadd_ps( t3_denom, x2, c0 ); const __m512 denom_rcp = _mm512_rcp14_ps( denom ); const __mmask16 mask_hi = _mm512_cmp_ps_mask( x, hi_bound, _CMP_GT_OQ); const __mmask16 mask_lo = _mm512_cmp_ps_mask( x, lo_bound, _CMP_LT_OQ); __m512 result = _mm512_mul_ps( nom, denom_rcp ); result = _mm512_mask_blend_ps(mask_hi, result, ones); result = _mm512_mask_blend_ps(mask_lo, result, neg_ones); return result; } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_32( __m512 x ) { const __m512 c1 = _mm512_set1_ps((float)(1.0/27.0)); const __m512 c2 = 
_mm512_set1_ps((float)(1.0/3)); const __m512 hi_bound = _mm512_set1_ps(3.2f); const __m512 lo_bound = _mm512_set1_ps(-3.2f); const __m512 ones = _mm512_set1_ps(1.0f); const __m512 neg_ones = _mm512_set1_ps(-1.0f); const __m512 x2 = _mm512_mul_ps( x, x ); const __m512 t1_nom = _mm512_fmadd_ps( x2, c1, ones); const __m512 nom = _mm512_mul_ps( t1_nom, x ); const __m512 denom = _mm512_fmadd_ps( x2, c2, ones); const __m512 denom_rcp = _mm512_rcp14_ps( denom ); const __mmask16 mask_hi = _mm512_cmp_ps_mask( x, hi_bound, _CMP_GT_OQ); const __mmask16 mask_lo = _mm512_cmp_ps_mask( x, lo_bound, _CMP_LT_OQ); __m512 result = _mm512_mul_ps(nom, denom_rcp); result = _mm512_mask_blend_ps(mask_hi, result, ones); result = _mm512_mask_blend_ps(mask_lo, result, neg_ones); return result; } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_EXP2( __m512 _x ) { const __m512 twice_log2_e = _mm512_set1_ps((float)(1.442695*2)); const __m512 half = _mm512_set1_ps(0.5f); const __m512 c2 = _mm512_set1_ps(0.240226507f); const __m512 c1 = _mm512_set1_ps(0.452920674f); const __m512 c0 = _mm512_set1_ps(0.713483036f); const __m512 ones = _mm512_set1_ps(1.0f); const __m512 minus_twos = _mm512_set1_ps(-2.0f); const __m512 x = _mm512_fmadd_ps(_x, twice_log2_e, half); #if 1 const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION)); #else const __m512 y = _mm512_reduce_ps(x, 1); #endif const __m512 t1 = _mm512_fmadd_ps( y, c2, c1); const __m512 two_to_y = _mm512_fmadd_ps( y, t1, c0); const __m512 exp = _mm512_scalef_ps( two_to_y, x ); const __m512 denom_rcp = _mm512_rcp14_ps( _mm512_add_ps( exp, ones) ); __m512 result = _mm512_fmadd_ps( denom_rcp, minus_twos, ones); return result; } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_EXP3( __m512 _x ) { const __m512 twice_log2_e = _mm512_set1_ps((float)(1.442695*2)); const __m512 half = _mm512_set1_ps(0.5f); const __m512 c3 = 
_mm512_set1_ps(0.05550410866f); const __m512 c2 = _mm512_set1_ps(0.15697034396f); const __m512 c1 = _mm512_set1_ps(0.49454875509f); const __m512 c0 = _mm512_set1_ps(0.70654502287f); const __m512 ones = _mm512_set1_ps(1.0f); const __m512 minus_twos = _mm512_set1_ps(-2.0f); const __m512 x = _mm512_fmadd_ps(_x, twice_log2_e, half); #if 1 const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION)); #else const __m512 y = _mm512_reduce_ps(x, 1); #endif const __m512 t1 = _mm512_fmadd_ps( y, c3, c2); const __m512 t2 = _mm512_fmadd_ps( y, t1, c1); const __m512 two_to_y = _mm512_fmadd_ps( y, t2, c0); const __m512 exp = _mm512_scalef_ps( two_to_y, x ); const __m512 denom_rcp = _mm512_rcp14_ps( _mm512_add_ps( exp, ones) ); __m512 result = _mm512_fmadd_ps( denom_rcp, minus_twos, ones); return result; } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( __m512 x ) { __m512 result, func_p0, func_p1, func_p2; const __m512i sign_mask = _mm512_set1_epi32( 0x80000000 ); const __m512i sign_filter = _mm512_set1_epi32( 0x7FFFFFFF ); const __m512i lut_low = _mm512_set1_epi32( 246 ); const __m512i lut_high = _mm512_set1_epi32( 261 ); const __m512 tanh_p0_2_reg = _mm512_set_ps( 0.40555000f, 0.11892800f, -0.00972979f, -0.02740300f, -0.0169851f, -0.00776152f, -0.00305889f, -0.00116259f, -0.00041726f, -8.53233e-6f, 1.0000000f, 0.99999800f, 0.99975400f, 0.99268200f, 0.93645300f, 0.73833900f); const __m512 tanh_p1_2_reg = _mm512_set_ps( 0.495602f, 0.88152f, 1.125700000f, 1.17021000f, 1.1289000000f, 1.07929000f, 1.0432300f, 1.023010f, 1.011620f, 1.00164f, 1.56828e-14f, 4.49924e-7f, 0.0000646924f, 0.00260405f, 0.0311608f, 0.168736f); const __m512 tanh_p2_2_reg = _mm512_set_ps(-0.108238f, -0.2384280f, -0.354418000f, -0.38240300f, -0.34135700f, -0.274509000f, -0.20524900f, -0.1511960f, -0.107635f, -0.0466868f, -3.60822e-16f, -2.05971e-8f, -4.24538e-6f, -0.000231709f, -0.00386434f, -0.0277702f); const __m512i 
signs = _mm512_and_epi32(_mm512_castps_si512(x), sign_mask); const __m512i abs_arg = _mm512_and_epi32(_mm512_castps_si512(x), sign_filter); __m512i indices = _mm512_srli_epi32(abs_arg, 22); indices = _mm512_max_epi32(indices, lut_low); indices = _mm512_min_epi32(indices, lut_high); func_p0 = _mm512_permutexvar_ps(indices, tanh_p0_2_reg); func_p1 = _mm512_permutexvar_ps(indices, tanh_p1_2_reg); func_p2 = _mm512_permutexvar_ps(indices, tanh_p2_2_reg); result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), func_p2, func_p1); result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), result, func_p0); result = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(result), signs)); return result; } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX3( __m512 x ) { __m512 result, func_p0, func_p1, func_p2, func_p3; const __m512i sign_mask = _mm512_set1_epi32( 0x80000000 ); const __m512i sign_filter = _mm512_set1_epi32( 0x7FFFFFFF ); const __m512i lut_low = _mm512_set1_epi32( 246 ); const __m512i lut_high = _mm512_set1_epi32( 261 ); const __m512 tanh_p0_3_reg = _mm512_setr_ps( 0.466283000f, 0.82850600f, 0.97437500f, 0.99882600f, 0.9999860f, 1.0000000f, -1.50006e-08f, -7.98169e-06f, -4.53753e-05f, -0.00023755f, -0.00125285f, -0.00572314f, -0.0227717f, -0.0629089f, -0.084234300f, 0.071199800f); const __m512 tanh_p1_3_reg = _mm512_setr_ps( 0.500617f, 0.124369f, 0.0137214f, 0.000464124f, 4.02465e-06f, 0.00000f, 1.00001f, 1.00028f, 1.00112f, 1.00414f, 1.015570f, 1.050950f, 1.1478500f, 1.310130000f, 1.378950000f, 1.07407f); const __m512 tanh_p2_3_reg = _mm512_setr_ps(-0.16133200f, -0.0305526f, -0.00245909f, -6.12647e-05f, -3.76127e-07f, 0.000000f, -0.000245872f, -0.00341151f, -0.00971505f, -0.0256817f, -0.06869110f, -0.162433000f, -0.346828000f, -0.566516f, -0.640214000f, -0.44011900f); const __m512 tanh_p3_3_reg = _mm512_setr_ps( 0.0177393f, 0.00253432f, 0.000147303f, 2.69963e-06f, 1.16764e-08f, 0.0000000f, -0.330125f, 
-0.3176210f, -0.3017760f, -0.27358000f, -0.219375000f, -0.136197000f, -0.01868680f, 0.0808901f, 0.107095f, 0.0631459f); const __m512i signs = _mm512_and_epi32(_mm512_castps_si512(x), sign_mask); const __m512i abs_arg = _mm512_and_epi32(_mm512_castps_si512(x), sign_filter); __m512i indices = _mm512_srli_epi32(abs_arg, 22); indices = _mm512_max_epi32(indices, lut_low); indices = _mm512_min_epi32(indices, lut_high); func_p0 = _mm512_permutexvar_ps(indices, tanh_p0_3_reg); func_p1 = _mm512_permutexvar_ps(indices, tanh_p1_3_reg); func_p2 = _mm512_permutexvar_ps(indices, tanh_p2_3_reg); func_p3 = _mm512_permutexvar_ps(indices, tanh_p3_3_reg); result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), func_p3, func_p2); result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), result, func_p1); result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), result, func_p0); result = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(result), signs)); return result; } #if defined(LIBXSMM_INTEL_COMPILER) # define LIBXSMM_INTRINSICS_MM512_TANH_PS(A) _mm512_tanh_ps(A) # define LIBXSMM_INTRINSICS_MM512_EXP_PS(A) _mm512_exp_ps(A) #else # if !defined(LIBXSMM_NO_LIBM) # include # endif LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS(__m512 a) { float a16[16]; int i; _mm512_storeu_ps(a16, a); for (i = 0; i < 16; ++i) a16[i] = LIBXSMM_TANHF(a16[i]); return _mm512_loadu_ps(a16); } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_EXP_PS(__m512 a) { float a16[16]; int i; _mm512_storeu_ps(a16, a); for (i = 0; i < 16; ++i) a16[i] = LIBXSMM_EXPF(a16[i]); return _mm512_loadu_ps(a16); } #endif /* SVML */ /** 2048-bit state for xoshiro128+ RNG */ #define LIBXSMM_INTRINSICS_MM512_RNG_STATE(INDEX) (*(__m512i*)LIBXSMM_CONCATENATE(libxsmm_intrinsics_mm512_rng_state, INDEX)) LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_intrinsics_mm512_rng_state0[16]); LIBXSMM_APIVAR_PUBLIC(unsigned int 
libxsmm_intrinsics_mm512_rng_state1[16]); LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_intrinsics_mm512_rng_state2[16]); LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_intrinsics_mm512_rng_state3[16]); # if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0 LIBXSMM_PRAGMA_OPTIMIZE_OFF /* avoid ICE in case of symbols (-g) */ # endif /** Generate random number in the interval [0, 1); not thread-safe. * this is based on xoshiro128+ 1.0, e.g. http://prng.di.unimi.it/xoshiro128plus.c */ LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EPI32(void) { const __m512i result = _mm512_add_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(0), LIBXSMM_INTRINSICS_MM512_RNG_STATE(3)); const __m512i s = _mm512_slli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(1), 9); __m512i t; LIBXSMM_INTRINSICS_MM512_RNG_STATE(2) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(2), LIBXSMM_INTRINSICS_MM512_RNG_STATE(0)); LIBXSMM_INTRINSICS_MM512_RNG_STATE(3) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(3), LIBXSMM_INTRINSICS_MM512_RNG_STATE(1)); LIBXSMM_INTRINSICS_MM512_RNG_STATE(1) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(1), LIBXSMM_INTRINSICS_MM512_RNG_STATE(2)); LIBXSMM_INTRINSICS_MM512_RNG_STATE(0) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(0), LIBXSMM_INTRINSICS_MM512_RNG_STATE(3)); LIBXSMM_INTRINSICS_MM512_RNG_STATE(2) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(2), s); t = _mm512_slli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(3), 11); LIBXSMM_INTRINSICS_MM512_RNG_STATE(3) = _mm512_or_epi32(t, _mm512_srli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(3), 32 - 11)); return result; } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_RNG_PS(void) { const __m512i rng_mantissa = _mm512_srli_epi32( LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EPI32(), 9 ); const __m512 one = _mm512_set1_ps(1.0f); return 
_mm512_sub_ps(_mm512_castsi512_ps(_mm512_or_epi32(_mm512_set1_epi32(0x3f800000), rng_mantissa)), one); } /** Generate random number in the interval [0, 1); thread save, state needs to be managed by user. * this is based on xoshiro128+ 1.0, e.g. http://prng.di.unimi.it/xoshiro128plus.c */ LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EXTSTATE_EPI32( unsigned int* stateptr ) { __m512i state_0 = _mm512_loadu_si512( stateptr ); __m512i state_1 = _mm512_loadu_si512( stateptr+16 ); __m512i state_2 = _mm512_loadu_si512( stateptr+32 ); __m512i state_3 = _mm512_loadu_si512( stateptr+48 ); const __m512i result = _mm512_add_epi32(state_0, state_3); const __m512i s = _mm512_slli_epi32(state_1, 9); __m512i t; state_2 = _mm512_xor_epi32(state_2, state_0); state_3 = _mm512_xor_epi32(state_3, state_1); state_1 = _mm512_xor_epi32(state_1, state_2); state_0 = _mm512_xor_epi32(state_0, state_3); state_2 = _mm512_xor_epi32(state_2, s); _mm512_storeu_si512( stateptr , state_0 ); _mm512_storeu_si512( stateptr+16, state_1 ); _mm512_storeu_si512( stateptr+32, state_2 ); t = _mm512_slli_epi32(state_3, 11); state_3 = _mm512_or_epi32(t, _mm512_srli_epi32(state_3, 32 - 11)); _mm512_storeu_si512( stateptr+48, state_3 ); return result; } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_RNG_EXTSTATE_PS( unsigned int* stateptr) { const __m512i rng_mantissa = _mm512_srli_epi32( LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EXTSTATE_EPI32( stateptr ), 9 ); const __m512 one = _mm512_set1_ps(1.0f); return _mm512_sub_ps(_mm512_castsi512_ps(_mm512_or_epi32(_mm512_set1_epi32(0x3f800000), rng_mantissa)), one); } # if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0 LIBXSMM_PRAGMA_OPTIMIZE_ON # endif #endif /*__AVX512F__*/ #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #endif /*LIBXSMM_INTRINSICS_X86_H*/ 
libxsmm-1.17/include/libxsmm_macros.h000066400000000000000000001247341415223013700177410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_MACROS_H #define LIBXSMM_MACROS_H #include "libxsmm_config.h" /** Parameters the library was built for. */ #define LIBXSMM_CACHELINE LIBXSMM_CONFIG_CACHELINE #define LIBXSMM_ALIGNMENT LIBXSMM_CONFIG_ALIGNMENT #define LIBXSMM_MALLOC LIBXSMM_CONFIG_MALLOC #define LIBXSMM_ILP64 LIBXSMM_CONFIG_ILP64 #define LIBXSMM_SYNC LIBXSMM_CONFIG_SYNC #define LIBXSMM_JIT LIBXSMM_CONFIG_JIT /** Parameters of GEMM domain (static kernels, etc). */ #define LIBXSMM_PREFETCH LIBXSMM_CONFIG_PREFETCH #define LIBXSMM_MAX_MNK LIBXSMM_CONFIG_MAX_MNK #define LIBXSMM_MAX_DIM LIBXSMM_CONFIG_MAX_DIM #define LIBXSMM_MAX_M LIBXSMM_CONFIG_MAX_M #define LIBXSMM_MAX_N LIBXSMM_CONFIG_MAX_N #define LIBXSMM_MAX_K LIBXSMM_CONFIG_MAX_K #define LIBXSMM_FLAGS LIBXSMM_CONFIG_FLAGS #define LIBXSMM_ALPHA LIBXSMM_CONFIG_ALPHA #define LIBXSMM_BETA LIBXSMM_CONFIG_BETA /** * Use "make PLATFORM=1" to disable platform checks. * The platform check is to bail-out with an error * message for an attempt to build an upstream package * and subsequently to list LIBXSMM as "broken" on * that platform. * Note: successful compilation on an unsupported * platform is desired, but only fallback code is * present at best. 
*/ #if !defined(LIBXSMM_PLATFORM_FORCE) && 0 # define LIBXSMM_PLATFORM_FORCE #endif #if !defined(LIBXSMM_PLATFORM_X86) && ( \ (defined(__x86_64__) && 0 != (__x86_64__)) || \ (defined(__amd64__) && 0 != (__amd64__)) || \ (defined(_M_X64) || defined(_M_AMD64)) || \ (defined(__i386__) && 0 != (__i386__)) || \ (defined(_M_IX86))) # define LIBXSMM_PLATFORM_X86 #endif #if !defined(LIBXSMM_PLATFORM_SUPPORTED) # if defined(LIBXSMM_PLATFORM_X86) # define LIBXSMM_PLATFORM_SUPPORTED # elif !defined(LIBXSMM_PLATFORM_FORCE) # error Intel Architecture or compatible CPU required! # endif #endif #if !defined(LIBXSMM_BITS) # if (defined(__SIZEOF_PTRDIFF_T__) && 4 < (__SIZEOF_PTRDIFF_T__)) || \ (defined(__SIZE_MAX__) && (4294967295U < (__SIZE_MAX__))) || \ (defined(__x86_64__) && 0 != (__x86_64__)) || \ (defined(__amd64__) && 0 != (__amd64__)) || \ (defined(_M_X64) || defined(_M_AMD64)) || \ (defined(_WIN64)) || \ (defined(__powerpc64)) # define LIBXSMM_UNLIMITED 0xFFFFFFFFFFFFFFFF # define LIBXSMM_BITS 64 # elif !defined(LIBXSMM_PLATFORM_FORCE) && defined(NDEBUG) # error LIBXSMM is only supported on 64-bit platforms! # else /* JIT-generated code (among other issues) is not supported! */ # define LIBXSMM_UNLIMITED 0xFFFFFFFF # define LIBXSMM_BITS 32 # endif #endif #define LIBXSMM_STRINGIFY2(SYMBOL) #SYMBOL #define LIBXSMM_STRINGIFY(SYMBOL) LIBXSMM_STRINGIFY2(SYMBOL) #define LIBXSMM_TOSTRING(SYMBOL) LIBXSMM_STRINGIFY(SYMBOL) #define LIBXSMM_CONCATENATE2(A, B) A##B #define LIBXSMM_CONCATENATE3(A, B, C) LIBXSMM_CONCATENATE(LIBXSMM_CONCATENATE(A, B), C) #define LIBXSMM_CONCATENATE4(A, B, C, D) LIBXSMM_CONCATENATE(LIBXSMM_CONCATENATE3(A, B, C), D) #define LIBXSMM_CONCATENATE(A, B) LIBXSMM_CONCATENATE2(A, B) #define LIBXSMM_FSYMBOL(SYMBOL) LIBXSMM_CONCATENATE(SYMBOL, _) #define LIBXSMM_UNIQUE(NAME) LIBXSMM_CONCATENATE(NAME, __LINE__) #define LIBXSMM_EXPAND(...) __VA_ARGS__ #define LIBXSMM_ELIDE(...) /** * Check given value against type-range (assertion). 
* Note: allows "-1" for unsigned types. */ #if !defined(NDEBUG) # define LIBXSMM_CHECK_ULLONG(VALUE) assert(-1 <= (VALUE) && (VALUE) <= ULLONG_MAX) # define LIBXSMM_CHECK_LLONG(VALUE) assert(ULLONG_MIN <= (VALUE) && (VALUE) <= LLONG_MAX) # define LIBXSMM_CHECK_ULONG(VALUE) assert(-1 <= (VALUE) && (VALUE) <= ULONG_MAX) # define LIBXSMM_CHECK_LONG(VALUE) assert(LONG_MIN <= (VALUE) && (VALUE) <= LONG_MAX) # define LIBXSMM_CHECK_USHORT(VALUE) assert(-1 <= (VALUE) && (VALUE) <= USHRT_MAX) # define LIBXSMM_CHECK_SHORT(VALUE) assert(SHRT_MIN <= (VALUE) && (VALUE) <= SHRT_MAX) # define LIBXSMM_CHECK_UCHAR(VALUE) assert(-1 <= (VALUE) && (VALUE) <= UCHAR_MAX) # define LIBXSMM_CHECK_ICHAR(VALUE) assert(SCHAR_MIN <= (VALUE) && (VALUE) <= SCHAR_MAX) # define LIBXSMM_CHECK_UINT(VALUE) assert(-1 <= (VALUE) && (VALUE) <= UINT_MAX) # define LIBXSMM_CHECK_INT(VALUE) assert(INT_MIN <= (VALUE) && (VALUE) <= INT_MAX) #else # define LIBXSMM_CHECK_ULLONG(VALUE) 0/*dummy*/ # define LIBXSMM_CHECK_LLONG(VALUE) 0/*dummy*/ # define LIBXSMM_CHECK_ULONG(VALUE) 0/*dummy*/ # define LIBXSMM_CHECK_LONG(VALUE) 0/*dummy*/ # define LIBXSMM_CHECK_USHORT(VALUE) 0/*dummy*/ # define LIBXSMM_CHECK_SHORT(VALUE) 0/*dummy*/ # define LIBXSMM_CHECK_UCHAR(VALUE) 0/*dummy*/ # define LIBXSMM_CHECK_ICHAR(VALUE) 0/*dummy*/ # define LIBXSMM_CHECK_UINT(VALUE) 0/*dummy*/ # define LIBXSMM_CHECK_INT(VALUE) 0/*dummy*/ #endif /** * Perform verbose type-cast with following two advantages: * (1) Make it easy to locate/find the type-cast. * (2) Range-check to ensure fitting into type. 
*/ #define LIBXSMM_CAST_ULLONG(VALUE) (LIBXSMM_CHECK_ULLONG(VALUE), (unsigned long long)(VALUE)) #define LIBXSMM_CAST_LLONG(VALUE) (LIBXSMM_CHECK_LLONG(VALUE), (/*signed*/long long)(VALUE)) #define LIBXSMM_CAST_ULONG(VALUE) (LIBXSMM_CHECK_ULONG(VALUE), (unsigned long)(VALUE)) #define LIBXSMM_CAST_LONG(VALUE) (LIBXSMM_CHECK_LONG(VALUE), (/*signed*/long)(VALUE)) #define LIBXSMM_CAST_USHORT(VALUE) (LIBXSMM_CHECK_USHORT(VALUE), (unsigned short)(VALUE)) #define LIBXSMM_CAST_SHORT(VALUE) (LIBXSMM_CHECK_SHORT(VALUE), (/*signed*/short)(VALUE)) #define LIBXSMM_CAST_UCHAR(VALUE) (LIBXSMM_CHECK_UCHAR(VALUE), (unsigned char)(VALUE)) #define LIBXSMM_CAST_ICHAR(VALUE) (LIBXSMM_CHECK_ICHAR(VALUE), (signed char)(VALUE)) #define LIBXSMM_CAST_UINT(VALUE) (LIBXSMM_CHECK_UINT(VALUE), (unsigned int)(VALUE)) #define LIBXSMM_CAST_INT(VALUE) (LIBXSMM_CHECK_INT(VALUE), (/*signed*/int)(VALUE)) /** Use LIBXSMM_VERSION2 instead of LIBXSMM_VERSION3, e.g., if __GNUC_PATCHLEVEL__ or __clang_patchlevel__ is zero (0). */ #define LIBXSMM_VERSION2(MAJOR, MINOR) ((MAJOR) * 10000 + (MINOR) * 100) #define LIBXSMM_VERSION3(MAJOR, MINOR, UPDATE) (LIBXSMM_VERSION2(MAJOR, MINOR) + (UPDATE)) #define LIBXSMM_VERSION4(MAJOR, MINOR, UPDATE, PATCH) \ (((0x7F & (MAJOR)) << 24) | ((0x1F & (MINOR)) << 19) | ((0x1F & (UPDATE)) << 14) | (0x3FFF & (PATCH))) #define LIBXSMM_VERSION41(VERSION) (((VERSION) >> 24)) #define LIBXSMM_VERSION42(VERSION) (((VERSION) >> 19) & 0x1F) #define LIBXSMM_VERSION43(VERSION) (((VERSION) >> 14) & 0x1F) #define LIBXSMM_VERSION44(VERSION) (((VERSION)) & 0x3FFF) #if !defined(LIBXSMM_UNPACKED) && (defined(_CRAYC) || defined(LIBXSMM_OFFLOAD_BUILD) || \ (0 == LIBXSMM_SYNC)/*Windows: missing pack(pop) error*/) # define LIBXSMM_UNPACKED #endif #if defined(_WIN32) && !defined(__GNUC__) && !defined(__clang__) # define LIBXSMM_ATTRIBUTE(A) __declspec(A) # if defined(__cplusplus) # define LIBXSMM_INLINE_ALWAYS __forceinline # else # define LIBXSMM_INLINE_ALWAYS static __forceinline # endif # define 
LIBXSMM_ALIGNED(DECL, N) LIBXSMM_ATTRIBUTE(align(N)) DECL # if !defined(LIBXSMM_UNPACKED) # define LIBXSMM_PACKED(TYPE) LIBXSMM_PRAGMA(pack(1)) TYPE # endif # define LIBXSMM_CDECL __cdecl #elif (defined(__GNUC__) || defined(__clang__) || defined(__PGI)) # define LIBXSMM_ATTRIBUTE(A) __attribute__((A)) # define LIBXSMM_INLINE_ALWAYS LIBXSMM_ATTRIBUTE(always_inline) LIBXSMM_INLINE # define LIBXSMM_ALIGNED(DECL, N) LIBXSMM_ATTRIBUTE(aligned(N)) DECL # if !defined(LIBXSMM_UNPACKED) # define LIBXSMM_PACKED(TYPE) TYPE LIBXSMM_ATTRIBUTE(__packed__) # endif # define LIBXSMM_CDECL LIBXSMM_ATTRIBUTE(cdecl) #else # define LIBXSMM_ATTRIBUTE(A) # define LIBXSMM_INLINE_ALWAYS LIBXSMM_INLINE # define LIBXSMM_ALIGNED(DECL, N) DECL # define LIBXSMM_CDECL #endif #if !defined(LIBXSMM_PACKED) # define LIBXSMM_PACKED(TYPE) TYPE # if !defined(LIBXSMM_UNPACKED) # define LIBXSMM_UNPACKED # endif #endif #if !defined(LIBXSMM_UNPACKED) && 0 /* no braces around EXPR */ # define LIBXSMM_PAD(EXPR) EXPR; #endif #if !defined(LIBXSMM_PAD) # define LIBXSMM_PAD(EXPR) #endif #if defined(__INTEL_COMPILER) # if !defined(__INTEL_COMPILER_UPDATE) # define LIBXSMM_INTEL_COMPILER __INTEL_COMPILER # else # define LIBXSMM_INTEL_COMPILER (__INTEL_COMPILER + __INTEL_COMPILER_UPDATE) # endif #elif defined(__INTEL_COMPILER_BUILD_DATE) # define LIBXSMM_INTEL_COMPILER ((__INTEL_COMPILER_BUILD_DATE / 10000 - 2000) * 100) #endif /* LIBXSMM_ATTRIBUTE_USED: mark library functions as used to avoid warning */ #if defined(__GNUC__) || defined(__clang__) || (defined(__INTEL_COMPILER) && !defined(_WIN32)) # if !defined(__cplusplus) || !defined(__clang__) # define LIBXSMM_ATTRIBUTE_COMMON LIBXSMM_ATTRIBUTE(common) # else # define LIBXSMM_ATTRIBUTE_COMMON # endif # define LIBXSMM_ATTRIBUTE_MALLOC LIBXSMM_ATTRIBUTE(malloc) # define LIBXSMM_ATTRIBUTE_UNUSED LIBXSMM_ATTRIBUTE(unused) # define LIBXSMM_ATTRIBUTE_USED LIBXSMM_ATTRIBUTE(used) #else # if defined(_WIN32) # define LIBXSMM_ATTRIBUTE_COMMON LIBXSMM_ATTRIBUTE(selectany) 
# else # define LIBXSMM_ATTRIBUTE_COMMON # endif # define LIBXSMM_ATTRIBUTE_MALLOC # define LIBXSMM_ATTRIBUTE_UNUSED # define LIBXSMM_ATTRIBUTE_USED #endif #if !defined(__INTEL_COMPILER) && (defined(__clang__) || defined(__PGLLVM__)) # define LIBXSMM_ATTRIBUTE_NO_SANITIZE(KIND) LIBXSMM_ATTRIBUTE(no_sanitize(LIBXSMM_STRINGIFY(KIND))) #elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 8) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) \ && !defined(__INTEL_COMPILER) # define LIBXSMM_ATTRIBUTE_NO_SANITIZE(KIND) LIBXSMM_ATTRIBUTE(LIBXSMM_CONCATENATE(no_sanitize_, KIND)) #else # define LIBXSMM_ATTRIBUTE_NO_SANITIZE(KIND) #endif #if defined(__cplusplus) # define LIBXSMM_VARIADIC ... # define LIBXSMM_EXTERN extern "C" # define LIBXSMM_EXTERN_C extern "C" # define LIBXSMM_INLINE_KEYWORD inline # define LIBXSMM_INLINE LIBXSMM_INLINE_KEYWORD # if defined(__GNUC__) || defined(_CRAYC) # define LIBXSMM_CALLER __PRETTY_FUNCTION__ # elif defined(_MSC_VER) # define LIBXSMM_CALLER __FUNCDNAME__ # define LIBXSMM_FUNCNAME __FUNCTION__ # else # define LIBXSMM_CALLER __FUNCNAME__ # endif #else /* C */ # define LIBXSMM_VARIADIC # define LIBXSMM_EXTERN extern # define LIBXSMM_EXTERN_C # if defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__) /*C99*/ # define LIBXSMM_PRAGMA(DIRECTIVE) _Pragma(LIBXSMM_STRINGIFY(DIRECTIVE)) # define LIBXSMM_CALLER __func__ # define LIBXSMM_RESTRICT restrict # define LIBXSMM_INLINE_KEYWORD inline # elif defined(_MSC_VER) # define LIBXSMM_CALLER __FUNCDNAME__ # define LIBXSMM_FUNCNAME __FUNCTION__ # define LIBXSMM_INLINE_KEYWORD __inline # define LIBXSMM_INLINE_FIXUP # elif defined(__GNUC__) && !defined(__STRICT_ANSI__) # define LIBXSMM_CALLER __PRETTY_FUNCTION__ # endif # if !defined(LIBXSMM_INLINE_KEYWORD) # define LIBXSMM_INLINE_KEYWORD # define LIBXSMM_INLINE_FIXUP # endif /* LIBXSMM_ATTRIBUTE_USED: increases compile-time of header-only by a large factor */ # define LIBXSMM_INLINE static LIBXSMM_INLINE_KEYWORD LIBXSMM_ATTRIBUTE_UNUSED #endif 
/*__cplusplus*/ #if !defined(LIBXSMM_CALLER) # define LIBXSMM_CALLER NULL #endif #if !defined(LIBXSMM_FUNCNAME) # define LIBXSMM_FUNCNAME LIBXSMM_CALLER #endif #if !defined(LIBXSMM_CALLER_ID) # if defined(__GNUC__) || 1 # define LIBXSMM_CALLER_ID ((const void*)((uintptr_t)libxsmm_hash_string(LIBXSMM_CALLER))) # else /* assume no string-pooling (perhaps unsafe) */ # define LIBXSMM_CALLER_ID LIBXSMM_CALLER # endif #endif #if defined(LIBXSMM_OFFLOAD_BUILD) && \ defined(__INTEL_OFFLOAD) && (!defined(_WIN32) || (1400 <= LIBXSMM_INTEL_COMPILER)) # define LIBXSMM_OFFLOAD(A) LIBXSMM_ATTRIBUTE(target(A)) # define LIBXSMM_NO_OFFLOAD(RTYPE, FN, ...) ((RTYPE (*)(LIBXSMM_VARIADIC))(FN))(__VA_ARGS__) # if !defined(LIBXSMM_OFFLOAD_TARGET) # define LIBXSMM_OFFLOAD_TARGET mic # endif #else # define LIBXSMM_OFFLOAD(A) # define LIBXSMM_NO_OFFLOAD(RTYPE, FN, ...) (FN)(__VA_ARGS__) #endif #define LIBXSMM_RETARGETABLE LIBXSMM_OFFLOAD(LIBXSMM_OFFLOAD_TARGET) #if !defined(__STATIC) && !defined(_WINDLL) && (defined(_WIN32) || defined(__CYGWIN__) || defined(__MINGW32__)) # define __STATIC #endif /* may include Clang and other compatible compilers */ #if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MINGW32__) # define LIBXSMM_VISIBILITY_INTERNAL LIBXSMM_ATTRIBUTE(visibility("internal")) # define LIBXSMM_VISIBILITY_HIDDEN LIBXSMM_ATTRIBUTE(visibility("hidden")) # define LIBXSMM_VISIBILITY_PUBLIC LIBXSMM_ATTRIBUTE(visibility("default")) #endif #if !defined(LIBXSMM_VISIBILITY_INTERNAL) # define LIBXSMM_VISIBILITY_INTERNAL #endif #if !defined(LIBXSMM_VISIBILITY_HIDDEN) # define LIBXSMM_VISIBILITY_HIDDEN #endif #if !defined(LIBXSMM_VISIBILITY_PUBLIC) # define LIBXSMM_VISIBILITY_PUBLIC #endif #if !defined(LIBXSMM_VISIBILITY_PRIVATE) # define LIBXSMM_VISIBILITY_PRIVATE LIBXSMM_VISIBILITY_HIDDEN #endif /* Windows Dynamic Link Library (DLL) */ #if !defined(__STATIC) && (defined(_WIN32) || defined(__CYGWIN__) || defined(__MINGW32__)) # define LIBXSMM_VISIBILITY_EXPORT 
LIBXSMM_ATTRIBUTE(dllexport) # define LIBXSMM_VISIBILITY_IMPORT LIBXSMM_ATTRIBUTE(dllimport) #endif #if !defined(LIBXSMM_VISIBILITY_EXPORT) # define LIBXSMM_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_PUBLIC #endif #if !defined(LIBXSMM_VISIBILITY_IMPORT) # define LIBXSMM_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_PUBLIC #endif #if defined(LIBXSMM_SOURCE_H) /* header-only mode */ # define LIBXSMM_API_VISIBILITY_EXPORT # define LIBXSMM_API_VISIBILITY_IMPORT # define LIBXSMM_API_VISIBILITY_INTERN # define LIBXSMM_API_COMMON LIBXSMM_RETARGETABLE LIBXSMM_ATTRIBUTE_COMMON # define LIBXSMM_API_TARGET LIBXSMM_API_INLINE # define LIBXSMM_API_EXTERN LIBXSMM_EXTERN_C #else /* classic ABI */ # if defined(LIBXSMM_BUILD_EXT) # define LIBXSMM_API_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_IMPORT # define LIBXSMM_API_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_EXPORT # define LIBXSMM_API_VISIBILITY_INTERN LIBXSMM_VISIBILITY_PRIVATE # elif defined(LIBXSMM_BUILD) # define LIBXSMM_API_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_EXPORT # define LIBXSMM_API_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_IMPORT # define LIBXSMM_API_VISIBILITY_INTERN LIBXSMM_VISIBILITY_PRIVATE # else /* import */ # define LIBXSMM_API_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_IMPORT # define LIBXSMM_API_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_IMPORT # define LIBXSMM_API_VISIBILITY_INTERN # endif # define LIBXSMM_API_COMMON LIBXSMM_RETARGETABLE # define LIBXSMM_API_TARGET LIBXSMM_RETARGETABLE # define LIBXSMM_API_EXTERN LIBXSMM_EXTERN #endif #define LIBXSMM_API_VISIBILITY(VISIBILITY) LIBXSMM_CONCATENATE(LIBXSMM_API_VISIBILITY_, VISIBILITY) #define LIBXSMM_APIVAR(DECL, VISIBILITY, EXTERN) EXTERN LIBXSMM_API_COMMON LIBXSMM_API_VISIBILITY(VISIBILITY) DECL #define LIBXSMM_API_INLINE LIBXSMM_INLINE LIBXSMM_RETARGETABLE #define LIBXSMM_API_DEF #if (!defined(__INTEL_COMPILER) || !defined(_WIN32)) #define LIBXSMM_APIVAR_ALIGNED(DECL, VISIBILITY) LIBXSMM_ALIGNED(LIBXSMM_APIVAR(DECL, VISIBILITY, LIBXSMM_API_DEF), LIBXSMM_CONFIG_CACHELINE) #else #define 
LIBXSMM_APIVAR_ALIGNED(DECL, VISIBILITY) LIBXSMM_APIVAR(DECL, VISIBILITY, LIBXSMM_API_DEF) #endif /** Public variable declaration (without definition) located in header file. */ #define LIBXSMM_APIVAR_PUBLIC(DECL) LIBXSMM_APIVAR(DECL, EXPORT, LIBXSMM_API_EXTERN) /** Public variable definition (complements declaration) located in source file. */ #define LIBXSMM_APIVAR_PUBLIC_DEF(DECL) LIBXSMM_APIVAR_ALIGNED(DECL, EXPORT) /** Private variable declaration (without definition) located in header file. */ #define LIBXSMM_APIVAR_PRIVATE(DECL) LIBXSMM_APIVAR(DECL, INTERN, LIBXSMM_API_EXTERN) /** Private variable definition (complements declaration) located in source file. */ #define LIBXSMM_APIVAR_PRIVATE_DEF(DECL) LIBXSMM_APIVAR_ALIGNED(DECL, INTERN) /** Private variable (declaration and definition) located in source file. */ #define LIBXSMM_APIVAR_DEFINE(DECL) LIBXSMM_APIVAR_PRIVATE(DECL); LIBXSMM_APIVAR_PRIVATE_DEF(DECL) /** Function decoration used for private functions. */ #define LIBXSMM_API_INTERN LIBXSMM_API_EXTERN LIBXSMM_API_TARGET LIBXSMM_API_VISIBILITY(INTERN) /** Function decoration used for public functions of LIBXSMMext library. */ #define LIBXSMM_APIEXT LIBXSMM_API_EXTERN LIBXSMM_API_TARGET LIBXSMM_API_VISIBILITY(IMPORT) /** Function decoration used for public functions of LIBXSMM library. 
*/ #define LIBXSMM_API LIBXSMM_API_EXTERN LIBXSMM_API_TARGET LIBXSMM_API_VISIBILITY(EXPORT) #if !defined(LIBXSMM_RESTRICT) # if ((defined(__GNUC__) && !defined(__CYGWIN32__)) || defined(LIBXSMM_INTEL_COMPILER)) && !defined(_WIN32) # define LIBXSMM_RESTRICT __restrict__ # elif defined(_MSC_VER) || defined(LIBXSMM_INTEL_COMPILER) # define LIBXSMM_RESTRICT __restrict # else # define LIBXSMM_RESTRICT # endif #endif /*LIBXSMM_RESTRICT*/ #if !defined(LIBXSMM_PRAGMA) # if defined(LIBXSMM_INTEL_COMPILER) || defined(_MSC_VER) # define LIBXSMM_PRAGMA(DIRECTIVE) __pragma(LIBXSMM_EXPAND(DIRECTIVE)) # else # define LIBXSMM_PRAGMA(DIRECTIVE) # endif #endif /*LIBXSMM_PRAGMA*/ #if !defined(LIBXSMM_OPENMP_SIMD) # if defined(LIBXSMM_INTEL_COMPILER) && (1500 <= LIBXSMM_INTEL_COMPILER) # define LIBXSMM_OPENMP_SIMD # elif defined(_OPENMP) && (201307/*v4.0*/ <= _OPENMP) # define LIBXSMM_OPENMP_SIMD # endif #endif #if !defined(LIBXSMM_INTEL_COMPILER) || (LIBXSMM_INTEL_COMPILER < 9900) # if defined(LIBXSMM_OPENMP_SIMD) # define LIBXSMM_PRAGMA_SIMD_REDUCTION(EXPRESSION) LIBXSMM_PRAGMA(omp simd reduction(EXPRESSION)) # define LIBXSMM_PRAGMA_SIMD_COLLAPSE(N) LIBXSMM_PRAGMA(omp simd collapse(N)) # define LIBXSMM_PRAGMA_SIMD_PRIVATE(...) LIBXSMM_PRAGMA(omp simd private(__VA_ARGS__)) # define LIBXSMM_PRAGMA_SIMD LIBXSMM_PRAGMA(omp simd) # elif defined(__INTEL_COMPILER) # define LIBXSMM_PRAGMA_SIMD_REDUCTION(EXPRESSION) LIBXSMM_PRAGMA(simd reduction(EXPRESSION)) # define LIBXSMM_PRAGMA_SIMD_COLLAPSE(N) LIBXSMM_PRAGMA(simd collapse(N)) # define LIBXSMM_PRAGMA_SIMD_PRIVATE(...) LIBXSMM_PRAGMA(simd private(__VA_ARGS__)) # define LIBXSMM_PRAGMA_SIMD LIBXSMM_PRAGMA(simd) # endif #endif #if !defined(LIBXSMM_PRAGMA_SIMD) # define LIBXSMM_PRAGMA_SIMD_REDUCTION(EXPRESSION) # define LIBXSMM_PRAGMA_SIMD_COLLAPSE(N) # define LIBXSMM_PRAGMA_SIMD_PRIVATE(...) # define LIBXSMM_PRAGMA_SIMD #endif #if defined(__INTEL_COMPILER) # define LIBXSMM_PRAGMA_NONTEMPORAL(...) 
LIBXSMM_PRAGMA(vector nontemporal(__VA_ARGS__)) # define LIBXSMM_PRAGMA_VALIGNED LIBXSMM_PRAGMA(vector aligned) # define LIBXSMM_PRAGMA_NOVECTOR LIBXSMM_PRAGMA(novector) # define LIBXSMM_PRAGMA_FORCEINLINE LIBXSMM_PRAGMA(forceinline) # define LIBXSMM_PRAGMA_LOOP_COUNT(MIN, MAX, AVG) LIBXSMM_PRAGMA(loop_count min=MIN max=MAX avg=AVG) # define LIBXSMM_PRAGMA_UNROLL_AND_JAM(N) LIBXSMM_PRAGMA(unroll_and_jam(N)) # define LIBXSMM_PRAGMA_UNROLL_N(N) LIBXSMM_PRAGMA(unroll(N)) # define LIBXSMM_PRAGMA_UNROLL LIBXSMM_PRAGMA(unroll) # define LIBXSMM_PRAGMA_VALIGNED_VAR(A) LIBXSMM_ASSUME_ALIGNED(A, LIBXSMM_ALIGNMENT); /*# define LIBXSMM_UNUSED(VARIABLE) LIBXSMM_PRAGMA(unused(VARIABLE))*/ #else # if defined(LIBXSMM_OPENMP_SIMD) && (201811/*v5.0*/ <= _OPENMP) # define LIBXSMM_PRAGMA_NONTEMPORAL(...) LIBXSMM_PRAGMA(omp simd nontemporal(__VA_ARGS__)) # else # define LIBXSMM_PRAGMA_NONTEMPORAL(...) # endif # if defined(__clang__) # define LIBXSMM_PRAGMA_VALIGNED_VAR(A) # define LIBXSMM_PRAGMA_VALIGNED # define LIBXSMM_PRAGMA_NOVECTOR LIBXSMM_PRAGMA(clang loop vectorize(disable)) # define LIBXSMM_PRAGMA_FORCEINLINE # define LIBXSMM_PRAGMA_LOOP_COUNT(MIN, MAX, AVG) LIBXSMM_PRAGMA(unroll(AVG)) # define LIBXSMM_PRAGMA_UNROLL_AND_JAM(N) LIBXSMM_PRAGMA(unroll(N)) # define LIBXSMM_PRAGMA_UNROLL_N(N) LIBXSMM_PRAGMA(unroll(N)) # define LIBXSMM_PRAGMA_UNROLL LIBXSMM_PRAGMA_UNROLL_N(4) # else # define LIBXSMM_PRAGMA_VALIGNED_VAR(A) # define LIBXSMM_PRAGMA_VALIGNED # define LIBXSMM_PRAGMA_NOVECTOR # define LIBXSMM_PRAGMA_FORCEINLINE # define LIBXSMM_PRAGMA_LOOP_COUNT(MIN, MAX, AVG) # define LIBXSMM_PRAGMA_UNROLL_AND_JAM(N) # define LIBXSMM_PRAGMA_UNROLL # endif #endif #if !defined(LIBXSMM_PRAGMA_UNROLL_N) # if defined(__GNUC__) && (LIBXSMM_VERSION2(8, 3) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) # define LIBXSMM_PRAGMA_UNROLL_N(N) LIBXSMM_PRAGMA(GCC unroll N) # else # define LIBXSMM_PRAGMA_UNROLL_N(N) # endif #endif #if defined(LIBXSMM_INTEL_COMPILER) # define LIBXSMM_PRAGMA_OPTIMIZE_OFF 
LIBXSMM_PRAGMA(optimize("", off)) # define LIBXSMM_PRAGMA_OPTIMIZE_ON LIBXSMM_PRAGMA(optimize("", on)) #elif defined(__clang__) # define LIBXSMM_PRAGMA_OPTIMIZE_OFF LIBXSMM_PRAGMA(clang optimize off) # define LIBXSMM_PRAGMA_OPTIMIZE_ON LIBXSMM_PRAGMA(clang optimize on) #elif defined(__GNUC__) # define LIBXSMM_PRAGMA_OPTIMIZE_OFF LIBXSMM_PRAGMA(GCC push_options) LIBXSMM_PRAGMA(GCC optimize("O0")) # define LIBXSMM_PRAGMA_OPTIMIZE_ON LIBXSMM_PRAGMA(GCC pop_options) #else # define LIBXSMM_PRAGMA_OPTIMIZE_OFF # define LIBXSMM_PRAGMA_OPTIMIZE_ON #endif #if defined(_OPENMP) && (200805/*v3.0*/ <= _OPENMP) \ && defined(NDEBUG) /* CCE complains for debug builds */ # define LIBXSMM_OPENMP_COLLAPSE(N) collapse(N) #else # define LIBXSMM_OPENMP_COLLAPSE(N) #endif /** LIBXSMM_UP2POT rounds up to the next power of two (POT). */ #define LIBXSMM_UP2POT_01(N) ((N) | ((N) >> 1)) #define LIBXSMM_UP2POT_02(N) (LIBXSMM_UP2POT_01(N) | (LIBXSMM_UP2POT_01(N) >> 2)) #define LIBXSMM_UP2POT_04(N) (LIBXSMM_UP2POT_02(N) | (LIBXSMM_UP2POT_02(N) >> 4)) #define LIBXSMM_UP2POT_08(N) (LIBXSMM_UP2POT_04(N) | (LIBXSMM_UP2POT_04(N) >> 8)) #define LIBXSMM_UP2POT_16(N) (LIBXSMM_UP2POT_08(N) | (LIBXSMM_UP2POT_08(N) >> 16)) #define LIBXSMM_UP2POT_32(N) (LIBXSMM_UP2POT_16(N) | (LIBXSMM_UP2POT_16(N) >> 32)) #define LIBXSMM_UP2POT(N) (LIBXSMM_UP2POT_32((unsigned long long)(N) - LIBXSMM_MIN(1, N)) + LIBXSMM_MIN(1, N)) #define LIBXSMM_LO2POT(N) (LIBXSMM_UP2POT_32((unsigned long long)(N) >> 1) + LIBXSMM_MIN(1, N)) #define LIBXSMM_UPDIV(N, MULT) (((N) + ((MULT) - 1)) / (MULT)) #define LIBXSMM_UP(N, MULT) (LIBXSMM_UPDIV(N, MULT) * (MULT)) #define LIBXSMM_UP2(N, NPOT) (((N) + ((NPOT) - 1)) & ~((NPOT) - 1)) #define LIBXSMM_ABS(A) (0 <= (A) ? (A) : -(A)) #define LIBXSMM_MIN(A, B) ((A) < (B) ? (A) : (B)) #define LIBXSMM_MAX(A, B) ((A) < (B) ? (B) : (A)) #define LIBXSMM_MOD(A, N) ((A) % (N)) #define LIBXSMM_MOD2(A, NPOT) ((A) & ((NPOT) - 1)) #define LIBXSMM_DELTA(T0, T1) ((T0) < (T1) ? 
((T1) - (T0)) : ((T0) - (T1))) #define LIBXSMM_CLMP(VALUE, LO, HI) ((LO) < (VALUE) ? ((VALUE) <= (HI) ? (VALUE) : LIBXSMM_MIN(VALUE, HI)) : LIBXSMM_MAX(LO, VALUE)) #define LIBXSMM_SIZEOF(START, LAST) (((const char*)(LAST)) - ((const char*)(START)) + sizeof(*LAST)) #define LIBXSMM_FEQ(A, B) ((A) == (B)) #define LIBXSMM_NEQ(A, B) ((A) != (B)) #define LIBXSMM_ISPOT(A) (0 != (A) && !((A) & ((A) - 1))) #define LIBXSMM_ISWAP(A, B) (((A) ^= (B)), ((B) ^= (A)), ((A) ^= (B))) #define LIBXSMM_ISNAN(A) LIBXSMM_NEQ(A, A) #define LIBXSMM_NOTNAN(A) LIBXSMM_FEQ(A, A) #define LIBXSMM_ROUNDX(TYPE, A) ((TYPE)((long long)(0 <= (A) ? ((double)(A) + 0.5) : ((double)(A) - 0.5)))) #define LIBXSMM_CONST_VOID_PTR(A) *((const void**)&(A)) /** Makes some functions available independent of C99 support. */ #if defined(__STDC_VERSION__) && (199901L/*C99*/ <= __STDC_VERSION__) # if defined(__PGI) # define LIBXSMM_POWF(A, B) ((float)pow((float)(A), (float)(B))) # else # define LIBXSMM_POWF(A, B) powf(A, B) # endif # define LIBXSMM_FREXPF(A, B) frexpf(A, B) # define LIBXSMM_ROUNDF(A) roundf(A) # define LIBXSMM_ROUND(A) round(A) # define LIBXSMM_TANHF(A) tanhf(A) # define LIBXSMM_SQRTF(A) sqrtf(A) # define LIBXSMM_EXP2F(A) exp2f(A) # define LIBXSMM_LOG2F(A) log2f(A) # define LIBXSMM_ERFF(A) erff(A) # define LIBXSMM_EXP2(A) exp2(A) # define LIBXSMM_LOG2(A) log2(A) # define LIBXSMM_EXPF(A) expf(A) # define LIBXSMM_LOGF(A) logf(A) #else # define LIBXSMM_POWF(A, B) ((float)pow((float)(A), (float)(B))) # define LIBXSMM_FREXPF(A, B) ((float)frexp((float)(A), B)) # define LIBXSMM_ROUNDF(A) LIBXSMM_ROUNDX(float, A) # define LIBXSMM_ROUND(A) LIBXSMM_ROUNDX(double, A) # define LIBXSMM_TANHF(A) ((float)tanh((float)(A))) # define LIBXSMM_SQRTF(A) ((float)sqrt((float)(A))) # define LIBXSMM_EXP2F(A) LIBXSMM_POWF(2, A) # define LIBXSMM_LOG2F(A) ((float)LIBXSMM_LOG2((float)(A))) # define LIBXSMM_ERFF(A) ((float)erf((float)(A))) # define LIBXSMM_EXP2(A) pow(2.0, A) # define LIBXSMM_LOG2(A) (log(A) * (1.0 / 
(M_LN2))) # define LIBXSMM_EXPF(A) ((float)exp((float)(A))) # define LIBXSMM_LOGF(A) ((float)log((float)(A))) #endif #if defined(LIBXSMM_INTEL_COMPILER) # if (1700 <= LIBXSMM_INTEL_COMPILER) # define LIBXSMM_ASSUME(EXPRESSION) __assume(EXPRESSION) # else # define LIBXSMM_ASSUME(EXPRESSION) assert(EXPRESSION) # endif #elif defined(_MSC_VER) # define LIBXSMM_ASSUME(EXPRESSION) __assume(EXPRESSION) #elif defined(__GNUC__) && !defined(_CRAYC) && (LIBXSMM_VERSION2(4, 5) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) # define LIBXSMM_ASSUME(EXPRESSION) do { if (!(EXPRESSION)) __builtin_unreachable(); } while(0) #else # define LIBXSMM_ASSUME(EXPRESSION) assert(EXPRESSION) #endif #if defined(__INTEL_COMPILER) # define LIBXSMM_ASSUME_ALIGNED(A, N) __assume_aligned(A, N) #else # define LIBXSMM_ASSUME_ALIGNED(A, N) assert(0 == ((uintptr_t)(A)) % (N)) #endif #define LIBXSMM_ALIGN(POINTER, ALIGNMENT/*POT*/) ((POINTER) + (LIBXSMM_UP2((uintptr_t)(POINTER), ALIGNMENT) - ((uintptr_t)(POINTER))) / sizeof(*(POINTER))) #define LIBXSMM_FOLD2(POINTER, ALIGNMENT, NPOT) LIBXSMM_MOD2(((uintptr_t)(POINTER) / (ALIGNMENT)), NPOT) #if defined(_MSC_VER) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) /* account for incorrect handling of __VA_ARGS__ */ # define LIBXSMM_SELECT_ELEMENT(INDEX1/*one-based*/, .../*elements*/) LIBXSMM_CONCATENATE(LIBXSMM_SELECT_ELEMENT_, INDEX1)LIBXSMM_EXPAND((__VA_ARGS__)) #else # define LIBXSMM_SELECT_ELEMENT(INDEX1/*one-based*/, .../*elements*/) LIBXSMM_CONCATENATE(LIBXSMM_SELECT_ELEMENT_, INDEX1)(__VA_ARGS__) #endif #define LIBXSMM_SELECT_ELEMENT_1(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E0 #define LIBXSMM_SELECT_ELEMENT_2(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E1 #define LIBXSMM_SELECT_ELEMENT_3(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E2 #define LIBXSMM_SELECT_ELEMENT_4(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E3 #define LIBXSMM_SELECT_ELEMENT_5(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E4 #define LIBXSMM_SELECT_ELEMENT_6(E0, E1, E2, E3, E4, E5, E6, E7, 
E8, E9) E5 #define LIBXSMM_SELECT_ELEMENT_7(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E6 #define LIBXSMM_SELECT_ELEMENT_8(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E7 #define LIBXSMM_SELECT_ELEMENT_9(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E8 #define LIBXSMM_SELECT_ELEMENT_10(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E9 #define LIBXSMM_SELECT_HEAD_AUX(A, ...) (A) #define LIBXSMM_SELECT_HEAD(...) LIBXSMM_EXPAND(LIBXSMM_SELECT_HEAD_AUX(__VA_ARGS__, 0/*dummy*/)) #define LIBXSMM_SELECT_TAIL(A, ...) __VA_ARGS__ /** * For VLAs, check EXACTLY for C99 since a C11-conforming compiler may not provide VLAs. * However, some compilers (Intel) may signal support for VLA even with strict ANSI (C89). * To ultimately disable VLA-support, define LIBXSMM_NO_VLA (make VLA=0). * VLA-support is signaled by LIBXSMM_VLA. */ #if !defined(LIBXSMM_VLA) && !defined(LIBXSMM_NO_VLA) && !defined(__PGI) && ( \ (defined(__STDC_VERSION__) && (199901L/*C99*/ == __STDC_VERSION__ || (!defined(__STDC_NO_VLA__) && 199901L/*C99*/ < __STDC_VERSION__))) || \ (defined(__GNUC__) && LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) && !defined(__STRICT_ANSI__) && !defined(__cplusplus)) || \ (defined(LIBXSMM_INTEL_COMPILER) && !defined(_WIN32) && !defined(__cplusplus)) || \ (defined(__INTEL_COMPILER) && !defined(_WIN32))) # define LIBXSMM_VLA #endif /** * LIBXSMM_INDEX1 calculates the linear address for a given set of (multiple) indexes/bounds. * Syntax: LIBXSMM_INDEX1(, , ..., , , ..., ). * Please note that the leading dimension (s0) is omitted in the above syntax! * TODO: support leading dimension (pitch/stride). */ #if defined(_MSC_VER) && !defined(__clang__) /* account for incorrect handling of __VA_ARGS__ */ # define LIBXSMM_INDEX1(NDIMS, ...) LIBXSMM_CONCATENATE(LIBXSMM_INDEX1_, NDIMS)LIBXSMM_EXPAND((__VA_ARGS__)) #else # define LIBXSMM_INDEX1(NDIMS, ...) LIBXSMM_CONCATENATE(LIBXSMM_INDEX1_, NDIMS)(__VA_ARGS__) #endif #define LIBXSMM_INDEX1_1(...) 
((size_t)LIBXSMM_SELECT_HEAD(__VA_ARGS__)) #define LIBXSMM_INDEX1_2(I0, I1, S1) (LIBXSMM_INDEX1_1(I0) * ((size_t)S1) + (size_t)I1) #define LIBXSMM_INDEX1_3(I0, I1, I2, S1, S2) (LIBXSMM_INDEX1_2(I0, I1, S1) * ((size_t)S2) + (size_t)I2) #define LIBXSMM_INDEX1_4(I0, I1, I2, I3, S1, S2, S3) (LIBXSMM_INDEX1_3(I0, I1, I2, S1, S2) * ((size_t)S3) + (size_t)I3) #define LIBXSMM_INDEX1_5(I0, I1, I2, I3, I4, S1, S2, S3, S4) (LIBXSMM_INDEX1_4(I0, I1, I2, I3, S1, S2, S3) * ((size_t)S4) + (size_t)I4) #define LIBXSMM_INDEX1_6(I0, I1, I2, I3, I4, I5, S1, S2, S3, S4, S5) (LIBXSMM_INDEX1_5(I0, I1, I2, I3, I4, S1, S2, S3, S4) * ((size_t)S5) + (size_t)I5) #define LIBXSMM_INDEX1_7(I0, I1, I2, I3, I4, I5, I6, S1, S2, S3, S4, S5, S6) (LIBXSMM_INDEX1_6(I0, I1, I2, I3, I4, I5, S1, S2, S3, S4, S5) * ((size_t)S6) + (size_t)I6) #define LIBXSMM_INDEX1_8(I0, I1, I2, I3, I4, I5, I6, I7, S1, S2, S3, S4, S5, S6, S7) (LIBXSMM_INDEX1_7(I0, I1, I2, I3, I4, I5, I6, S1, S2, S3, S4, S5, S6) * ((size_t)S7) + (size_t)I7) #define LIBXSMM_INDEX1_9(I0, I1, I2, I3, I4, I5, I6, I7, I8, S1, S2, S3, S4, S5, S6, S7, S8) (LIBXSMM_INDEX1_8(I0, I1, I2, I3, I4, I5, I6, I7, S1, S2, S3, S4, S5, S6, S7) * ((size_t)S8) + (size_t)I8) #define LIBXSMM_INDEX1_10(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, S1, S2, S3, S4, S5, S6, S7, S8, S9) (LIBXSMM_INDEX1_9(I0, I1, I2, I3, I4, I5, I6, I7, I8, S1, S2, S3, S4, S5, S6, S7, S8) * ((size_t)S9) + (size_t)I9) /** * LIBXSMM_VLA_DECL declares an array according to the given set of (multiple) bounds. * Syntax: LIBXSMM_VLA_DECL(, , , , , ..., ). * The element type can be "const" or otherwise qualified; initial value must be (const)element-type*. * Please note that the syntax is similar to LIBXSMM_INDEX1, and the leading dimension (s0) is omitted! * * LIBXSMM_VLA_ACCESS gives the array element according to the given set of (multiple) indexes/bounds. * Syntax: LIBXSMM_VLA_ACCESS(, , , ..., , , ..., ). 
* Please note that the syntax is similar to LIBXSMM_INDEX1, and the leading dimension (s0) is omitted! */ #if !defined(LIBXSMM_VLA_POSTFIX) # define LIBXSMM_VLA_POSTFIX _ #endif #if defined(LIBXSMM_VLA) LIBXSMM_API_INLINE int libxsmm_nonconst_int(int i) { return i; } # define LIBXSMM_VLA_ACCESS(NDIMS, ARRAY, ...) LIBXSMM_VLA_ACCESS_ND(NDIMS, LIBXSMM_CONCATENATE(ARRAY, LIBXSMM_VLA_POSTFIX), LIBXSMM_VLA_ACCESS_SINK, __VA_ARGS__) # define LIBXSMM_VLA_ACCESS_SINK(S) + 0 * (S) # define LIBXSMM_VLA_ACCESS_NONCONST(I) libxsmm_nonconst_int(I) # define LIBXSMM_VLA_ACCESS_ND(NDIMS, ARRAY, XY, ...) LIBXSMM_CONCATENATE3(LIBXSMM_VLA_ACCESS_, NDIMS, D)(ARRAY, XY, __VA_ARGS__) # define LIBXSMM_VLA_ACCESS_0D(ARRAY, XY, ...) (ARRAY)/*scalar*/ # define LIBXSMM_VLA_ACCESS_1D(ARRAY, XY, ...) ((ARRAY)[LIBXSMM_VLA_ACCESS_NONCONST(LIBXSMM_SELECT_HEAD(__VA_ARGS__))]) # define LIBXSMM_VLA_ACCESS_2D(ARRAY, XY, I0, I1, ...) (((ARRAY) XY(__VA_ARGS__))[I0][LIBXSMM_VLA_ACCESS_NONCONST(I1)]) # define LIBXSMM_VLA_ACCESS_3D(ARRAY, XY, I0, I1, I2, S1, ...) (((ARRAY) XY(S1) XY(__VA_ARGS__))[I0][I1][LIBXSMM_VLA_ACCESS_NONCONST(I2)]) # define LIBXSMM_VLA_ACCESS_4D(ARRAY, XY, I0, I1, I2, I3, S1, S2, ...) (((ARRAY) XY(S1) XY(S2) XY(__VA_ARGS__))[I0][I1][I2][LIBXSMM_VLA_ACCESS_NONCONST(I3)]) # define LIBXSMM_VLA_ACCESS_5D(ARRAY, XY, I0, I1, I2, I3, I4, S1, S2, S3, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(__VA_ARGS__))[I0][I1][I2][I3][LIBXSMM_VLA_ACCESS_NONCONST(I4)]) # define LIBXSMM_VLA_ACCESS_6D(ARRAY, XY, I0, I1, I2, I3, I4, I5, S1, S2, S3, S4, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][LIBXSMM_VLA_ACCESS_NONCONST(I5)]) # define LIBXSMM_VLA_ACCESS_7D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, S1, S2, S3, S4, S5, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][LIBXSMM_VLA_ACCESS_NONCONST(I6)]) # define LIBXSMM_VLA_ACCESS_8D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, I7, S1, S2, S3, S4, S5, S6, ...) 
(((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(S6) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][I6][LIBXSMM_VLA_ACCESS_NONCONST(I7)]) # define LIBXSMM_VLA_ACCESS_9D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, I7, I8, S1, S2, S3, S4, S5, S6, S7, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(S6) XY(S7) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][I6][I7][LIBXSMM_VLA_ACCESS_NONCONST(I8)]) # define LIBXSMM_VLA_ACCESS_10D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, S1, S2, S3, S4, S5, S6, S7, S8, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(S6) XY(S7) XY(S8) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][I6][I7][I8][LIBXSMM_VLA_ACCESS_NONCONST(I9)]) # define LIBXSMM_VLA_DECL(NDIMS, ELEMENT_TYPE, ARRAY_VAR, .../*initial value, and bounds*/) \ ELEMENT_TYPE LIBXSMM_VLA_ACCESS_ND(LIBXSMM_SELECT_ELEMENT(NDIMS, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), *LIBXSMM_RESTRICT LIBXSMM_CONCATENATE(ARRAY_VAR, LIBXSMM_VLA_POSTFIX), \ LIBXSMM_ELIDE, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*bounds*/, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*dummy*/) = \ (ELEMENT_TYPE LIBXSMM_VLA_ACCESS_ND(LIBXSMM_SELECT_ELEMENT(NDIMS, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), *, \ LIBXSMM_ELIDE, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*bounds*/, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*dummy*/))LIBXSMM_SELECT_HEAD(__VA_ARGS__) #else /* calculate linear index */ # define LIBXSMM_VLA_ACCESS(NDIMS, ARRAY, ...) LIBXSMM_CONCATENATE(ARRAY, LIBXSMM_VLA_POSTFIX)[LIBXSMM_INDEX1(NDIMS, __VA_ARGS__)] # define LIBXSMM_VLA_DECL(NDIMS, ELEMENT_TYPE, ARRAY_VAR, .../*initial value, and bounds*/) \ ELEMENT_TYPE *LIBXSMM_RESTRICT LIBXSMM_CONCATENATE(ARRAY_VAR, LIBXSMM_VLA_POSTFIX) = /*(ELEMENT_TYPE*)*/LIBXSMM_SELECT_HEAD(__VA_ARGS__) \ + 0 * LIBXSMM_INDEX1(NDIMS, LIBXSMM_SELECT_TAIL(__VA_ARGS__, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0))) /* dummy-shift to "sink" unused arguments */ #endif /** Access an array of TYPE with Byte-measured stride. 
*/ #define LIBXSMM_ACCESS(TYPE, ARRAY, STRIDE) (*(TYPE*)((char*)(ARRAY) + (STRIDE))) #if !defined(LIBXSMM_UNUSED) # if 0 # define LIBXSMM_UNUSED(VARIABLE) LIBXSMM_PRAGMA(unused(VARIABLE)) # else # define LIBXSMM_UNUSED(VARIABLE) (void)(VARIABLE) # endif #endif #if !defined(NDEBUG) # define LIBXSMM_UNUSED_DEBUG(VARIABLE) LIBXSMM_UNUSED(VARIABLE) #else # define LIBXSMM_UNUSED_DEBUG(VARIABLE) #endif #if defined(_OPENMP) # define LIBXSMM_PRAGMA_OMP(...) LIBXSMM_PRAGMA(omp __VA_ARGS__) # if defined(_MSC_VER) && !defined(__INTEL_COMPILER) # define LIBXSMM_OMP_VAR(A) LIBXSMM_UNUSED(A) /* suppress warning about "unused" variable */ # elif defined(__clang__) # define LIBXSMM_OMP_VAR(A) (A) = 0 # else # define LIBXSMM_OMP_VAR(A) # endif #else # define LIBXSMM_PRAGMA_OMP(...) # define LIBXSMM_OMP_VAR(A) #endif #if defined(LIBXSMM_BUILD) && (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) && !defined(__MINGW32__) # define LIBXSMM_ATTRIBUTE_WEAK_IMPORT LIBXSMM_ATTRIBUTE(weak_import) # define LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE(weak) #else # define LIBXSMM_ATTRIBUTE_WEAK # define LIBXSMM_ATTRIBUTE_WEAK_IMPORT #endif #if !defined(LIBXSMM_NO_CTOR) && !defined(LIBXSMM_CTOR) && \ (defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__)) && \ (defined(LIBXSMM_BUILD) && !defined(__STATIC)) && \ (defined(__GNUC__) || defined(__clang__)) # define LIBXSMM_ATTRIBUTE_CTOR LIBXSMM_ATTRIBUTE(constructor) # define LIBXSMM_ATTRIBUTE_DTOR LIBXSMM_ATTRIBUTE(destructor) # define LIBXSMM_CTOR #else # define LIBXSMM_ATTRIBUTE_CTOR # define LIBXSMM_ATTRIBUTE_DTOR #endif #if defined(__GNUC__) && !defined(__PGI) && !defined(__ibmxl__) # define LIBXSMM_ATTRIBUTE_NO_TRACE LIBXSMM_ATTRIBUTE(no_instrument_function) #else # define LIBXSMM_ATTRIBUTE_NO_TRACE #endif #if defined(__GNUC__) # define LIBXSMM_MAY_ALIAS LIBXSMM_ATTRIBUTE(__may_alias__) #else # define LIBXSMM_MAY_ALIAS #endif #if !defined(LIBXSMM_MKTEMP_PATTERN) # define LIBXSMM_MKTEMP_PATTERN "XXXXXX" #endif /** Below 
group is to fix-up some platform/compiler specifics. */ #if defined(_WIN32) # if !defined(_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES) # define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1 # endif # if !defined(_CRT_SECURE_NO_DEPRECATE) # define _CRT_SECURE_NO_DEPRECATE 1 # endif # if !defined(_USE_MATH_DEFINES) # define _USE_MATH_DEFINES 1 # endif # if !defined(WIN32_LEAN_AND_MEAN) # define WIN32_LEAN_AND_MEAN 1 # endif # if !defined(NOMINMAX) # define NOMINMAX 1 # endif # if defined(__INTEL_COMPILER) && (190023506 <= _MSC_FULL_VER) # define __builtin_huge_val() HUGE_VAL # define __builtin_huge_valf() HUGE_VALF # define __builtin_nan nan # define __builtin_nanf nanf # define __builtin_nans nan # define __builtin_nansf nanf # if defined(__cplusplus) # define _CMATH_ # endif # endif #endif #if !defined(_GNU_SOURCE) && defined(LIBXSMM_BUILD) # define _GNU_SOURCE #endif #if !defined(__STDC_FORMAT_MACROS) # define __STDC_FORMAT_MACROS #endif #if defined(__clang__) && !defined(__extern_always_inline) # define __extern_always_inline LIBXSMM_INLINE #endif #if defined(LIBXSMM_INLINE_FIXUP) && !defined(inline) # define inline LIBXSMM_INLINE_KEYWORD #endif #if (0 != LIBXSMM_SYNC) # if !defined(_REENTRANT) # define _REENTRANT # endif # if defined(__PGI) # if defined(__GCC_ATOMIC_TEST_AND_SET_TRUEVAL) # undef __GCC_ATOMIC_TEST_AND_SET_TRUEVAL # endif # define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1 # endif #endif #if !defined(__has_feature) && !defined(__clang__) # define __has_feature(A) 0 #endif #if !defined(__has_builtin) && !defined(__clang__) # define __has_builtin(A) 0 #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if (0 != LIBXSMM_SYNC) # if defined(_WIN32) || defined(__CYGWIN__) # include # else # include # endif #endif #if !defined(LIBXSMM_ASSERT) # include # if defined(NDEBUG) # define LIBXSMM_ASSERT(EXPR) LIBXSMM_ASSUME(EXPR) # else # define LIBXSMM_ASSERT(EXPR) assert(EXPR) # endif #endif #if 
!defined(LIBXSMM_ASSERT_MSG) # define LIBXSMM_ASSERT_MSG(EXPR, MSG) assert((EXPR) && (0 != *(MSG))) #endif #if !defined(LIBXSMM_EXPECT_ELIDE) # define LIBXSMM_EXPECT_ELIDE(RESULT, EXPR) do { \ /*const*/ int libxsmm_expect_result_ = ((RESULT) == (EXPR)); \ LIBXSMM_UNUSED(libxsmm_expect_result_); \ } while(0) #endif #if defined(NDEBUG) # define LIBXSMM_EXPECT LIBXSMM_EXPECT_ELIDE # define LIBXSMM_EXPECT_NOT LIBXSMM_EXPECT_ELIDE #else # define LIBXSMM_EXPECT(RESULT, EXPR) LIBXSMM_ASSERT((RESULT) == (EXPR)) # define LIBXSMM_EXPECT_NOT(RESULT, EXPR) LIBXSMM_ASSERT((RESULT) != (EXPR)) #endif #if defined(_DEBUG) # define LIBXSMM_EXPECT_DEBUG LIBXSMM_EXPECT #else # define LIBXSMM_EXPECT_DEBUG LIBXSMM_EXPECT_ELIDE #endif #if defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP) # include #endif #include #include #include #include #include #include #include #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if !defined(FLT_MAX) # if !defined(__FLT_MAX__) # define FLT_MAX 3.40282346638528859811704183484516925e+38F # else # define FLT_MAX __FLT_MAX__ # endif #endif #if !defined(FLT_MIN) # if !defined(__FLT_MIN__) # define FLT_MIN 1.17549435082228750796873653722224568e-38F # else # define FLT_MIN __FLT_MIN__ # endif #endif #if defined(_WIN32) && 0 # define LIBXSMM_SNPRINTF(S, N, ...) _snprintf_s(S, N, _TRUNCATE, __VA_ARGS__) #elif defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__ || defined(__GNUC__)) # define LIBXSMM_SNPRINTF(S, N, ...) snprintf(S, N, __VA_ARGS__) #else # define LIBXSMM_SNPRINTF(S, N, ...) 
sprintf((S) + /*unused*/(N) * 0, __VA_ARGS__) #endif #if defined(__THROW) && defined(__cplusplus) # define LIBXSMM_THROW __THROW #endif #if !defined(LIBXSMM_THROW) # define LIBXSMM_THROW #endif #if defined(__GNUC__) && LIBXSMM_VERSION2(4, 2) == LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) && \ !defined(__clang__) && !defined(__PGI) && !defined(__INTEL_COMPILER) && !defined(_CRAYC) # define LIBXSMM_NOTHROW LIBXSMM_THROW #else # define LIBXSMM_NOTHROW #endif #if defined(__cplusplus) # if (__cplusplus > 199711L) # define LIBXSMM_NOEXCEPT noexcept # else # define LIBXSMM_NOEXCEPT throw() # endif #else # define LIBXSMM_NOEXCEPT LIBXSMM_NOTHROW #endif #if defined(_WIN32) # define LIBXSMM_PUTENV(A) _putenv(A) #else # define LIBXSMM_PUTENV(A) putenv(A) #endif /* block must be after including above header files */ #if (defined(__GLIBC__) && defined(__GLIBC_MINOR__) && LIBXSMM_VERSION2(__GLIBC__, __GLIBC_MINOR__) < LIBXSMM_VERSION2(2, 26)) \ || (defined(LIBXSMM_INTEL_COMPILER) && (1802 >= LIBXSMM_INTEL_COMPILER) && !defined(__cplusplus) && defined(__linux__)) /* _Float128 was introduced with GNU GCC 7.0. 
*/ # if !defined(_Float128) && !defined(__SIZEOF_FLOAT128__) && defined(__GNUC__) && !defined(__cplusplus) && defined(__linux__) # define _Float128 __float128 # endif # if !defined(LIBXSMM_GLIBC_FPTYPES) && defined(__GNUC__) && !defined(__cplusplus) && defined(__linux__) \ && (LIBXSMM_VERSION2(7, 0) > LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) || \ (defined(LIBXSMM_INTEL_COMPILER) && (1802 >= LIBXSMM_INTEL_COMPILER))) # define LIBXSMM_GLIBC_FPTYPES # endif # if !defined(_Float128X) && defined(LIBXSMM_GLIBC_FPTYPES) # define _Float128X _Float128 # endif # if !defined(_Float32) && defined(LIBXSMM_GLIBC_FPTYPES) # define _Float32 float # endif # if !defined(_Float32x) && defined(LIBXSMM_GLIBC_FPTYPES) # define _Float32x _Float32 # endif # if !defined(_Float64) && defined(LIBXSMM_GLIBC_FPTYPES) # define _Float64 double # endif # if !defined(_Float64x) && defined(LIBXSMM_GLIBC_FPTYPES) # define _Float64x _Float64 # endif #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if defined(LIBXSMM_GLIBC_FPTYPES) # if defined(__cplusplus) # undef __USE_MISC # if !defined(_DEFAULT_SOURCE) # define _DEFAULT_SOURCE # endif # if !defined(_BSD_SOURCE) # define _BSD_SOURCE # endif # else # if !defined(__PURE_INTEL_C99_HEADERS__) # define __PURE_INTEL_C99_HEADERS__ # endif # endif #endif #if !defined(LIBXSMM_NO_LIBM) # if (defined(LIBXSMM_INTEL_COMPILER) && (1800 <= LIBXSMM_INTEL_COMPILER)) \ && !defined(_WIN32) /* error including dfp754.h */ # include # endif # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #endif /*LIBXSMM_MACROS_H*/ libxsmm-1.17/include/libxsmm_malloc.h000066400000000000000000000373001415223013700177140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. 
* * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_MALLOC_H #define LIBXSMM_MALLOC_H #include "libxsmm_memory.h" /* include tensorflow/core/public/version.h prior to LIBXSMM otherwise the current TensorFlow API is assumed */ #if !defined(LIBXSMM_TF12) && (!defined(TF_VERSION_STRING) || \ LIBXSMM_VERSION2(1, 12) <= LIBXSMM_VERSION2(TF_MAJOR_VERSION, TF_MINOR_VERSION)) # define LIBXSMM_TF12 /* TF_PATCH_VERSION does not matter */ #endif /** Can be used with libxsmm_[get|set]_scratch_limit. */ #define LIBXSMM_SCRATCH_UNLIMITED ((size_t)LIBXSMM_UNLIMITED) #define LIBXSMM_SCRATCH_DEFAULT 0 /** Function types accepted for memory allocation (see libxsmm_*_allocator). */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void* (*libxsmm_malloc_ctx)(size_t /*size*/, const void* /*context*/); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void* (*libxsmm_malloc_fun)(size_t /*size*/); LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_malloc_function { libxsmm_malloc_ctx ctx_form; libxsmm_malloc_fun function; } libxsmm_malloc_function; /** Function types accepted for releasing memory (see libxsmm_*_allocator). */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_free_ctx)(void* /*buffer*/, const void* /*context*/); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_free_fun)(void* /*buffer*/); LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_free_function { libxsmm_free_ctx ctx_form; libxsmm_free_fun function; } libxsmm_free_function; /** * To setup the custom default memory allocator, either a malloc_fn and a free_fn * are given, or two NULL-pointers designate to reset the default allocator to a * library-internal default. 
If a context is given (non-NULL), the context-based * form of the memory allocation is used. * Changing the allocator including the function for deallocation applies to * upcoming allocation/deallocation and works correctly for pending buffers. */ LIBXSMM_API int libxsmm_set_default_allocator(/* malloc_fn/free_fn must correspond */ const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn); /** Retrieve the default memory allocator. */ LIBXSMM_API int libxsmm_get_default_allocator(const void** context, libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn); /** * To setup the scratch memory allocator, a malloc_fn function and an optional free_fn * are given. A NULL-free acts as a "no-operation", and the deallocation is expected * to be controlled otherwise. If two NULL-pointers are given, the allocator is reset * to the currently active default memory allocator. If a context is given (non-NULL), * the context-based form of the memory allocation is used. * Changing the allocator including the function for deallocation applies to * upcoming allocation/deallocation and works correctly for pending buffers. */ LIBXSMM_API int libxsmm_set_scratch_allocator(/* malloc_fn/free_fn must correspond */ const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn); /** Retrieve the scratch memory allocator. */ LIBXSMM_API int libxsmm_get_scratch_allocator(const void** context, libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn); /** Allocate memory (malloc/free interface). */ LIBXSMM_API LIBXSMM_ATTRIBUTE_MALLOC void* libxsmm_malloc(size_t size); /** Allocate aligned memory using the default allocator. */ LIBXSMM_API LIBXSMM_ATTRIBUTE_MALLOC void* libxsmm_aligned_malloc(size_t size, /** * =0: align automatically according to the size * 0<: align according to the alignment value */ size_t alignment); /** Reallocate memory using the default allocator (alignment is preserved). 
*/ LIBXSMM_API void* libxsmm_realloc(size_t size, void* ptr); /** * Allocate aligned scratch memory. It is not supported * to query properties per libxsmm_get_malloc_info, but * libxsmm_get_scratch_info can used instead. */ LIBXSMM_API void* libxsmm_scratch_malloc(size_t size, /** * =0: align automatically according to the size * 0<: align according to the alignment value */ size_t alignment, /** * Identifies the call site, which is used * to determine the memory pool. */ const void* caller); /** * Binary form of libxsmm_scratch_malloc, which * expands the call-context automatically. This * macro is intentionally lower case. */ #define libxsmm_aligned_scratch(size, alignment) \ libxsmm_scratch_malloc(size, alignment, \ LIBXSMM_CALLER_ID) /** Deallocate memory (malloc/free interface). */ LIBXSMM_API void libxsmm_free(const void* memory); /** * Release the entire scratch memory regardless * of whether it is still referenced or not. */ LIBXSMM_API void libxsmm_release_scratch(void); /** Information about a buffer (default memory domain). */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_malloc_info { /** Size of the buffer. */ size_t size; } libxsmm_malloc_info; /** Retrieve information about a buffer (default memory domain). */ LIBXSMM_API int libxsmm_get_malloc_info(const void* memory, libxsmm_malloc_info* info); /** Information about the scratch memory domain. */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_scratch_info { /** Watermark memory across pools (size), unsatisfied (local), and library-internal memory. */ size_t size, local, internal; /** Pending allocations (not released). */ size_t npending; /** Number of allocations so far. */ size_t nmallocs; /** Number of pools used. */ unsigned int npools; } libxsmm_scratch_info; /** Retrieve information about the scratch memory domain. */ LIBXSMM_API int libxsmm_get_scratch_info(libxsmm_scratch_info* info); /** * Limit the total size (Bytes) of the scratch memory. 
* LIBXSMM_SCRATCH_UNLIMITED removes any limit, and * LIBXSMM_SCRATCH_DEFAULT populates the default. * The related environment variable LIBXSMM_SCRATCH_LIMIT * allows units: /b/B (Bytes), k/K, m/M, and g/G. */ LIBXSMM_API void libxsmm_set_scratch_limit(size_t nbytes); /** Get the maximum size of the scratch memory domain. */ LIBXSMM_API size_t libxsmm_get_scratch_limit(void); /** * Intercepts malloc/free to use scratch memory allocator. * (related environment variable LIBXSMM_MALLOC). * Optionally set the range of malloc-sizes to be intercepted. * The related environment variable LIBXSMM_MALLOC_LIMIT * allows units: /b/B (Bytes), k/K, m/M, and g/G. */ LIBXSMM_API void libxsmm_set_malloc(int enabled, const size_t* lo, const size_t* hi); /** * Determines if malloc/free are (and can be) intercepted. * Optionally gets the range of enabled malloc-sizes. */ LIBXSMM_API int libxsmm_get_malloc(size_t* lo, size_t* hi); /** * Calculate the linear offset of the n-dimensional (ndims) offset (can be NULL), * and the (optional) linear size of the corresponding shape. */ LIBXSMM_API size_t libxsmm_offset(const size_t offset[], const size_t shape[], size_t ndims, size_t* size); #if defined(__cplusplus) /** RAII idiom to temporarily setup an allocator for the lifetime of the scope. */ template class LIBXSMM_RETARGETABLE libxsmm_scoped_allocator { public: /** C'tor, which instantiates the new allocator (plain form). */ libxsmm_scoped_allocator(libxsmm_malloc_fun malloc_fn, libxsmm_free_fun free_fn) { kind::get(m_context, m_malloc, m_free); kind::set(NULL/*context*/, NULL/*malloc_ctx*/, NULL/*free_ctx*/, malloc_fn, free_fn); } /** C'tor, which instantiates the new allocator (context form). 
*/ libxsmm_scoped_allocator(const void* context, libxsmm_malloc_ctx malloc_ctx, libxsmm_free_ctx free_ctx, libxsmm_malloc_fun malloc_fun = NULL, libxsmm_free_fun free_fun = NULL) { kind::get(m_context, m_malloc, m_free); kind::set(context, malloc_ctx, free_ctx, malloc_fun, free_fun); } /** Following the RAII idiom, the d'tor restores the previous allocator. */ ~libxsmm_scoped_allocator() { kind::set(m_context, m_malloc.ctx_form, m_free.ctx_form, m_malloc.function, m_free.function); } private: /* no copy/assignment */ explicit libxsmm_scoped_allocator(const libxsmm_scoped_allocator&); libxsmm_scoped_allocator& operator=(const libxsmm_scoped_allocator&); protected: /* saved/previous allocator */ const void* m_context; libxsmm_malloc_function m_malloc; libxsmm_free_function m_free; }; /** Allocator-kind to instantiate libxsmm_scoped_allocator. */ struct LIBXSMM_RETARGETABLE libxsmm_default_allocator { static void set(const void* context, libxsmm_malloc_ctx malloc_ctx, libxsmm_free_ctx free_ctx, libxsmm_malloc_fun malloc_fun, libxsmm_free_fun free_fun) { libxsmm_malloc_function malloc_fn; libxsmm_free_function free_fn; if (NULL == context) { /* use global form only when no context is given */ malloc_fn.function = malloc_fun; free_fn.function = free_fun; } else { malloc_fn.ctx_form = malloc_ctx; free_fn.ctx_form = free_ctx; } libxsmm_set_default_allocator(context, malloc_fn, free_fn); } static void get(const void*& context, libxsmm_malloc_function& malloc_fn, libxsmm_free_function& free_fn) { libxsmm_get_default_allocator(&context, &malloc_fn, &free_fn); } }; /** Allocator-kind to instantiate libxsmm_scoped_allocator. 
*/ struct LIBXSMM_RETARGETABLE libxsmm_scratch_allocator { static void set(const void* context, libxsmm_malloc_ctx malloc_ctx, libxsmm_free_ctx free_ctx, libxsmm_malloc_fun malloc_fun, libxsmm_free_fun free_fun) { libxsmm_malloc_function malloc_fn; libxsmm_free_function free_fn; if (NULL != context) { /* adopt context form */ malloc_fn.function = malloc_fun; free_fn.function = free_fun; } else { /* adopt global form */ malloc_fn.ctx_form = malloc_ctx; free_fn.ctx_form = free_ctx; } libxsmm_set_scratch_allocator(context, malloc_fn, free_fn); } static void get(const void*& context, libxsmm_malloc_function& malloc_fn, libxsmm_free_function& free_fn) { libxsmm_get_scratch_allocator(&context, &malloc_fn, &free_fn); } }; /** Forward-declared types/functions used to implement libxsmm_tf_allocator. */ namespace tensorflow { class Allocator; #if defined(LIBXSMM_TF12) class DeviceBase; int DeviceNumaNode(const DeviceBase* /*device*/); Allocator* cpu_allocator(int /*numa_node*/); #else Allocator* cpu_allocator(); #endif } /** * An object of this type adopts a memory allocator from TensorFlow. * All memory allocations of the requested kind within the current * scope (where the libxsmm_tf_allocator object lives) are subject * to TensorFlow's memory allocation scheme. The allocation kind * is usually "libxsmm_scratch_allocator"; using a second object * of kind "libxsmm_default_allocator" makes the default memory * allocation of LIBXSMM subject to TensorFlow as well. */ template class LIBXSMM_RETARGETABLE libxsmm_tf_allocator: public libxsmm_scoped_allocator { public: /** The TensorFlow allocator is adopted from the global CPU memory allocator. */ explicit libxsmm_tf_allocator() : libxsmm_scoped_allocator( libxsmm_tf_allocator::malloc, libxsmm_tf_allocator::free) {} /** The TensorFlow allocator is adopted from the given OpKernelContext. 
*/ template explicit libxsmm_tf_allocator(context_type& context) : libxsmm_scoped_allocator(&context, libxsmm_tf_allocator::template malloc_ctx, libxsmm_tf_allocator::template free_ctx, libxsmm_tf_allocator::malloc, libxsmm_tf_allocator::free) {} /** Global form of allocating memory (malloc signature). */ static void* malloc(size_t size) { #if defined(LIBXSMM_TF12) return libxsmm_tf_allocator::allocate(tensorflow::cpu_allocator(-1/*kNUMANoAffinity*/), size); #else return libxsmm_tf_allocator::allocate(tensorflow::cpu_allocator(), size); #endif } /** Global form of deallocating memory (free signature). */ static void free(void* buffer) { #if defined(LIBXSMM_TF12) libxsmm_tf_allocator::deallocate(tensorflow::cpu_allocator(-1/*kNUMANoAffinity*/), buffer); #else libxsmm_tf_allocator::deallocate(tensorflow::cpu_allocator(), buffer); #endif } /** Context based form of allocating memory. */ template static void* malloc_ctx(const void* context, size_t size) { typedef typename context_type::WrappedAllocator::first_type allocator_ptr; context_type *const tf_context = static_cast(context); allocator_ptr allocator = NULL; if (NULL != tf_context) { #if !defined(LIBXSMM_TF12) if (NULL != tf_context->device()) { if (0 < tf_context->num_outputs()) { allocator = tf_context->device()->GetStepAllocator( tf_context->output_alloc_attr(0), tf_context->resource_manager()); } else if (0 < tf_context->num_inputs()) { allocator = tf_context->device()->GetStepAllocator( tf_context->input_alloc_attr(0), tf_context->resource_manager()); } } #else /* include tensorflow/core/public/version.h prior to LIBXSMM otherwise the current TensorFlow API is assumed */ const int numa_node = DeviceNumaNode(tf_context->device()); allocator = tensorflow::cpu_allocator(numa_node); #endif } return libxsmm_tf_allocator::allocate(allocator, size); } /** Context based form of deallocating memory. 
*/ template static void free_ctx(const void* context, void* buffer) { typedef typename context_type::WrappedAllocator::first_type allocator_ptr; context_type *const tf_context = static_cast(context); allocator_ptr allocator = NULL; if (NULL != tf_context) { #if defined(LIBXSMM_TF12) const int numa_node = DeviceNumaNode(tf_context->device()); allocator = tensorflow::cpu_allocator(numa_node); #else if (NULL != tf_context->device()) { if (0 < tf_context->num_outputs()) { allocator = tf_context->device()->GetStepAllocator( tf_context->output_alloc_attr(0), tf_context->resource_manager()); } else if (0 < tf_context->num_inputs()) { allocator = tf_context->device()->GetStepAllocator( tf_context->input_alloc_attr(0), tf_context->resource_manager()); } } #endif } libxsmm_tf_allocator::deallocate(allocator, buffer); } private: template /* break interface dependency with TF */ static void* allocate(allocator_ptr allocator, size_t size) { void* result; if (NULL != allocator) { /* no (useless) waste with alignment; raw result is re-aligned anyways */ result = allocator->AllocateRaw(1/*alignment*/, size); } else { LIBXSMM_ASSERT_MSG(0/*false*/, "LIBXSMM ERROR: memory allocator is missing"); result = NULL; } return result; } template /* break interface dependency with TF */ static void deallocate(allocator_ptr allocator, void* buffer) { LIBXSMM_ASSERT_MSG(NULL != allocator, "LIBXSMM ERROR: memory allocator is missing"); if (NULL != allocator) allocator->DeallocateRaw(buffer); } }; #endif /*defined(__cplusplus)*/ #endif /*LIBXSMM_MALLOC_H*/ libxsmm-1.17/include/libxsmm_math.h000066400000000000000000000142651415223013700174030ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_MATH_H #define LIBXSMM_MATH_H #include "libxsmm_typedefs.h" /** * Structure of differences with matrix norms according * to http://www.netlib.org/lapack/lug/node75.html). */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_matdiff_info { /** One-norm */ double norm1_abs, norm1_rel; /** Infinity-norm */ double normi_abs, normi_rel; /** Froebenius-norm */ double normf_rel; /** Maximum difference, and L2-norm (both absolute and relative). */ double linf_abs, linf_rel, l2_abs, l2_rel; /** Statistics: sum/l1, min., max., arith. avg., and variance. */ double l1_ref, min_ref, max_ref, avg_ref, var_ref; /** Statistics: sum/l1, min., max., arith. avg., and variance. */ double l1_tst, min_tst, max_tst, avg_tst, var_tst; /** Location (m, n) of largest difference (linf_abs). */ libxsmm_blasint m, n; } libxsmm_matdiff_info; /** * Utility function to calculate a collection of scalar differences between two matrices (libxsmm_matdiff_info). * The location (m, n) of the largest difference (linf_abs) is recorded (also in case of NaN). In case of NaN, * differences are set to infinity. If no difference is discovered, the location (m, n) is negative (OOB). */ LIBXSMM_API int libxsmm_matdiff(libxsmm_matdiff_info* info, libxsmm_datatype datatype, libxsmm_blasint m, libxsmm_blasint n, const void* ref, const void* tst, const libxsmm_blasint* ldref, const libxsmm_blasint* ldtst); /** * Reduces input into output such that the difference is maintained or increased (max function). * The very first (initial) output should be zeroed (libxsmm_matdiff_clear). 
*/ LIBXSMM_API void libxsmm_matdiff_reduce(libxsmm_matdiff_info* output, const libxsmm_matdiff_info* input); /** Clears the given info-structure, e.g., for the initial reduction-value (libxsmm_matdiff_reduce). */ LIBXSMM_API void libxsmm_matdiff_clear(libxsmm_matdiff_info* info); /** Greatest common divisor (corner case: the GCD of 0 and 0 is 1). */ LIBXSMM_API size_t libxsmm_gcd(size_t a, size_t b); /** Least common multiple. */ LIBXSMM_API size_t libxsmm_lcm(size_t a, size_t b); /** * This function finds prime-factors (up to 32) of an unsigned integer in ascending order, and * returns the number of factors found (zero if the given number is prime and unequal to two). */ LIBXSMM_API int libxsmm_primes_u32(unsigned int num, unsigned int num_factors_n32[]); /** Calculate co-prime number <= n/2 (except: libxsmm_shuffle(0|1) == 0). */ LIBXSMM_API size_t libxsmm_shuffle(unsigned int n); /** * Divides the product into prime factors and selects factors such that the new product is within * the given limit (0/1-Knapsack problem), e.g., product=12=2*2*3 and limit=6 then result=2*3=6. * The limit is at least reached or exceeded with the minimal possible product (is_lower=true). */ LIBXSMM_API unsigned int libxsmm_product_limit(unsigned int product, unsigned int limit, int is_lower); /** SQRT with Newton's method using integer arithmetic. */ LIBXSMM_API unsigned int libxsmm_isqrt_u64(unsigned long long x); /** SQRT with Newton's method using integer arithmetic. */ LIBXSMM_API unsigned int libxsmm_isqrt_u32(unsigned int x); /** Based on libxsmm_isqrt_u32, but actual factor of x. */ LIBXSMM_API unsigned int libxsmm_isqrt2_u32(unsigned int x); /** SQRT with Newton's method using double-precision. */ LIBXSMM_API double libxsmm_dsqrt(double x); /** SQRT with Newton's method using single-precision. */ LIBXSMM_API float libxsmm_ssqrt(float x); /** CBRT with Newton's method using integer arithmetic. 
*/ LIBXSMM_API unsigned int libxsmm_icbrt_u64(unsigned long long x); /** CBRT with Newton's method using integer arithmetic. */ LIBXSMM_API unsigned int libxsmm_icbrt_u32(unsigned int x); /** Single-precision approximation of exponential function (base 2). */ LIBXSMM_API float libxsmm_sexp2(float x); /** * Exponential function (base 2), which is limited to unsigned 8-bit input values. * This function reproduces bit-accurate results (single-precision). */ LIBXSMM_API float libxsmm_sexp2_u8(unsigned char x); /** * Exponential function (base 2), which is limited to signed 8-bit input values. * This function reproduces bit-accurate results (single-precision). */ LIBXSMM_API float libxsmm_sexp2_i8(signed char x); /** Similar to libxsmm_sexp2_i8, but takes an integer as signed 8-bit value (check). */ LIBXSMM_API float libxsmm_sexp2_i8i(int x); /** Inlineable fast tanh, such that a the compiler can potentially vectorize. */ LIBXSMM_API_INLINE float libxsmm_stanh_pade78( float i_x ) { const float l_c0 = 2027025.0f; const float l_c1 = 270270.0f; const float l_c2 = 6930.0f; const float l_c3 = 36.0f; const float l_c1_d = 945945.0f; const float l_c2_d = 51975.0f; const float l_c3_d = 630.0f; const float l_hi_bound = 4.97f; const float l_lo_bound = -4.97f; const float l_ones = 1.0f; const float l_neg_ones = -1.0f; const float x2 = i_x * i_x; const float t1_nom = (l_c3 * x2) + l_c2; const float t2_nom = (t1_nom * x2) + l_c1; const float t3_nom = (t2_nom * x2) + l_c0; const float nom = t3_nom * i_x; const float t1_denom = x2 + l_c3_d; const float t2_denom = (t1_denom * x2) + l_c2_d; const float t3_denom = (t2_denom * x2) + l_c1_d; const float denom = (t3_denom * x2) + l_c0; float result = nom/denom ; result = ( result > l_hi_bound ) ? l_ones : result; result = ( result < l_lo_bound ) ? 
l_neg_ones : result; return result; } #endif /*LIBXSMM_MATH_H*/ libxsmm-1.17/include/libxsmm_memory.h000066400000000000000000000067411415223013700177620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_MEMORY_H #define LIBXSMM_MEMORY_H #include "libxsmm_macros.h" #if defined(__clang_analyzer__) # define LIBXSMM_MEMSET127(PTRDST, VALUE, SIZE) memset((void*)(PTRDST), VALUE, SIZE) #else # define LIBXSMM_MEMSET127(PTRDST, VALUE, SIZE) { \ char *const libxsmm_memset127_dst_ = (char*)(PTRDST); \ union { size_t size; signed char size1; } libxsmm_memset127_; \ signed char libxsmm_memset127_i_; LIBXSMM_ASSERT((SIZE) <= 127); \ libxsmm_memset127_.size = (SIZE); \ LIBXSMM_PRAGMA_UNROLL \ for (libxsmm_memset127_i_ = 0; libxsmm_memset127_i_ < libxsmm_memset127_.size1; \ ++libxsmm_memset127_i_) \ { \ libxsmm_memset127_dst_[libxsmm_memset127_i_] = (char)(VALUE); \ } \ } #endif #define LIBXSMM_MEMZERO127(PTRDST) LIBXSMM_MEMSET127(PTRDST, '\0', sizeof(*(PTRDST))) #define LIBXSMM_MEMCPY127_LOOP(PTRDST, PTRSRC, SIZE, NTS) { \ const unsigned char *const libxsmm_memcpy127_loop_src_ = (const unsigned char*)(PTRSRC); \ unsigned char *const libxsmm_memcpy127_loop_dst_ = (unsigned char*)(PTRDST); \ signed char libxsmm_memcpy127_loop_i_; LIBXSMM_ASSERT((SIZE) <= 127); \ NTS(libxsmm_memcpy127_loop_dst_) LIBXSMM_PRAGMA_UNROLL \ for (libxsmm_memcpy127_loop_i_ = 0; libxsmm_memcpy127_loop_i_ < (signed char)(SIZE); \ ++libxsmm_memcpy127_loop_i_) \ { \ 
libxsmm_memcpy127_loop_dst_[libxsmm_memcpy127_loop_i_] = \ libxsmm_memcpy127_loop_src_[libxsmm_memcpy127_loop_i_]; \ } \ } #define LIBXSMM_MEMCPY127_NTS(...) #define LIBXSMM_MEMCPY127(PTRDST, PTRSRC, SIZE) \ LIBXSMM_MEMCPY127_LOOP(PTRDST, PTRSRC, SIZE, LIBXSMM_MEMCPY127_NTS) #define LIBXSMM_ASSIGN127(PTRDST, PTRSRC) LIBXSMM_ASSERT(sizeof(*(PTRSRC)) <= sizeof(*(PTRDST))); \ LIBXSMM_MEMCPY127(PTRDST, PTRSRC, sizeof(*(PTRSRC))) /** * Calculates if there is a difference between two (short) buffers. * Returns zero if there is no difference; otherwise non-zero. */ LIBXSMM_API unsigned char libxsmm_diff(const void* a, const void* b, unsigned char size); /** * Calculates if there is a difference between "a" and "n x b". * Returns the index of the first match (or "n" in case of no match). */ LIBXSMM_API unsigned int libxsmm_diff_n(const void* a, const void* bn, unsigned char size, unsigned char stride, unsigned int hint, unsigned int n); /** Similar to memcmp (C standard library), but the result is conceptually only a boolean. */ LIBXSMM_API int libxsmm_memcmp(const void* a, const void* b, size_t size); /** Calculate a hash value for the given buffer and seed; accepts NULL-buffer. */ LIBXSMM_API unsigned int libxsmm_hash(const void* data, unsigned int size, unsigned int seed); /** Calculate a 64-bit hash for the given character string; accepts NULL-string. */ LIBXSMM_API unsigned long long libxsmm_hash_string(const char string[]); #endif /*LIBXSMM_MEMORY_H*/ libxsmm-1.17/include/libxsmm_mhd.h000066400000000000000000000152201415223013700172120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_MHD_H #define LIBXSMM_MHD_H #include "libxsmm_typedefs.h" /** Denotes the element/pixel type of an image/channel. */ typedef enum libxsmm_mhd_elemtype { LIBXSMM_MHD_ELEMTYPE_F64 = LIBXSMM_DATATYPE_F64, /* MET_DOUBLE */ LIBXSMM_MHD_ELEMTYPE_F32 = LIBXSMM_DATATYPE_F32, /* MET_FLOAT */ LIBXSMM_MHD_ELEMTYPE_BF16 = LIBXSMM_DATATYPE_BF16, /* MET_BFLOAT */ LIBXSMM_MHD_ELEMTYPE_I64 = LIBXSMM_DATATYPE_I64, /* MET_LONG */ LIBXSMM_MHD_ELEMTYPE_I32 = LIBXSMM_DATATYPE_I32, /* MET_INT */ LIBXSMM_MHD_ELEMTYPE_I16 = LIBXSMM_DATATYPE_I16, /* MET_SHORT */ LIBXSMM_MHD_ELEMTYPE_I8 = LIBXSMM_DATATYPE_I8, /* MET_CHAR */ LIBXSMM_MHD_ELEMTYPE_U64 = LIBXSMM_DATATYPE_UNSUPPORTED, /* MET_ULONG */ LIBXSMM_MHD_ELEMTYPE_U32, /* MET_UINT */ LIBXSMM_MHD_ELEMTYPE_U16, /* MET_USHORT */ LIBXSMM_MHD_ELEMTYPE_U8, /* MET_UCHAR */ LIBXSMM_MHD_ELEMTYPE_UNKNOWN } libxsmm_mhd_elemtype; /** * Function type used for custom data-handler or element conversion. * The value-range (src_min, src_max) may be used to scale values * in case of a type-conversion. */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE int (*libxsmm_mhd_element_handler)( void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type, const void* src, const void* src_min, const void* src_max); /** * Predefined function to perform element data conversion. * Scales source-values in case of non-NULL src_min and src_max, * or otherwise clamps to the destination-type. */ LIBXSMM_API int libxsmm_mhd_element_conversion( void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type, const void* src, const void* src_min, const void* src_max); /** * Predefined function to check a buffer against file content. 
* In case of different types, libxsmm_mhd_element_conversion * is performed to compare values using the source-type. */ LIBXSMM_API int libxsmm_mhd_element_comparison( void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type, const void* src, const void* src_min, const void* src_max); /** Returns the name and size of the element type; result may be NULL/0 in case of an unknown type. */ LIBXSMM_API const char* libxsmm_mhd_typename(libxsmm_mhd_elemtype type, size_t* typesize, const char** ctypename); /** Returns the type of the element for a given type-name. */ LIBXSMM_API libxsmm_mhd_elemtype libxsmm_mhd_typeinfo(const char elemname[]); /** * Parse the header of an MHD-file. The header can be part of the data file (local), * or separately stored (header: MHD, data MHA or RAW). */ LIBXSMM_API int libxsmm_mhd_read_header( /* Filename referring to the header-file (may also contain the data). */ const char header_filename[], /* Maximum length of path/file name. */ size_t filename_max_length, /* Filename containing the data (may be the same as the header-file). */ char filename[], /* Yields the maximum/possible number of dimensions on input, * and the actual number of dimensions on output. */ size_t* ndims, /* Image extents ("ndims" number of entries). */ size_t size[], /* Number of interleaved image channels. */ size_t* ncomponents, /* Type of the image elements (pixel type). */ libxsmm_mhd_elemtype* type, /* Size of the header in bytes; may be used to skip the header, * when reading content; can be a NULL-argument (optional). */ size_t* header_size, /* Size (in Bytes) of an user-defined extended data record; * can be a NULL-argument (optional). */ size_t* extension_size); /** * Loads the data file, and optionally allows data conversion. * Conversion is performed such that values are clamped to fit * into the destination. */ LIBXSMM_API int libxsmm_mhd_read( /* Filename referring to the data. 
*/ const char filename[], /* Offset within pitched buffer (NULL: no offset). */ const size_t offset[], /* Image dimensions (extents). */ const size_t size[], /* Leading buffer dimensions (NULL: same as size). */ const size_t pitch[], /* Dimensionality (number of entries in size). */ size_t ndims, /* Number of interleaved image channels. */ size_t ncomponents, /* Used to skip the header, and to only read the data. */ size_t header_size, /* Data element type as stored (pixel type). */ libxsmm_mhd_elemtype type_stored, /* Storage type (data conversion, optional). */ const libxsmm_mhd_elemtype* type_data, /* Buffer where the data is read into. */ void* data, /** * Optional callback executed per entry when reading the data. * May assign the value to the left-most argument, but also * allows to only compare with present data. Can be used to * avoid allocating an actual destination. */ libxsmm_mhd_element_handler handle_element, /* Post-content data (extension, optional). */ char extension[], /* Size of the extension; can be zero. */ size_t extension_size); /** * Save a file using an extended data format, which is compatible with the Meta Image Format (MHD). * The file is suitable for visual inspection using, e.g., ITK-SNAP or ParaView. */ LIBXSMM_API int libxsmm_mhd_write(const char filename[], /* Offset within pitched buffer (NULL: no offset). */ const size_t offset[], /* Image dimensions (extents). */ const size_t size[], /* Leading buffer dimensions (NULL: same as size). */ const size_t pitch[], /* Dimensionality, i.e., number of entries in data_size/size. */ size_t ndims, /* Number of pixel components. */ size_t ncomponents, /* Type (input). */ libxsmm_mhd_elemtype type_data, /* Type (data conversion, optional). */ const libxsmm_mhd_elemtype* type, /* Raw data to be saved. */ const void* data, /* Size of the header; can be a NULL-argument (optional). */ size_t* header_size, /* Extension header data; can be NULL. 
*/ const char extension_header[], /* Extension data stream; can be NULL. */ const void* extension, /* Extension data size; can be NULL. */ size_t extension_size); #endif /*LIBXSMM_MHD_H*/ libxsmm-1.17/include/libxsmm_rng.h000066400000000000000000000050041415223013700172270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_RNG_H #define LIBXSMM_RNG_H #include "libxsmm_typedefs.h" /** * create a new external state for thread-save execution managed * by the user. We do not provide a function for drawing the random numbers * the user is supposed to call the LIBXSMM_INTRINSICS_MM512_RNG_EXTSTATE_PS * or LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EXTSTATE_EPI32 intrinsic. * */ LIBXSMM_API unsigned int* libxsmm_rng_create_avx512_extstate(unsigned int/*uint32_t*/ seed); /** free a previously created rng_avx512_extstate */ LIBXSMM_API void libxsmm_rng_destroy_avx512_extstate(unsigned int* stateptr); /** Set the seed of libxsmm_rng_* (similar to srand). */ LIBXSMM_API void libxsmm_rng_set_seed(unsigned int/*uint32_t*/ seed); /** * This SP-RNG is using xoshiro128+ 1.0, work done by * David Blackman and Sebastiano Vigna (vigna@acm.org). * It is their best and fastest 32-bit generator for * 32-bit floating-point numbers. They suggest to use * its upper bits for floating-point generation, what * we do here and generate numbers in [0,1(. 
*/ LIBXSMM_API void libxsmm_rng_f32_seq(float* rngs, libxsmm_blasint count); /** * Returns a (pseudo-)random value based on rand/rand48 in the interval [0, n). * This function compensates for an n, which is not a factor of RAND_MAX. * Note: libxsmm_rng_set_seed must be used if one wishes to seed the generator. */ LIBXSMM_API unsigned int libxsmm_rng_u32(unsigned int n); /** Sequence of random data based on libxsmm_rng_u32. */ LIBXSMM_API void libxsmm_rng_seq(void* data, libxsmm_blasint nbytes); /** * Similar to libxsmm_rng_u32, but returns a DP-value in the interval [0, 1). * Note: libxsmm_rng_set_seed must be used if one wishes to seed the generator. */ LIBXSMM_API double libxsmm_rng_f64(void); #endif /* LIBXSMM_RNG_H */ libxsmm-1.17/include/libxsmm_source.h000066400000000000000000000126451415223013700177520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_SOURCE_H #define LIBXSMM_SOURCE_H #if defined(LIBXSMM_MACROS_H) # error Please do not include any LIBXSMM header other than libxsmm_source.h! #endif #if defined(LIBXSMM_BUILD) # error LIBXSMM_BUILD cannot be defined for the header-only LIBXSMM! #endif /** * This header is intentionally called "libxsmm_source.h" since the followings block * includes *internal* files, and thereby exposes LIBXSMM's implementation. 
* The so-called "header-only" usage model gives up the clearly defined binary interface * (including support for hot-fixes after deployment), and requires to rebuild client * code for every (internal) change of LIBXSMM. Please make sure to only rely on the * public interface as the internal implementation may change without notice. */ #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include "../src/generator_common.c" #include "../src/generator_gemm.c" #include "../src/generator_gemm_avx2_microkernel.c" #include "../src/generator_gemm_avx512_microkernel.c" #include "../src/generator_gemm_avx_microkernel.c" #include "../src/generator_gemm_common.c" #include "../src/generator_gemm_noarch.c" #include "../src/generator_gemm_sse3_avx_avx2_avx512.c" #include "../src/generator_gemm_sse3_microkernel.c" #include "../src/generator_matcopy.c" #include "../src/generator_matcopy_avx_avx512.c" #include "../src/generator_mateltwise.c" #include "../src/generator_mateltwise_avx_avx512.c" #include "../src/generator_packed.c" #include "../src/generator_packed_gemm_ac_rm_avx_avx2_avx512.c" #include "../src/generator_packed_gemm_avx_avx512.c" #include "../src/generator_packed_gemm_bc_rm_avx_avx2_avx512.c" #include "../src/generator_packed_getrf_avx_avx512.c" #include "../src/generator_packed_trmm_avx_avx512.c" #include "../src/generator_packed_trsm_avx_avx512.c" #include "../src/generator_spgemm.c" #include "../src/generator_spgemm_csc_asparse.c" #include "../src/generator_spgemm_csc_bsparse.c" #include "../src/generator_spgemm_csc_bsparse_soa.c" #include "../src/generator_spgemm_csc_csparse_soa.c" #include "../src/generator_spgemm_csc_reader.c" #include "../src/generator_spgemm_csr_asparse.c" #include "../src/generator_spgemm_csr_asparse_reg.c" #include "../src/generator_spgemm_csr_asparse_soa.c" #include "../src/generator_spgemm_csr_bsparse_soa.c" #include "../src/generator_spgemm_csr_reader.c" #include 
"../src/generator_transpose.c" #include "../src/generator_transpose_avx_avx512.c" #include "../src/generator_x86_instructions.c" #include "../src/libxsmm_blocked_gemm.c" #include "../src/libxsmm_cpuid_x86.c" #include "../src/libxsmm_dnn.c" #include "../src/libxsmm_dnn_convolution.c" #include "../src/libxsmm_dnn_convolution_backward.c" #include "../src/libxsmm_dnn_convolution_forward.c" #include "../src/libxsmm_dnn_convolution_weight_update.c" #include "../src/libxsmm_dnn_elementwise.c" #include "../src/libxsmm_dnn_fullyconnected.c" #include "../src/libxsmm_dnn_fullyconnected_backward_weight_update.c" #include "../src/libxsmm_dnn_fullyconnected_forward.c" #include "../src/libxsmm_dnn_fusedbatchnorm.c" #include "../src/libxsmm_dnn_fusedbatchnorm_backward.c" #include "../src/libxsmm_dnn_fusedbatchnorm_forward.c" #include "../src/libxsmm_dnn_fusedgroupnorm.c" #include "../src/libxsmm_dnn_fusedgroupnorm_backward.c" #include "../src/libxsmm_dnn_fusedgroupnorm_forward.c" #include "../src/libxsmm_dnn_optimizer.c" #include "../src/libxsmm_dnn_optimizer_sgd.c" #include "../src/libxsmm_dnn_pooling.c" #include "../src/libxsmm_dnn_pooling_backward.c" #include "../src/libxsmm_dnn_pooling_forward.c" #include "../src/libxsmm_dnn_rnncell.c" #include "../src/libxsmm_dnn_rnncell_backward_weight_update.c" #include "../src/libxsmm_dnn_rnncell_forward.c" #include "../src/libxsmm_dnn_softmaxloss.c" #include "../src/libxsmm_dnn_softmaxloss_backward.c" #include "../src/libxsmm_dnn_softmaxloss_forward.c" #include "../src/libxsmm_dnn_tensor.c" #include "../src/libxsmm_ext.c" #include "../src/libxsmm_ext_blocked_gemm.c" #include "../src/libxsmm_ext_gemm.c" #include "../src/libxsmm_ext_xcopy.c" #include "../src/libxsmm_fsspmdm.c" #include "../src/libxsmm_gemm.c" #include "../src/libxsmm_generator.c" #include "../src/libxsmm_hash.c" #include "../src/libxsmm_main.c" #include "../src/libxsmm_malloc.c" #include "../src/libxsmm_math.c" #include "../src/libxsmm_memory.c" #include 
"../src/libxsmm_mhd.c" #include "../src/libxsmm_perf.c" #include "../src/libxsmm_python.c" #include "../src/libxsmm_rng.c" #include "../src/libxsmm_spmdm.c" #include "../src/libxsmm_sync.c" #include "../src/libxsmm_timer.c" #include "../src/libxsmm_trace.c" #include "../src/libxsmm_xcopy.c" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #endif /*LIBXSMM_SOURCE_H*/ libxsmm-1.17/include/libxsmm_spmdm.h000066400000000000000000000074631415223013700175740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Nadathur Satish (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_SPMDM_H #define LIBXSMM_SPMDM_H #include "libxsmm_typedefs.h" typedef enum libxsmm_spmdm_datatype { LIBXSMM_SPMDM_DATATYPE_F32, LIBXSMM_SPMDM_DATATYPE_BFLOAT16 } libxsmm_spmdm_datatype; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_spmdm_handle { /* The following are the matrix multiply dimensions: A (sparse): m X k, B (dense): k X n, Output C (dense): m X n */ int m; int n; int k; /* The block sizes for A, B and C. 
*/ /* Here we fix A to be divided into 128 X 128 blocks, B/C to be 128 X 48 for HSW/BDW and 128 X 96 for SKX */ int bm; int bn; int bk; /* The number of blocks for the m, n and k dimensions */ int mb; int nb; int kb; libxsmm_spmdm_datatype datatype; char* base_ptr_scratch_A; char* base_ptr_scratch_B_scratch_C; int memory_for_scratch_per_thread; } libxsmm_spmdm_handle; /** * This stores a single sparse splice (or block) of sparse matrix A using a CSR representation (rowidx, colidx, and values * Each splice corresponds to a bm X bk region of A, and stores local indexes */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_CSR_sparseslice { /* Since bm and bk are assumed to be <=256, a 16-bit integer is enough to store the local rowidx, colidx */ uint16_t* rowidx; uint16_t* colidx; float* values; } libxsmm_CSR_sparseslice; LIBXSMM_API void libxsmm_spmdm_init( int M, int N, int K, int max_threads, libxsmm_spmdm_handle* handle, libxsmm_CSR_sparseslice** libxsmm_output_csr); LIBXSMM_API void libxsmm_spmdm_destroy( libxsmm_spmdm_handle* handle); LIBXSMM_API int libxsmm_spmdm_get_num_createSparseSlice_blocks( const libxsmm_spmdm_handle* handle); LIBXSMM_API int libxsmm_spmdm_get_num_compute_blocks( const libxsmm_spmdm_handle* handle); /** This converts a dense representation of the sparse matrix to 2D array of sparse slices. 
*/ LIBXSMM_API void libxsmm_spmdm_createSparseSlice_fp32_thread( const libxsmm_spmdm_handle* handle, char transa, const float* a, libxsmm_CSR_sparseslice* libxsmm_output_csr_a, int block_id, int tid, int nthreads); LIBXSMM_API void libxsmm_spmdm_createSparseSlice_bfloat16_thread( const libxsmm_spmdm_handle* handle, char transa, const libxsmm_bfloat16* a, libxsmm_CSR_sparseslice* libxsmm_output_csr_a, int block_id, int tid, int nthreads); /** NOTE: This code currently ignores alpha input to the matrix multiply */ LIBXSMM_API void libxsmm_spmdm_compute_fp32_thread( const libxsmm_spmdm_handle* handle, char transa, char transb, const float* alpha, libxsmm_CSR_sparseslice* a_sparse, const float* b, char transc, const float* beta, float* c, int block_id, int tid, int nthreads); /** NOTE: This code currently ignores alpha input to the matrix multiply */ LIBXSMM_API void libxsmm_spmdm_compute_bfloat16_thread( const libxsmm_spmdm_handle* handle, char transa, char transb, const libxsmm_bfloat16* alpha, libxsmm_CSR_sparseslice* a_sparse, const libxsmm_bfloat16* b, char transc, const libxsmm_bfloat16* beta, float* c, int block_id, int tid, int nthreads); #endif /*LIBXSMM_SPMDM_H*/ libxsmm-1.17/include/libxsmm_sync.h000066400000000000000000001400761415223013700174260ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_SYNC_H #define LIBXSMM_SYNC_H #include "libxsmm_intrinsics_x86.h" #if !defined(LIBXSMM_TLS) # if (0 != LIBXSMM_SYNC) && !defined(LIBXSMM_NO_TLS) # if defined(__CYGWIN__) && defined(__clang__) # define LIBXSMM_NO_TLS # define LIBXSMM_TLS # else # if (defined(_WIN32) && !defined(__GNUC__) && !defined(__clang__)) || (defined(__PGI) && !defined(__cplusplus)) # define LIBXSMM_TLS LIBXSMM_ATTRIBUTE(thread) # elif defined(__GNUC__) || defined(__clang__) || defined(_CRAYC) # define LIBXSMM_TLS __thread # elif defined(__cplusplus) # define LIBXSMM_TLS thread_local # else # error Missing TLS support! # endif # endif # else # if !defined(LIBXSMM_NO_TLS) # define LIBXSMM_NO_TLS # endif # define LIBXSMM_TLS # endif #endif #if !defined(LIBXSMM_GCC_BASELINE) && !defined(LIBXSMM_SYNC_LEGACY) && ((defined(_WIN32) && defined(__clang__)) || \ (defined(__GNUC__) && LIBXSMM_VERSION2(4, 7) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) # define LIBXSMM_GCC_BASELINE #endif #if defined(__MIC__) # define LIBXSMM_SYNC_PAUSE _mm_delay_32(8/*delay*/) #elif !defined(LIBXSMM_INTRINSICS_NONE) # if defined(LIBXSMM_GCC_BASELINE) && !defined(__INTEL_COMPILER) # define LIBXSMM_SYNC_PAUSE __builtin_ia32_pause() # else # define LIBXSMM_SYNC_PAUSE _mm_pause() # endif #elif (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH) && defined(__GNUC__) # define LIBXSMM_SYNC_PAUSE __asm__ __volatile__("pause" ::: "memory") #else # define LIBXSMM_SYNC_PAUSE #endif /* permit thread-unsafe */ #if !defined(LIBXSMM_SYNC_NONE) && ( \ (defined(__PGI) && (!defined(LIBXSMM_LIBATOMIC) || !defined(__STATIC))) || \ (defined(_CRAYC) && !defined(__GNUC__))) # define LIBXSMM_SYNC_NONE #endif #if !defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) && 0 # define LIBXSMM_ATOMIC_TRYLOCK_CMPSWP #endif #if !defined(LIBXSMM_ATOMIC_ZERO_STORE) && defined(_CRAYC) # define LIBXSMM_ATOMIC_ZERO_STORE #endif #if !defined(LIBXSMM_ATOMIC_LOCKTYPE) # if 
!defined(_WIN32) || 1 # define LIBXSMM_ATOMIC_LOCKTYPE char # else /* Windows */ # define LIBXSMM_ATOMIC_LOCKTYPE int # endif #endif typedef enum libxsmm_atomic_kind { #if defined(__ATOMIC_SEQ_CST) LIBXSMM_ATOMIC_SEQ_CST = __ATOMIC_SEQ_CST, #else LIBXSMM_ATOMIC_SEQ_CST = 0, #endif #if defined(__ATOMIC_RELAXED) LIBXSMM_ATOMIC_RELAXED = __ATOMIC_RELAXED #else LIBXSMM_ATOMIC_RELAXED = LIBXSMM_ATOMIC_SEQ_CST #endif } libxsmm_atomic_kind; #define LIBXSMM_NONATOMIC_LOCKTYPE LIBXSMM_ATOMIC_LOCKTYPE #define LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND) (*(SRC_PTR)) #define LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND) { LIBXSMM_UNUSED(KIND); *(DST_PTR) = (VALUE); } #define LIBXSMM_NONATOMIC_STORE_ZERO(DST_PTR, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, 0, KIND) #define LIBXSMM_NONATOMIC_FETCH_OR(DST_PTR, VALUE/*side-effect*/, KIND) (/* 1st step: swap(dst, val) */ \ ((*DST_PTR) = (*DST_PTR) ^ (VALUE)), (VALUE = (VALUE) ^ (*DST_PTR)), ((*DST_PTR) = (*DST_PTR) ^ (VALUE)), \ (*(DST_PTR) |= VALUE), (VALUE) /* 2nd step: or, and 3rd/last step: original dst-value */) #define LIBXSMM_NONATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) (*(DST_PTR) += VALUE) #define LIBXSMM_NONATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) (*(DST_PTR) -= VALUE) #define LIBXSMM_NONATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) (LIBXSMM_NONATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND), (*(DST_PTR) - (VALUE))) #define LIBXSMM_NONATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) (LIBXSMM_NONATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND), (*(DST_PTR) + (VALUE))) #define LIBXSMM_NONATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) ((NEWVAL) == (*(DST_PTR) == (OLDVAL) ? 
(*(DST_PTR) = (NEWVAL)) : (OLDVAL))) #define LIBXSMM_NONATOMIC_TRYLOCK(DST_PTR, KIND) LIBXSMM_NONATOMIC_CMPSWP(DST_PTR, 0, 1, KIND) #define LIBXSMM_NONATOMIC_ACQUIRE(DST_PTR, NPAUSE, KIND) { LIBXSMM_UNUSED(NPAUSE); \ LIBXSMM_ASSERT_MSG(0 == *(DST_PTR), "LIBXSMM_NONATOMIC_ACQUIRE"); LIBXSMM_NONATOMIC_STORE(DST_PTR, 1, KIND); \ LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_NONATOMIC_ACQUIRE"); } #define LIBXSMM_NONATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_UNUSED(DST_PTR); LIBXSMM_UNUSED(KIND); \ LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_NONATOMIC_RELEASE"); LIBXSMM_NONATOMIC_STORE(DST_PTR, 0, KIND); \ LIBXSMM_ASSERT_MSG(0 == *(DST_PTR), "LIBXSMM_NONATOMIC_RELEASE"); } #define LIBXSMM_NONATOMIC_SYNC(KIND) LIBXSMM_UNUSED(KIND) #if (0 == LIBXSMM_SYNC) || defined(LIBXSMM_SYNC_NONE) # define LIBXSMM_ATOMIC(FN, BITS) FN # define LIBXSMM_ATOMIC_LOAD LIBXSMM_NONATOMIC_LOAD # define LIBXSMM_ATOMIC_STORE LIBXSMM_NONATOMIC_STORE # define LIBXSMM_ATOMIC_STORE_ZERO LIBXSMM_NONATOMIC_STORE_ZERO # define LIBXSMM_ATOMIC_FETCH_OR LIBXSMM_NONATOMIC_FETCH_OR # define LIBXSMM_ATOMIC_ADD_FETCH LIBXSMM_NONATOMIC_ADD_FETCH # define LIBXSMM_ATOMIC_SUB_FETCH LIBXSMM_NONATOMIC_SUB_FETCH # define LIBXSMM_ATOMIC_FETCH_ADD LIBXSMM_NONATOMIC_FETCH_ADD # define LIBXSMM_ATOMIC_FETCH_SUB LIBXSMM_NONATOMIC_FETCH_SUB # define LIBXSMM_ATOMIC_CMPSWP LIBXSMM_NONATOMIC_CMPSWP # define LIBXSMM_ATOMIC_TRYLOCK LIBXSMM_NONATOMIC_TRYLOCK # define LIBXSMM_ATOMIC_ACQUIRE LIBXSMM_NONATOMIC_ACQUIRE # define LIBXSMM_ATOMIC_RELEASE LIBXSMM_NONATOMIC_RELEASE # define LIBXSMM_ATOMIC_SYNC LIBXSMM_NONATOMIC_SYNC # if !defined(LIBXSMM_SYNC_NPAUSE) # define LIBXSMM_SYNC_NPAUSE 0 # endif #elif (defined(LIBXSMM_GCC_BASELINE) || defined(LIBXSMM_LIBATOMIC) /* GNU's libatomic required */ || \ (defined(__GNUC__) && LIBXSMM_VERSION2(4, 1) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) # if defined(LIBXSMM_LIBATOMIC) # define LIBXSMM_ATOMIC(FN, BITS) LIBXSMM_CONCATENATE(LIBXSMM_ATOMIC, BITS)(FN) # define LIBXSMM_ATOMIC8(FN) 
LIBXSMM_CONCATENATE(FN, 8) # define LIBXSMM_ATOMIC16(FN) LIBXSMM_CONCATENATE(FN, 16) # define LIBXSMM_ATOMIC32(FN) FN/*default*/ # define LIBXSMM_ATOMIC64(FN) LIBXSMM_CONCATENATE(FN, 64) # if defined(__PGI) # define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND) # define LIBXSMM_ATOMIC_LOAD8(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND) # define LIBXSMM_ATOMIC_LOAD16(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND) # define LIBXSMM_ATOMIC_LOAD64(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND) # define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND) # define LIBXSMM_ATOMIC_STORE8(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND) # define LIBXSMM_ATOMIC_STORE16(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND) # define LIBXSMM_ATOMIC_STORE64(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND) # else # define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) __atomic_load_4(SRC_PTR, KIND) # define LIBXSMM_ATOMIC_LOAD8(SRC_PTR, KIND) __atomic_load_1(SRC_PTR, KIND) # define LIBXSMM_ATOMIC_LOAD16(SRC_PTR, KIND) __atomic_load_2(SRC_PTR, KIND) # define LIBXSMM_ATOMIC_LOAD64(SRC_PTR, KIND) __atomic_load_8(SRC_PTR, KIND) # define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) __atomic_store_4(DST_PTR, (unsigned int)(VALUE), KIND) # define LIBXSMM_ATOMIC_STORE8(DST_PTR, VALUE, KIND) __atomic_store_1(DST_PTR, (unsigned char)(VALUE), KIND) # define LIBXSMM_ATOMIC_STORE16(DST_PTR, VALUE, KIND) __atomic_store_2(DST_PTR, (unsigned short)(VALUE), KIND) # define LIBXSMM_ATOMIC_STORE64(DST_PTR, VALUE, KIND) __atomic_store_8(DST_PTR, (unsigned long long)(VALUE), KIND) # endif # define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) __atomic_fetch_or_4(DST_PTR, (unsigned int)(VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_OR8(DST_PTR, VALUE, KIND) __atomic_fetch_or_1(DST_PTR, (unsigned char)(VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_OR16(DST_PTR, VALUE, KIND) 
__atomic_fetch_or_2(DST_PTR, (unsigned short)(VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_OR64(DST_PTR, VALUE, KIND) __atomic_fetch_or_8(DST_PTR, (unsigned long long)(VALUE), KIND) # define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) __atomic_add_fetch_4(DST_PTR, (int)(VALUE), KIND) # define LIBXSMM_ATOMIC_ADD_FETCH8(DST_PTR, VALUE, KIND) __atomic_add_fetch_1(DST_PTR, (signed char)(VALUE), KIND) # define LIBXSMM_ATOMIC_ADD_FETCH16(DST_PTR, VALUE, KIND) __atomic_add_fetch_2(DST_PTR, (short)(VALUE), KIND) # define LIBXSMM_ATOMIC_ADD_FETCH64(DST_PTR, VALUE, KIND) __atomic_add_fetch_8(DST_PTR, (long long)(VALUE), KIND) # define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) __atomic_sub_fetch_4(DST_PTR, (int)(VALUE), KIND) # define LIBXSMM_ATOMIC_SUB_FETCH8(DST_PTR, VALUE, KIND) __atomic_sub_fetch_1(DST_PTR, (signed char)(VALUE), KIND) # define LIBXSMM_ATOMIC_SUB_FETCH16(DST_PTR, VALUE, KIND) __atomic_sub_fetch_2(DST_PTR, (short)(VALUE), KIND) # define LIBXSMM_ATOMIC_SUB_FETCH64(DST_PTR, VALUE, KIND) __atomic_sub_fetch_8(DST_PTR, (long long)(VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) __atomic_fetch_add_4(DST_PTR, (int)(VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_ADD8(DST_PTR, VALUE, KIND) __atomic_fetch_add_1(DST_PTR, (signed char)(VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, VALUE, KIND) __atomic_fetch_add_2(DST_PTR, (short)(VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, VALUE, KIND) __atomic_fetch_add_8(DST_PTR, (long long)(VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) __atomic_fetch_sub_4(DST_PTR, (int)(VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_SUB8(DST_PTR, VALUE, KIND) __atomic_fetch_sub_1(DST_PTR, (signed char)(VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_SUB16(DST_PTR, VALUE, KIND) __atomic_fetch_sub_2(DST_PTR, (short)(VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_SUB64(DST_PTR, VALUE, KIND) __atomic_fetch_sub_8(DST_PTR, (long long)(VALUE), KIND) # define 
LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) \ __atomic_compare_exchange_4(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED) # define LIBXSMM_ATOMIC_CMPSWP8(DST_PTR, OLDVAL, NEWVAL, KIND) \ __atomic_compare_exchange_1(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED) # define LIBXSMM_ATOMIC_CMPSWP16(DST_PTR, OLDVAL, NEWVAL, KIND) \ __atomic_compare_exchange_2(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED) # define LIBXSMM_ATOMIC_CMPSWP64(DST_PTR, OLDVAL, NEWVAL, KIND) \ __atomic_compare_exchange_8(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED) # if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) # define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (!__atomic_test_and_set(DST_PTR, KIND)) # endif # if defined(__PGI) # define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \ LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND); } /* matches bit-width of LIBXSMM_ATOMIC_LOCKTYPE */ # else # define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \ __atomic_clear(DST_PTR, KIND); } # endif # define LIBXSMM_ATOMIC_SYNC(KIND) __sync_synchronize() # if !defined(LIBXSMM_ATOMIC_ZERO_STORE) # define LIBXSMM_ATOMIC_ZERO_STORE # endif # elif defined(LIBXSMM_GCC_BASELINE) # define LIBXSMM_ATOMIC(FN, BITS) FN # define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) __atomic_load_n(SRC_PTR, KIND) # define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) __atomic_store_n(DST_PTR, VALUE, KIND) # if !defined(LIBXSMM_ATOMIC_ZERO_STORE) # define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) do {} while (__atomic_and_fetch(DST_PTR, 0, KIND)) # endif # define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) __atomic_fetch_or(DST_PTR, VALUE, KIND) # define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) __atomic_add_fetch(DST_PTR, VALUE, KIND) # define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) __atomic_sub_fetch(DST_PTR, VALUE, 
KIND) # define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) __atomic_fetch_add(DST_PTR, VALUE, KIND) # define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) __atomic_fetch_sub(DST_PTR, VALUE, KIND) # define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) __sync_bool_compare_and_swap(DST_PTR, OLDVAL, NEWVAL) # if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) # define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (!__atomic_test_and_set(DST_PTR, KIND)) # endif # define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \ __atomic_clear(DST_PTR, KIND); } # if 0 /* __atomic_thread_fence: incorrect behavior in libxsmm_barrier (even with LIBXSMM_ATOMIC_SEQ_CST) */ # define LIBXSMM_ATOMIC_SYNC(KIND) __atomic_thread_fence(KIND) # else # define LIBXSMM_ATOMIC_SYNC(KIND) __sync_synchronize() # endif # else /* GCC legacy atomics */ # define LIBXSMM_ATOMIC(FN, BITS) FN # define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) __sync_or_and_fetch(SRC_PTR, 0) # if (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) { \ __asm__ __volatile__("" ::: "memory"); *(DST_PTR) = (VALUE); \ __asm__ __volatile__("" ::: "memory"); } # else # define LIBXSMM_ATOMIC_SYNC_NOFENCE(KIND) # define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) *(DST_PTR) = (VALUE) # endif # if !defined(LIBXSMM_ATOMIC_ZERO_STORE) # define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) do {} while (__sync_and_and_fetch(DST_PTR, 0)) # endif # define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) __sync_fetch_and_or(DST_PTR, VALUE) # define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) __sync_add_and_fetch(DST_PTR, VALUE) # define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) __sync_sub_and_fetch(DST_PTR, VALUE) # define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) __sync_fetch_and_add(DST_PTR, VALUE) # define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) __sync_fetch_and_sub(DST_PTR, VALUE) # define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, 
NEWVAL, KIND) __sync_bool_compare_and_swap(DST_PTR, OLDVAL, NEWVAL) # if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) # define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (0 == __sync_lock_test_and_set(DST_PTR, 1)) # endif # define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \ __sync_lock_release(DST_PTR); } # define LIBXSMM_ATOMIC_SYNC(KIND) __sync_synchronize() # endif # if defined(LIBXSMM_ATOMIC_ZERO_STORE) # define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE(DST_PTR, 0, KIND) # define LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 8)(DST_PTR, 0, KIND) # define LIBXSMM_ATOMIC_STORE_ZERO16(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 16)(DST_PTR, 0, KIND) # define LIBXSMM_ATOMIC_STORE_ZERO64(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 64)(DST_PTR, 0, KIND) # endif # if !defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) # define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) /* matches bit-width of LIBXSMM_ATOMIC_LOCKTYPE */ \ (0 == LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_OR, 8)(DST_PTR, 1, KIND)) # endif # define LIBXSMM_ATOMIC_ACQUIRE(DST_PTR, NPAUSE, KIND) \ LIBXSMM_ASSERT(1 == sizeof(LIBXSMM_ATOMIC_LOCKTYPE)); LIBXSMM_ASSERT(0 == LIBXSMM_MOD2((uintptr_t)(DST_PTR), 4)); \ while (!LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND)) LIBXSMM_SYNC_CYCLE(DST_PTR, 0/*free*/, NPAUSE); \ LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_ACQUIRE") # if !defined(LIBXSMM_SYNC_NPAUSE) # define LIBXSMM_SYNC_NPAUSE 4096 # endif #elif defined(_WIN32) # define LIBXSMM_ATOMIC(FN, BITS) LIBXSMM_CONCATENATE(LIBXSMM_ATOMIC, BITS)(FN) # define LIBXSMM_ATOMIC8(FN) LIBXSMM_CONCATENATE(FN, 8) # define LIBXSMM_ATOMIC16(FN) LIBXSMM_CONCATENATE(FN, 16) # define LIBXSMM_ATOMIC32(FN) FN/*default*/ # define LIBXSMM_ATOMIC64(FN) LIBXSMM_CONCATENATE(FN, 64) # define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) InterlockedOr((volatile LONG*)(SRC_PTR), 0) # define LIBXSMM_ATOMIC_LOAD8(SRC_PTR, KIND) _InterlockedOr8((volatile 
char*)(SRC_PTR), 0) # define LIBXSMM_ATOMIC_LOAD64(SRC_PTR, KIND) InterlockedOr64((volatile LONGLONG*)(SRC_PTR), 0) # define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) InterlockedExchange((volatile LONG*)(DST_PTR), (LONG)(VALUE)) # define LIBXSMM_ATOMIC_STORE8(DST_PTR, VALUE, KIND) InterlockedExchange8((volatile char*)(DST_PTR), (LONGLONG)(VALUE)) # define LIBXSMM_ATOMIC_STORE64(DST_PTR, VALUE, KIND) InterlockedExchange64((volatile LONGLONG*)(DST_PTR), (LONGLONG)(VALUE)) # if defined(LIBXSMM_ATOMIC_ZERO_STORE) # define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE(DST_PTR, 0, KIND) # define LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE8(DST_PTR, 0, KIND) # define LIBXSMM_ATOMIC_STORE_ZERO64(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE64(DST_PTR, 0, KIND) # else # define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) InterlockedAnd((volatile LONG*)(DST_PTR), 0) # define LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND) InterlockedAnd8((volatile char*)(DST_PTR), 0) # define LIBXSMM_ATOMIC_STORE_ZERO64(DST_PTR, KIND) InterlockedAnd64((volatile LONGLONG*)(DST_PTR), 0) # endif # define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) InterlockedOr((volatile LONG*)(DST_PTR), VALUE) # define LIBXSMM_ATOMIC_FETCH_OR8(DST_PTR, VALUE, KIND) _InterlockedOr8((volatile char*)(DST_PTR), VALUE) # define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) + (VALUE)) # define LIBXSMM_ATOMIC_ADD_FETCH16(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, VALUE, KIND) + (VALUE)) # define LIBXSMM_ATOMIC_ADD_FETCH64(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, VALUE, KIND) + (VALUE)) # define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) ((size_t)LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) - ((size_t)VALUE)) # define LIBXSMM_ATOMIC_SUB_FETCH16(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_SUB16(DST_PTR, VALUE, KIND) - (VALUE)) # define LIBXSMM_ATOMIC_SUB_FETCH64(DST_PTR, VALUE, KIND) 
(LIBXSMM_ATOMIC_FETCH_SUB64(DST_PTR, VALUE, KIND) - (VALUE)) # define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) InterlockedExchangeAdd((volatile LONG*)(DST_PTR), VALUE) # define LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, VALUE, KIND) _InterlockedExchangeAdd16((volatile SHORT*)(DST_PTR), VALUE) # define LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, VALUE, KIND) InterlockedExchangeAdd64((volatile LONGLONG*)(DST_PTR), VALUE) # define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, -1 * (VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_SUB16(DST_PTR, VALUE, KIND) LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, -1 * (VALUE), KIND) # define LIBXSMM_ATOMIC_FETCH_SUB64(DST_PTR, VALUE, KIND) LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, -1 * (VALUE), KIND) # define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) (((LONG)(OLDVAL)) == InterlockedCompareExchange((volatile LONG*)(DST_PTR), NEWVAL, OLDVAL)) # define LIBXSMM_ATOMIC_CMPSWP8(DST_PTR, OLDVAL, NEWVAL, KIND) ((OLDVAL) == _InterlockedCompareExchange8((volatile char*)(DST_PTR), NEWVAL, OLDVAL)) # if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) # define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_CMPSWP, 8)(DST_PTR, 0, 1, KIND) # else # define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (0 == LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_OR, 8)(DST_PTR, 1, KIND)) # endif # define LIBXSMM_ATOMIC_ACQUIRE(DST_PTR, NPAUSE, KIND) \ /*LIBXSMM_ASSERT(1 == sizeof(LIBXSMM_ATOMIC_LOCKTYPE)); LIBXSMM_ASSERT(0 == LIBXSMM_MOD2((uintptr_t)(DST_PTR), 4));*/ \ while (!LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND)) LIBXSMM_SYNC_CYCLE(DST_PTR, 0/*free*/, NPAUSE); \ LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_ACQUIRE") # define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { \ LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \ LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE_ZERO, 8)(DST_PTR, KIND); } # define LIBXSMM_ATOMIC_SYNC(KIND) _ReadWriteBarrier() # if !defined(LIBXSMM_SYNC_NPAUSE) # define LIBXSMM_SYNC_NPAUSE 4096 # endif #else /* 
consider to permit LIBXSMM_SYNC_NONE */ # error LIBXSMM is missing atomic compiler builtins! #endif #if !defined(LIBXSMM_SYNC_CYCLE) # if (0 < LIBXSMM_SYNC_NPAUSE) # define LIBXSMM_SYNC_CYCLE_ELSE(DST_PTR, EXP_STATE, NPAUSE, ELSE) do { int libxsmm_sync_cycle_npause_ = 1; \ do { int libxsmm_sync_cycle_counter_ = 0; \ for (; libxsmm_sync_cycle_counter_ < libxsmm_sync_cycle_npause_; ++libxsmm_sync_cycle_counter_) LIBXSMM_SYNC_PAUSE; \ if (libxsmm_sync_cycle_npause_ < (NPAUSE)) { \ libxsmm_sync_cycle_npause_ *= 2; \ } \ else { \ libxsmm_sync_cycle_npause_ = (NPAUSE); \ LIBXSMM_SYNC_YIELD; \ ELSE \ } \ } while(((EXP_STATE) & 1) != (*(DST_PTR) & 1)); \ } while(0) # else # define LIBXSMM_SYNC_CYCLE_ELSE(DST_PTR, EXP_STATE, NPAUSE, ELSE) LIBXSMM_SYNC_PAUSE # endif # define LIBXSMM_SYNC_CYCLE(DST_PTR, EXP_STATE, NPAUSE) \ LIBXSMM_SYNC_CYCLE_ELSE(DST_PTR, EXP_STATE, NPAUSE, /*else*/;) #endif #if (0 != LIBXSMM_SYNC) # define LIBXSMM_LOCK_DEFAULT LIBXSMM_LOCK_SPINLOCK # if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) && !(defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)) && \ (!defined(__linux__) || defined(__USE_XOPEN2K)) && 0/*disabled*/ # define LIBXSMM_LOCK_SYSTEM_SPINLOCK # endif # if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX) && !(defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)) # define LIBXSMM_LOCK_SYSTEM_MUTEX # endif # if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) && !(defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)) && \ (!defined(__linux__) || defined(__USE_XOPEN2K) || defined(__USE_UNIX98)) # define LIBXSMM_LOCK_SYSTEM_RWLOCK # endif /* Lock type, initialization, destruction, (try-)lock, unlock, etc */ # define LIBXSMM_LOCK_ACQUIRED(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ACQUIRED_, KIND) # define LIBXSMM_LOCK_TYPE_ISPOD(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TYPE_ISPOD_, KIND) # define LIBXSMM_LOCK_TYPE_ISRW(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TYPE_ISRW_, KIND) # define LIBXSMM_LOCK_TYPE(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TYPE_, KIND) # define LIBXSMM_LOCK_INIT(KIND, LOCK, ATTR) 
LIBXSMM_CONCATENATE(LIBXSMM_LOCK_INIT_, KIND)(LOCK, ATTR) # define LIBXSMM_LOCK_DESTROY(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_DESTROY_, KIND)(LOCK) # define LIBXSMM_LOCK_TRYLOCK(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TRYLOCK_, KIND)(LOCK) # define LIBXSMM_LOCK_ACQUIRE(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ACQUIRE_, KIND)(LOCK) # define LIBXSMM_LOCK_RELEASE(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_RELEASE_, KIND)(LOCK) # define LIBXSMM_LOCK_TRYREAD(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TRYREAD_, KIND)(LOCK) # define LIBXSMM_LOCK_ACQREAD(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ACQREAD_, KIND)(LOCK) # define LIBXSMM_LOCK_RELREAD(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_RELREAD_, KIND)(LOCK) /* Attribute type, initialization, destruction */ # define LIBXSMM_LOCK_ATTR_TYPE(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ATTR_TYPE_, KIND) # define LIBXSMM_LOCK_ATTR_INIT(KIND, ATTR) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ATTR_INIT_, KIND)(ATTR) # define LIBXSMM_LOCK_ATTR_DESTROY(KIND, ATTR) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ATTR_DESTROY_, KIND)(ATTR) /* Cygwin's Pthread implementation appears to be broken; use Win32 */ # if !defined(LIBXSMM_WIN32_THREADS) && (defined(_WIN32) || defined(__CYGWIN__)) # define LIBXSMM_WIN32_THREADS _WIN32_WINNT # if defined(__CYGWIN__) || defined(__MINGW32__) /* hack: make SRW-locks available */ # if defined(_WIN32_WINNT) # undef _WIN32_WINNT # if !defined(NTDDI_VERSION) # define NTDDI_VERSION 0x0600 # endif # define _WIN32_WINNT ((LIBXSMM_WIN32_THREADS) | 0x0600) # else # define _WIN32_WINNT 0x0600 # endif # endif # endif # if defined(LIBXSMM_WIN32_THREADS) # define LIBXSMM_TLS_TYPE DWORD # define LIBXSMM_TLS_CREATE(KEYPTR) *(KEYPTR) = TlsAlloc() # define LIBXSMM_TLS_DESTROY(KEY) TlsFree(KEY) # define LIBXSMM_TLS_SETVALUE(KEY, PTR) TlsSetValue(KEY, PTR) # define LIBXSMM_TLS_GETVALUE(KEY) TlsGetValue(KEY) # define LIBXSMM_LOCK_SPINLOCK spin # if ((LIBXSMM_WIN32_THREADS) & 0x0600) # define LIBXSMM_LOCK_MUTEX rwlock # 
define LIBXSMM_LOCK_RWLOCK rwlock # else /* mutex exposes high latency */ # define LIBXSMM_LOCK_MUTEX mutex # define LIBXSMM_LOCK_RWLOCK mutex # endif # if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) # define LIBXSMM_LOCK_ACQUIRED_spin TRUE # define LIBXSMM_LOCK_TYPE_ISPOD_spin 0 # define LIBXSMM_LOCK_TYPE_spin CRITICAL_SECTION # define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); InitializeCriticalSection(LOCK); } # define LIBXSMM_LOCK_DESTROY_spin(LOCK) DeleteCriticalSection((LIBXSMM_LOCK_TYPE_spin*)(LOCK)) # define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) TryEnterCriticalSection(LOCK) # define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) EnterCriticalSection(LOCK) # define LIBXSMM_LOCK_RELEASE_spin(LOCK) LeaveCriticalSection(LOCK) # define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK) # define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK) # define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK) # define LIBXSMM_LOCK_ATTR_TYPE_spin int # define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR) # define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR) # endif # if defined(LIBXSMM_LOCK_SYSTEM_MUTEX) # define LIBXSMM_LOCK_ACQUIRED_mutex WAIT_OBJECT_0 # define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0 # define LIBXSMM_LOCK_TYPE_ISRW_mutex 0 # define LIBXSMM_LOCK_TYPE_mutex HANDLE # define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) (*(LOCK) = CreateMutex(*(ATTR), FALSE, NULL)) # define LIBXSMM_LOCK_DESTROY_mutex(LOCK) CloseHandle(*(LOCK)) # define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) WaitForSingleObject(*(LOCK), 0) # define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) WaitForSingleObject(*(LOCK), INFINITE) # define LIBXSMM_LOCK_RELEASE_mutex(LOCK) ReleaseMutex(*(LOCK)) # define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) # define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) # define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK) # define LIBXSMM_LOCK_ATTR_TYPE_mutex LPSECURITY_ATTRIBUTES 
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (*(ATTR) = NULL) # define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR) # endif # if defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) # define LIBXSMM_LOCK_ACQUIRED_rwlock TRUE # define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 1 # define LIBXSMM_LOCK_TYPE_ISRW_rwlock 1 # define LIBXSMM_LOCK_TYPE_rwlock SRWLOCK # define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); InitializeSRWLock(LOCK); } # define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) LIBXSMM_UNUSED(LOCK) # define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) TryAcquireSRWLockExclusive(LOCK) # define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) AcquireSRWLockExclusive(LOCK) # define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) ReleaseSRWLockExclusive(LOCK) # define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) TryAcquireSRWLockShared(LOCK) # define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) AcquireSRWLockShared(LOCK) # define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) ReleaseSRWLockShared(LOCK) # define LIBXSMM_LOCK_ATTR_TYPE_rwlock int # define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) # define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) # endif # define LIBXSMM_SYNC_YIELD YieldProcessor() # else # define LIBXSMM_TLS_TYPE pthread_key_t # define LIBXSMM_TLS_CREATE(KEYPTR) pthread_key_create(KEYPTR, NULL) # define LIBXSMM_TLS_DESTROY(KEY) pthread_key_delete(KEY) # define LIBXSMM_TLS_SETVALUE(KEY, PTR) pthread_setspecific(KEY, PTR) # define LIBXSMM_TLS_GETVALUE(KEY) pthread_getspecific(KEY) # if defined(__APPLE__) && defined(__MACH__) # define LIBXSMM_SYNC_YIELD pthread_yield_np() # else # if defined(__USE_GNU) || !defined(__BSD_VISIBLE) LIBXSMM_EXTERN int pthread_yield(void) LIBXSMM_THROW; # else LIBXSMM_EXTERN void pthread_yield(void); # endif # define LIBXSMM_SYNC_YIELD pthread_yield() # endif # if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) && defined(__APPLE__) && defined(__MACH__) # define LIBXSMM_LOCK_SPINLOCK mutex # else # define LIBXSMM_LOCK_SPINLOCK spin # endif # define LIBXSMM_LOCK_MUTEX mutex # 
define LIBXSMM_LOCK_RWLOCK rwlock # if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) # define LIBXSMM_LOCK_ACQUIRED_spin 0 # define LIBXSMM_LOCK_TYPE_ISPOD_spin 0 # define LIBXSMM_LOCK_TYPE_ISRW_spin 0 # define LIBXSMM_LOCK_TYPE_spin pthread_spinlock_t # define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) LIBXSMM_EXPECT(0, pthread_spin_init(LOCK, *(ATTR))) # define LIBXSMM_LOCK_DESTROY_spin(LOCK) LIBXSMM_EXPECT(0, pthread_spin_destroy(LOCK)) # define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) pthread_spin_trylock(LOCK) # define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) LIBXSMM_EXPECT(0, pthread_spin_lock(LOCK)) # define LIBXSMM_LOCK_RELEASE_spin(LOCK) LIBXSMM_EXPECT(0, pthread_spin_unlock(LOCK)) # define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK) # define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK) # define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK) # define LIBXSMM_LOCK_ATTR_TYPE_spin int # define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) (*(ATTR) = 0) # define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR) # endif # if defined(LIBXSMM_LOCK_SYSTEM_MUTEX) # define LIBXSMM_LOCK_ACQUIRED_mutex 0 # define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0 # define LIBXSMM_LOCK_TYPE_ISRW_mutex 0 # define LIBXSMM_LOCK_TYPE_mutex pthread_mutex_t # define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) LIBXSMM_EXPECT(0, pthread_mutex_init(LOCK, ATTR)) # define LIBXSMM_LOCK_DESTROY_mutex(LOCK) LIBXSMM_EXPECT_DEBUG(0, pthread_mutex_destroy(LOCK)) # define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) pthread_mutex_trylock(LOCK) /*!LIBXSMM_EXPECT*/ # define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) LIBXSMM_EXPECT(0, pthread_mutex_lock(LOCK)) # define LIBXSMM_LOCK_RELEASE_mutex(LOCK) LIBXSMM_EXPECT(0, pthread_mutex_unlock(LOCK)) # define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) # define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) # define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK) # define LIBXSMM_LOCK_ATTR_TYPE_mutex 
pthread_mutexattr_t #if !defined(__linux__) || defined(__USE_UNIX98) || defined(__USE_XOPEN2K8) # if defined(_DEBUG) # define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (LIBXSMM_EXPECT(0, pthread_mutexattr_init(ATTR)), \ LIBXSMM_EXPECT(0, pthread_mutexattr_settype(ATTR, PTHREAD_MUTEX_ERRORCHECK))) # else # define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (pthread_mutexattr_init(ATTR), \ pthread_mutexattr_settype(ATTR, PTHREAD_MUTEX_NORMAL)) # endif #else # define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) pthread_mutexattr_init(ATTR) #endif # define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_EXPECT(0, pthread_mutexattr_destroy(ATTR)) # endif # if defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) # define LIBXSMM_LOCK_ACQUIRED_rwlock 0 # define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 0 # define LIBXSMM_LOCK_TYPE_ISRW_rwlock 1 # define LIBXSMM_LOCK_TYPE_rwlock pthread_rwlock_t # define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) LIBXSMM_EXPECT(0, pthread_rwlock_init(LOCK, ATTR)) # define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_destroy(LOCK)) # define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) pthread_rwlock_trywrlock(LOCK) # define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_wrlock(LOCK)) # define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_unlock(LOCK)) # define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) pthread_rwlock_tryrdlock(LOCK) # define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_rdlock(LOCK)) # define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) LIBXSMM_LOCK_RELEASE_rwlock(LOCK) # define LIBXSMM_LOCK_ATTR_TYPE_rwlock pthread_rwlockattr_t # define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_EXPECT(0, pthread_rwlockattr_init(ATTR)) # define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_EXPECT(0, pthread_rwlockattr_destroy(ATTR)) # endif # endif /* OpenMP based locks need to stay disabled unless both * libxsmm and libxsmmext are built with OpenMP support. 
*/ # if defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP) # if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) # define LIBXSMM_LOCK_ACQUIRED_spin 1 # define LIBXSMM_LOCK_TYPE_ISPOD_spin 0 # define LIBXSMM_LOCK_TYPE_ISRW_spin 0 # define LIBXSMM_LOCK_TYPE_spin omp_lock_t # define LIBXSMM_LOCK_DESTROY_spin(LOCK) omp_destroy_lock(LOCK) # define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) omp_test_lock(LOCK) # define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) omp_set_lock(LOCK) # define LIBXSMM_LOCK_RELEASE_spin(LOCK) omp_unset_lock(LOCK) # define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK) # define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK) # define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK) # if (201811 <= _OPENMP/*v5.0*/) # define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) omp_init_lock_with_hint(LOCK, *(ATTR)) # define LIBXSMM_LOCK_ATTR_TYPE_spin omp_lock_hint_t # define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) (*(ATTR) = omp_lock_hint_none) # else # define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); omp_init_lock(LOCK); } # define LIBXSMM_LOCK_ATTR_TYPE_spin const void* # define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR) # endif # define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR) # endif # if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX) # define LIBXSMM_LOCK_ACQUIRED_mutex 1 # define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0 # define LIBXSMM_LOCK_TYPE_ISRW_mutex 0 # define LIBXSMM_LOCK_TYPE_mutex omp_lock_t # define LIBXSMM_LOCK_DESTROY_mutex(LOCK) omp_destroy_lock(LOCK) # define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) omp_test_lock(LOCK) # define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) omp_set_lock(LOCK) # define LIBXSMM_LOCK_RELEASE_mutex(LOCK) omp_unset_lock(LOCK) # define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) # define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) # define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK) # if (201811 <= _OPENMP/*v5.0*/) # define 
LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) omp_init_lock_with_hint(LOCK, *(ATTR)) # define LIBXSMM_LOCK_ATTR_TYPE_mutex omp_lock_hint_t # define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (*(ATTR) = omp_lock_hint_none) # else # define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); omp_init_lock(LOCK); } # define LIBXSMM_LOCK_ATTR_TYPE_mutex const void* # define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) LIBXSMM_UNUSED(ATTR) # endif # define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR) # endif # if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) # define LIBXSMM_LOCK_ACQUIRED_rwlock 1 # define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 0 # define LIBXSMM_LOCK_TYPE_ISRW_rwlock 0 # define LIBXSMM_LOCK_TYPE_rwlock omp_lock_t # define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) omp_destroy_lock(LOCK) # define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) omp_test_lock(LOCK) # define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) omp_set_lock(LOCK) # define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) omp_unset_lock(LOCK) # define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) # define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) # define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) LIBXSMM_LOCK_RELEASE_rwlock(LOCK) # if (201811 <= _OPENMP/*v5.0*/) # define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) omp_init_lock_with_hint(LOCK, *(ATTR)) # define LIBXSMM_LOCK_ATTR_TYPE_rwlock omp_lock_hint_t # define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) (*(ATTR) = omp_lock_hint_none) # else # define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); omp_init_lock(LOCK); } # define LIBXSMM_LOCK_ATTR_TYPE_rwlock const void* # define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) # endif # define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) # endif # elif !defined(LIBXSMM_SYNC_NONE) /* based on atomic primitives */ # if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) # define LIBXSMM_LOCK_ACQUIRED_spin 0 # define LIBXSMM_LOCK_TYPE_ISPOD_spin 1 # define LIBXSMM_LOCK_TYPE_ISRW_spin 0 # define 
LIBXSMM_LOCK_TYPE_spin volatile LIBXSMM_ATOMIC_LOCKTYPE # define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 0); } # define LIBXSMM_LOCK_DESTROY_spin(LOCK) LIBXSMM_UNUSED(LOCK) # define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) (LIBXSMM_LOCK_ACQUIRED_spin + !LIBXSMM_ATOMIC_TRYLOCK(LOCK, LIBXSMM_ATOMIC_RELAXED)) # define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) LIBXSMM_ATOMIC_ACQUIRE(LOCK, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED) # define LIBXSMM_LOCK_RELEASE_spin(LOCK) LIBXSMM_ATOMIC_RELEASE(LOCK, LIBXSMM_ATOMIC_RELAXED) # define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK) # define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK) # define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK) # define LIBXSMM_LOCK_ATTR_TYPE_spin int # define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR) # define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR) # endif # if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX) # define LIBXSMM_LOCK_ACQUIRED_mutex 0 # define LIBXSMM_LOCK_TYPE_ISPOD_mutex 1 # define LIBXSMM_LOCK_TYPE_ISRW_mutex 0 # define LIBXSMM_LOCK_TYPE_mutex volatile LIBXSMM_ATOMIC_LOCKTYPE # define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 0); } # define LIBXSMM_LOCK_DESTROY_mutex(LOCK) LIBXSMM_UNUSED(LOCK) # define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) (LIBXSMM_LOCK_ACQUIRED_mutex + !LIBXSMM_ATOMIC_TRYLOCK(LOCK, LIBXSMM_ATOMIC_RELAXED)) # define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) LIBXSMM_ATOMIC_ACQUIRE(LOCK, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED) # define LIBXSMM_LOCK_RELEASE_mutex(LOCK) LIBXSMM_ATOMIC_RELEASE(LOCK, LIBXSMM_ATOMIC_RELAXED) # define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) # define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) # define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK) # define LIBXSMM_LOCK_ATTR_TYPE_mutex int # define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) LIBXSMM_UNUSED(ATTR) # define 
LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR) # endif # if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) # define LIBXSMM_LOCK_ACQUIRED_rwlock 0 # define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 1 # define LIBXSMM_LOCK_TYPE_ISRW_rwlock 0 # define LIBXSMM_LOCK_TYPE_rwlock volatile LIBXSMM_ATOMIC_LOCKTYPE # define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 0); } # define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) LIBXSMM_UNUSED(LOCK) # define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) (LIBXSMM_LOCK_ACQUIRED_rwlock + !LIBXSMM_ATOMIC_TRYLOCK(LOCK, LIBXSMM_ATOMIC_RELAXED)) # define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) LIBXSMM_ATOMIC_ACQUIRE(LOCK, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED) # define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) LIBXSMM_ATOMIC_RELEASE(LOCK, LIBXSMM_ATOMIC_RELAXED) # define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) # define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) # define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) LIBXSMM_LOCK_RELEASE_rwlock(LOCK) # define LIBXSMM_LOCK_ATTR_TYPE_rwlock int # define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) # define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) # endif # else /* experimental */ # if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) # define LIBXSMM_LOCK_ACQUIRED_spin 0 # define LIBXSMM_LOCK_TYPE_ISPOD_spin 0 # define LIBXSMM_LOCK_TYPE_ISRW_spin 0 # define LIBXSMM_LOCK_TYPE_spin libxsmm_spinlock* # define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = libxsmm_spinlock_create()); } # define LIBXSMM_LOCK_DESTROY_spin(LOCK) libxsmm_spinlock_destroy(*(LOCK)) # define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) libxsmm_spinlock_trylock(*(LOCK)) # define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) libxsmm_spinlock_acquire(*(LOCK)) # define LIBXSMM_LOCK_RELEASE_spin(LOCK) libxsmm_spinlock_release(*(LOCK)) # define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK) # define LIBXSMM_LOCK_ACQREAD_spin(LOCK) 
LIBXSMM_LOCK_ACQUIRE_spin(LOCK) # define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK) # define LIBXSMM_LOCK_ATTR_TYPE_spin int # define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR) # define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR) # endif # if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX) # define LIBXSMM_LOCK_ACQUIRED_mutex 0 # define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0 # define LIBXSMM_LOCK_TYPE_ISRW_mutex 0 # define LIBXSMM_LOCK_TYPE_mutex libxsmm_mutex* # define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = libxsmm_mutex_create()); } # define LIBXSMM_LOCK_DESTROY_mutex(LOCK) libxsmm_mutex_destroy(*(LOCK)) # define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) libxsmm_mutex_trylock(*(LOCK)) # define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) libxsmm_mutex_acquire(*(LOCK)) # define LIBXSMM_LOCK_RELEASE_mutex(LOCK) libxsmm_mutex_release(*(LOCK)) # define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) # define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) # define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK) # define LIBXSMM_LOCK_ATTR_TYPE_mutex int # define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) LIBXSMM_UNUSED(ATTR) # define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR) # endif # if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) # define LIBXSMM_LOCK_ACQUIRED_rwlock 0 # define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 0 # define LIBXSMM_LOCK_TYPE_ISRW_rwlock 1 # define LIBXSMM_LOCK_TYPE_rwlock libxsmm_rwlock* # define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = libxsmm_rwlock_create()); } # define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) libxsmm_rwlock_destroy(*(LOCK)) # define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) libxsmm_rwlock_trylock(*(LOCK)) # define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) libxsmm_rwlock_acquire(*(LOCK)) # define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) libxsmm_rwlock_release(*(LOCK)) # define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) libxsmm_rwlock_tryread(*(LOCK)) 
# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) libxsmm_rwlock_acqread(*(LOCK)) # define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) libxsmm_rwlock_relread(*(LOCK)) # define LIBXSMM_LOCK_ATTR_TYPE_rwlock int # define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) # define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) # endif # endif #else /* no synchronization */ # define LIBXSMM_SYNC_YIELD LIBXSMM_SYNC_PAUSE # define LIBXSMM_LOCK_SPINLOCK spinlock_dummy # define LIBXSMM_LOCK_MUTEX mutex_dummy # define LIBXSMM_LOCK_RWLOCK rwlock_dummy # define LIBXSMM_LOCK_ACQUIRED(KIND) 0 # define LIBXSMM_LOCK_TYPE_ISPOD(KIND) 1 # define LIBXSMM_LOCK_TYPE_ISRW(KIND) 0 # define LIBXSMM_LOCK_ATTR_TYPE(KIND) int # define LIBXSMM_LOCK_ATTR_INIT(KIND, ATTR) LIBXSMM_UNUSED(ATTR) # define LIBXSMM_LOCK_ATTR_DESTROY(KIND, ATTR) LIBXSMM_UNUSED(ATTR) # define LIBXSMM_LOCK_TYPE(KIND) int # define LIBXSMM_LOCK_INIT(KIND, LOCK, ATTR) { LIBXSMM_UNUSED(LOCK); LIBXSMM_UNUSED(ATTR); } # define LIBXSMM_LOCK_DESTROY(KIND, LOCK) LIBXSMM_UNUSED(LOCK) # define LIBXSMM_LOCK_TRYLOCK(KIND, LOCK) LIBXSMM_LOCK_ACQUIRED(KIND) # define LIBXSMM_LOCK_ACQUIRE(KIND, LOCK) LIBXSMM_UNUSED(LOCK) # define LIBXSMM_LOCK_RELEASE(KIND, LOCK) LIBXSMM_UNUSED(LOCK) # define LIBXSMM_LOCK_TRYREAD(KIND, LOCK) LIBXSMM_LOCK_TRYLOCK(KIND, LOCK) # define LIBXSMM_LOCK_ACQREAD(KIND, LOCK) LIBXSMM_LOCK_ACQUIRE(KIND, LOCK) # define LIBXSMM_LOCK_RELREAD(KIND, LOCK) LIBXSMM_LOCK_RELEASE(KIND, LOCK) #endif #if (0 == LIBXSMM_SYNC) # define LIBXSMM_FLOCK(FILE) # define LIBXSMM_FUNLOCK(FILE) #elif defined(_WIN32) # define LIBXSMM_FLOCK(FILE) _lock_file(FILE) # define LIBXSMM_FUNLOCK(FILE) _unlock_file(FILE) #else # if !defined(__CYGWIN__) # define LIBXSMM_FLOCK(FILE) flockfile(FILE) # define LIBXSMM_FUNLOCK(FILE) funlockfile(FILE) LIBXSMM_EXTERN void flockfile(FILE*) LIBXSMM_THROW; LIBXSMM_EXTERN void funlockfile(FILE*) LIBXSMM_THROW; # else /* Only available with __CYGWIN__ *and* C++0x. 
*/ # define LIBXSMM_FLOCK(FILE) # define LIBXSMM_FUNLOCK(FILE) # endif #endif /** Synchronize console output */ #define LIBXSMM_STDIO_ACQUIRE() LIBXSMM_FLOCK(stdout); LIBXSMM_FLOCK(stderr) #define LIBXSMM_STDIO_RELEASE() LIBXSMM_FUNLOCK(stderr); LIBXSMM_FUNLOCK(stdout) /** Opaque type which represents a barrier. */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_barrier libxsmm_barrier; /** Create barrier from one of the threads. */ LIBXSMM_API libxsmm_barrier* libxsmm_barrier_create(int ncores, int nthreads_per_core); /** Initialize the barrier from each thread of the team. */ LIBXSMM_API void libxsmm_barrier_init(libxsmm_barrier* barrier, int tid); /** Wait for the entire team to arrive. */ LIBXSMM_API void libxsmm_barrier_wait(libxsmm_barrier* barrier, int tid); /** Destroy the resources associated with this barrier. */ LIBXSMM_API void libxsmm_barrier_destroy(const libxsmm_barrier* barrier); /** DEPRECATED: use libxsmm_barrier_destroy instead. */ #define libxsmm_barrier_release libxsmm_barrier_destroy /** Spin-lock, which eventually differs from LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_SPINLOCK). */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_spinlock libxsmm_spinlock; LIBXSMM_API libxsmm_spinlock* libxsmm_spinlock_create(void); LIBXSMM_API void libxsmm_spinlock_destroy(const libxsmm_spinlock* spinlock); LIBXSMM_API int libxsmm_spinlock_trylock(libxsmm_spinlock* spinlock); LIBXSMM_API void libxsmm_spinlock_acquire(libxsmm_spinlock* spinlock); LIBXSMM_API void libxsmm_spinlock_release(libxsmm_spinlock* spinlock); /** Mutual-exclusive lock (Mutex), which eventually differs from LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_MUTEX). 
*/ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_mutex libxsmm_mutex; LIBXSMM_API libxsmm_mutex* libxsmm_mutex_create(void); LIBXSMM_API void libxsmm_mutex_destroy(const libxsmm_mutex* mutex); LIBXSMM_API int libxsmm_mutex_trylock(libxsmm_mutex* mutex); LIBXSMM_API void libxsmm_mutex_acquire(libxsmm_mutex* mutex); LIBXSMM_API void libxsmm_mutex_release(libxsmm_mutex* mutex); /** Reader-Writer lock (RW-lock), which eventually differs from LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_RWLOCK). */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_rwlock libxsmm_rwlock; LIBXSMM_API libxsmm_rwlock* libxsmm_rwlock_create(void); LIBXSMM_API void libxsmm_rwlock_destroy(const libxsmm_rwlock* rwlock); LIBXSMM_API int libxsmm_rwlock_trylock(libxsmm_rwlock* rwlock); LIBXSMM_API void libxsmm_rwlock_acquire(libxsmm_rwlock* rwlock); LIBXSMM_API void libxsmm_rwlock_release(libxsmm_rwlock* rwlock); LIBXSMM_API int libxsmm_rwlock_tryread(libxsmm_rwlock* rwlock); LIBXSMM_API void libxsmm_rwlock_acqread(libxsmm_rwlock* rwlock); LIBXSMM_API void libxsmm_rwlock_relread(libxsmm_rwlock* rwlock); /** Utility function to receive the process ID of the calling process. */ LIBXSMM_API unsigned int libxsmm_get_pid(void); /** * Utility function to receive a Thread-ID (TID) for the calling thread. * The TID is not related to a specific threading runtime. TID=0 may not * represent the main thread. TIDs are zero-based and consecutive numbers. */ LIBXSMM_API unsigned int libxsmm_get_tid(void); #endif /*LIBXSMM_SYNC_H*/ libxsmm-1.17/include/libxsmm_timer.h000066400000000000000000000033771415223013700175740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_TIMER_H #define LIBXSMM_TIMER_H #include "libxsmm_macros.h" typedef unsigned long long libxsmm_timer_tickint; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_timer_info { int tsc; } libxsmm_timer_info; /** Query timer properties. */ LIBXSMM_API int libxsmm_get_timer_info(libxsmm_timer_info* info); /** * Returns the current clock tick of a monotonic timer source with * platform-specific resolution (not necessarily CPU cycles). */ LIBXSMM_API libxsmm_timer_tickint libxsmm_timer_tick(void); /** Returns the difference between two timer ticks (cycles); avoids potential side-effects/assumptions of LIBXSMM_DIFF. */ LIBXSMM_API_INLINE libxsmm_timer_tickint libxsmm_timer_ncycles(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1) { return LIBXSMM_DELTA(tick0, tick1); } /** Returns the duration (in seconds) between two values received by libxsmm_timer_tick. */ LIBXSMM_API double libxsmm_timer_duration(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1); #endif /*LIBXSMM_TIMER_H*/ libxsmm-1.17/include/libxsmm_typedefs.h000066400000000000000000001511441415223013700202730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_TYPEDEFS_H #define LIBXSMM_TYPEDEFS_H #include "libxsmm_macros.h" /** Check ILP64 configuration for sanity. */ #if !defined(LIBXSMM_ILP64) || (0 == LIBXSMM_ILP64 && defined(MKL_ILP64)) # error "Inconsistent ILP64 configuration detected!" #elif (0 != LIBXSMM_ILP64 && !defined(MKL_ILP64)) # define MKL_ILP64 #endif #if (0 != LIBXSMM_ILP64) # define LIBXSMM_BLASINT_NBITS 64 # define LIBXSMM_BLASINT long long #else /* LP64 */ # define LIBXSMM_BLASINT_NBITS 32 # define LIBXSMM_BLASINT int #endif /** Generic prefetches; similar to LIBXSMM_PREFETCH_AUTO (libxsmm_frontend.h) */ #define LIBXSMM_PREFETCH_SIGONLY 1 #define LIBXSMM_PREFETCH_NONE 0 /** Helper macro for type names. */ #define LIBXSMM_TYPENAME(TYPE) LIBXSMM_STRINGIFY(LIBXSMM_CONCATENATE(LIBXSMM_TYPENAME_, TYPE)) #define LIBXSMM_TYPENAME_double f64 #define LIBXSMM_TYPENAME_float f32 #define LIBXSMM_TYPENAME_libxsmm_bfloat16 bf16 #define LIBXSMM_TYPENAME_int i32 #define LIBXSMM_TYPENAME_short i16 #define LIBXSMM_TYPENAME_char i8 /** Helper macro for type information: INFO := { FP }. */ #define LIBXSMM_TYPEINFO(TYPE, INFO) LIBXSMM_CONCATENATE4(LIBXSMM_TYPEINFO_, INFO, _, TYPE) #define LIBXSMM_TYPEINFO_FP_double 1 #define LIBXSMM_TYPEINFO_FP_float 1 #define LIBXSMM_TYPEINFO_FP_libxsmm_bfloat16 1 #define LIBXSMM_TYPEINFO_FP_int 0 #define LIBXSMM_TYPEINFO_FP_short 0 #define LIBXSMM_TYPEINFO_FP_char 0 /** Helper macro for type postfixes. */ #define LIBXSMM_TYPESYMBOL(TYPE) LIBXSMM_CONCATENATE(LIBXSMM_TYPESYMBOL_, TYPE) #define LIBXSMM_TYPESYMBOL_double F64 #define LIBXSMM_TYPESYMBOL_float F32 #define LIBXSMM_TYPESYMBOL_libxsmm_bfloat16 BF16 #define LIBXSMM_TYPESYMBOL_int I32 #define LIBXSMM_TYPESYMBOL_short I16 #define LIBXSMM_TYPESYMBOL_char I8 #define LIBXSMM_TYPESIZE(ENUM) ( \ ((int)(ENUM)) == LIBXSMM_DATATYPE_F64 ? 8 : ( \ ((int)(ENUM)) == LIBXSMM_DATATYPE_F32 ? 4 : ( \ ((int)(ENUM)) == LIBXSMM_DATATYPE_BF16 ? 
2 : ( \ ((int)(ENUM)) == LIBXSMM_DATATYPE_I32 ? 4 : ( \ ((int)(ENUM)) == LIBXSMM_DATATYPE_I16 ? 2 : ( \ ((int)(ENUM)) == LIBXSMM_DATATYPE_I8 ? 1 : ( \ 0/*invalid*/))))))) /* Get input or output precision */ #define LIBXSMM_GETENUM_INP(SRC) ((SRC) & 0x0F) #define LIBXSMM_GETENUM_OUT(SRC) (0 == ((SRC) >> 4) ? LIBXSMM_GETENUM_INP(SRC) : ((SRC) >> 4)) /* Get/Set input and output precision */ #define LIBXSMM_GETENUM(INP, OUT) (((INP) == (OUT)) ? (INP) : ((INP) | ((OUT) << 4))) #define LIBXSMM_SETENUM(DST, INP, OUT) DST = LIBXSMM_GETENUM(INP, OUT) /* Construct an enumerator (libxsmm_datatype) from a built-in type (float, double, etc.). */ #define LIBXSMM_DATATYPE(TYPE) LIBXSMM_CONCATENATE(LIBXSMM_DATATYPE_, LIBXSMM_TYPESYMBOL(TYPE)) /* Construct a type-id from built-in input/output types (float, double, etc.). */ #define LIBXSMM_DATATYPE2(ITYPE, OTYPE) LIBXSMM_GETENUM(LIBXSMM_DATATYPE(ITYPE), LIBXSMM_DATATYPE(OTYPE)) /* Construct an enumerator (libxsmm_gemm_precision) from a built-in type (float, double, etc.). */ #define LIBXSMM_GEMM_PRECISION(TYPE) LIBXSMM_CONCATENATE(LIBXSMM_GEMM_PRECISION_, LIBXSMM_TYPESYMBOL(TYPE)) /* Construct GEMM-precision from built-in input/output types (float, double, etc.). */ #define LIBXSMM_GEMM_PRECISION2(ITYPE, OTYPE) (libxsmm_gemm_precision)LIBXSMM_GETENUM( \ LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(OTYPE)) /** Maximum size available to store a descriptor/blob (GEMM, MCOPY, TRANS, TRSM, TRMM). */ #if !defined(LIBXSMM_DESCRIPTOR_MAXSIZE) # define LIBXSMM_DESCRIPTOR_MAXSIZE 64 #endif /** Size of the descriptor considered as unique signature. 
*/ #if !defined(LIBXSMM_DESCRIPTOR_SIGSIZE) # define LIBXSMM_DESCRIPTOR_SIGSIZE LIBXSMM_DESCRIPTOR_MAXSIZE #endif /* Support for Bfloat16 */ typedef unsigned short libxsmm_bfloat16; LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_bfloat16_hp { libxsmm_bfloat16 i[2]; float f; } libxsmm_bfloat16_hp; #if defined(__cplusplus) namespace tensorflow { struct bfloat16; } #endif /*__cplusplus*/ /** Integer type for LAPACK/BLAS (LP64: 32-bit, and ILP64: 64-bit). */ typedef LIBXSMM_BLASINT libxsmm_blasint; /** Type representing sufficient storage space for a GEMM handle. */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_gemm_blob { char data[128]; } libxsmm_gemm_blob; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_gemm_handle libxsmm_gemm_handle; /** Type representing sufficient storage space for descriptors (GEMM, TCOPY, MCOPY). */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_descriptor_blob { char data[LIBXSMM_DESCRIPTOR_MAXSIZE]; } libxsmm_descriptor_blob; /** Structure storing arguments of GEMM-like routines. */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_gemm_descriptor libxsmm_gemm_descriptor; /** Structure storing arguments of the matrix-copy routine. */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_mcopy_descriptor libxsmm_mcopy_descriptor; /** Structure storing arguments of the matrix-eltw routine. */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_descriptor libxsmm_meltw_descriptor; /** Structure storing arguments of the transpose routine. */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_trans_descriptor libxsmm_trans_descriptor; /** Structure storing arguments of packed TRSM. */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_trsm_descriptor libxsmm_trsm_descriptor; /** Structure storing arguments of packed TRMM. 
 */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_trmm_descriptor libxsmm_trmm_descriptor;
/** Structure storing arguments of packed GETRF. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_getrf_descriptor libxsmm_getrf_descriptor;
/** Structure storing arguments of packed GEMM. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_pgemm_descriptor libxsmm_pgemm_descriptor;

/** Enumerates element/data types. */
typedef enum libxsmm_datatype {
  LIBXSMM_DATATYPE_F64,
  LIBXSMM_DATATYPE_F32,
  LIBXSMM_DATATYPE_BF16,
  LIBXSMM_DATATYPE_I64,
  LIBXSMM_DATATYPE_I32,
  LIBXSMM_DATATYPE_I16,
  LIBXSMM_DATATYPE_I8,
  LIBXSMM_DATATYPE_UNSUPPORTED
} libxsmm_datatype;

/** Denotes the precision/data type of GEMM.
 *  Values mirror libxsmm_datatype so the two enums are interchangeable. */
typedef enum libxsmm_gemm_precision {
  LIBXSMM_GEMM_PRECISION_F64 = LIBXSMM_DATATYPE_F64,
  LIBXSMM_GEMM_PRECISION_F32 = LIBXSMM_DATATYPE_F32,
  LIBXSMM_GEMM_PRECISION_BF16 = LIBXSMM_DATATYPE_BF16,
  LIBXSMM_GEMM_PRECISION_I32 = LIBXSMM_DATATYPE_I32,
  LIBXSMM_GEMM_PRECISION_I16 = LIBXSMM_DATATYPE_I16,
  LIBXSMM_GEMM_PRECISION_I8 = LIBXSMM_DATATYPE_I8
} libxsmm_gemm_precision;

/** Kinds of matrix element-wise (meltw) operations. */
typedef enum libxsmm_meltw_operation {
  LIBXSMM_MELTW_OPERATION_NONE = 0,
  LIBXSMM_MELTW_OPERATION_COPY = 1,
  LIBXSMM_MELTW_OPERATION_ZERO = 2,
  LIBXSMM_MELTW_OPERATION_ADD = 3,
  LIBXSMM_MELTW_OPERATION_MUL = 4,
  LIBXSMM_MELTW_OPERATION_RELU = 5,
  LIBXSMM_MELTW_OPERATION_CVTFP32BF16 = 6,
  LIBXSMM_MELTW_OPERATION_REDUCE = 7,
  LIBXSMM_MELTW_OPERATION_SCALE = 8,
  LIBXSMM_MELTW_OPERATION_CVTFP32BF16_ACT = 9,
  LIBXSMM_MELTW_OPERATION_ACT_CVTFP32BF16 = 10,
  LIBXSMM_MELTW_OPERATION_COLBIAS_ACT = 11
} libxsmm_meltw_operation;

typedef enum libxsmm_meltw_null_flags {
  LIBXSMM_MELTW_FLAG_NONE = 0
} libxsmm_meltw_null_flags;

/** Flags for the REDUCE meltw operation; bit-flags which can be ORed. */
typedef enum libxsmm_meltw_redu_flags {
  LIBXSMM_MELTW_FLAG_REDUCE_NONE = 0,
  LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD = 1,
  LIBXSMM_MELTW_FLAG_REDUCE_OP_MAX = 2,
  LIBXSMM_MELTW_FLAG_REDUCE_OP_MUL = 4,
  LIBXSMM_MELTW_FLAG_REDUCE_ROWS = 8,
  LIBXSMM_MELTW_FLAG_REDUCE_COLS = 16,
  LIBXSMM_MELTW_FLAG_REDUCE_ELTS = 32,
  LIBXSMM_MELTW_FLAG_REDUCE_ELTS_SQUARED = 64,
  /* pre-combined convenience values */
  LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_ROWS = LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD | LIBXSMM_MELTW_FLAG_REDUCE_ROWS,
  LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_COLS = LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD | LIBXSMM_MELTW_FLAG_REDUCE_COLS
} libxsmm_meltw_redu_flags;

/** Flags for the SCALE meltw operation; bit-flags which can be ORed. */
typedef enum libxsmm_meltw_scal_flags {
  LIBXSMM_MELTW_FLAG_SCALE_NONE = 0,
  LIBXSMM_MELTW_FLAG_SCALE_MULT = 1,
  LIBXSMM_MELTW_FLAG_SCALE_SHIFT = 2,
  LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS = 4,
  LIBXSMM_MELTW_FLAG_SCALE_ROWS = 8,
  LIBXSMM_MELTW_FLAG_SCALE_COLS = 16,
  /* pre-combined convenience values (row-wise) */
  LIBXSMM_MELTW_FLAG_SCALE_MULT_ROWS = LIBXSMM_MELTW_FLAG_SCALE_MULT | LIBXSMM_MELTW_FLAG_SCALE_ROWS,
  LIBXSMM_MELTW_FLAG_SCALE_SHIFT_ROWS = LIBXSMM_MELTW_FLAG_SCALE_SHIFT | LIBXSMM_MELTW_FLAG_SCALE_ROWS,
  LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS_ROWS = LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS | LIBXSMM_MELTW_FLAG_SCALE_ROWS,
  LIBXSMM_MELTW_FLAG_SCALE_MULT_SHIFT_ROWS =
    LIBXSMM_MELTW_FLAG_SCALE_MULT | LIBXSMM_MELTW_FLAG_SCALE_SHIFT | LIBXSMM_MELTW_FLAG_SCALE_ROWS,
  LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS_SHIFT_ROWS =
    LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS | LIBXSMM_MELTW_FLAG_SCALE_SHIFT | LIBXSMM_MELTW_FLAG_SCALE_ROWS,
  LIBXSMM_MELTW_FLAG_SCALE_MULT_ADD_BIAS_ROWS =
    LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS | LIBXSMM_MELTW_FLAG_SCALE_MULT | LIBXSMM_MELTW_FLAG_SCALE_ROWS,
  LIBXSMM_MELTW_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_ROWS =
    LIBXSMM_MELTW_FLAG_SCALE_MULT | LIBXSMM_MELTW_FLAG_SCALE_SHIFT | LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS | LIBXSMM_MELTW_FLAG_SCALE_ROWS,
  /* pre-combined convenience values (column-wise) */
  LIBXSMM_MELTW_FLAG_SCALE_MULT_COLS = LIBXSMM_MELTW_FLAG_SCALE_MULT | LIBXSMM_MELTW_FLAG_SCALE_COLS,
  LIBXSMM_MELTW_FLAG_SCALE_SHIFT_COLS = LIBXSMM_MELTW_FLAG_SCALE_SHIFT | LIBXSMM_MELTW_FLAG_SCALE_COLS,
  LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS_COLS = LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS | LIBXSMM_MELTW_FLAG_SCALE_COLS,
  LIBXSMM_MELTW_FLAG_SCALE_MULT_SHIFT_COLS =
    LIBXSMM_MELTW_FLAG_SCALE_MULT | LIBXSMM_MELTW_FLAG_SCALE_SHIFT | LIBXSMM_MELTW_FLAG_SCALE_COLS,
  LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS_SHIFT_COLS =
    LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS | LIBXSMM_MELTW_FLAG_SCALE_SHIFT | LIBXSMM_MELTW_FLAG_SCALE_COLS,
  LIBXSMM_MELTW_FLAG_SCALE_MULT_ADD_BIAS_COLS =
    LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS | LIBXSMM_MELTW_FLAG_SCALE_MULT | LIBXSMM_MELTW_FLAG_SCALE_COLS,
  LIBXSMM_MELTW_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_COLS =
    LIBXSMM_MELTW_FLAG_SCALE_MULT | LIBXSMM_MELTW_FLAG_SCALE_SHIFT | LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS | LIBXSMM_MELTW_FLAG_SCALE_COLS
} libxsmm_meltw_scal_flags;

/** Flags for the CVTFP32BF16_ACT operation (convert, then optional fused activation). */
typedef enum libxsmm_meltw_cvta_flags {
  LIBXSMM_MELTW_FLAG_CVTA_NONE = 0,
  LIBXSMM_MELTW_FLAG_CVTA_FUSE_RELU = 1,
  LIBXSMM_MELTW_FLAG_CVTA_FUSE_TANH = 2,
  LIBXSMM_MELTW_FLAG_CVTA_FUSE_SIGM = 4
} libxsmm_meltw_cvta_flags;

/** Flags for the ACT_CVTFP32BF16 operation (activation, then convert). */
typedef enum libxsmm_meltw_acvt_flags {
  LIBXSMM_MELTW_FLAG_ACVT_NONE = 0,
  LIBXSMM_MELTW_FLAG_ACVT_FUSE_TANH = 1,
  LIBXSMM_MELTW_FLAG_ACVT_FUSE_SIGM = 2
} libxsmm_meltw_acvt_flags;

/** Flags for the COLBIAS_ACT operation; bit-flags which can be ORed. */
typedef enum libxsmm_meltw_cbiasact_flags {
  LIBXSMM_MELTW_FLAG_CBIASACT_NONE = 0,
  LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS = 1,
  LIBXSMM_MELTW_FLAG_CBIASACT_ACT_RELU = 2,
  LIBXSMM_MELTW_FLAG_CBIASACT_ACT_TANH = 4,
  LIBXSMM_MELTW_FLAG_CBIASACT_ACT_SIGM = 8,
  LIBXSMM_MELTW_FLAG_CBIASACT_ACT_GELU = 16,
  LIBXSMM_MELTW_FLAG_CBIASACT_OVERWRITE_C = 32,
  /* pre-combined convenience values */
  LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_RELU =
    LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS | LIBXSMM_MELTW_FLAG_CBIASACT_ACT_RELU,
  LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_TANH =
    LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS | LIBXSMM_MELTW_FLAG_CBIASACT_ACT_TANH,
  LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_SIGM =
    LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS | LIBXSMM_MELTW_FLAG_CBIASACT_ACT_SIGM,
  LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_GELU =
    LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS | LIBXSMM_MELTW_FLAG_CBIASACT_ACT_GELU,
  LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_RELU_OVERWRITE_C =
    LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS | LIBXSMM_MELTW_FLAG_CBIASACT_ACT_RELU | LIBXSMM_MELTW_FLAG_CBIASACT_OVERWRITE_C,
  LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_TANH_OVERWRITE_C =
    LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS | LIBXSMM_MELTW_FLAG_CBIASACT_ACT_TANH | LIBXSMM_MELTW_FLAG_CBIASACT_OVERWRITE_C,
  LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_SIGM_OVERWRITE_C =
    LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS | LIBXSMM_MELTW_FLAG_CBIASACT_ACT_SIGM | LIBXSMM_MELTW_FLAG_CBIASACT_OVERWRITE_C,
  LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_GELU_OVERWRITE_C =
    LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS | LIBXSMM_MELTW_FLAG_CBIASACT_ACT_GELU | LIBXSMM_MELTW_FLAG_CBIASACT_OVERWRITE_C
} libxsmm_meltw_cbiasact_flags;

/** Union over all meltw flag enums; the active member matches the chosen operation. */
LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_xmelt_flags {
  libxsmm_meltw_null_flags elt_null;
  libxsmm_meltw_redu_flags elt_redu;
  libxsmm_meltw_scal_flags elt_scal;
  libxsmm_meltw_cvta_flags elt_cvta;
  libxsmm_meltw_acvt_flags elt_acvt;
  libxsmm_meltw_cbiasact_flags elt_cbiasact;
} libxsmm_xmelt_flags;

/** Flag enumeration which can be binary ORed. */
typedef enum libxsmm_gemm_flags {
  LIBXSMM_GEMM_FLAG_NONE = 0,
  /** Transpose matrix A. */
  LIBXSMM_GEMM_FLAG_TRANS_A = 1,
  /** Transpose matrix B. */
  LIBXSMM_GEMM_FLAG_TRANS_B = 2,
  /** Transpose matrix A and B. */
  LIBXSMM_GEMM_FLAG_TRANS_AB = LIBXSMM_GEMM_FLAG_TRANS_A | LIBXSMM_GEMM_FLAG_TRANS_B,
#if 0
  /** Alpha=0|1 */
  LIBXSMM_GEMM_FLAG_ALPHA_0 = 4,
  /** Alpha=neg|pos */
  LIBXSMM_GEMM_FLAG_ALPHA_S = 8,
#endif
  /** Beta=0|1 */
  LIBXSMM_GEMM_FLAG_BETA_0 = 16,
#if 0
  /** Beta=neg|pos */
  LIBXSMM_GEMM_FLAG_BETA_S = 32,
#endif
  /** Generate aligned load instructions. */
  LIBXSMM_GEMM_FLAG_ALIGN_A = 64,
  /** Aligned load/store instructions. */
  LIBXSMM_GEMM_FLAG_ALIGN_C = 128,
  /** Batch-reduce Ai * Bi. */
  LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS = 256,
  /** Batch-reduce Ai * Bi. */
  LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET = 512,
  /** Batch-reduce Ai * Bi.
 */
  LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE = 1024,
  /** Aligned C matrix, but using NTS Hint when storing */
  /* NOTE(review): 2176 == 2048 | 128, i.e. presumably ALIGN_C plus a dedicated NTS-hint bit — confirm against the JIT backend. */
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT = 2176,
  /* in case of integer GEMM, if A is unsigned */
  LIBXSMM_GEMM_FLAG_A_UNSIGNED = 4096,
  /* in case of integer GEMM, if B is unsigned */
  LIBXSMM_GEMM_FLAG_B_UNSIGNED = 8192,
  /* in case of integer GEMM, if C is unsigned */
  LIBXSMM_GEMM_FLAG_C_UNSIGNED = 16384,
  /* in case of integer GEMM, if A and B are unsigned */
  LIBXSMM_GEMM_FLAG_AB_UNSIGNED = LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_B_UNSIGNED,
  /* for low precision we also require up-front packed formats "VNNI" for best performance, this flag indicates A */
  LIBXSMM_GEMM_FLAG_VNNI_A = 32768,
  /* for low precision we also require up-front packed formats "VNNI" for best performance, this flag indicates B */
  LIBXSMM_GEMM_FLAG_VNNI_B = 65536,
  /* combined types */
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0 =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_ADDRESS =
    LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_ADDRESS =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_OFFSET =
    LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_OFFSET =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_STRIDE =
    LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_STRIDE =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE,
  /* combined types with A_UNSIGNED */
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_A_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_A_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_ADDRESS_A_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_A_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_ADDRESS_A_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_A_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_OFFSET_A_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_A_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_OFFSET_A_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_A_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_STRIDE_A_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_A_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_STRIDE_A_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_A_UNSIGNED,
  /* combined types with B_UNSIGNED */
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_B_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_B_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_ADDRESS_B_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_B_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_ADDRESS_B_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_B_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_OFFSET_B_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_B_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_OFFSET_B_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_B_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_STRIDE_B_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_B_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_STRIDE_B_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_B_UNSIGNED,
  /* combined types with AB_UNSIGNED */
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_AB_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_AB_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_ADDRESS_AB_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_AB_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_ADDRESS_AB_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_AB_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_OFFSET_AB_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_AB_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_OFFSET_AB_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_AB_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_STRIDE_AB_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_AB_UNSIGNED,
  LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_STRIDE_AB_UNSIGNED =
    LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_AB_UNSIGNED,
  /** Marker flag; do not use. */
  LIBXSMM_GEMM_FLAG_INVALID = 131072
} libxsmm_gemm_flags;

/** Flag enumeration which can be binary ORed.
 */
typedef enum libxsmm_gemm_handle_flags {
  LIBXSMM_GEMM_HANDLE_FLAG_AUTO = 0,
  LIBXSMM_GEMM_HANDLE_FLAG_COPY_A = 1,
  LIBXSMM_GEMM_HANDLE_FLAG_COPY_B = 2,
  LIBXSMM_GEMM_HANDLE_FLAG_COPY_C = 4
} libxsmm_gemm_handle_flags;

/** Auto-batch flags (can be ORed) applicable to mmbatch_begin/mmbatch_end. */
/* Values are multiples of LIBXSMM_GEMM_FLAG_INVALID so they occupy bits above all GEMM flags. */
typedef enum libxsmm_mmbatch_flags {
  /** Handle recorded batch unsynchronized-parallel. */
  LIBXSMM_MMBATCH_FLAG_DEFAULT = LIBXSMM_GEMM_FLAG_INVALID * 0,
  /** Synchronize among C matrices. */
  LIBXSMM_MMBATCH_FLAG_SYNCHRONIZED = LIBXSMM_GEMM_FLAG_INVALID * 1,
  /** Handle recorded batch sequentially. */
  LIBXSMM_MMBATCH_FLAG_SEQUENTIAL = LIBXSMM_GEMM_FLAG_INVALID * 2,
  /** Only record a statistic of potential SMMs. */
  LIBXSMM_MMBATCH_FLAG_STATISTIC = LIBXSMM_GEMM_FLAG_INVALID * 4
} libxsmm_mmbatch_flags;

/** Enumeration of the available prefetch strategies. */
typedef enum libxsmm_gemm_prefetch_type {
  /** No prefetching and no prefetch fn. signature. */
  LIBXSMM_GEMM_PREFETCH_NONE = LIBXSMM_PREFETCH_NONE,
  /** Only function prefetch signature. */
  LIBXSMM_GEMM_PREFETCH_SIGONLY = LIBXSMM_PREFETCH_SIGONLY,
  /** Prefetch PA using accesses to A. */
  LIBXSMM_GEMM_PREFETCH_AL2 = 2,
  /** Prefetch PA (aggressive). */
  LIBXSMM_GEMM_PREFETCH_BL2_VIA_C = 4,
  /** Prefetch A ahead. */
  LIBXSMM_GEMM_PREFETCH_AL2_AHEAD = 8,
  LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C = LIBXSMM_GEMM_PREFETCH_BL2_VIA_C | LIBXSMM_GEMM_PREFETCH_AL2,
  LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD = LIBXSMM_GEMM_PREFETCH_BL2_VIA_C | LIBXSMM_GEMM_PREFETCH_AL2_AHEAD,
  /** Backward compatibility: AL2CL2BL2_VIA_C is an alias for AL2BL2_VIA_C (Eigen library). */
  LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C = LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C,
  /** Current B into L1. */
  LIBXSMM_GEMM_PREFETCH_BL1 = 16
} libxsmm_gemm_prefetch_type;

/** Flag enumeration which can be binary ORed. */
typedef enum libxsmm_matcopy_flags {
  LIBXSMM_MATCOPY_FLAG_DEFAULT = 0,
  /** If set, then use zero matrix as source */
  LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE = 1
} libxsmm_matcopy_flags;

/** Determines the kernel kind. */
typedef enum libxsmm_kernel_kind {
  /** Matrix multiplication kernel */
  LIBXSMM_KERNEL_KIND_MATMUL = 0,
  /** Matcopy kernel kind */
  LIBXSMM_KERNEL_KIND_MCOPY = 1,
  /** Mateltw kernel kind */
  LIBXSMM_KERNEL_KIND_MELTW = 2,
  /** Transpose kernel kind */
  LIBXSMM_KERNEL_KIND_TRANS = 3,
  /** GEMM/packed kernel kind */
  LIBXSMM_KERNEL_KIND_PGEMM = 4,
  /** GEMM/packed kernel kind */
  /* NOTE(review): comment above is duplicated from PGEMM in the original; this entry is GETRF/packed. */
  LIBXSMM_KERNEL_KIND_GETRF = 5,
  /** TRMM kernel kind */
  LIBXSMM_KERNEL_KIND_TRMM = 6,
  /** TRSM kernel kind */
  LIBXSMM_KERNEL_KIND_TRSM = 7,
  /** User-defined kernels */
  LIBXSMM_KERNEL_KIND_USER = 8,
  /** Not a JIT kernel */
  LIBXSMM_KERNEL_UNREGISTERED = 9
} libxsmm_kernel_kind;

/** Memory layouts of DNN activation/filter tensors; bit-flags which can be ORed. */
typedef enum libxsmm_dnn_tensor_format {
  /* use LIBXSMM internal format, we need to copy data into that */
  LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM = 1,
  /* use NHWC format internally, this allows no-copy operations */
  LIBXSMM_DNN_TENSOR_FORMAT_NHWC = 2,
  /* use NCHW format internally, this will include shadow copies, not preferred */
  LIBXSMM_DNN_TENSOR_FORMAT_NCHW = 4,
  /* use RSCK format internally, this allows no-copy operations */
  LIBXSMM_DNN_TENSOR_FORMAT_RSCK = 8,
  /* use KCRS format internally, this will include shadow copies, not preferred */
  LIBXSMM_DNN_TENSOR_FORMAT_KCRS = 16,
  LIBXSMM_DNN_TENSOR_FORMAT_CK = 32,
  LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED = 64,
  LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED = 128,
  LIBXSMM_DNN_TENSOR_FORMAT_NC = 256
} libxsmm_dnn_tensor_format;

/** Denotes the element/pixel type of an image/channel. */
typedef enum libxsmm_dnn_datatype {
  LIBXSMM_DNN_DATATYPE_F64 = LIBXSMM_DATATYPE_F64,
  LIBXSMM_DNN_DATATYPE_F32 = LIBXSMM_DATATYPE_F32,
  LIBXSMM_DNN_DATATYPE_BF16 = LIBXSMM_DATATYPE_BF16,
  LIBXSMM_DNN_DATATYPE_I32 = LIBXSMM_DATATYPE_I32,
  LIBXSMM_DNN_DATATYPE_I16 = LIBXSMM_DATATYPE_I16,
  LIBXSMM_DNN_DATATYPE_I8 = LIBXSMM_DATATYPE_I8
} libxsmm_dnn_datatype;

/** Convolution options; bit-flags which can be ORed. */
typedef enum libxsmm_dnn_conv_option {
  /* we get default settings */
  LIBXSMM_DNN_CONV_OPTION_NONE = 0,
  /* overwrite results buffer (set it to zero before running the operations) */
  LIBXSMM_DNN_CONV_OPTION_OVERWRITE = 1,
  /* external filter transpose to bwd convolutions */
  LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE = 2,
  /* compound types */
  LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE_OVERWRITE =
    LIBXSMM_DNN_CONV_OPTION_OVERWRITE | LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE
} libxsmm_dnn_conv_option;

typedef enum libxsmm_dnn_fusedbatchnorm_fuse_order {
  /* the fuse order is: 1. BN, 2. element-wise 3. RELU */
  LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU = 0
} libxsmm_dnn_fusedbatchnorm_fuse_order;

typedef enum libxsmm_dnn_fusedbatchnorm_fuse_op {
  /* the fuse order is: 1. BN, 2. element-wise 3.
RELU */
  LIBXSMM_DNN_FUSEDBN_OPS_BN = 1,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE = 2,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS = 4,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED = 8,
  LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE = 16,
  LIBXSMM_DNN_FUSEDBN_OPS_RELU = 32,
  LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK = 64,
  /* pre-combined convenience values of the base bit-flags above */
  LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU = LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU,
  LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK,
  LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE = LIBXSMM_DNN_FUSEDBN_OPS_BN | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE,
  LIBXSMM_DNN_FUSEDBN_OPS_BN_RELU = LIBXSMM_DNN_FUSEDBN_OPS_BN | LIBXSMM_DNN_FUSEDBN_OPS_RELU,
  LIBXSMM_DNN_FUSEDBN_OPS_BN_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDBN_OPS_BN | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK,
  LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE_RELU =
    LIBXSMM_DNN_FUSEDBN_OPS_BN | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU,
  LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE_RELU_WITH_MASK =
    LIBXSMM_DNN_FUSEDBN_OPS_BN | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_RELU = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE | LIBXSMM_DNN_FUSEDBN_OPS_RELU,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_RELU_WITH_MASK =
    LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE_RELU =
    LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE_RELU_WITH_MASK =
    LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_ELTWISE = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_RELU = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS | LIBXSMM_DNN_FUSEDBN_OPS_RELU,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_RELU_WITH_MASK =
    LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_ELTWISE_RELU =
    LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_ELTWISE_RELU_WITH_MASK =
    LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_ELTWISE =
    LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_RELU =
    LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED | LIBXSMM_DNN_FUSEDBN_OPS_RELU,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_RELU_WITH_MASK =
    LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_ELTWISE_RELU =
    LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU,
  LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_ELTWISE_RELU_WITH_MASK =
    LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK
} libxsmm_dnn_fusedbatchnorm_fuse_op;

/** Setup parameters of a fused batch-normalization handle. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedbatchnorm_desc {
  int partN; /* number of images in mini-batch, used for all elementwise computations */
  int fullN; /* number of images in mini-batch, used for statistics computations */
  int C; /* number of input feature maps */
  int H; /* height of input image */
  int W; /* width of input image */
  int u; /* vertical stride */
  int v; /* horizontal stride */
  int pad_h_in; /* height of physical zero-padding in input buffer */
  int pad_w_in; /* width of physical zero-padding in input buffer */
  int pad_h_out; /* height of physical zero-padding in output buffer */
  int pad_w_out; /* width of physical zero-padding in output buffer */
  int threads; /* number of threads used */
  libxsmm_dnn_datatype datatype_in; /* datatype used for all input related buffers */
  libxsmm_dnn_datatype datatype_out; /* datatype used for all output related buffers */
  libxsmm_dnn_datatype datatype_stats; /* datatype used for all stats related buffers */
  libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */
  libxsmm_dnn_fusedbatchnorm_fuse_order fuse_order; /* additional options */
  libxsmm_dnn_fusedbatchnorm_fuse_op fuse_ops; /* used ops into convolutions */
} libxsmm_dnn_fusedbatchnorm_desc;

typedef enum libxsmm_dnn_fusedgroupnorm_fuse_order {
  /* the fuse order is: 1. BN, 2. element-wise 3. RELU */
  LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU = 0
} libxsmm_dnn_fusedgroupnorm_fuse_order;

typedef enum libxsmm_dnn_fusedgroupnorm_fuse_op {
  /* the fuse order is: 1. GN, 2. element-wise 3. RELU */
  LIBXSMM_DNN_FUSEDGN_OPS_GN = 1,
  LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE = 2,
  LIBXSMM_DNN_FUSEDGN_OPS_RELU = 4,
  LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK = 8,
  /* pre-combined convenience values of the base bit-flags above */
  LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU = LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDGN_OPS_RELU,
  LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK =
    LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK,
  LIBXSMM_DNN_FUSEDGN_OPS_GN_ELTWISE = LIBXSMM_DNN_FUSEDGN_OPS_GN | LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE,
  LIBXSMM_DNN_FUSEDGN_OPS_GN_RELU = LIBXSMM_DNN_FUSEDGN_OPS_GN | LIBXSMM_DNN_FUSEDGN_OPS_RELU,
  LIBXSMM_DNN_FUSEDGN_OPS_GN_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDGN_OPS_GN | LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK,
  LIBXSMM_DNN_FUSEDGN_OPS_GN_ELTWISE_RELU =
    LIBXSMM_DNN_FUSEDGN_OPS_GN | LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDGN_OPS_RELU,
  LIBXSMM_DNN_FUSEDGN_OPS_GN_ELTWISE_RELU_WITH_MASK =
    LIBXSMM_DNN_FUSEDGN_OPS_GN | LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK
} libxsmm_dnn_fusedgroupnorm_fuse_op;

/** Setup parameters of a fused group-normalization handle. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedgroupnorm_desc {
  int N; /* number of images in mini-batch */
  int G; /* groups of channels to norm */
  int C; /* number of input feature maps */
  int H; /* height of input image */
  int W; /* width of input image */
  int u; /* vertical stride */
  int v; /* horizontal stride */
  int pad_h_in; /* height of physical zero-padding in input buffer */
  int pad_w_in; /* width of physical zero-padding in input buffer */
  int pad_h_out; /* height of physical zero-padding in output buffer */
  int pad_w_out; /* width of physical zero-padding in output buffer */
  int threads; /* number of threads used */
  libxsmm_dnn_datatype datatype_in; /* datatype used for all input related buffers */
  libxsmm_dnn_datatype datatype_out; /* datatype used for all output related buffers */
  libxsmm_dnn_datatype datatype_stats; /* datatype used for all stats related buffers */
  libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */
  libxsmm_dnn_fusedgroupnorm_fuse_order fuse_order; /* additional options */
  libxsmm_dnn_fusedgroupnorm_fuse_op fuse_ops; /* used ops into convolutions */
} libxsmm_dnn_fusedgroupnorm_desc;

/** argument struct for matrix-eltwise: copy */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_copy_param {
  const void* in_ptr;  /* input pointer */
  void* out_ptr;       /* output pointer */
} libxsmm_meltw_copy_param;

/** argument struct for matrix-eltwise: zero */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_zero_param {
  const void* in_ptr;  /* input pointer */
  void* out_ptr;       /* output pointer */
} libxsmm_meltw_zero_param;

/** argument struct for matrix-eltwise: add */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_add_param {
  const void* in_ptr;  /* input pointer */
  void* out_ptr;       /* output pointer */
} libxsmm_meltw_add_param;

/** argument struct for matrix-eltwise: mul */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_mul_param {
  const void* in_ptr;  /* input pointer */
  void* out_ptr;       /* output pointer */
} libxsmm_meltw_mul_param;

/** argument struct for matrix-eltwise: relu */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_relu_param {
  const void* in_ptr;  /* input pointer */
  void* mask_ptr;      /* pointer to load/store ReLU mask */
  void* out_ptr;       /* output pointer */
} libxsmm_meltw_relu_param;

/** argument struct for matrix-eltwise: cvtfp32bf16 */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_cvtfp32bf16_param {
  const void* in_ptr;  /* input pointer */
  void* out_ptr;       /* output pointer */
} libxsmm_meltw_cvtfp32bf16_param;

/** argument struct for matrix-eltwise: cvtfp32bf16_act */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_cvtfp32bf16_act_param {
  const void* in_ptr;  /* input pointer */
  void* out_ptr;       /* output pointer */
  void* actstore_ptr;  /* output pointer for activation if it is fused into the convert */
} libxsmm_meltw_cvtfp32bf16_act_param;

/** argument struct for matrix-eltwise: act_cvtfp32bf16 */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_act_cvtfp32bf16_param {
  const void* in_ptr;  /* input pointer */
  void* out_ptr;       /* output pointer */
  void* actstore_ptr;  /* output pointer for activation if it is fused into the convert */
} libxsmm_meltw_act_cvtfp32bf16_param;

/** argument struct for matrix-eltwise: reduce */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_reduce_param {
  const void* in_ptr;  /* input pointer */
  void* out_ptr_0;     /* output pointer */
  void* out_ptr_1;     /* output pointer */
} libxsmm_meltw_reduce_param;

/** argument struct for matrix-eltwise: scale */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_scale_param {
  const void* in_ptr;          /* input pointer */
  const void* shift_vals_ptr;  /* pointer to shift values array */
  const void* scale_vals_ptr;  /* pointer to scale values array */
  const void* bias_vals_ptr;   /* pointer to bias values array*/
  void* out_ptr;               /* output pointer */
} libxsmm_meltw_scale_param;

/** argument struct for matrix-eltwise: colbias + activation */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_cbiasact_param {
  const void* in_ptr;    /* input pointer */
  const void* bias_ptr;  /* col-bias pointer */
  void* mask_ptr;        /* pointer to load/store ReLU mask */
  void* out_ptr;         /* output pointer */
} libxsmm_meltw_cbiasact_param;

/** argument struct for colbias + activation fused into GEMM */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_cbiasact_gemm_param {
  const void* bias_ptr; /* optional, col-bias pointer */
  void* out_ptr;        /* optional, pointer to output after eltwise (contains mask in case of ReLU); */
                        /* Need for some activation functions, assumed to have the same shape as C matrix, */
                        /* may not be set when OVERWRITE_C option is chosen */
                        /* If OVERWRITE_C is false: out_ptr contains the post-act output, C has the pre-act output */
                        /* If OVERWRITE_C is true: C contains post-act output, out_ptr contains the ReLU mask (only when act was ReLU) for other act unused */
} libxsmm_meltw_cbiasact_gemm_param;

/** Specialized function for matrix-eltw (weak-typed). */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void
  (*libxsmm_meltwfunction_copy)(const libxsmm_meltw_copy_param* in_struct);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void
  (*libxsmm_meltwfunction_zero)(const libxsmm_meltw_zero_param* in_struct);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void
  (*libxsmm_meltwfunction_add)(const libxsmm_meltw_add_param* in_struct);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void
  (*libxsmm_meltwfunction_mul)(const libxsmm_meltw_mul_param* in_struct);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void
  (*libxsmm_meltwfunction_relu)(const libxsmm_meltw_relu_param* in_struct);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void
  (*libxsmm_meltwfunction_cvtfp32bf16)(const libxsmm_meltw_cvtfp32bf16_param* in_struct);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void
  (*libxsmm_meltwfunction_reduce)(const libxsmm_meltw_reduce_param* in_struct);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void
  (*libxsmm_meltwfunction_scale)(const libxsmm_meltw_scale_param* in_struct);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void
  (*libxsmm_meltwfunction_cvtfp32bf16_act)(const libxsmm_meltw_cvtfp32bf16_act_param* in_struct);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void
  (*libxsmm_meltwfunction_act_cvtfp32bf16)(const libxsmm_meltw_act_cvtfp32bf16_param* in_struct);

/** Union over all meltw kernel function-pointer types. */
LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_xmeltwfunction {
  void (*xmeltw)(const void* in_struct);
  libxsmm_meltwfunction_copy meltw_copy;
  libxsmm_meltwfunction_zero meltw_zero;
  libxsmm_meltwfunction_add meltw_add;
  libxsmm_meltwfunction_mul meltw_mul;
  libxsmm_meltwfunction_relu meltw_relu;
  libxsmm_meltwfunction_cvtfp32bf16 meltw_cvtfp32bf16;
  libxsmm_meltwfunction_reduce meltw_reduce;
  libxsmm_meltwfunction_scale meltw_scale;
  libxsmm_meltwfunction_cvtfp32bf16_act meltw_cvtfp32bf16_act;
  libxsmm_meltwfunction_act_cvtfp32bf16 meltw_act_cvtfp32bf16;
} libxsmm_xmeltwfunction;

/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (double-precision). */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dmmfunction)(const double* a, const double* b, double* c, ...);
/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (single-precision). */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_smmfunction)(const float* a, const float* b, float* c, ...);
/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (bf16, fp32-accumulate). */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bsmmfunction)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, float* c, ...);
/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (bf16, fp32-accumulate). */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bmmfunction)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, libxsmm_bfloat16* c, ...);
/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (low-precision). */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_wimmfunction)(const short* a, const short* b, int* c, ...);
/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (int8, int32 accumulate). */
/* Prefix letters encode signedness of A/B: s=signed, u=unsigned. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_ssbimmfunction)(const char* a, const char* b, int* c, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_usbimmfunction)(const unsigned char* a, const char* b, int* c, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_subimmfunction)(const char* a, const unsigned char* b, int* c, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_uubimmfunction)(const unsigned char* a, const unsigned char* b, int* c, ...);
/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (int8, int32 accumulate, int8 downconvert). */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sububmmfunction)(const char* a, const unsigned char* b, unsigned char* c, float* scf, ...);

/* Batch-reduce variants (address list): A/B are arrays of pointers; count gives the number of terms. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dmmfunction_reducebatch_addr)(const double** a, const double** b, double* c, const unsigned long long* count, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_smmfunction_reducebatch_addr)(const float** a, const float** b, float* c, const unsigned long long* count, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bsmmfunction_reducebatch_addr)(const libxsmm_bfloat16** a, const libxsmm_bfloat16** b, float* c, const unsigned long long* count, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bmmfunction_reducebatch_addr)(const libxsmm_bfloat16** a, const libxsmm_bfloat16** b, libxsmm_bfloat16* c, const unsigned long long* count, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_wimmfunction_reducebatch_addr)(const short** a, const short** b, int* c, const unsigned long long* count, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_ssbimmfunction_reducebatch_addr)(const char** a, const char** b, int* c, const unsigned long long* count, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_usbimmfunction_reducebatch_addr)(const unsigned char** a, const char** b, int* c, const unsigned long long* count, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_subimmfunction_reducebatch_addr)(const char** a, const unsigned char** b, int* c, const unsigned long long* count, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_uubimmfunction_reducebatch_addr)(const unsigned char** a, const unsigned char** b, int* c, const unsigned long long* count, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sububmmfunction_reducebatch_addr)(const char** a, const unsigned char** b, unsigned char* c, const unsigned long long* count, float* scf, ...);

/* Batch-reduce variants (offset list): terms are base pointers plus per-term byte offsets a_offs/b_offs. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dmmfunction_reducebatch_offs)(const double* a, const double* b, double* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_smmfunction_reducebatch_offs)(const float* a, const float* b, float* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bsmmfunction_reducebatch_offs)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, float* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bmmfunction_reducebatch_offs)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, libxsmm_bfloat16* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE
void (*libxsmm_wimmfunction_reducebatch_offs)(const short* a, const short* b, int* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_ssbimmfunction_reducebatch_offs)(const char* a, const char* b, int* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_usbimmfunction_reducebatch_offs)(const unsigned char* a, const char* b, int* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_subimmfunction_reducebatch_offs)(const char* a, const unsigned char* b, int* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_uubimmfunction_reducebatch_offs)(const unsigned char* a, const unsigned char* b, int* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sububmmfunction_reducebatch_offs)(const char* a, const unsigned char* b, unsigned char* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, float* scf, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dmmfunction_reducebatch_strd)(const double* a, const double* b, double* c, const unsigned long long* count, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_smmfunction_reducebatch_strd)(const float* a, const float* b, float* c, const unsigned long long* count, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bsmmfunction_reducebatch_strd)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, float* c, const unsigned long long* count, ...); 
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bmmfunction_reducebatch_strd)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, libxsmm_bfloat16* c, const unsigned long long* count, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_wimmfunction_reducebatch_strd)(const short* a, const short* b, int* c, const unsigned long long* count, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_ssbimmfunction_reducebatch_strd)(const char* a, const char* b, int* c, const unsigned long long* count, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_usbimmfunction_reducebatch_strd)(const unsigned char* a, const char* b, int* c, const unsigned long long* count, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_subimmfunction_reducebatch_strd)(const char* a, const unsigned char* b, int* c, const unsigned long long* count, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_uubimmfunction_reducebatch_strd)(const unsigned char* a, const unsigned char* b, int* c, const unsigned long long* count, ...); LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sububmmfunction_reducebatch_strd)(const char* a, const unsigned char* b, unsigned char* c, const unsigned long long* count, float* scf, ...); /** Function type which is either libxsmm_smmfunction or libxsmm_dmmfunction (weak-typed). 
*/ LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_xmmfunction { void (*xmm)(const void* a, const void* b, void* c, ...); void (*xbm)(const void** a, const void** b, void* c, const unsigned long long* count, ...); libxsmm_dmmfunction dmm; libxsmm_smmfunction smm; libxsmm_wimmfunction wimm; libxsmm_bsmmfunction bsmm; libxsmm_bmmfunction bmm; libxsmm_ssbimmfunction ssbimm; libxsmm_usbimmfunction usbimm; libxsmm_subimmfunction subimm; libxsmm_uubimmfunction uubimm; libxsmm_sububmmfunction sububmm; libxsmm_dmmfunction_reducebatch_addr dmra; libxsmm_smmfunction_reducebatch_addr smra; libxsmm_bsmmfunction_reducebatch_addr bsmra; libxsmm_bmmfunction_reducebatch_addr bmra; libxsmm_wimmfunction_reducebatch_addr wimra; libxsmm_ssbimmfunction_reducebatch_addr ssbimra; libxsmm_usbimmfunction_reducebatch_addr usbimra; libxsmm_subimmfunction_reducebatch_addr subimra; libxsmm_uubimmfunction_reducebatch_addr uubimra; libxsmm_sububmmfunction_reducebatch_addr sububmra; libxsmm_dmmfunction_reducebatch_offs dmro; libxsmm_smmfunction_reducebatch_offs smro; libxsmm_bsmmfunction_reducebatch_offs bsmro; libxsmm_bmmfunction_reducebatch_offs bmro; libxsmm_wimmfunction_reducebatch_offs wimro; libxsmm_ssbimmfunction_reducebatch_offs ssbimro; libxsmm_usbimmfunction_reducebatch_offs usbimro; libxsmm_subimmfunction_reducebatch_offs subimro; libxsmm_uubimmfunction_reducebatch_offs uubimro; libxsmm_sububmmfunction_reducebatch_offs sububmro; libxsmm_dmmfunction_reducebatch_strd dmrs; libxsmm_smmfunction_reducebatch_strd smrs; libxsmm_bsmmfunction_reducebatch_strd bsmrs; libxsmm_bmmfunction_reducebatch_strd bmrs; libxsmm_wimmfunction_reducebatch_strd wimrs; libxsmm_ssbimmfunction_reducebatch_strd ssbimrs; libxsmm_usbimmfunction_reducebatch_strd usbimrs; libxsmm_subimmfunction_reducebatch_strd subimrs; libxsmm_uubimmfunction_reducebatch_strd uubimrs; libxsmm_sububmmfunction_reducebatch_strd sububmrs; } libxsmm_xmmfunction; /** Specialized function for matrix-copy (weak-typed). 
*/ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_xmcopyfunction)( const void* in, const unsigned int* ldi, void* out, const unsigned int* ldo, ...); /** Specialized function for transpose (weak-typed). */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_xtransfunction)( const void* in, const unsigned int* ldi, void* out, const unsigned int* ldo); /** Specialized function for packed GEMM (weak-typed). */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_pgemm_xfunction)( const void* a, const void* b, void* c); /** Specialized function for packed GEMM (weak-typed). */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_getrf_xfunction)( const void* a, const void* b, void* c); /** Specialized function for TRMM (weak-typed). */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_trmm_xfunction)( const void* a, const void* b, void* c); /** Specialized function for TRSM (weak-typed). */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_trsm_xfunction)( const void* a, const void* b, void* c); /** Structure to receive information about GEMM-kernels (libxsmm_get_mmkernel_info). */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_mmkernel_info { /** Input/output data-type */ libxsmm_gemm_precision iprecision, oprecision; /** Prefetch strategy. */ libxsmm_gemm_prefetch_type prefetch; /** Leading dimensions. */ unsigned int lda, ldb, ldc; /** Extents/shape. */ unsigned int m, n, k; /** Set of flags. */ int flags; } libxsmm_mmkernel_info; /** Structure to receive information about transpose-kernels (libxsmm_get_transkernel_info). */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_transkernel_info { /** LD, M, and N. */ unsigned int ldo, m, n; /** Size of data element. */ unsigned int typesize; } libxsmm_transkernel_info; /** Structure to receive information about matrix-copy kernels (libxsmm_get_mcopykernel_info). 
*/ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_mcopykernel_info { /** LDx, M, and N. */ unsigned int ldi, ldo, m, n; /** Size of data element. */ unsigned int typesize; /** Boolean value. */ int prefetch; /** Set of flags. */ int flags; } libxsmm_mcopykernel_info; /** Structure to receive information about matrix-eltw kernels (libxsmm_get_mcopykernel_info). */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltwkernel_info { /** LDx, M, and N. */ unsigned int ldi, ldo, m, n; /** Size of data element. */ unsigned int datatype; /** Set of flags. */ unsigned int flags; /** Set of operation. */ unsigned int operation; } libxsmm_meltwkernel_info; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_kernel_info { libxsmm_kernel_kind kind; /** Number of FLoating Point OperationS (FLOPS). */ unsigned int nflops; /** Code size (Bytes). */ size_t code_size; } libxsmm_kernel_info; /** Structure to receive information about the code registry status (libxsmm_get_registry_info). 
*/ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_registry_info { size_t capacity, size, nbytes, nstatic, ncache; } libxsmm_registry_info; #endif /*LIBXSMM_TYPEDEFS_H*/ libxsmm-1.17/samples/000077500000000000000000000000001415223013700145575ustar00rootroot00000000000000libxsmm-1.17/samples/blocked_gemm/000077500000000000000000000000001415223013700171675ustar00rootroot00000000000000libxsmm-1.17/samples/blocked_gemm/DeepBench_matrices.txt000066400000000000000000000033151415223013700234360ustar00rootroot000000000000001760 16 1760 N N 1760 32 1760 N N 1760 64 1760 N N 1760 128 1760 N N 1760 7000 1760 N N 2048 16 2048 N N 2048 32 2048 N N 2048 64 2048 N N 2048 128 2048 N N 2048 7000 2048 N N 2560 16 2560 N N 2560 32 2560 N N 2560 64 2560 N N 2560 128 2560 N N 2560 7000 2560 N N 4096 16 4096 N N 4096 32 4096 N N 4096 64 4096 N N 4096 128 4096 N N 4096 7000 4096 N N 1760 16 1760 T N 1760 32 1760 T N 1760 64 1760 T N 1760 128 1760 T N 1760 7000 1760 T N 2048 16 2048 T N 2048 32 2048 T N 2048 64 2048 T N 2048 128 2048 T N 2048 7000 2048 T N 2560 16 2560 T N 2560 32 2560 T N 2560 64 2560 T N 2560 128 2560 T N 2560 7000 2560 T N 4096 16 4096 T N 4096 32 4096 T N 4096 64 4096 T N 4096 128 4096 T N 4096 7000 4096 T N 1760 7133 1760 N T 2048 7133 2048 N T 2560 7133 2560 N T 4096 7133 4096 N T 5124 9124 1760 N N 28 4 32 35 8457 1760 N N 35 3 32 5124 9124 2048 N N 28 4 64 35 8457 2048 N N 35 3 64 5124 9124 2560 N N 28 4 64 35 8457 2560 N N 35 3 64 5124 9124 4096 N N 28 4 64 35 8457 4096 N N 35 3 64 5124 9124 1760 T N 28 4 32 35 8457 1760 T N 35 3 32 5124 9124 2048 T N 28 4 64 35 8457 2048 T N 35 3 64 5124 9124 2560 T N 28 4 64 35 8457 2560 T N 35 3 64 5124 9124 4096 T N 28 4 64 35 8457 4096 T N 35 3 64 7680 16 2560 N N 7680 32 2560 N N 7680 64 2560 N N 7680 128 2560 N N 7680 16 2560 T N 7680 32 2560 T N 7680 64 2560 T N 7680 128 2560 T N 3072 16 1024 N N 3072 32 1024 N N 3072 64 1024 N N 3072 128 1024 N N 3072 16 1024 T N 3072 32 1024 T N 3072 64 1024 T N 3072 128 
1024 T N 3072 7435 1024 N T 7680 5481 2560 N T 32 9 64 libxsmm-1.17/samples/blocked_gemm/Makefile000066400000000000000000000073311415223013700206330ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = -DLIBXSMM_BLAS_CONST BLAS = 2 OMP = 1 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) 
FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(OBJECTS) $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(OBJECTS) $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif 
libxsmm-1.17/samples/blocked_gemm/blocked_gemm.c000066400000000000000000000200541415223013700217440ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.), Dheevatsa Mudigere (Intel Corp.) Alexander Heinecke (Intel Corp.), Hans Pabst (Intel Corp.) ******************************************************************************/ #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #if defined(_OPENMP) # include #endif #if defined(__MKL) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if !defined(ITYPE) # define ITYPE float #endif #if !defined(CHECK) && (LIBXSMM_EQUAL(ITYPE, float) || LIBXSMM_EQUAL(ITYPE, double)) # if !defined(MKL_DIRECT_CALL_SEQ) && !defined(MKL_DIRECT_CALL) LIBXSMM_BLAS_SYMBOL_DECL(ITYPE, gemm) # endif # define CHECK #endif #define MYASSERT(x) if (!(x)) { printf("Assertion %s failed...\n", #x); exit(1);} int main(int argc, char* argv[]) { LIBXSMM_BLAS_CONST libxsmm_blasint m = (1 < argc ? atoi(argv[1]) : 1024); LIBXSMM_BLAS_CONST libxsmm_blasint k = (3 < argc ? atoi(argv[3]) : m); LIBXSMM_BLAS_CONST libxsmm_blasint n = (2 < argc ? atoi(argv[2]) : k); const libxsmm_blasint bm = (4 < argc ? atoi(argv[4]) : 32); const libxsmm_blasint bk = (6 < argc ? atoi(argv[6]) : bm); const libxsmm_blasint bn = (5 < argc ? atoi(argv[5]) : bk); const libxsmm_blocked_gemm_order order = (libxsmm_blocked_gemm_order)(7 < argc ? atoi(argv[7]) : 0); const int nrepeat = (8 < argc ? 
atoi(argv[8]) : 100); const libxsmm_blasint b_m1 = (9 < argc ? atoi(argv[9]) : 1); const libxsmm_blasint b_n1 = (10 < argc ? atoi(argv[10]) : 1); const libxsmm_blasint b_k1 = (11 < argc ? atoi(argv[11]) : 1); const libxsmm_blasint b_k2 = (12 < argc ? atoi(argv[12]) : 1); const int ab = (13 < argc ? atoi(argv[13]) : 0); LIBXSMM_BLAS_CONST libxsmm_blasint lda = (14 < argc ? atoi(argv[13]) : m); LIBXSMM_BLAS_CONST libxsmm_blasint ldb = (15 < argc ? atoi(argv[14]) : k); LIBXSMM_BLAS_CONST libxsmm_blasint ldc = (16 < argc ? atoi(argv[15]) : m); LIBXSMM_BLAS_CONST char transa = 'N', transb = 'N'; /* no transposes */ LIBXSMM_BLAS_CONST ITYPE alpha = 1, beta = 1; const int gemm_flags = LIBXSMM_GEMM_FLAGS(transa, transb); const double gflops = 2.0 * m * n * k * 1E-9; int result = EXIT_SUCCESS; #if defined(CHECK) && (!defined(__BLAS) || (0 != __BLAS)) const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(NULL == env_check ? 0 : atof(env_check)); #endif if (argc > 1 && !strncmp(argv[1], "-h", 3)) { /* check command line */ printf("\nUsage: ./bgemm [M] [N] [K] [bm] [bn] [bk] [order] [reps] [b_m1] [b_n1] [b_k1] [b_k2] [verbose]\n\n"); return result; } MYASSERT(m % b_m1 == 0); MYASSERT(n % b_n1 == 0); MYASSERT(k % b_k1 == 0); MYASSERT(m/b_m1 % bm == 0); MYASSERT(n/b_n1 % bn == 0); MYASSERT(k/b_k1/b_k2 % bk == 0); #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { ITYPE* agold = (ITYPE*)libxsmm_malloc((size_t)lda * (size_t)k * sizeof(ITYPE)); ITYPE* bgold = (ITYPE*)libxsmm_malloc((size_t)ldb * (size_t)n * sizeof(ITYPE)); ITYPE* cgold = (ITYPE*)libxsmm_malloc((size_t)ldc * (size_t)n * sizeof(ITYPE)); ITYPE* a = (ITYPE*)libxsmm_malloc((size_t)m * (size_t)k * sizeof(ITYPE)); ITYPE* b = (ITYPE*)libxsmm_malloc((size_t)k * (size_t)n * sizeof(ITYPE)); ITYPE* c = (ITYPE*)libxsmm_malloc((size_t)m * (size_t)n * sizeof(ITYPE)); libxsmm_blocked_gemm_handle* handle = 0; unsigned long long start; double duration; #if 
defined(_OPENMP) const int nthreads = omp_get_max_threads(); #else const int nthreads = 1; #endif handle = libxsmm_blocked_gemm_handle_create(nthreads, LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(ITYPE), m, n, k, &bm, &bn, &bk, &b_m1, &b_n1, &b_k1, &b_k2, &alpha, &beta, &gemm_flags, NULL/*auto-prefetch*/, &order); if (0 != handle) { LIBXSMM_MATINIT_OMP(ITYPE, 42, agold, m, k, lda, 1.0); LIBXSMM_MATINIT_OMP(ITYPE, 24, bgold, k, n, ldb, 1.0); LIBXSMM_MATINIT_OMP(ITYPE, 0, cgold, m, n, ldc, 1.0); libxsmm_blocked_gemm_copyin_a(handle, agold, &lda, a); libxsmm_blocked_gemm_copyin_b(handle, bgold, &ldb, b); libxsmm_blocked_gemm_copyin_c(handle, cgold, &ldc, c); #if defined(MKL_ENABLE_AVX512) mkl_enable_instructions(MKL_ENABLE_AVX512); #endif /* warm-up OpenMP (populate thread pool) */ libxsmm_blocked_gemm_omp(handle, a, b, c, 1); #if defined(CHECK) && (!defined(__BLAS) || (0 != __BLAS)) if (!LIBXSMM_FEQ(0, check)) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, agold, &lda, bgold, &ldb, &beta, cgold, &ldc); } #endif if (!ab) { libxsmm_gemm_print(stdout, LIBXSMM_GEMM_PRECISION(ITYPE), &transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); fprintf(stdout, "\n\n"); } start = libxsmm_timer_tick(); libxsmm_blocked_gemm_omp(handle, a, b, c, nrepeat); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { if (ab) { fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s | %lli,%lli,%lli,%lli,%lli,%lli,%i,%lli,%lli,%lli,%lli\n", gflops * nrepeat / duration, (long long)m, (long long)n, (long long)k, (long long)bm, (long long)bn, (long long)bk, (int)order, (long long)b_m1, (long long)b_n1, (long long)b_k1, (long long)b_k2); } else { fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s\n", gflops * nrepeat / duration); } } #if defined(CHECK) && (!defined(__BLAS) || (0 != __BLAS)) if (!LIBXSMM_FEQ(0, check)) { /* validate result against LAPACK/BLAS xGEMM */ ITYPE* ctest = 0; int i; start = libxsmm_timer_tick(); for (i = 0; i < 
nrepeat; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, agold, &lda, bgold, &ldb, &beta, cgold, &ldc); } duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { fprintf(stdout, "\tBLAS: %.1f GFLOPS/s\n", gflops * nrepeat / duration); } /* free memory not needed further; avoid double-free later on */ libxsmm_free(agold); agold = 0; libxsmm_free(bgold); bgold = 0; libxsmm_free(a); a = 0; libxsmm_free(b); b = 0; /* allocate C-matrix in regular format, and perform copy-out */ ctest = (ITYPE*)libxsmm_malloc((size_t)(sizeof(ITYPE) * ldc * n)); if (0 != ctest) { libxsmm_matdiff_info diff; libxsmm_blocked_gemm_copyout_c(handle, c, &ldc, ctest); result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(ITYPE), m, n, cgold, ctest, &ldc, &ldc); if (EXIT_SUCCESS == result) { fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < 100.0 * diff.normf_rel) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); result = EXIT_FAILURE; } } libxsmm_free(ctest); } } #endif libxsmm_blocked_gemm_handle_destroy(handle); } else { fprintf(stderr, "FAILED to create BGEMM-handle! For details retry with LIBXSMM_VERBOSE=1.\n"); result = EXIT_FAILURE; } libxsmm_free(agold); libxsmm_free(bgold); libxsmm_free(cgold); libxsmm_free(a); libxsmm_free(b); libxsmm_free(c); } if (!ab) { fprintf(stdout, "Finished\n"); } return result; } libxsmm-1.17/samples/blocked_gemm/blocked_gemm.sh000077500000000000000000000050421415223013700221370ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")"; pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "" != "$(command -v ldd)" ]; then LDD=ldd elif [ "" != "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "" != "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "" != "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi 
libxsmm-1.17/samples/blocked_gemm/blocked_gemm.vcxproj000066400000000000000000000552571415223013700232320ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 blocked_gemm {C8835447-1AEF-4B54-B8A6-F75D0020B997} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console 
$(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false SingleFile 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false SingleFile 3948,10373,10382 HOST true true 0x0407 
$(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/blocked_gemm/perfSweep.sh000077500000000000000000000130321415223013700214650ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### if [ $1 = "-h" ] then echo "Usage: $(basename $0) matrices.txt iters numa (1-mcdram/0-DDR)" exit fi if [[ -z "${OMP_NUM_THREADS}" ]]; then echo "using defaults for OMP settings!" export KMP_HW_SUBSET=1T export KMP_AFFINITY=compact,granularity=fine export KMP_AFFINITY=proclist=[1-67],granularity=thread,explicit,norespect export OMP_NUM_THREADS=67 else echo "using environment OMP settings!" fi _fn=${1:-"deepbench_matrices.txt"} _it=${2:-100} NUMA=${3:-1} NUMACTL="${TOOL_COMMAND}" CPUFLAGS=$(if [ -e /proc/cpuinfo ]; then grep -m1 flags /proc/cpuinfo | cut -d: -f2-; fi) if [ "" != "$(echo "${CPUFLAGS}" | grep -o avx512er)" ]; then if [ "0" != "$((NUMA < $(numactl -H | grep "node " | tr -s " " | cut -d" " -f2- | wc -w | tr -d " ")))" ]; then NUMACTL="numactl --preferred=${NUMA} ${TOOL_COMMAND}" fi fi NUMACTL="numactl --interleave=0,1" #----bgemm parameters _MB_="24 48 64" _NB_="24 48 64" _KB_="24 48 64 96" #mb1="0.1 0.2 0.3 0.4 0.5 0.8 0.16 0.32 1 2 4 8 10 16" #nb1="0.1 0.2 0.3 0.4 0.5 0.8 0.16 0.32 1 2 4 8 10 16" #kb1="0.1 0.2 0.3 0.4 0.5 0.8 0.16 0.32 1 2 4 8 10 16" #kb2="0.1 0.2 0.3 0.4 0.5 0.8 0.16 0.32 1 2 4 8 10 16" MBT=4096 NBT=4096 KBT=4096 mb11="0.1" nb11="0.1" kb11="0.1" mb12="0.1" nb12="0.1" kb12="0.1" kb2="0.1 0.2 0.4 0.5 0.8 0.16 0.24 0.32" order="0 1 2" perflog="perfSweep.log" function bgemm_test { best="0" echo "M=$M N=$N K=$K it=$it" bin="$NUMACTL ./bgemm" log="$((M))_$((N))_$((K)).out" for _mb in $mb do for _nb in $nb do for _kb in $kb do for _mb1 in $mb1 do for _nb1 in $nb1 do for _kb1 in $kb1 do for _kb2 in $kb2 do for _o in $order do _M=$M _N=$N _K=$K if [[ "$((M % _mb))" -gt 0 ]] then _M=$((_mb*(M/_mb+1))) fi if [[ "$((N % _nb))" -gt 0 ]] then _N=$((_nb*(N/_nb+1))) fi if [[ "$((K % _kb))" -gt 0 ]] then _K=$((_kb*(K/_kb+1))) fi if [ $(bc <<< "$_mb1 < 1") -eq 1 ]; then 
IFS="." read temp _MB1 <<< $_mb1 else _MB1=$(($_M/$_mb1)) fi if [ $(bc <<< "$_nb1 < 1") -eq 1 ]; then IFS="." read temp _NB1 <<< $_nb1 else _NB1=$(($_N/$_nb1)) fi if [ $(bc <<< "$_kb1 < 1") -eq 1 ]; then IFS="." read temp _KB1 <<< $_kb1 else _KB1=$(($_K/$_kb1)) fi if [ $(bc <<< "$_kb2 < 1") -eq 1 ]; then IFS="." read temp _KB2 <<< $_kb2 else _KB2=$(($_K/$_kb2)) fi echo "$bin $_M $_N $_K $_mb $_nb $_kb $_o $it $_MB1 $_NB1 $_KB1 $_KB2" $bin $_M $_N $_K $_mb $_nb $_kb $_o $it $_MB1 $_NB1 $_KB1 $_KB2 > /dev/null $bin $_M $_N $_K $_mb $_nb $_kb $_o $it $_MB1 $_NB1 $_KB1 $_KB2 > temp.out prf="$(grep "LIBXSMM" temp.out | awk {'print $2;'})" cfg="$_M $_N $_K $_mb $_nb $_kb $_o $it $_MB1 $_NB1 $_KB1 $_KB2" echo "$cfg $prf" >> $log done done done done done done done done best=$(cat $log | awk ' BEGIN { val = 0 } { if ($13 > val) {val = $13; best=$0} } END { print best }') echo "$best" >> $perflog } function run_bsgemm { M=$1 N=$2 K=$3 _AT=$4 _BT=$5 if [[ $# -gt 5 ]] then mb=$6 nb=$7 kb=$8 else mb=24 nb=24 kb=24 fi #_it=$9 #_bin=$7 if [[ "$mb" -gt "$M" ]] then mb=$M else mb=$_MB_ fi if [[ "$nb" -gt "$N" ]] then nb=$N else nb=$_NB_ fi if [[ "$kb" -gt "$K" ]] then kb=$K else kb=$_KB_ fi if [[ "$M" -gt "$MBT" ]]; then mb1=$mb12 else mb1=$mb11 fi if [[ "$N" -gt "$NBT" ]]; then nb1=$nb12 else nb1=$nb11 fi if [[ "$K" -gt "$KBT" ]]; then kb1=$kb12 else kb1=$kb11 fi _Trans=0 if [[ "$_AT" == "T" ]] then _Trans=1 echo "!!! $M $N $K $_AT $_BT - Not supported !!!, doing $N $M $K N N instead" t_M=$M t_mb=$mb M=$N N=$t_M mb=$nb nb=$t_mb fi if [[ "$_BT" == "T" ]] then _Trans=2 echo "!!! 
$M $N $K $_AT $_BT - Not supported !!!, doing $M $K $N N N instead" t_K=$K t_kb=$kb K=$N N=$t_K kb=$nb nb=$t_kb fi if [[ "$M" -gt "2000" ]]; then if [[ "$N" -gt "2000" ]]; then if [[ "$K" -gt "2000" ]]; then it=10 fi fi fi if [[ "$M" -gt "4000" ]]; then it=10 fi if [[ "$N" -gt "4000" ]]; then it=10 fi if [[ "$K" -gt "4000" ]]; then it=10 fi bgemm_test echo "--------------------------------------------------------------------------------------" } nc=$(wc -l $_fn) idx=1 cat $_fn | while read line do if [ ! -z "$line" ]; then echo -n "($idx/$nc)" it=$_it run_bsgemm $line fi idx=$((idx+1)) done libxsmm-1.17/samples/blocked_gemm/run_bench.sh000077500000000000000000000057731415223013700215050ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### if [ $1 = "-h" ] then echo "Usage: $(basename $0) matrices.txt iters numa (1-mcdram/0-DDR)" exit fi fn=${1:-"deepbench_opt.txt"} ITERS=${2:-100} NUMA=${3:-1} NUMACTL="${TOOL_COMMAND}" CPUFLAGS=$(if [ -e /proc/cpuinfo ]; then grep -m1 flags /proc/cpuinfo | cut -d: -f2-; fi) if [ "" != "$(echo "${CPUFLAGS}" | grep -o avx512er)" ]; then if [ "0" != "$((NUMA < $(numactl -H | grep "node " | tr -s " " | cut -d" " -f2- | wc -w | tr -d " ")))" ]; then NUMACTL="numactl --preferred=${NUMA} ${TOOL_COMMAND}" fi fi if [[ -z "${OMP_NUM_THREADS}" ]]; then echo "using defaults for OMP settings!" export KMP_HW_SUBSET=1T export KMP_AFFINITY=compact,granularity=fine export KMP_AFFINITY=proclist=[1-67],granularity=thread,explicit,norespect export OMP_NUM_THREADS=67 else echo "using environment OMP settings!" 
fi #./bgemm iters M N K order BM BN BK B_M B_N B_K K_unroll # current bgemm only supports non-transpose GEMM, TODO: transpose support _bin="$NUMACTL ./bgemm" _it=$ITERS function run_bsgemm { _M=$1 _N=$2 _K=$3 _AT=$4 _BT=$5 _mb=32 _nb=32 _kb=32 _order=0 _mb1=1 _nb1=1 _kb1=1 _kb2=1 if [[ $# -gt 5 ]] then _mb=$6 _nb=$7 _kb=$8 fi if [[ $# -gt 8 ]] then _order=$9 _mb1=${10} _nb1=${11} _kb1=${12} _kb2=${13} st=${14} _it=$((st*ITERS)) fi if [[ "$_mb" -gt "_M" ]] then _mb=$_M fi if [[ "$_nb" -gt "_N" ]] then _nb=$_N fi if [[ "$_kb" -gt "_K" ]] then _kb=$_K fi if [[ "$((_M % _mb))" -gt 0 ]] then #_mb=$_M _M=$((_mb*(_M/_mb+1))) fi if [[ "$((_N % _nb))" -gt 0 ]] then #_nb=$_N _N=$((_nb*(_N/_nb+1))) fi if [[ "$((_K % _kb))" -gt 0 ]] then #_kb=$_K _K=$((_kb*(_K/_kb+1))) fi if [[ "$_AT" == "T" ]] then echo "!!! $_M $_N $_K $_AT $_BT - Not supported !!!, doing $_N $_M $_K N N instead" t_M=$_M t_mb=$_mb _M=$_N _N=$t_M _mb=$_nb _nb=$t_mb fi if [[ "$_BT" == "T" ]] then echo "!!! $_M $_N $_K $_AT $_BT - Not supported !!!, doing $_M $_K $_N N N instead" t_K=$_K t_kb=$_kb _K=$_N _N=$t_K _kb=$_nb _nb=$t_kb fi echo "$_bin $_M $_N $_K $_mb $_nb $_kb $_order $_it $_mb1 $_nb1 $_kb1 $_kb2" $_bin $_M $_N $_K $_mb $_nb $_kb $_order $_it $_mb1 $_nb1 $_kb1 $_kb2 echo "--------------------------------------------------------------------------------------" } nc=$(wc -l $fn) idx=1 cat $fn | while read line do if [ ! -z "$line" ]; then echo -n "($idx/$nc) " run_bsgemm $line fi idx=$((idx+1)) done libxsmm-1.17/samples/cp2k/000077500000000000000000000000001415223013700154165ustar00rootroot00000000000000libxsmm-1.17/samples/cp2k/Makefile000066400000000000000000000102561415223013700170620ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . 
CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 1 OMP = 1 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES += $(OUTDIR)/$(OUTNAME)-dbcsr ifneq (,$(findstring std=c++,$(CXXFLAGS))) XFILES += $(OUTDIR)/$(OUTNAME)-collocate OBJECTS += $(CCXOBJS) endif .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) $(OUTDIR)/$(OUTNAME)-dbcsr: $(OUTDIR)/.make $(OBJECTS) 
$(LIBDEP) $(EXTDEP) $(XLD) -o $@ $(BLDDIR)/$(OUTNAME)-dbcsr-cpp.o \ $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/$(OUTNAME)-collocate: $(OUTDIR)/.make $(OBJECTS) $(LIBDEP) $(EXTDEP) $(XLD) -o $@ $(BLDDIR)/$(OUTNAME)-collocate-cc.o $(BLDDIR)/rt_graph-cc.o \ $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-cc.o: $(SRCDIR)/%.cc .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists 
@rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/cp2k/README.md000066400000000000000000000016741415223013700167050ustar00rootroot00000000000000# CP2K Artificial Benchmark The first code sample given for LIBXSMM was a performance reproducer exercising the same set of kernels usually generated for CP2K's SMM library. The code sample attempted to model the way "matrix stacks" are processed in CP2K, however there are two different code paths in CP2K: (1) the "main" code path used when processing stacks on the host-side, and (2) a code path targeting offload devices. Beside of the host-sided parallelization via MPI (and perhaps OpenMP), the secondly mentioned code path relies on an additional level of parallelization (which is obviously necessary to drive a potentially highly parallel offload device). Also, the additional level of parallelism is not exactly "nested" in the sense that it participates on sharing the same resources as the host-side. In fact, this "artificial benchmark" (cp2k code sample) is modeling a code path as utilized in the secondly mentioned case (offload device). 
libxsmm-1.17/samples/cp2k/cp2k-collocate.cc000066400000000000000000000452121415223013700205330ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "mdarray.hpp" #include "rt_graph.hpp" #if !defined(XSMM) && 1 # define XSMM #endif #if (defined(HAVE_MKL) || defined(__MKL) || defined(OPENBLAS) || defined(__OPENBLAS) || defined(__CBLAS)) && \ !defined(TRIANGULAR) && 1 # define TRIANGULAR #endif #if !defined(SCRATCH) && 1 # define SCRATCH #elif !defined(SCRATCH_LOCAL) && 1 # define SCRATCH_LOCAL #endif #if !defined(NAIVE2) && 0 # define NAIVE2 #endif rt_graph::Timer timer; template void collocate_core(void* scratch, const int length_[3], const mdarray &co, const mdarray &p_alpha_beta_reduced_, mdarray &Vtmp) { const T *LIBXSMM_RESTRICT src_x = p_alpha_beta_reduced_.template at(2, 0, 0); const T *LIBXSMM_RESTRICT src_y = p_alpha_beta_reduced_.template at(1, 0, 0); const T *LIBXSMM_RESTRICT src_z = p_alpha_beta_reduced_.template at(0, 0, 0); T *LIBXSMM_RESTRICT dst = Vtmp.template at(0, 0, 0); const int ld = Vtmp.ld(); if (co.size(0) > 1) { timer.start("init"); #if (defined(SCRATCH) || defined(SCRATCH_LOCAL)) # if defined(SCRATCH) T *const Cdata = LIBXSMM_ALIGN(static_cast(scratch), LIBXSMM_ALIGNMENT); T *const xyz_data = LIBXSMM_ALIGN(Cdata + co.size(0) * co.size(1) * length_[1], LIBXSMM_ALIGNMENT); # else T *const Cdata = static_cast(libxsmm_aligned_scratch(sizeof(T) * co.size(0) * co.size(1) * length_[1], 0/*auto-alignment*/)); T *const xyz_data = static_cast(libxsmm_aligned_scratch(sizeof(T) * co.size(0) * length_[0] * length_[1], 0/*auto-alignment*/)); # endif # if defined(TRIANGULAR) mdarray C(Cdata, co.size(1), length_[1]); # else mdarray C(Cdata, co.size(0), co.size(1), length_[1]); # endif mdarray xyz_alpha_beta(xyz_data, co.size(0), length_[0], length_[1]); #else # if defined(TRIANGULAR) mdarray C(co.size(1), length_[1]); # else mdarray C(co.size(0), co.size(1), length_[1]); # endif 
mdarray xyz_alpha_beta(co.size(0), length_[0], length_[1]); #endif #if defined(XSMM) struct collocate { int i, j, k, lmax; } key = { static_cast(Vtmp.size(0)), static_cast(Vtmp.size(1)), static_cast(Vtmp.size(2)), static_cast(co.size(0)) }; libxsmm_mmfunction* kernelset = static_cast*>(libxsmm_xdispatch(&key, sizeof(key))); if (NULL == kernelset) { # if defined(TRIANGULAR) kernelset = static_cast*>(libxsmm_xregister(&key, sizeof(key), sizeof(libxsmm_mmfunction) * (static_cast(2) * key.lmax - 1), NULL)); for (int a1 = 0; a1 < (key.lmax - 1); a1++) { kernelset[2*a1+0] = libxsmm_mmfunction(LIBXSMM_GEMM_FLAG_NONE, length_[1], static_cast(co.size(1)) - a1, static_cast(co.size(2)) - a1, static_cast(p_alpha_beta_reduced_.ld()), static_cast(co.ld()), static_cast(C.ld()), 1/*alpha*/, 0/*beta*/, LIBXSMM_GEMM_PREFETCH_AL2/*_AHEAD*/); kernelset[2*a1+1] = libxsmm_mmfunction(LIBXSMM_GEMM_FLAG_TRANS_B, length_[1], length_[0], static_cast(co.size(2)) - a1, static_cast(C.ld()), static_cast(p_alpha_beta_reduced_.ld()), static_cast(xyz_alpha_beta.ld()), 1/*alpha*/, 0/*beta*/, LIBXSMM_PREFETCH_NONE); } kernelset[2*(key.lmax-1)] = libxsmm_mmfunction(LIBXSMM_GEMM_FLAG_TRANS_B, length_[2], length_[0] * length_[1], static_cast(co.size(2)), static_cast(p_alpha_beta_reduced_.ld()), static_cast(xyz_alpha_beta.size(1)) * static_cast(xyz_alpha_beta.ld()), ld, 1/*alpha*/, 0/*beta*/, LIBXSMM_PREFETCH_NONE); # else kernelset = static_cast*>(libxsmm_xregister(&key, sizeof(key), 3 * sizeof(libxsmm_mmfunction), NULL)); kernelset[0] = libxsmm_mmfunction(LIBXSMM_GEMM_FLAG_NONE, length_[1], static_cast(co.size(2)), static_cast(co.size(2)), static_cast(p_alpha_beta_reduced_.ld()), static_cast(co.ld()), static_cast(C.ld()), 1/*alpha*/, 0/*beta*/, LIBXSMM_PREFETCH_AUTO); kernelset[1] = libxsmm_mmfunction(LIBXSMM_GEMM_FLAG_TRANS_B, length_[1], length_[0], static_cast(co.size(2)), static_cast(C.ld()), static_cast(p_alpha_beta_reduced_.ld()), static_cast(xyz_alpha_beta.ld()), 1/*alpha*/, 0/*beta*/, 
LIBXSMM_PREFETCH_AUTO); kernelset[2] = libxsmm_mmfunction(LIBXSMM_GEMM_FLAG_TRANS_B, length_[2], length_[0] * length_[1], static_cast(co.size(2)), static_cast(p_alpha_beta_reduced_.ld()), static_cast(xyz_alpha_beta.size(1)) * static_cast(xyz_alpha_beta.ld()), ld, 1/*alpha*/, 0/*beta*/, LIBXSMM_PREFETCH_NONE); # endif } #endif timer.stop("init"); timer.start("gemm"); const T* aj = co.template at(0, 0, 0); #if defined(TRIANGULAR) T* cj = xyz_alpha_beta.template at(0, 0, 0); T *const bi = C.template at(0, 0); #else T* cj = C.template at(0, 0, 0); #endif // run loop excluding the last element for (int a1 = 0; a1 < static_cast(co.size(0) - 1); a1++) { const T *const ai = aj; aj = co.template at(a1 + 1, 0, 0); #if defined(TRIANGULAR) T *const ci = cj; cj = xyz_alpha_beta.template at(a1 + 1, 0, 0); #else T *const ci = cj; cj = C.template at(a1 + 1, 0, 0); #endif #if defined(TRIANGULAR) # if defined(XSMM) kernelset[2*a1+0](src_y, ai, bi, src_y, aj, bi); kernelset[2*a1+1](bi, src_z, ci/*, bi, src_z, cj*/); # else cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, co.size(1) - a1, length_[1], co.size(2) - a1, 1.0, ai, // Coef_{alpha,gamma,beta} co.ld(), src_y, // Y_{beta,j} p_alpha_beta_reduced_.ld(), 0.0, bi, // tmp_{alpha, gamma, j} C.ld()); cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, length_[0], length_[1], co.size(2) - a1, 1.0, src_z, // Z_{gamma,k} -> I need to transpose it I want Z_{k,gamma} p_alpha_beta_reduced_.ld(), bi, // C_{gamma, j} = Coef_{alpha,gamma,beta} Y_{beta,j} (fixed alpha) C.ld(), 0.0, ci, // contains xyz_{alpha, kj} the order kj is important xyz_alpha_beta.ld()); # endif #elif defined(XSMM) kernelset[0](src_y, ai, ci, src_y, aj, cj); #else cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, co.size(2), length_[1], co.size(2), 1.0, ai, // Coef_{alpha,gamma,beta} co.ld(), src_y, // Y_{beta,j} p_alpha_beta_reduced_.ld(), 0.0, ci, // tmp_{alpha, gamma, j} C.ld()); #endif } // execute remainder #if !defined(TRIANGULAR) # if defined(XSMM) 
kernelset[0](src_y, aj, cj, src_y, aj, cj); // with pseudo-prefetch # else cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, co.size(2), length_[1], co.size(2), 1.0, aj, // Coef_{alpha,gamma,beta} co.ld(), src_y, // Y_{beta,j} p_alpha_beta_reduced_.ld(), 0.0, cj, // tmp_{alpha, gamma, j} C.ld()); # endif // run loop excluding the last element const T* bj = C.template at(0, 0, 0); cj = xyz_alpha_beta.template at(0, 0, 0); for (int a1 = 0; a1 < static_cast(co.size(0) - 1); a1++) { const T* const bi = bj; T *const ci = cj; bj = C.template at(a1 + 1, 0, 0); cj = xyz_alpha_beta.template at(a1 + 1, 0, 0); # if defined(XSMM) kernelset[1](bi, src_z, ci, bj, src_z, cj); # else cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, length_[0], length_[1], co.size(2), 1.0, src_z, // Z_{gamma,k} -> I need to transpose it I want Z_{k,gamma} p_alpha_beta_reduced_.ld(), bi, // C_{gamma, j} = Coef_{alpha,gamma,beta} Y_{beta,j} (fixed alpha) C.ld(), 0.0, ci, // contains xyz_{alpha, kj} the order kj is important xyz_alpha_beta.ld()); # endif } // execute remainder # if defined(XSMM) kernelset[1](bj, src_z, cj, bj, src_z, cj); // with pseudo-prefetch # else cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, length_[0], length_[1], co.size(2), 1.0, src_z, // Z_{gamma,k} -> I need to transpose it I want Z_{k,gamma} p_alpha_beta_reduced_.ld(), bj, // C_{gamma, j} = Coef_{alpha,gamma,beta} Y_{beta,j} (fixed alpha) C.ld(), 0.0, cj, // contains xyz_{alpha, kj} the order kj is important xyz_alpha_beta.ld()); # endif #else memset(xyz_alpha_beta.template at(co.size(0) - 1, 0, 0), 0, sizeof(T) * length_[0] * xyz_alpha_beta.ld()); cblas_dger(CblasRowMajor, length_[0], length_[1], co(co.size(0) - 1, 0, 0), src_z, 1, src_y, 1, xyz_alpha_beta.template at(co.size(0) - 1, 0, 0), xyz_alpha_beta.ld()); #endif #if defined(XSMM) # if defined(TRIANGULAR) kernelset[2*(key.lmax-1)](src_x, xyz_alpha_beta.template at(0, 0, 0), dst); # else kernelset[2](src_x, xyz_alpha_beta.template at(0, 0, 0), dst); # 
endif #else cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, length_[0] * length_[1], length_[2], co.size(2), 1.0, xyz_alpha_beta.template at(0, 0, 0), xyz_alpha_beta.size(1) * xyz_alpha_beta.ld(), src_x, p_alpha_beta_reduced_.ld(), 0.0, dst, ld); #endif timer.stop("gemm"); #if defined(SCRATCH_LOCAL) timer.start("deinit"); libxsmm_free(Cdata); libxsmm_free(xyz_data); timer.stop("deinit"); #endif } else { timer.start("remainder"); for (int z1 = 0; z1 < length_[0]; z1++) { const T tz = co(0, 0, 0) * src_z[z1]; LIBXSMM_PRAGMA_UNROLL_N(4) for (int y1 = 0; y1 < length_[1]; y1++) { const T tmp = tz * src_y[y1]; LIBXSMM_PRAGMA_SIMD for (int x1 = 0; x1 < length_[2]; x1++) { dst[x1] = tmp * src_x[x1]; } dst += ld; } } timer.stop("remainder"); } } template void collocate_core_naive(const int *length_, const mdarray &co, const mdarray &p_alpha_beta_reduced_, mdarray &Vtmp) { Vtmp.zero(); for (int alpha = 0; alpha < static_cast(co.size(2)); alpha++) { for (int gamma = 0; gamma < static_cast(co.size(0)); gamma++) { for (int beta = 0; beta < static_cast(co.size(1)); beta++) { double coef = co(alpha, gamma, beta); for (int z = 0; z < length_[0]; z++) { double c1 = coef * p_alpha_beta_reduced_(0, gamma, z); for (int y = 0; y < length_[1]; y++) { double c2 = c1 * p_alpha_beta_reduced_(1, beta, y); for (int x = 0; x < length_[2]; x++) { Vtmp(z, y, x) += c2 * p_alpha_beta_reduced_(2, alpha, x); } } } } } } } #if defined(NAIVE2) template void collocate_core_naive2(const int *length_, const mdarray &co, const mdarray &p_alpha_beta_reduced_, mdarray &Vtmp) { Vtmp.zero(); for (int gamma = 0; gamma < static_cast(co.size(0)); gamma++) { for (int beta = 0; beta < static_cast(co.size(1)); beta++) { for (int z = 0; z < length_[0]; z++) { double c1 = p_alpha_beta_reduced_(0, gamma, z); for (int y = 0; y < length_[1]; y++) { double c2 = c1 * p_alpha_beta_reduced_(1, beta, y); for (int x = 0; x < length_[2]; x++) { T tmp = 0; for (int alpha = 0; alpha < static_cast(co.size(2)); alpha++) { 
double coef = co(alpha, gamma, beta); tmp += coef * p_alpha_beta_reduced_(2, alpha, x); } Vtmp(z, y, x) += c2 * tmp; } } } } } } #endif // The three first numbers are the grid size, the last one can be anything template T test_collocate_core(const int i, const int j, const int k, const int lmax) { #if defined(SCRATCH) void* const scratch = malloc(sizeof(T) * (static_cast(lmax) * lmax * j + static_cast(lmax) * i * j) + 2 * LIBXSMM_ALIGNMENT); #else void* const scratch = NULL; #endif mdarray pol = mdarray(3, lmax, std::max(std::max(i, j), k)); mdarray co = mdarray(lmax, lmax, lmax); mdarray Vgemm(i, j, k); mdarray Vref(i, j, k); std::default_random_engine generator; std::uniform_real_distribution distribution(-1.0, 1.0); int length[3] = {i, j, k}; for (int s = 0; s < static_cast(pol.size()); s++) pol[s] = distribution(generator); #if !defined(TRIANGULAR) for (int s = 0; s < static_cast(co.size()); s++) co[s] = distribution(generator); #else co.zero(); for (int a1 = 0; a1 < static_cast(co.size(0)); a1++) { // for fixed a1, the matrix should be triangular of this form // b1 b2 b3 // b4 b5 // b6 const int b2 = static_cast(co.size(1)) - a1; for (int b1 = 0; b1 < b2; b1++) { for (int g1 = 0; g1 < (b2 - b1); g1++) { co(a1, b1, g1) = distribution(generator); } } } #endif Vgemm.zero(); timer.start("collocate_gemm"); collocate_core(scratch, length, co, pol, Vgemm); timer.stop("collocate_gemm"); timer.start("collocate_brute_force"); #if !defined(NAIVE2) collocate_core_naive(length, co, pol, Vref); #else // variant collocate_core_naive2(length, co, pol, Vref); #endif timer.stop("collocate_brute_force"); T maxi = -2.0; for (int l = 0; l < static_cast(Vgemm.size(0)); l++) for (int m = 0; m < static_cast(Vgemm.size(1)); m++) for (int n = 0; n < static_cast(Vgemm.size(2)); n++) maxi = std::max(std::abs(Vref(l, m, n) - Vgemm(l, m, n)), maxi); pol.clear(); co.clear(); Vgemm.clear(); Vref.clear(); #if defined(SCRATCH) free(scratch); #endif return maxi; } // template void 
integrate_core_naive(const int *length_, // const mdarray &pol_, // const mdarray &Vtmp, // mdarray &co) // { // for (int gamma = 0; gamma < co.size(0); gamma++) { // for (int beta = 0; beta < co.size(1); beta++) { // for (int alpha = 0; alpha < co.size(2); alpha++) { // T res = 0.0; // for (int z = 0; z < length_[0]; z++) { // for (int y = 0; y < length_[1]; y++) { // const T c1 = pol_(0, gamma, z) * pol_(1, beta, y); // const T*LIBXSMM_RESTRICT vtmp = Vtmp.template at(z, y, 0); // for (int x = 0; x < length_[2]; x++) { // res += c1 * pol_(2, alpha, x) * vtmp[x]; // } // } // } // co(gamma, beta, alpha) = res; // } // } // } // } // template bool test_integrate_core(const int i, const int j, const int k, const int lmax) // { // mdarray pol = mdarray(3, // lmax, // std::max(std::max(i, j), k)); // mdarray co_ref = mdarray(lmax, lmax, lmax); // mdarray co_gemm = mdarray(lmax, lmax, lmax); // mdarray V = mdarray(i, j, k); // std::default_random_engine generator; // std::uniform_real_distribution distribution(-1.0, 1.0); // int length[3] = {i, j, k}; // for (int s = 0; s < pol.size(); s++) // pol[s] = distribution(generator); // for (int s = 0; s < V.size(); s++) // V[s] = distribution(generator); // co_gemm.zero(); // integrate_core(length, pol, V, co_gemm); // co_ref.zero(); // integrate_core_naive(length, // pol, // V, // co_ref); // T maxi = -2.0; // for (int l = 0; l < co_gemm.size(0); l++) // for (int m = 0; m < co_gemm.size(1); m++) // for (int n = 0; n < co_gemm.size(2); n++) { // maxi = std::max(std::abs(co_gemm(l, m, n) - co_ref(l, m, n)), maxi); // } // if (maxi > 1e-13) // return false; // return true; // } int main(int argc, char* argv[]) { typedef double elem_type; const int nrepin = (1 < argc ? atoi(argv[1]) : 100), nrep = std::max(nrepin, 1); const int n1in = (2 < argc ? atoi(argv[2]) : 0), n1 = std::max(n1in, 1); const int n2in = (3 < argc ? atoi(argv[3]) : n1), n2 = (0 < n2in ? n2in : n1); const int n3in = (4 < argc ? 
atoi(argv[4]) : n1), n3 = (0 < n3in ? n3in : n1); const int lmin = (5 < argc ? atoi(argv[5]) : 6), lmax = (0 < lmin ? lmin : 6); elem_type diff = 0; #if (defined(HAVE_MKL) || defined(__MKL)) && 0 mkl_set_threading_layer(MKL_THREADING_SEQUENTIAL); #endif timer.start("test_collocate_core"); for (int i = 0; i < nrep; ++i) { if (0 == n1in) { diff = std::max(diff, test_collocate_core(27, 31, 23, 3)); diff = std::max(diff, test_collocate_core(13, 35, 13, 7)); diff = std::max(diff, test_collocate_core(15, 11, 23, 9)); diff = std::max(diff, test_collocate_core(13, 19, 17, 5)); diff = std::max(diff, test_collocate_core(9, 11, 19, 3)); diff = std::max(diff, test_collocate_core(19, 17, 25, 5)); diff = std::max(diff, test_collocate_core(23, 19, 27, 1)); diff = std::max(diff, test_collocate_core(25, 23, 31, 11)); diff = std::max(diff, test_collocate_core(27, 31, 23, 13)); } else { diff = std::max(diff, test_collocate_core(n1, n2, n3, lmax)); } } timer.stop("test_collocate_core"); // process timings const auto result = timer.process(); // print default statistics std::cout << "Default statistic:" << std::endl; std::cout << result.print(); if (diff > 1e-14) { printf("Wrong result : maximum error %.15lf\n", diff); return 1; } return 0; } libxsmm-1.17/samples/cp2k/cp2k-collocate.vcxproj000066400000000000000000000561371415223013700216510ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 cp2k-collocate 10.0 {EDB39F12-1195-41FA-AE27-F227CDE067B2} Application Disabled Disabled v142 true Sequential Application true true Disabled Disabled v142 Sequential Application true Disabled Disabled v142 true Sequential Application Disabled Disabled v142 true Sequential true Application true Disabled Disabled v142 Sequential Application true Disabled Disabled true v142 Sequential <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ 
bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;HAVE_MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;HAVE_MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) 
libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;HAVE_MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;HAVE_MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;HAVE_MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console 
$(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;HAVE_MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/cp2k/cp2k-dbcsr.cpp000066400000000000000000000354151415223013700200640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #if !defined(USE_HEADER_ONLY) # include #else # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if !defined(ITYPE) # define ITYPE double #endif #if !defined(MAX_SIZE) # define MAX_SIZE ((LIBXSMM_MAX_M) * (LIBXSMM_MAX_N)) #endif /** >1: number of locks, =1: omp critical, =0: atomic */ #define CP2K_SYNCHRONIZATION 0 // ensures sufficient parallel slack #define CP2K_MIN_NPARALLEL 240 // ensures amortized atomic overhead #define CP2K_MIN_NLOCAL 160 // OpenMP schedule policy (and chunk size) #if defined(__MIC__) # define CP2K_SCHEDULE schedule(static,1) #else # define CP2K_SCHEDULE #endif #if defined(_OPENMP) && defined(CP2K_SYNCHRONIZATION) && (1 < (CP2K_SYNCHRONIZATION)) LIBXSMM_RETARGETABLE class LIBXSMM_RETARGETABLE lock_type { public: lock_type() { for (int i = 0; i < (CP2K_SYNCHRONIZATION); ++i) omp_init_lock(m_lock + i); } ~lock_type() { for (int i = 0; i < (CP2K_SYNCHRONIZATION); ++i) omp_destroy_lock(m_lock + i); } public: void acquire(const void* address) { omp_set_lock(m_lock + LIBXSMM_FOLD2(address, LIBXSMM_ALIGNMENT, CP2K_SYNCHRONIZATION)); } void release(const void* address) { omp_unset_lock(m_lock + LIBXSMM_FOLD2(address, LIBXSMM_ALIGNMENT, CP2K_SYNCHRONIZATION)); } private: omp_lock_t m_lock[CP2K_SYNCHRONIZATION]; } lock; #endif template LIBXSMM_INLINE LIBXSMM_RETARGETABLE void add(T *LIBXSMM_RESTRICT dst, const T *LIBXSMM_RESTRICT src, libxsmm_blasint nrows, libxsmm_blasint ncols, libxsmm_blasint ld_src = 0) { const libxsmm_blasint ld = (0 == ld_src ? 
ncols : ld_src); #if defined(_OPENMP) && defined(CP2K_SYNCHRONIZATION) && (0 < (CP2K_SYNCHRONIZATION)) # if (1 == (CP2K_SYNCHRONIZATION)) # pragma omp critical(smmadd) # else lock.acquire(dst); # endif #endif { for (libxsmm_blasint i = 0; i < nrows; ++i) { LIBXSMM_PRAGMA_UNROLL for (libxsmm_blasint j = 0; j < ncols; ++j) { const T value = src[i*ld+j]; #if defined(_OPENMP) && (!defined(CP2K_SYNCHRONIZATION) || (0 == (CP2K_SYNCHRONIZATION))) # pragma omp atomic #endif dst[i*ncols+j] += value; } } } #if defined(_OPENMP) && defined(CP2K_SYNCHRONIZATION) && (1 < (CP2K_SYNCHRONIZATION)) lock.release(dst); #endif } int main(int argc, char* argv[]) { int result = EXIT_SUCCESS; try { typedef ITYPE T; const libxsmm_blasint m = 1 < argc ? std::atoi(argv[1]) : 23; const libxsmm_blasint q = ((1ULL << 30) / (3 * m * m * sizeof(T))); const libxsmm_blasint r = 2 < argc ? (0 < std::atoi(argv[2]) ? std::atoi(argv[2]) : ('+' == *argv[2] ? (q << std::strlen(argv[2])) : ('-' == *argv[2] ? (q >> std::strlen(argv[2])) : 0))) : 0; const libxsmm_blasint t = 3 < argc ? (0 < std::atoi(argv[3]) ? std::atoi(argv[3]) : ('+' == *argv[3] ? ((CP2K_MIN_NLOCAL) << std::strlen(argv[3])) : ('-' == *argv[3] ? ((CP2K_MIN_NLOCAL) >> std::strlen(argv[3])) : -1))) : -1; const libxsmm_blasint k = 5 < argc ? std::atoi(argv[5]) : m; const libxsmm_blasint n = 4 < argc ? std::atoi(argv[4]) : k; const char transa = 'N', transb = 'N'; const ITYPE alpha = 1, beta = 1; const libxsmm_blasint csize = m * n; if ((MAX_SIZE) < csize) { throw "The size M x N is exceeding MAX_SIZE!"; } const libxsmm_blasint asize = m * k, bsize = k * n, aspace = LIBXSMM_ALIGNMENT / sizeof(T); const libxsmm_blasint s = 0 < r ? r : ((2ULL << 30) / ((asize + bsize) * sizeof(T))); // 2 GByte const libxsmm_blasint u = 0 < t ? 
t : static_cast(libxsmm_isqrt_u64(s * CP2K_MIN_NLOCAL / CP2K_MIN_NPARALLEL)); const size_t bwsize = static_cast((s * (asize + bsize)/*load*/ + LIBXSMM_UPDIV(s, u) * csize * 2/*accumulate*/) * sizeof(T)); const double gflops = 2.0 * s * m * n * k * 1E-9, scale = 1.0 / s; const char ops[] = "FLOPS"; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(NULL == env_check ? 0 : atof(env_check)); LIBXSMM_RETARGETABLE struct LIBXSMM_RETARGETABLE raii { // avoid std::vector (first-touch init. causes NUMA issue) T *a, *b, *c; raii(libxsmm_blasint asize_, libxsmm_blasint bsize_, libxsmm_blasint csize_) : a(new T[static_cast(asize_)]), b(new T[static_cast(bsize_)]) , c(new T[static_cast(csize_)]) {} ~raii() { delete[] a; delete[] b; delete[] c; } } buffer(s * asize + aspace - 1, s * bsize + aspace - 1, csize); T *const a = LIBXSMM_ALIGN(buffer.a, LIBXSMM_ALIGNMENT); T *const b = LIBXSMM_ALIGN(buffer.b, LIBXSMM_ALIGNMENT); T * /*const*/ c = buffer.c; // no alignment, but thread-local array will be aligned #if defined(_OPENMP) # pragma omp parallel for #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_MATINIT(ITYPE, 42 + i, a + i * asize, m, k, m, scale); LIBXSMM_MATINIT(ITYPE, 24 + i, b + i * bsize, k, n, k, scale); } #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) in(a: length(s * asize)) in(b: length(s * bsize)) out(c: length(csize)) #endif { // initialize LIBXSMM libxsmm_init(); #if !defined(LIBXSMM_OFFLOAD_TARGET) // some more setup similar to CP2K/intel branch libxsmm_set_gemm_auto_prefetch(LIBXSMM_X86_AVX512_MIC != libxsmm_get_target_archid() ? LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C : LIBXSMM_GEMM_PREFETCH_BL2_VIA_C); #endif //libxsmm_set_dispatch_trylock(1); fprintf(stdout, "m=%lli n=%lli k=%lli size=%lli memory=%.1f MB (%s)\n\n", static_cast(m), static_cast(n), static_cast(k), static_cast(s), 1.0 * (s * (asize + bsize) * sizeof(T)) / (1 << 20), 8 == sizeof(T) ? 
"DP" : "SP"); LIBXSMM_RETARGETABLE struct LIBXSMM_RETARGETABLE raii_expect { // avoid std::vector (first-touch init. causes NUMA issue) T *expect; explicit raii_expect(libxsmm_blasint size): expect(0 < size ? new T[static_cast(size)] : 0) {} ~raii_expect() { delete[] expect; } } expect_buffer(LIBXSMM_FEQ(0, check) ? 0 : csize); T *const expect = (0 == expect_buffer.expect ? c : expect_buffer.expect); libxsmm_matdiff_info d, diff; const T zero = 0; // eventually JIT-compile the requested kernel const libxsmm_mmfunction xmm(LIBXSMM_GEMM_FLAGS(transa, transb), m, n, k, LIBXSMM_PREFETCH); libxsmm_matdiff_clear(&diff); { // LAPACK/BLAS3 (warmup BLAS Library) std::fill_n(expect, csize, zero); #if defined(_OPENMP) # pragma omp parallel for CP2K_SCHEDULE #endif for (libxsmm_blasint i = 0; i < s; i += u) { T tmp[MAX_SIZE] = { 0 }; // make sure that stacksize is covering the problem size const T *ai = a + i * asize, *bi = b + i * bsize; for (libxsmm_blasint j = 0; j < LIBXSMM_MIN(u, s - i); ++j) { const T *const aij = ai + asize, *const bij = bi + bsize; libxsmm_blas_gemm(&transa, &transb, m, n, k, &alpha, ai, &m, bi, &k, &beta, tmp, &m); ai = aij; bi = bij; } add(expect, tmp, m, n); // atomic } } { // LAPACK/BLAS3 (reference) fprintf(stdout, "LAPACK/BLAS...\n"); std::fill_n(c, csize, zero); const unsigned long long start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel for CP2K_SCHEDULE #endif for (libxsmm_blasint i = 0; i < s; i += u) { T tmp[MAX_SIZE] = { 0 }; // make sure that stacksize is covering the problem size const T *ai = a + i * asize, *bi = b + i * bsize; for (libxsmm_blasint j = 0; j < LIBXSMM_MIN(u, s - i); ++j) { const T *const aij = ai + asize, *const bij = bi + bsize; libxsmm_blas_gemm(&transa, &transb, &m, &n, &k, &alpha, ai, &m, bi, &k, &beta, tmp, &m); ai = aij; bi = bij; } add(c, tmp, m, n); // atomic } const double duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { fprintf(stdout, "\tperformance: %.1f 
G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", bwsize / (duration * (1 << 30))); fprintf(stdout, "\tcalls/s: %.0f Hz\n", s / duration); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (!LIBXSMM_FEQ(0, check) && EXIT_SUCCESS == libxsmm_matdiff(&d, LIBXSMM_DATATYPE(ITYPE), m, n, expect, c, 0, 0)) { fprintf(stdout, "\tdiff: L2abs=%f Linfo=%f\n", d.l2_abs, d.linf_abs); libxsmm_matdiff_reduce(&diff, &d); } } { // inline an optimized implementation fprintf(stdout, "Inlined...\n"); std::fill_n(c, csize, zero); const unsigned long long start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel for CP2K_SCHEDULE #endif for (libxsmm_blasint i = 0; i < s; i += u) { T tmp[MAX_SIZE] = { 0 }; // make sure that stacksize is covering the problem size const T *ai = a + i * asize, *bi = b + i * bsize; for (libxsmm_blasint j = 0; j < LIBXSMM_MIN(u, s - i); ++j) { const T *const aij = ai + asize, *const bij = bi + bsize; LIBXSMM_INLINE_XGEMM(ITYPE, ITYPE, &transa, &transb, &m, &n, &k, &alpha, ai, &m, bi, &k, &beta, tmp, &m); ai = aij; bi = bij; } add(c, tmp, m, n); // atomic } const double duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", bwsize / (duration * (1 << 30))); fprintf(stdout, "\tcalls/s: %.0f Hz\n", s / duration); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (!LIBXSMM_FEQ(0, check) && EXIT_SUCCESS == libxsmm_matdiff(&d, LIBXSMM_DATATYPE(ITYPE), m, n, expect, c, 0, 0)) { fprintf(stdout, "\tdiff: L2abs=%f Linfo=%f\n", d.l2_abs, d.linf_abs); libxsmm_matdiff_reduce(&diff, &d); } } { // auto-dispatched fprintf(stdout, "Dispatched...\n"); std::fill_n(c, csize, zero); const unsigned long long start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel for CP2K_SCHEDULE #endif for (libxsmm_blasint i = 0; i < s; i += u) { T tmp[MAX_SIZE] 
= { 0 }; // make sure that stacksize is covering the problem size const T *ai = a + i * asize, *bi = b + i * bsize; for (libxsmm_blasint j = 0; j < LIBXSMM_MIN(u, s - i); ++j) { const T *const aij = ai + asize, *const bij = bi + bsize; libxsmm_gemm(&transa, &transb, m, n, k, &alpha, ai, &m, bi, &k, &beta, tmp, &m); ai = aij; bi = bij; } add(c, tmp, m, n); // atomic } const double duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", bwsize / (duration * (1 << 30))); fprintf(stdout, "\tcalls/s: %.0f Hz\n", s / duration); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (!LIBXSMM_FEQ(0, check) && EXIT_SUCCESS == libxsmm_matdiff(&d, LIBXSMM_DATATYPE(ITYPE), m, n, expect, c, 0, 0)) { fprintf(stdout, "\tdiff: L2abs=%f Linfo=%f\n", d.l2_abs, d.linf_abs); libxsmm_matdiff_reduce(&diff, &d); } } if (xmm) { // specialized routine fprintf(stdout, "Specialized...\n"); std::fill_n(c, csize, zero); const unsigned long long start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel for CP2K_SCHEDULE #endif for (libxsmm_blasint i = 0; i < s; i += u) { T tmp[MAX_SIZE] = { 0 }; // make sure that stacksize is covering the problem size const T *ai = a + i * asize, *bi = b + i * bsize; for (libxsmm_blasint j = 0; j < LIBXSMM_MIN(u, s - i); ++j) { const T *const aij = ai + asize, *const bij = bi + bsize; #if (0 != LIBXSMM_PREFETCH) xmm(ai, bi, tmp, LIBXSMM_GEMM_PREFETCH_A(aij + asize), LIBXSMM_GEMM_PREFETCH_B(bij + bsize), LIBXSMM_GEMM_PREFETCH_C(tmp)); #else xmm(ai, bi, tmp); #endif ai = aij; bi = bij; } add(c, tmp, m, n); // atomic } const double duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", bwsize / (duration * (1 << 30))); fprintf(stdout, "\tcalls/s: %.0f 
Hz\n", s / duration); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (!LIBXSMM_FEQ(0, check) && EXIT_SUCCESS == libxsmm_matdiff(&d, LIBXSMM_DATATYPE(ITYPE), m, n, expect, c, 0, 0)) { fprintf(stdout, "\tdiff: L2abs=%f Linfo=%f\n", d.l2_abs, d.linf_abs); libxsmm_matdiff_reduce(&diff, &d); } } // finalize LIBXSMM libxsmm_finalize(); fprintf(stdout, "Finished\n"); if (!LIBXSMM_FEQ(0, check)) { if (check < 100.0 * diff.normf_rel) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); result = EXIT_FAILURE; } } } } catch(const std::exception& e) { fprintf(stderr, "Error: %s\n", e.what()); result = EXIT_FAILURE; } catch(const char* message) { fprintf(stderr, "Error: %s\n", message); result = EXIT_FAILURE; } catch(...) { fprintf(stderr, "Error: unknown exception caught!\n"); result = EXIT_FAILURE; } return result; } libxsmm-1.17/samples/cp2k/cp2k-dbcsr.sh000077500000000000000000000050141415223013700177070ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/cp2k/cp2k-dbcsr.vcxproj000066400000000000000000000572171415223013700210010ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 cp2k-dbcsr {99A237B2-5EA1-4988-9D09-B4FD97C42AF1} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true true GenerateParallelCode SingleFile 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 
0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true true GenerateParallelCode SingleFile 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) 
MultiThreadedDebugDLL Level4 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/cp2k/cp2k-perf-jit.sh000077500000000000000000000071461415223013700203420ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) SCRT=${HERE}/../../scripts/libxsmm_utilities.py FILE=cp2k-perf.txt RUNS0=$(${SCRT} -1 $((64*64*64-0)) 19 23, 6, 14 16 29, 5 16 13 24 26, 9 16 22, 32, 64, 78, 16 29 55 0 0) RUNS1=$(${SCRT} -1 $((64*64*64-0)) 19 23, 6, 14 16 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55 0 0) RUNS2=$(${SCRT} -1 $((64*64*64-0)) 20 23, 6, 14 16 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 12 0 0) RUNS3=$(${SCRT} -1 $((64*64*64-0)) 26 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 32 29 55, 12 0 0) RUNS4=$(${SCRT} -1 $((64*64*64-1)) 31 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 32 29 55, 12, 13 26 28 32 45 0 0) RUNS5=$(${SCRT} -1 $((64*64*64-0)) 31 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 32 29 55, 12, 13 26 28 32 45 0 0) RUNS6=$(${SCRT} -1 $((80*80*80-0)) 35 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 32 29 55, 12, 13 26 28 32 45, 7 13 25 32 0 0) RUNS7=$(${SCRT} -1 $((80*80*80-0)) 35 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45 0 0) RUNS8=$(${SCRT} -1 $((80*80*80-0)) 37 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45, 4 10 0 0) RUNS9=$(${SCRT} -1 $((80*80*80-0)) 38 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45, 4 10 15 0 0) RUNS10=$(${SCRT} -1 $((128*128*128)) 41 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45, 4 10 15, 6 7 8 0 0) RUNS11=$(${SCRT} -1 $((128*128*128)) 46 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45, 4 10 15, 6 7 8, 13 14 25 26 32 0 0) case "$1" in "-"*) RUNS=RUNS${1:1}; shift ;; esac if [ -z "${RUNS}" ]; then RUNS=RUNS11 fi 
if [ "$1" ]; then SIZE=$1 shift else SIZE=0 fi if [ "$1" ]; then FILE=$1 shift fi cat /dev/null > ${FILE} NRUN=1 NMAX=$(echo ${!RUNS} | wc -w | tr -d " ") for RUN in ${!RUNS} ; do MVALUE=$(echo ${RUN} | cut --output-delimiter=' ' -d_ -f1) NVALUE=$(echo ${RUN} | cut --output-delimiter=' ' -d_ -f2) KVALUE=$(echo ${RUN} | cut --output-delimiter=' ' -d_ -f3) >&2 echo -n "${NRUN} of ${NMAX} (M=${MVALUE} N=${NVALUE} K=${KVALUE})... " ERROR=$({ CHECK=1 ${HERE}/cp2k ${MVALUE} ${SIZE} 0 ${NVALUE} ${KVALUE} >> ${FILE}; } 2>&1) RESULT=$? if [ 0 != ${RESULT} ]; then echo "FAILED(${RESULT}) ${ERROR}" exit 1 else echo "OK ${ERROR}" fi echo >> ${FILE} NRUN=$((NRUN+1)) done libxsmm-1.17/samples/cp2k/cp2k-perf.plt000066400000000000000000000301111415223013700177240ustar00rootroot00000000000000MPARM = 1 NPARM = 2 KPARM = 3 FLOPS = 6 HIM = -1 HIN = HIM HIK = HIM BASENAME = "cp2k" FILENAME = system("sh -c \"echo ${FILENAME}\"") if (FILENAME eq "") { FILENAME = BASENAME."-perf.pdf" } FILECOUNT = 1 # initial file number # MULTI =-1: multiple files; no titles # MULTI = 0: multiple files with titles # MULTI = 1: single file with titles MULTI = system("sh -c \"echo ${MULTI}\"") if (MULTI eq "") { MULTI = 1 } XFLOPS(M, N, K) = 2.0 * M * N * K NFLOPS(M, N, K) = XFLOPS(column(M), column(N), column(K)) NBYTES(M, N, K, ELEMSIZE) = ELEMSIZE * (column(M) * column(K) + column(K) * column(N) + column(M) * column(N)) AI(M, N, K, ELEMSIZE) = NFLOPS(M, N, K) / NBYTES(M, N, K, ELEMSIZE) TIME(M, N, K, F) = NFLOPS(M, N, K) * 1E-9 / column(F) BW(M, N, K, F, ELEMSIZE) = (column(M) * column(K) + column(K) * column(N)) * ELEMSIZE / (TIME(M, N, K, F) * 1024 * 1024 * 1024) stats BASENAME."-perf.dat" using (column(MPARM)*column(NPARM)*column(KPARM)) nooutput; MNK = STATS_stddev**(1.0/3.0); MAXMNK = int(STATS_max) stats BASENAME."-perf.dat" using (log(column(FLOPS))) nooutput; NSAMPLES = STATS_records; GEOFLOPS = exp(STATS_sum/STATS_records) stats BASENAME."-perf.dat" using FLOPS nooutput; MEDFLOPS = 
STATS_median; AVGFLOPS = STATS_mean; MINFLOPS = STATS_min; MAXFLOPS = STATS_max stats BASENAME."-perf.dat" using NPARM nooutput; XN = int(STATS_max) stats BASENAME."-perf.dat" using ((NFLOPS(MPARM,NPARM,KPARM)<=XFLOPS(13,13,13))?column(FLOPS):1/0) nooutput; BIN1_FLOPS = STATS_mean; BIN1_NSAMPLES = STATS_records stats BASENAME."-perf.dat" using (((XFLOPS(13,13,13)-1) { set title "Performance (Selected Kernels)" } set origin -0.03, 0 set pm3d interpolate 0, 0 #set colorbox horizontal user origin 0, 0.1 size 1, 0.1 #set autoscale fix if (0HIM) { set xrange [*:MNK] } if (0>HIN) { set yrange [*:MNK] } if (0>HIK) { set zrange [*:MNK] } set xlabel "M" set ylabel "N" offset -3.0 set zlabel "K" offset 1.0 set ticslevel 0 set cblabel "GFLOP/s" offset 1.5 set format x "%g"; set format y "%g"; set format z "%g"; set format cb "%g" splot BASENAME."-perf.dat" using MPARM:NPARM:KPARM:FLOPS notitle with points pointtype 7 linetype palette reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Performance (K-Average for ".sprintf("%u Kernels", NSAMPLES).")" } set origin -0.02, 0 set dgrid3d #9, 9 set pm3d interpolate 0, 0 map set autoscale fix set xlabel "M" set ylabel "N" offset -1.5 set cblabel "GFLOP/s" offset 0.5 set format x "%g"; set format y "%g"; set format cb "%g" set mxtics 2 splot BASENAME."-plot-avg.dat" using (("".strcol(3)."" eq "i")?(I1($1, XN)):(1/0)):(("".strcol(3)."" eq "i")?(J1($1, XN)):(1/0)):2 notitle with pm3d reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Performance (Average per Bin)" } set style fill solid 0.4 noborder set boxwidth 0.5 set grid y2tics linecolor "grey" unset key unset xtics set xtics ("MNK <= 13^3" 0, "13^3 < MNK <= 23^3" 1, "23^3 < MNK" 2) scale 0 offset 0, 0.2 set x2tics ("Small" 0, "Medium" 1, "Larger" 2) scale 0 set xlabel "Problem Size (MNK)" set ytics format "" set y2tics nomirror set y2label "GFLOP/s" set 
xrange [-0.5:2.5] set yrange [0:*] set autoscale fix set label sprintf("{/=9 ".FORMAT(BIN1_FLOPS)." GFLOP/s}", BIN1_FLOPS) at 0.0, BIN1_FLOPS centre offset 0, -1 front set label sprintf("{/=9 ".FORMAT(BIN2_FLOPS)." GFLOP/s}", BIN2_FLOPS) at 1.0, BIN2_FLOPS centre offset 0, -1 front set label sprintf("{/=9 ".FORMAT(BIN3_FLOPS)." GFLOP/s}", BIN3_FLOPS) at 2.0, BIN3_FLOPS centre offset 0, -1 front set label sprintf("{/=9 (".FORMAT(BIN1_MEMBW)." GB/s)}", BIN1_MEMBW) at 0.0, BIN1_FLOPS centre offset 0, -2 front set label sprintf("{/=9 (".FORMAT(BIN2_MEMBW)." GB/s)}", BIN2_MEMBW) at 1.0, BIN2_FLOPS centre offset 0, -2 front set label sprintf("{/=9 (".FORMAT(BIN3_MEMBW)." GB/s)}", BIN3_MEMBW) at 2.0, BIN3_FLOPS centre offset 0, -2 front set label sprintf("{/=9 N=%u}", BIN1_NSAMPLES) at 0.0, 0.0 centre offset 0, 0.5 front set label sprintf("{/=9 N=%u}", BIN2_NSAMPLES) at 1.0, 0.0 centre offset 0, 0.5 front set label sprintf("{/=9 N=%u}", BIN3_NSAMPLES) at 2.0, 0.0 centre offset 0, 0.5 front plot BASENAME."-perf.dat" \ using (0.0):(BIN1_FLOPS) notitle smooth unique with boxes linetype 1 linecolor "grey", \ "" using (1.0):(BIN2_FLOPS) notitle smooth unique with boxes linetype 1 linecolor "grey", \ "" using (2.0):(BIN3_FLOPS) notitle smooth unique with boxes linetype 1 linecolor "grey" reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Cummulative Performance Distribution (CDF for ".sprintf("%u Kernels", NSAMPLES).")" } set xlabel "Probability\n\n{/=9 Min.: ".sprintf(FORMAT(MINFLOPS), MINFLOPS)." GFLOP/s Geo.: ".sprintf(FORMAT(GEOFLOPS), GEOFLOPS)." GFLOP/s Med.: ".sprintf(FORMAT(MEDFLOPS), MEDFLOPS)." GFLOP/s Avg.: ".sprintf(FORMAT(AVGFLOPS), AVGFLOPS)." GFLOP/s Max.: ".sprintf(FORMAT(MAXFLOPS), MAXFLOPS)." 
GFLOP/s}" set ylabel "GB/s" set y2label "GFLOP/s" set format x "%g%%" set format y "%g" set format y2 "%g" set ytics nomirror set y2tics nomirror set grid x y2 linecolor "grey" set xrange [0:100] set yrange [0:*] set y2range [0:*] set fit quiet f(x) = b * x + a fit f(x) BASENAME."-plot-cdf.dat" using (("".strcol(3)."" eq "i")?(100*$2/FREQSUM):(1/0)):1 via a, b g(x) = (x - a) / b x50 = 0.5 * (100 + MAX(0, g(0))) h(x) = d * x + c dx = 100.0 / FREQN fit [x50-2.0*dx:x50+2.0*dx] h(x) BASENAME."-plot-cdf.dat" using (("".strcol(3)."" eq "i")?(100*$2/FREQSUM):(1/0)):1 via c, d set arrow from x50, second h(x50) to x50, second 0 front set arrow from x50, second h(x50) to 100, second h(x50) front set label sprintf("%.0f%%", x50) at x50, second 0.5 * h(x50) left offset 1 front set label sprintf(FORMAT(h(x50))." GFLOP/s", h(x50)) at 0.5 * (x50 + 100.0), second h(x50) centre offset 0, -1 front set key left invert plot BASENAME."-plot-mbw.dat" using (("".strcol(3)."" eq "i")?(100*$2/FREQSUM):(1/0)):1 axes x1y1 title "Memory Bandwidth" with lines linecolor "grey", \ BASENAME."-plot-cdf.dat" using (("".strcol(3)."" eq "i")?(100*$2/FREQSUM):(1/0)):1 axes x1y2 title "Compute Performance" with lines linewidth 2 reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Arithmetic Intensity (".sprintf("%u Kernels", NSAMPLES).")" } set grid x y2 linecolor "grey" set key left #spacing 0.5 set ytics format "" set y2tics nomirror set y2label "GFLOP/s" #set xlabel "FLOPS/Byte\n\n{/=9 ".sprintf("N: %u", NSAMPLES)." Min.: ".sprintf("%.1f", MINAI)." Geo.: ".sprintf("%.1f", GEOAI)." Med.: ".sprintf("%.1f", MEDAI)." Avg.: ".sprintf("%.1f", AVGAI)." Max.: ".sprintf("%.1f", MAXAI)."}" set xlabel "FLOPS/Byte (Min.: ".sprintf("%.1f", MINAI)." Geo.: ".sprintf("%.1f", GEOAI)." Med.: ".sprintf("%.1f", MEDAI)." Avg.: ".sprintf("%.1f", AVGAI)." 
Max.: ".sprintf("%.1f", MAXAI).")" set yrange [0:*] set autoscale fix plot BASENAME."-perf.dat" using (AI(MPARM,NPARM,KPARM,8)):FLOPS notitle smooth sbezier with lines linecolor "grey" linewidth 2, \ "" using (AI(MPARM,NPARM,KPARM,8)):FLOPS notitle smooth unique with points pointtype 7 pointsize 0.1 reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Memory Bandwidth Consumption (".sprintf("%u Kernels", NSAMPLES).")" } set grid x y2 linecolor "grey" set key left #spacing 0.5 set ytics format "" set y2tics nomirror set y2label "GB/s" set xlabel "Problem Size (MNK^{1/3})\n\n{/=9 Min.: ".sprintf("%.0f GB/s", MINMEMBW)." Geo.: ".sprintf("%.0f GB/s", GEOMEMBW)." Med.: ".sprintf("%.0f GB/s", MEDMEMBW)." Avg.: ".sprintf("%.0f GB/s", AVGMEMBW)." Max.: ".sprintf("%.0f GB/s", MAXMEMBW)."}" set yrange [0:*] set autoscale fix plot BASENAME."-perf.dat" using ((column(MPARM)*column(NPARM)*column(KPARM))**(1.0/3.0)):(BW(MPARM,NPARM,KPARM,FLOPS,8)) notitle smooth sbezier with lines linecolor "grey" linewidth 2, \ "" using ((column(MPARM)*column(NPARM)*column(KPARM))**(1.0/3.0)):(BW(MPARM,NPARM,KPARM,FLOPS,8)) notitle with points pointtype 7 pointsize 0.1 reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Compute Consumption (".sprintf("%u Kernels", NSAMPLES).")" } set grid x y2 linecolor "grey" set key left #spacing 0.5 set ytics format "" set y2tics nomirror set y2label "GFLOPS/s" set xlabel "Problem Size (MNK^{1/3})\n\n{/=9 Min.: ".sprintf(FORMAT(MINFLOPS), MINFLOPS)." GFLOP/s Geo.: ".sprintf(FORMAT(GEOFLOPS), GEOFLOPS)." GFLOP/s Med.: ".sprintf(FORMAT(MEDFLOPS), MEDFLOPS)." GFLOP/s Avg.: ".sprintf(FORMAT(AVGFLOPS), AVGFLOPS)." GFLOP/s Max.: ".sprintf(FORMAT(MAXFLOPS), MAXFLOPS)." 
GFLOP/s}" set yrange [0:*] set autoscale fix plot BASENAME."-perf.dat" using ((column(MPARM)*column(NPARM)*column(KPARM))**(1.0/3.0)):FLOPS notitle smooth sbezier with lines linecolor "grey" linewidth 2, \ "" using ((column(MPARM)*column(NPARM)*column(KPARM))**(1.0/3.0)):FLOPS notitle with points pointtype 7 pointsize 0.1 if (0!=system("sh -c \"if [[ -e cp2k-plot-join.dat ]]; then echo 1; else echo 0; fi\"")) { reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Performance (Selected Kernels)" } set style fill solid 0.4 border -1 set style data histograms set style histogram cluster #gap 2 #set boxwidth 0.5 relative set grid y2tics lc "grey" set key left #spacing 0.5 set xtics rotate by -45 scale 0; set bmargin 6 set ytics format "" set y2tics nomirror set y2label "GFLOP/s" set yrange [0:*] plot "cp2k-plot-join.dat" using FLOPS:xtic("(".strcol(MPARM).",".strcol(NPARM).",".strcol(KPARM).")") notitle } libxsmm-1.17/samples/cp2k/cp2k-plot.sh000077500000000000000000000064431415223013700175770ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### SORT=$(command -v sort) SED=$(command -v sed) CUT=$(command -v cut) VARIANT=Specialized if [ "$1" ]; then VARIANT=$1 shift fi HERE=$(cd "$(dirname "$0")" && pwd -P) FILE=${HERE}/cp2k-perf.txt GREP=$(command -v grep) PERF=$(${GREP} -A1 -i "${VARIANT}" ${FILE} | \ ${GREP} -e "performance" | \ ${CUT} -d" " -f2 | \ ${SORT} -n) NUM=$(echo "${PERF}" | wc -l | tr -d " ") MIN=$(echo ${PERF} | ${CUT} -d" " -f1) MAX=$(echo ${PERF} | ${CUT} -d" " -f${NUM}) echo "num=${NUM}" echo "min=${MIN}" echo "max=${MAX}" BC=$(command -v bc) if [ "${BC}" ]; then AVG=$(echo "$(echo -n "scale=3;(${PERF})/${NUM}" | tr "\n" "+")" | ${BC}) NUM2=$((NUM / 2)) if [ "0" = "$((NUM % 2))" ]; then A=$(echo ${PERF} | ${CUT} -d" " -f${NUM2}) B=$(echo ${PERF} | ${CUT} -d" " -f$((NUM2 + 1))) MED=$(echo "$(echo -n "scale=3;(${A} + ${B})/2")" | ${BC}) else MED=$(echo ${PERF} | ${CUT} -d" " -f$((NUM2 + 1))) fi echo "avg=${AVG}" echo "med=${MED}" fi if [ -f /cygdrive/c/Program\ Files/gnuplot/bin/wgnuplot ]; then WGNUPLOT=/cygdrive/c/Program\ Files/gnuplot/bin/wgnuplot GNUPLOT=/cygdrive/c/Program\ Files/gnuplot/bin/gnuplot elif [ -f /cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/wgnuplot ]; then WGNUPLOT=/cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/wgnuplot GNUPLOT=/cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/gnuplot else GNUPLOT=$(command -v gnuplot) WGNUPLOT=${GNUPLOT} fi GNUPLOT_MAJOR=0 GNUPLOT_MINOR=0 if [ -f "${GNUPLOT}" ]; then GNUPLOT_MAJOR=$("${GNUPLOT}" --version | ${SED} "s/.\+ \([0-9]\).\([0-9]\) .*/\1/") GNUPLOT_MINOR=$("${GNUPLOT}" --version | ${SED} "s/.\+ \([0-9]\).\([0-9]\) .*/\2/") fi GNUPLOT_VERSION=$((GNUPLOT_MAJOR * 10000 + GNUPLOT_MINOR * 100)) if [ "40600" -le "${GNUPLOT_VERSION}" ]; then # determine behavior of sort command export LC_ALL=C.UTF-8 if [ "" = "$1" ]; then FILENAME=cp2k-$(echo ${VARIANT} | tr '[:upper:]' '[:lower:]').pdf else FILENAME=$1 shift fi if [ "" = "$1" ]; then MULTI=1 else 
MULTI=$1 shift fi ${GREP} -i -A2 \ -e "^m=" -e "${VARIANT}" \ ${FILE} | \ ${SED} \ -e "s/m=//" -e "s/n=//" -e "s/k=//" -e "s/ldc=[0-9][0-9]* //" -e "s/ (..*) / /" \ -e "s/size=//" -e "s/batch=[0-9][0-9]* //" -e "s/memory=//" -e "s/ GB\/s//" \ -e "/^..*\.\.\./Id" -e "/^$/d" -e "/--/d" | \ ${SED} \ -e "N;s/ MB\( (.P)\)*\n\tperformance://g" \ -e "N;s/ GFLOPS\/s\n\tbandwidth://g" \ > "${HERE}/cp2k-perf.dat" env \ GDFONTPATH=/cygdrive/c/Windows/Fonts \ FILENAME=${FILENAME} \ MULTI=${MULTI} \ "${WGNUPLOT}" cp2k-perf.plt fi libxsmm-1.17/samples/cp2k/mdarray.hpp000066400000000000000000001014341415223013700175710ustar00rootroot00000000000000/** \file mdarray.hpp * * \brief Contains implementation of multidimensional array class. */ #ifndef __MDARRAY_HPP__ #define __MDARRAY_HPP__ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_CUDA #include "GPU/cuda.hpp" #endif #if defined(HAVE_MKL) || defined(__MKL) # include #elif defined(__CBLAS) # include #else # define CblasRowMajor 101 # define CblasColMajor 102 extern "C" void cblas_dger(int, int, int, double, const double*, int, const double*, int, double*, int); #endif #if !defined(CBLAS_LAYOUT) # define CBLAS_LAYOUT int #endif #ifdef NDEBUG #define mdarray_assert(condition__) #else #define mdarray_assert(condition__) \ { \ if (!(condition__)) { \ printf("Assertion (%s) failed ", #condition__); \ printf("at line %i of file %s\n", __LINE__, __FILE__); \ printf("array label: %s\n", label_.c_str()); \ int mdarray_assert_i_ = 0; \ for (; mdarray_assert_i_ < N; mdarray_assert_i_++) \ printf("dim[%i].size = %llu\n", mdarray_assert_i_, \ static_cast( \ dims_[mdarray_assert_i_].size())); \ raise(SIGTERM); \ exit(-13); \ } \ } #endif /// Type of the main processing unit. enum device_t { /// CPU device. CPU = 0, /// GPU device (with CUDA programming model). GPU = 1 }; /// Type of memory. /** Various combinations of flags can be used. 
To check for any host memory (pinned or non-pinned): \code{.cpp} mem_type & memory_t::host == memory_t::host \endcode To check for pinned memory: \code{.cpp} mem_type & memory_t::host_pinned == memory_t::host_pinned \endcode To check for device memory: \code{.cpp} mem_type & memory_t::device == memory_t::device \endcode */ enum class memory_t : unsigned int { /// Nothing. none = 0b000, /// Host memory. host = 0b001, /// Pinned host memory. This is host memory + extra bit flag. host_pinned = 0b011, /// Device memory. device = 0b100 }; inline constexpr memory_t operator&(memory_t a__, memory_t b__) { return static_cast(static_cast(a__) & static_cast(b__)); } inline constexpr memory_t operator|(memory_t a__, memory_t b__) { return static_cast(static_cast(a__) | static_cast(b__)); } inline constexpr bool on_device(memory_t mem_type__) { return (mem_type__ & memory_t::device) == memory_t::device ? true : false; } /// Index descriptor of mdarray. class mdarray_index_descriptor { private: /// Beginning of index. int64_t begin_{0}; /// End of index. int64_t end_{-1}; /// Size of index. size_t size_{0}; public: /// Constructor of empty descriptor. mdarray_index_descriptor() { } /// Constructor for index range [0, size). mdarray_index_descriptor(size_t const size__) : begin_(0) , end_(size__ - 1) , size_(size__) { } /// Constructor for index range [begin, end] mdarray_index_descriptor(int64_t const begin__, int64_t const end__) : begin_(begin__) , end_(end__) , size_(end_ - begin_ + 1) { assert(end_ >= begin_); }; /// Constructor for index range [begin, end] mdarray_index_descriptor(std::pair const range__) : begin_(range__.first) , end_(range__.second) , size_(end_ - begin_ + 1) { assert(end_ >= begin_); }; /// Return first index value. inline int64_t begin() const { return begin_; } /// Return last index value. inline int64_t end() const { return end_; } /// Return index size. inline size_t size() const { return size_; } }; /// Base class of multidimensional array. 
template class mdarray_base { protected: /// Optional array label. std::string label_; /// Unique pointer to the allocated memory. /// std::unique_ptr> unique_ptr_{nullptr}; /// Raw pointer. T* raw_ptr_{nullptr}; // layout Fortran by default CBLAS_LAYOUT layout_{CblasColMajor}; // the table is allocated outside the class bool allocated_outside_cpu_{true}; // the table is allocated outside the class bool allocated_outside_gpu_{true}; #ifdef __GPU /// Unique pointer to the allocated GPU memory. /// std::unique_ptr> unique_ptr_device_{nullptr}; /// Raw pointer to GPU memory T* raw_ptr_device_{nullptr}; #endif /// Array dimensions. std::array dims_; /// List of offsets to compute the element location by dimension indices. std::array offsets_; /// leading dimension on CPU and GPUs (can be different because of alignment constraint) size_t ld_cpu_{0}; size_t ld_gpu_{0}; size_t raw_data_size_{0}; void init_dimensions(std::array const dims__) { dims_ = dims__; offsets_[0] = -dims_[0].begin(); size_t ld{1}; for (int i = 1; i < N; i++) { ld *= dims_[i - 1].size(); offsets_[i] = ld; offsets_[0] -= ld * dims_[i].begin(); } ld_cpu_ = dims_[0].size(); #ifdef HAVE_CUDA ni = dims_[0].size() / WARP_SIZE; lda_gpu_ = 32 * ( (dims_[0].size() % WARP_SIZE) != 0 + ni); #endif } private: inline int64_t idx(std::array idx__) const { #ifdef NDEBUG for (int d = 0; d < N; d++) { mdarray_assert(idx__[d] >= dims_[d].begin() && i0 <= dims_[d].end()); } #endif size_t i = offsets_[0] + idx__[0]; for (int d = 0; d < idx__.size(); d++) i += idx__[d] * offsets_[d]; mdarray_assert(/*i >= 0 &&*/ i < size()); return i; } inline int64_t idx(int64_t const i0) const { static_assert(N == 1, "wrong number of dimensions"); mdarray_assert(i0 >= dims_[0].begin() && i0 <= dims_[0].end()); size_t i = offsets_[0] + i0; mdarray_assert(/*i >= 0 &&*/ i < size()); return i; } inline int64_t idx(int64_t const i0, int64_t const i1) const { static_assert(N == 2, "wrong number of dimensions"); mdarray_assert(i0 >= 
dims_[0].begin() && i0 <= dims_[0].end()); mdarray_assert(i1 >= dims_[1].begin() && i1 <= dims_[1].end()); size_t i = offsets_[0] + i0 + i1 * offsets_[1]; mdarray_assert(/*i >= 0 &&*/ i < size()); return i; } inline int64_t idx(int64_t const i0, int64_t const i1, int64_t const i2) const { static_assert(N == 3, "wrong number of dimensions"); mdarray_assert(i0 >= dims_[0].begin() && i0 <= dims_[0].end()); mdarray_assert(i1 >= dims_[1].begin() && i1 <= dims_[1].end()); mdarray_assert(i2 >= dims_[2].begin() && i2 <= dims_[2].end()); size_t i = offsets_[0] + i0 + i1 * offsets_[1] + i2 * offsets_[2]; mdarray_assert(/*i >= 0 &&*/ i < size()); return i; } inline int64_t idx(int64_t const i0, int64_t const i1, int64_t const i2, int64_t const i3) const { static_assert(N == 4, "wrong number of dimensions"); mdarray_assert(i0 >= dims_[0].begin() && i0 <= dims_[0].end()); mdarray_assert(i1 >= dims_[1].begin() && i1 <= dims_[1].end()); mdarray_assert(i2 >= dims_[2].begin() && i2 <= dims_[2].end()); mdarray_assert(i3 >= dims_[3].begin() && i3 <= dims_[3].end()); size_t i = offsets_[0] + i0 + i1 * offsets_[1] + i2 * offsets_[2] + i3 * offsets_[3]; mdarray_assert(/*i >= 0 &&*/ i < size()); return i; } inline int64_t idx(int64_t const i0, int64_t const i1, int64_t const i2, int64_t const i3, int64_t const i4) const { static_assert(N == 5, "wrong number of dimensions"); mdarray_assert(i0 >= dims_[0].begin() && i0 <= dims_[0].end()); mdarray_assert(i1 >= dims_[1].begin() && i1 <= dims_[1].end()); mdarray_assert(i2 >= dims_[2].begin() && i2 <= dims_[2].end()); mdarray_assert(i3 >= dims_[3].begin() && i3 <= dims_[3].end()); mdarray_assert(i4 >= dims_[4].begin() && i4 <= dims_[4].end()); size_t i = offsets_[0] + i0 + i1 * offsets_[1] + i2 * offsets_[2] + i3 * offsets_[3] + i4 * offsets_[4]; mdarray_assert(/*i >= 0 &&*/ i < size()); return i; } template inline T* at_idx(int64_t const idx__) { switch (pu) { case CPU: { mdarray_assert(raw_ptr_ != nullptr); return &raw_ptr_[idx__]; } case 
GPU: { #ifdef HAVE_CUDA mdarray_assert(raw_ptr_device_ != nullptr); return &raw_ptr_device_[idx__]; #else printf("error at line %i of file %s: not compiled with GPU support\n", __LINE__, __FILE__); exit(0); #endif } } return nullptr; } template inline T const* at_idx(int64_t const idx__) const { switch (pu) { case CPU: { mdarray_assert(raw_ptr_ != nullptr); return &raw_ptr_[idx__]; } case GPU: { #ifdef HAVE_CUDA mdarray_assert(raw_ptr_device_ != nullptr); return &raw_ptr_device_[idx__]; #else printf("error at line %i of file %s: not compiled with GPU support\n", __LINE__, __FILE__); exit(0); #endif } } return nullptr; } /// Copy constructor is forbidden mdarray_base(mdarray_base const& src) = delete; /// Assignment operator is forbidden mdarray_base& operator=(mdarray_base const& src) = delete; public: /// Constructor of an empty array. mdarray_base() { } /// Destructor. ~mdarray_base() { } /// Move constructor mdarray_base(mdarray_base&& src) : label_(src.label_) , //unique_ptr_(std::move(src.unique_ptr_)), raw_ptr_(src.raw_ptr_) , allocated_outside_cpu_(src.allocated_outside_cpu_) #ifdef __GPU , allocated_outside_gpu_(src.allocated_outside_gpu_) , //unique_ptr_device_(std::move(src.unique_ptr_device_)), raw_ptr_device_(src.raw_ptr_device_) , layout_(src.Layout_) #endif { for (int i = 0; i < N; i++) { dims_[i] = src.dims_[i]; offsets_[i] = src.offsets_[i]; } src.raw_ptr_ = nullptr; #ifdef __GPU src.raw_ptr_device_ = nullptr; #endif } /// Move assigment operator inline mdarray_base& operator=(mdarray_base&& src) { if (this != &src) { label_ = src.label_; layout_ = src.layout_; raw_ptr_ = src.raw_ptr_; raw_data_size_ = src.raw_data_size_; allocated_outside_cpu_ = src.allocated_outside_cpu_; src.raw_ptr_ = nullptr; #ifdef __GPU raw_ptr_device_ = src.raw_ptr_device_; src.raw_ptr_device_ = nullptr; allocated_outside_gpu_ = src.allocated_outside_gpu_; #endif for (int i = 0; i < N; i++) { dims_[i] = src.dims_[i]; offsets_[i] = src.offsets_[i]; } } return *this; } /// 
Allocate memory for array. void allocate(memory_t memory__) { if ((memory__ & memory_t::host) == memory_t::host) { #if defined(_WIN32) raw_ptr_ = static_cast(_aligned_malloc(sizeof(T) * size(), 256)); if (raw_ptr_ == nullptr) #else if (posix_memalign(reinterpret_cast(&raw_ptr_), 256, sizeof(T) * size()) != 0) #endif { printf("Allocation failed\n"); std::abort(); } allocated_outside_cpu_ = false; } #ifdef __GPU if ((memory__ & memory_t::device) == memory_t::device) { cudaMalloc(&raw_prt_device_, sizeof(T) * size()); allocated_outside_gpu_ = false; } #endif } void deallocate(memory_t memory__) { if ((memory__ & memory_t::host) == memory_t::host) { if ((raw_ptr_ != nullptr) && (!allocated_outside_cpu_)) { #if defined(_WIN32) _aligned_free(raw_ptr_); #else free(raw_ptr_); #endif raw_ptr_ = nullptr; } } #ifdef __GPU if ((memory__ & memory_t::device) == memory_t::device) { if ((raw_ptr_ != nullptr) && (!allocated_outside_gpu_)) { free(raw_ptr_device_); raw_ptr_device_ = nullptr; } } #endif } void clear() { deallocate(memory_t::host); #ifdef HAVE_CUDA deallocate(memory_t::device); #endif } inline T& operator()(int64_t const i0) { mdarray_assert(raw_ptr_ != nullptr); return raw_ptr_[idx(i0)]; } inline T const& operator()(int64_t const i0) const { mdarray_assert(raw_ptr_ != nullptr); return raw_ptr_[idx(i0)]; } inline T& operator()(int64_t const i0, int64_t const i1) { mdarray_assert(raw_ptr_ != nullptr); if (layout_ == CblasColMajor) return raw_ptr_[idx(i0, i1)]; else return raw_ptr_[idx(i1, i0)]; } inline T const& operator()(int64_t const i0, int64_t const i1) const { mdarray_assert(raw_ptr_ != nullptr); if (layout_ == CblasColMajor) return raw_ptr_[idx(i0, i1)]; else return raw_ptr_[idx(i1, i0)]; } inline T& operator()(int64_t const i0, int64_t const i1, int64_t const i2) { mdarray_assert(raw_ptr_ != nullptr); if (layout_ == CblasColMajor) return raw_ptr_[idx(i0, i1, i2)]; else return raw_ptr_[idx(i2, i1, i0)]; } inline T const& operator()(int64_t const i0, int64_t const 
i1, int64_t const i2) const { mdarray_assert(raw_ptr_ != nullptr); if (layout_ == CblasColMajor) return raw_ptr_[idx(i0, i1, i2)]; else return raw_ptr_[idx(i2, i1, i0)]; } inline T& operator()(int64_t const i0, int64_t const i1, int64_t const i2, int64_t const i3) { mdarray_assert(raw_ptr_ != nullptr); if (layout_ == CblasColMajor) return raw_ptr_[idx(i0, i1, i2, i3)]; else return raw_ptr_[idx(i3, i2, i1, i0)]; } inline T const& operator()(int64_t const i0, int64_t const i1, int64_t const i2, int64_t const i3) const { mdarray_assert(raw_ptr_ != nullptr); if (layout_ == CblasColMajor) return raw_ptr_[idx(i0, i1, i2, i3)]; else return raw_ptr_[idx(i3, i2, i1, i0)]; } inline T& operator()(int64_t const i0, int64_t const i1, int64_t const i2, int64_t const i3, int64_t const i4) { mdarray_assert(raw_ptr_ != nullptr); if (layout_ == CblasColMajor) return raw_ptr_[idx(i0, i1, i2, i3, i4)]; else return raw_ptr_[idx(i4, i3, i2, i1, i0)]; } inline T const& operator()(int64_t const i0, int64_t const i1, int64_t const i2, int64_t const i3, int64_t const i4) const { mdarray_assert(raw_ptr_ != nullptr); if (layout_ == CblasColMajor) return raw_ptr_[idx(i0, i1, i2, i3, i4)]; else return raw_ptr_[idx(i4, i3, i2, i1, i0)]; } inline T& operator()(std::array idx__) { mdarray_assert(raw_ptr_ != nullptr); return raw_ptr_[idx(idx__)]; } inline T& operator[](size_t const idx__) { mdarray_assert(/*idx__ >= 0 &&*/ idx__ < size()); return raw_ptr_[idx__]; } inline T const& operator[](size_t const idx__) const { assert(/*idx__ >= 0 &&*/ idx__ < size()); return raw_ptr_[idx__]; } template inline T* at() { return at_idx(0); } template inline T const* at() const { return at_idx(0); } template inline T* at(int64_t const i0) { return at_idx(idx(i0)); } template inline T const* at(int64_t const i0) const { return at_idx(idx(i0)); } template inline T* at(int64_t const i0, int64_t const i1) { if (layout_ == CblasColMajor) return at_idx(idx(i0, i1)); else return at_idx(idx(i1, i0)); } template inline 
T const* at(int64_t const i0, int64_t const i1) const { if (layout_ == CblasColMajor) return at_idx(idx(i0, i1)); else return at_idx(idx(i1, i0)); } template inline T* at(int64_t const i0, int64_t const i1, int64_t const i2) { if (layout_ == CblasColMajor) return at_idx(idx(i0, i1, i2)); else return at_idx(idx(i2, i1, i0)); } template inline T const* at(int64_t const i0, int64_t const i1, int64_t const i2) const { if (layout_ == CblasColMajor) return at_idx(idx(i0, i1, i2)); else return at_idx(idx(i2, i1, i0)); } template inline T* at(int64_t const i0, int64_t const i1, int64_t const i2, int64_t const i3) { if (layout_ == CblasColMajor) return at_idx(idx(i0, i1, i2, i3)); else return at_idx(idx(i3, i2, i1, i0)); } template inline T const* at(int64_t const i0, int64_t const i1, int64_t const i2, int64_t const i3) const { if (layout_ == CblasColMajor) return at_idx(idx(i0, i1, i2, i3)); else return at_idx(idx(i3, i2, i1, i0)); } template inline T* at(int64_t const i0, int64_t const i1, int64_t const i2, int64_t const i3, int64_t const i4) { if (layout_ == CblasColMajor) return at_idx(idx(i0, i1, i2, i3, i4)); else return at_idx(idx(i4, i3, i2, i1, i0)); } template inline T* at(std::array const idx__) { if (layout_ == CblasRowMajor) std::reverse(std::begin(idx__), std::end(idx__)); return at_idx(idx(idx__)); } template /// Return total size (number of elements) of the array. inline size_t size() const { size_t size_{1}; for (int i = 0; i < N; i++) { size_ *= dims_[i].size(); } return size_; } /// Return size of particular dimension. inline size_t size(int i) const { mdarray_assert(i < N); if (layout_ == CblasRowMajor) return dims_[N - i - 1].size(); else return dims_[i].size(); } /// Return leading dimension size. 
inline uint32_t ld() const { mdarray_assert(dims_[0].size() < size_t(1 << 31)); return (int32_t)dims_[0].size(); } /// Compute hash of the array /** Example: printf("hash(h) : %16llX\n", h.hash()); */ inline uint64_t hash(uint64_t h__ = 5381) const { for (size_t i = 0; i < size() * sizeof(T); i++) { h__ = ((h__ << 5) + h__) + ((unsigned char*)raw_ptr_)[i]; } return h__; } /// Copy the content of the array to dest void operator>>(mdarray_base& dest__) const { for (int i = 0; i < N; i++) { if (dest__.dims_[i].begin() != dims_[i].begin() || dest__.dims_[i].end() != dims_[i].end()) { printf("error at line %i of file %s: array dimensions don't match\n", __LINE__, __FILE__); raise(SIGTERM); exit(-1); } } std::memcpy(dest__.raw_ptr_, raw_ptr_, size() * sizeof(T)); } /// Copy n elements starting from idx0. template inline void copy(size_t idx0__, size_t n__, int stream_id__ = -1) { #ifdef HAVE_CUDA mdarray_assert(raw_ptr_ != nullptr); mdarray_assert(raw_ptr_device_ != nullptr); mdarray_assert(idx0__ + n__ <= size()); if ((from__ & memory_t::host) == memory_t::host && (to__ & memory_t::device) == memory_t::device) { if (stream_id__ == -1) { acc::copyin(&raw_ptr_device_[idx0__], &raw_ptr_[idx0__], n__); } else { acc::copyin(&raw_ptr_device_[idx0__], &raw_ptr_[idx0__], n__, stream_id__); } } if ((from__ & memory_t::device) == memory_t::device && (to__ & memory_t::host) == memory_t::host) { if (stream_id__ == -1) { acc::copyout(&raw_ptr_[idx0__], &raw_ptr_device_[idx0__], n__); } else { acc::copyout(&raw_ptr_[idx0__], &raw_ptr_device_[idx0__], n__, stream_id__); } } #else (void)idx0__; (void)n__; (void)stream_id__; /* unused */ #endif } template inline void copy(size_t n__) { copy(0, n__); } template inline void async_copy(size_t n__, int stream_id__) { copy(0, n__, stream_id__); } template inline void copy() { copy(0, size()); } template inline void async_copy(int stream_id__) { copy(0, size(), stream_id__); } inline void retrieve(T *dst) { if (dst == this->at()) return; 
mdarray_assert(dst != nullptr); memcpy (dst, this->at(), sizeof(T) * this->size()); } inline void store(const T *src) { if (src == this->at< CPU>()) return; mdarray_assert(src != nullptr); memcpy (this->at(), src, sizeof(double) * this->size()); } /// Zero n elements starting from idx0. template inline void zero(size_t idx0__, size_t n__) { mdarray_assert(idx0__ + n__ <= size()); if (((mem_type__ & memory_t::host) == memory_t::host) && n__) { mdarray_assert(raw_ptr_ != nullptr); std::memset(reinterpret_cast(&raw_ptr_[idx0__]), 0, n__ * sizeof(T)); } #ifdef HAVE_CUDA if (((mem_type__ & memory_t::device) == memory_t::device) && on_device() && n__) { mdarray_assert(raw_ptr_device_ != nullptr); acc::zero(&raw_ptr_device_[idx0__], n__); } #endif } template inline void zero() { zero(0, size()); } inline bool on_device() const { #ifdef HAVE_CUDA return (raw_ptr_device_ != nullptr); #else return false; #endif } }; /// Multidimensional array with the column-major (Fortran) order. template class mdarray : public mdarray_base { public: mdarray() { } mdarray(std::array const& shape, memory_t memory__ = memory_t::host, std::string label__ = "") { this->label_ = label__; this->layout_ = format; this->init_dimensions(shape); this->allocate(memory__); } mdarray(mdarray_index_descriptor const& d0, memory_t memory__ = memory_t::host, std::string label__ = "") { static_assert(N == 1, "wrong number of dimensions"); this->label_ = label__; this->layout_ = format; this->init_dimensions({d0}); this->allocate(memory__); } mdarray(mdarray_index_descriptor const& d0, mdarray_index_descriptor const& d1, memory_t memory__ = memory_t::host, std::string label__ = "") { static_assert(N == 2, "wrong number of dimensions"); this->label_ = label__; this->layout_ = format; if (this->layout_ == CblasColMajor) this->init_dimensions({{d0, d1}}); else this->init_dimensions({{d1, d0}}); this->allocate(memory__); } mdarray(mdarray_index_descriptor const& d0, mdarray_index_descriptor const& d1, 
mdarray_index_descriptor const& d2, memory_t memory__ = memory_t::host, std::string label__ = "") { static_assert(N == 3, "wrong number of dimensions"); this->label_ = label__; this->layout_ = format; if (this->layout_ == CblasColMajor) this->init_dimensions({{d0, d1, d2}}); else this->init_dimensions({{d2, d1, d0}}); this->allocate(memory__); } mdarray(mdarray_index_descriptor const& d0, mdarray_index_descriptor const& d1, mdarray_index_descriptor const& d2, mdarray_index_descriptor const& d3, memory_t memory__ = memory_t::host, std::string label__ = "") { static_assert(N == 4, "wrong number of dimensions"); this->label_ = label__; this->layout_ = format; if (this->layout_ == CblasColMajor) this->init_dimensions({{d0, d1, d2, d3}}); else this->init_dimensions({{d3, d2, d1, d0}}); this->allocate(memory__); } mdarray(mdarray_index_descriptor const& d0, mdarray_index_descriptor const& d1, mdarray_index_descriptor const& d2, mdarray_index_descriptor const& d3, mdarray_index_descriptor const& d4, memory_t memory__ = memory_t::host, std::string label__ = "") { static_assert(N == 5, "wrong number of dimensions"); this->label_ = label__; this->layout_ = format; if (this->layout_ == CblasColMajor) this->init_dimensions({{d0, d1, d2, d3, d4}}); else this->init_dimensions({{d4, d3, d2, d1, d0}}); this->allocate(memory__); } mdarray(T* ptr__, std::array const& shape, std::string label__ = "") { this->layout_ = format; this->label_ = label__; this->init_dimensions(shape); this->raw_ptr_ = ptr__; } mdarray(T* ptr__, mdarray_index_descriptor const& d0, std::string label__ = "") { static_assert(N == 1, "wrong number of dimensions"); this->layout_ = format; this->label_ = label__; this->init_dimensions({d0}); this->raw_ptr_ = ptr__; } mdarray(T* ptr__, T* ptr_device__, mdarray_index_descriptor const& d0, std::string label__ = "") { static_assert(N == 1, "wrong number of dimensions"); this->layout_ = format; this->label_ = label__; this->init_dimensions({d0}); this->raw_ptr_ = 
ptr__; #ifdef HAVE_CUDA this->raw_ptr_device_ = ptr_device__; #else (void)ptr_device__; /* unused */ #endif } mdarray(T* ptr__, mdarray_index_descriptor const& d0, mdarray_index_descriptor const& d1, std::string label__ = "") { static_assert(N == 2, "wrong number of dimensions"); this->layout_ = format; this->label_ = label__; if (this->layout_ == CblasColMajor) this->init_dimensions({{d0, d1}}); else this->init_dimensions({{d1, d0}}); this->raw_ptr_ = ptr__; } mdarray(T* ptr__, T* ptr_device__, mdarray_index_descriptor const& d0, mdarray_index_descriptor const& d1, std::string label__ = "") { static_assert(N == 2, "wrong number of dimensions"); this->layout_ = format; this->label_ = label__; if (this->layout_ == CblasColMajor) this->init_dimensions({{d0, d1}}); else this->init_dimensions({{d1, d0}}); this->raw_ptr_ = ptr__; #ifdef HAVE_CUDA this->raw_ptr_device_ = ptr_device__; #else (void)ptr_device__; /* unused */ #endif } mdarray(T* ptr__, mdarray_index_descriptor const& d0, mdarray_index_descriptor const& d1, mdarray_index_descriptor const& d2, std::string label__ = "") { static_assert(N == 3, "wrong number of dimensions"); this->layout_ = format; this->label_ = label__; if (this->layout_ == CblasColMajor) this->init_dimensions({{d0, d1, d2}}); else this->init_dimensions({{d2, d1, d0}}); this->raw_ptr_ = ptr__; } mdarray(T* ptr__, T* ptr_device__, mdarray_index_descriptor const& d0, mdarray_index_descriptor const& d1, mdarray_index_descriptor const& d2, std::string label__ = "") { static_assert(N == 3, "wrong number of dimensions"); this->layout_ = format; this->label_ = label__; if (this->layout_ == CblasColMajor) this->init_dimensions({{d0, d1, d2}}); else this->init_dimensions({{d2, d1, d0}}); this->raw_ptr_ = ptr__; #ifdef HAVE_CUDA this->raw_ptr_device_ = ptr_device__; #else (void)ptr_device__; /* unused */ #endif } mdarray(T* ptr__, mdarray_index_descriptor const& d0, mdarray_index_descriptor const& d1, mdarray_index_descriptor const& d2, 
mdarray_index_descriptor const& d3, std::string label__ = "") { static_assert(N == 4, "wrong number of dimensions"); this->layout_ = format; this->label_ = label__; if (this->layout_ == CblasColMajor) this->init_dimensions({{d0, d1, d2, d3}}); else this->init_dimensions({{d3, d2, d1, d0}}); this->raw_ptr_ = ptr__; } mdarray(T* ptr__, mdarray_index_descriptor const& d0, mdarray_index_descriptor const& d1, mdarray_index_descriptor const& d2, mdarray_index_descriptor const& d3, mdarray_index_descriptor const& d4, std::string label__ = "") { static_assert(N == 5, "wrong number of dimensions"); this->layout_ = format; this->label_ = label__; if (this->layout_ == CblasColMajor) this->init_dimensions({{d0, d1, d2, d3, d4}}); else this->init_dimensions({{d4, d3, d2, d1, d0}}); this->raw_ptr_ = ptr__; } // mdarray& operator=(std::function f__) // { // static_assert(N == 1, "wrong number of dimensions"); // for (int64_t i0 = this->dims_[0].begin(); i0 <= this->dims_[0].end(); // i0++) { // (*this)(i0) = f__(i0); // } // return *this; // } // mdarray& operator=(std::function f__) // { // static_assert(N == 2, "wrong number of dimensions"); // for (int64_t i1 = this->dims_[1].begin(); i1 <= this->dims_[1].end(); // i1++) { // for (int64_t i0 = this->dims_[0].begin(); i0 <= // this->dims_[0].end(); i0++) { // (*this)(i0, i1) = f__(i0, i1); // } // } // return *this; // } }; // Alias for matrix template using matrix = mdarray; /// Serialize to std::ostream template std::ostream& operator<<(std::ostream& out, mdarray& v) { if (v.size()) { out << v[0]; for (size_t i = 1; i < v.size(); i++) { out << std::string(" ") << v[i]; } } return out; } #endif // __MDARRAY_HPP__ libxsmm-1.17/samples/cp2k/rt_graph.cc000066400000000000000000000437351415223013700175470ustar00rootroot00000000000000/* * Copyright (c) 2019 Simon Frasch * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include "rt_graph.hpp" #include #include #include #include #include #include #include #include #include namespace rt_graph { // ====================== // internal helper // ====================== namespace internal { namespace { struct Format { Format(Stat stat_) : stat(stat_) { switch (stat_) { case Stat::Count: header = "#"; space = 6; break; case Stat::Total: header = "Total"; space = 14; break; case Stat::Mean: header = "Mean"; space = 14; break; case Stat::Median: header = "Median"; space = 14; break; case Stat::QuartileHigh: header = "Quartile High"; space = 14; break; case Stat::QuartileLow: header = "Quartile Low"; space = 14; break; case Stat::Min: header = "Min"; space = 14; break; case Stat::Max: header = "Max"; space = 14; break; case Stat::Percentage: header = "%"; space = 11; break; case Stat::ParentPercentage: header = "Parent %"; space = 11; break; } } Stat stat; std::string header; std::size_t space; }; // format time input in seconds into string with appropriate unit auto format_time(const double time_seconds) -> std::string { if (time_seconds <= 0.0) return std::string("0 s"); // time is always greater than 0 here const double exponent = std::log10(std::abs(time_seconds)); const int siExponent = static_cast(std::floor(exponent / 3.0) * 3); std::stringstream result; result << std::fixed << std::setprecision(2); result << time_seconds * std::pow(10.0, static_cast(-siExponent)); result << " "; switch (siExponent) { case 24: result << "Y"; break; case 21: result << "Z"; break; case 18: result << "E"; break; case 15: result << "P"; break; case 12: result << "T"; break; case 9: result << "G"; break; case 6: result << "M"; break; case 3: result << "k"; break; case 0: break; case -3: result << "m"; break; case -6: result << "u"; break; case -9: result << "n"; break; case -12: result << "p"; break; case -15: result << "f"; break; case -18: result << "a"; break; case -21: result << "z"; break; case -24: result << "y"; break; default: result << "?"; } 
result << "s"; return result.str(); } auto calc_median(const std::vector::const_iterator& begin, const std::vector::const_iterator& end) -> double { const auto n = end - begin; if (n == 0) return 0.0; if (n % 2 == 0) { return (*(begin + n / 2) + *(begin + n / 2 - 1)) / 2.0; } else { return *(begin + n / 2); } } auto print_stat(std::ostream& out, const Format& format, const std::vector& sortedTimings, double totalSum, double parentSum, double currentSum) -> void { switch (format.stat) { case Stat::Count: out << std::right << std::setw(format.space) << sortedTimings.size(); break; case Stat::Total: out << std::right << std::setw(format.space) << format_time(currentSum); break; case Stat::Mean: out << std::right << std::setw(format.space) << format_time(currentSum / sortedTimings.size()); break; case Stat::Median: out << std::right << std::setw(format.space) << format_time(calc_median(sortedTimings.begin(), sortedTimings.end())); break; case Stat::QuartileHigh: { const double upperQuartile = calc_median(sortedTimings.begin() + sortedTimings.size() / 2 + (sortedTimings.size() % 2) * (sortedTimings.size() > 1), sortedTimings.end()); out << std::right << std::setw(format.space) << format_time(upperQuartile); } break; case Stat::QuartileLow: { const double lowerQuartile = calc_median(sortedTimings.begin(), sortedTimings.begin() + sortedTimings.size() / 2); out << std::right << std::setw(format.space) << format_time(lowerQuartile); } break; case Stat::Min: out << std::right << std::setw(format.space) << format_time(sortedTimings.front()); break; case Stat::Max: out << std::right << std::setw(format.space) << format_time(sortedTimings.back()); break; case Stat::Percentage: { const double p = (totalSum < currentSum || totalSum == 0) ? 100.0 : currentSum / totalSum * 100.0; out << std::right << std::fixed << std::setprecision(2) << std::setw(format.space) << p; } break; case Stat::ParentPercentage: { const double p = (parentSum < currentSum || parentSum == 0) ? 
100.0 : currentSum / parentSum * 100.0; out << std::right << std::fixed << std::setprecision(2) << std::setw(format.space) << p; } break; } } // Helper struct for creating a tree of timings struct TimeStampPair { std::string identifier; double time = 0.0; std::size_t startIdx = 0; std::size_t stopIdx = 0; internal::TimingNode* nodePtr = nullptr; }; auto calculate_statistic(std::vector values) -> std::tuple { if (values.empty()) return std::make_tuple(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); std::sort(values.begin(), values.end()); const double min = values.front(); const double max = values.back(); const double median = calc_median(values.begin(), values.end()); const double sum = std::accumulate(values.begin(), values.end(), 0.0); const double mean = sum / values.size(); const double lowerQuartile = calc_median(values.begin(), values.begin() + values.size() / 2); const double upperQuartile = calc_median( values.begin() + values.size() / 2 + (values.size() % 2) * (values.size() > 1), values.end()); return std::make_tuple(sum, mean, median, min, max, lowerQuartile, upperQuartile); } // print rt_graph nodes in tree recursively auto print_node(std::ostream& out, const std::vector formats, const std::size_t identifierSpace, const std::string& nodePrefix, const internal::TimingNode& node, const bool isSubNode, const bool isLastSubnode, double parentTime, double totalTime) -> void { double sum, mean, median, min, max, lowerQuartile, upperQuartile; std::tie(sum, mean, median, min, max, lowerQuartile, upperQuartile) = calculate_statistic(node.timings); if (!isSubNode) { totalTime = sum; parentTime = sum; } const double totalPercentage = (totalTime < sum || totalTime == 0) ? 100.0 : sum / totalTime * 100.0; const double parentPercentage = (parentTime < sum || parentTime == 0) ? 
100.0 : sum / parentTime * 100.0; std::stringstream totalPercentageStream; totalPercentageStream << std::fixed << std::setprecision(2) << totalPercentage; std::stringstream parentPercentageStream; parentPercentageStream << std::fixed << std::setprecision(2) << parentPercentage; out << std::left << std::setw(identifierSpace); if (isSubNode) out << nodePrefix + "- " + node.identifier; else out << nodePrefix + node.identifier; auto sortedTimings = node.timings; std::sort(sortedTimings.begin(), sortedTimings.end()); const double currentTime = std::accumulate(sortedTimings.begin(), sortedTimings.end(), 0.0); for (const auto& format : formats) { print_stat(out, format, sortedTimings, totalTime, parentTime, currentTime); } out << std::endl; for (const auto& subNode : node.subNodes) { print_node(out, formats, identifierSpace, nodePrefix + std::string(" |"), subNode, true, &subNode == &node.subNodes.back(), sum, totalTime); if (!isLastSubnode && &subNode == &node.subNodes.back()) { out << nodePrefix << std::endl; } } } // determine length of padding required for printing entire tree identifiers recursively auto max_node_identifier_length(const internal::TimingNode& node, const std::size_t recursionDepth, const std::size_t addPerLevel, const std::size_t parentMax) -> std::size_t { std::size_t currentLength = node.identifier.length() + recursionDepth * addPerLevel; std::size_t max = currentLength > parentMax ? 
currentLength : parentMax; for (const auto& subNode : node.subNodes) { const std::size_t subMax = max_node_identifier_length(subNode, recursionDepth + 1, addPerLevel, max); if (subMax > max) max = subMax; } return max; } auto export_node_json(const std::string& padding, const std::list& nodeList, std::ostream& stream) -> void { stream << "{" << std::endl; const std::string nodePadding = padding + " "; const std::string subNodePadding = nodePadding + " "; for (const auto& node : nodeList) { stream << nodePadding << "\"" << node.identifier << "\" : {" << std::endl; stream << subNodePadding << "\"timings\" : ["; for (const auto& value : node.timings) { stream << value; if (&value != &(node.timings.back())) stream << ", "; } stream << "]," << std::endl; stream << subNodePadding << "\"sub-timings\" : "; export_node_json(subNodePadding, node.subNodes, stream); stream << nodePadding << "}"; if (&node != &(nodeList.back())) stream << ","; stream << std::endl; } stream << padding << "}" << std::endl; } auto extract_timings(const std::string& identifier, const std::list& nodes, std::vector& timings) -> void { for (const auto& node : nodes) { if (node.identifier == identifier) { timings.insert(timings.end(), node.timings.begin(), node.timings.end()); } extract_timings(identifier, node.subNodes, timings); } } } // namespace } // namespace internal // ====================== // Timer // ====================== auto Timer::process() const -> TimingResult { std::list results; std::stringstream warnings; try { std::vector timePairs; timePairs.reserve(timeStamps_.size() / 2); // create pairs of start / stop timings for (std::size_t i = 0; i < timeStamps_.size(); ++i) { if (timeStamps_[i].type == internal::TimeStampType::Start) { internal::TimeStampPair pair; pair.startIdx = i; pair.identifier = std::string(timeStamps_[i].identifierPtr); std::size_t numInnerMatchingIdentifiers = 0; // search for matching stop after start for (std::size_t j = i + 1; j < timeStamps_.size(); ++j) { // 
only consider matching identifiers if (std::string(timeStamps_[j].identifierPtr) == std::string(timeStamps_[i].identifierPtr)) { if (timeStamps_[j].type == internal::TimeStampType::Stop && numInnerMatchingIdentifiers == 0) { // Matching stop found std::chrono::duration duration = timeStamps_[j].time - timeStamps_[i].time; pair.time = duration.count(); pair.stopIdx = j; timePairs.push_back(pair); if (pair.time < 0) { warnings << "rt_graph WARNING:Measured time is negative. Non-steady system-clock?!" << std::endl; } break; } else if (timeStamps_[j].type == internal::TimeStampType::Stop && numInnerMatchingIdentifiers > 0) { // inner stop with matching identifier --numInnerMatchingIdentifiers; } else if (timeStamps_[j].type == internal::TimeStampType::Start) { // inner start with matching identifier ++numInnerMatchingIdentifiers; } } } if (pair.stopIdx == 0) { warnings << "rt_graph WARNING: Start / stop time stamps do not match for \"" << timeStamps_[i].identifierPtr << "\"!" << std::endl; } } } // create tree of timings where sub-nodes represent timings fully enclosed by another start / // stop pair Use the fact that timePairs is sorted by startIdx for (std::size_t i = 0; i < timePairs.size(); ++i) { auto& pair = timePairs[i]; // find potential parent by going backwards through pairs, starting with the current pair // position for (auto timePairIt = timePairs.rbegin() + (timePairs.size() - i); timePairIt != timePairs.rend(); ++timePairIt) { if (timePairIt->stopIdx > pair.stopIdx && timePairIt->nodePtr != nullptr) { auto& parentNode = *(timePairIt->nodePtr); // check if sub-node with identifier exists bool nodeFound = false; for (auto& subNode : parentNode.subNodes) { if (subNode.identifier == pair.identifier) { nodeFound = true; subNode.timings.push_back(pair.time); // mark node position in pair for finding sub-nodes pair.nodePtr = &(subNode); break; } } if (!nodeFound) { // create new sub-node internal::TimingNode newNode; newNode.identifier = pair.identifier; 
newNode.timings.push_back(pair.time); parentNode.subNodes.push_back(std::move(newNode)); // mark node position in pair for finding sub-nodes pair.nodePtr = &(parentNode.subNodes.back()); } break; } } // No parent found, must be top level node if (pair.nodePtr == nullptr) { // Check if top level node with same name exists for (auto& topNode : results) { if (topNode.identifier == pair.identifier) { topNode.timings.push_back(pair.time); pair.nodePtr = &(topNode); break; } } } // New top level node if (pair.nodePtr == nullptr) { internal::TimingNode newNode; newNode.identifier = pair.identifier; newNode.timings.push_back(pair.time); // newNode.parent = nullptr; results.push_back(std::move(newNode)); // mark node position in pair for finding sub-nodes pair.nodePtr = &(results.back()); } } } catch (const std::exception& e) { warnings << "rt_graph WARNING: Processing of timings failed: " << e.what() << std::endl; } catch (...) { warnings << "rt_graph WARNING: Processing of timings failed!" << std::endl; } return TimingResult(std::move(results), warnings.str()); } // ====================== // // ====================== auto TimingResult::json() const -> std::string { std::stringstream jsonStream; jsonStream << std::scientific; internal::export_node_json("", rootNodes_, jsonStream); return jsonStream.str(); } auto TimingResult::get_timings(const std::string& identifier) const -> std::vector { std::vector timings; internal::extract_timings(identifier, rootNodes_, timings); return timings; } auto TimingResult::print(std::vector statistic) const -> std::string { std::stringstream stream; // print warnings stream << warnings_; // calculate space for printing identifiers std::size_t identifierSpace = 0; for (const auto& node : rootNodes_) { const auto nodeMax = internal::max_node_identifier_length(node, 0, 2, identifierSpace); if (nodeMax > identifierSpace) identifierSpace = nodeMax; } identifierSpace += 3; auto totalSpace = identifierSpace; std::vector formats; 
formats.reserve(statistic.size()); for (const auto& stat : statistic) { formats.emplace_back(stat); totalSpace += formats.back().space; } // Construct table header // Table start stream << std::string(totalSpace, '=') << std::endl; // header stream << std::right << std::setw(identifierSpace) << ""; for (const auto& format : formats) { stream << std::right << std::setw(format.space) << format.header; } stream << std::endl; // Header separation line stream << std::string(totalSpace, '-') << std::endl; // print all timings for (const auto& node : rootNodes_) { internal::print_node(stream, formats, identifierSpace, std::string(), node, false, true, 0.0, 0.0); stream << std::endl; } // End table stream << std::string(totalSpace, '=') << std::endl; return stream.str(); } } // namespace rt_graph libxsmm-1.17/samples/cp2k/rt_graph.hpp000066400000000000000000000175411415223013700177450ustar00rootroot00000000000000/* * Copyright (c) 2019 Simon Frasch * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef RT_GRAPH_HPP_GUARD #define RT_GRAPH_HPP_GUARD #include #include #include #include #include #include #include namespace rt_graph { using ClockType = std::chrono::high_resolution_clock; // Selection of available statistics enum class Stat { Count, // Number of measurements Total, // Total accumulated time Mean, // Mean time Median, // Median time QuartileHigh, // Third quartile time QuartileLow, // First quartile time Min, // Mininum time Max, // Maximum time Percentage, // Percentage of accumulated time with respect to the top-level node in graph ParentPercentage // Percentage of accumulated time with respect to the parent node in graph }; // internal helper functionality namespace internal { enum class TimeStampType { Start, Stop, Empty }; struct TimeStamp { TimeStamp() : type(TimeStampType::Empty) {} // Identifier pointer must point to compile time string literal TimeStamp(const char* identifier, const TimeStampType& stampType) : time(ClockType::now()), identifierPtr(identifier), type(stampType) {} ClockType::time_point time; const char* identifierPtr; TimeStampType type; }; struct TimingNode { std::string identifier; std::vector timings; std::list subNodes; }; } // namespace internal // Processed timings results. class TimingResult { public: TimingResult(std::list rootNodes, std::string warnings) : rootNodes_(std::move(rootNodes)), warnings_(std::move(warnings)) {} // Get json representation of the full graph with all timings. 
Unit of time is seconds. auto json() const -> std::string; // Get all timings for given identifier auto get_timings(const std::string& identifier) const -> std::vector; // Print graph statistic to string. auto print(std::vector statistic = {Stat::Count, Stat::Total, Stat::Percentage, Stat::ParentPercentage, Stat::Median, Stat::Min, Stat::Max}) const -> std::string; private: std::list rootNodes_; std::string warnings_; }; class ScopedTiming; // Timer class, which allows to start / stop measurements with a given identifier. class Timer { public: // reserve space for 1000'000 measurements Timer() { timeStamps_.reserve(2 * 1000 * 1000); } // reserve space for given number of measurements explicit Timer(std::size_t reserveCount) { timeStamps_.reserve(2 * reserveCount); } // start with string literal identifier template inline auto start(const char (&identifierPtr)[N]) -> void { atomic_signal_fence(std::memory_order_seq_cst); // only prevents compiler reordering timeStamps_.emplace_back(identifierPtr, internal::TimeStampType::Start); atomic_signal_fence(std::memory_order_seq_cst); // only prevents compiler reordering } // start with string identifier (storing string object comes with some additional overhead) inline auto start(std::string identifier) -> void { atomic_signal_fence(std::memory_order_seq_cst); // only prevents compiler reordering identifierStrings_.emplace_back(std::move(identifier)); timeStamps_.emplace_back(identifierStrings_.back().c_str(), internal::TimeStampType::Start); atomic_signal_fence(std::memory_order_seq_cst); // only prevents compiler reordering } // stop with string literal identifier template inline auto stop(const char (&identifierPtr)[N]) -> void { atomic_signal_fence(std::memory_order_seq_cst); // only prevents compiler reordering timeStamps_.emplace_back(identifierPtr, internal::TimeStampType::Stop); atomic_signal_fence(std::memory_order_seq_cst); // only prevents compiler reordering } // stop with string identifier (storing string 
object comes with some additional overhead) inline auto stop(std::string identifier) -> void { atomic_signal_fence(std::memory_order_seq_cst); // only prevents compiler reordering identifierStrings_.emplace_back(std::move(identifier)); timeStamps_.emplace_back(identifierStrings_.back().c_str(), internal::TimeStampType::Stop); atomic_signal_fence(std::memory_order_seq_cst); // only prevents compiler reordering } // clear timer and reserve space for given number of new measurements. inline auto clear(std::size_t reserveCount) -> void { timeStamps_.clear(); identifierStrings_.clear(); this->reserve(reserveCount); } // reserve space for given number of measurements. Can prevent allocations at start / stop calls. inline auto reserve(std::size_t reserveCount) -> void { timeStamps_.reserve(reserveCount); } // process timings into result type auto process() const -> TimingResult; private: inline auto stop_with_ptr(const char* identifierPtr) -> void { atomic_signal_fence(std::memory_order_seq_cst); // only prevents compiler reordering timeStamps_.emplace_back(identifierPtr, internal::TimeStampType::Stop); atomic_signal_fence(std::memory_order_seq_cst); // only prevents compiler reordering } friend ScopedTiming; std::vector timeStamps_; std::deque identifierStrings_; // pointer to elements always remain valid after push back }; // Helper class, which calls start() upon creation and stop() on timer when leaving scope with given // identifier. 
class ScopedTiming { public: // timer reference must be valid for the entire lifetime template ScopedTiming(const char (&identifierPtr)[N], Timer& timer) : identifierPtr_(identifierPtr), timer_(timer) { timer_.start(identifierPtr); } ScopedTiming(std::string identifier, Timer& timer) : identifierPtr_(nullptr), identifier_(std::move(identifier)), timer_(timer) { timer_.start(identifier_); } ScopedTiming(const ScopedTiming&) = delete; ScopedTiming(ScopedTiming&&) = delete; auto operator=(const ScopedTiming&) -> ScopedTiming& = delete; auto operator=(ScopedTiming &&) -> ScopedTiming& = delete; ~ScopedTiming() { if (identifierPtr_) { timer_.stop_with_ptr(identifierPtr_); } else { timer_.stop(std::move(identifier_)); } } private: const char* identifierPtr_; std::string identifier_; Timer& timer_; }; } // namespace rt_graph #endif libxsmm-1.17/samples/deeplearning/000077500000000000000000000000001415223013700172145ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/cnnlayer/000077500000000000000000000000001415223013700210275ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/cnnlayer/Makefile000066400000000000000000000076351415223013700225020ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . 
CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 0 OMP = 1 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/layer_example_f32 $(OUTDIR)/layer_example_i8i32 $(OUTDIR)/layer_example_i8i8 $(OUTDIR)/layer_example_bf16 .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) $(OUTDIR)/layer_example_f32: $(OUTDIR)/.make 
$(BLDDIR)/layer_example_f32-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(BLDDIR)/layer_example_f32-c.o $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(OUTDIR)/layer_example_bf16: $(OUTDIR)/.make $(BLDDIR)/layer_example_bf16-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(BLDDIR)/layer_example_bf16-c.o $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(OUTDIR)/layer_example_i8i32: $(OUTDIR)/.make $(BLDDIR)/layer_example_i8i32-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(BLDDIR)/layer_example_i8i32-c.o $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(OUTDIR)/layer_example_i8i8: $(OUTDIR)/.make $(BLDDIR)/layer_example_i8i8-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(BLDDIR)/layer_example_i8i8-c.o $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc ./../common/dnn_common.h $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc ./../common/dnn_common.h $(SRCDIR) $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif 
libxsmm-1.17/samples/deeplearning/cnnlayer/kernel_test/000077500000000000000000000000001415223013700233465ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/cnnlayer/kernel_test/allconv_f32_ppad_skx_01_07thr.slurm000077500000000000000000000006771415223013700317740ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH -J allconv_f32_ppad_skx_01_07thr #SBATCH --get-user-env #SBATCH --partition=skx #SBATCH --ntasks=1 #SBATCH --cpus-per-task=112 #SBATCH --time=24:00:00 export OMP_NUM_THREADS=7 export KMP_AFFINITY=granularity=fine,compact,1,28 export ITERS=1 #export LIBXSMM_VERBOSE=-1 date srun ./run_all_conv.sh 1 ${ITERS} 1 f32 F L 1 srun ./run_all_conv.sh 1 ${ITERS} 1 f32 B L 1 srun ./run_all_conv.sh 1 ${ITERS} 1 f32 U L 1 date libxsmm-1.17/samples/deeplearning/cnnlayer/kernel_test/allconv_f32_ppad_skx_01_13thr.slurm000077500000000000000000000007001415223013700317540ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH -J allconv_f32_ppad_skx_01_13thr #SBATCH --get-user-env #SBATCH --partition=skx #SBATCH --ntasks=1 #SBATCH --cpus-per-task=112 #SBATCH --time=24:00:00 export OMP_NUM_THREADS=13 export KMP_AFFINITY=granularity=fine,compact,1,28 export ITERS=1 #export LIBXSMM_VERBOSE=-1 date srun ./run_all_conv.sh 1 ${ITERS} 1 f32 F L 1 srun ./run_all_conv.sh 1 ${ITERS} 1 f32 B L 1 srun ./run_all_conv.sh 1 ${ITERS} 1 f32 U L 1 date libxsmm-1.17/samples/deeplearning/cnnlayer/kernel_test/allconv_f32_ppad_skx_01_28thr.slurm000077500000000000000000000007001415223013700317620ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH -J allconv_f32_ppad_skx_01_28thr #SBATCH --get-user-env #SBATCH --partition=skx #SBATCH --ntasks=1 #SBATCH --cpus-per-task=112 #SBATCH --time=24:00:00 export OMP_NUM_THREADS=28 export KMP_AFFINITY=granularity=fine,compact,1,28 export ITERS=1 #export LIBXSMM_VERBOSE=-1 date srun ./run_all_conv.sh 1 ${ITERS} 1 f32 F L 1 srun ./run_all_conv.sh 1 ${ITERS} 1 f32 B L 1 srun ./run_all_conv.sh 1 ${ITERS} 1 f32 U L 1 date 
libxsmm-1.17/samples/deeplearning/cnnlayer/kernel_test/allconv_f32_ppad_skx_11_07thr.slurm000077500000000000000000000007021415223013700317620ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH -J allconv_f32_ppad_skx_11_07thr #SBATCH --get-user-env #SBATCH --partition=skx #SBATCH --ntasks=1 #SBATCH --cpus-per-task=112 #SBATCH --time=24:00:00 export OMP_NUM_THREADS=7 export KMP_AFFINITY=granularity=fine,compact,1,28 export ITERS=1 #export LIBXSMM_VERBOSE=-1 date srun ./run_all_conv.sh 11 ${ITERS} 1 f32 F L 1 srun ./run_all_conv.sh 11 ${ITERS} 1 f32 B L 1 srun ./run_all_conv.sh 11 ${ITERS} 1 f32 U L 1 date libxsmm-1.17/samples/deeplearning/cnnlayer/kernel_test/allconv_f32_ppad_skx_11_13thr.slurm000077500000000000000000000007031415223013700317600ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH -J allconv_f32_ppad_skx_11_13thr #SBATCH --get-user-env #SBATCH --partition=skx #SBATCH --ntasks=1 #SBATCH --cpus-per-task=112 #SBATCH --time=24:00:00 export OMP_NUM_THREADS=13 export KMP_AFFINITY=granularity=fine,compact,1,28 export ITERS=1 #export LIBXSMM_VERBOSE=-1 date srun ./run_all_conv.sh 11 ${ITERS} 1 f32 F L 1 srun ./run_all_conv.sh 11 ${ITERS} 1 f32 B L 1 srun ./run_all_conv.sh 11 ${ITERS} 1 f32 U L 1 date libxsmm-1.17/samples/deeplearning/cnnlayer/kernel_test/allconv_f32_ppad_skx_28_16thr.slurm000077500000000000000000000007031415223013700317730ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH -J allconv_f32_ppad_skx_28_16thr #SBATCH --get-user-env #SBATCH --partition=skx #SBATCH --ntasks=1 #SBATCH --cpus-per-task=112 #SBATCH --time=24:00:00 export OMP_NUM_THREADS=16 export KMP_AFFINITY=granularity=fine,compact,1,28 export ITERS=1 #export LIBXSMM_VERBOSE=-1 date srun ./run_all_conv.sh 28 ${ITERS} 1 f32 F L 1 srun ./run_all_conv.sh 28 ${ITERS} 1 f32 B L 1 srun ./run_all_conv.sh 28 ${ITERS} 1 f32 U L 1 date 
libxsmm-1.17/samples/deeplearning/cnnlayer/kernel_test/allconv_f32_ppad_skx_28_28thr.slurm000077500000000000000000000007031415223013700317760ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH -J allconv_f32_ppad_skx_28_28thr #SBATCH --get-user-env #SBATCH --partition=skx #SBATCH --ntasks=1 #SBATCH --cpus-per-task=112 #SBATCH --time=24:00:00 export OMP_NUM_THREADS=28 export KMP_AFFINITY=granularity=fine,compact,1,28 export ITERS=1 #export LIBXSMM_VERBOSE=-1 date srun ./run_all_conv.sh 28 ${ITERS} 1 f32 F L 1 srun ./run_all_conv.sh 28 ${ITERS} 1 f32 B L 1 srun ./run_all_conv.sh 28 ${ITERS} 1 f32 U L 1 date libxsmm-1.17/samples/deeplearning/cnnlayer/kernel_test/allconv_f32_ppad_skx_50_28thr_bwd.slurm000077500000000000000000000005511415223013700326260ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH -J allconv_f32_ppad_skx_50_28thr_bwd #SBATCH --get-user-env #SBATCH --partition=skx #SBATCH --ntasks=1 #SBATCH --cpus-per-task=112 #SBATCH --time=24:00:00 export OMP_NUM_THREADS=28 export KMP_AFFINITY=granularity=fine,compact,1,28 export ITERS=1 #export LIBXSMM_VERBOSE=-1 date srun ./run_all_conv.sh 50 ${ITERS} 1 f32 B L 1 date libxsmm-1.17/samples/deeplearning/cnnlayer/kernel_test/allconv_f32_ppad_skx_50_28thr_fwd.slurm000077500000000000000000000005511415223013700326320ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH -J allconv_f32_ppad_skx_50_28thr_fwd #SBATCH --get-user-env #SBATCH --partition=skx #SBATCH --ntasks=1 #SBATCH --cpus-per-task=112 #SBATCH --time=24:00:00 export OMP_NUM_THREADS=28 export KMP_AFFINITY=granularity=fine,compact,1,28 export ITERS=1 #export LIBXSMM_VERBOSE=-1 date srun ./run_all_conv.sh 50 ${ITERS} 1 f32 F L 1 date libxsmm-1.17/samples/deeplearning/cnnlayer/kernel_test/allconv_f32_ppad_skx_50_28thr_upd.slurm000077500000000000000000000005511415223013700326420ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH -J allconv_f32_ppad_skx_50_28thr_upd #SBATCH --get-user-env #SBATCH --partition=skx #SBATCH --ntasks=1 #SBATCH 
--cpus-per-task=112 #SBATCH --time=24:00:00 export OMP_NUM_THREADS=28 export KMP_AFFINITY=granularity=fine,compact,1,28 export ITERS=1 #export LIBXSMM_VERBOSE=-1 date srun ./run_all_conv.sh 50 ${ITERS} 1 f32 U L 1 date libxsmm-1.17/samples/deeplearning/cnnlayer/kernel_test_clean.sh000077500000000000000000000043211415223013700250470ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) MKTEMP=${HERE}/../../../.mktmp.sh SED=$(command -v sed) CP=$(command -v cp) RM=$(command -v rm) CLEANUP="-o -D" JOBDIR=kernel_test JOBEXT=slurm if [ "${MKTEMP}" ] && [ "${SED}" ] && \ [ "${CP}" ] && [ "${RM}" ]; then # remove any leftover temporary files ${RM} -f .${JOBDIR}_??????.${JOBEXT} # create temporary file to avoid sed's i-flag JOBTMPFILE=$(${MKTEMP} ${HERE}/.${JOBDIR}_XXXXXX.${JOBEXT}) # disable glob in Shell #set -f for CLEAN in ${CLEANUP}; do CLEAN_CHECK="${CLEAN_CHECK}/^#SBATCH[[:space:]][[:space:]]*${CLEAN}\([[:space:]=][[:space:]=]*\|$\)/p;" CLEAN_CLEAN="${CLEAN_CLEAN}/^#SBATCH[[:space:]][[:space:]]*${CLEAN}\([[:space:]=][[:space:]=]*\|$\)/d;" done CLEAN_CHECK="${CLEAN_CHECK}/^LIBXSMM_TARGET=/p;" CLEAN_CLEAN="${CLEAN_CLEAN}/^LIBXSMM_TARGET=/d;" COUNT_TOTAL=0 COUNT_CLEAN=0 for JOBFILE in $(ls -1 ${HERE}/${JOBDIR}/*.${JOBEXT}); do if [ "$(${SED} -n "${CLEAN_CHECK}" ${JOBFILE})" ]; then echo "Cleaning ${JOBFILE}..." 
${SED} "${CLEAN_CLEAN}" ${JOBFILE} > ${JOBTMPFILE} ${CP} ${JOBTMPFILE} ${JOBFILE} COUNT_CLEAN=$((COUNT_CLEAN+1)) fi COUNT_TOTAL=$((COUNT_TOTAL+1)) done ${RM} -f ${JOBTMPFILE} if [ "0" != "${COUNT_CLEAN}" ]; then echo "Successfully cleaned ${COUNT_CLEAN} of ${COUNT_TOTAL} job files." else echo "Successfully completed (there was nothing to clean)." fi else >&2 echo "Error: missing prerequisites!" exit 1 fi libxsmm-1.17/samples/deeplearning/cnnlayer/layer_example_bf16.c000066400000000000000000000772511415223013700246540ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke, Hans Pabst, Dhiraj Kalamkar, * Ankush Mandal (Intel Corp.) 
******************************************************************************/ #include #include #include #include #include #if defined(_OPENMP) # include #endif #define USE_OVERWRITE /*#define USE_BWD_NO_FILTER_TRANSPOSE_OVERWRITE*/ /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *naive_input, *naive_input_save, *naive_output_save, *naive_filter, *naive_output, *naive_output_bp, *naive_output_fp, *naive_input_bp , *naive_filter_wu, *naive_input_tmp, *naive_libxsmm_output_f32, *naive_libxsmm_input_f32 ,*naive_libxsmm_filter_f32; libxsmm_bfloat16 *naive_input_bf16, *naive_input_bp_bf16, *naive_filter_bf16, *naive_output_bf16, *naive_output_bp_bf16, *naive_filter_wu_bf16; libxsmm_bfloat16 *input_libxsmm, *filter_libxsmm, *filtertr_libxsmm, *output_libxsmm, *naive_libxsmm_output, *naive_libxsmm_input, *naive_libxsmm_filter, *dinput_libxsmm, *doutput_libxsmm, *dfilter_libxsmm; int ifhp, ifwp, ofhp, ofwp, ofh, ofw; int stride_h, stride_w, pad_h, pad_w, pad_h_in, pad_w_in, pad_h_out, pad_w_out; naive_conv_t naive_param; void* scratch; size_t scratch_size; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int ifw = 14; /* input width, "W" */ int ifh = 18; /* input height, "H" */ int nImg = 32; /* mini-batch size, "N" */ int nIfm = 256; /* number of input feature maps, "C" */ int nOfm = 512; /* number of output feature maps, "K" */ int kh = 3; /* filter height, "R" */ int kw = 3; /* filter width, "S" */ int padh = 1; /* padding in input, height */ int padw = 1; /* padding in input, width */ int stride = 1; /* stride when accessing inputs */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */ 
char format = 'L'; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif int padding_mode = 0; /* padding mode */ unsigned long long l_start, l_end; double l_total = 0.0; double lpOps = 0.0; /* number of low precision operations */ int i; libxsmm_dnn_conv_desc conv_desc; libxsmm_dnn_layer* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_output; libxsmm_dnn_tensor* libxsmm_filter; libxsmm_dnn_tensor* libxsmm_filter_tr; libxsmm_dnn_tensor* libxsmm_dinput; libxsmm_dnn_tensor* libxsmm_doutput; libxsmm_dnn_tensor* libxsmm_dfilter; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters inpWidth inpHeight nImg nIfm nOfm kw kh pad stride type padding_mode\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) ifw = atoi(argv[i++]); if (argc > i) ifh = atoi(argv[i++]); if (argc > i) nImg = atoi(argv[i++]); if (argc > i) nIfm = atoi(argv[i++]); if (argc > i) nOfm = atoi(argv[i++]); if (argc > i) kw = atoi(argv[i++]); if (argc > i) kh = atoi(argv[i++]); if (argc > i) padw = atoi(argv[i++]); if (argc > i) padh = atoi(argv[i++]); if (argc > i) stride = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (argc > i) format = *(argv[i++]); if (argc > i) padding_mode = atoi(argv[i++]); if (type != 'A' && type != 'F' && type != 'B'&& type != 'U') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only), 'U' (WU 
only)\n"); return 0; } if (format != 'L') { printf("format needs to be 'L'\n"); return 0; } stride_w = stride; stride_h = stride; pad_w = padw; pad_h = padh; if (0 == padding_mode) { pad_h_in = 0; pad_w_in = 0; pad_h_out = 0; pad_w_out = 0; } else { /* TODO: change "1" to "0" if "padding_mode = -1" is acknowledged */ if (1 < padding_mode) pad_w = padding_mode; pad_h_in = pad_h; pad_w_in = pad_w; pad_h_out = pad_h; pad_w_out = pad_w; } /* deriving some values for naive code */ ofh = (ifh + 2 * pad_h - kh) / stride_h + 1; ofw = (ifw + 2 * pad_w - kw) / stride_w + 1; ifhp = ifh + 2 * pad_h_in; ifwp = ifw + 2 * pad_w_in; ofhp = ofh + 2 * pad_h_out; ofwp = ofw + 2 * pad_w_out; /* set struct for naive convolution */ naive_param.nImg = nImg; naive_param.nIfm = nIfm; naive_param.nOfm = nOfm; naive_param.ifhp = ifhp; naive_param.ifwp = ifwp; naive_param.ifh = ifh; naive_param.ifw = ifw; naive_param.ofhp = ofhp; naive_param.ofwp = ofwp; naive_param.ofh = ofh; naive_param.ofw = ofw; naive_param.pad_h = pad_h; naive_param.pad_w = pad_w; naive_param.pad_h_in = pad_h_in; naive_param.pad_w_in = pad_w_in; naive_param.pad_h_out = pad_h_out; naive_param.pad_w_out = pad_w_out; naive_param.kh = kh; naive_param.kw = kw; naive_param.stride_h = stride_h; naive_param.stride_w = stride_w; #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up Common #\n"); printf("##########################################\n"); printf("PARAMS: W:%d H:%d N:%d C:%d K:%d R:%d S:%d P:%d Q:%d STRIDE:%d\n", ifw, ifh, nImg, nIfm, nOfm, kw, kh, ofh, ofw, stride); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf(" InImg %dx%d Padded (%dx%d)\n", ifh, ifw, ifhp, ifwp); printf("OutImg %dx%d Padded (%dx%d)\n", ofh, ofw, ofhp, ofwp); 
printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nIfm*ifhp*ifwp*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nOfm*ofhp*ofwp*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nIfm*ifhp*ifwp* sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nOfm*ofhp*ofwp* sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("SIZE Weight : %10.2f MiB\n", (double)(nIfm*nOfm*kw*kh* sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); /* allocate data */ naive_input = (float* )libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float ), 2097152); naive_input_save = (float* )libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float ), 2097152); naive_input_tmp = (float* )libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float ), 2097152); naive_output = (float* )libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float ), 2097152); naive_output_fp = (float* )libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float ), 2097152); naive_output_save = (float* )libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float ), 2097152); naive_output_bp = (float* )libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float ), 2097152); naive_input_bp = (float* )libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float ), 2097152); naive_filter = (float* )libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float ), 2097152); naive_filter_wu = (float* )libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float ), 2097152); naive_input_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); naive_input_bp_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); naive_output_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); naive_output_bp_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( 
nImg*nOfm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); naive_filter_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(libxsmm_bfloat16), 2097152); naive_filter_wu_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_output = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_output_f32 = (float* )libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float ), 2097152); naive_libxsmm_input_f32 = (float* )libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float ), 2097152); naive_libxsmm_filter_f32 = (float* )libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float ), 2097152); naive_libxsmm_input = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_filter = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(libxsmm_bfloat16), 2097152); input_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); filter_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(libxsmm_bfloat16), 2097152); filtertr_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(libxsmm_bfloat16), 2097152); output_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); dinput_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); doutput_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); dfilter_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(libxsmm_bfloat16), 2097152); /* initialize data */ zero_buf(naive_input, nImg*nIfm*ifhp*ifwp); zero_buf(naive_output_bp, nImg*nOfm*ofhp*ofwp); if (padding_mode == 0 ) { init_buf(naive_input, nImg*nIfm*ifhp*ifwp, 0, 0); init_buf(naive_output_bp, 
nImg*nOfm*ofhp*ofwp, 0, 0); } else { float *naive_output_bp_tmp = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); init_buf(naive_input_tmp, nImg*nIfm*ifh*ifw, 0, 0); init_buf(naive_output_bp_tmp, nImg*nOfm*ofh*ofw, 0, 0); copy_internal_nchw( naive_input , naive_input_tmp, nImg, nIfm, ifh, ifw, pad_h, pad_w); copy_internal_nchw( naive_output_bp , naive_output_bp_tmp, nImg, nOfm, ofh, ofw, pad_h, pad_w); libxsmm_free(naive_output_bp_tmp); } #if defined(USE_FUSED_RELU_BWD) /* Initialize some entries with zeros */ for (i = 0; i < nImg*nIfm*ifhp*ifwp; i++ ) { if ( ((i%16) == 2) || ((i%16) == 3) || ((i%16) == 7) || ((i%16) == 14) ) { naive_input[i] = 0.0; } } #endif copy_buf(naive_input, naive_input_save, nImg*nIfm*ifhp*ifwp); copy_buf(naive_output_bp, naive_output_save, nImg*nOfm*ofhp*ofwp); init_buf(naive_filter, nIfm*nOfm*kh*kw, 0, 0); zero_buf(naive_output_fp, nImg*nOfm*ofhp*ofwp); zero_buf(naive_input_bp, nImg*nIfm*ifhp*ifwp); zero_buf(naive_filter_wu, nOfm*nIfm*kh*kw); /*zero_buf(output_libxsmm, nImg*nOfm*ofhp*ofwp); zero_buf(dinput_libxsmm, nImg*nIfm*ifhp*ifwp); zero_buf(naive_libxsmm_output, nImg*nOfm*ofhp*ofwp); zero_buf(naive_libxsmm_input, nImg*nIfm*ifhp*ifwp); zero_buf(naive_libxsmm_filter, nOfm*nIfm*kh*kw);*/ if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... #\n"); printf("##########################################\n"); /* run naive convolutions */ if (type == 'A' || type == 'F') { naive_conv_fp(&naive_param, naive_input, naive_output_fp, naive_filter, NULL); } /* run naive convolutions */ if (type == 'A' || type == 'B') { naive_conv_bp(&naive_param, naive_input_bp, naive_output_bp, naive_filter, naive_input_save); } /* run naive convolutions */ if (type == 'A' || type == 'U') { naive_conv_wu(&naive_param, naive_input_save, naive_output_save, naive_filter_wu); } printf("##########################################\n"); printf("# Computing Reference ... 
done #\n"); printf("##########################################\n"); } /* make things bf16 */ truncate_mask_fp32_bf16( naive_input, naive_input, nImg*nIfm*ifhp*ifwp ); truncate_mask_fp32_bf16( naive_input_bp, naive_input_bp, nImg*nIfm*ifhp*ifwp ); truncate_mask_fp32_bf16( naive_output_fp, naive_output_fp, nImg*nOfm*ofhp*ofwp ); truncate_mask_fp32_bf16( naive_output_bp, naive_output_bp, nImg*nOfm*ofhp*ofwp ); truncate_mask_fp32_bf16( naive_filter, naive_filter, nIfm*nOfm*kh*kw ); truncate_mask_fp32_bf16( naive_filter_wu, naive_filter_wu, nIfm*nOfm*kh*kw ); libxsmm_truncate_convert_f32_bf16( naive_input, naive_input_bf16, nImg*nIfm*ifhp*ifwp ); libxsmm_truncate_convert_f32_bf16( naive_input_bp, naive_input_bp_bf16, nImg*nIfm*ifhp*ifwp ); libxsmm_truncate_convert_f32_bf16( naive_output_fp, naive_output_bf16, nImg*nOfm*ofhp*ofwp ); libxsmm_truncate_convert_f32_bf16( naive_output_bp, naive_output_bp_bf16, nImg*nOfm*ofhp*ofwp ); libxsmm_truncate_convert_f32_bf16( naive_filter, naive_filter_bf16, nIfm*nOfm*kh*kw ); libxsmm_truncate_convert_f32_bf16( naive_filter_wu, naive_filter_wu_bf16, nIfm*nOfm*kh*kw ); printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ conv_desc.N = nImg; conv_desc.C = nIfm; conv_desc.H = ifh; conv_desc.W = ifw; conv_desc.K = nOfm; conv_desc.R = kh; conv_desc.S = kw; conv_desc.u = stride_h; conv_desc.v = stride_w; conv_desc.pad_h = pad_h; conv_desc.pad_w = pad_w; conv_desc.pad_h_in = pad_h_in; conv_desc.pad_w_in = pad_w_in; conv_desc.pad_h_out = pad_h_out; conv_desc.pad_w_out = pad_w_out; conv_desc.threads = nThreads; conv_desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; conv_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; conv_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; conv_desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; #if defined(USE_BWD_NO_FILTER_TRANSPOSE_OVERWRITE) conv_desc.options = 
LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE_OVERWRITE; #elif defined(USE_OVERWRITE) conv_desc.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE; #endif conv_desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; libxsmm_handle = libxsmm_dnn_create_conv_layer( conv_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); printf("inner activation blocking: %u\n", libxsmm_layout->dim_size[0] ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = libxsmm_dnn_link_tensor( libxsmm_layout, dinput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_doutput = libxsmm_dnn_link_tensor( libxsmm_layout, doutput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_filter = libxsmm_dnn_link_tensor( libxsmm_layout, filter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status 
); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER_TRANS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_filter_tr = libxsmm_dnn_link_tensor( libxsmm_layout, filtertr_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dfilter = libxsmm_dnn_link_tensor( libxsmm_layout, dfilter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ /* we can also use the layout functions and set the data on our own external to the library, @TODO, we plan to add an example here */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_doutput, (void*)naive_output_bp_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_filter, (void*)naive_filter_bf16, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_zero_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_zero_tensor( libxsmm_dfilter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_zero_tensor( libxsmm_dinput ) ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_doutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_filter, 
LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_dfilter, LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_filter_tr, LIBXSMM_DNN_REGULAR_FILTER_TRANS ) ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ init_buf( scratch, scratch_size/4, 0, 0 ); if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##############################################\n"); printf("# Check Correctness - FWD (custom-Storage) #\n"); printf("##############################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); libxsmm_convert_bf16_f32( naive_libxsmm_output, naive_libxsmm_output_f32, nImg*nOfm*ofhp*ofwp ); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nOfm*ofhp*ofwp, 1, naive_output_fp, naive_libxsmm_output_f32, 0, 0); printf("L1 reference : %.25f\n", norms_fwd.l1_ref); printf("L1 test : %.25f\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); 
libxsmm_matdiff_reduce(&diff, &norms_fwd); } if ((type == 'A' || type == 'B') && (nIfm > 3) && LIBXSMM_NEQ(0, check)) { printf("##############################################\n"); printf("# Check Correctness - BWD (custom-Storage) #\n"); printf("##############################################\n"); #if defined(USE_BWD_NO_FILTER_TRANSPOSE_OVERWRITE) CHKERR_LIBXSMM_DNN( libxsmm_dnn_trans_reg_bf16_filter( libxsmm_handle ) ); #endif /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_dinput, (void*)naive_libxsmm_input, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); libxsmm_convert_bf16_f32( naive_libxsmm_input, naive_libxsmm_input_f32, nImg*nIfm*ifhp*ifwp ); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nIfm*ifhp*ifwp, 1, naive_input_bp, naive_libxsmm_input_f32, 0, 0); printf("L1 reference : %.25f\n", norms_bwd.l1_ref); printf("L1 test : %.25f\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ((type == 'A' || type == 'U') && LIBXSMM_NEQ(0, check)) { printf("##############################################\n"); printf("# Check Correctness - UPD (custom-Storage) #\n"); printf("##############################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, 
LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_dfilter, (void*)naive_libxsmm_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); libxsmm_convert_bf16_f32( naive_libxsmm_filter, naive_libxsmm_filter_f32, nOfm*nIfm*kh*kw); /* compare */ libxsmm_matdiff(&norms_upd, LIBXSMM_DATATYPE_F32, nOfm*nIfm*kh*kw, 1, naive_filter_wu, naive_libxsmm_filter_f32, 0, 0); printf("L1 reference : %.25f\n", norms_upd.l1_ref); printf("L1 test : %.25f\n", norms_upd.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd.linf_rel); printf("Check-norm : %.24f\n", norms_upd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); lpOps = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GOP = %.5g\n", lpOps*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GOPS = %.5g\n", (lpOps*1e-9)/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (lpOps*1e-9)/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, 
norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } if ( (type == 'A') || ((type == 'B') && (nIfm > 3)) ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); lpOps = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GOP = %.5g\n", lpOps*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GOPS = %.5g\n", (lpOps*1e-9)/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (lpOps*1e-9)/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst, norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel); } if (type == 'A' || type == 'U') { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); lpOps = (double)nImg * (double)nIfm * 
(double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GOP = %.5g\n", lpOps*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GOPS = %.5g\n", (lpOps*1e-9)/l_total); printf("PERFDUMP,WU,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (lpOps*1e-9)/l_total, norms_upd.l1_ref, norms_upd.l1_tst, norms_upd.l2_abs, norms_upd.l2_rel, norms_upd.linf_abs, norms_upd.linf_rel, norms_upd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_doutput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dfilter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_conv_layer( libxsmm_handle ) ); /* deallocate data */ libxsmm_free( naive_input ); libxsmm_free( naive_input_save ); libxsmm_free( naive_input_tmp ); libxsmm_free( naive_output ); libxsmm_free( naive_output_fp ); libxsmm_free( 
naive_output_save ); libxsmm_free( naive_output_bp ); libxsmm_free( naive_input_bp ); libxsmm_free( naive_filter ); libxsmm_free( naive_filter_wu ); libxsmm_free( naive_input_bf16 ); libxsmm_free( naive_input_bp_bf16 ); libxsmm_free( naive_output_bf16 ); libxsmm_free( naive_output_bp_bf16 ); libxsmm_free( naive_filter_bf16 ); libxsmm_free( naive_filter_wu_bf16 ); libxsmm_free( naive_libxsmm_output ); libxsmm_free( naive_libxsmm_output_f32 ); libxsmm_free( naive_libxsmm_input_f32 ); libxsmm_free( naive_libxsmm_filter_f32 ); libxsmm_free( naive_libxsmm_input ); libxsmm_free( naive_libxsmm_filter ); libxsmm_free( input_libxsmm ); libxsmm_free( filter_libxsmm ); libxsmm_free( output_libxsmm ); libxsmm_free( dinput_libxsmm ); libxsmm_free( doutput_libxsmm ); libxsmm_free( dfilter_libxsmm ); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 100.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/cnnlayer/layer_example_bf16.vcxproj000066400000000000000000000550131415223013700261150ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 layer_example_bf16 {0176F1E0-2816-4D47-89A2-96337E1B5755} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/cnnlayer/layer_example_f32.c000066400000000000000000001677621415223013700245170ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Hans Pabst, Dhiraj Kalamkar, Rajkishore Barik (Intel Corp.) 
******************************************************************************/ #include #include #include #include #include #include #if defined(_OPENMP) # include #endif # define USE_OVERWRITE /*# define USE_BWD_NO_FILTER_TRANSPOSE_OVERWRITE*/ /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *naive_input, *naive_output, *naive_output_save, *naive_filter, *naive_filter_wu, *naive_output_bp, *naive_output_wu, *naive_libxsmm_output; float *naive_libxsmm_input, *naive_libxsmm_filter, *naive_input_save, *naive_filter_save, *naive_filter_kcrs; float *input_nhwc, *output_nhwc, *filter_rsck, *dinput_nhwc, *doutput_nhwc, *dfilter_rsck, *naive_output_nhwc, *naive_input_nhwc; float *input_libxsmm, *filter_libxsmm, *output_libxsmm, *dinput_libxsmm, *dfilter_libxsmm, *doutput_libxsmm, *filtertr_libxsmm; int ifhp, ifwp, ofhp, ofwp, ofh, ofw; int stride_h, stride_w, pad_h, pad_w, pad_h_in, pad_w_in, pad_h_out, pad_w_out; naive_conv_t naive_param; void* scratch; size_t scratch_size = 0; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int ifw = 14; /* input width, "W" */ int ifh = 20; /* input height, "H" */ int nImg = 32; /* mini-batch size, "N" */ int nIfm = 256; /* number of input feature maps, "C" */ int nOfm = 512; /* number of output feature maps, "K" */ int kh = 3; /* filter height, "R" */ int kw = 3; /* filter width, "S" */ int padh = 0; /* padding in input, height */ int padw = 0; /* padding in input, width */ int stride = 1; /* stride when accessing inputs */ int padding_mode = 0; /* padding mode */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */ char format = 'A'; /* 'A': ALL, 'L': 
LIBXSMM, 'T': Tensorflow, 'M', Mixed */ const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double flops = 0.0; int i; libxsmm_dnn_conv_desc conv_desc; libxsmm_dnn_layer* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_output; libxsmm_dnn_tensor* libxsmm_filter; libxsmm_dnn_tensor* libxsmm_dinput; libxsmm_dnn_tensor* libxsmm_doutput; libxsmm_dnn_tensor* libxsmm_dfilter; libxsmm_dnn_tensor* libxsmm_filter_tr; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters inpWidth inpHeight nImg nIfm nOfm kw kh pad stride type format padding_mode\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) ifw = atoi(argv[i++]); if (argc > i) ifh = atoi(argv[i++]); if (argc > i) nImg = atoi(argv[i++]); if (argc > i) nIfm = atoi(argv[i++]); if (argc > i) nOfm = atoi(argv[i++]); if (argc > i) kw = atoi(argv[i++]); if (argc > i) kh = atoi(argv[i++]); if (argc > i) padw = atoi(argv[i++]); if (argc > i) padh = atoi(argv[i++]); if (argc > i) stride = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (argc > i) format = *(argv[i++]); if (argc > i) padding_mode = atoi(argv[i++]); if (type != 'A' && type != 'F' && type != 'B' && type != 'U') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only), 'U' (WU only)\n"); return 0; } stride_w = stride; stride_h = stride; 
pad_w = padw; pad_h = padh; if (0 == padding_mode) { pad_h_in = 0; pad_w_in = 0; pad_h_out = 0; pad_w_out = 0; } else { /* TODO: change "1" to "0" if "padding_mode = -1" is acknowledged */ if (1 < padding_mode) pad_w = padding_mode; pad_h_in = pad_h; pad_w_in = pad_w; pad_h_out = pad_h; pad_w_out = pad_w; } /* deriving some values for naive code */ ofh = (ifh + 2 * pad_h - kh) / stride_h + 1; ofw = (ifw + 2 * pad_w - kw) / stride_w + 1; ifhp = ifh + 2 * pad_h_in; ifwp = ifw + 2 * pad_w_in; ofhp = ofh + 2 * pad_h_out; ofwp = ofw + 2 * pad_w_out; /* set struct for naive convolution */ naive_param.nImg = nImg; naive_param.nIfm = nIfm; naive_param.nOfm = nOfm; naive_param.ifhp = ifhp; naive_param.ifwp = ifwp; naive_param.ofhp = ofhp; naive_param.ofwp = ofwp; naive_param.ifh = ifh; naive_param.ifw = ifw; naive_param.ofh = ofh; naive_param.ofw = ofw; naive_param.pad_h = pad_h; naive_param.pad_w = pad_w; naive_param.pad_h_in = pad_h_in; naive_param.pad_w_in = pad_w_in; naive_param.pad_h_out = pad_h_out; naive_param.pad_w_out = pad_w_out; naive_param.kh = kh; naive_param.kw = kw; naive_param.stride_h = stride_h; naive_param.stride_w = stride_w; #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: W:%d H:%d N:%d C:%d K:%d R:%d S:%d P:%d Q:%d STRIDE:%d\n", ifw, ifh, nImg, nIfm, nOfm, kw, kh, ofh, ofw, stride); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf(" InImg %dx%d Padded (%dx%d)\n", ifh, ifw, ifhp, ifwp); printf("OutImg %dx%d Padded (%dx%d)\n", ofh, ofw, ofhp, ofwp); printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nIfm*ifhp*ifwp*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Output (MB): 
%10.2f MiB\n", (double)(nImg*nOfm*ofhp*ofwp*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nIfm*ifhp*ifwp* sizeof(float))/(1024.0*1024.0) ); printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nOfm*ofhp*ofwp* sizeof(float))/(1024.0*1024.0) ); printf("SIZE Weight : %10.2f MiB\n", (double)(nIfm*nOfm*kw*kh* sizeof(float))/(1024.0*1024.0) ); #if defined(USE_OVERWRITE) printf("Using Overwrite Option\n"); #endif /* allocate data */ naive_input = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); naive_input_save = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); naive_output = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); naive_output_save = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); naive_output_bp = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); naive_output_wu = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); naive_libxsmm_output = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); naive_libxsmm_input = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); naive_filter = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); naive_filter_save = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); naive_filter_wu = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); naive_filter_kcrs = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); naive_libxsmm_filter = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); input_nhwc = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); doutput_nhwc = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); dinput_nhwc = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); output_nhwc = 
(float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); naive_output_nhwc = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); naive_input_nhwc = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); filter_rsck = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); dfilter_rsck = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); input_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); filter_libxsmm = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); output_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); dinput_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); dfilter_libxsmm = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); doutput_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); filtertr_libxsmm = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); /* initialize data */ if (padding_mode == 0 ) { init_buf(naive_input, nImg*nIfm*ifhp*ifwp, 0, 0); } else { float *naive_input_tmp = (float*)libxsmm_aligned_scratch( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); init_buf(naive_input_tmp, nImg*nIfm*ifh*ifw, 0, 0); copy_internal_nchw( naive_input , naive_input_tmp, nImg, nIfm, ifh, ifw, pad_h, pad_w); libxsmm_free(naive_input_tmp); } if (padding_mode == 0 ) { init_buf(naive_output_bp, nImg*nOfm*ofhp*ofwp, 0, 0); init_buf(naive_output_wu, nImg*nOfm*ofhp*ofwp, 0, 0); } else { float *naive_output_bp_tmp = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); float *naive_output_wu_tmp = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); init_buf(naive_output_bp_tmp, nImg*nOfm*ofh*ofw, 0, 0); copy_internal_nchw( naive_output_bp , naive_output_bp_tmp, nImg, nOfm, ofh, ofw, pad_h, pad_w); 
init_buf(naive_output_wu_tmp, nImg*nOfm*ofh*ofw, 0, 0); copy_internal_nchw( naive_output_wu , naive_output_wu_tmp, nImg, nOfm, ofh, ofw, pad_h, pad_w); libxsmm_free(naive_output_bp_tmp); libxsmm_free(naive_output_wu_tmp); } set_zeropad_nchw(naive_input, nImg, nIfm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_output_bp, nImg, nOfm, ofhp, ofwp, pad_h_out, pad_w_out); set_zeropad_nchw(naive_output_wu, nImg, nOfm, ofhp, ofwp, pad_h_out, pad_w_out); copy_buf(naive_input, naive_input_save, nImg*nIfm*ifhp*ifwp); zero_buf(naive_output_save, nImg*nOfm*ofhp*ofwp); if (padding_mode == 0 ) { init_buf(naive_output, nImg*nOfm*ofhp*ofwp, 0, 0); } else { float *naive_output_tmp = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); init_buf(naive_output_tmp, nImg*nOfm*ofh*ofw, 0, 0); libxsmm_free(naive_output_tmp); } set_zeropad_nchw(naive_output, nImg, nOfm, ofhp, ofwp, pad_h_out, pad_w_out); copy_buf(naive_output, naive_output_save, nImg*nOfm*ofhp*ofwp); zero_buf(naive_libxsmm_output, nImg*nOfm*ofhp*ofwp); zero_buf(naive_libxsmm_input, nImg*nIfm*ifhp*ifwp); init_buf(naive_filter, nOfm*nIfm*kh*kw, 0, 0); copy_buf(naive_filter, naive_filter_wu, nOfm*nIfm*kh*kw); zero_buf(naive_libxsmm_filter, nOfm*nIfm*kh*kw); naive_copy_NCHW_to_NHWC(naive_input, input_nhwc, nImg, ifhp, ifwp, nIfm); zero_buf(output_nhwc, nImg*nOfm*ofhp*ofwp); zero_buf(naive_output_nhwc, nImg*nOfm*ofhp*ofwp); zero_buf(naive_input_nhwc, nImg*nIfm*ifhp*ifwp); naive_copy_KCRS_to_RSCK(naive_filter, filter_rsck, kh, kw, nIfm, nOfm); /* first touch LIBXSMM */ zero_buf( input_libxsmm , nImg*nIfm*ifhp*ifwp ); zero_buf( filter_libxsmm , nOfm*nIfm*kh*kw ); zero_buf( output_libxsmm , nImg*nOfm*ofhp*ofwp ); zero_buf( dinput_libxsmm , nImg*nIfm*ifhp*ifwp ); zero_buf( dfilter_libxsmm , nOfm*nIfm*kh*kw ); zero_buf( doutput_libxsmm , nImg*nOfm*ofhp*ofwp ); zero_buf( filtertr_libxsmm , nOfm*nIfm*kh*kw ); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); 
printf("# Computing Reference ... #\n"); printf("##########################################\n"); if (type == 'A' || type == 'F') { #ifdef USE_OVERWRITE zero_buf(naive_output, nImg*nOfm*ofhp*ofwp); #endif naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter, NULL); } if ( (type == 'A' || type == 'B') && (nIfm > 3) ) { #ifdef USE_OVERWRITE zero_buf(naive_input, nImg*nIfm*ifhp*ifwp); #endif naive_conv_bp(&naive_param, naive_input, naive_output_bp, naive_filter, naive_input_save); } if (type == 'A' || type == 'U') { /* NB: We reuse naive_input_save for weight update because the input should not * have been modified between forward propagation and weight update; it further * helps in exploiting reuse to converted data. */ #ifdef USE_OVERWRITE zero_buf(naive_filter_wu, nOfm*nIfm*kh*kw); #endif naive_conv_wu(&naive_param, naive_input_save, naive_output_wu, naive_filter_wu); } printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } if (format == 'A' || format == 'L') { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ conv_desc.N = nImg; conv_desc.C = nIfm; conv_desc.H = ifh; conv_desc.W = ifw; conv_desc.K = nOfm; conv_desc.R = kh; conv_desc.S = kw; conv_desc.u = stride_h; conv_desc.v = stride_w; conv_desc.pad_h = pad_h; conv_desc.pad_w = pad_w; conv_desc.pad_h_in = pad_h_in; conv_desc.pad_w_in = pad_w_in; conv_desc.pad_h_out = pad_h_out; conv_desc.pad_w_out = pad_w_out; conv_desc.threads = nThreads; conv_desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; conv_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; conv_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; conv_desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; #if defined(USE_BWD_NO_FILTER_TRANSPOSE_OVERWRITE) conv_desc.options = 
LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE_OVERWRITE; #elif defined(USE_OVERWRITE) conv_desc.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE; #else conv_desc.options = LIBXSMM_DNN_CONV_OPTION_NONE; #endif conv_desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; libxsmm_handle = libxsmm_dnn_create_conv_layer( conv_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = libxsmm_dnn_link_tensor( libxsmm_layout, dinput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_doutput = libxsmm_dnn_link_tensor( libxsmm_layout, doutput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_filter = libxsmm_dnn_link_tensor( libxsmm_layout, filter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dfilter = libxsmm_dnn_link_tensor( libxsmm_layout, dfilter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER_TRANS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_filter_tr = libxsmm_dnn_link_tensor( libxsmm_layout, filtertr_libxsmm, &status ); 
CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ /* we can also use the layout functions and set the data on our own external to the library, @TODO, we plan to add an example here */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input_save, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_output, (void*)naive_output_save, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_filter, (void*)naive_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); zero_buf(filtertr_libxsmm, nOfm*nIfm*kh*kw); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_doutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_dfilter, LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_filter_tr, LIBXSMM_DNN_REGULAR_FILTER_TRANS ) ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ init_buf( (float*)scratch, scratch_size/4, 0, 0 ); if ((type == 'A' || type == 'F') && 
LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nOfm*ofhp*ofwp, 1, naive_output, naive_libxsmm_output, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } if ( (type == 'A' || type == 'B') && (nIfm > 3) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* let's do some additional init such that we can run passes standalone */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_doutput, (void*)naive_output_bp, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_dinput, (void*)naive_input_save, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); #if defined(USE_BWD_NO_FILTER_TRANSPOSE_OVERWRITE) CHKERR_LIBXSMM_DNN( libxsmm_dnn_trans_reg_filter( libxsmm_handle ) ); #endif /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else 
const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_dinput, (void*)naive_libxsmm_input, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nIfm*ifhp*ifwp, 1, naive_input, naive_libxsmm_input, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ((type == 'A' || type == 'U') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* let's do some additional init such that we can run passes standalone */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input_save, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_doutput, (void*)naive_output_wu, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_dfilter, (void*)naive_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_dfilter, (void*)naive_libxsmm_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); /* compare */ libxsmm_matdiff(&norms_upd, LIBXSMM_DATATYPE_F32, nOfm*nIfm*kh*kw, 1, 
naive_filter_wu, naive_libxsmm_filter, 0, 0); printf("L1 reference : %.25g\n", norms_upd.l1_ref); printf("L1 test : %.25g\n", norms_upd.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd.linf_rel); printf("Check-norm : %.24f\n", norms_upd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (flops*1e-9)/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } if ( (type == 'A' || type == 'B') && (nIfm > 3) ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = 
libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (flops*1e-9)/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst, norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel); } if (type == 'A' || type == 'U') { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,WU,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", 
LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (flops*1e-9)/l_total, norms_upd.l1_ref, norms_upd.l1_tst, norms_upd.l2_abs, norms_upd.l2_rel, norms_upd.linf_abs, norms_upd.linf_rel, norms_upd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_doutput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dfilter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter_tr ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_conv_layer( libxsmm_handle ) ); } if (format == 'A' || format == 'T') { printf("\n"); printf("##########################################\n"); printf("# Setting Up - (NHWC/RSCK-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ conv_desc.N = nImg; conv_desc.C = nIfm; conv_desc.H = ifh; conv_desc.W = ifw; conv_desc.K = nOfm; conv_desc.R = kh; 
conv_desc.S = kw; conv_desc.u = stride_h; conv_desc.v = stride_w; conv_desc.pad_h = pad_h; conv_desc.pad_w = pad_w; conv_desc.pad_h_in = pad_h_in; conv_desc.pad_w_in = pad_w_in; conv_desc.pad_h_out = pad_h_out; conv_desc.pad_w_out = pad_w_out; conv_desc.threads = nThreads; conv_desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; conv_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC; conv_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_RSCK; #ifdef USE_OVERWRITE conv_desc.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE; #else conv_desc.options = LIBXSMM_DNN_CONV_OPTION_NONE; #endif conv_desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; libxsmm_handle = libxsmm_dnn_create_conv_layer( conv_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers and filter */ naive_copy_NCHW_to_NHWC(naive_input_save, input_nhwc, nImg, ifhp, ifwp, nIfm); naive_copy_NCHW_to_NHWC(naive_output_save, output_nhwc, nImg, ofhp, ofwp, nOfm); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_nhwc, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = libxsmm_dnn_link_tensor( libxsmm_layout, dinput_nhwc, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_nhwc, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_doutput = libxsmm_dnn_link_tensor( libxsmm_layout, doutput_nhwc, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_filter = 
libxsmm_dnn_link_tensor( libxsmm_layout, filter_rsck, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dfilter = libxsmm_dnn_link_tensor( libxsmm_layout, dfilter_rsck, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_doutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_dfilter, LIBXSMM_DNN_GRADIENT_FILTER ) ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ init_buf( (float*)scratch, scratch_size/4, 0, 0 ); if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (NHWC/RSCK-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy output data into NCHW storage in user code */ 
naive_copy_NHWC_to_NCHW(output_nhwc, naive_output_nhwc, nImg, ofhp, ofwp, nOfm); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nOfm*ofhp*ofwp, 1, naive_output, naive_output_nhwc, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } if ( (type == 'A' || type == 'B') && (nIfm > 3) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (NHWC/RSCK-Storage) #\n"); printf("##########################################\n"); /* let's do some additional init such that we can run passes standalone */ naive_copy_NCHW_to_NHWC(naive_output_bp, doutput_nhwc, nImg, ofhp, ofwp, nOfm); naive_copy_NCHW_to_NHWC(naive_input_save, dinput_nhwc, nImg, ifhp, ifwp, nIfm); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy input data into NCHW storage in user code */ naive_copy_NHWC_to_NCHW(dinput_nhwc, naive_input_nhwc, nImg, ifhp, ifwp, nIfm); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nIfm*ifhp*ifwp, 1, naive_input, naive_input_nhwc, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", 
norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ((type == 'A' || type == 'U') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - UPD (NHWC/RSCK-Storage) #\n"); printf("##########################################\n"); /* let's do some additional init such that we can run passes standalone */ naive_copy_NCHW_to_NHWC(naive_input_save, input_nhwc, nImg, ifhp, ifwp, nIfm); naive_copy_NCHW_to_NHWC(naive_output_wu, doutput_nhwc, nImg, ofhp, ofwp, nOfm); naive_copy_KCRS_to_RSCK(naive_filter, dfilter_rsck, kh, kw, nIfm, nOfm); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* copy input data into KCRS storage in user code */ naive_copy_RSCK_to_KCRS(dfilter_rsck, naive_filter_kcrs, kh, kw, nIfm, nOfm); /* compare */ libxsmm_matdiff(&norms_upd, LIBXSMM_DATATYPE_F32, nOfm*nIfm*kh*kw, 1, naive_filter_wu, naive_filter_kcrs, 0, 0); printf("L1 reference : %.25g\n", norms_upd.l1_ref); printf("L1 test : %.25g\n", norms_upd.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd.linf_rel); printf("Check-norm : %.24f\n", norms_upd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (NHWC/RSCK-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; 
#endif libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GFLOP (NHWC,RSCK) = %.5g\n", flops*1e-9/(double)iters); printf("fp time (NHWC,RSCK) = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS (NHWC,RSCK) = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP-NHWC-RSCK,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (flops*1e-9)/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } if ( (type == 'A' || type == 'B') && (nIfm > 3) ) { printf("##########################################\n"); printf("# Performance - BWD (NHWC/RSCK-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GFLOP (NHWC,RSCK) = %.5g\n", flops*1e-9/(double)iters); printf("fp time (NHWC,RSCK) = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS (NHWC,RSCK) = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP-NHWC-RSCK,BP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, 
((double)(l_total/iters)), (flops*1e-9)/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst, norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel); } if (type == 'A' || type == 'U') { printf("##########################################\n"); printf("# Performance - UPD (NHWC/RSCK-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GFLOP (NHWC,RSCK) = %.5g\n", flops*1e-9/(double)iters); printf("fp time (NHWC,RSCK) = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS (NHWC,RSCK) = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP-NHWC-RSCK,WU,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (flops*1e-9)/l_total, norms_upd.l1_ref, norms_upd.l1_tst, norms_upd.l2_abs, norms_upd.l2_rel, norms_upd.linf_abs, norms_upd.linf_rel, norms_upd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) 
); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_doutput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dfilter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_conv_layer( libxsmm_handle ) ); } if (format == 'A' || format == 'M') { printf("\n"); printf("##########################################\n"); printf("# Setting Up - (NHWC/custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ conv_desc.N = nImg; conv_desc.C = nIfm; conv_desc.H = ifh; conv_desc.W = ifw; conv_desc.K = nOfm; conv_desc.R = kh; conv_desc.S = kw; conv_desc.u = stride_h; conv_desc.v = stride_w; conv_desc.pad_h = pad_h; conv_desc.pad_w = pad_w; conv_desc.pad_h_in = pad_h_in; conv_desc.pad_w_in = pad_w_in; conv_desc.pad_h_out = pad_h_out; conv_desc.pad_w_out = pad_w_out; conv_desc.threads = nThreads; conv_desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; conv_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC; conv_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; #ifdef USE_OVERWRITE conv_desc.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE; #else conv_desc.options = LIBXSMM_DNN_CONV_OPTION_NONE; #endif conv_desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; libxsmm_handle = libxsmm_dnn_create_conv_layer( conv_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers and filter */ naive_copy_NCHW_to_NHWC(naive_output_save, output_nhwc, nImg, ofhp, ofwp, nOfm); 
naive_copy_NCHW_to_NHWC(naive_input_save, input_nhwc, nImg, ifhp, ifwp, nIfm); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_nhwc, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = libxsmm_dnn_link_tensor( libxsmm_layout, dinput_nhwc, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_nhwc, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_doutput = libxsmm_dnn_link_tensor( libxsmm_layout, doutput_nhwc, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_filter = libxsmm_dnn_link_tensor( libxsmm_layout, filter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dfilter = libxsmm_dnn_link_tensor( libxsmm_layout, dfilter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_filter, (void*)naive_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_doutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_dfilter, LIBXSMM_DNN_GRADIENT_FILTER ) ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ init_buf( (float*)scratch, scratch_size/4, 0, 0 ); if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD(NHWC/custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy output data into NCHW storage in user code */ naive_copy_NHWC_to_NCHW(output_nhwc, naive_output_nhwc, nImg, ofhp, ofwp, nOfm); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nOfm*ofhp*ofwp, 1, naive_output, naive_output_nhwc, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } if ( (type == 'A' || type == 'B') && (nIfm > 3) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# 
Correctness - BWD(NHWC/custom-Storage) #\n"); printf("##########################################\n"); /* let's do some additional init such that we can run passes standalone */ naive_copy_NCHW_to_NHWC(naive_output_bp, doutput_nhwc, nImg, ofhp, ofwp, nOfm); naive_copy_NCHW_to_NHWC(naive_input_save, dinput_nhwc, nImg, ifhp, ifwp, nIfm); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy input data into NCHW storage in user code */ naive_copy_NHWC_to_NCHW(dinput_nhwc, naive_input_nhwc, nImg, ifhp, ifwp, nIfm); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nIfm*ifhp*ifwp, 1, naive_input, naive_input_nhwc, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ((type == 'A' || type == 'U') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - UPD(NHWC/custom-Storage) #\n"); printf("##########################################\n"); /* let's do some additional init such that we can run passes standalone */ naive_copy_NCHW_to_NHWC(naive_input_save, input_nhwc, nImg, ifhp, ifwp, nIfm); naive_copy_NCHW_to_NHWC(naive_output_wu, doutput_nhwc, nImg, ofhp, ofwp, nOfm); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_dfilter, (void*)naive_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = 
omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_dfilter, (void*)naive_libxsmm_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); /* compare */ libxsmm_matdiff(&norms_upd, LIBXSMM_DATATYPE_F32, nOfm*nIfm*kh*kw, 1, naive_filter_wu, naive_libxsmm_filter, 0, 0); printf("L1 reference : %.25g\n", norms_upd.l1_ref); printf("L1 test : %.25g\n", norms_upd.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd.linf_rel); printf("Check-norm : %.24f\n", norms_upd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD(NHWC/custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GFLOP (NHWC,custom) = %.5g\n", flops*1e-9/(double)iters); printf("fp time (NHWC,custom) = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS (NHWC,custom) = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP-NHWC-custom,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), 
(flops*1e-9)/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } if ( (type == 'A' || type == 'B') && (nIfm > 3) ) { printf("##########################################\n"); printf("# Performance - BWD(NHWC/custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GFLOP (NHWC,custom) = %.5g\n", flops*1e-9/(double)iters); printf("fp time (NHWC,custom) = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS (NHWC,custom) = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP-NHWC-custom,BP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (flops*1e-9)/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst, norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel); } if (type == 'A' || type == 'U') { printf("##########################################\n"); printf("# Performance - UPD(NHWC/custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif libxsmm_dnn_execute_st( libxsmm_handle, 
LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GFLOP (NHWC,custom) = %.5g\n", flops*1e-9/(double)iters); printf("fp time (NHWC,custom) = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS (NHWC,custom) = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP-NHWC-custom,WU,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (flops*1e-9)/l_total, norms_upd.l1_ref, norms_upd.l1_tst, norms_upd.l2_abs, norms_upd.l2_rel, norms_upd.linf_abs, norms_upd.linf_rel, norms_upd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_doutput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dfilter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_conv_layer( 
libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(naive_input); libxsmm_free(naive_input_save); libxsmm_free(naive_output); libxsmm_free(naive_output_save); libxsmm_free(naive_output_bp); libxsmm_free(naive_output_wu); libxsmm_free(naive_libxsmm_output); libxsmm_free(naive_libxsmm_input); libxsmm_free(naive_filter); libxsmm_free(naive_filter_save); libxsmm_free(naive_filter_wu); libxsmm_free(naive_filter_kcrs); libxsmm_free(naive_libxsmm_filter); libxsmm_free(input_nhwc); libxsmm_free(output_nhwc); libxsmm_free(dinput_nhwc); libxsmm_free(doutput_nhwc); libxsmm_free(naive_output_nhwc); libxsmm_free(naive_input_nhwc); libxsmm_free(filter_rsck); libxsmm_free(dfilter_rsck); libxsmm_free(input_libxsmm); libxsmm_free(filter_libxsmm); libxsmm_free(output_libxsmm); libxsmm_free(dinput_libxsmm); libxsmm_free(dfilter_libxsmm); libxsmm_free(doutput_libxsmm); libxsmm_free(filtertr_libxsmm); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 
1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/cnnlayer/layer_example_f32.vcxproj000066400000000000000000000550111415223013700257470ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 layer_example_f32 {63B474A0-5BC5-4212-888E-88D30CAA791B} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false 
Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 
$(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/cnnlayer/layer_example_i8i32.c000066400000000000000000000407311415223013700247450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas, Hans Pabst, Dhiraj Kalamkar, Ankush Mandal (Intel Corp.) ******************************************************************************/ #include #include #include #include #if defined(_OPENMP) # include #endif #define USE_OVERWRITE /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { unsigned char *naive_input, *naive_input_tmp; char *naive_filter; int *naive_output_fp; int *naive_libxsmm_output; unsigned char *input_libxsmm; char *filter_libxsmm; int *output_libxsmm; int ifhp, ifwp, ofhp, ofwp, ofh, ofw; int stride_h, stride_w, pad_h, pad_w, pad_h_in, pad_w_in, pad_h_out, pad_w_out; naive_conv_t naive_param; void* scratch; size_t scratch_size; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int ifw = 14; /* input width, "W" */ int ifh = 18; /* input height, "H" */ int nImg = 32; /* mini-batch size, "N" */ int nIfm = 256; /* number of input feature maps, "C" */ int nOfm = 512; /* number of output feature maps, "K" */ int kh = 3; /* filter height, "R" */ int kw = 3; /* filter width, "S" */ int padh = 1; /* padding in input, height */ int padw = 1; /* padding in input, width */ int stride = 1; /* stride when accessing inputs */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */ char format = 'L'; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 
1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif int padding_mode = 0; /* padding mode */ unsigned long long l_start, l_end; double l_total = 0.0; double lpOps = 0.0; /* number of low precision operations */ int i; libxsmm_dnn_conv_desc conv_desc; libxsmm_dnn_layer* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_output; libxsmm_dnn_tensor* libxsmm_filter; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters inpWidth inpHeight nImg nIfm nOfm kw kh pad stride type padding_mode\n", argv[0]); return 0; } srand(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) ifw = atoi(argv[i++]); if (argc > i) ifh = atoi(argv[i++]); if (argc > i) nImg = atoi(argv[i++]); if (argc > i) nIfm = atoi(argv[i++]); if (argc > i) nOfm = atoi(argv[i++]); if (argc > i) kw = atoi(argv[i++]); if (argc > i) kh = atoi(argv[i++]); if (argc > i) padw = atoi(argv[i++]); if (argc > i) padh = atoi(argv[i++]); if (argc > i) stride = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (argc > i) format = *(argv[i++]); if (argc > i) padding_mode = atoi(argv[i++]); if (type != 'A' && type != 'F' && type != 'B' && type != 'U') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only), 'U' (WU only)\n"); return 0; } if (format != 'L') { printf("format needs to be 'L'\n"); return 0; } stride_w = stride; stride_h = stride; pad_w = padw; pad_h = padh; if (0 == padding_mode) { pad_h_in = 0; pad_w_in = 0; pad_h_out = 0; pad_w_out = 0; } else { /* TODO: change "1" to "0" if "padding_mode = -1" is acknowledged */ if (1 < padding_mode) pad_w = padding_mode; pad_h_in = pad_h; 
pad_w_in = pad_w; pad_h_out = pad_h; pad_w_out = pad_w; } /* deriving some values for naive code */ ofh = (ifh + 2 * pad_h - kh) / stride_h + 1; ofw = (ifw + 2 * pad_w - kw) / stride_w + 1; ifhp = ifh + 2 * pad_h_in; ifwp = ifw + 2 * pad_w_in; ofhp = ofh + 2 * pad_h_out; ofwp = ofw + 2 * pad_w_out; /* set struct for naive convolution */ naive_param.nImg = nImg; naive_param.nIfm = nIfm; naive_param.nOfm = nOfm; naive_param.ifhp = ifhp; naive_param.ifwp = ifwp; naive_param.ifh = ifh; naive_param.ifw = ifw; naive_param.ofhp = ofhp; naive_param.ofwp = ofwp; naive_param.ofh = ofh; naive_param.ofw = ofw; naive_param.pad_h = pad_h; naive_param.pad_w = pad_w; naive_param.pad_h_in = pad_h_in; naive_param.pad_w_in = pad_w_in; naive_param.pad_h_out = pad_h_out; naive_param.pad_w_out = pad_w_out; naive_param.kh = kh; naive_param.kw = kw; naive_param.stride_h = stride_h; naive_param.stride_w = stride_w; /* print some summary */ printf("##########################################\n"); printf("# Setting Up Common #\n"); printf("##########################################\n"); printf("PARAMS: W:%d H:%d N:%d C:%d K:%d R:%d S:%d P:%d Q:%d STRIDE:%d\n", ifw, ifh, nImg, nIfm, nOfm, kw, kh, ofh, ofw, stride); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf(" InImg %dx%d Padded (%dx%d)\n", ifh, ifw, ifhp, ifwp); printf("OutImg %dx%d Padded (%dx%d)\n", ofh, ofw, ofhp, ofwp); printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nIfm*ifhp*ifwp*sizeof(unsigned char))/(1024.0*1024.0) ); printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nOfm*ofhp*ofwp*sizeof(int))/(1024.0*1024.0) ); printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nIfm*ifhp*ifwp* sizeof(unsigned char))/(1024.0*1024.0) ); printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nOfm*ofhp*ofwp* sizeof(int))/(1024.0*1024.0) ); printf("SIZE Weight : %10.2f MiB\n", (double)(nIfm*nOfm*kw*kh* sizeof(char))/(1024.0*1024.0) ); /* allocate data */ naive_input = 
(unsigned char*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(unsigned char), 2097152); naive_output_fp = (int* )libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(int), 2097152); naive_libxsmm_output = (int* )libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(int), 2097152); naive_filter = (char*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(char), 2097152); input_libxsmm = (unsigned char*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(unsigned char), 2097152); filter_libxsmm = (char*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(char), 2097152); output_libxsmm = (int*) libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(int), 2097152); /* initialize data */ naive_input_tmp = (unsigned char*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(unsigned char), 2097152); zero_buf_uint8(naive_input, nImg*nIfm*ifhp*ifwp); if (padding_mode == 0 ) { init_buf_uint8(naive_input, nImg*nIfm*ifhp*ifwp, 0, 0); } else { init_buf_uint8(naive_input_tmp, nImg*nIfm*ifh*ifw, 0, 0); copy_internal_nchw_uint8( naive_input , naive_input_tmp, nImg, nIfm, ifh, ifw, pad_h, pad_w); } init_buf_int8(naive_filter, nOfm*nIfm*kh*kw, 0, 0); zero_buf_int32(naive_output_fp, nImg*nOfm*ofhp*ofwp); zero_buf_int32(output_libxsmm, nImg*nOfm*ofhp*ofwp); zero_buf_int32(naive_libxsmm_output, nImg*nOfm*ofhp*ofwp); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... #\n"); printf("##########################################\n"); /* run naive convolutions */ if (type == 'A' || type == 'F') { naive_conv_fp_int8int32(&naive_param, naive_input, naive_output_fp, naive_filter); } printf("##########################################\n"); printf("# Computing Reference ... 
done #\n"); printf("##########################################\n"); } printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ conv_desc.N = nImg; conv_desc.C = nIfm; conv_desc.H = ifh; conv_desc.W = ifw; conv_desc.K = nOfm; conv_desc.R = kh; conv_desc.S = kw; conv_desc.u = stride_h; conv_desc.v = stride_w; conv_desc.pad_h = pad_h; conv_desc.pad_w = pad_w; conv_desc.pad_h_in = pad_h_in; conv_desc.pad_w_in = pad_w_in; conv_desc.pad_h_out = pad_h_out; conv_desc.pad_w_out = pad_w_out; conv_desc.threads = nThreads; conv_desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; conv_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; conv_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; conv_desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; conv_desc.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE; conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_I8; conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_I32; libxsmm_handle = libxsmm_dnn_create_conv_layer( conv_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_filter = libxsmm_dnn_link_tensor( libxsmm_layout, filter_libxsmm, &status ); 
CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ /* we can also use the layout functions and set the data on our own external to the library, @TODO, we plan to add an example here */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_zero_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_filter, (void*)naive_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER ) ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##############################################\n"); printf("# Check Correctness - FWD (custom-Storage) #\n"); printf("##############################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, 
LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_I32, nImg*nOfm*ofhp*ofwp, 1, naive_output_fp, naive_libxsmm_output, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ for (i = 0; i < 10; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); lpOps = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GOP = %.5g\n", lpOps*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GOPS = %.5g\n", (lpOps*1e-9)/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (lpOps*1e-9)/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, 
norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_conv_layer( libxsmm_handle ) ); /* deallocate data */ libxsmm_free(naive_input); libxsmm_free(naive_output_fp); libxsmm_free(naive_libxsmm_output); libxsmm_free(naive_filter); libxsmm_free(input_libxsmm); libxsmm_free(output_libxsmm); libxsmm_free(filter_libxsmm); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 
1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/cnnlayer/layer_example_i8i32.vcxproj000066400000000000000000000550151415223013700262170ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 layer_example_i8i32 {E5AF3D43-4860-4F28-8674-D641A954C7C9} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL 
false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 
$(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/cnnlayer/layer_example_i8i8.c000066400000000000000000000462611415223013700246740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas, Hans Pabst, Dhiraj Kalamkar, Ankush Mandal (Intel Corp.) ******************************************************************************/ #include #include #include #include #if defined(_OPENMP) # include #endif #define USE_OVERWRITE /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *naive_input_fp, *naive_output_fp, *naive_filter_fp, *naive_libxsmm_output_fp, *dq_naive_input, *dq_naive_filter; char *naive_filter_i8, *naive_output_i8, *naive_libxsmm_output, *filter_libxsmm, *output_libxsmm; unsigned char *naive_input_i8, *input_libxsmm; int ifhp, ifwp, ofhp, ofwp, ofh, ofw; int stride_h, stride_w, pad_h, pad_w, pad_h_in, pad_w_in, pad_h_out, pad_w_out; naive_conv_t naive_param; void* scratch; size_t scratch_size; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int ifw = 14; /* input width, "W" */ int ifh = 18; /* input height, "H" */ int nImg = 32; /* mini-batch size, "N" */ int nIfm = 256; /* number of input feature maps, "C" */ int nOfm = 512; /* number of output feature maps, "K" */ int kh = 3; /* filter height, "R" */ int kw = 3; /* filter width, "S" */ int padh = 1; /* padding in input, height */ int padw = 1; /* padding in input, width */ int stride = 1; /* stride when accessing inputs */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */ char format = 'L'; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 
1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif int padding_mode = 0; /* padding mode */ unsigned long long l_start, l_end; double l_total = 0.0; double lpOps = 0.0; /* number of low precision operations */ int i; libxsmm_dnn_conv_desc conv_desc; libxsmm_dnn_layer* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_output; libxsmm_dnn_tensor* libxsmm_filter; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, diff, norms_quant; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&diff); libxsmm_matdiff_clear(&norms_quant); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters inpWidth inpHeight nImg nIfm nOfm kw kh pad stride type padding_mode\n", argv[0]); return 0; } srand(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) ifw = atoi(argv[i++]); if (argc > i) ifh = atoi(argv[i++]); if (argc > i) nImg = atoi(argv[i++]); if (argc > i) nIfm = atoi(argv[i++]); if (argc > i) nOfm = atoi(argv[i++]); if (argc > i) kw = atoi(argv[i++]); if (argc > i) kh = atoi(argv[i++]); if (argc > i) padw = atoi(argv[i++]); if (argc > i) padh = atoi(argv[i++]); if (argc > i) stride = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (argc > i) format = *(argv[i++]); if (argc > i) padding_mode = atoi(argv[i++]); if (type != 'A' && type != 'F' && type != 'B' && type != 'U') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only), 'U' (WU only)\n"); return 0; } if (format != 'L') { printf("format needs to be 'L'\n"); return 0; } stride_w = stride; stride_h = stride; pad_w = padw; pad_h = padh; if (0 == padding_mode) { pad_h_in = 0; pad_w_in = 0; pad_h_out = 0; pad_w_out = 0; } else { /* TODO: change "1" to "0" if "padding_mode = -1" is acknowledged */ if (1 < 
padding_mode) pad_w = padding_mode; pad_h_in = pad_h; pad_w_in = pad_w; pad_h_out = pad_h; pad_w_out = pad_w; } /* deriving some values for naive code */ ofh = (ifh + 2 * pad_h - kh) / stride_h + 1; ofw = (ifw + 2 * pad_w - kw) / stride_w + 1; ifhp = ifh + 2 * pad_h_in; ifwp = ifw + 2 * pad_w_in; ofhp = ofh + 2 * pad_h_out; ofwp = ofw + 2 * pad_w_out; /* set struct for naive convolution */ naive_param.nImg = nImg; naive_param.nIfm = nIfm; naive_param.nOfm = nOfm; naive_param.ifhp = ifhp; naive_param.ifwp = ifwp; naive_param.ifh = ifh; naive_param.ifw = ifw; naive_param.ofhp = ofhp; naive_param.ofwp = ofwp; naive_param.ofh = ofh; naive_param.ofw = ofw; naive_param.pad_h = pad_h; naive_param.pad_w = pad_w; naive_param.pad_h_in = pad_h_in; naive_param.pad_w_in = pad_w_in; naive_param.pad_h_out = pad_h_out; naive_param.pad_w_out = pad_w_out; naive_param.kh = kh; naive_param.kw = kw; naive_param.stride_h = stride_h; naive_param.stride_w = stride_w; /* print some summary */ printf("##########################################\n"); printf("# Setting Up Common #\n"); printf("##########################################\n"); printf("PARAMS: W:%d H:%d N:%d C:%d K:%d R:%d S:%d P:%d Q:%d STRIDE:%d\n", ifw, ifh, nImg, nIfm, nOfm, kw, kh, ofh, ofw, stride); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf(" InImg %dx%d Padded (%dx%d)\n", ifh, ifw, ifhp, ifwp); printf("OutImg %dx%d Padded (%dx%d)\n", ofh, ofw, ofhp, ofwp); printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nIfm*ifhp*ifwp*sizeof(unsigned char))/(1024.0*1024.0) ); printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nOfm*ofhp*ofwp*sizeof(char))/(1024.0*1024.0) ); printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nIfm*ifhp*ifwp* sizeof(unsigned char))/(1024.0*1024.0) ); printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nOfm*ofhp*ofwp* sizeof(char))/(1024.0*1024.0) ); printf("SIZE Weight : %10.2f MiB\n", (double)(nIfm*nOfm*kw*kh* 
sizeof(char))/(1024.0*1024.0) ); /* allocate data */ naive_input_fp = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); naive_output_fp = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); naive_filter_fp = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); naive_input_i8 = (unsigned char*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(unsigned char), 2097152); naive_output_i8 = (char*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(char), 2097152); naive_filter_i8 = (char*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(char), 2097152); naive_libxsmm_output = (char*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(char), 2097152); naive_libxsmm_output_fp = (float*)libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); input_libxsmm = (unsigned char*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(unsigned char), 2097152); filter_libxsmm = (char*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(char), 2097152); output_libxsmm = (char*) libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(char), 2097152); dq_naive_input = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); dq_naive_filter = (float*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(float), 2097152); /* initialize data */ if (padding_mode == 0 ) { init_buf_range(naive_input_fp, nImg*nIfm*ifhp*ifwp, 0.0, 1.0); } else { float *naive_input_tmp = (float*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); init_buf_range(naive_input_tmp, nImg*nIfm*ifh*ifw, 0.0, 1.0); copy_internal_nchw( naive_input_fp , naive_input_tmp, nImg, nIfm, ifh, ifw, pad_h, pad_w); libxsmm_free(naive_input_tmp); } init_buf_range(naive_filter_fp, nOfm*nIfm*kh*kw, -1.0, 1.0); zero_buf_int8(output_libxsmm, nImg*nOfm*ofhp*ofwp); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... 
#\n"); printf("##########################################\n"); /* run naive convolutions */ if (type == 'A' || type == 'F') { zero_buf(naive_output_fp, nImg*nOfm*ofhp*ofwp); naive_conv_fp(&naive_param, naive_input_fp, naive_output_fp, naive_filter_fp, NULL); } printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } /* Quantize input and filter */ unsigned char filter_scf, input_scf, output_scf; quantize_buffer_uchar(naive_input_fp, naive_input_i8, nImg*nIfm*ifhp*ifwp, 2, &input_scf); quantize_buffer_char(naive_filter_fp, naive_filter_i8, nOfm*nIfm*kh*kw, 2, &filter_scf); quantize_buffer_char(naive_output_fp, naive_output_i8, nImg*nOfm*ofhp*ofwp, 2, &output_scf); /* dequantize to check quantization error */ libxsmm_dnn_dequantize_int8( (char*)naive_input_i8, dq_naive_input, nImg*nIfm*ifhp*ifwp, input_scf); libxsmm_dnn_dequantize_int8( (char*)naive_filter_i8, dq_naive_filter, nIfm*nOfm*kw*kh, filter_scf); #if 0 /* norms quantization */ libxsmm_matdiff(&norms_quant, LIBXSMM_DATATYPE_F32, nImg*nIfm*ifhp*ifwp, 1, naive_input_fp, dq_naive_input, 0, 0); printf("Input Quantization:\n"); printf("L1 reference : %.25g\n", norms_quant.l1_ref); printf("L1 test : %.25g\n", norms_quant.l1_tst); printf("L2 abs.error : %.24f\n", norms_quant.l2_abs); printf("L2 rel.error : %.24f\n", norms_quant.l2_rel); printf("Linf abs.error: %.24f\n", norms_quant.linf_abs); printf("Linf rel.error: %.24f\n", norms_quant.linf_rel); printf("Check-norm : %.24f\n", norms_quant.normf_rel); libxsmm_matdiff_clear(&norms_quant); libxsmm_matdiff(&norms_quant, LIBXSMM_DATATYPE_F32, nIfm*nOfm*kw*kh, 1, naive_filter_fp, dq_naive_filter, 0, 0); printf("Filter Quantization:\n"); printf("L1 reference : %.25g\n", norms_quant.l1_ref); printf("L1 test : %.25g\n", norms_quant.l1_tst); printf("L2 abs.error : %.24f\n", norms_quant.l2_abs); printf("L2 rel.error : %.24f\n", norms_quant.l2_rel); printf("Linf abs.error: 
%.24f\n", norms_quant.linf_abs); printf("Linf rel.error: %.24f\n", norms_quant.linf_rel); printf("Check-norm : %.24f\n", norms_quant.normf_rel); libxsmm_matdiff_clear(&norms_quant); printf("\n"); #endif printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ conv_desc.N = nImg; conv_desc.C = nIfm; conv_desc.H = ifh; conv_desc.W = ifw; conv_desc.K = nOfm; conv_desc.R = kh; conv_desc.S = kw; conv_desc.u = stride_h; conv_desc.v = stride_w; conv_desc.pad_h = pad_h; conv_desc.pad_w = pad_w; conv_desc.pad_h_in = pad_h_in; conv_desc.pad_w_in = pad_w_in; conv_desc.pad_h_out = pad_h_out; conv_desc.pad_w_out = pad_w_out; conv_desc.threads = nThreads; conv_desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; conv_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; conv_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; conv_desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; conv_desc.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE; conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_I8; conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_I8; libxsmm_handle = libxsmm_dnn_create_conv_layer( conv_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_FILTER, 
&status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_filter = libxsmm_dnn_link_tensor( libxsmm_layout, filter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ /* we can also use the layout functions and set the data on our own external to the library, @TODO, we plan to add an example here */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input_i8, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_zero_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_filter, (void*)naive_filter_i8, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER ) ); /* set scaling factors into tensors */ libxsmm_dnn_set_qtensor_scf( libxsmm_input, input_scf ); libxsmm_dnn_set_qtensor_scf( libxsmm_filter, filter_scf ); libxsmm_dnn_set_qtensor_scf( libxsmm_output, output_scf ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##############################################\n"); printf("# Check Correctness - FWD (custom-Storage) #\n"); printf("##############################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const 
int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); /* Dequantize result to check correctness */ libxsmm_dnn_dequantize_int8( (char*)naive_libxsmm_output, naive_libxsmm_output_fp, nImg*nOfm*ofhp*ofwp, output_scf); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nOfm*ofhp*ofwp, 1, naive_output_fp, naive_libxsmm_output_fp, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); lpOps = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters; printf("GOP = %.5g\n", lpOps*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GOPS = %.5g\n", (lpOps*1e-9)/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, 
nOfm, ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (lpOps*1e-9)/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_conv_layer( libxsmm_handle ) ); /* deallocate data */ libxsmm_free(naive_input_fp); libxsmm_free(naive_filter_fp); libxsmm_free(naive_output_fp); libxsmm_free(naive_libxsmm_output_fp); libxsmm_free(dq_naive_input); libxsmm_free(dq_naive_filter); libxsmm_free(naive_filter_i8); libxsmm_free(naive_output_i8); libxsmm_free(naive_libxsmm_output); libxsmm_free(input_libxsmm); libxsmm_free(naive_input_i8); libxsmm_free(output_libxsmm); libxsmm_free(filter_libxsmm); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 
0.01 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return LIBXSMM_DNN_SUCCESS; } libxsmm-1.17/samples/deeplearning/cnnlayer/layer_example_i8i8.vcxproj000066400000000000000000000550151415223013700261420ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 layer_example_i8i8 {E5AF3D43-4860-4F28-8674-D641A954C7C9} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true 
MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 
0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/cnnlayer/run_alexnet.sh000077500000000000000000000072631415223013700237220ustar00rootroot00000000000000#!/usr/bin/env bash set -eo pipefail UNAME=$(command -v uname) SORT=$(command -v sort) GREP=$(command -v grep) CUT=$(command -v cut) WC=$(command -v wc) TR=$(command -v tr) if [ "" = "${CHECK}" ] || [ "0" = "${CHECK}" ]; then if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=256; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1000; fi else # check if [ "" = "${CHECK_DNN_MB}" ]; 
then CHECK_DNN_MB=32; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1; fi fi if [ $# -ne 7 ] then echo "Usage: $(basename $0) mb iters numa (1-mcdram/0-DDR) TYPE ('A'-ALL/'F'-FP/'B'-BP/'U'-WU) FORMAT ('A'-ALL/'L'-LIBXSMM/'T'-Tensorflow/'M'-Mixed) padding; using default values; using default values: 256 1000 1 f32 A L 1" MB=${CHECK_DNN_MB} ITERS=${CHECK_DNN_ITERS} NUMA=-1 BIN=f32 TYPE=A FORMAT=L PAD=1 else MB=$1 ITERS=$2 NUMA=$3 BIN=$4 TYPE=$5 FORMAT=$6 PAD=$7 fi if [ "${GREP}" ] && [ "${SORT}" ] && [ "${CUT}" ] && [ "${TR}" ] && [ "${WC}" ]; then if [ "$(command -v lscpu)" ]; then NS=$(lscpu | ${GREP} -m1 "Socket(s)" | ${TR} -d " " | ${CUT} -d: -f2) if [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(lscpu | ${GREP} -m1 "Core(s) per socket" | ${TR} -d " " | ${CUT} -d: -f2))) NT=$((NC*$(lscpu | ${GREP} -m1 "Thread(s) per core" | ${TR} -d " " | ${CUT} -d: -f2))) elif [ -e /proc/cpuinfo ]; then NS=$(${GREP} "physical id" /proc/cpuinfo | ${SORT} -u | ${WC} -l | ${TR} -d " ") if [ "" = "${NS}" ] || [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(${GREP} -m1 "cpu cores" /proc/cpuinfo | ${TR} -d " " | ${CUT} -d: -f2))) NT=$(${GREP} "core id" /proc/cpuinfo | ${WC} -l | ${TR} -d " ") elif [ "Darwin" = "$(uname)" ]; then NS=$(sysctl hw.packages | ${CUT} -d: -f2 | ${TR} -d " ") NC=$(sysctl hw.physicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") NT=$(sysctl hw.logicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") fi if [ "${NC}" ] && [ "${NT}" ]; then HT=$((NT/NC)) else NS=1 NC=1 NT=1 HT=1 fi if [ "$(command -v numactl)" ]; then NN=$(numactl -H | ${GREP} "available:" | ${CUT} -d' ' -f2) else NN=${NS} fi fi CPUFLAGS=$(if [ "${GREP}" ] && [ "${CUT}" ] && [ -e /proc/cpuinfo ]; then ${GREP} -m1 flags /proc/cpuinfo | ${CUT} -d: -f2- || true; fi) if [ "${GREP}" ] && [ "$(echo "${CPUFLAGS}" | ${GREP} -o avx512er)" ]; then if [ "0" != "$((0>NUMA))" ] && [ "0" != "$((NS #include #if defined(_OPENMP) # include #endif typedef struct { int nImg; int nIfm; int nOfm; int ifhp; int ifwp; int ifh; int ifw; 
int ofhp; int ofwp; int ofh; int ofw; int pad_h; int pad_w; int pad_h_in; int pad_w_in; int pad_h_out; int pad_w_out; int kh; int kw; int stride_h; int stride_w; } naive_conv_t; typedef struct { int N; int C; int H; int W; int stride_h; int stride_w; int norm_type; /* 0: full batchnorm, 1: batch scaling only */ int fuse_type; /* 0: nothing fused, 1: relu fused, 2: elementwise fused, 3: relu and elementwise fused */ } naive_fusedbatchnorm_t; typedef struct { int N; int C; int G; int H; int W; int stride_h; int stride_w; int fuse_type; /* 0: nothing fused, 1: relu fused, 2: elementwise fused, 3: relu and elementwise fused */ } naive_fusedgroupnorm_t; typedef struct { int N; int C; int K; int fuse_type; /* 0: nothing fused */ } naive_fullyconnected_t; typedef struct { int N; int C; int H; int W; int R; int S; int pad_h; int pad_w; int stride_h; int stride_w; int type; } naive_pooling_t; /* it's fine to alias in and out */ LIBXSMM_INLINE void truncate_mask_fp32_bf16(float* in, float* out, unsigned int len) { unsigned int i = 0; /* truncate buffer to bf16 */ for ( i = 0; i < len; ++i ) { union libxsmm_bfloat16_hp t; t.f = in[i]; t.i[0] = 0; out[i] = t.f; } } /* it's fine to alias in and out */ LIBXSMM_INLINE void rnaz_mask_fp32_bf16(float* in, float* out, unsigned int len) { unsigned int i = 0; /* rnaz buffer to bf16 */ for ( i = 0; i < len; ++i ) { unsigned int int_round = 0; unsigned int do_round = 1; const void *const ptr = &int_round; int_round = *((unsigned int*)&(in[i])); /* we don't round NaN and inf */ if ( (int_round & 0x7f800000) == 0x7f800000 ) { do_round = 0; } /* perform round nearest tie away from zero */ if ( do_round != 0 ) { int_round = int_round + 0x00008000; } /* chop bits to create BFP16 in FP32 */ int_round = int_round & 0xffff0000; out[i] = *((float*)ptr); } } /* it's fine to alias in and out */ LIBXSMM_INLINE void rne_mask_fp32_bf16(float* in, float* out, unsigned int len) { unsigned int i = 0; /* rnaz buffer to bf16 */ for ( i = 0; i < len; ++i ) 
{ unsigned int int_round = 0; unsigned int do_round = 1; const void *const ptr = &int_round; int_round = *((unsigned int*)&(in[i])); /* we don't round NaN and inf */ if ( (int_round & 0x7f800000) == 0x7f800000 ) { do_round = 0; } /* perform round nearest tie even */ if ( do_round != 0 ) { unsigned int fixup = (int_round >> 16) & 1; int_round = int_round + 0x00007fff + fixup; } /* chop bits to create BFP16 in FP32 */ int_round = int_round & 0xffff0000; out[i] = *((float*)ptr); } } LIBXSMM_INLINE void zero_buf(float* buf, size_t size) { int i; #if defined(_OPENMP) LIBXSMM_OMP_VAR(i); # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { buf[i] = 0.0f; } } LIBXSMM_INLINE void zero_buf_bf16(libxsmm_bfloat16* buf, size_t size) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { buf[i] = 0; } } LIBXSMM_INLINE void zero_buf_int16(short* buf, size_t size) { int i; #if defined(_OPENMP) LIBXSMM_OMP_VAR(i); # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { buf[i] = 0; } } LIBXSMM_INLINE void zero_buf_int32(int* buf, size_t size) { int i; #if defined(_OPENMP) LIBXSMM_OMP_VAR(i); # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { buf[i] = 0; } } LIBXSMM_INLINE void zero_buf_int8(char* buf, size_t size) { int i; #if defined(_OPENMP) LIBXSMM_OMP_VAR(i); # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { buf[i] = 0; } } LIBXSMM_INLINE void zero_buf_uint8(unsigned char* buf, size_t size) { int i; #if defined(_OPENMP) LIBXSMM_OMP_VAR(i); # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { buf[i] = 0; } } LIBXSMM_INLINE void copy_buf(float* src, float* dst, size_t size) { int i; #if defined(_OPENMP) LIBXSMM_OMP_VAR(i); # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { dst[i] = src[i]; } } LIBXSMM_INLINE void copy_buf_int16(short* src, short* dst, size_t size) { int i; #if 
defined(_OPENMP) LIBXSMM_OMP_VAR(i); # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { dst[i] = src[i]; } } LIBXSMM_INLINE void copy_buf_int8(char* src, char* dst, size_t size) { int i; #if defined(_OPENMP) LIBXSMM_OMP_VAR(i); # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { dst[i] = src[i]; } } LIBXSMM_INLINE void copy_buf_uint8(unsigned char* src, unsigned char* dst, size_t size) { int i; #if defined(_OPENMP) LIBXSMM_OMP_VAR(i); # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { dst[i] = src[i]; } } LIBXSMM_INLINE void init_buf(float* buf, size_t size, int initPos, int initOne) { int i; zero_buf(buf, size); #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { buf[i] = (float)((initOne != 0) ? 1.0 : ((initPos != 0) ? libxsmm_rng_f64() : (0.05 - libxsmm_rng_f64()/10.0))); } } LIBXSMM_INLINE void init_buf_bf16(libxsmm_bfloat16* buf, size_t size, int initPos, int initOne) { int i; zero_buf_bf16(buf, size); #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { libxsmm_bfloat16_hp tmp; tmp.f = (float)((initOne != 0) ? 1.0 : ((initPos != 0) ? 
libxsmm_rng_f64() : (0.05 - libxsmm_rng_f64()/10.0))); buf[i] = tmp.i[1]; } } LIBXSMM_INLINE void libxsmm_dnn_dequantize_int8( char* in_buffer, float* out_buffer, int length, unsigned char scf ) { const float val_exp = libxsmm_sexp2_i8i(-scf); int i = 0; #ifdef _OPENMP # pragma omp parallel for private(i) #endif for ( i = 0; i < length; ++i ) { out_buffer[i] = ((float)in_buffer[i])*val_exp; } } LIBXSMM_INLINE float libxsmm_internal_get_max_common( float* in_buffer, int length ) { float absmax_value = LIBXSMM_ABS(in_buffer[0]); int i = 0; for (i = 1; i < length; ++i ) { if (LIBXSMM_ABS(in_buffer[i]) > absmax_value) { absmax_value = LIBXSMM_ABS(in_buffer[i]); } } return absmax_value; } LIBXSMM_INLINE void quantize_buffer_char(float *in_buffer, char *out_buffer, int size, unsigned char add_shift, unsigned char* scf) { int i; const float max_value = libxsmm_internal_get_max_common(in_buffer, size); int maxexp = 0; /* take return value of LIBXSMM_FREXPF to mute static analysis issue */ float scfq = LIBXSMM_FREXPF(max_value, &maxexp); maxexp -= (7 - add_shift); scfq = libxsmm_sexp2_i8i(-maxexp); for (i=0; i= H-pad_h || w < pad_w || w >= W-pad_w) { LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W) = 0.0; } } } } } } LIBXSMM_INLINE void set_zeropad_nchw_int16(short* nchw, int N, int C, int H, int W, int pad_h, int pad_w) { LIBXSMM_VLA_DECL(4, short, input, nchw, C, H, W); int n, h, w, c; #if defined(_OPENMP) LIBXSMM_OMP_VAR(c); LIBXSMM_OMP_VAR(h); LIBXSMM_OMP_VAR(w); # pragma omp parallel for private(n,c,h,w) #endif for ( n = 0; n < N; n++ ) { for ( c = 0; c < C; c++ ) { for ( h = 0; h < H; h++ ) { for ( w = 0; w < W; w++ ) { if (h < pad_h || h >= H-pad_h || w < pad_w || w >= W-pad_w) { LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W) = 0; } } } } } } LIBXSMM_INLINE void set_zeropad_nchw_int32(int* nchw, int N, int C, int H, int W, int pad_h, int pad_w) { LIBXSMM_VLA_DECL(4, int, input, nchw, C, H, W); int n, h, w, c; #if defined(_OPENMP) LIBXSMM_OMP_VAR(c); 
LIBXSMM_OMP_VAR(h); LIBXSMM_OMP_VAR(w); # pragma omp parallel for private(n,c,h,w) #endif for ( n = 0; n < N; n++ ) { for ( c = 0; c < C; c++ ) { for ( h = 0; h < H; h++ ) { for ( w = 0; w < W; w++ ) { if (h < pad_h || h >= H-pad_h || w < pad_w || w >= W-pad_w) { LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W) = 0; } } } } } } LIBXSMM_INLINE void set_zeropad_nchw_uint8(unsigned char* nchw, int N, int C, int H, int W, int pad_h, int pad_w) { LIBXSMM_VLA_DECL(4, unsigned char, input, nchw, C, H, W); int n, h, w, c; #if defined(_OPENMP) LIBXSMM_OMP_VAR(c); LIBXSMM_OMP_VAR(h); LIBXSMM_OMP_VAR(w); # pragma omp parallel for private(n,c,h,w) #endif for ( n = 0; n < N; n++ ) { for ( c = 0; c < C; c++ ) { for ( h = 0; h < H; h++ ) { for ( w = 0; w < W; w++ ) { if (h < pad_h || h >= H-pad_h || w < pad_w || w >= W-pad_w) { LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W) = 0; } } } } } } LIBXSMM_INLINE void copy_internal_nchw(float* dst , float* src, int N, int C, int H, int W, int pad_h, int pad_w) { LIBXSMM_VLA_DECL(4, float, input, src, C, H, W); LIBXSMM_VLA_DECL(4, float, new_input, dst, C, H+2*pad_h, W+2*pad_w); int n, h, w, c; #if defined(_OPENMP) LIBXSMM_OMP_VAR(c); LIBXSMM_OMP_VAR(h); LIBXSMM_OMP_VAR(w); # pragma omp parallel for private(n,c,h,w) #endif for ( n = 0; n < N; n++ ) { for ( c = 0; c < C; c++ ) { for ( h = 0; h < H; h++ ) { for ( w = 0; w < W; w++ ) { LIBXSMM_VLA_ACCESS(4, new_input, n, c, h+pad_h, w+pad_w, C, H+2*pad_h, W+2*pad_w) = LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W); } } } } } LIBXSMM_INLINE void copy_internal_nchw_int16(short* dst , short* src, int N, int C, int H, int W, int pad_h, int pad_w) { LIBXSMM_VLA_DECL(4, short, input, src, C, H, W); LIBXSMM_VLA_DECL(4, short, new_input, dst, C, H+2*pad_h, W+2*pad_w); int n, h, w, c; #if defined(_OPENMP) LIBXSMM_OMP_VAR(c); LIBXSMM_OMP_VAR(h); LIBXSMM_OMP_VAR(w); # pragma omp parallel for private(n,c,h,w) #endif for ( n = 0; n < N; n++ ) { for ( c = 0; c < C; c++ ) { for ( h = 0; h < H; h++ ) 
{ for ( w = 0; w < W; w++ ) { LIBXSMM_VLA_ACCESS(4, new_input, n, c, h+pad_h, w+pad_w, C, H+2*pad_h, W+2*pad_w) = LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W); } } } } } LIBXSMM_INLINE void copy_internal_nchw_uint8(unsigned char* dst , unsigned char* src, int N, int C, int H, int W, int pad_h, int pad_w) { LIBXSMM_VLA_DECL(4, unsigned char, input, src, C, H, W); LIBXSMM_VLA_DECL(4, unsigned char, new_input, dst, C, H+2*pad_h, W+2*pad_w); int n, h, w, c; #if defined(_OPENMP) LIBXSMM_OMP_VAR(c); LIBXSMM_OMP_VAR(h); LIBXSMM_OMP_VAR(w); # pragma omp parallel for private(n,c,h,w) #endif for ( n = 0; n < N; n++ ) { for ( c = 0; c < C; c++ ) { for ( h = 0; h < H; h++ ) { for ( w = 0; w < W; w++ ) { LIBXSMM_VLA_ACCESS(4, new_input, n, c, h+pad_h, w+pad_w, C, H+2*pad_h, W+2*pad_w) = LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W); } } } } } LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, float* nhwc, int N, int H, int W, int C) { LIBXSMM_VLA_DECL(4, float, output, nhwc, H, W, C); LIBXSMM_VLA_DECL(4, const float, input, nchw, C, H, W); int n, h, w, c; #if defined(_OPENMP) LIBXSMM_OMP_VAR(c); LIBXSMM_OMP_VAR(h); LIBXSMM_OMP_VAR(w); # pragma omp parallel for private(n,c,h,w) #endif for ( n = 0; n < N; n++ ) { for ( h = 0; h < H; h++ ) { for ( w = 0; w < W; w++ ) { for ( c = 0; c < C; c++ ) { LIBXSMM_VLA_ACCESS(4, output, n, h, w, c, H, W, C) = LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W); } } } } } LIBXSMM_INLINE void naive_copy_NHWC_to_NCHW(const float* nhwc, float* nchw, int N, int H, int W, int C) { LIBXSMM_VLA_DECL(4, float, output, nchw, C, H, W); LIBXSMM_VLA_DECL(4, const float, input, nhwc, H, W, C); int n, h, w, c; #if defined(_OPENMP) LIBXSMM_OMP_VAR(c); LIBXSMM_OMP_VAR(h); LIBXSMM_OMP_VAR(w); # pragma omp parallel for private(n,c,h,w) #endif for ( n = 0; n < N; n++ ) { for ( h = 0; h < H; h++ ) { for ( w = 0; w < W; w++ ) { for ( c = 0; c < C; c++ ) { LIBXSMM_VLA_ACCESS(4, output, n, c, h, w, C, H, W) = LIBXSMM_VLA_ACCESS(4, input, n, h, w, 
c, H, W, C); } } } } } LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, float* rsck, int R, int S, int C, int K) { LIBXSMM_VLA_DECL(4, float, output, rsck, S, C, K); LIBXSMM_VLA_DECL(4, const float, input, kcrs, C, R, S); int r, s, c, k; #if defined(_OPENMP) LIBXSMM_OMP_VAR(s); LIBXSMM_OMP_VAR(c); LIBXSMM_OMP_VAR(k); # pragma omp parallel for private(r,s,c,k) #endif for ( r = 0; r < R; r++ ) { for ( s = 0; s < S; s++ ) { for ( c = 0; c < C; c++ ) { for ( k = 0; k < K; k++ ) { LIBXSMM_VLA_ACCESS(4, output, r, s, c, k, S, C, K) = LIBXSMM_VLA_ACCESS(4, input, k, c, r, s, C, R, S); } } } } } LIBXSMM_INLINE void naive_copy_RSCK_to_KCRS(const float* rsck, float* kcrs, int R, int S, int C, int K) { LIBXSMM_VLA_DECL(4, const float, input, rsck, S, C, K); LIBXSMM_VLA_DECL(4, float, output, kcrs, C, R, S); int r, s, c, k; #if defined(_OPENMP) LIBXSMM_OMP_VAR(s); LIBXSMM_OMP_VAR(c); LIBXSMM_OMP_VAR(k); # pragma omp parallel for private(r,s,c,k) #endif for ( r = 0; r < R; r++ ) { for ( s = 0; s < S; s++ ) { for ( c = 0; c < C; c++ ) { for ( k = 0; k < K; k++ ) { LIBXSMM_VLA_ACCESS(4, output, k, c, r, s, C, R, S) = LIBXSMM_VLA_ACCESS(4, input, r, s, c, k, S, C, K); } } } } } LIBXSMM_INLINE void matrix_copy_NC_to_NCNC(float *src, float *dst, int T, int N, int C, int bn, int bc) { int t, n1, n2, c1, c2; int nBlocks = N/bn; int cBlocks = C/bc; LIBXSMM_VLA_DECL(3, float, real_src, src, N, C); LIBXSMM_VLA_DECL(5, float, real_dst, dst, nBlocks, cBlocks, bn, bc); #if defined(_OPENMP) LIBXSMM_OMP_VAR(n1); LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(n2); LIBXSMM_OMP_VAR(c2); # pragma omp parallel for private(t,n1,c1,n2,c2) #endif for (t = 0; t < T; t++) { for (n1 = 0; n1 < nBlocks; n1++) { for (c1 = 0; c1 < cBlocks; c1++) { for (n2 = 0; n2 < bn; n2++) { for (c2 = 0; c2 < bc; c2++) { LIBXSMM_VLA_ACCESS(5, real_dst, t, n1, c1, n2, c2, nBlocks, cBlocks, bn, bc) = LIBXSMM_VLA_ACCESS(3, real_src, t, n1*bn+n2, c1*bc+c2, N, C); } } } } } } LIBXSMM_INLINE void 
matrix_copy_NCNC_to_NC(float *src, float *dst, int T, int N, int C, int bn, int bc) { int t, n1, n2, c1, c2; int nBlocks = N/bn; int cBlocks = C/bc; LIBXSMM_VLA_DECL(3, float, real_dst, dst, N, C); LIBXSMM_VLA_DECL(5, float, real_src, src, nBlocks, cBlocks, bn, bc); #if defined(_OPENMP) LIBXSMM_OMP_VAR(n1); LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(n2); LIBXSMM_OMP_VAR(c2); # pragma omp parallel for private(t,n1,c1,n2,c2) #endif for (t = 0; t < T; t++) { for (n1 = 0; n1 < nBlocks; n1++) { for (c1 = 0; c1 < cBlocks; c1++) { for (n2 = 0; n2 < bn; n2++) { for (c2 = 0; c2 < bc; c2++) { LIBXSMM_VLA_ACCESS(3, real_dst, t, n1*bn+n2, c1*bc+c2, N, C) = LIBXSMM_VLA_ACCESS(5, real_src, t, n1, c1, n2, c2, nBlocks, cBlocks, bn, bc); } } } } } } LIBXSMM_INLINE void matrix_copy_NC_to_NCNC_bf16(libxsmm_bfloat16 *src, libxsmm_bfloat16 *dst, int T, int N, int C, int bn, int bc) { int t, n1, n2, c1, c2; int nBlocks = N/bn; int cBlocks = C/bc; LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, real_src, src, N, C); LIBXSMM_VLA_DECL(5, libxsmm_bfloat16, real_dst, dst, nBlocks, cBlocks, bn, bc); #if defined(_OPENMP) LIBXSMM_OMP_VAR(n1); LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(n2); LIBXSMM_OMP_VAR(c2); # pragma omp parallel for private(t,n1,c1,n2,c2) #endif for (t = 0; t < T; t++) { for (n1 = 0; n1 < nBlocks; n1++) { for (c1 = 0; c1 < cBlocks; c1++) { for (n2 = 0; n2 < bn; n2++) { for (c2 = 0; c2 < bc; c2++) { LIBXSMM_VLA_ACCESS(5, real_dst, t, n1, c1, n2, c2, nBlocks, cBlocks, bn, bc) = LIBXSMM_VLA_ACCESS(3, real_src, t, n1*bn+n2, c1*bc+c2, N, C); } } } } } } LIBXSMM_INLINE void matrix_copy_NCNC_to_NC_bf16(libxsmm_bfloat16 *src, libxsmm_bfloat16 *dst, int T, int N, int C, int bn, int bc) { int t, n1, n2, c1, c2; int nBlocks = N/bn; int cBlocks = C/bc; LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, real_dst, dst, N, C); LIBXSMM_VLA_DECL(5, libxsmm_bfloat16, real_src, src, nBlocks, cBlocks, bn, bc); #if defined(_OPENMP) LIBXSMM_OMP_VAR(n1); LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(n2); LIBXSMM_OMP_VAR(c2); # pragma omp 
parallel for private(t,n1,c1,n2,c2) #endif for (t = 0; t < T; t++) { for (n1 = 0; n1 < nBlocks; n1++) { for (c1 = 0; c1 < cBlocks; c1++) { for (n2 = 0; n2 < bn; n2++) { for (c2 = 0; c2 < bc; c2++) { LIBXSMM_VLA_ACCESS(3, real_dst, t, n1*bn+n2, c1*bc+c2, N, C) = LIBXSMM_VLA_ACCESS(5, real_src, t, n1, c1, n2, c2, nBlocks, cBlocks, bn, bc); } } } } } } LIBXSMM_INLINE void matrix_copy_CK_to_KCCK(float *src, float *dst, int C, int K, int bc, int bk) { int k1, k2, c1, c2; int kBlocks = K/bk; int cBlocks = C/bc; LIBXSMM_VLA_DECL(2, float, real_src, src, K); LIBXSMM_VLA_DECL(4, float, real_dst, dst, cBlocks, bc, bk); #if defined(_OPENMP) LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(c2); LIBXSMM_OMP_VAR(k2); # pragma omp parallel for private(k1,c1,c2,k2) #endif for (k1 = 0; k1 < kBlocks; k1++) { for (c1 = 0; c1 < cBlocks; c1++) { for (c2 = 0; c2 < bc; c2++) { for (k2 = 0; k2 < bk; k2++) { LIBXSMM_VLA_ACCESS(4, real_dst, k1, c1, c2, k2, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, real_src, c1*bc+c2, k1*bk+k2, K); } } } } } LIBXSMM_INLINE void matrix_copy_CK_to_CKKC(float *src, float *dst, int C, int K, int bc, int bk) { int k1, k2, c1, c2; int kBlocks = K/bk; int cBlocks = C/bc; LIBXSMM_VLA_DECL(2, float, real_src, src, K); LIBXSMM_VLA_DECL(4, float, real_dst, dst, kBlocks, bk, bc); #if defined(_OPENMP) LIBXSMM_OMP_VAR(k1); LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(c2); LIBXSMM_OMP_VAR(k2); # pragma omp parallel for private(k1,c1,c2,k2) #endif for (c1 = 0; c1 < cBlocks; c1++) { for (k1 = 0; k1 < kBlocks; k1++) { for (k2 = 0; k2 < bk; k2++) { for (c2 = 0; c2 < bc; c2++) { LIBXSMM_VLA_ACCESS(4, real_dst, c1, k1, k2, c2, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, real_src, c1*bc+c2, k1*bk+k2, K); } } } } } LIBXSMM_INLINE void matrix_copy_KC_to_KCCK(float *src, float *dst, int C, int K, int bc, int bk) { int k1, k2, c1, c2; int kBlocks = K/bk; int cBlocks = C/bc; LIBXSMM_VLA_DECL(2, float, real_src, src, C); LIBXSMM_VLA_DECL(4, float, real_dst, dst, cBlocks, bc, bk); #if defined(_OPENMP) 
LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(c2); LIBXSMM_OMP_VAR(k2);
# pragma omp parallel for private(k1,c1,c2,k2)
#endif
  for (k1 = 0; k1 < kBlocks; k1++) {
    for (c1 = 0; c1 < cBlocks; c1++) {
      for (c2 = 0; c2 < bc; c2++) {
        for (k2 = 0; k2 < bk; k2++) {
          LIBXSMM_VLA_ACCESS(4, real_dst, k1, c1, c2, k2, cBlocks, bc, bk) =
            LIBXSMM_VLA_ACCESS(2, real_src, k1*bk+k2, c1*bc+c2, C);
        }
      }
    }
  }
}

/* Copies a blocked [kBlocks][cBlocks][bc][bk] tensor back to a flat KxC matrix
 * (row k1*bk+k2, column c1*bc+c2). Assumes K % bk == 0 and C % bc == 0. */
LIBXSMM_INLINE void matrix_copy_KCCK_to_KC(float *src, float *dst, int C, int K, int bc, int bk) {
  int k1, k2, c1, c2;
  int kBlocks = K/bk;
  int cBlocks = C/bc;
  LIBXSMM_VLA_DECL(2, float, real_dst, dst, C);
  LIBXSMM_VLA_DECL(4, float, real_src, src, cBlocks, bc, bk);
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(c2); LIBXSMM_OMP_VAR(k2);
# pragma omp parallel for private(k1,c1,c2,k2)
#endif
  for (k1 = 0; k1 < kBlocks; k1++) {
    for (c1 = 0; c1 < cBlocks; c1++) {
      for (c2 = 0; c2 < bc; c2++) {
        for (k2 = 0; k2 < bk; k2++) {
          LIBXSMM_VLA_ACCESS(2, real_dst, k1*bk+k2, c1*bc+c2, C) =
            LIBXSMM_VLA_ACCESS(4, real_src, k1, c1, c2, k2, cBlocks, bc, bk);
        }
      }
    }
  }
}

/* Copies a blocked [kBlocks][cBlocks][bc][bk] tensor to a flat CxK matrix
 * (row c1*bc+c2, column k1*bk+k2), i.e. the transposed flat layout. */
LIBXSMM_INLINE void matrix_copy_KCCK_to_CK(float *src, float *dst, int C, int K, int bc, int bk) {
  int k1, k2, c1, c2;
  int kBlocks = K/bk;
  int cBlocks = C/bc;
  LIBXSMM_VLA_DECL(2, float, real_dst, dst, K);
  LIBXSMM_VLA_DECL(4, float, real_src, src, cBlocks, bc, bk);
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(c2); LIBXSMM_OMP_VAR(k2);
# pragma omp parallel for private(k1,c1,c2,k2)
#endif
  for (k1 = 0; k1 < kBlocks; k1++) {
    for (c1 = 0; c1 < cBlocks; c1++) {
      for (c2 = 0; c2 < bc; c2++) {
        for (k2 = 0; k2 < bk; k2++) {
          LIBXSMM_VLA_ACCESS(2, real_dst, c1*bc+c2, k1*bk+k2, K) =
            LIBXSMM_VLA_ACCESS(4, real_src, k1, c1, c2, k2, cBlocks, bc, bk);
        }
      }
    }
  }
}

/* Copies a flat CxK bf16 matrix into the blocked layout
 * [kBlocks][cBlocks][bc/2][bk][2]: the bc dimension is split as (c2/2, c2%2),
 * i.e. pairs of consecutive c values are packed innermost. Assumes bc is even. */
LIBXSMM_INLINE void matrix_copy_CK_to_KCCK_bf16(libxsmm_bfloat16 *src, libxsmm_bfloat16 *dst, int C, int K, int bc, int bk) {
  int k1, k2, c1, c2;
  int kBlocks = K/bk;
  int cBlocks = C/bc;
  LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, real_src, src, K);
  LIBXSMM_VLA_DECL(5, libxsmm_bfloat16, real_dst, dst, cBlocks, bc/2, bk, 2);
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(c2); LIBXSMM_OMP_VAR(k2);
# pragma omp parallel for private(k1,c1,c2,k2)
#endif
  for (k1 = 0; k1 < kBlocks; k1++) {
    for (c1 = 0; c1 < cBlocks; c1++) {
      for (c2 = 0; c2 < bc; c2++) {
        for (k2 = 0; k2 < bk; k2++) {
          LIBXSMM_VLA_ACCESS(5, real_dst, k1, c1, c2/2, k2, c2%2, cBlocks, bc/2, bk, 2) =
            LIBXSMM_VLA_ACCESS(2, real_src, c1*bc+c2, k1*bk+k2, K);
        }
      }
    }
  }
}

/* Copies a flat CxK bf16 matrix into the blocked layout
 * [cBlocks][kBlocks][bk/2][bc][2]: here the bk dimension is split as
 * (k2/2, k2%2). Parallelized over c1 (outermost loop). Assumes bk is even. */
LIBXSMM_INLINE void matrix_copy_CK_to_CKKC_bf16(libxsmm_bfloat16 *src, libxsmm_bfloat16 *dst, int C, int K, int bc, int bk) {
  int k1, k2, c1, c2;
  int kBlocks = K/bk;
  int cBlocks = C/bc;
  LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, real_src, src, K);
  LIBXSMM_VLA_DECL(5, libxsmm_bfloat16, real_dst, dst, kBlocks, bk/2, bc, 2);
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(k1); LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(c2); LIBXSMM_OMP_VAR(k2);
# pragma omp parallel for private(k1,c1,c2,k2)
#endif
  for (c1 = 0; c1 < cBlocks; c1++) {
    for (k1 = 0; k1 < kBlocks; k1++) {
      for (k2 = 0; k2 < bk; k2++) {
        for (c2 = 0; c2 < bc; c2++) {
          LIBXSMM_VLA_ACCESS(5, real_dst, c1, k1, k2/2, c2, k2%2, kBlocks, bk/2, bc, 2) =
            LIBXSMM_VLA_ACCESS(2, real_src, c1*bc+c2, k1*bk+k2, K);
        }
      }
    }
  }
}

/* Copies a flat KxC bf16 matrix (note: source is K-major) into the blocked
 * layout [kBlocks][cBlocks][bc/2][bk][2] with the bc dimension pair-packed. */
LIBXSMM_INLINE void matrix_copy_KC_to_KCCK_bf16(libxsmm_bfloat16 *src, libxsmm_bfloat16 *dst, int C, int K, int bc, int bk) {
  int k1, k2, c1, c2;
  int kBlocks = K/bk;
  int cBlocks = C/bc;
  LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, real_src, src, C);
  LIBXSMM_VLA_DECL(5, libxsmm_bfloat16, real_dst, dst, cBlocks, bc/2, bk, 2);
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(c2); LIBXSMM_OMP_VAR(k2);
# pragma omp parallel for private(k1,c1,c2,k2)
#endif
  for (k1 = 0; k1 < kBlocks; k1++) {
    for (c1 = 0; c1 < cBlocks; c1++) {
      for (c2 = 0; c2 < bc; c2++) {
        for (k2 = 0; k2 < bk; k2++) {
          LIBXSMM_VLA_ACCESS(5, real_dst, k1, c1, c2/2, k2, c2%2, cBlocks, bc/2, bk, 2) =
            LIBXSMM_VLA_ACCESS(2, real_src, k1*bk+k2, c1*bc+c2, C);
        }
      }
    }
  }
}

/* Unpacks a blocked bf16 tensor [kBlocks][cBlocks][bc/2][bk][2] back to a
 * flat KxC matrix; inverse of matrix_copy_KC_to_KCCK_bf16. */
LIBXSMM_INLINE void
matrix_copy_KCCK_to_KC_bf16(libxsmm_bfloat16 *src, libxsmm_bfloat16 *dst, int C, int K, int bc, int bk) {
  int k1, k2, c1, c2;
  int kBlocks = K/bk;
  int cBlocks = C/bc;
  LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, real_dst, dst, C);
  LIBXSMM_VLA_DECL(5, libxsmm_bfloat16, real_src, src, cBlocks, bc/2, bk, 2);
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(c2); LIBXSMM_OMP_VAR(k2);
# pragma omp parallel for private(k1,c1,c2,k2)
#endif
  for (k1 = 0; k1 < kBlocks; k1++) {
    for (c1 = 0; c1 < cBlocks; c1++) {
      for (c2 = 0; c2 < bc; c2++) {
        for (k2 = 0; k2 < bk; k2++) {
          LIBXSMM_VLA_ACCESS(2, real_dst, k1*bk+k2, c1*bc+c2, C) =
            LIBXSMM_VLA_ACCESS(5, real_src, k1, c1, c2/2, k2, c2%2, cBlocks, bc/2, bk, 2);
        }
      }
    }
  }
}

/* Unpacks a blocked bf16 tensor [kBlocks][cBlocks][bc/2][bk][2] to a flat
 * CxK matrix; inverse of matrix_copy_CK_to_KCCK_bf16. */
LIBXSMM_INLINE void matrix_copy_KCCK_to_CK_bf16(libxsmm_bfloat16 *src, libxsmm_bfloat16 *dst, int C, int K, int bc, int bk) {
  int k1, k2, c1, c2;
  int kBlocks = K/bk;
  int cBlocks = C/bc;
  LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, real_dst, dst, K);
  LIBXSMM_VLA_DECL(5, libxsmm_bfloat16, real_src, src, cBlocks, bc/2, bk, 2);
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(c2); LIBXSMM_OMP_VAR(k2);
# pragma omp parallel for private(k1,c1,c2,k2)
#endif
  for (k1 = 0; k1 < kBlocks; k1++) {
    for (c1 = 0; c1 < cBlocks; c1++) {
      for (c2 = 0; c2 < bc; c2++) {
        for (k2 = 0; k2 < bk; k2++) {
          LIBXSMM_VLA_ACCESS(2, real_dst, c1*bc+c2, k1*bk+k2, K) =
            LIBXSMM_VLA_ACCESS(5, real_src, k1, c1, c2/2, k2, c2%2, cBlocks, bc/2, bk, 2);
        }
      }
    }
  }
}

/* Re-blocks a bf16 tensor from [kBlocks][cBlocks][bc/2][bk][2] (c-paired)
 * to [cBlocks][kBlocks][bk/2][bc][2] (k-paired) in one pass. */
LIBXSMM_INLINE void matrix_copy_KCCK_to_CKKC_bf16(libxsmm_bfloat16 *src, libxsmm_bfloat16 *dst, int C, int K, int bc, int bk) {
  int k1, k2, c1, c2;
  int kBlocks = K/bk;
  int cBlocks = C/bc;
  LIBXSMM_VLA_DECL(5, libxsmm_bfloat16, real_dst, dst, kBlocks, bk/2, bc, 2);
  LIBXSMM_VLA_DECL(5, libxsmm_bfloat16, real_src, src, cBlocks, bc/2, bk, 2);
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(c1); LIBXSMM_OMP_VAR(c2); LIBXSMM_OMP_VAR(k2);
# pragma omp parallel for private(k1,c1,c2,k2)
#endif
  for (k1 = 0; k1 < kBlocks; k1++) {
    for (c1 = 0; c1 < cBlocks; c1++) {
      for
(c2 = 0; c2 < bc; c2++) { for (k2 = 0; k2 < bk; k2++) { LIBXSMM_VLA_ACCESS(5, real_dst, c1, k1, k2/2, c2, k2%2, kBlocks, bk/2, bc, 2) = LIBXSMM_VLA_ACCESS(5, real_src, k1, c1, c2/2, k2, c2%2, cBlocks, bc/2, bk, 2); } } } } } LIBXSMM_INLINE void matrix_add(int size, float *a, float *b, float *c) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < size; i++) { c[i] = a[i] + b[i]; } } LIBXSMM_INLINE void matrix_eltwise_mult(int size, float *a, float *b, float *c) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < size; i++) { c[i] = a[i] * b[i]; } } LIBXSMM_INLINE void matrix_eltwise_fma(int size, float *a, float *b, float *c) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < size; i++) { c[i] += a[i] * b[i]; } } LIBXSMM_INLINE void matrix_eltwise_mult_ld_a(int m, int n, int ld, float *a, float *b, float *c) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < m*n; i++) { int row = i / m; int col = i % m; c[i] = a[row*ld + col] * b[i]; } } LIBXSMM_INLINE void matrix_eltwise_mult_ld_ab(int m, int n, int ld, float *a, float *b, float *c) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < m*n; i++) { int row = i / m; int col = i % m; c[i] = a[row*ld + col] * b[row*ld + col]; } } LIBXSMM_INLINE void matrix_eltwise_mult_ld_c(int m, int n, int ld, float *a, float *b, float *c) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < m*n; i++) { int row = i / m; int col = i % m; c[row*ld + col] = a[i] * b[i]; } } LIBXSMM_INLINE void matrix_sigmoid(int size, float *src, float *dst) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < size; i++) { const float exp_value = (float)exp((double) -src[i]); dst[i] = 1.0f / (1.0f + exp_value); } } LIBXSMM_INLINE void matrix_sigmoid_ld(int m, int n, int ld, float *src, float 
*dst) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < m*n; i++) { int row = i / m; int col = i % m; const float exp_value = (float)exp((double) -src[row*ld + col]); dst[row*ld + col] = 1.0f / (1.0f + exp_value); } } LIBXSMM_INLINE void matrix_tanh(int size, float *src, float *dst) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < size; i++) { dst[i] = (float)tanh((double)src[i]); } } LIBXSMM_INLINE void matrix_tanh_ld(int m, int n, int ld, float *src, float *dst) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < m*n; i++) { int row = i / m; int col = i % m; dst[row*ld + col] = (float)tanh((double)src[row*ld + col]); } } LIBXSMM_INLINE void matrix_relu(int size, float *src, float *dst) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < size; i++) { dst[i] = (src[i] > 0.0f) ? src[i] : 0.0f; } } LIBXSMM_INLINE void matrix_sigmoid_inverse(int size, float *src, float *dst) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < size; i++) { const float exp_value = (float)exp((double) -src[i]); const float sig_exp = 1.0f / (1.0f + exp_value); dst[i] = (1.0f - sig_exp)*sig_exp; } } LIBXSMM_INLINE void matrix_tanh_inverse(int size, float *src, float *dst) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < size; i++) { const float tanh_value = (float)tanh((double)src[i]); dst[i] = 1.0f - (tanh_value * tanh_value); } } LIBXSMM_INLINE void matrix_relu_inverse(int size, float *src, float *dst) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < size; i++) { dst[i] = (src[i] > 0.0f) ? 
1.0f : 0.0f;
  }
}

/* Out-of-place transpose of a rows-by-cols matrix via libxsmm's OpenMP
 * transpose primitive. */
LIBXSMM_INLINE void matrix_transpose(int rows, int cols, float *src, float *dst) {
  libxsmm_otrans_omp(dst, src, sizeof(float), cols, rows, cols/*ldi*/, rows/*ldo*/);
}

/* Plain element-wise copy of 'size' floats. */
LIBXSMM_INLINE void matrix_copy(int size, float *src, float *dst) {
  int i;
#if defined(_OPENMP)
# pragma omp parallel for private(i)
#endif
  for (i = 0; i < size; i++) {
    dst[i] = src[i];
  }
}

/* Converts fp32 to bf16 by keeping half of the fp32 bit pattern (t.i[1],
 * i.e. the upper 16 bits on little-endian hosts) -- truncation, no rounding. */
LIBXSMM_INLINE void matrix_copy_f32_bf16(int size, float *src, libxsmm_bfloat16 *dst) {
  int i;
#if defined(_OPENMP)
# pragma omp parallel for private(i)
#endif
  for (i = 0; i < size; i++) {
    libxsmm_bfloat16_hp t;
    t.f = src[i];
    dst[i] = t.i[1];
  }
}

/* Widens bf16 to fp32 by placing the bf16 bits in t.i[1] and zeroing the
 * low half -- exact (lossless) conversion. */
LIBXSMM_INLINE void matrix_copy_bf16_f32(int size, libxsmm_bfloat16 *src, float *dst) {
  int i;
#if defined(_OPENMP)
# pragma omp parallel for private(i)
#endif
  for (i = 0; i < size; i++) {
    libxsmm_bfloat16_hp t;
    t.i[1] = src[i];
    t.i[0] = 0;
    dst[i] = t.f;
  }
}

/* Gathers an m-by-n matrix stored with leading dimension 'ld' into a dense
 * buffer (m is the fast dimension). */
LIBXSMM_INLINE void matrix_copy_ld(int m, int n, int ld, float *src, float *dst) {
  int i;
#if defined(_OPENMP)
# pragma omp parallel for private(i)
#endif
  for (i = 0; i < m*n; i++) {
    int row = i / m;
    int col = i % m;
    dst[i] = src[row*ld + col];
  }
}

/* Broadcasts the length-m bias vector 'src' across all n rows of 'dst'
 * (dst uses leading dimension 'ld'). */
LIBXSMM_INLINE void matrix_copy_bias(int m, int n, int ld, float *src, float *dst) {
  int i;
#if defined(_OPENMP)
# pragma omp parallel for private(i)
#endif
  for (i = 0; i < m*n; i++) {
    int row = i / m;
    int col = i % m;
    dst[row*ld + col] = src[col];
  }
}

/* Like matrix_copy_bias, but adds a constant forget-gate bias to every value. */
LIBXSMM_INLINE void matrix_copy_forget_bias(int m, int n, int ld, float *src, float *dst, float forget_bias) {
  int i;
#if defined(_OPENMP)
# pragma omp parallel for private(i)
#endif
  for (i = 0; i < m*n; i++) {
    int row = i / m;
    int col = i % m;
    dst[row*ld + col] = src[col] + forget_bias;
  }
}

/* dst = 1 - src, element-wise. */
LIBXSMM_INLINE void matrix_complement(int size, float *src, float *dst) {
  int i;
#if defined(_OPENMP)
# pragma omp parallel for private(i)
#endif
  for (i = 0; i < size; i++) {
    dst[i] = 1.0f - src[i];
  }
}

/* dst = 1 - src (dense dst, src with leading dimension 'ld'). */
LIBXSMM_INLINE void matrix_complement_ld(int m, int n, int ld, float *src, float *dst) {
  int i;
#if defined(_OPENMP)
# pragma omp parallel for private(i)
#endif
  for (i = 0; i < m*n; i++) {
    int row = i / m;
    int col = i % m;
    dst[i] = 1.0f - src[row*ld + col];
  }
}

/* dst = 1 - src^2, element-wise (tanh-derivative form). */
LIBXSMM_INLINE void matrix_complement_square(int size, float *src, float *dst) {
  int i;
#if defined(_OPENMP)
# pragma omp parallel for private(i)
#endif
  for (i = 0; i < size; i++) {
    dst[i] = 1.0f - (src[i] * src[i]);
  }
}

/* dst = 1 - src^2 (dense dst, src with leading dimension 'ld'). */
LIBXSMM_INLINE void matrix_complement_square_ld(int m, int n, int ld, float *src, float *dst) {
  int i;
#if defined(_OPENMP)
# pragma omp parallel for private(i)
#endif
  for (i = 0; i < m*n; i++) {
    int row = i / m;
    int col = i % m;
    dst[i] = 1.0f - (src[row*ld + col] * src[row*ld + col]);
  }
}

/* Scatters a CxK matrix into one gate slot of a Cx(4K) fused-gate matrix;
 * 'offset' selects the gate column block. */
LIBXSMM_INLINE void convert_ck_c4k_offset(int C, int K, int offset, float *src, float *dst) {
  /* offsets: i--0, c--1, f--2, o--3 */
  int x, y;
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(x);
# pragma omp parallel for private(x, y)
#endif
  for (y = 0; y < C; y++) {
    for (x = 0; x < K; x++) {
      dst[y*4*K + offset*K + x] = src[y*K + x];
    }
  }
}

/* Convenience wrapper: scatter into gate slot 0 of the Cx(4K) matrix. */
LIBXSMM_INLINE void convert_ck_c4k(int C, int K, float *src, float *dst) {
  convert_ck_c4k_offset(C, K, 0, src, dst);
}

/* Converts a CxK fp32 matrix to bf16 (truncation via t.i[1]) while scattering
 * it into gate slot 0 of a Cx(4K) bf16 matrix. */
LIBXSMM_INLINE void convert_ck_f32_to_c4k_bf16(int C, int K, float *src, libxsmm_bfloat16 *dst) {
  int x, y;
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(x);
# pragma omp parallel for private(x, y)
#endif
  for (y = 0; y < C; y++) {
    for (x = 0; x < K; x++) {
      libxsmm_bfloat16_hp t;
      t.f = src[y*K + x];
      dst[y*4*K + x] = t.i[1];
    }
  }
}

/* Splits a Cx(4K) fused-gate matrix into four stacked CxK gate matrices. */
LIBXSMM_INLINE void convert_c4k_4ck(int C, int K, float *src, float *dst) {
  /* offsets: i--0, c--1, f--2, o--3 */
  int x, y, offset;
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(x); LIBXSMM_OMP_VAR(y);
# pragma omp parallel for private(x, y, offset)
#endif
  for (offset = 0; offset < 4; offset++) {
    for (y = 0; y < C; y++) {
      for (x = 0; x < K; x++) {
        dst[offset*C*K + y*K + x] = src[y*4*K + offset*K + x];
      }
    }
  }
}

/* Scatters a CxK matrix into slot 0 of a Cx(3K) fused-gate matrix (GRU). */
LIBXSMM_INLINE void convert_ck_c3k(int C, int K, float *src, float *dst) {
  int x, y;
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(x);
# pragma omp parallel for private(x, y)
#endif
  for (y = 0; y < C; y++) {
    for (x = 0; x < K; x++) {
      dst[y*3*K + x] = src[y*K + x];
    }
  }
}

/* Copies an NxK matrix into the leading K columns of an NxCK matrix. */
LIBXSMM_INLINE void convert_nk_nck(int N, int K, int CK, float *src, float *dst) {
  int x, y;
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(x);
# pragma omp parallel for private(x, y)
#endif
  for (y = 0; y < N; y++) {
    for (x = 0; x < K; x++) {
      dst[y*CK + x] = src[y*K + x];
    }
  }
}

/* Reference forward convolution (NCHW input, KCRS filter, fp32).
 * Input/output planes may carry physical padding (ifhp/ifwp, ofhp/ofwp);
 * the VLA bases are shifted by (pad_h * pitch + pad_w) to address the
 * logical origin. Optional compile-time fusions: bias init (USE_FUSED_BIAS*)
 * and ReLU on the output (USE_FUSED_RELU*). */
LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float* output, const float* filter, const float* bias) {
  int nImg      = param->nImg;
  int nIfm      = param->nIfm;
  int nOfm      = param->nOfm;
  int ifhp      = param->ifhp;
  int ifwp      = param->ifwp;
  int ofhp      = param->ofhp;
  int ofwp      = param->ofwp;
  int ifh       = param->ifh;
  int ifw       = param->ifw;
  int ofh       = param->ofh;
  int ofw       = param->ofw;
  int pad_h     = param->pad_h;
  int pad_w     = param->pad_w;
  int pad_h_in  = param->pad_h_in;
  int pad_w_in  = param->pad_w_in;
  int pad_h_out = param->pad_h_out;
  int pad_w_out = param->pad_w_out;
  int kh        = param->kh;
  int kw        = param->kw;
  int stride_h  = param->stride_h;
  int stride_w  = param->stride_w;
  /* loop counters */
  int img, ofm, ifm, oj, oi, ij, ii, kj, ki;
  LIBXSMM_VLA_DECL(4,       float, output_t, output + (pad_h_out * ofwp + pad_w_out), nOfm, ofhp, ofwp);
  LIBXSMM_VLA_DECL(4, const float, input_t,  input  + (pad_h_in * ifwp + pad_w_in), nIfm, ifhp, ifwp);
  LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw);
#if defined(USE_FUSED_BIAS) || defined(USE_FUSED_BIAS_RELU)
  /* pre-fill every output pixel with its channel bias */
#if defined(_OPENMP)
# pragma omp parallel for LIBXSMM_OPENMP_COLLAPSE(2) private(img, ofm, ifm, oj, oi, ij, ii, kj, ki)
#endif
  for (img = 0; img < nImg; ++img) {
    for (ofm = 0; ofm < nOfm; ++ofm) {
      for (oj = 0; oj < ofh; ++oj) {
        for (oi = 0; oi < ofw; ++oi) {
          LIBXSMM_VLA_ACCESS(  4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) = bias[ofm];
        }
      }
    }
  }
#else
  LIBXSMM_UNUSED(bias);
#endif
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(ofm); LIBXSMM_OMP_VAR(oj); LIBXSMM_OMP_VAR(oi);
  LIBXSMM_OMP_VAR(ifm); LIBXSMM_OMP_VAR(ij); LIBXSMM_OMP_VAR(ii); LIBXSMM_OMP_VAR(kj); LIBXSMM_OMP_VAR(ki);
# pragma omp parallel for LIBXSMM_OPENMP_COLLAPSE(2) private(img, ofm, ifm, oj, oi, ij, ii, kj, ki)
#endif
  for (img = 0; img < nImg; ++img) {
    for (ofm = 0; ofm < nOfm; ++ofm) {
      for (ifm = 0; ifm < nIfm; ++ifm) {
        for (oj = 0; oj < ofh; ++oj) {
          ij = oj * stride_h - pad_h;
          for (oi = 0; oi < ofw; ++oi) {
            ii = oi * stride_w - pad_w;
            for (kj = 0; kj < kh; ++kj) {
              if (ij+kj < 0 || ij+kj >= ifh) continue; /* logical padding */
              for (ki = 0; ki < kw; ++ki) {
                if (ii+ki < 0 || ii+ki >= ifw) continue; /* logical padding */
                LIBXSMM_VLA_ACCESS(  4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) +=
                  LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp)
                  * LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw);
              }
            }
          }
        }
      }
#if defined(USE_FUSED_RELU) || defined(USE_FUSED_BIAS_RELU)
      /* fused ReLU over the finished (img, ofm) plane */
      for (oj = 0; oj < ofh; ++oj) {
        for (oi = 0; oi < ofw; ++oi) {
          LIBXSMM_VLA_ACCESS(  4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) =
            (LIBXSMM_VLA_ACCESS(  4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) < 0.0f) ?
            0.0f : LIBXSMM_VLA_ACCESS(  4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp);
        }
      }
#endif
    }
  }
}

/* Reference backward convolution: accumulates input gradients from output
 * gradients and the (unchanged) filter. Optionally masks the input gradient
 * by the forward activation (ReLU backward), using naive_input_save. */
LIBXSMM_INLINE void naive_conv_bp(naive_conv_t* param, float* input, const float* output, const float* filter, const float* naive_input_save) {
  int nImg      = param->nImg;
  int nIfm      = param->nIfm;
  int nOfm      = param->nOfm;
  int ifhp      = param->ifhp;
  int ifwp      = param->ifwp;
  int ofhp      = param->ofhp;
  int ofwp      = param->ofwp;
  int ifh       = param->ifh;
  int ifw       = param->ifw;
  int ofh       = param->ofh;
  int ofw       = param->ofw;
  int pad_h     = param->pad_h;
  int pad_w     = param->pad_w;
  int pad_h_in  = param->pad_h_in;
  int pad_w_in  = param->pad_w_in;
  int pad_h_out = param->pad_h_out;
  int pad_w_out = param->pad_w_out;
  int kh        = param->kh;
  int kw        = param->kw;
  int stride_h  = param->stride_h;
  int stride_w  = param->stride_w;
  /* loop counters */
  int img, ofm, ifm, oj, oi, ij, ii, kj, ki;
  LIBXSMM_VLA_DECL(4, const float, output_t, output + (pad_h_out * ofwp + pad_w_out), nOfm, ofhp, ofwp);
  LIBXSMM_VLA_DECL(4,       float, input_t,  input  + (pad_h_in * ifwp + pad_w_in), nIfm, ifhp, ifwp);
  LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw);
#if (defined(USE_FUSED_RELU_BWD) || defined(USE_FUSED_BATCH_STATS_BWD))
  LIBXSMM_VLA_DECL(4, const float, naive_input_t, naive_input_save + (pad_h_in * ifwp + pad_w_in), nIfm, ifhp, ifwp);
#else
  LIBXSMM_UNUSED(naive_input_save);
#endif
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(ofm); LIBXSMM_OMP_VAR(oj); LIBXSMM_OMP_VAR(oi);
  LIBXSMM_OMP_VAR(ifm); LIBXSMM_OMP_VAR(ij); LIBXSMM_OMP_VAR(ii); LIBXSMM_OMP_VAR(kj); LIBXSMM_OMP_VAR(ki);
# pragma omp parallel for LIBXSMM_OPENMP_COLLAPSE(2) private(img, ofm, ifm, oj, oi, ij, ii, kj, ki)
#endif
  for (img = 0; img < nImg; ++img) {
    for (ifm = 0; ifm < nIfm; ++ifm) {
      for (ofm = 0; ofm < nOfm; ++ofm) {
        for (oj = 0; oj < ofh; ++oj) {
          ij = oj * stride_h - pad_h;
          for (oi = 0; oi < ofw; ++oi) {
            ii = oi * stride_w - pad_w;
            for (kj = 0; kj < kh; ++kj) {
              if (ij+kj < 0 || ij+kj >= ifh) continue;
              for (ki = 0; ki < kw; ++ki) {
                if (ii+ki < 0 || ii+ki >= ifw) continue;
                LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp) +=
                  LIBXSMM_VLA_ACCESS(4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp)
                  * LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw);
              }
            }
          }
        }
      }
#if (defined(USE_FUSED_RELU_BWD) || defined(USE_FUSED_BATCH_STATS_BWD))
      /* zero the gradient wherever the saved forward input was zero (ReLU bwd) */
      for (ij = 0; ij < ifh; ij++) {
        for (ii = 0; ii < ifw; ii++) {
          if ( LIBXSMM_VLA_ACCESS(4, naive_input_t, img, ifm, ij, ii , nIfm, ifhp, ifwp) == 0.0 ) {
            LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij, ii , nIfm, ifhp, ifwp) = 0.0;
          }
        }
      }
#endif
    }
  }
}

/* Reference weight-update convolution: accumulates filter gradients from
 * input and output-gradient tensors. */
LIBXSMM_INLINE void naive_conv_wu(naive_conv_t* param, const float* input, const float* output, float* filter) {
  int nImg      = param->nImg;
  int nIfm      = param->nIfm;
  int nOfm      = param->nOfm;
  int ifhp      = param->ifhp;
  int ifwp      = param->ifwp;
  int ofhp      = param->ofhp;
  int ofwp      = param->ofwp;
  int ifh       = param->ifh;
  int ifw       = param->ifw;
  int ofh       = param->ofh;
  int ofw       = param->ofw;
  int pad_h     = param->pad_h;
  int pad_w     = param->pad_w;
  int pad_h_in  = param->pad_h_in;
  int pad_w_in  = param->pad_w_in;
  int pad_h_out = param->pad_h_out;
  int pad_w_out = param->pad_w_out;
  int kh        = param->kh;
  int kw        = param->kw;
  int stride_h  = param->stride_h;
  int stride_w  = param->stride_w;
  /* loop counters */
  int img, ofm, ifm, oj, oi, ij, ii, kj, ki;
  LIBXSMM_VLA_DECL(4, const float, output_t, output + (pad_h_out * ofwp + pad_w_out), nOfm, ofhp, ofwp);
  LIBXSMM_VLA_DECL(4, const float, input_t,  input  + (pad_h_in * ifwp + pad_w_in), nIfm, ifhp, ifwp);
  LIBXSMM_VLA_DECL(4,       float, filter_t, filter, nIfm, kh, kw);
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(ofm); LIBXSMM_OMP_VAR(oj); LIBXSMM_OMP_VAR(oi);
  LIBXSMM_OMP_VAR(ifm); LIBXSMM_OMP_VAR(ij); LIBXSMM_OMP_VAR(ii); LIBXSMM_OMP_VAR(kj); LIBXSMM_OMP_VAR(ki);
# pragma omp parallel for LIBXSMM_OPENMP_COLLAPSE(2) private(img, ofm, ifm, oj, oi, ij, ii, kj, ki)
#endif
  for (ofm = 0; ofm < nOfm; ++ofm) {
    for (ifm = 0; ifm < nIfm; ++ifm) {
      for (img = 0; img < nImg; ++img) {
        for (oj = 0; oj < ofh;
++oj) { ij = oj * stride_h - pad_h; for (oi = 0; oi < ofw; ++oi) { ii = oi * stride_w - pad_w; for (kj = 0; kj < kh; ++kj) { if (ij+kj < 0 || ij+kj >= ifh) continue; for (ki = 0; ki < kw; ++ki) { if (ii+ki < 0 || ii+ki >= ifw) continue; LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw) += LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp) * LIBXSMM_VLA_ACCESS(4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp); } } } } } } } } LIBXSMM_INLINE void naive_conv_fp_int16fp32(naive_conv_t* param, const short* input, float* output, const short* filter) { int nImg = param->nImg; int nIfm = param->nIfm; int nOfm = param->nOfm; int ifhp = param->ifhp; int ifwp = param->ifwp; int ofhp = param->ofhp; int ofwp = param->ofwp; int ifh = param->ifh; int ifw = param->ifw; int ofh = param->ofh; int ofw = param->ofw; int pad_h = param->pad_h; int pad_w = param->pad_w; int pad_h_in = param->pad_h_in; int pad_w_in = param->pad_w_in; int pad_h_out = param->pad_h_out; int pad_w_out = param->pad_w_out; int kh = param->kh; int kw = param->kw; int stride_h = param->stride_h; int stride_w = param->stride_w; /* loop counters */ int img, ofm, ifm, oj, oi, ij, ii, kj, ki; LIBXSMM_VLA_DECL(4, float, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp); LIBXSMM_VLA_DECL(4, const short, input_t, input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp); LIBXSMM_VLA_DECL(4, const short, filter_t, filter, nIfm, kh, kw); #if defined(_OPENMP) LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(ofm); LIBXSMM_OMP_VAR(oj); LIBXSMM_OMP_VAR(oi); LIBXSMM_OMP_VAR(ifm); LIBXSMM_OMP_VAR(ij); LIBXSMM_OMP_VAR(ii); LIBXSMM_OMP_VAR(kj); LIBXSMM_OMP_VAR(ki); # pragma omp parallel for LIBXSMM_OPENMP_COLLAPSE(2) private(img, ofm, ifm, oj, oi, ij, ii, kj, ki) #endif for (img = 0; img < nImg; ++img) { for (ofm = 0; ofm < nOfm; ++ofm) { for (ifm = 0; ifm < nIfm; ++ifm) { for (oj = 0; oj < ofh; ++oj) { ij = oj * stride_h - pad_h; for (oi = 0; oi < ofw; ++oi) { ii = oi * stride_w - 
pad_w; for (kj = 0; kj < kh; ++kj) { if (ij+kj < 0 || ij+kj >= ifh) continue; for (ki = 0; ki < kw; ++ki) { if (ii+ki < 0 || ii+ki >= ifw) continue; LIBXSMM_VLA_ACCESS(4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) += (1.f * LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp)) * (1.f * LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw)); } } } } } } } } LIBXSMM_INLINE void naive_conv_fp_int16int32(naive_conv_t* param, const short* input, int* output, const short* filter) { int nImg = param->nImg; int nIfm = param->nIfm; int nOfm = param->nOfm; int ifhp = param->ifhp; int ifwp = param->ifwp; int ofhp = param->ofhp; int ofwp = param->ofwp; int ifh = param->ifh; int ifw = param->ifw; int ofh = param->ofh; int ofw = param->ofw; int pad_h = param->pad_h; int pad_w = param->pad_w; int pad_h_in = param->pad_h_in; int pad_w_in = param->pad_w_in; int pad_h_out = param->pad_h_out; int pad_w_out = param->pad_w_out; int kh = param->kh; int kw = param->kw; int stride_h = param->stride_h; int stride_w = param->stride_w; /* loop counters */ int img, ofm, ifm, oj, oi, ij, ii, kj, ki; LIBXSMM_VLA_DECL(4, int, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp); LIBXSMM_VLA_DECL(4, const short, input_t, input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp); LIBXSMM_VLA_DECL(4, const short, filter_t, filter, nIfm, kh, kw); #if defined(_OPENMP) LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(ofm); LIBXSMM_OMP_VAR(oj); LIBXSMM_OMP_VAR(oi); LIBXSMM_OMP_VAR(ifm); LIBXSMM_OMP_VAR(ij); LIBXSMM_OMP_VAR(ii); LIBXSMM_OMP_VAR(kj); LIBXSMM_OMP_VAR(ki); # pragma omp parallel for LIBXSMM_OPENMP_COLLAPSE(2) private(img, ofm, ifm, oj, oi, ij, ii, kj, ki) #endif for (img = 0; img < nImg; ++img) { for (ofm = 0; ofm < nOfm; ++ofm) { for (ifm = 0; ifm < nIfm; ++ifm) { for (oj = 0; oj < ofh; ++oj) { ij = oj * stride_h - pad_h; for (oi = 0; oi < ofw; ++oi) { ii = oi * stride_w - pad_w; for (kj = 0; kj < kh; ++kj) { if (ij+kj < 0 || ij+kj >= ifh) continue; 
for (ki = 0; ki < kw; ++ki) { if (ii+ki < 0 || ii+ki >= ifw) continue; LIBXSMM_VLA_ACCESS( 4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) += (int) ( (int)LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp)) * ( (int) LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw)); } } } } } } } } LIBXSMM_INLINE void naive_conv_fp_int8int32(naive_conv_t* param, const unsigned char* input, int* output, const char* filter) { int nImg = param->nImg; int nIfm = param->nIfm; int nOfm = param->nOfm; int ifhp = param->ifhp; int ifwp = param->ifwp; int ofhp = param->ofhp; int ofwp = param->ofwp; int ifh = param->ifh; int ifw = param->ifw; int ofh = param->ofh; int ofw = param->ofw; int pad_h = param->pad_h; int pad_w = param->pad_w; int pad_h_in = param->pad_h_in; int pad_w_in = param->pad_w_in; int pad_h_out = param->pad_h_out; int pad_w_out = param->pad_w_out; int kh = param->kh; int kw = param->kw; int stride_h = param->stride_h; int stride_w = param->stride_w; /* loop counters */ int img, ofm, ifm, oj, oi, ij, ii, kj, ki; LIBXSMM_VLA_DECL(4, int, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp); LIBXSMM_VLA_DECL(4, const unsigned char, input_t, input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp); LIBXSMM_VLA_DECL(4, const char, filter_t, filter, nIfm, kh, kw); #if defined(_OPENMP) LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(ofm); LIBXSMM_OMP_VAR(oj); LIBXSMM_OMP_VAR(oi); LIBXSMM_OMP_VAR(ifm); LIBXSMM_OMP_VAR(ij); LIBXSMM_OMP_VAR(ii); LIBXSMM_OMP_VAR(kj); LIBXSMM_OMP_VAR(ki); # pragma omp parallel for LIBXSMM_OPENMP_COLLAPSE(2) private(img, ofm, ifm, oj, oi, ij, ii, kj, ki) #endif for (img = 0; img < nImg; ++img) { for (ofm = 0; ofm < nOfm; ++ofm) { for (ifm = 0; ifm < nIfm; ++ifm) { for (oj = 0; oj < ofh; ++oj) { ij = oj * stride_h - pad_h; for (oi = 0; oi < ofw; ++oi) { ii = oi * stride_w - pad_w; for (kj = 0; kj < kh; ++kj) { if (ij+kj < 0 || ij+kj >= ifh) continue; for (ki = 0; ki < kw; ++ki) { if (ii+ki < 0 || ii+ki >= 
ifw) continue; LIBXSMM_VLA_ACCESS(4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) += (int) LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp) * LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw); } } } } } } } } LIBXSMM_INLINE void naive_fullyconnected_fp(naive_fullyconnected_t* param, const float* input_ptr, float* output_ptr, const float* filter_ptr) { const int nImg = param->N; const int nIFm = param->C; const int nOFm = param->K; int img, ifm, ofm; LIBXSMM_VLA_DECL(2, const float, input, input_ptr, nIFm); LIBXSMM_VLA_DECL(2, const float, filter, filter_ptr, nIFm); LIBXSMM_VLA_DECL(2, float, output, output_ptr, nOFm); #if defined(_OPENMP) LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(ifm); LIBXSMM_OMP_VAR(ofm); # pragma omp parallel for private(img, ofm, ifm) #endif for (ofm = 0; ofm < nOFm; ++ofm) { for(img = 0; img < nImg; ++img) { LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) = (float)0; for (ifm = 0; ifm < nIFm; ++ifm) { LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) += LIBXSMM_VLA_ACCESS(2, filter, ofm, ifm, nIFm) * LIBXSMM_VLA_ACCESS(2, input, img, ifm, nIFm); } } } } LIBXSMM_INLINE void naive_fullyconnected_bp(naive_fullyconnected_t* param, float* delinput_ptr, const float* deloutput_ptr, const float* filter_ptr) { const int nImg = param->N; const int nIFm = param->C; const int nOFm = param->K; int img, ifm, ofm; LIBXSMM_VLA_DECL(2, float, dinput, delinput_ptr, nIFm); LIBXSMM_VLA_DECL(2, const float, filter, filter_ptr, nIFm); LIBXSMM_VLA_DECL(2, const float, doutput, deloutput_ptr, nOFm); #if defined(_OPENMP) LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(ofm); LIBXSMM_OMP_VAR(ifm); # pragma omp parallel for private(img, ofm, ifm) #endif for (ifm = 0; ifm < nIFm; ++ifm) { for(img = 0; img < nImg; ++img) { LIBXSMM_VLA_ACCESS(2, dinput, img, ifm, nIFm) = (float)0; for (ofm = 0; ofm < nOFm; ++ofm) { LIBXSMM_VLA_ACCESS(2, dinput, img, ifm, nIFm) += LIBXSMM_VLA_ACCESS(2, filter, ofm, ifm, nIFm) * LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, 
nOFm);
      }
    }
  }
}

/* Fully-connected forward with optional fused epilogue selected by
 * param->fuse_type: 0=none, 1=bias, 2=ReLU, 3=sigmoid (tanh half-angle form),
 * 4=bias+ReLU, 5=bias+sigmoid. */
LIBXSMM_INLINE void naive_fullyconnected_fused_fp(naive_fullyconnected_t* param, const float* input_ptr, float* output_ptr, const float* filter_ptr, const float* bias_ptr) {
  const int nImg = param->N;
  const int nIFm = param->C;
  const int nOFm = param->K;
  int img, ifm, ofm;
  LIBXSMM_VLA_DECL(2, const float, input,  input_ptr,  nIFm);
  LIBXSMM_VLA_DECL(2, const float, filter, filter_ptr, nIFm);
  LIBXSMM_VLA_DECL(2,       float, output, output_ptr, nOFm);
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(ifm); LIBXSMM_OMP_VAR(ofm);
# pragma omp parallel for private(img, ofm, ifm)
#endif
  for (ofm = 0; ofm < nOFm; ++ofm) {
    for(img = 0; img < nImg; ++img) {
      LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) = (float)0;
      for (ifm = 0; ifm < nIFm; ++ifm) {
        LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) +=
          LIBXSMM_VLA_ACCESS(2, filter, ofm, ifm, nIFm) * LIBXSMM_VLA_ACCESS(2, input, img, ifm, nIFm);
      }
      if ( param->fuse_type == 1 ) {
        /* bias */
        LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) += bias_ptr[ofm];
      } else if ( param->fuse_type == 2 ) {
        /* ReLU */
        LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) =
          ( LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) > 0 ) ? LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) : 0;
      } else if ( param->fuse_type == 3 ) {
        /* sigmoid via sigma(x) = (tanh(x/2)+1)/2 */
        LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) =
          ((float)tanh((double)LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm)/2.0)+1.0f)/2.0f;
      } else if ( param->fuse_type == 4 ) {
        /* bias + ReLU */
        LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) += bias_ptr[ofm];
        LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) =
          ( LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) > 0 ) ? LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) : 0;
      } else if ( param->fuse_type == 5 ) {
        /* bias + sigmoid */
        LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) += bias_ptr[ofm];
        LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) =
          ((float)tanh((double)LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm)/2.0)+1.0f)/2.0f;
      }
    }
  }
}

/* Fully-connected backward with fused epilogue backprop. First pass (only if
 * fuse_type != 0): applies the activation derivative to doutput in place and
 * reduces the bias gradient; NOTE the sigmoid derivative (types 3/5) uses the
 * form y*(1-y) applied to doutput itself, which assumes doutput already holds
 * the forward activation -- TODO confirm against callers. Second pass:
 * dinput = filter^T * doutput. */
LIBXSMM_INLINE void naive_fullyconnected_fused_bp(naive_fullyconnected_t* param, float* delinput_ptr, float* deloutput_ptr, const float* filter_ptr, float* delbias_ptr, const float* output_ptr) {
  const int nImg = param->N;
  const int nIFm = param->C;
  const int nOFm = param->K;
  int img, ifm, ofm;
  LIBXSMM_VLA_DECL(2,       float, dinput,  delinput_ptr,  nIFm);
  LIBXSMM_VLA_DECL(2, const float, filter,  filter_ptr,    nIFm);
  LIBXSMM_VLA_DECL(2,       float, doutput, deloutput_ptr, nOFm);
  LIBXSMM_VLA_DECL(2, const float, output,  output_ptr,    nOFm);
  if ( param->fuse_type != 0 ) {
#if defined(_OPENMP)
    LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(ofm);
# pragma omp parallel for private(img, ofm)
#endif
    for (ofm = 0; ofm < nOFm; ++ofm) {
      float dbias = 0.0f;
      for(img = 0; img < nImg; ++img) {
        if ( param->fuse_type == 1 ) {
          dbias += LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm);
        } else if ( param->fuse_type == 2 ) {
          /* ReLU backward: gate by forward output sign */
          LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm) =
            ( LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) > 0 ) ? LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm) : 0;
        } else if ( param->fuse_type == 3 ) {
          LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm) =
            LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm)*(1.0f-LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm));
        } else if ( param->fuse_type == 4 ) {
          LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm) =
            ( LIBXSMM_VLA_ACCESS(2, output, img, ofm, nOFm) > 0 ) ? LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm) : 0;
          dbias += LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm);
        } else if ( param->fuse_type == 5 ) {
          LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm) =
            LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm)*(1.0f-LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm));
          dbias += LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm);
        }
      }
      delbias_ptr[ofm] = dbias;
    }
  }
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(ofm); LIBXSMM_OMP_VAR(ifm);
# pragma omp parallel for private(img, ofm, ifm)
#endif
  for (ifm = 0; ifm < nIFm; ++ifm) {
    for(img = 0; img < nImg; ++img) {
      LIBXSMM_VLA_ACCESS(2, dinput, img, ifm, nIFm) = (float)0;
      for (ofm = 0; ofm < nOFm; ++ofm) {
        LIBXSMM_VLA_ACCESS(2, dinput, img, ifm, nIFm) +=
          LIBXSMM_VLA_ACCESS(2, filter, ofm, ifm, nIFm) * LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm);
      }
    }
  }
}

/* Fully-connected weight update: dfilter[ofm][ifm] = sum_img
 * doutput[img][ofm] * input[img][ifm]. */
LIBXSMM_INLINE void naive_fullyconnected_wu(naive_fullyconnected_t* param, const float* input_ptr, const float* deloutput_ptr, float* delfilter_ptr) {
  const int nImg = param->N;
  const int nIFm = param->C;
  const int nOFm = param->K;
  int img, ifm, ofm;
  LIBXSMM_VLA_DECL(2, const float, input,   input_ptr,     nIFm);
  LIBXSMM_VLA_DECL(2,       float, dfilter, delfilter_ptr, nIFm);
  LIBXSMM_VLA_DECL(2, const float, doutput, deloutput_ptr, nOFm);
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(ofm); LIBXSMM_OMP_VAR(ifm);
# pragma omp parallel for private(img, ofm, ifm)
#endif
  for (ofm = 0; ofm < nOFm; ++ofm) {
    for (ifm = 0; ifm < nIFm; ++ifm) {
      LIBXSMM_VLA_ACCESS(2, dfilter, ofm, ifm, nIFm) = (float)0;
      for(img = 0; img < nImg; ++img) {
        LIBXSMM_VLA_ACCESS(2, dfilter, ofm, ifm, nIFm) +=
          LIBXSMM_VLA_ACCESS(2, doutput, img, ofm, nOFm) * LIBXSMM_VLA_ACCESS(2, input, img, ifm, nIFm);
      }
    }
  }
}

/* Reference pooling forward. param->type: 0 = max pooling (also records the
 * argmax index into mask), 1 = average pooling. Uses a per-thread ofh*ofw
 * scratch tile. */
LIBXSMM_INLINE void naive_pooling_fp(naive_pooling_t* param, const float* input_ptr, float* output_ptr, int* mask_ptr) {
  const int nImg = param->N;
  const int nFm = param->C;
  const int ifh = param->H;
  const int ifw = param->W;
  const int sh = param->stride_h;
  const int sw = param->stride_w;
  const int r = param->R;
  const int s = param->S;
  const int pad_h = param->pad_h;
  const int pad_w = param->pad_w;
  const int ofh = (ifh + 2*pad_h - r)/sh + 1;
  const int ofw = (ifw + 2*pad_w - s)/sw + 1;
  int img, fm;
  LIBXSMM_VLA_DECL(4, const float, input,  input_ptr,  nFm, ifh, ifw);
  LIBXSMM_VLA_DECL(4,       int,   mask,   mask_ptr,   nFm, ofh, ofw);
  LIBXSMM_VLA_DECL(4,       float, output, output_ptr, nFm, ofh, ofw);
#if defined(_OPENMP)
  float* tmp_buffer = (float*)malloc(sizeof(float)*ofh*ofw*omp_get_max_threads());
  LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(fm);
# pragma omp parallel for private(img, fm)
#else
  float* tmp_buffer = (float*)malloc(sizeof(float)*ofh*ofw);
#endif
  for (img = 0; img < nImg; img++) {
    for (fm = 0; fm < nFm; fm++) {
#if defined(_OPENMP)
      float* lcl_buffer_ptr = tmp_buffer + (ofh*ofw*omp_get_thread_num());
#else
      float* lcl_buffer_ptr = tmp_buffer;
#endif
      LIBXSMM_VLA_DECL(2, float, lcl_buffer, lcl_buffer_ptr, ofw);
      int i, ho, wo, hi, wi, kh, kw;
      /* initialize the scratch tile: -FLT_MAX for max, 0 for average */
      if (param->type == 0 ) {
        for ( i = 0; i < ofh*ofw; i++ ) {
          lcl_buffer_ptr[i] = -FLT_MAX;
        }
      } else if (param->type == 1) {
        for ( i = 0; i < ofh*ofw; i++ ) {
          lcl_buffer_ptr[i] = 0.0;
        }
      } else {
        /* shouldn't happen */
      }
      for( ho = 0; ho < ofh; ho++ ) {
        hi = (ho * sh) - pad_h;
        for( wo = 0; wo < ofw; wo++ ) {
          wi = (wo * sw) - pad_w;
          for( kh = 0; kh < r; kh++ ) {
            if (hi+kh < 0 || hi+kh >= ifh) continue;
            for( kw = 0; kw < s; kw++ ) {
              if (wi+kw < 0 || wi+kw >= ifw) continue;
              if ( param->type == 0 ) {
                /* max pooling: remember the flat input index of the maximum */
                const int index = (hi+kh)*ifw + wi+kw;
                if ( LIBXSMM_VLA_ACCESS(4, input, img, fm, hi+kh, wi+kw, nFm, ifh, ifw) > LIBXSMM_VLA_ACCESS(2, lcl_buffer, ho, wo, ofw) ) {
                  LIBXSMM_VLA_ACCESS(2, lcl_buffer, ho, wo, ofw) = LIBXSMM_VLA_ACCESS(4, input, img, fm, hi+kh, wi+kw, nFm, ifh, ifw);
                  LIBXSMM_VLA_ACCESS(4, mask, img, fm, ho, wo, nFm, ofh, ofw) = index;
                }
              } else if ( param->type == 1 ) {
                LIBXSMM_VLA_ACCESS(2, lcl_buffer, ho, wo, ofw) += LIBXSMM_VLA_ACCESS(4, input, img, fm, hi+kh, wi+kw, nFm, ifh, ifw);
              } else {
/* shouldn't happen */ } } } } } if (param->type == 0 ) { for( ho = 0; ho < ofh; ho++ ) { for( wo = 0; wo < ofw; wo++ ) { LIBXSMM_VLA_ACCESS(4, output, img, fm, ho, wo, nFm, ofh, ofw) = LIBXSMM_VLA_ACCESS(2, lcl_buffer, ho, wo, ofw); } } } else if (param->type == 1) { for( ho = 0; ho < ofh; ho++ ) { for( wo = 0; wo < ofw; wo++ ) { LIBXSMM_VLA_ACCESS(4, output, img, fm, ho, wo, nFm, ofh, ofw) = LIBXSMM_VLA_ACCESS(2, lcl_buffer, ho, wo, ofw) * (1.0f/(((float)r) * ((float)s))); } } } else { /* shouldn't happen */ } } } free( tmp_buffer ); } LIBXSMM_INLINE void naive_pooling_bp(naive_pooling_t* param, float* dinput_ptr, const float* doutput_ptr, const int* mask_ptr) { const int nImg = param->N; const int nFm = param->C; const int ifh = param->H; const int ifw = param->W; const int sh = param->stride_h; const int sw = param->stride_w; const int r = param->R; const int s = param->S; const int pad_h = param->pad_h; const int pad_w = param->pad_w; const int ofh = (ifh + 2*pad_h - r)/sh + 1; const int ofw = (ifw + 2*pad_w - s)/sw + 1; int img, fm; LIBXSMM_VLA_DECL(4, float, dinput, dinput_ptr, nFm, ifh, ifw); LIBXSMM_VLA_DECL(4, const int , mask, mask_ptr, nFm, ofh, ofw); LIBXSMM_VLA_DECL(4, const float, doutput, doutput_ptr, nFm, ofh, ofw); #if defined(_OPENMP) float* tmp_buffer = (float*)malloc(sizeof(float)*ifh*ifw*omp_get_max_threads()); LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(fm); # pragma omp parallel for private(img, fm) #else float* tmp_buffer = (float*)malloc(sizeof(float)*ofh*ofw); #endif for (img = 0; img < nImg; img++) { for (fm = 0; fm < nFm; fm++) { #if defined(_OPENMP) float* lcl_buffer_ptr = tmp_buffer + (ifh*ifw*omp_get_thread_num()); #else float* lcl_buffer_ptr = tmp_buffer; #endif LIBXSMM_VLA_DECL(2, float, lcl_buffer, lcl_buffer_ptr, ifw); int i, ho, wo, hi, wi, kh, kw; for ( i = 0; i < ifh*ifw; i++ ) { lcl_buffer_ptr[i] = 0.0; } if (param->type == 0 ) { for( ho = 0; ho < ofh; ho++ ) { for( wo = 0; wo < ofw; wo++ ) { lcl_buffer_ptr[LIBXSMM_VLA_ACCESS(4, 
mask, img, fm, ho, wo, nFm, ofh, ofw)] += LIBXSMM_VLA_ACCESS(4, doutput, img, fm, ho, wo, nFm, ofh, ofw); } } } else if ( param->type == 1 ) { for( ho = 0; ho < ofh; ho++ ) { hi = (ho * sh) - pad_h; for( wo = 0; wo < ofw; wo++ ) { wi = (wo * sw) - pad_w; for( kh = 0; kh < r; kh++ ) { if (hi+kh < 0 || hi+kh >= ifh) continue; for( kw = 0; kw < s; kw++ ) { if (wi+kw < 0 || wi+kw >= ifw) continue; LIBXSMM_VLA_ACCESS(2, lcl_buffer, hi+kh, wi+kw, ifw) += ( LIBXSMM_VLA_ACCESS(4, doutput, img, fm, ho, wo, nFm, ofh, ofw) * (1.0f/(((float)r) * ((float)s))) ); } } } } } else { /* shouldn't happen */ } for( hi = 0; hi < ifh; hi++ ) { for( wi = 0; wi < ifw; wi++ ) { LIBXSMM_VLA_ACCESS(4, dinput, img, fm, hi, wi, nFm, ifh, ifw) = LIBXSMM_VLA_ACCESS(2, lcl_buffer, hi, wi, ifw); } } } } free( tmp_buffer ); } LIBXSMM_INLINE void naive_fusedbatchnorm_fp(naive_fusedbatchnorm_t* param, const float* input_ptr, float* output_ptr, const float* input_add_ptr, const float* beta_ptr, const float* gamma_ptr, float* expectval_ptr, float* rcpstddev_ptr, float* variance_ptr) { const int nImg = param->N; const int nFm = param->C; const int ifh = param->H; const int ifw = param->W; const int sh = param->stride_h; const int sw = param->stride_w; const int ofh = ifh/sh; const int ofw = ifw/sw; const float nhw = (float)(nImg * ifh * ifw); const float recp_nhw = 1.0f/nhw; const float sqrt_eps = 1e-7f; int img, fm, hi, wi, ho, wo; LIBXSMM_VLA_DECL(4, const float, input, input_ptr, nFm, ifh, ifw); LIBXSMM_VLA_DECL(4, const float, input_add, input_add_ptr, nFm, ifh, ifw); LIBXSMM_VLA_DECL(4, float, output, output_ptr, nFm, ofh, ofw); if ( param->norm_type == 0 ) { #if defined(_OPENMP) LIBXSMM_OMP_VAR(wi); LIBXSMM_OMP_VAR(hi); # pragma omp parallel for private(img, fm, hi, wi) #endif for (fm = 0; fm < nFm; fm++) { float ch_sum = 0.0f; float ch_sumsq = 0.0f; float tbmean = 0.0f; float tbmeansq = 0.0f; float tsqbmean = 0.0f; float tbrstd = 0.0f; float tvariance = 0.0f; for ( img = 0; img < nImg; img++ ) { 
for ( hi = 0; hi < ifh; hi++ ) { for ( wi = 0; wi < ifw; wi++ ) { const float input_val = LIBXSMM_VLA_ACCESS(4, input, img, fm, hi, wi, nFm, ifh, ifw); ch_sum += input_val; ch_sumsq += (input_val * input_val); } } } tbmean = recp_nhw * ch_sum; tbmeansq = tbmean * tbmean; tsqbmean = recp_nhw * ch_sumsq; tvariance = tsqbmean - tbmeansq; tbrstd = (float)(1.0/sqrt(tvariance + sqrt_eps)); expectval_ptr[fm] = tbmean; rcpstddev_ptr[fm] = tbrstd; variance_ptr[fm] = tvariance; } } #if defined(_OPENMP) LIBXSMM_OMP_VAR(ho); LIBXSMM_OMP_VAR(wo); # pragma omp parallel for private(img, fm, hi, wi, ho, wo) #endif for ( img = 0; img < nImg; img++ ) { for ( fm = 0; fm < nFm; fm++ ) { for ( hi = 0, ho = 0; hi < ifh; hi += sh, ho++ ) { for ( wi = 0, wo = 0; wi < ifw; wi += sw, wo++ ) { const float input_val = LIBXSMM_VLA_ACCESS(4, input, img, fm, hi, wi, nFm, ifh, ifw); const float input_add_val = LIBXSMM_VLA_ACCESS(4, input_add, img, fm, hi, wi, nFm, ifh, ifw); float* output_ptr2 = &LIBXSMM_VLA_ACCESS(4, output, img, fm, ho, wo, nFm, ofh, ofw); /* BN + scale (gamma, beta) */ float o = gamma_ptr[fm]*(input_val - expectval_ptr[fm])*rcpstddev_ptr[fm] + beta_ptr[fm]; /* Eltwise */ if ( (param->fuse_type == 2) || (param->fuse_type == 3) || (param->fuse_type == 5) ) { o += input_add_val; } /* ReLU */ if ( (param->fuse_type == 1) || (param->fuse_type == 3) || (param->fuse_type == 4) || (param->fuse_type == 5) ) { o = ( o < 0.0f ) ? 
0.0f : o; } *output_ptr2 = o; } } } } } LIBXSMM_INLINE void naive_fusedbatchnorm_bp(naive_fusedbatchnorm_t* param, const float* input_ptr, float* dinput_ptr, const float* output_ptr, float* doutput_ptr, float* dinput_add_ptr, const float* beta_ptr, float* del_beta_ptr, const float* gamma_ptr, float* del_gamma_ptr, const float* expectval_ptr, const float* rcpstddev_ptr) { const int nImg = param->N; const int nFm = param->C; const int ifh = param->H; const int ifw = param->W; const int sh = param->stride_h; const int sw = param->stride_w; const int ofh = ifh/sh; const int ofw = ifw/sw; const float nhw = (float)(nImg * ifh * ifw); const float recp_nhw = 1.0f/nhw; int img, fm, hi, wi, ho, wo; LIBXSMM_VLA_DECL(4, const float, input, input_ptr, nFm, ifh, ifw); LIBXSMM_VLA_DECL(4, float, dinput, dinput_ptr, nFm, ifh, ifw); LIBXSMM_VLA_DECL(4, float, dinput_add, dinput_add_ptr, nFm, ifh, ifw); LIBXSMM_VLA_DECL(4, const float, output, output_ptr, nFm, ofh, ofw); LIBXSMM_VLA_DECL(4, float, doutput, doutput_ptr, nFm, ofh, ofw); LIBXSMM_UNUSED(beta_ptr); if ( param->norm_type == 0 ) { #if defined(_OPENMP) LIBXSMM_OMP_VAR(hi); LIBXSMM_OMP_VAR(wi); LIBXSMM_OMP_VAR(ho); LIBXSMM_OMP_VAR(wo); # pragma omp parallel for private(img, fm, hi, wi, ho, wo) #endif for ( fm = 0; fm < nFm; fm++ ) { del_gamma_ptr[fm] = 0.0f; del_beta_ptr[fm] = 0.0f; for ( img = 0; img < nImg; img++ ) { for ( hi = 0, ho = 0; hi < ifh; hi += sh, ho++ ) { for ( wi = 0, wo = 0; wi < ifw; wi += sw, wo++ ) { float* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(4, dinput_add, img, fm, hi, wi, fm, ifh, ifw); const float output_val = LIBXSMM_VLA_ACCESS(4, output, img, fm, ho, wo, fm, ofh, ofw); const float input_val = LIBXSMM_VLA_ACCESS(4, input, img, fm, hi, wi, fm, ifh, ifw); float* del_output_ptr = &LIBXSMM_VLA_ACCESS(4, doutput, img, fm, ho, wo, fm, ofh, ofw); /* ReLU */ if ( (param->fuse_type == 1) || (param->fuse_type == 3) || (param->fuse_type == 4) || (param->fuse_type == 5) ) { *del_output_ptr = (output_val == 0) 
? 0 : *del_output_ptr; } /* elementwise */ if ( (param->fuse_type == 2) || (param->fuse_type == 3) || (param->fuse_type == 5) ) { *del_input_add_ptr = *del_output_ptr; } del_gamma_ptr[fm] += (input_val - expectval_ptr[fm]) * (*del_output_ptr) * rcpstddev_ptr[fm]; del_beta_ptr[fm] += *del_output_ptr; } } } } } #if defined(_OPENMP) # pragma omp parallel for private(img, fm, hi, wi, ho, wo) #endif for ( img = 0; img < nImg; img++ ) { for ( fm = 0; fm < nFm; fm++ ) { for ( hi = 0, ho = 0; hi < ifh; hi += sh, ho++ ) { for ( wi = 0, wo = 0; wi < ifw; wi += sw, wo++) { float* del_input_ptr = &LIBXSMM_VLA_ACCESS(4, dinput, img, fm, hi, wi, fm, ifh, ifw); const float input_val = LIBXSMM_VLA_ACCESS(4, input, img, fm, hi, wi, fm, ifh, ifw); const float del_output_val = LIBXSMM_VLA_ACCESS(4, doutput, img, fm, ho, wo, fm, ofh, ofw); *del_input_ptr = gamma_ptr[fm] * rcpstddev_ptr[fm] * recp_nhw * (nhw * del_output_val - (del_beta_ptr[fm] + (input_val - expectval_ptr[fm]) * del_gamma_ptr[fm] * rcpstddev_ptr[fm])); } } } } } LIBXSMM_INLINE void naive_fusedgroupnorm_fp(naive_fusedgroupnorm_t* param, const float* input_ptr, float* output_ptr, const float* input_add_ptr, const float* beta_ptr, const float* gamma_ptr, float* expectval_ptr, float* rcpstddev_ptr, float* variance_ptr) { const int nImg = param->N; const int nFm = param->C; const int ifh = param->H; const int ifw = param->W; const int sh = param->stride_h; const int sw = param->stride_w; const int ofh = ifh/sh; const int ofw = ifw/sw; const int nG = param->G; const int nFMG = nFm/nG; const float ghw = (float)(nFMG * ifh * ifw); const float recp_ghw = 1.0f/ghw; const float sqrt_eps = 1e-7f; int img, g, fmg, hi, wi, ho, wo; LIBXSMM_VLA_DECL(5, const float, input, input_ptr, nG, nFMG, ifh, ifw); LIBXSMM_VLA_DECL(5, const float, input_add, input_add_ptr, nG, nFMG, ifh, ifw); LIBXSMM_VLA_DECL(5, float, output, output_ptr, nG, nFMG, ofh, ofw); #if defined(_OPENMP) LIBXSMM_OMP_VAR(img); LIBXSMM_OMP_VAR(g); LIBXSMM_OMP_VAR(fmg); 
LIBXSMM_OMP_VAR(hi); LIBXSMM_OMP_VAR(wi);
# pragma omp parallel for private(img, g, fmg, hi, wi)
#endif
/* pass 1: per-(image, group) statistics; variance via E[x^2] - E[x]^2 */
for ( img = 0; img < nImg; img++ ) { for (g = 0; g < nG; g++) {
float ch_sum = 0.0f; float ch_sumsq = 0.0f;
float tbmean = 0.0f; float tbmeansq = 0.0f; float tsqbmean = 0.0f; float tbrstd = 0.0f; float tvariance = 0.0f;
for ( fmg = 0; fmg < nFMG; fmg++) { for ( hi = 0; hi < ifh; hi++ ) { for ( wi = 0; wi < ifw; wi++ ) {
const float input_val = LIBXSMM_VLA_ACCESS(5, input, img, g, fmg, hi, wi, nG, nFMG, ifh, ifw);
ch_sum += input_val; ch_sumsq += (input_val * input_val);
} } }
tbmean = recp_ghw * ch_sum; tbmeansq = tbmean * tbmean; tsqbmean = recp_ghw * ch_sumsq; tvariance = tsqbmean - tbmeansq; tbrstd = (float)(1.0/sqrt(tvariance + sqrt_eps));
expectval_ptr[img*nG+g] = tbmean; rcpstddev_ptr[img*nG+g] = tbrstd; variance_ptr[img*nG+g] = tvariance;
} }
#if defined(_OPENMP)
LIBXSMM_OMP_VAR(ho); LIBXSMM_OMP_VAR(wo);
# pragma omp parallel for private(img, g, fmg, hi, wi, ho, wo)
#endif
/* pass 2: normalize, scale/shift, then apply the selected fusions */
for ( img = 0; img < nImg; img++ ) { for ( g = 0; g < nG; g++ ) { for ( fmg = 0; fmg < nFMG; fmg++ ) {
for ( hi = 0, ho = 0; hi < ifh; hi += sh, ho++ ) { for ( wi = 0, wo = 0; wi < ifw; wi += sw, wo++ ) {
const float input_val = LIBXSMM_VLA_ACCESS(5, input, img, g, fmg, hi, wi, nG, nFMG, ifh, ifw);
const float input_add_val = LIBXSMM_VLA_ACCESS(5, input_add, img, g, fmg, hi, wi, nG, nFMG, ifh, ifw);
float* output_ptr2 = &LIBXSMM_VLA_ACCESS(5, output, img, g, fmg, ho, wo, nG, nFMG, ofh, ofw);
/* BN + scale (gamma, beta) */
float o = gamma_ptr[g*nFMG+fmg]*(input_val - expectval_ptr[img*nG+g])*rcpstddev_ptr[img*nG+g] + beta_ptr[g*nFMG+fmg];
/* Eltwise */
if ( (param->fuse_type == 2) || (param->fuse_type == 3) || (param->fuse_type == 5) ) { o += input_add_val; }
/* ReLU */
if ( (param->fuse_type == 1) || (param->fuse_type == 3) || (param->fuse_type == 4) || (param->fuse_type == 5) ) { o = ( o < 0.0f ) ? 0.0f : o; }
*output_ptr2 = o;
} } } } }
}

/* Naive backward pass of fused group-normalization (+ optional eltwise add / ReLU).
 * The first loop reduces per-channel del_gamma/del_beta (applying the fused
 * ReLU/eltwise backward to doutput in place, addressing the tensors channel-wise
 * via the *_gb views); the second loop reduces d1/d2 per (image, group) and then
 * computes dinput. */
LIBXSMM_INLINE void naive_fusedgroupnorm_bp(naive_fusedgroupnorm_t* param, const float* input_ptr, float* dinput_ptr, const float* output_ptr, float* doutput_ptr, float* dinput_add_ptr, const float* beta_ptr, float* del_beta_ptr, const float* gamma_ptr, float* del_gamma_ptr, const float* expectval_ptr, const float* rcpstddev_ptr, const float* variance_ptr) {
const int nImg = param->N; const int nFm = param->C;
const int ifh = param->H; const int ifw = param->W;
const int sh = param->stride_h; const int sw = param->stride_w;
const int ofh = ifh/sh; const int ofw = ifw/sw;
const int nG = param->G; const int nFMG = nFm/nG;
const float ghw = (float)(nFMG * ifh * ifw);
const float recp_ghw = 1.0f/ghw;
const float eps = 1e-7f;
int img, g, fmg, fm, hi, wi, ho, wo;
LIBXSMM_VLA_DECL(5, const float, input, input_ptr, nG, nFMG, ifh, ifw);
LIBXSMM_VLA_DECL(5, float, dinput, dinput_ptr, nG, nFMG, ifh, ifw);
/*LIBXSMM_VLA_DECL(5, const float, output, output_ptr, nG, nFMG, ofh, ofw);*/
LIBXSMM_VLA_DECL(5, float, doutput, doutput_ptr, nG, nFMG, ofh, ofw);
LIBXSMM_VLA_DECL(4, const float, input_gb, input_ptr, nFm, ifh, ifw);
LIBXSMM_VLA_DECL(4, const float, output_gb, output_ptr, nFm, ofh, ofw);
LIBXSMM_VLA_DECL(4, float, doutput_gb, doutput_ptr, nFm, ofh, ofw);
LIBXSMM_VLA_DECL(4, float, dinput_add, dinput_add_ptr, nFm, ifh, ifw);
LIBXSMM_UNUSED(beta_ptr);
#if defined(_OPENMP)
LIBXSMM_OMP_VAR(hi); LIBXSMM_OMP_VAR(wi); LIBXSMM_OMP_VAR(ho); LIBXSMM_OMP_VAR(wo); LIBXSMM_OMP_VAR(g);
# pragma omp parallel for private(img, fm, hi, wi, ho, wo, g)
#endif
for ( fm = 0; fm < nFm; fm++ ) {
del_gamma_ptr[fm] = 0.0f; del_beta_ptr[fm] = 0.0f;
for ( img = 0; img < nImg; img++ ) { for ( hi = 0, ho = 0; hi < ifh; hi += sh, ho++ ) { for ( wi = 0, wo = 0; wi < ifw; wi += sw, wo++ ) {
float* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(4, dinput_add, img, fm, hi, wi, nFm, ifh, ifw);
const float output_val = LIBXSMM_VLA_ACCESS(4, output_gb, img, fm, ho, wo, nFm, ofh, ofw);
const float input_val = LIBXSMM_VLA_ACCESS(4, input_gb, img, fm, hi, wi, nFm, ifh, ifw);
float* del_output_ptr = &LIBXSMM_VLA_ACCESS(4, doutput_gb, img, fm, ho, wo, nFm, ofh, ofw);
/* ReLU */
if ( (param->fuse_type == 1) || (param->fuse_type == 3) || (param->fuse_type == 4) || (param->fuse_type == 5) ) { *del_output_ptr = (output_val == 0) ? 0 : *del_output_ptr; }
/* elementwise */
if ( (param->fuse_type == 2) || (param->fuse_type == 3) || (param->fuse_type == 5) ) { *del_input_add_ptr = *del_output_ptr; }
g = fm/nFMG;
del_gamma_ptr[fm] += (input_val - expectval_ptr[img*nG+g]) * (*del_output_ptr) * rcpstddev_ptr[img*nG+g];
del_beta_ptr[fm] += *del_output_ptr;
} } } }
#if defined(_OPENMP)
LIBXSMM_OMP_VAR(fmg);
# pragma omp parallel for private(img, g, fmg, hi, wi, ho, wo)
#endif
for ( img = 0; img < nImg; img++ ) { for ( g = 0; g < nG; g++ ) {
float d1_val = 0.0; float d2_val = 0.0;
/* reduce the two per-group terms used by the dinput formula below */
for ( fmg = 0; fmg < nFMG; fmg++ ) { for ( hi = 0, ho = 0; hi < ifh; hi += sh, ho++ ) { for ( wi = 0, wo = 0; wi < ifw; wi += sw, wo++) {
const float input_val = LIBXSMM_VLA_ACCESS(5, input, img, g, fmg, hi, wi, nG, nFMG, ifh, ifw);
const float del_output_val = LIBXSMM_VLA_ACCESS(5, doutput, img, g, fmg, ho, wo, nG, nFMG, ofh, ofw);
d1_val += del_output_val * (input_val - expectval_ptr[img*nG+g]) * gamma_ptr[g*nFMG+fmg];
d2_val += del_output_val * gamma_ptr[g*nFMG+fmg];
} } }
for ( fmg = 0; fmg < nFMG; fmg++ ) { for ( hi = 0, ho = 0; hi < ifh; hi += sh, ho++ ) { for ( wi = 0, wo = 0; wi < ifw; wi += sw, wo++) {
const float input_val = LIBXSMM_VLA_ACCESS(5, input, img, g, fmg, hi, wi, nG, nFMG, ifh, ifw);
const float del_output_val = LIBXSMM_VLA_ACCESS(5, doutput, img, g, fmg, ho, wo, nG, nFMG, ofh, ofw);
float* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, g, fmg, hi, wi, nG, nFMG, ifh, ifw);
float t0_val = rcpstddev_ptr[img*nG+g] * recp_ghw;
*del_input_ptr = t0_val * ((gamma_ptr[g*nFMG+fmg] * ghw * del_output_val) - d2_val - ((input_val - expectval_ptr[img*nG+g]) * d1_val * (1.0f/(variance_ptr[img*nG+g]+eps))));
} } } } }
}

/* Initializes time step j of the N x 4K LSTM pre-activation buffer icfogoldt with the
 * four gate biases (i, c, f, o laid out side by side); the forget-gate bias gets
 * forget_bias added. */
LIBXSMM_INLINE void lstm_fwd_copy_bias(int N, int K, float *bigold, float *bcgold, float *bfgold, float *bogold, float forget_bias, float *icfogoldt, int j) {
LIBXSMM_VLA_DECL(3, float, icfogold, icfogoldt, N, 4 * K);
int i, l;
#if defined(_OPENMP)
LIBXSMM_OMP_VAR(i); LIBXSMM_OMP_VAR(l);
# pragma omp parallel for private(i, l) LIBXSMM_OPENMP_COLLAPSE(2)
#endif
for (i = 0; i < N; i++) { for (l = 0; l < K; l++) {
LIBXSMM_VLA_ACCESS(3, icfogold, j, i, l, N, 4 * K) = bigold[l];
LIBXSMM_VLA_ACCESS(3, icfogold, j, i, l+K, N, 4 * K) = bcgold[l];
LIBXSMM_VLA_ACCESS(3, icfogold, j, i, l+2*K, N, 4 * K) = bfgold[l] + forget_bias;
LIBXSMM_VLA_ACCESS(3, icfogold, j, i, l+3*K, N, 4 * K) = bogold[l];
} } }

/* Fused element-wise part of the LSTM forward step: in-place gate activations
 * (i, f, o -> sigmoid; c -> tanh), then cs = f*csp + i*c, co = tanh(cs), h = o*co.
 * i/c/f/o are N x 4K interleaved gate buffers; csp/cs/co/h are N x K.
 * AVX-512 path handles the 16-wide multiple of K; a scalar loop covers the remainder. */
LIBXSMM_INLINE void lstm_fwd_eltwise_merged(int N, int K, float *i, float *c, float *f, float *o, float *csp, float *cs, float *co, float *h) {
int j;
#if defined(__AVX512F__)
int l;
int rem = (K/16)*16;
__m512 minus1 = _mm512_set1_ps (-1.0f);
__m512 plus1 = _mm512_set1_ps (1.0f);
#if defined(_OPENMP)
# pragma omp parallel for private(j, l) LIBXSMM_OPENMP_COLLAPSE(2)
#endif
for (j = 0; j < N; j++) { for (l = 0; l < rem; l+=16) {
__m512 iv = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(i[j*4*K + l]));
__m512 cv = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(c[j*4*K + l]));
__m512 fv = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(f[j*4*K + l]));
__m512 ov = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(o[j*4*K + l]));
__m512 cspv = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(csp[j*K + l]));
__m512 csv, cov, hv;
/* i = sigmoid(i) */
iv = _mm512_mul_ps (iv, minus1); iv = LIBXSMM_INTRINSICS_MM512_EXP_PS (iv); iv = _mm512_add_ps (iv, plus1); iv = _mm512_div_ps (plus1, iv);
/* c = tanh(c) */
cv = LIBXSMM_INTRINSICS_MM512_TANH_PS (cv);
/* f = sigmoid(f) */
fv = _mm512_mul_ps (fv, minus1); fv = LIBXSMM_INTRINSICS_MM512_EXP_PS (fv); fv = _mm512_add_ps (fv, plus1); fv = _mm512_div_ps (plus1, fv);
/* o = sigmoid(o) */
ov = _mm512_mul_ps (ov, minus1); ov =
LIBXSMM_INTRINSICS_MM512_EXP_PS (ov); ov = _mm512_add_ps (ov, plus1); ov = _mm512_div_ps (plus1, ov);
/* cs = f.csp + i.c */
csv = _mm512_mul_ps (fv, cspv); csv = _mm512_fmadd_ps (iv, cv, csv);
/* co = tanh(cs) */
cov = LIBXSMM_INTRINSICS_MM512_TANH_PS (csv);
/* h = o.co */
hv = _mm512_mul_ps (ov, cov);
/* write back activated gates and the new cell/hidden state */
_mm512_storeu_ps (&(i[j*4*K + l]), iv);
_mm512_storeu_ps (&(c[j*4*K + l]), cv);
_mm512_storeu_ps (&(f[j*4*K + l]), fv);
_mm512_storeu_ps (&(o[j*4*K + l]), ov);
_mm512_storeu_ps (&(cs[j*K + l]), csv);
_mm512_storeu_ps (&(co[j*K + l]), cov);
_mm512_storeu_ps (&(h[j*K + l]), hv);
} }
#if defined(_OPENMP)
# pragma omp parallel for private(j, l) LIBXSMM_OPENMP_COLLAPSE(2)
#endif
/* scalar remainder for the last K % 16 elements of each row */
for (j = 0; j < N; j++) { for (l = rem; l < K; l++) {
float exp_value;
/* i = sigmoid(i) */
exp_value = (float)exp((double) -i[j*4*K + l]); i[j*4*K + l] = 1.0f / (1.0f + exp_value);
/* c = tanh(c) */
c[j*4*K + l] = (float)tanh((double)c[j*4*K + l]);
/* f = sigmoid(f) */
exp_value = (float)exp((double) -f[j*4*K + l]); f[j*4*K + l] = 1.0f / (1.0f + exp_value);
/* o = sigmoid(o) */
exp_value = (float)exp((double) -o[j*4*K + l]); o[j*4*K + l] = 1.0f / (1.0f + exp_value);
/* cs = f.csp + i.c */
cs[j*K + l] = f[j*4*K + l]*csp[j*K + l] + i[j*4*K + l]*c[j*4*K + l];
/* co = tanh(cs) */
co[j*K + l] = (float)tanh((double)cs[j*K + l]);
/* h = o.co */
h[j*K + l] = o[j*4*K + l] * co[j*K + l];
} }
#else
#if defined(_OPENMP)
# pragma omp parallel for private(j)
#endif
/* portable scalar path over all N*K elements */
for (j = 0; j < N*K; j++) {
const int row = j / K; const int col = j % K;
float exp_value;
/* i = sigmoid(i) */
exp_value = (float)exp((double) -i[row*4*K + col]); i[row*4*K + col] = 1.0f / (1.0f + exp_value);
/* c = tanh(c) */
c[row*4*K + col] = (float)tanh((double)c[row*4*K + col]);
/* f = sigmoid(f) */
exp_value = (float)exp((double) -f[row*4*K + col]); f[row*4*K + col] = 1.0f / (1.0f + exp_value);
/* o = sigmoid(o) */
exp_value = (float)exp((double) -o[row*4*K + col]); o[row*4*K + col] = 1.0f / (1.0f + exp_value);
/* cs = f.csp + i.c */
cs[j] = f[row*4*K + col]*csp[j] + i[row*4*K + col]*c[row*4*K + col];
/* co = tanh(cs) */
co[j] = (float)tanh((double)cs[j]);
/* h = o.co */
h[j] = o[row*4*K + col] * co[j];
}
#endif
}

/* Fused element-wise part of the LSTM backward/update step. Consumes the activated
 * gates (i, c, f, o), previous cell state csp, co = tanh(cs), the hidden-state
 * gradient dh (plus recurrent contribution dout when non-NULL) and dcs, and produces
 * the gate gradients di/dc/df/dp plus the propagated cell-state gradient dcsp.
 * i/c/f/o and di/dc/df/dp are N x 4K interleaved; the rest are N x K.
 * AVX-512 path handles the 16-wide multiple of K; scalar loops cover the rest. */
LIBXSMM_INLINE void lstm_bwd_upd_eltwise_merged(int N, int K, float *i, float *c, float *f, float *o, float *csp, float *co, float *dh, float *dout, float *di, float *dc, float *df, float *dp, float *dcsp, float *dcs) {
int j;
#if defined(__AVX512F__)
int l;
int rem = (K/16)*16;
__m512 plus1 = _mm512_set1_ps (1.0f);
#if defined(_OPENMP)
# pragma omp parallel for private(j, l) LIBXSMM_OPENMP_COLLAPSE(2)
#endif
for (j = 0; j < N; j++) { for (l = 0; l < rem; l+=16) {
__m512 iv = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(i[j*4*K + l]));
__m512 cv = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(c[j*4*K + l]));
__m512 fv = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(f[j*4*K + l]));
__m512 ov = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(o[j*4*K + l]));
__m512 cspv = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(csp[j*K + l]));
__m512 cov = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(co[j*K + l]));
__m512 dcsv = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(dcs[j*K + l]));
__m512 dhv, doutv, div, dcv, dfv, dov, dcspv, deltav, tv;
/* compute delta */
if (NULL == dout) { deltav = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(dh[j*K + l])); } else { dhv = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(dh[j*K + l])); doutv = LIBXSMM_INTRINSICS_MM512_LOAD_PS (&(dout[j*K + l])); deltav = _mm512_add_ps (dhv, doutv); }
/* compute dcsp */ /* dcsp = delta.o.(1 - (co.co)) + dcs */
tv = _mm512_mul_ps (cov, cov); tv = _mm512_sub_ps (plus1, tv); dcspv = _mm512_mul_ps (deltav, ov); dcspv = _mm512_fmadd_ps (dcspv, tv, dcsv);
/* compute di */ /* di = dcsp.c.i.(1 - i) */
tv = _mm512_sub_ps (plus1, iv); tv = _mm512_mul_ps (iv, tv); div = _mm512_mul_ps (dcspv, cv); div = _mm512_mul_ps (div, tv);
/* compute dc */ /* dc = dcsp.i.(1 - (c.c)) */
tv = _mm512_mul_ps (cv, cv); tv = _mm512_sub_ps (plus1, tv); dcv = _mm512_mul_ps (dcspv, iv); dcv = _mm512_mul_ps (dcv, tv);
/* compute df */ /* df = dcsp.csp.f.(1 - f) */
tv = _mm512_sub_ps (plus1, fv); tv = _mm512_mul_ps (fv, tv); dfv = _mm512_mul_ps (dcspv, cspv); dfv = _mm512_mul_ps (dfv, tv);
/* compute do */ /* do = delta.co.o.(1 - o) */
tv = _mm512_sub_ps (plus1, ov); tv = _mm512_mul_ps (ov, tv); dov = _mm512_mul_ps (deltav, cov); dov = _mm512_mul_ps (dov, tv);
/* update dcsp */ /* dcsp = dcsp.f */
dcspv = _mm512_mul_ps (dcspv, fv);
_mm512_storeu_ps (&(di[j*4*K + l]), div);
_mm512_storeu_ps (&(dc[j*4*K + l]), dcv);
_mm512_storeu_ps (&(df[j*4*K + l]), dfv);
_mm512_storeu_ps (&(dp[j*4*K + l]), dov);
_mm512_storeu_ps (&(dcsp[j*K + l]), dcspv);
} }
#if defined(_OPENMP)
# pragma omp parallel for private(j, l) LIBXSMM_OPENMP_COLLAPSE(2)
#endif
/* scalar remainder for the last K % 16 elements of each row */
for (j = 0; j < N; j++) { for (l = rem; l < K; l++) {
float delta;
/* compute delta */
if (NULL == dout) { delta = dh[j*K + l]; } else { delta = dh[j*K + l] + dout[j*K + l]; }
/* compute dcsp */
dcsp[j*K + l] = delta * o[j*4*K + l] * (1.0f - (co[j*K + l]*co[j*K + l])) + dcs[j*K + l];
/* compute di */
di[j*4*K + l] = dcsp[j*K + l] * c[j*4*K + l] * i[j*4*K + l] * (1.0f - i[j*4*K + l]);
/* compute dc */
dc[j*4*K + l] = dcsp[j*K + l] * i[j*4*K + l] * (1.0f - (c[j*4*K + l]*c[j*4*K + l]));
/* compute df */
df[j*4*K + l] = dcsp[j*K + l] * csp[j*K + l] * f[j*4*K + l] * (1.0f - f[j*4*K + l]);
/* compute do */
dp[j*4*K + l] = delta * co[j*K + l] * o[j*4*K + l] * (1.0f - o[j*4*K + l]);
/* update dcsp */
dcsp[j*K + l] = dcsp[j*K + l] * f[j*4*K + l];
} }
#else
#if defined(_OPENMP)
# pragma omp parallel for private(j)
#endif
/* portable scalar path over all N*K elements */
for (j = 0; j < N*K; j++) {
const int row = j / K; const int col = j % K;
float delta;
/* compute delta */
if (NULL == dout) { delta = dh[j]; } else { delta = dh[j] + dout[j]; }
/* compute dcsp */
dcsp[j] = delta * o[row*4*K + col] * (1.0f - (co[j]*co[j])) + dcs[j];
/* compute di */
di[row*4*K + col] = dcsp[j] * c[row*4*K + col] * i[row*4*K + col] * (1.0f - i[row*4*K + col]);
/* compute dc */
dc[row*4*K + col] = dcsp[j] * i[row*4*K + col] * (1.0f - (c[row*4*K + col]*c[row*4*K + col]));
/* compute df */
df[row*4*K + col] = dcsp[j] * csp[j] * f[row*4*K + col] * (1.0f - f[row*4*K + col]);
/* compute do */
dp[row*4*K + col] = delta * co[j] * o[row*4*K + col] * (1.0f - o[row*4*K + col]);
/* update dcsp */
dcsp[j] = dcsp[j] * f[row*4*K + col];
}
#endif
}

/* Reference LSTM forward over t time steps: packs the four per-gate weight/recurrent
 * matrices into c4k layout (one fused GEMM buffer unless TWO_GEMMS is defined), then
 * per step initializes the biases, runs the GEMM(s) via LIBXSMM_XBLAS_SYMBOL and
 * applies lstm_fwd_eltwise_merged. At step 0 the previous hidden/cell state comes
 * from hpgold/cspgold, afterwards from the previous time step's outputs. */
LIBXSMM_INLINE void lstm_ref_fwd( int N, int C, int K, int t, float forget_bias, float *wigold, float *wcgold, float *wfgold, float *wogold, float *rigold, float *rcgold, float *rfgold, float *rogold, float *bigold, float *bcgold, float *bfgold, float *bogold, float *xgoldt, float *cspgold, float *hpgold, float *csgoldt, float *cogoldt, float *hgoldt, float *icfogoldt, float *wgold, float *rgold, float *scratch ) {
#if !defined(TWO_GEMMS)
float *xhgold = scratch;
#endif
const char transa = 'N', transb = 'N'; /* no transposes */
const float alpha = 1, beta = 1;
int j;
int K4 = K * 4;
int CK = C + K;
LIBXSMM_VLA_DECL(2, float, xgold, xgoldt, N * C);
LIBXSMM_VLA_DECL(2, float, csgold, csgoldt, K * N);
LIBXSMM_VLA_DECL(2, float, cogold, cogoldt, K * N);
LIBXSMM_VLA_DECL(2, float, hgold, hgoldt, K * N);
LIBXSMM_VLA_DECL(3, float, icfogold, icfogoldt, N, 4 * K);
#if defined(PROFILE)
Gbl_conv_start = libxsmm_timer_tick();
#endif
#if defined(TWO_GEMMS)
convert_ck_c4k(C, K, wigold, wgold);
convert_ck_c4k(C, K, wcgold, &(wgold[K]));
convert_ck_c4k(C, K, wfgold, &(wgold[2*K]));
convert_ck_c4k(C, K, wogold, &(wgold[3*K]));
convert_ck_c4k(K, K, rigold, rgold);
convert_ck_c4k(K, K, rcgold, &(rgold[K]));
convert_ck_c4k(K, K, rfgold, &(rgold[2*K]));
convert_ck_c4k(K, K, rogold, &(rgold[3*K]));
#else
LIBXSMM_UNUSED(rgold);
convert_ck_c4k(C, K, wigold, wgold);
convert_ck_c4k(C, K, wcgold, &(wgold[K]));
convert_ck_c4k(C, K, wfgold, &(wgold[2*K]));
convert_ck_c4k(C, K, wogold, &(wgold[3*K]));
convert_ck_c4k(K, K, rigold, &(wgold[C*K*4]));
convert_ck_c4k(K, K, rcgold, &(wgold[C*K*4 + K]));
convert_ck_c4k(K, K, rfgold, &(wgold[C*K*4 + 2*K]));
convert_ck_c4k(K, K, rogold, &(wgold[C*K*4 + 3*K]));
#endif #if defined(PROFILE) Gbl_conv_end = libxsmm_timer_tick(); Gbl_conv_total += libxsmm_timer_duration(Gbl_conv_start, Gbl_conv_end); #endif for (j = 0; j < t; ++j) { /* Initialization with bias */ #if defined(PROFILE) Gbl_copy_bias_start = libxsmm_timer_tick(); #endif lstm_fwd_copy_bias(N, K, bigold, bcgold, bfgold, bogold, forget_bias, icfogoldt, j); #if defined(PROFILE) Gbl_copy_bias_end = libxsmm_timer_tick(); Gbl_copy_bias_total += libxsmm_timer_duration(Gbl_copy_bias_start, Gbl_copy_bias_end); Gbl_blas_start = libxsmm_timer_tick(); #endif #if defined(TWO_GEMMS) /* icfo += W * x */ LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K4, &N, &C, &alpha, wgold, &K4, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), &C, &beta, &LIBXSMM_VLA_ACCESS(3, icfogold, j, 0, 0, N, 4 * K), &K4); /* icfo += R * h */ if (j == 0) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K4, &N, &K, &alpha, rgold, &K4, hpgold, &K, &beta, &LIBXSMM_VLA_ACCESS(3, icfogold, 0, 0, 0, N, 4 * K), &K4); } else { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K4, &N, &K, &alpha, rgold, &K4, &LIBXSMM_VLA_ACCESS(2, hgold, j-1, 0, K * N), &K, &beta, &LIBXSMM_VLA_ACCESS(3, icfogold, j, 0, 0, N, 4 * K), &K4); } #else /* Concatenate x and h */ convert_nk_nck(N, C, C+K, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), xhgold); if (j == 0) { convert_nk_nck(N, K, C+K, hpgold, &(xhgold[C])); } else { convert_nk_nck(N, K, C+K, &LIBXSMM_VLA_ACCESS(2, hgold, j-1, 0, K * N), &(xhgold[C])); } /* icfo += (W * x) + (R * h) */ LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K4, &N, &CK, &alpha, wgold, &K4, xhgold, &CK, &beta, &LIBXSMM_VLA_ACCESS(3, icfogold, j, 0, 0, N, 4 * K), &K4); #endif #if defined(PROFILE) Gbl_blas_end = libxsmm_timer_tick(); Gbl_blas_total += libxsmm_timer_duration(Gbl_blas_start, Gbl_blas_end); Gbl_eltwise_start = libxsmm_timer_tick(); #endif if (j == 0) { lstm_fwd_eltwise_merged( N, K, &LIBXSMM_VLA_ACCESS(3, icfogold, 0, 0, 0, N, 4 * K), &LIBXSMM_VLA_ACCESS(3, icfogold, 0, 0, K, N, 4 * K), 
&LIBXSMM_VLA_ACCESS(3, icfogold, 0, 0, 2*K, N, 4 * K), &LIBXSMM_VLA_ACCESS(3, icfogold, 0, 0, 3*K, N, 4 * K), cspgold, &LIBXSMM_VLA_ACCESS(2, csgold, 0, 0, K * N), &LIBXSMM_VLA_ACCESS(2, cogold, 0, 0, K * N), &LIBXSMM_VLA_ACCESS(2, hgold, 0, 0, K * N) ); } else { lstm_fwd_eltwise_merged( N, K, &LIBXSMM_VLA_ACCESS(3, icfogold, j, 0, 0, N, 4 * K), &LIBXSMM_VLA_ACCESS(3, icfogold, j, 0, K, N, 4 * K), &LIBXSMM_VLA_ACCESS(3, icfogold, j, 0, 2*K, N, 4 * K), &LIBXSMM_VLA_ACCESS(3, icfogold, j, 0, 3*K, N, 4 * K), &LIBXSMM_VLA_ACCESS(2, csgold, j-1, 0, K * N), &LIBXSMM_VLA_ACCESS(2, csgold, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, cogold, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, hgold, j, 0, K * N) ); } #if defined(PROFILE) Gbl_eltwise_end = libxsmm_timer_tick(); Gbl_eltwise_total += libxsmm_timer_duration(Gbl_eltwise_start, Gbl_eltwise_end); #endif } } LIBXSMM_INLINE void lstm_ref_bwd_upd( int N, int C, int K, int t, float *xgoldt, float *cspgold, float *hpgold, float *csgoldt, float *cogoldt, float *hgoldt, float *icfogoldt, float *wgold, float *rgold, float *dcsgold, float *dhgoldt, float *dwgold, float *drgold, float *dbgold, float *dxgoldt, float *dcspgold, float *dhpgold, float *scratch ) { #if !defined(TWO_GEMMS) float *xhgold = &(scratch[K*N*t*5]); float *dxhgold = &(scratch[K*N*t*5 + (C+K)*N]); #endif float *dicfogoldt = scratch; float *doutgoldt = &(scratch[K*N*t*4]); float *dout, *dcs, *csp; const char transa = 'N', transb = 'N'; /* no transposes */ const char transaT = 'T', transbT = 'T'; /* transposes */ const float alpha = 1, beta = 1, beta0 = 0; int j, l, p; int K4 = K * 4; int CK = C + K; LIBXSMM_VLA_DECL(2, float, xgold, xgoldt, N * C); LIBXSMM_VLA_DECL(2, float, csgold, csgoldt, K * N); LIBXSMM_VLA_DECL(2, float, cogold, cogoldt, K * N); LIBXSMM_VLA_DECL(2, float, hgold, hgoldt, K * N); LIBXSMM_VLA_DECL(3, float, icfogold, icfogoldt, N, 4 * K); LIBXSMM_VLA_DECL(2, float, dxgold, dxgoldt, N * C); LIBXSMM_VLA_DECL(2, float, dhgold, dhgoldt, K * N); 
LIBXSMM_VLA_DECL(3, float, dicfogold, dicfogoldt, N, 4 * K); LIBXSMM_VLA_DECL(2, float, doutgold, doutgoldt, K * N); for (j = t-1; j >= 0; --j) { #if defined(PROFILE) Gbl_eltwise_start = libxsmm_timer_tick(); #endif if (t-1 == j) { dout = NULL; dcs = dcsgold; } else { dout = &LIBXSMM_VLA_ACCESS(2, doutgold, j, 0, K * N); dcs = dcspgold; } if (0 == j) { csp = cspgold; } else { csp = &LIBXSMM_VLA_ACCESS(2, csgold, j-1, 0, K * N); } lstm_bwd_upd_eltwise_merged( N, K, &LIBXSMM_VLA_ACCESS(3, icfogold, j, 0, 0, N, 4 * K), &LIBXSMM_VLA_ACCESS(3, icfogold, j, 0, K, N, 4 * K), &LIBXSMM_VLA_ACCESS(3, icfogold, j, 0, 2*K, N, 4 * K), &LIBXSMM_VLA_ACCESS(3, icfogold, j, 0, 3*K, N, 4 * K), csp, &LIBXSMM_VLA_ACCESS(2, cogold, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, dhgold, j, 0, K * N), dout, &LIBXSMM_VLA_ACCESS(3, dicfogold, j, 0, 0, N, 4 * K), &LIBXSMM_VLA_ACCESS(3, dicfogold, j, 0, K, N, 4 * K), &LIBXSMM_VLA_ACCESS(3, dicfogold, j, 0, 2*K, N, 4 * K), &LIBXSMM_VLA_ACCESS(3, dicfogold, j, 0, 3*K, N, 4 * K), dcspgold, dcs); #if defined(PROFILE) Gbl_eltwise_end = libxsmm_timer_tick(); Gbl_eltwise_total += libxsmm_timer_duration(Gbl_eltwise_start, Gbl_eltwise_end); Gbl_blas_start = libxsmm_timer_tick(); #endif #if defined(TWO_GEMMS) if (j > 0) { /* compute dout */ LIBXSMM_XBLAS_SYMBOL(float)(&transaT, &transb, &K, &N, &K4, &alpha, rgold, &K4, &LIBXSMM_VLA_ACCESS(3, dicfogold, j, 0, 0, N, 4 * K), &K4, &beta0, &LIBXSMM_VLA_ACCESS(2, doutgold, j-1, 0, K * N), &K); } else { /* compute dhp */ LIBXSMM_XBLAS_SYMBOL(float)(&transaT, &transb, &K, &N, &K4, &alpha, rgold, &K4, &LIBXSMM_VLA_ACCESS(3, dicfogold, 0, 0, 0, N, 4 * K), &K4, &beta0, dhpgold, &K); } /* compute dx */ LIBXSMM_XBLAS_SYMBOL(float)(&transaT, &transb, &C, &N, &K4, &alpha, wgold, &K4, &LIBXSMM_VLA_ACCESS(3, dicfogold, j, 0, 0, N, 4 * K), &K4, &beta, &LIBXSMM_VLA_ACCESS(2, dxgold, j, 0, N * C), &C); /* compute dw */ LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transbT, &K4, &C, &N, &alpha, &LIBXSMM_VLA_ACCESS(3, dicfogold, j, 0, 0, N, 
4 * K), &K4, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), &C, &beta, dwgold, &K4); /* compute dr */ if (j == 0) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transbT, &K4, &K, &N, &alpha, &LIBXSMM_VLA_ACCESS(3, dicfogold, j, 0, 0, N, 4 * K), &K4, hpgold, &K, &beta, drgold, &K4); } else { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transbT, &K4, &K, &N, &alpha, &LIBXSMM_VLA_ACCESS(3, dicfogold, j, 0, 0, N, 4 * K), &K4, &LIBXSMM_VLA_ACCESS(2, hgold, j-1, 0, K * N), &K, &beta, drgold, &K4); } #else LIBXSMM_UNUSED(rgold); LIBXSMM_UNUSED(drgold); LIBXSMM_XBLAS_SYMBOL(float)(&transaT, &transb, &CK, &N, &K4, &alpha, wgold, &K4, &LIBXSMM_VLA_ACCESS(3, dicfogold, j, 0, 0, N, 4 * K), &K4, &beta0, dxhgold, &CK); matrix_copy_ld(C, N, C+K, dxhgold, &LIBXSMM_VLA_ACCESS(2, dxgold, j, 0, N * C)); if (j > 0) { matrix_copy_ld(K, N, C+K, &(dxhgold[C]), &LIBXSMM_VLA_ACCESS(2, doutgold, j-1, 0, K * N)); } else { matrix_copy_ld(K, N, C+K, &(dxhgold[C]), dhpgold); } /* Concatenate x and h */ convert_nk_nck(N, C, C+K, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), xhgold); if (j == 0) { convert_nk_nck(N, K, C+K, hpgold, &(xhgold[C])); } else { convert_nk_nck(N, K, C+K, &LIBXSMM_VLA_ACCESS(2, hgold, j-1, 0, K * N), &(xhgold[C])); } LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transbT, &K4, &CK, &N, &alpha, &LIBXSMM_VLA_ACCESS(3, dicfogold, j, 0, 0, N, 4 * K), &K4, xhgold, &CK, &beta, dwgold, &K4); #endif #if defined(PROFILE) Gbl_blas_end = libxsmm_timer_tick(); Gbl_blas_total += libxsmm_timer_duration(Gbl_blas_start, Gbl_blas_end); #endif /* compute db */ #if defined(_OPENMP) LIBXSMM_OMP_VAR(p); # pragma omp parallel for private(l, p) #endif for (l = 0; l < K; l++) { for (p = 0; p < N; p++) { dbgold[l] += LIBXSMM_VLA_ACCESS(3, dicfogold, j, p, l, N, 4 * K); dbgold[l + K] += LIBXSMM_VLA_ACCESS(3, dicfogold, j, p, l + K, N, 4 * K); dbgold[l + 2*K] += LIBXSMM_VLA_ACCESS(3, dicfogold, j, p, l + 2*K, N, 4 * K); dbgold[l + 3*K] += LIBXSMM_VLA_ACCESS(3, dicfogold, j, p, l + 3*K, N, 4 * K); } } } } LIBXSMM_INLINE void 
gru_ref_fwd( int N, int C, int K, int t, float *wi, float *wc, float *wf, float *ri, float *rc, float *rf, float *bi, float *bc, float *bf, float *xt, float *hp, float *ht, float *it, float *ct, float *ft, float *ot ) { const char transa = 'N', transb = 'N'; /* no transposes */ const float alpha = 1, beta = 1; int j; LIBXSMM_VLA_DECL(2, float, x, xt, N * C); LIBXSMM_VLA_DECL(2, float, h, ht, K * N); LIBXSMM_VLA_DECL(2, float, i, it, K * N); LIBXSMM_VLA_DECL(2, float, c, ct, K * N); LIBXSMM_VLA_DECL(2, float, f, ft, K * N); LIBXSMM_VLA_DECL(2, float, o, ot, K * N); for (j = 0; j < t; ++j) { /* i_t = b_i */ matrix_copy_bias(K, N, K, bi, &LIBXSMM_VLA_ACCESS(2, i, j, 0, K * N)); /* i_t += W_i * x_t */ LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &C, &alpha, wi, &K, &LIBXSMM_VLA_ACCESS(2, x, j, 0, N * C), &C, &beta, &LIBXSMM_VLA_ACCESS(2, i, j, 0, K * N), &K); /* i_t += R_i * h_{t-1} */ if (0 == j) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, ri, &K, hp, &K, &beta, &LIBXSMM_VLA_ACCESS(2, i, j, 0, K * N), &K); } else { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, ri, &K, &LIBXSMM_VLA_ACCESS(2, h, j-1, 0, K * N), &K, &beta, &LIBXSMM_VLA_ACCESS(2, i, j, 0, K * N), &K); } /* i_t = sigmoid(i_t) */ matrix_sigmoid(N*K, &LIBXSMM_VLA_ACCESS(2, i, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, i, j, 0, K * N)); /* c_t = b_c */ matrix_copy_bias(K, N, K, bc, &LIBXSMM_VLA_ACCESS(2, c, j, 0, K * N)); /* c_t += W_c * x_t */ LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &C, &alpha, wc, &K, &LIBXSMM_VLA_ACCESS(2, x, j, 0, N * C), &C, &beta, &LIBXSMM_VLA_ACCESS(2, c, j, 0, K * N), &K); /* c_t += R_c * h_{t-1} */ if (0 == j) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rc, &K, hp, &K, &beta, &LIBXSMM_VLA_ACCESS(2, c, j, 0, K * N), &K); } else { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rc, &K, &LIBXSMM_VLA_ACCESS(2, h, j-1, 0, K * N), &K, &beta, &LIBXSMM_VLA_ACCESS(2, c, j, 0, K * N), &K); } 
/* c_t = sigmoid(c_t) */ matrix_sigmoid(N*K, &LIBXSMM_VLA_ACCESS(2, c, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, c, j, 0, K * N)); /* o_t = h_{t-1} . i_t */ if (0 == j) { matrix_eltwise_mult(N*K, hp, &LIBXSMM_VLA_ACCESS(2, i, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, o, j, 0, K * N)); } else { matrix_eltwise_mult(N*K, &LIBXSMM_VLA_ACCESS(2, h, j-1, 0, K * N), &LIBXSMM_VLA_ACCESS(2, i, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, o, j, 0, K * N)); } /* f_t = b_f */ matrix_copy_bias(K, N, K, bf, &LIBXSMM_VLA_ACCESS(2, f, j, 0, K * N)); /* f_t += W_f * x_t */ LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &C, &alpha, wf, &K, &LIBXSMM_VLA_ACCESS(2, x, j, 0, N * C), &C, &beta, &LIBXSMM_VLA_ACCESS(2, f, j, 0, K * N), &K); /* f_t += R_f * o_t */ LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rf, &K, &LIBXSMM_VLA_ACCESS(2, o, j, 0, K * N), &K, &beta, &LIBXSMM_VLA_ACCESS(2, f, j, 0, K * N), &K); /* f_t = tanh(f_t) */ matrix_tanh(N*K, &LIBXSMM_VLA_ACCESS(2, f, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, f, j, 0, K * N)); /* h_t = (1 - c_t) . f_t */ matrix_complement (N*K, &LIBXSMM_VLA_ACCESS(2, c, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, h, j, 0, K * N)); matrix_eltwise_mult(N*K, &LIBXSMM_VLA_ACCESS(2, h, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, f, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, h, j, 0, K * N)); /* h_t += c_t . 
h_{t-1} */ if (0 == j) { matrix_eltwise_fma(N*K, &LIBXSMM_VLA_ACCESS(2, c, j, 0, K * N), hp, &LIBXSMM_VLA_ACCESS(2, h, j, 0, K * N)); } else { matrix_eltwise_fma(N*K, &LIBXSMM_VLA_ACCESS(2, c, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, h, j-1, 0, K * N), &LIBXSMM_VLA_ACCESS(2, h, j, 0, K * N)); } } } LIBXSMM_INLINE void gru_ref_bwd_upd( int N, int C, int K, int t, float *xt, float *hpD, float *ht, float *it, float *ct, float *ft, float *ot, float *wi, float *wc, float *wf, float *ri, float *rc, float *rf, float *dht, float *dw, float *dr, float *db, float *dxt, float *dhpD, float *scratch ) { const char transa = 'N', transb = 'N'; /* no transposes */ const char transaT = 'T', transbT = 'T'; /* transposes */ const float alpha = 1, beta = 1, beta0 = 0; int j, l, p; float *dwi = dw; float *dwc = &(dw[C*K]); float *dwf = &(dw[2*C*K]); float *dri = dr; float *drc = &(dr[K*K]); float *drf = &(dr[2*K*K]); float *dbi = db; float *dbc = &(db[K]); float *dbf = &(db[2*K]); float *deltaD = scratch; float *doutD = &(scratch[N*K]); float *diD = &(scratch[2*N*K]); float *dcD = &(scratch[3*N*K]); float *dfD = &(scratch[4*N*K]); float *doD = &(scratch[5*N*K]); LIBXSMM_VLA_DECL(3, float, x, xt, N, C); LIBXSMM_VLA_DECL(2, float, hp, hpD, K); LIBXSMM_VLA_DECL(3, float, h, ht, N, K); LIBXSMM_VLA_DECL(3, float, i, it, N, K); LIBXSMM_VLA_DECL(3, float, c, ct, N, K); LIBXSMM_VLA_DECL(3, float, f, ft, N, K); LIBXSMM_VLA_DECL(3, float, o, ot, N, K); LIBXSMM_VLA_DECL(3, float, dx, dxt, N, C); LIBXSMM_VLA_DECL(2, float, dhp, dhpD, K); LIBXSMM_VLA_DECL(3, float, dh, dht, N, K); LIBXSMM_VLA_DECL(2, float, di, diD, K); LIBXSMM_VLA_DECL(2, float, dc, dcD, K); LIBXSMM_VLA_DECL(2, float, df, dfD, K); LIBXSMM_VLA_DECL(2, float, dp, doD, K); LIBXSMM_VLA_DECL(2, float, dout, doutD, K); LIBXSMM_VLA_DECL(2, float, delta, deltaD, K); for (j = t-1; j >= 0; j--) { #if defined(_OPENMP) LIBXSMM_OMP_VAR(p); # pragma omp parallel for private(l, p) LIBXSMM_OPENMP_COLLAPSE(2) #endif for (l = 0; l < N; l++) { for (p = 
0; p < K; p++) { if (t-1 == j) { LIBXSMM_VLA_ACCESS(2, delta, l, p, K) = LIBXSMM_VLA_ACCESS(3, dh, t-1, l, p, N, K); } else { LIBXSMM_VLA_ACCESS(2, delta, l, p, K) = LIBXSMM_VLA_ACCESS(3, dh, j, l, p, N, K) + LIBXSMM_VLA_ACCESS(2, dout, l, p, K); } /* df = delta . (1 - c_t) . (1 - (f_t . f_t)) */ LIBXSMM_VLA_ACCESS(2, df, l, p, K) = LIBXSMM_VLA_ACCESS(2, delta, l, p, K) * (1.0f - LIBXSMM_VLA_ACCESS(3, c, j, l, p, N, K)) * (1.0f - (LIBXSMM_VLA_ACCESS(3, f, j, l, p, N, K) * LIBXSMM_VLA_ACCESS(3, f, j, l, p, N, K))); /* dc = delta . (h_{t-1} - f_t) . c_t . (1 - c_t) */ if (0 == j) { LIBXSMM_VLA_ACCESS(2, dc, l, p, K) = LIBXSMM_VLA_ACCESS(2, delta, l, p, K) * (LIBXSMM_VLA_ACCESS(2, hp, l, p, K) - LIBXSMM_VLA_ACCESS(3, f, j, l, p, N, K)) * LIBXSMM_VLA_ACCESS(3, c, j, l, p, N, K) * (1.0f - LIBXSMM_VLA_ACCESS(3, c, j, l, p, N, K)); } else { LIBXSMM_VLA_ACCESS(2, dc, l, p, K) = LIBXSMM_VLA_ACCESS(2, delta, l, p, K) * (LIBXSMM_VLA_ACCESS(3, h, j-1, l, p, N, K) - LIBXSMM_VLA_ACCESS(3, f, j, l, p, N, K)) * LIBXSMM_VLA_ACCESS(3, c, j, l, p, N, K) * (1.0f - LIBXSMM_VLA_ACCESS(3, c, j, l, p, N, K)); } } } /* do = {R_f}^T * df */ LIBXSMM_XBLAS_SYMBOL(float)(&transaT, &transb, &K, &N, &K, &alpha, rf, &K, dfD, &K, &beta0, doD, &K); /* di = do . h_{t-1} . i_t . 
(1 - i_t) */ if (0 == j) { #if defined(_OPENMP) # pragma omp parallel for private(l, p) LIBXSMM_OPENMP_COLLAPSE(2) #endif for (l = 0; l < N; l++) { for (p = 0; p < K; p++) { LIBXSMM_VLA_ACCESS(2, di, l, p, K) = LIBXSMM_VLA_ACCESS(2, dp, l, p, K) * LIBXSMM_VLA_ACCESS(2, hp, l, p, K) * LIBXSMM_VLA_ACCESS(3, i, 0, l, p, N, K) * (1.0f - LIBXSMM_VLA_ACCESS(3, i, 0, l, p, N, K)); } } } else { #if defined(_OPENMP) # pragma omp parallel for private(l, p) LIBXSMM_OPENMP_COLLAPSE(2) #endif for (l = 0; l < N; l++) { for (p = 0; p < K; p++) { LIBXSMM_VLA_ACCESS(2, di, l, p, K) = LIBXSMM_VLA_ACCESS(2, dp, l, p, K) * LIBXSMM_VLA_ACCESS(3, h, j-1, l, p, N, K) * LIBXSMM_VLA_ACCESS(3, i, j, l, p, N, K) * (1.0f - LIBXSMM_VLA_ACCESS(3, i, j, l, p, N, K)); } } } /* dx_t = {W_i}^T * di */ LIBXSMM_XBLAS_SYMBOL(float)(&transaT, &transb, &C, &N, &K, &alpha, wi, &K, diD, &K, &beta0, &LIBXSMM_VLA_ACCESS(3, dx, j, 0, 0, N, C), &C); /* dx_t += {W_c}^T * dc */ LIBXSMM_XBLAS_SYMBOL(float)(&transaT, &transb, &C, &N, &K, &alpha, wc, &K, dcD, &K, &beta, &LIBXSMM_VLA_ACCESS(3, dx, j, 0, 0, N, C), &C); /* dx_t += {W_f}^T * df */ LIBXSMM_XBLAS_SYMBOL(float)(&transaT, &transb, &C, &N, &K, &alpha, wf, &K, dfD, &K, &beta, &LIBXSMM_VLA_ACCESS(3, dx, j, 0, 0, N, C), &C); /* dh_{t-1} = {R_i}^T * di */ /* dh_{t-1} += {R_c}^T * dc */ if (0 == j) { LIBXSMM_XBLAS_SYMBOL(float)(&transaT, &transb, &K, &N, &K, &alpha, ri, &K, diD, &K, &beta0, dhpD, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transaT, &transb, &K, &N, &K, &alpha, rc, &K, dcD, &K, &beta, dhpD, &K); } else { LIBXSMM_XBLAS_SYMBOL(float)(&transaT, &transb, &K, &N, &K, &alpha, ri, &K, diD, &K, &beta0, doutD, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transaT, &transb, &K, &N, &K, &alpha, rc, &K, dcD, &K, &beta, doutD, &K); } /* dh_{t-1} += do * i_t + delta * c_t */ if (0 == j) { #if defined(_OPENMP) # pragma omp parallel for private(l, p) LIBXSMM_OPENMP_COLLAPSE(2) #endif for (l = 0; l < N; l++) { for (p = 0; p < K; p++) { LIBXSMM_VLA_ACCESS(2, dhp, l, p, K) += 
LIBXSMM_VLA_ACCESS(2, dp, l, p, K) * LIBXSMM_VLA_ACCESS(3, i, j, l, p, N, K) + LIBXSMM_VLA_ACCESS(2, delta, l, p, K) * LIBXSMM_VLA_ACCESS(3, c, j, l, p, N, K); } } } else { #if defined(_OPENMP) # pragma omp parallel for private(l, p) LIBXSMM_OPENMP_COLLAPSE(2) #endif for (l = 0; l < N; l++) { for (p = 0; p < K; p++) { LIBXSMM_VLA_ACCESS(2, dout, l, p, K) += LIBXSMM_VLA_ACCESS(2, dp, l, p, K) * LIBXSMM_VLA_ACCESS(3, i, j, l, p, N, K) + LIBXSMM_VLA_ACCESS(2, delta, l, p, K) * LIBXSMM_VLA_ACCESS(3, c, j, l, p, N, K); } } } /* dw_i += di * {x_t}^T */ /* dw_c += dc * {x_t}^T */ /* dw_f += df * {x_t}^T */ LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transbT, &K, &C, &N, &alpha, diD, &K, &LIBXSMM_VLA_ACCESS(3, x, j, 0, 0, N, C), &C, &beta, dwi, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transbT, &K, &C, &N, &alpha, dcD, &K, &LIBXSMM_VLA_ACCESS(3, x, j, 0, 0, N, C), &C, &beta, dwc, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transbT, &K, &C, &N, &alpha, dfD, &K, &LIBXSMM_VLA_ACCESS(3, x, j, 0, 0, N, C), &C, &beta, dwf, &K); /* dr_i += di * {o_t}^T */ /* dr_c += dc * {o_t}^T */ /* dr_f += df * {h_{t-1}}^T */ LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transbT, &K, &K, &N, &alpha, diD, &K, &LIBXSMM_VLA_ACCESS(3, o, j, 0, 0, N, K), &K, &beta, dri, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transbT, &K, &K, &N, &alpha, dcD, &K, &LIBXSMM_VLA_ACCESS(3, o, j, 0, 0, N, K), &K, &beta, drc, &K); if (0 == j) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transbT, &K, &K, &N, &alpha, dfD, &K, &LIBXSMM_VLA_ACCESS(2, hp, 0, 0, K), &K, &beta, drf, &K); } else { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transbT, &K, &K, &N, &alpha, dfD, &K, &LIBXSMM_VLA_ACCESS(3, h, j-1, 0, 0, N, K), &K, &beta, drf, &K); } /* compute db */ #if defined(_OPENMP) # pragma omp parallel for private(l, p) #endif for (l = 0; l < K; l++) { for (p = 0; p < N; p++) { dbi[l] += LIBXSMM_VLA_ACCESS(2, di, p, l, K); dbc[l] += LIBXSMM_VLA_ACCESS(2, dc, p, l, K); dbf[l] += LIBXSMM_VLA_ACCESS(2, df, p, l, K); } } } } 
libxsmm-1.17/samples/deeplearning/fullyconnecteddriver/000077500000000000000000000000001415223013700234465ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/fullyconnecteddriver/Makefile000066400000000000000000000072511415223013700251130ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 0 OMP = 1 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) 
MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/layer_example_f32 $(OUTDIR)/layer_example_bf16 $(OUTDIR)/layer_example_bf16_f32 .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) $(OUTDIR)/layer_example_f32: $(OUTDIR)/.make $(BLDDIR)/layer_example_f32-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(BLDDIR)/layer_example_f32-c.o $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(OUTDIR)/layer_example_bf16: $(OUTDIR)/.make $(BLDDIR)/layer_example_bf16-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(BLDDIR)/layer_example_bf16-c.o $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(OUTDIR)/layer_example_bf16_f32: $(OUTDIR)/.make $(BLDDIR)/layer_example_bf16_f32-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(BLDDIR)/layer_example_bf16_f32-c.o $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc ./../common/dnn_common.h $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc ./../common/dnn_common.h $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif 
libxsmm-1.17/samples/deeplearning/fullyconnecteddriver/fc_bf16_tuning_job.sh000077500000000000000000000061461415223013700274400ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH --partition=clx #SBATCH --ntasks=1 #SBATCH --cpus-per-task=112 #SBATCH --time=2:00:00 export OMP_NUM_THREADS=28 export KMP_AFFINITY=granularity=fine,compact,1,0 export CHECK=1 ITERS=1000 # Initialize Env vars export FWD_BF=1 export BWD_BF=1 export UPD_BF=1 export FWD_2D_BLOCKING=1 export BWD_2D_BLOCKING=1 export UPD_2D_BLOCKING=1 export FWD_ROW_TEAMS=1 export FWD_COLUMN_TEAMS=1 export BWD_ROW_TEAMS=1 export BWD_COLUMN_TEAMS=1 export UPD_ROW_TEAMS=1 export UPD_COLUMN_TEAMS=1 export IFM_SUBTASKS=1 export OFM_SUBTASKS=1 MB=2160 BFN=24 # Tune layers (1024,1024) and (512,512) for THREADS in 20 24; do for OFM in 512 1024; do IFM=${OFM} for PASS in 'FWD' 'BWD' 'UPD'; do if [ $PASS == "FWD" ] then PASS_ARG='F' fi if [ $PASS == "BWD" ] then PASS_ARG='B' fi if [ $PASS == "UPD" ] then PASS_ARG='U' fi rm -f ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} touch ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} export OMP_NUM_THREADS=${THREADS} export ${PASS}_2D_BLOCKING=0 if [ $PASS == "UPD" ] then for IFMSUBTASKS in 1 2; do export IFM_SUBTASKS=${IFMSUBTASKS} for OFMSUBTASKS in 1 2; do export OFM_SUBTASKS=${OFMSUBTASKS} for BFM in 32 64; do for BFACC in 1 2 3 6 9 10 15 30 45 90; do export ${PASS}_BF=${BFACC} srun -n 1 ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} ${BFM} >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done done done else for BFM in 32 64; do for BFACC in 1 2 4 8; do export ${PASS}_BF=${BFACC} srun -n 1 ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} ${BFM} >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done fi export ${PASS}_2D_BLOCKING=1 export IFM_SUBTASKS=1 export OFM_SUBTASKS=1 if [ $PASS == "UPD" ] then for COLUMNS in 2 4; do export ${PASS}_COLUMN_TEAMS=${COLUMNS} 
ROWS=$((THREADS / COLUMNS)) export ${PASS}_ROW_TEAMS=${ROWS} for BFM in 32 64; do for BFACC in 1 2 3 6 9 10 15 30 45 90; do export ${PASS}_BF=${BFACC} srun -n 1 ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} ${BFM} >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done done else for COLUMNS in 2 4; do export ${PASS}_COLUMN_TEAMS=${COLUMNS} ROWS=$((THREADS / COLUMNS)) export ${PASS}_ROW_TEAMS=${ROWS} for BFM in 32 64; do for BFACC in 1 2 4 8; do export ${PASS}_BF=${BFACC} srun -n 1 ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} ${BFM} >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done done fi done done done libxsmm-1.17/samples/deeplearning/fullyconnecteddriver/fc_bf16_tuning_job_non_square.sh000077500000000000000000000203711415223013700316660ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH --partition=clx #SBATCH --ntasks=1 #SBATCH --cpus-per-task=112 #SBATCH --time=2:00:00 export OMP_NUM_THREADS=28 export KMP_AFFINITY=granularity=fine,compact,1,0 export CHECK=1 ITERS=1000 # Initialize Env vars export FWD_BF=1 export BWD_BF=1 export UPD_BF=1 export FWD_2D_BLOCKING=1 export BWD_2D_BLOCKING=1 export UPD_2D_BLOCKING=1 export FWD_ROW_TEAMS=1 export FWD_COLUMN_TEAMS=1 export BWD_ROW_TEAMS=1 export BWD_COLUMN_TEAMS=1 export UPD_ROW_TEAMS=1 export UPD_COLUMN_TEAMS=1 export IFM_SUBTASKS=1 export OFM_SUBTASKS=1 MB=2160 BFN=24 # Tune layer (100,1024) IFM=100 OFM=1024 for THREADS in 20 24; do for PASS in 'FWD' 'BWD' 'UPD'; do if [ $PASS == "FWD" ] then PASS_ARG='F' fi if [ $PASS == "BWD" ] then PASS_ARG='B' fi if [ $PASS == "UPD" ] then PASS_ARG='U' fi rm -f ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} touch ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} export OMP_NUM_THREADS=${THREADS} export ${PASS}_2D_BLOCKING=0 if [ $PASS == "UPD" ] then for IFMSUBTASKS in 1 2; do export IFM_SUBTASKS=${IFMSUBTASKS} for OFMSUBTASKS in 1 2; do export 
OFM_SUBTASKS=${OFMSUBTASKS} for BFM in 32 64; do for BFACC in 1 2 3 6 9 10 15 30 45 90; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} 50 >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done done done for IFMSUBTASKS in 1; do export IFM_SUBTASKS=${IFMSUBTASKS} for OFMSUBTASKS in 1 2; do export OFM_SUBTASKS=${OFMSUBTASKS} for BFM in 32 64; do for BFACC in 1 2 3 6 9 10 15 30 45 90; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} 100 >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done done done fi if [ $PASS == "FWD" ] then for BFM in 32 64; do for BFACC in 1; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} 100 >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done for BFACC in 1 2 ; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} 50 >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done fi if [ $PASS == "BWD" ] then for BFM in 32 64; do for BFACC in 1 2 4 8; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} 50 >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} 100 >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done fi export ${PASS}_2D_BLOCKING=1 export IFM_SUBTASKS=1 export OFM_SUBTASKS=1 if [ $PASS == "UPD" ] then for COLUMNS in 2 4; do export ${PASS}_COLUMN_TEAMS=${COLUMNS} ROWS=$((THREADS / COLUMNS)) export ${PASS}_ROW_TEAMS=${ROWS} for BFM in 32 64; do for BFACC in 1 2 3 6 9 10 15 30 45 90; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} 50 >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 
${PASS_ARG} B ${BFN} ${BFM} 100 >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done done fi if [ $PASS == "FWD" ] then for COLUMNS in 2 4; do export ${PASS}_COLUMN_TEAMS=${COLUMNS} ROWS=$((THREADS / COLUMNS)) export ${PASS}_ROW_TEAMS=${ROWS} for BFM in 32 64; do for BFACC in 1; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} 100 >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done for BFACC in 1 2; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} 50 >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done done fi if [ $PASS == "BWD" ] then for COLUMNS in 2 4; do export ${PASS}_COLUMN_TEAMS=${COLUMNS} ROWS=$((THREADS / COLUMNS)) export ${PASS}_ROW_TEAMS=${ROWS} for BFM in 32 64; do for BFACC in 1 2 4 8; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} 100 >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} ${BFM} 50 >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done done fi done done # Tune layer (1024,1) IFM=1024 OFM=1 for THREADS in 20 24; do for PASS in 'FWD' 'BWD' 'UPD'; do if [ $PASS == "FWD" ] then PASS_ARG='F' fi if [ $PASS == "BWD" ] then PASS_ARG='B' fi if [ $PASS == "UPD" ] then PASS_ARG='U' fi rm -f ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} touch ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} export OMP_NUM_THREADS=${THREADS} export ${PASS}_2D_BLOCKING=0 if [ $PASS == "UPD" ] then for IFMSUBTASKS in 1 2; do export IFM_SUBTASKS=${IFMSUBTASKS} for OFMSUBTASKS in 1; do export OFM_SUBTASKS=${OFMSUBTASKS} for BFM in 16 32 64; do for BFACC in 1 2 3 6 9 10 15 30 45 90; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} 1 ${BFM} >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} 
done done done done fi if [ $PASS == "FWD" ] then for BFM in 16 32 64; do for BFACC in 1 2 4 8; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} 1 ${BFM} >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done fi if [ $PASS == "BWD" ] then for BFM in 16 32 64; do for BFACC in 1; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} 1 ${BFM} >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done fi export ${PASS}_2D_BLOCKING=1 export IFM_SUBTASKS=1 export OFM_SUBTASKS=1 if [ $PASS == "UPD" ] then for COLUMNS in 2 4; do export ${PASS}_COLUMN_TEAMS=${COLUMNS} ROWS=$((THREADS / COLUMNS)) export ${PASS}_ROW_TEAMS=${ROWS} for BFM in 16 32 64; do for BFACC in 1 2 3 6 9 10 15 30 45 90; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} 1 ${BFM} >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done done fi if [ $PASS == "FWD" ] then for COLUMNS in 2 4; do export ${PASS}_COLUMN_TEAMS=${COLUMNS} ROWS=$((THREADS / COLUMNS)) export ${PASS}_ROW_TEAMS=${ROWS} for BFM in 16 32 64; do for BFACC in 1 2 4 8; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} 1 ${BFM} >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done done fi if [ $PASS == "BWD" ] then for COLUMNS in 2 4; do export ${PASS}_COLUMN_TEAMS=${COLUMNS} ROWS=$((THREADS / COLUMNS)) export ${PASS}_ROW_TEAMS=${ROWS} for BFM in 16 32 64; do for BFACC in 1; do export ${PASS}_BF=${BFACC} ./layer_example_bf16 ${ITERS} ${MB} ${IFM} ${OFM} 0 ${PASS_ARG} B ${BFN} 1 ${BFM} >> ${PASS}_TUNING_${MB}_${IFM}_${OFM}_threads_${THREADS} done done done fi done done 
libxsmm-1.17/samples/deeplearning/fullyconnecteddriver/layer_example_bf16.c000066400000000000000000001132151415223013700272620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include #include #if defined(_OPENMP) # include #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *naive_input, *naive_output, *naive_filter, *naive_delinput, *naive_deloutput, *naive_delfilter, *naive_bias, *naive_delbias; libxsmm_bfloat16 *naive_input_bf16, *naive_filter_bf16, *naive_output_bf16, *naive_delinput_bf16, *naive_delfilter_bf16, *naive_deloutput_bf16, *naive_bias_bf16, *naive_delbias_bf16; float *naive_libxsmm_output_f32, *naive_libxsmm_delinput_f32, *naive_libxsmm_delfilter_f32, *naive_libxsmm_delbias_f32; libxsmm_bfloat16 *naive_libxsmm_output_bf16, *naive_libxsmm_delinput_bf16, *naive_libxsmm_delfilter_bf16, *naive_libxsmm_delbias_bf16; libxsmm_bfloat16 *input_libxsmm, *filter_libxsmm, *delinput_libxsmm, *delfilter_libxsmm, *output_libxsmm, *deloutput_libxsmm, *bias_libxsmm, *delbias_libxsmm; unsigned char *relumask_libxsmm; naive_fullyconnected_t naive_param; void* scratch; size_t scratch_size = 0; /* some parameters we can overwrite via cli, 
default is some inner layer of overfeat */ int iters = 100; /* repetitions of benchmark */ int nImg = 32; /* mini-batch size, "N" */ int nIFm = 256; /* number of input feature maps, "C" */ int nOFm = 256; /* number of input feature maps, "C" */ int fuse_type = 0; /* 0: nothing fused, 1: relu fused, 2: elementwise fused, 3: relu and elementwise fused */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */ char format = 'B'; int bn = 32; int bk = 32; int bc = 32; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double gflop = 0.0; int i; libxsmm_dnn_fullyconnected_desc fullyconnected_desc; libxsmm_dnn_fullyconnected* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_delinput; libxsmm_dnn_tensor* libxsmm_output; libxsmm_dnn_tensor* libxsmm_deloutput; libxsmm_dnn_tensor* libxsmm_filter; libxsmm_dnn_tensor* libxsmm_delfilter; libxsmm_dnn_tensor* libxsmm_bias; libxsmm_dnn_tensor* libxsmm_delbias; libxsmm_dnn_tensor* libxsmm_relumask; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters nImg nIFm nOFm fuse_type type format\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) nImg = atoi(argv[i++]); if (argc > i) nIFm = atoi(argv[i++]); if (argc > i) nOFm = atoi(argv[i++]); if (argc > i) fuse_type = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if 
(argc > i) format = *(argv[i++]); if (argc > i) bn = atoi(argv[i++]); if (argc > i) bk = atoi(argv[i++]); if (argc > i) bc = atoi(argv[i++]); /* These are tuning parameters to be attached to the perfdump string */ #if 0 int fwd_bf = atoi(getenv("FWD_BF")); int bwd_bf = atoi(getenv("BWD_BF")); int upd_bf = atoi(getenv("UPD_BF")); int fwd_2d_blocking = atoi(getenv("FWD_2D_BLOCKING")); int bwd_2d_blocking = atoi(getenv("BWD_2D_BLOCKING")); int upd_2d_blocking = atoi(getenv("UPD_2D_BLOCKING")); int fwd_row_teams = atoi(getenv("FWD_ROW_TEAMS")); int fwd_column_teams = atoi(getenv("FWD_COLUMN_TEAMS")); int bwd_row_teams = atoi(getenv("BWD_ROW_TEAMS")); int bwd_column_teams = atoi(getenv("BWD_COLUMN_TEAMS")); int upd_row_teams = atoi(getenv("UPD_ROW_TEAMS")); int upd_column_teams = atoi(getenv("UPD_COLUMN_TEAMS")); int ifm_subtasks = atoi(getenv("IFM_SUBTASKS")); int ofm_subtasks = atoi(getenv("OFM_SUBTASKS")); #endif int fwd_bf = 1; int bwd_bf = 1; int upd_bf = 1; int fwd_2d_blocking = 1; int bwd_2d_blocking = 1; int upd_2d_blocking = 1; int fwd_row_teams = 1; int fwd_column_teams = 1; int bwd_row_teams = 1; int bwd_column_teams = 1; int upd_row_teams = 1; int upd_column_teams = 1; int ifm_subtasks = 1; int ofm_subtasks = 1; if ( nImg % bn != 0 ) { bn = nImg; } if ( nIFm % bc != 0 ) { bc = nIFm; } if ( nOFm % bk != 0 ) { bk = nOFm; } if (type != 'A' && type != 'F' && type != 'B' && type != 'U' && type != 'M') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only), 'U' (UP only). 
'M' (BPUP-fused only)\n"); return -1; } if ( (fuse_type < 0) || (fuse_type > 5) ) { printf("fuse type needs to be 0 (None), 1 (Bias), 2 (ReLU), 3 (Sigmoid), 4 (Bias+ReLU), 5 (Bias+Sigmoid)\n"); return -1; } if (format != 'L' && format != 'B') { printf("format needs to be 'L' (libxsmm) or 'B' (for locked NCNC KCCK)\n"); return -1; } /* set struct for naive convolution */ naive_param.N = nImg; naive_param.C = nIFm; naive_param.K = nOFm; naive_param.fuse_type = fuse_type; #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d C:%d K:%d\n", nImg, nIFm, nOFm); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nIFm*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nOFm*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nIFm* sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nOFm* sizeof(float))/(1024.0*1024.0) ); printf("SIZE Filter : %10.2f MiB\n", (double)(nIFm*nOFm*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); /* allocate data */ naive_input = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152); naive_delinput = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152); naive_output = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); naive_deloutput = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); naive_filter = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152); naive_delfilter = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152); 
naive_bias = (float*)libxsmm_aligned_malloc( nOFm *sizeof(float), 2097152); naive_delbias = (float*)libxsmm_aligned_malloc( nOFm *sizeof(float), 2097152); naive_input_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152); naive_delinput_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152); naive_output_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(libxsmm_bfloat16), 2097152); naive_deloutput_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(libxsmm_bfloat16), 2097152); naive_filter_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152); naive_delfilter_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152); naive_bias_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nOFm *sizeof(libxsmm_bfloat16), 2097152); naive_delbias_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nOFm *sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_output_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_delinput_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_delfilter_bf16= (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_delbias_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nOFm *sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_output_f32 = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); naive_libxsmm_delinput_f32 = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152); naive_libxsmm_delfilter_f32 = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152); naive_libxsmm_delbias_f32 = (float*)libxsmm_aligned_malloc( nOFm *sizeof(float), 2097152); input_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152); delinput_libxsmm = 
(libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152); output_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(libxsmm_bfloat16), 2097152); deloutput_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(libxsmm_bfloat16), 2097152); filter_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152); delfilter_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152); bias_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nOFm *sizeof(libxsmm_bfloat16), 2097152); delbias_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nOFm *sizeof(libxsmm_bfloat16), 2097152); relumask_libxsmm = (unsigned char*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(unsigned char), 2097152); /* initialize data */ init_buf( naive_input, nImg*nIFm, 0, 0 ); init_buf( naive_delinput, nImg*nIFm, 0, 0 ); init_buf( naive_output, nImg*nOFm, 0, 0 ); init_buf( naive_deloutput, nImg*nOFm, 0, 0 ); init_buf( naive_filter, nIFm*nOFm, 0, 0 ); init_buf( naive_delfilter, nIFm*nOFm, 0, 0 ); init_buf( naive_bias, nOFm, 0, 0 ); init_buf( naive_delbias, nOFm, 0, 0 ); libxsmm_rne_convert_fp32_bf16( naive_input, naive_input_bf16, nImg*nIFm ); libxsmm_rne_convert_fp32_bf16( naive_delinput, naive_delinput_bf16, nImg*nIFm ); libxsmm_rne_convert_fp32_bf16( naive_output, naive_output_bf16, nImg*nOFm ); libxsmm_rne_convert_fp32_bf16( naive_deloutput, naive_deloutput_bf16, nImg*nOFm ); libxsmm_rne_convert_fp32_bf16( naive_filter, naive_filter_bf16, nIFm*nOFm ); libxsmm_rne_convert_fp32_bf16( naive_delfilter, naive_delfilter_bf16, nIFm*nOFm ); libxsmm_rne_convert_fp32_bf16( naive_bias, naive_bias_bf16, nOFm ); libxsmm_rne_convert_fp32_bf16( naive_delbias, naive_delbias_bf16, nOFm ); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... 
#\n"); printf("##########################################\n"); if (type == 'A' || type == 'F') { naive_fullyconnected_fused_fp(&naive_param, naive_input, naive_output, naive_filter, naive_bias); } if (type == 'A' || type == 'B' || type == 'M') { naive_fullyconnected_fused_bp(&naive_param, naive_delinput, naive_deloutput, naive_filter, naive_delbias, naive_output); } if (type == 'A' || type == 'U' || type == 'M') { naive_fullyconnected_wu(&naive_param, naive_input, naive_deloutput, naive_delfilter); } printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } if (format == 'A' || format == 'B') { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ fullyconnected_desc.N = nImg; fullyconnected_desc.C = nIFm; fullyconnected_desc.K = nOFm; fullyconnected_desc.bn = bn; fullyconnected_desc.bk = bk; fullyconnected_desc.bc = bc; fullyconnected_desc.threads = nThreads; fullyconnected_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; fullyconnected_desc.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; fullyconnected_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED; fullyconnected_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED; if ( fuse_type == 0 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE; } else if ( fuse_type == 1 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS; } else if ( fuse_type == 2 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU; } else if ( fuse_type == 3 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID; } else if ( fuse_type == 4 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU; } else if ( fuse_type == 5 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID; } 
else { /* cannot happen */ } libxsmm_handle = libxsmm_dnn_create_fullyconnected( fullyconnected_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers */ libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); printf("inner activation blocking: %i\n", libxsmm_layout->dim_size[0] ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_filter = libxsmm_dnn_link_tensor( libxsmm_layout, filter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, 
LIBXSMM_DNN_GRADIENT_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delfilter = libxsmm_dnn_link_tensor( libxsmm_layout, delfilter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias = libxsmm_dnn_link_tensor( libxsmm_layout, bias_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delbias = libxsmm_dnn_link_tensor( libxsmm_layout, delbias_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RELU_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_relumask = libxsmm_dnn_link_tensor( libxsmm_layout, relumask_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ /* we can also use the layout functions and set the data on our own external to the library */ matrix_copy_NC_to_NCNC_bf16( naive_input_bf16, input_libxsmm, 1, nImg, nIFm, bn, bc ); matrix_copy_NC_to_NCNC_bf16( naive_delinput_bf16, delinput_libxsmm, 1, nImg, nIFm, bn, bc ); matrix_copy_NC_to_NCNC_bf16( naive_output_bf16, output_libxsmm, 1, nImg, nOFm, bn, bk ); matrix_copy_NC_to_NCNC_bf16( naive_deloutput_bf16, deloutput_libxsmm, 1, nImg, nOFm, bn, bk ); matrix_copy_KC_to_KCCK_bf16( naive_filter_bf16, filter_libxsmm , nIFm, nOFm, bc, bk ); matrix_copy_KC_to_KCCK_bf16( naive_delfilter_bf16, delfilter_libxsmm , nIFm, nOFm, bc, bk ); matrix_copy_NC_to_NCNC_bf16( naive_bias_bf16, bias_libxsmm, 1, 1, nOFm, 1, nOFm ); 
matrix_copy_NC_to_NCNC_bf16( naive_delbias_bf16, delbias_libxsmm, 1, 1, nOFm, 1, nOFm ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_delinput, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_deloutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_delfilter, LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_delbias, LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_relumask, LIBXSMM_DNN_RELU_MASK ) ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_fullyconnected_get_scratch_size( libxsmm_handle, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_scratch( libxsmm_handle, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ init_buf( (float*)scratch, scratch_size/4, 0, 0 ); if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); #if defined(_OPENMP) # pragma omp parallel #endif { #if 
defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ matrix_copy_NCNC_to_NC_bf16( output_libxsmm, naive_libxsmm_output_bf16, 1, nImg, nOFm, bn, bk ); libxsmm_convert_bf16_f32( naive_libxsmm_output_bf16, naive_libxsmm_output_f32, nImg*nOFm ); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nOFm, 1, naive_output, naive_libxsmm_output_f32, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } if ( (type == 'A' || type == 'B') && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ matrix_copy_NCNC_to_NC_bf16( delinput_libxsmm, naive_libxsmm_delinput_bf16, 1, nImg, nIFm, bn, bc ); libxsmm_convert_bf16_f32( naive_libxsmm_delinput_bf16, naive_libxsmm_delinput_f32, nImg*nIFm ); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nIFm, 1, naive_delinput, naive_libxsmm_delinput_f32, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); 
printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); if ( (fuse_type == 1) || (fuse_type == 4) || (fuse_type == 5) ) { /* copy out data */ matrix_copy_NCNC_to_NC_bf16( delbias_libxsmm, naive_libxsmm_delbias_bf16, 1, 1, nOFm, 1, nOFm ); libxsmm_convert_bf16_f32( naive_libxsmm_delbias_bf16, naive_libxsmm_delbias_f32, nOFm ); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nOFm, 1, naive_delbias, naive_libxsmm_delbias_f32, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } } if ( (type == 'A' || type == 'U') && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - UPD (custom-Storage) #\n"); printf("##########################################\n"); #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* copy out data */ matrix_copy_KCCK_to_KC_bf16( delfilter_libxsmm, naive_libxsmm_delfilter_bf16, nIFm, nOFm, bc, bk ); libxsmm_convert_bf16_f32( naive_libxsmm_delfilter_bf16, naive_libxsmm_delfilter_f32, nIFm*nOFm ); /* compare */ libxsmm_matdiff(&norms_upd, LIBXSMM_DATATYPE_F32, nIFm*nOFm, 1, naive_delfilter, naive_libxsmm_delfilter_f32, 0, 0); printf("L1 reference : %.25g\n", norms_upd.l1_ref); printf("L1 test : %.25g\n", norms_upd.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd.l2_abs); printf("L2 
rel.error : %.24f\n", norms_upd.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd.linf_rel); printf("Check-norm : %.24f\n", norms_upd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd); } if ( (type == 'A' || type == 'M') && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWDUPD (custom-Storage) #\n"); printf("##########################################\n"); #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } /* copy out data */ matrix_copy_NCNC_to_NC_bf16( delinput_libxsmm, naive_libxsmm_delinput_bf16, 1, nImg, nIFm, bn, bc ); libxsmm_convert_bf16_f32( naive_libxsmm_delinput_bf16, naive_libxsmm_delinput_f32, nImg*nIFm ); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nIFm, 1, naive_delinput, naive_libxsmm_delinput_f32, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); if ( (fuse_type == 1) || (fuse_type == 4) || (fuse_type == 5) ) { /* copy out data */ matrix_copy_NCNC_to_NC_bf16( delbias_libxsmm, naive_libxsmm_delbias_bf16, 1, 1, nOFm, 1, nOFm ); libxsmm_convert_bf16_f32( naive_libxsmm_delbias_bf16, naive_libxsmm_delbias_f32, nOFm ); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nOFm, 1, naive_delbias, naive_libxsmm_delbias_f32, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error 
: %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } /* copy out data */ matrix_copy_KCCK_to_KC_bf16( delfilter_libxsmm, naive_libxsmm_delfilter_bf16, nIFm, nOFm, bc, bk ); libxsmm_convert_bf16_f32( naive_libxsmm_delfilter_bf16, naive_libxsmm_delfilter_f32, nIFm*nOFm ); /* compare */ libxsmm_matdiff(&norms_upd, LIBXSMM_DATATYPE_F32, nIFm*nOFm, 1, naive_delfilter, naive_libxsmm_delfilter_f32, 0, 0); printf("L1 reference : %.25g\n", norms_upd.l1_ref); printf("L1 test : %.25g\n", norms_upd.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd.linf_rel); printf("Check-norm : %.24f\n", norms_upd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); char tune_string_fwd[1000]; 
sprintf(tune_string_fwd,"threads=%d_2D=%d_rows=%d_cols=%d_BN=%d_BK=%d_BC=%d_BFACCUM=%d",nThreads, fwd_2d_blocking, fwd_row_teams, fwd_column_teams, bn, bk, bc, fwd_bf); printf("PERFDUMP,%s,FP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n",tune_string_fwd, LIBXSMM_VERSION, nThreads, nImg, nIFm, nOFm, ((double)(l_total/iters)), gflop/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } if (type == 'A' || type == 'B') { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); char tune_string_bwd[1000]; sprintf(tune_string_bwd,"threads=%d_2D=%d_rows=%d_cols=%d_BN=%d_BK=%d_BC=%d_BFACCUM=%d",nThreads, bwd_2d_blocking, bwd_row_teams, bwd_column_teams, bn, bk, bc, bwd_bf); printf("PERFDUMP,%s,BP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", tune_string_bwd , LIBXSMM_VERSION, nThreads, nImg, nIFm, nOFm, ((double)(l_total/iters)), gflop/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst, norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel); } if (type == 'A' || type == 'U') { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); 
printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); char tune_string_upd[1000]; sprintf(tune_string_upd,"threads=%d_2D=%d_rows=%d_cols=%d_BN=%d_BK=%d_BC=%d_BFACCUM=%d_IFMSUBTASK=%d_OFMSUBTASK=%d",nThreads, upd_2d_blocking, upd_row_teams, upd_column_teams, bn, bk, bc, upd_bf, ifm_subtasks, ofm_subtasks); printf("PERFDUMP,%s,UP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", tune_string_upd , LIBXSMM_VERSION, nThreads, nImg, nIFm, nOFm, ((double)(l_total/iters)), gflop/l_total, norms_upd.l1_ref, norms_upd.l1_tst, norms_upd.l2_abs, norms_upd.l2_rel, norms_upd.linf_abs, norms_upd.linf_rel, norms_upd.normf_rel); } if (type == 'A' || type == 'M') { printf("##########################################\n"); printf("# Performance - BWDUPD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = (4.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = 
%.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,UP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm, nOFm, ((double)(l_total/iters)), gflop/l_total, norms_upd.l1_ref, norms_upd.l1_tst, norms_upd.l2_abs, norms_upd.l2_rel, norms_upd.linf_abs, norms_upd.linf_rel, norms_upd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_scratch( libxsmm_handle ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_RELU_MASK ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_deloutput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delfilter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias ) ); 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delbias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_relumask ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_fullyconnected( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(naive_input); libxsmm_free(naive_output); libxsmm_free(naive_delinput); libxsmm_free(naive_deloutput); libxsmm_free(naive_filter); libxsmm_free(naive_delfilter); libxsmm_free(naive_input_bf16); libxsmm_free(naive_delinput_bf16); libxsmm_free(naive_output_bf16); libxsmm_free(naive_deloutput_bf16); libxsmm_free(naive_filter_bf16); libxsmm_free(naive_delfilter_bf16); libxsmm_free(naive_libxsmm_output_bf16); libxsmm_free(naive_libxsmm_delinput_bf16); libxsmm_free(naive_libxsmm_delfilter_bf16); libxsmm_free(naive_libxsmm_output_f32); libxsmm_free(naive_libxsmm_delinput_f32); libxsmm_free(naive_libxsmm_delfilter_f32); libxsmm_free(input_libxsmm); libxsmm_free(output_libxsmm); libxsmm_free(delinput_libxsmm); libxsmm_free(deloutput_libxsmm); libxsmm_free(filter_libxsmm); libxsmm_free(delfilter_libxsmm); libxsmm_free(naive_bias); libxsmm_free(naive_delbias); libxsmm_free(naive_bias_bf16); libxsmm_free(naive_delbias_bf16); libxsmm_free(naive_libxsmm_delbias_bf16); libxsmm_free(naive_libxsmm_delbias_f32); libxsmm_free(relumask_libxsmm); libxsmm_free(bias_libxsmm); libxsmm_free(delbias_libxsmm); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 
1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/fullyconnecteddriver/layer_example_bf16_f32.c000066400000000000000000000613331415223013700277370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include #include #if defined(_OPENMP) # include #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *naive_input, *naive_output, *naive_filter, *naive_delinput, *naive_deloutput, *naive_delfilter; libxsmm_bfloat16 *naive_input_bf16, *naive_filter_bf16, *naive_delinput_bf16, *naive_delfilter_bf16; float *naive_libxsmm_output, *naive_libxsmm_delinput_f32, *naive_libxsmm_delfilter_f32; libxsmm_bfloat16 *naive_libxsmm_delinput, *naive_libxsmm_delfilter; libxsmm_bfloat16 *input_libxsmm, *filter_libxsmm, *delinput_libxsmm, *delfilter_libxsmm; float *output_libxsmm, *deloutput_libxsmm; naive_fullyconnected_t naive_param; void* scratch; size_t 
scratch_size = 0; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int nImg = 32; /* mini-batch size, "N" */ int nIFm = 256; /* number of input feature maps, "C" */ int nOFm = 256; /* number of input feature maps, "C" */ int fuse_type = 0; /* 0: nothing fused, 1: relu fused, 2: elementwise fused, 3: relu and elementwise fused */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */ char format = 'L'; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double gflop = 0.0; int i; libxsmm_dnn_fullyconnected_desc fullyconnected_desc; libxsmm_dnn_fullyconnected* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_delinput; libxsmm_dnn_tensor* libxsmm_output; libxsmm_dnn_tensor* libxsmm_deloutput; libxsmm_dnn_tensor* libxsmm_filter; libxsmm_dnn_tensor* libxsmm_delfilter; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters nImg nIFm nOFm fuse_type type format\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) nImg = atoi(argv[i++]); if (argc > i) nIFm = atoi(argv[i++]); if (argc > i) nOFm = atoi(argv[i++]); if (argc > i) fuse_type = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (argc > i) format = *(argv[i++]); if (type != 'A' && type != 'F' && type != 'B' && type 
!= 'U') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only), 'U' (UP only)\n"); return -1; } if ( fuse_type != 0 ) { printf("fuse type needs to be 0\n"); return -1; } if (format != 'L') { printf("format needs to be 'L' (libxsmm)\n"); return -1; } /* set struct for naive convolution */ naive_param.N = nImg; naive_param.C = nIFm; naive_param.K = nOFm; naive_param.fuse_type = fuse_type; #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d C:%d K:%d\n", nImg, nIFm, nOFm); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nIFm*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nOFm*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nIFm* sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nOFm* sizeof(float))/(1024.0*1024.0) ); printf("SIZE Filter : %10.2f MiB\n", (double)(nIFm*nOFm*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); /* allocate data */ naive_input = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152); naive_delinput = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152); naive_output = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); naive_deloutput = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); naive_filter = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152); naive_delfilter = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152); naive_input_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( 
nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152); naive_delinput_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152); naive_filter_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152); naive_delfilter_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_delinput = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_output = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); naive_libxsmm_delfilter = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_delinput_f32 = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152); naive_libxsmm_delfilter_f32 = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152); input_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152); delinput_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152); output_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); deloutput_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); filter_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152); delfilter_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152); /* initialize data */ init_buf( naive_input, nImg*nIFm, 0, 0 ); init_buf( naive_delinput, nImg*nIFm, 0, 0 ); init_buf( naive_output, nImg*nOFm, 0, 0 ); init_buf( naive_deloutput, nImg*nOFm, 0, 0 ); init_buf( naive_filter, nIFm*nOFm, 0, 0 ); init_buf( naive_delfilter, nIFm*nOFm, 0, 0 ); libxsmm_rne_convert_fp32_bf16( naive_input, naive_input_bf16, nImg*nIFm ); libxsmm_rne_convert_fp32_bf16( naive_delinput, naive_delinput_bf16, nImg*nIFm ); libxsmm_rne_convert_fp32_bf16( naive_filter, naive_filter_bf16, 
nIFm*nOFm ); libxsmm_rne_convert_fp32_bf16( naive_delfilter, naive_delfilter_bf16, nIFm*nOFm ); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... #\n"); printf("##########################################\n"); if (type == 'A' || type == 'F') { naive_fullyconnected_fp(&naive_param, naive_input, naive_output, naive_filter); } if (type == 'A' || type == 'B') { naive_fullyconnected_bp(&naive_param, naive_delinput, naive_deloutput, naive_filter); } if (type == 'A' || type == 'U') { naive_fullyconnected_wu(&naive_param, naive_input, naive_deloutput, naive_delfilter); } printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } if (format == 'A' || format == 'L') { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ fullyconnected_desc.N = nImg; fullyconnected_desc.C = nIFm; fullyconnected_desc.K = nOFm; fullyconnected_desc.threads = nThreads; fullyconnected_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; fullyconnected_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; fullyconnected_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fullyconnected_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE; libxsmm_handle = libxsmm_dnn_create_fullyconnected( fullyconnected_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers */ libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); printf("inner activation blocking: %i\n", libxsmm_layout->dim_size[0] ); 
libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_filter = libxsmm_dnn_link_tensor( libxsmm_layout, filter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delfilter = libxsmm_dnn_link_tensor( libxsmm_layout, delfilter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ /* we can also use the layout functions and set the data on our own external to the library */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_copyin_tensor( libxsmm_output, (void*)naive_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_filter, (void*)naive_filter_bf16, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delinput, (void*)naive_delinput_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_deloutput, (void*)naive_deloutput, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delfilter, (void*)naive_delfilter_bf16, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_delinput, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_deloutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_delfilter, LIBXSMM_DNN_GRADIENT_FILTER ) ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_fullyconnected_get_scratch_size( libxsmm_handle, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_scratch( libxsmm_handle, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ init_buf( (float*)scratch, scratch_size/4, 0, 0 ); if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD 
(custom-Storage) #\n"); printf("##########################################\n"); #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nOFm, 1, naive_output, naive_libxsmm_output, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } if ( (type == 'A' || type == 'B') && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput, (void*)naive_libxsmm_delinput, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); libxsmm_convert_bf16_f32( naive_libxsmm_delinput, naive_libxsmm_delinput_f32, nImg*nIFm ); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nIFm, 1, naive_delinput, naive_libxsmm_delinput_f32, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 
abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ( (type == 'A' || type == 'U') && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - UPD (custom-Storage) #\n"); printf("##########################################\n"); #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delfilter, (void*)naive_libxsmm_delfilter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); libxsmm_convert_bf16_f32( naive_libxsmm_delfilter, naive_libxsmm_delfilter_f32, nIFm*nOFm ); /* compare */ libxsmm_matdiff(&norms_upd, LIBXSMM_DATATYPE_F32, nIFm*nOFm, 1, naive_delfilter, naive_libxsmm_delfilter_f32, 0, 0); printf("L1 reference : %.25g\n", norms_upd.l1_ref); printf("L1 test : %.25g\n", norms_upd.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd.linf_rel); printf("Check-norm : %.24f\n", norms_upd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { 
libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm, nOFm, ((double)(l_total/iters)), gflop/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } if (type == 'A' || type == 'B') { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm, nOFm, ((double)(l_total/iters)), gflop/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst, norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel); } if (type == 'A' || type == 'U') { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); 
printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,UP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm, nOFm, ((double)(l_total/iters)), gflop/l_total, norms_upd.l1_ref, norms_upd.l1_tst, norms_upd.l2_abs, norms_upd.l2_rel, norms_upd.linf_abs, norms_upd.linf_rel, norms_upd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_scratch( libxsmm_handle ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_destroy_tensor( libxsmm_deloutput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delfilter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_fullyconnected( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(naive_input); libxsmm_free(naive_output); libxsmm_free(naive_delinput); libxsmm_free(naive_deloutput); libxsmm_free(naive_filter); libxsmm_free(naive_delfilter); libxsmm_free(naive_input_bf16); libxsmm_free(naive_delinput_bf16); libxsmm_free(naive_filter_bf16); libxsmm_free(naive_delfilter_bf16); libxsmm_free(naive_libxsmm_output); libxsmm_free(naive_libxsmm_delinput); libxsmm_free(naive_libxsmm_delfilter); libxsmm_free(naive_libxsmm_delinput_f32); libxsmm_free(naive_libxsmm_delfilter_f32); libxsmm_free(input_libxsmm); libxsmm_free(output_libxsmm); libxsmm_free(delinput_libxsmm); libxsmm_free(deloutput_libxsmm); libxsmm_free(filter_libxsmm); libxsmm_free(delfilter_libxsmm); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/fullyconnecteddriver/layer_example_f32.c000066400000000000000000001034031415223013700271140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include #include #if defined(_OPENMP) # include #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *naive_input, *naive_output, *naive_filter, *naive_delinput, *naive_deloutput, *naive_delfilter; float *naive_bias, *naive_delbias, *naive_deloutput_copy; float *naive_libxsmm_output, *naive_libxsmm_delinput, *naive_libxsmm_delfilter; float *input_libxsmm, *output_libxsmm, *filter_libxsmm, *delinput_libxsmm, *deloutput_libxsmm, *delfilter_libxsmm; float *bias_libxsmm, *delbias_libxsmm; unsigned char *relumask_libxsmm; naive_fullyconnected_t naive_param; void* scratch; size_t scratch_size = 0; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int nImg = 32; /* mini-batch size, "N" */ int nIFm = 256; /* number of input feature maps, "C" */ int nOFm = 256; /* number of input feature maps, "C" */ int fuse_type = 0; /* 0: nothing fused, 1: relu fused, 2: elementwise fused, 3: relu and elementwise fused */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */ char format = 'L'; int bn = 64; int bk = 64; int bc = 64; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 
1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double gflop = 0.0; int i; libxsmm_dnn_fullyconnected_desc fullyconnected_desc; libxsmm_dnn_fullyconnected* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_delinput; libxsmm_dnn_tensor* libxsmm_output; libxsmm_dnn_tensor* libxsmm_deloutput; libxsmm_dnn_tensor* libxsmm_filter; libxsmm_dnn_tensor* libxsmm_delfilter; libxsmm_dnn_tensor* libxsmm_bias; libxsmm_dnn_tensor* libxsmm_delbias; libxsmm_dnn_tensor* libxsmm_relumask; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters nImg nIFm nOFm fuse_type type format\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) nImg = atoi(argv[i++]); if (argc > i) nIFm = atoi(argv[i++]); if (argc > i) nOFm = atoi(argv[i++]); if (argc > i) fuse_type = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (argc > i) format = *(argv[i++]); if (argc > i) bn = atoi(argv[i++]); if (argc > i) bk = atoi(argv[i++]); if (argc > i) bc = atoi(argv[i++]); if (type != 'A' && type != 'F' && type != 'B' && type != 'U' && type != 'M') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only), 'U' (UP only). 
'M' (BPUP-fused only)\n"); return -1; } if ( (fuse_type < 0) || (fuse_type > 5) ) { printf("fuse type needs to be 0 (None), 1 (Bias), 2 (ReLU), 3 (Sigmoid), 4 (Bias+ReLU), 5 (Bias+Sigmoid)\n"); return -1; } if (format != 'L' && format != 'B') { printf("format needs to be 'L' (libxsmm) or 'B' (for locked NCNC KCCK)\n"); return -1; } /* set struct for naive convolution */ naive_param.N = nImg; naive_param.C = nIFm; naive_param.K = nOFm; naive_param.fuse_type = fuse_type; #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d C:%d K:%d\n", nImg, nIFm, nOFm); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nIFm*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nOFm*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nIFm* sizeof(float))/(1024.0*1024.0) ); printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nOFm* sizeof(float))/(1024.0*1024.0) ); printf("SIZE Filter : %10.2f MiB\n", (double)(nIFm*nOFm*sizeof(float))/(1024.0*1024.0) ); /* allocate data */ naive_input = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152); naive_delinput = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152); naive_output = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); naive_deloutput = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); naive_deloutput_copy = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); naive_filter = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152); naive_delfilter = 
(float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152); naive_bias = (float*)libxsmm_aligned_malloc( nOFm *sizeof(float), 2097152); naive_delbias = (float*)libxsmm_aligned_malloc( nOFm *sizeof(float), 2097152); naive_libxsmm_delinput = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152); naive_libxsmm_output = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); naive_libxsmm_delfilter = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152); input_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152); delinput_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152); output_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); deloutput_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152); filter_libxsmm = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152); delfilter_libxsmm = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152); bias_libxsmm = (float*)libxsmm_aligned_malloc( nOFm *sizeof(float), 2097152); delbias_libxsmm = (float*)libxsmm_aligned_malloc( nOFm *sizeof(float), 2097152); relumask_libxsmm = (unsigned char*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(unsigned char), 2097152); /* initialize data */ init_buf( naive_input, nImg*nIFm, 0, 0 ); init_buf( naive_delinput, nImg*nIFm, 0, 0 ); init_buf( naive_output, nImg*nOFm, 0, 0 ); init_buf( naive_deloutput, nImg*nOFm, 0, 0 ); init_buf( naive_filter, nIFm*nOFm, 0, 0 ); init_buf( naive_delfilter, nIFm*nOFm, 0, 0 ); init_buf( naive_bias, nOFm, 0, 0 ); init_buf( naive_delbias, nOFm, 0, 0 ); copy_buf( naive_deloutput, naive_deloutput_copy, nImg*nOFm ); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... 
#\n"); printf("##########################################\n"); if (type == 'A' || type == 'F') { naive_fullyconnected_fused_fp(&naive_param, naive_input, naive_output, naive_filter, naive_bias); } if (type == 'A' || type == 'B' || type == 'M') { naive_fullyconnected_fused_bp(&naive_param, naive_delinput, naive_deloutput, naive_filter, naive_delbias, naive_output); } if (type == 'A' || type == 'U' || type == 'M') { naive_fullyconnected_wu(&naive_param, naive_input, naive_deloutput, naive_delfilter); } printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } if (format == 'B' || format == 'L') { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ fullyconnected_desc.N = nImg; fullyconnected_desc.C = nIFm; fullyconnected_desc.K = nOFm; fullyconnected_desc.bn = bn; fullyconnected_desc.bk = bk; fullyconnected_desc.bc = bc; fullyconnected_desc.threads = nThreads; fullyconnected_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; fullyconnected_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; if (nImg % bn != 0) { bn = nImg; } if (nIFm % bc != 0) { bc = nIFm; } if (nOFm % bk != 0) { bk = nOFm; } if ( format == 'L' ) { fullyconnected_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fullyconnected_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; } else { fullyconnected_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED; fullyconnected_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED; } if ( fuse_type == 0 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE; } else if ( fuse_type == 1 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS; } else if ( fuse_type == 2 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU; } else if ( fuse_type == 3 ) { 
fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID; } else if ( fuse_type == 4 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU; } else if ( fuse_type == 5 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID; } else { /* cannot happen */ } libxsmm_handle = libxsmm_dnn_create_fullyconnected( fullyconnected_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers */ libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); printf("inner activation blocking: %i\n", libxsmm_layout->dim_size[0] ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER, &status ); CHKERR_LIBXSMM_DNN( status 
); libxsmm_filter = libxsmm_dnn_link_tensor( libxsmm_layout, filter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delfilter = libxsmm_dnn_link_tensor( libxsmm_layout, delfilter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); if ( format == 'B' ) { libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias = libxsmm_dnn_link_tensor( libxsmm_layout, bias_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delbias = libxsmm_dnn_link_tensor( libxsmm_layout, delbias_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RELU_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_relumask = libxsmm_dnn_link_tensor( libxsmm_layout, relumask_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); } /* copy in data to LIBXSMM format */ /* we can also use the layout functions and set the data on our own external to the library */ if ( format == 'L' ) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_output, (void*)naive_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_filter, 
(void*)naive_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delinput, (void*)naive_delinput, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_deloutput, (void*)naive_deloutput_copy, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delfilter, (void*)naive_delfilter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); } else { matrix_copy_NC_to_NCNC( naive_input, input_libxsmm, 1, nImg, nIFm, bn, bc ); matrix_copy_NC_to_NCNC( naive_delinput, delinput_libxsmm, 1, nImg, nIFm, bn, bc ); matrix_copy_NC_to_NCNC( naive_output, output_libxsmm, 1, nImg, nOFm, bn, bk ); matrix_copy_NC_to_NCNC( naive_deloutput_copy, deloutput_libxsmm, 1, nImg, nOFm, bn, bk ); matrix_copy_KC_to_KCCK( naive_filter, filter_libxsmm , nIFm, nOFm, bc, bk ); matrix_copy_KC_to_KCCK( naive_delfilter, delfilter_libxsmm , nIFm, nOFm, bc, bk ); copy_buf(naive_bias, bias_libxsmm, nOFm); copy_buf(naive_delbias, delbias_libxsmm, nOFm); } /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_delinput, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_deloutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_delfilter, LIBXSMM_DNN_GRADIENT_FILTER ) ); if ( format == 'B' ) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_delbias, LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_relumask, LIBXSMM_DNN_RELU_MASK ) ); } /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_fullyconnected_get_scratch_size( libxsmm_handle, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_scratch( libxsmm_handle, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ init_buf( (float*)scratch, scratch_size/4, 0, 0 ); if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ if ( format == 'L' ) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); } else { matrix_copy_NCNC_to_NC( output_libxsmm, naive_libxsmm_output, 1, nImg, nOFm, bn, bk ); } /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nOFm, 1, naive_output, naive_libxsmm_output, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } if ( (type == 'A' || type == 'B') 
&& LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ if ( format == 'L' ) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput, (void*)naive_libxsmm_delinput, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); } else { matrix_copy_NCNC_to_NC( delinput_libxsmm, naive_libxsmm_delinput, 1, nImg, nIFm, bn, bc ); } /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nIFm, 1, naive_delinput, naive_libxsmm_delinput, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); if ( (fuse_type == 1) || (fuse_type == 4) || (fuse_type == 5) ) { libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nOFm, 1, naive_delbias, delbias_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } } if ( (type == 'A' || type == 'U') && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - UPD 
(custom-Storage) #\n"); printf("##########################################\n"); #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* copy out data */ if ( format == 'L' ) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delfilter, (void*)naive_libxsmm_delfilter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); } else { matrix_copy_KCCK_to_KC( delfilter_libxsmm, naive_libxsmm_delfilter, nIFm, nOFm, bc, bk ); } /* compare */ libxsmm_matdiff(&norms_upd, LIBXSMM_DATATYPE_F32, nIFm*nOFm, 1, naive_delfilter, naive_libxsmm_delfilter, 0, 0); printf("L1 reference : %.25g\n", norms_upd.l1_ref); printf("L1 test : %.25g\n", norms_upd.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd.linf_rel); printf("Check-norm : %.24f\n", norms_upd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd); } if ( (type == 'A' || type == 'M') && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWDUPD (custom-Storage) #\n"); printf("##########################################\n"); #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } /* copy out data */ if ( format == 'L' ) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput, (void*)naive_libxsmm_delinput, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delfilter, (void*)naive_libxsmm_delfilter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); } else { matrix_copy_NCNC_to_NC( 
delinput_libxsmm, naive_libxsmm_delinput, 1, nImg, nIFm, bn, bc ); matrix_copy_KCCK_to_KC( delfilter_libxsmm, naive_libxsmm_delfilter, nIFm, nOFm, bc, bk ); } /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nIFm, 1, naive_delinput, naive_libxsmm_delinput, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); if ( (fuse_type == 1) || (fuse_type == 4) || (fuse_type == 5) ) { libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nOFm, 1, naive_delbias, delbias_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } libxsmm_matdiff(&norms_upd, LIBXSMM_DATATYPE_F32, nIFm*nOFm, 1, naive_delfilter, naive_libxsmm_delfilter, 0, 0); printf("L1 reference : %.25g\n", norms_upd.l1_ref); printf("L1 test : %.25g\n", norms_upd.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd.linf_rel); printf("Check-norm : %.24f\n", norms_upd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); l_start = 
libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm, nOFm, ((double)(l_total/iters)), gflop/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } if (type == 'A' || type == 'B') { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm, nOFm, ((double)(l_total/iters)), gflop/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst, norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, 
norms_bwd.linf_rel, norms_bwd.normf_rel); } if (type == 'A' || type == 'U') { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,UP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm, nOFm, ((double)(l_total/iters)), gflop/l_total, norms_upd.l1_ref, norms_upd.l1_tst, norms_upd.l2_abs, norms_upd.l2_rel, norms_upd.linf_abs, norms_upd.linf_rel, norms_upd.normf_rel); } if (type == 'A' || type == 'M') { printf("##########################################\n"); printf("# Performance - BWDUPD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = (4.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); 
printf("PERFDUMP,UP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm, nOFm, ((double)(l_total/iters)), gflop/l_total, norms_upd.l1_ref, norms_upd.l1_tst, norms_upd.l2_abs, norms_upd.l2_rel, norms_upd.linf_abs, norms_upd.linf_rel, norms_upd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_scratch( libxsmm_handle ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER ) ); if ( format == 'B' ) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_RELU_MASK ) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_deloutput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delfilter ) ); if ( format == 'B' ) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( 
libxsmm_delbias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_relumask ) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_fullyconnected( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(naive_input); libxsmm_free(naive_output); libxsmm_free(naive_delinput); libxsmm_free(naive_deloutput); libxsmm_free(naive_deloutput_copy); libxsmm_free(naive_filter); libxsmm_free(naive_delfilter); libxsmm_free(naive_bias); libxsmm_free(naive_delbias); libxsmm_free(naive_libxsmm_output); libxsmm_free(naive_libxsmm_delinput); libxsmm_free(naive_libxsmm_delfilter); libxsmm_free(input_libxsmm); libxsmm_free(output_libxsmm); libxsmm_free(delinput_libxsmm); libxsmm_free(deloutput_libxsmm); libxsmm_free(filter_libxsmm); libxsmm_free(delfilter_libxsmm); libxsmm_free(bias_libxsmm); libxsmm_free(delbias_libxsmm); libxsmm_free(relumask_libxsmm); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/fullyconnecteddriver/run_fullyconnected.sh000077500000000000000000000067651415223013700277250ustar00rootroot00000000000000#!/usr/bin/env bash set -eo pipefail UNAME=$(command -v uname) SORT=$(command -v sort) GREP=$(command -v grep) CUT=$(command -v cut) WC=$(command -v wc) TR=$(command -v tr) NUMA=-1 if [ "" = "${CHECK}" ] || [ "0" = "${CHECK}" ]; then if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=64; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1000; fi else # check if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=64; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1; fi fi if [ $# -ne 9 ] then echo "Usage: $(basename $0) 
format=(L, B) bin=(f32, bf16) iters MB type=(A, F, B, U, M) fuse=(0 (None), 1 (Bias), 2 (ReLU), 3 (Sigmoid), 4 (Bias+ReLU), 5 (Bias+Sigmoid)) bn bc bk" FORMAT=B BIN=f32 ITERS=${CHECK_DNN_ITERS} MB=${CHECK_DNN_MB} TYPE=A FUSE=0 BN=32 BC=32 BK=32 else FORMAT=$1 BIN=$2 ITERS=$3 MB=$4 TYPE=$5 FUSE=$6 BN=$7 BC=$8 BK=$9 fi if [ "${GREP}" ] && [ "${SORT}" ] && [ "${CUT}" ] && [ "${TR}" ] && [ "${WC}" ]; then if [ "$(command -v lscpu)" ]; then NS=$(lscpu | ${GREP} -m1 "Socket(s)" | ${TR} -d " " | ${CUT} -d: -f2) if [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(lscpu | ${GREP} -m1 "Core(s) per socket" | ${TR} -d " " | ${CUT} -d: -f2))) NT=$((NC*$(lscpu | ${GREP} -m1 "Thread(s) per core" | ${TR} -d " " | ${CUT} -d: -f2))) elif [ -e /proc/cpuinfo ]; then NS=$(${GREP} "physical id" /proc/cpuinfo | ${SORT} -u | ${WC} -l | ${TR} -d " ") if [ "" = "${NS}" ] || [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(${GREP} -m1 "cpu cores" /proc/cpuinfo | ${TR} -d " " | ${CUT} -d: -f2))) NT=$(${GREP} "core id" /proc/cpuinfo | ${WC} -l | ${TR} -d " ") elif [ "Darwin" = "$(uname)" ]; then NS=$(sysctl hw.packages | ${CUT} -d: -f2 | ${TR} -d " ") NC=$(sysctl hw.physicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") NT=$(sysctl hw.logicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") fi if [ "${NC}" ] && [ "${NT}" ]; then HT=$((NT/NC)) else NS=1 NC=1 NT=1 HT=1 fi if [ "$(command -v numactl)" ]; then NN=$(numactl -H | ${GREP} "available:" | ${CUT} -d' ' -f2) else NN=${NS} fi fi CPUFLAGS=$(if [ "${GREP}" ] && [ "${CUT}" ] && [ -e /proc/cpuinfo ]; then ${GREP} -m1 flags /proc/cpuinfo | ${CUT} -d: -f2- || true; fi) if [ "${GREP}" ] && [ "$(echo "${CPUFLAGS}" | ${GREP} -o avx512er)" ]; then if [ "0" != "$((0>NUMA))" ] && [ "0" != "$((NS #include #include #include #include #if defined(_OPENMP) # include #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", 
libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *naive_input, *naive_output, *naive_input_add, *naive_delinput_add, *naive_delinput, *naive_deloutput; float *naive_input_pad, *naive_output_pad, *naive_input_add_pad, *naive_delinput_add_pad, *naive_delinput_pad, *naive_deloutput_pad; libxsmm_bfloat16 *naive_input_pad_bf16, *naive_output_pad_bf16, *naive_input_add_pad_bf16, *naive_delinput_add_pad_bf16, *naive_delinput_pad_bf16, *naive_deloutput_pad_bf16; libxsmm_bfloat16 *naive_libxsmm_output, *naive_libxsmm_delinput, *naive_libxsmm_delinput_add; float *naive_libxsmm_output_f32, *naive_libxsmm_delinput_f32, *naive_libxsmm_delinput_add_f32; float *naive_beta, *naive_gamma, *naive_delbeta, *naive_delgamma, *naive_expectval, *naive_rcpstddev, *naive_variance; libxsmm_bfloat16 *input_libxsmm, *output_libxsmm, *input_add_libxsmm, *delinput_libxsmm, *deloutput_libxsmm, *delinput_add_libxsmm; float *beta_libxsmm, *gamma_libxsmm, *delbeta_libxsmm, *delgamma_libxsmm, *expectval_libxsmm, *rcpstddev_libxsmm, *variance_libxsmm; unsigned char* relumask_libxsmm; int ifhp, ifwp, ofhp, ofwp, ofh, ofw; int stride_h, stride_w; naive_fusedbatchnorm_t naive_param; void* scratch; size_t scratch_size = 0; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int ifw = 14; /* input width, "W" */ int ifh = 20; /* input height, "H" */ int nImg = 32; /* mini-batch size, "N" */ int nFm = 256; /* number of input feature maps, "C" */ int stride = 1; /* stride when accessing inputs */ int pad_h_in = 0; /* padding mode */ int pad_w_in = 0; /* padding mode */ int pad_h_out = 0; /* padding mode */ int pad_w_out = 0; /* padding mode */ int norm_type = 0; /* 0: full batchnorm, 1: batch scaling only */ int fuse_type = 0; /* 0: nothing fused, 1: relu fused, 2: elementwise fused, 3: relu and elementwise fused */ char type = 'A'; /* 'A': ALL, 
'F': FP, 'B': BP, 'U', WU */ char format = 'L'; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double gb = 0.0; double gib = 0.0; int i; int relu_no_match; libxsmm_dnn_fusedbatchnorm_desc fusedbatchnorm_desc; libxsmm_dnn_fusedbatchnorm* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_delinput; libxsmm_dnn_tensor* libxsmm_output; libxsmm_dnn_tensor* libxsmm_deloutput; libxsmm_dnn_tensor* libxsmm_input_add; libxsmm_dnn_tensor* libxsmm_delinput_add; libxsmm_dnn_tensor* libxsmm_beta; libxsmm_dnn_tensor* libxsmm_gamma; libxsmm_dnn_tensor* libxsmm_delbeta; libxsmm_dnn_tensor* libxsmm_delgamma; libxsmm_dnn_tensor* libxsmm_expectval; libxsmm_dnn_tensor* libxsmm_rcpstddev; libxsmm_dnn_tensor* libxsmm_variance; libxsmm_dnn_tensor* libxsmm_relumask; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters inpWidth inpHeight nImg nFm pad_w_in pad_h_in pad_w_out pad_h_out stride type format\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) ifw = atoi(argv[i++]); if (argc > i) ifh = atoi(argv[i++]); if (argc > i) nImg = atoi(argv[i++]); if (argc > i) nFm = atoi(argv[i++]); if (argc > i) pad_w_in = atoi(argv[i++]); if (argc > i) pad_h_in = atoi(argv[i++]); if (argc > i) pad_w_out = atoi(argv[i++]); if (argc > i) pad_h_out = atoi(argv[i++]); if (argc > i) stride = atoi(argv[i++]); if (argc > i) norm_type = 
atoi(argv[i++]); if (argc > i) fuse_type = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (type != 'A' && type != 'F' && type != 'B') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only)\n"); return -1; } if ((norm_type != 0) && (norm_type != 1)) { printf("norm type needs to be 0 or 1\n"); return -1; } if ((fuse_type < 0) || (fuse_type > 5)) { printf("fuse type needs to be 0, 1, 2, 3, 4 or 5\n"); return -1; } stride_w = stride; stride_h = stride; /* deriving some values for naive code */ ofh = ifh/stride_h; ofw = ifw/stride_w; ifhp = ifh + 2 * pad_h_in; ifwp = ifw + 2 * pad_w_in; ofhp = ofh + 2 * pad_h_out; ofwp = ofw + 2 * pad_w_out; /* set struct for naive convolution */ naive_param.N = nImg; naive_param.C = nFm; naive_param.H = ifh; naive_param.W = ifw; naive_param.stride_h = stride_h; naive_param.stride_w = stride_w; naive_param.norm_type = norm_type; naive_param.fuse_type = fuse_type; #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: W:%d H:%d N:%d C:%d P:%d Q:%d STRIDE:%d\n", ifw, ifh, nImg, nFm, ofh, ofw, stride); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf(" InImg %dx%d Padded (%dx%d)\n", ifh, ifw, ifhp, ifwp); printf("OutImg %dx%d Padded (%dx%d)\n", ofh, ofw, ofhp, ofwp); printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nFm*ofhp*ofwp*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nFm*ifhp*ifwp* sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("SIZE Output (1): %10.2f MiB\n", 
(double)(1*nFm*ofhp*ofwp* sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); #if defined(USE_OVERWRITE) printf("Using Overwrite Option\n"); #endif /* allocate data */ naive_input = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_input_add = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_delinput = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_delinput_add = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_output = (float*)libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); naive_deloutput = (float*)libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); naive_input_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_input_add_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_delinput_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_delinput_add_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_output_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_deloutput_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_input_pad_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); naive_input_add_pad_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); naive_delinput_pad_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); naive_delinput_add_pad_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); naive_output_pad_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); naive_deloutput_pad_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( 
nImg*nFm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_output = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_delinput = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_delinput_add = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_output_f32 = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_libxsmm_delinput_f32 = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_libxsmm_delinput_add_f32 = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); input_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); delinput_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); input_add_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); delinput_add_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); output_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); deloutput_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); naive_beta = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_gamma = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_delbeta = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_delgamma = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_expectval = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_rcpstddev = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_variance = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); 
beta_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); gamma_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); delbeta_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); delgamma_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); expectval_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); rcpstddev_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); variance_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); relumask_libxsmm = (unsigned char*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(unsigned char), 2097152); /* initialize data */ init_buf( naive_input, nImg*nFm*ifh*ifw, 0, 0 ); copy_internal_nchw( naive_input_pad , naive_input, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in ); init_buf( naive_delinput, nImg*nFm*ifh*ifw, 0, 0 ); copy_internal_nchw( naive_delinput_pad, naive_delinput, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in ); init_buf( naive_input_add, nImg*nFm*ifh*ifw, 0, 0 ); copy_internal_nchw( naive_input_add_pad, naive_input_add, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in ); init_buf( naive_delinput_add, nImg*nFm*ifh*ifw, 0, 0 ); copy_internal_nchw( naive_delinput_add_pad, naive_delinput_add, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in ); init_buf( naive_output, nImg*nFm*ofh*ofw, 0, 0 ); copy_internal_nchw( naive_output_pad, naive_output, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out ); init_buf( naive_deloutput, nImg*nFm*ofh*ofw, 0, 0 ); copy_internal_nchw( naive_deloutput_pad, naive_deloutput, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out ); set_zeropad_nchw(naive_input_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_delinput_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_input_add_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_delinput_add_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_output_pad, nImg, nFm, ofhp, ofwp, 
pad_h_out, pad_w_out); set_zeropad_nchw(naive_deloutput_pad, nImg, nFm, ofhp, ofwp, pad_h_out, pad_w_out); libxsmm_rne_convert_fp32_bf16( naive_input_pad, naive_input_pad_bf16, nImg*nFm*ifhp*ifwp ); libxsmm_rne_convert_fp32_bf16( naive_delinput_pad, naive_delinput_pad_bf16, nImg*nFm*ifhp*ifwp ); libxsmm_rne_convert_fp32_bf16( naive_input_add_pad, naive_input_add_pad_bf16, nImg*nFm*ifhp*ifwp ); libxsmm_rne_convert_fp32_bf16( naive_delinput_add_pad, naive_delinput_add_pad_bf16, nImg*nFm*ifhp*ifwp ); libxsmm_rne_convert_fp32_bf16( naive_output_pad, naive_output_pad_bf16, nImg*nFm*ofhp*ofwp ); libxsmm_rne_convert_fp32_bf16( naive_deloutput_pad, naive_deloutput_pad_bf16, nImg*nFm*ofhp*ofwp ); init_buf(naive_beta, nFm, 0, 0); init_buf(naive_gamma, nFm, 0, 0); init_buf(naive_delbeta, nFm, 0, 0); init_buf(naive_delgamma, nFm, 0, 0); init_buf(naive_expectval, nFm, 0, 0); init_buf(naive_rcpstddev, nFm, 0, 0); init_buf(naive_variance, nFm, 0, 0); copy_buf(naive_beta, beta_libxsmm, nFm); copy_buf(naive_gamma, gamma_libxsmm, nFm); copy_buf(naive_delbeta, delbeta_libxsmm, nFm); copy_buf(naive_delgamma, delgamma_libxsmm, nFm); copy_buf(naive_expectval, expectval_libxsmm, nFm); copy_buf(naive_rcpstddev, rcpstddev_libxsmm, nFm); copy_buf(naive_variance, variance_libxsmm, nFm); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... #\n"); printf("##########################################\n"); if (type == 'A' || type == 'F') { naive_fusedbatchnorm_fp(&naive_param, naive_input, naive_output, naive_input_add, naive_beta, naive_gamma, naive_expectval, naive_rcpstddev, naive_variance); } if (type == 'A' || type == 'B') { naive_fusedbatchnorm_bp(&naive_param, naive_input, naive_delinput, naive_output, naive_deloutput, naive_delinput_add, naive_beta, naive_delbeta, naive_gamma, naive_delgamma, naive_expectval, naive_rcpstddev); } printf("##########################################\n"); printf("# Computing Reference ... 
done #\n"); printf("##########################################\n"); } if (format == 'A' || format == 'L') { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ fusedbatchnorm_desc.partN = nImg; fusedbatchnorm_desc.fullN = nImg; fusedbatchnorm_desc.C = nFm; fusedbatchnorm_desc.H = ifh; fusedbatchnorm_desc.W = ifw; fusedbatchnorm_desc.u = stride_h; fusedbatchnorm_desc.v = stride_w; fusedbatchnorm_desc.pad_h_in = pad_h_in; fusedbatchnorm_desc.pad_w_in = pad_w_in; fusedbatchnorm_desc.pad_h_out = pad_h_out; fusedbatchnorm_desc.pad_w_out = pad_w_out; fusedbatchnorm_desc.threads = nThreads; fusedbatchnorm_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; fusedbatchnorm_desc.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; fusedbatchnorm_desc.datatype_stats = LIBXSMM_DNN_DATATYPE_F32; fusedbatchnorm_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fusedbatchnorm_desc.fuse_order = LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU; if ( norm_type == 0 ) { if ( fuse_type == 0 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN; } else if ( fuse_type == 1 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_RELU; } else if ( fuse_type == 2 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE; } else if ( fuse_type == 3 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE_RELU; } else if ( fuse_type == 4 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_RELU_WITH_MASK; } else if ( fuse_type == 5 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE_RELU_WITH_MASK; } else { /* shouldn't happen */ return -1; } } else { if ( fuse_type == 0 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE; } else if ( fuse_type == 1 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_RELU; } else if ( fuse_type == 2 ) { fusedbatchnorm_desc.fuse_ops = 
LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE; } else if ( fuse_type == 3 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE_RELU; } else if ( fuse_type == 4 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_RELU_WITH_MASK; } else if ( fuse_type == 5 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE_RELU_WITH_MASK; } else { /* shouldn't happen */ return -1; } } libxsmm_handle = libxsmm_dnn_create_fusedbatchnorm( fusedbatchnorm_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers */ libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT_ADD, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input_add = libxsmm_dnn_link_tensor( libxsmm_layout, input_add_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT_ADD, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput_add = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_add_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, 
LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_BETA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_beta = libxsmm_dnn_link_tensor( libxsmm_layout, beta_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_BETA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delbeta = libxsmm_dnn_link_tensor( libxsmm_layout, delbeta_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_gamma = libxsmm_dnn_link_tensor( libxsmm_layout, gamma_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delgamma = libxsmm_dnn_link_tensor( libxsmm_layout, delgamma_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, 
LIBXSMM_DNN_CHANNEL_EXPECTVAL, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_expectval = libxsmm_dnn_link_tensor( libxsmm_layout, expectval_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_CHANNEL_RCPSTDDEV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_rcpstddev = libxsmm_dnn_link_tensor( libxsmm_layout, rcpstddev_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_CHANNEL_VARIANCE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_variance = libxsmm_dnn_link_tensor( libxsmm_layout, variance_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RELU_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_relumask = libxsmm_dnn_link_tensor( libxsmm_layout, relumask_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ /* we can also use the layout functions and set the data on our own external to the library */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input_pad_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_output, (void*)naive_output_pad_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input_add, (void*)naive_input_add_pad_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delinput, (void*)naive_delinput_pad_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_deloutput, (void*)naive_deloutput_pad_bf16, 
LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delinput_add, (void*)naive_delinput_add_pad_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_delinput, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_deloutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_input_add, LIBXSMM_DNN_REGULAR_INPUT_ADD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_delinput_add, LIBXSMM_DNN_GRADIENT_INPUT_ADD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_beta, LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_gamma, LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_delbeta, LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_delgamma, LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_expectval, LIBXSMM_DNN_CHANNEL_EXPECTVAL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_rcpstddev, LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_variance, LIBXSMM_DNN_CHANNEL_VARIANCE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_relumask, 
LIBXSMM_DNN_RELU_MASK ) ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_fusedbatchnorm_get_scratch_size( libxsmm_handle, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_scratch( libxsmm_handle, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ init_buf( (float*)scratch, scratch_size/4, 0, 0 ); if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); libxsmm_convert_bf16_f32( naive_libxsmm_output, naive_libxsmm_output_f32, nImg*nFm*ofhp*ofwp ); copy_internal_nchw( naive_output_pad, naive_output, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out); /* compare */ printf("rcpstddev:\n"); libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nFm, 1, naive_rcpstddev, rcpstddev_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); printf("variance:\n"); libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nFm, 1, naive_variance, variance_libxsmm, 0, 0); 
printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); printf("expected value:\n"); libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nFm, 1, naive_expectval, expectval_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); printf("output:\n"); libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ofhp*ofwp, 1, naive_output_pad, naive_libxsmm_output_f32, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); /* let's check ReLU positions */ relu_no_match = 0; for ( i = 0; i < nImg*nFm*ofhp*ofwp; ++i ) { if ( (naive_output_pad[i] == 0.0f && naive_libxsmm_output_f32[i] != 0.0f) || (naive_output_pad[i] != 0.0f && naive_libxsmm_output_f32[i] == 0.0f) ) { relu_no_match++; } } printf("ReLU mismatch count: %i\n", relu_no_match ); } if ( (type == 'A' || type == 'B') && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); 
printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput, (void*)naive_libxsmm_delinput, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput_add, (void*)naive_libxsmm_delinput_add, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); libxsmm_convert_bf16_f32( naive_libxsmm_delinput, naive_libxsmm_delinput_f32, nImg*nFm*ifhp*ifwp ); libxsmm_convert_bf16_f32( naive_libxsmm_delinput_add, naive_libxsmm_delinput_add_f32, nImg*nFm*ifhp*ifwp ); copy_internal_nchw( naive_delinput_pad, naive_delinput, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in); copy_internal_nchw( naive_delinput_add_pad, naive_delinput_add, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in); /* compare */ printf("delinput_add:\n"); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ifhp*ifwp, 1, naive_delinput_add_pad, naive_libxsmm_delinput_add_f32, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); printf("delbeta:\n"); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nFm, 1, naive_delbeta, delbeta_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", 
norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); printf("delgamma:\n"); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nFm, 1, naive_delgamma, delgamma_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); printf("delinput:\n"); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ifhp*ifwp, 1, naive_delinput_pad, naive_libxsmm_delinput_f32, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gb = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + 
((double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1000*1000*1000); gib = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + ((double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1024*1024*1024); printf("GB = %.5g\n", gb/(double)iters); printf("GiB = %.5g\n", gib/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GB/s = %.5g\n", gb/l_total); printf("GiB/s = %.5g\n", gib/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nFm, ifw, ifh, stride, pad_w_in, pad_h_in, pad_w_out, pad_h_out, ((double)(l_total/iters)), gb/l_total, gib/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } if ( (type == 'A' || type == 'B') ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gb = (2.0*(double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1000*1000*1000); gib = (2.0*(double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1024*1024*1024); printf("GB = %.5g\n", gb/(double)iters); printf("GiB = %.5g\n", gib/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GB/s = %.5g\n", gb/l_total); printf("GiB/s = %.5g\n", gib/l_total); 
printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nFm, ifw, ifh, stride, pad_w_in, pad_h_in, pad_w_out, pad_h_out, ((double)(l_total/iters)), gb/l_total, gib/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst, norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_scratch( libxsmm_handle ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT_ADD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT_ADD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_CHANNEL_EXPECTVAL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_CHANNEL_VARIANCE ) ); 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_RELU_MASK) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_deloutput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input_add ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delinput_add ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_beta ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delbeta ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_gamma ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delgamma ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_expectval ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_rcpstddev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_variance ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_relumask ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_fusedbatchnorm( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(naive_input); libxsmm_free(naive_input_add); libxsmm_free(naive_output); libxsmm_free(naive_delinput); libxsmm_free(naive_delinput_add); libxsmm_free(naive_deloutput); libxsmm_free(naive_input_pad); libxsmm_free(naive_input_add_pad); libxsmm_free(naive_output_pad); libxsmm_free(naive_delinput_pad); libxsmm_free(naive_delinput_add_pad); libxsmm_free(naive_deloutput_pad); libxsmm_free(naive_input_pad_bf16); libxsmm_free(naive_input_add_pad_bf16); libxsmm_free(naive_output_pad_bf16); libxsmm_free(naive_delinput_pad_bf16); libxsmm_free(naive_delinput_add_pad_bf16); libxsmm_free(naive_deloutput_pad_bf16); libxsmm_free(naive_beta); libxsmm_free(naive_gamma); libxsmm_free(naive_delbeta); libxsmm_free(naive_delgamma); libxsmm_free(naive_expectval); libxsmm_free(naive_rcpstddev); 
libxsmm_free(naive_variance); libxsmm_free(naive_libxsmm_output); libxsmm_free(naive_libxsmm_delinput); libxsmm_free(naive_libxsmm_delinput_add); libxsmm_free(naive_libxsmm_output_f32); libxsmm_free(naive_libxsmm_delinput_f32); libxsmm_free(naive_libxsmm_delinput_add_f32); libxsmm_free(input_libxsmm); libxsmm_free(input_add_libxsmm); libxsmm_free(output_libxsmm); libxsmm_free(delinput_libxsmm); libxsmm_free(delinput_add_libxsmm); libxsmm_free(deloutput_libxsmm); libxsmm_free(beta_libxsmm); libxsmm_free(gamma_libxsmm); libxsmm_free(delbeta_libxsmm); libxsmm_free(delgamma_libxsmm); libxsmm_free(expectval_libxsmm); libxsmm_free(rcpstddev_libxsmm); libxsmm_free(variance_libxsmm); libxsmm_free(relumask_libxsmm); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/fusedbndriver/layer_example_bf16.vcxproj000066400000000000000000000547721415223013700271570ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 layer_example_bf16 10.0 {2723C222-5053-4CDE-93B8-A5CF62E51FD6} Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/fusedbndriver/layer_example_f32.c000066400000000000000000001177121415223013700255340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include #include #include #include #include #if defined(_OPENMP) # include #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *naive_input, *naive_output, *naive_input_add, *naive_delinput_add, *naive_delinput, *naive_deloutput; float *naive_input_pad, *naive_output_pad, *naive_input_add_pad, *naive_delinput_add_pad, *naive_delinput_pad, *naive_deloutput_pad; float *naive_libxsmm_output, *naive_libxsmm_delinput, *naive_libxsmm_delinput_add; float *naive_beta, *naive_gamma, *naive_delbeta, *naive_delgamma, *naive_expectval, *naive_rcpstddev, *naive_variance; float *input_libxsmm, *output_libxsmm, *input_add_libxsmm, *delinput_libxsmm, *deloutput_libxsmm, *delinput_add_libxsmm; float *beta_libxsmm, *gamma_libxsmm, *delbeta_libxsmm, *delgamma_libxsmm, *expectval_libxsmm, *rcpstddev_libxsmm, *variance_libxsmm; unsigned char* relumask_libxsmm; int ifhp, ifwp, ofhp, ofwp, ofh, ofw; int stride_h, stride_w; naive_fusedbatchnorm_t naive_param; void* scratch; size_t scratch_size = 0; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int ifw = 14; /* input width, "W" */ int ifh = 20; /* input height, "H" */ int nImg = 32; /* mini-batch size, "N" */ int nFm = 256; /* number of input feature maps, "C" */ int stride = 1; /* stride when accessing inputs */ int pad_h_in = 0; /* padding mode */ int pad_w_in = 0; /* padding mode */ int pad_h_out = 0; /* padding mode */ int pad_w_out = 0; /* padding mode */ int norm_type = 0; /* 0: full batchnorm, 1: batch scaling only */ int fuse_type = 0; /* 0: nothing fused, 1: relu fused, 2: 
elementwise fused, 3: relu and elementwise fused */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */ char format = 'L'; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double gb = 0.0; double gib = 0.0; int i; int relu_no_match; libxsmm_dnn_fusedbatchnorm_desc fusedbatchnorm_desc; libxsmm_dnn_fusedbatchnorm* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_delinput; libxsmm_dnn_tensor* libxsmm_output; libxsmm_dnn_tensor* libxsmm_deloutput; libxsmm_dnn_tensor* libxsmm_input_add; libxsmm_dnn_tensor* libxsmm_delinput_add; libxsmm_dnn_tensor* libxsmm_beta; libxsmm_dnn_tensor* libxsmm_gamma; libxsmm_dnn_tensor* libxsmm_delbeta; libxsmm_dnn_tensor* libxsmm_delgamma; libxsmm_dnn_tensor* libxsmm_expectval; libxsmm_dnn_tensor* libxsmm_rcpstddev; libxsmm_dnn_tensor* libxsmm_variance; libxsmm_dnn_tensor* libxsmm_relumask; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters inpWidth inpHeight nImg nFm pad_w_in pad_h_in pad_w_out pad_h_out stride type format\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) ifw = atoi(argv[i++]); if (argc > i) ifh = atoi(argv[i++]); if (argc > i) nImg = atoi(argv[i++]); if (argc > i) nFm = atoi(argv[i++]); if (argc > i) pad_w_in = atoi(argv[i++]); if (argc > i) pad_h_in = atoi(argv[i++]); if (argc > i) pad_w_out = atoi(argv[i++]); if (argc > i) pad_h_out = 
atoi(argv[i++]); if (argc > i) stride = atoi(argv[i++]); if (argc > i) norm_type = atoi(argv[i++]); if (argc > i) fuse_type = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (type != 'A' && type != 'F' && type != 'B') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only)\n"); return -1; } if ((norm_type != 0) && (norm_type != 1)) { printf("norm type needs to be 0 or 1\n"); return -1; } if ((fuse_type < 0) || (fuse_type > 5)) { printf("fuse type needs to be 0, 1, 2, 3, 4 or 5\n"); return -1; } stride_w = stride; stride_h = stride; /* deriving some values for naive code */ ofh = ifh/stride_h; ofw = ifw/stride_w; ifhp = ifh + 2 * pad_h_in; ifwp = ifw + 2 * pad_w_in; ofhp = ofh + 2 * pad_h_out; ofwp = ofw + 2 * pad_w_out; /* set struct for naive convolution */ naive_param.N = nImg; naive_param.C = nFm; naive_param.H = ifh; naive_param.W = ifw; naive_param.stride_h = stride_h; naive_param.stride_w = stride_w; naive_param.norm_type = norm_type; naive_param.fuse_type = fuse_type; #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: W:%d H:%d N:%d C:%d P:%d Q:%d STRIDE:%d\n", ifw, ifh, nImg, nFm, ofh, ofw, stride); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf(" InImg %dx%d Padded (%dx%d)\n", ifh, ifw, ifhp, ifwp); printf("OutImg %dx%d Padded (%dx%d)\n", ofh, ofw, ofhp, ofwp); printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nFm*ifhp*ifwp*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nFm*ofhp*ofwp*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nFm*ifhp*ifwp* sizeof(float))/(1024.0*1024.0) ); 
printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nFm*ofhp*ofwp* sizeof(float))/(1024.0*1024.0) ); #if defined(USE_OVERWRITE) printf("Using Overwrite Option\n"); #endif /* allocate data */ naive_input = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_input_add = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_delinput = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_delinput_add = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_output = (float*)libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); naive_deloutput = (float*)libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); naive_input_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_input_add_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_delinput_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_delinput_add_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_output_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_deloutput_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_libxsmm_output = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_libxsmm_delinput = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_libxsmm_delinput_add = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); input_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); delinput_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); input_add_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); delinput_add_libxsmm = (float*)libxsmm_aligned_malloc( 
nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); output_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); deloutput_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_beta = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_gamma = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_delbeta = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_delgamma = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_expectval = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_rcpstddev = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_variance = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); beta_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); gamma_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); delbeta_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); delgamma_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); expectval_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); rcpstddev_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); variance_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); relumask_libxsmm = (unsigned char*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(unsigned char), 2097152); /* initialize data */ init_buf( naive_input, nImg*nFm*ifh*ifw, 0, 0 ); copy_internal_nchw( naive_input_pad , naive_input, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in ); init_buf( naive_delinput, nImg*nFm*ifh*ifw, 0, 0 ); copy_internal_nchw( naive_delinput_pad, naive_delinput, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in ); init_buf( naive_input_add, nImg*nFm*ifh*ifw, 0, 0 ); copy_internal_nchw( naive_input_add_pad, naive_input_add, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in ); init_buf( naive_delinput_add, nImg*nFm*ifh*ifw, 0, 0 ); copy_internal_nchw( 
naive_delinput_add_pad, naive_delinput_add, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in ); init_buf( naive_output, nImg*nFm*ofh*ofw, 0, 0 ); copy_internal_nchw( naive_output_pad, naive_output, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out ); init_buf( naive_deloutput, nImg*nFm*ofh*ofw, 0, 0 ); copy_internal_nchw( naive_deloutput_pad, naive_deloutput, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out ); set_zeropad_nchw(naive_input_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_delinput_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_input_add_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_delinput_add_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_output_pad, nImg, nFm, ofhp, ofwp, pad_h_out, pad_w_out); set_zeropad_nchw(naive_deloutput_pad, nImg, nFm, ofhp, ofwp, pad_h_out, pad_w_out); init_buf(naive_beta, nFm, 0, 0); init_buf(naive_gamma, nFm, 0, 0); init_buf(naive_delbeta, nFm, 0, 0); init_buf(naive_delgamma, nFm, 0, 0); init_buf(naive_expectval, nFm, 0, 0); init_buf(naive_rcpstddev, nFm, 0, 0); init_buf(naive_variance, nFm, 0, 0); copy_buf(naive_beta, beta_libxsmm, nFm); copy_buf(naive_gamma, gamma_libxsmm, nFm); copy_buf(naive_delbeta, delbeta_libxsmm, nFm); copy_buf(naive_delgamma, delgamma_libxsmm, nFm); copy_buf(naive_expectval, expectval_libxsmm, nFm); copy_buf(naive_rcpstddev, rcpstddev_libxsmm, nFm); copy_buf(naive_variance, variance_libxsmm, nFm); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... 
#\n"); printf("##########################################\n"); if (type == 'A' || type == 'F') { naive_fusedbatchnorm_fp(&naive_param, naive_input, naive_output, naive_input_add, naive_beta, naive_gamma, naive_expectval, naive_rcpstddev, naive_variance); } if (type == 'A' || type == 'B') { naive_fusedbatchnorm_bp(&naive_param, naive_input, naive_delinput, naive_output, naive_deloutput, naive_delinput_add, naive_beta, naive_delbeta, naive_gamma, naive_delgamma, naive_expectval, naive_rcpstddev); } printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } if (format == 'A' || format == 'L') { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ fusedbatchnorm_desc.partN = nImg; fusedbatchnorm_desc.fullN = nImg; fusedbatchnorm_desc.C = nFm; fusedbatchnorm_desc.H = ifh; fusedbatchnorm_desc.W = ifw; fusedbatchnorm_desc.u = stride_h; fusedbatchnorm_desc.v = stride_w; fusedbatchnorm_desc.pad_h_in = pad_h_in; fusedbatchnorm_desc.pad_w_in = pad_w_in; fusedbatchnorm_desc.pad_h_out = pad_h_out; fusedbatchnorm_desc.pad_w_out = pad_w_out; fusedbatchnorm_desc.threads = nThreads; fusedbatchnorm_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; fusedbatchnorm_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; fusedbatchnorm_desc.datatype_stats = LIBXSMM_DNN_DATATYPE_F32; fusedbatchnorm_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fusedbatchnorm_desc.fuse_order = LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU; if ( norm_type == 0 ) { if ( fuse_type == 0 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN; } else if ( fuse_type == 1 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_RELU; } else if ( fuse_type == 2 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE; } else if ( fuse_type == 3 ) { 
fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE_RELU; } else if ( fuse_type == 4 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_RELU_WITH_MASK; } else if ( fuse_type == 5 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE_RELU_WITH_MASK; } else { /* shouldn't happen */ return -1; } } else { if ( fuse_type == 0 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE; } else if ( fuse_type == 1 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_RELU; } else if ( fuse_type == 2 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE; } else if ( fuse_type == 3 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE_RELU; } else if ( fuse_type == 4 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_RELU_WITH_MASK; } else if ( fuse_type == 5 ) { fusedbatchnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE_RELU_WITH_MASK; } else { /* shouldn't happen */ return -1; } } libxsmm_handle = libxsmm_dnn_create_fusedbatchnorm( fusedbatchnorm_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers */ libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); printf("inner activation blocking: %i\n", libxsmm_layout->dim_size[0] ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, 
LIBXSMM_DNN_REGULAR_INPUT_ADD, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input_add = libxsmm_dnn_link_tensor( libxsmm_layout, input_add_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT_ADD, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput_add = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_add_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_BETA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_beta = libxsmm_dnn_link_tensor( libxsmm_layout, beta_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_BETA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delbeta = libxsmm_dnn_link_tensor( libxsmm_layout, delbeta_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( 
libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_gamma = libxsmm_dnn_link_tensor( libxsmm_layout, gamma_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delgamma = libxsmm_dnn_link_tensor( libxsmm_layout, delgamma_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_CHANNEL_EXPECTVAL, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_expectval = libxsmm_dnn_link_tensor( libxsmm_layout, expectval_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_CHANNEL_RCPSTDDEV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_rcpstddev = libxsmm_dnn_link_tensor( libxsmm_layout, rcpstddev_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_CHANNEL_VARIANCE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_variance = libxsmm_dnn_link_tensor( libxsmm_layout, variance_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RELU_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_relumask = libxsmm_dnn_link_tensor( libxsmm_layout, relumask_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ /* we can also use the layout 
functions and set the data on our own external to the library */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_output, (void*)naive_output_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input_add, (void*)naive_input_add_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delinput, (void*)naive_delinput_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_deloutput, (void*)naive_deloutput_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delinput_add, (void*)naive_delinput_add_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_delinput, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_deloutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_input_add, LIBXSMM_DNN_REGULAR_INPUT_ADD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_delinput_add, LIBXSMM_DNN_GRADIENT_INPUT_ADD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_beta, LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_gamma, LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_delbeta, 
LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_delgamma, LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_expectval, LIBXSMM_DNN_CHANNEL_EXPECTVAL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_rcpstddev, LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_variance, LIBXSMM_DNN_CHANNEL_VARIANCE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle, libxsmm_relumask, LIBXSMM_DNN_RELU_MASK ) ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_fusedbatchnorm_get_scratch_size( libxsmm_handle, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_scratch( libxsmm_handle, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ init_buf( (float*)scratch, scratch_size/4, 0, 0 ); if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); copy_internal_nchw( naive_output_pad, naive_output, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out); /* compare */ printf("rcpstddev:\n"); libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nFm, 1, naive_rcpstddev, 
rcpstddev_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); printf("variance:\n"); libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nFm, 1, naive_variance, variance_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); printf("expected value:\n"); libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nFm, 1, naive_expectval, expectval_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); printf("output:\n"); libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ofhp*ofwp, 1, naive_output_pad, naive_libxsmm_output, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", 
norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); /* let's check ReLU positions */ relu_no_match = 0; for ( i = 0; i < nImg*nFm*ofhp*ofwp; ++i ) { if ( (naive_output_pad[i] == 0.0f && naive_libxsmm_output[i] != 0.0f) || (naive_output_pad[i] != 0.0f && naive_libxsmm_output[i] == 0.0f) ) { relu_no_match++; } } printf("ReLU mismatch count: %i\n", relu_no_match ); } if ( (type == 'A' || type == 'B') && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput, (void*)naive_libxsmm_delinput, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput_add, (void*)naive_libxsmm_delinput_add, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); copy_internal_nchw( naive_delinput_pad, naive_delinput, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in); copy_internal_nchw( naive_delinput_add_pad, naive_delinput_add, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in); /* compare */ printf("delinput_add:\n"); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ifhp*ifwp, 1, naive_delinput_add_pad, naive_libxsmm_delinput_add, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); 
printf("delbeta:\n"); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nFm, 1, naive_delbeta, delbeta_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); printf("delgamma:\n"); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nFm, 1, naive_delgamma, delgamma_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); printf("delinput:\n"); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ifhp*ifwp, 1, naive_delinput_pad, naive_libxsmm_delinput, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = 
omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gb = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + ((double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1000*1000*1000); gib = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + ((double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1024*1024*1024); printf("GB = %.5g\n", gb/(double)iters); printf("GiB = %.5g\n", gib/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GB/s = %.5g\n", gb/l_total); printf("GiB/s = %.5g\n", gib/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nFm, ifw, ifh, stride, pad_w_in, pad_h_in, pad_w_out, pad_h_out, ((double)(l_total/iters)), gb/l_total, gib/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } if ( (type == 'A' || type == 'B') ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gb = (2.0*(double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1000*1000*1000); gib = 
(2.0*(double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1024*1024*1024); printf("GB = %.5g\n", gb/(double)iters); printf("GiB = %.5g\n", gib/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GB/s = %.5g\n", gb/l_total); printf("GiB/s = %.5g\n", gib/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nFm, ifw, ifh, stride, pad_w_in, pad_h_in, pad_w_out, pad_h_out, ((double)(l_total/iters)), gb/l_total, gib/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst, norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_scratch( libxsmm_handle ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT_ADD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT_ADD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, 
LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_CHANNEL_EXPECTVAL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_CHANNEL_VARIANCE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_RELU_MASK) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_deloutput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input_add ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delinput_add ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_beta ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delbeta ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_gamma ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delgamma ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_expectval ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_rcpstddev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_variance ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_relumask ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_fusedbatchnorm( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(naive_input); libxsmm_free(naive_input_add); libxsmm_free(naive_output); libxsmm_free(naive_delinput); libxsmm_free(naive_delinput_add); libxsmm_free(naive_deloutput); libxsmm_free(naive_input_pad); libxsmm_free(naive_input_add_pad); libxsmm_free(naive_output_pad); libxsmm_free(naive_delinput_pad); libxsmm_free(naive_delinput_add_pad); libxsmm_free(naive_deloutput_pad); libxsmm_free(naive_beta); libxsmm_free(naive_gamma); 
libxsmm_free(naive_delbeta); libxsmm_free(naive_delgamma); libxsmm_free(naive_expectval); libxsmm_free(naive_rcpstddev); libxsmm_free(naive_variance); libxsmm_free(naive_libxsmm_output); libxsmm_free(naive_libxsmm_delinput); libxsmm_free(naive_libxsmm_delinput_add); libxsmm_free(input_libxsmm); libxsmm_free(input_add_libxsmm); libxsmm_free(output_libxsmm); libxsmm_free(delinput_libxsmm); libxsmm_free(delinput_add_libxsmm); libxsmm_free(deloutput_libxsmm); libxsmm_free(beta_libxsmm); libxsmm_free(gamma_libxsmm); libxsmm_free(delbeta_libxsmm); libxsmm_free(delgamma_libxsmm); libxsmm_free(expectval_libxsmm); libxsmm_free(rcpstddev_libxsmm); libxsmm_free(variance_libxsmm); libxsmm_free(relumask_libxsmm); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/fusedbndriver/layer_example_f32.vcxproj000066400000000000000000000547711415223013700270120ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 layer_example_f32 10.0 {8D23BCAE-ECF1-4C6D-AB18-0D028EEDE39D} Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/fusedbndriver/run_resnet50.sh000077500000000000000000000121211415223013700247430ustar00rootroot00000000000000#!/usr/bin/env bash set -eo pipefail UNAME=$(command -v uname) SORT=$(command -v sort) GREP=$(command -v grep) CUT=$(command -v cut) WC=$(command -v wc) TR=$(command -v tr) if [ "" = "${CHECK}" ] || [ "0" = "${CHECK}" ]; then if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=64; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1000; fi else # check if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=64; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1; fi fi if [ $# -ne 7 ] then echo "Usage: $(basename $0) mb iters numa (1-mcdram/0-DDR) prec (f32,bf16) NORM (0,1) FUSE (0,1,2,3) PASS ('A'-ALL/'F'-FP/'B'-BP); using default values; using default values: 64 1000 1 f32 0 0 A" MB=${CHECK_DNN_MB} ITERS=${CHECK_DNN_ITERS} NUMA=-1 BIN=f32 NORM=0 FUSE=0 PASS="A" else MB=$1 ITERS=$2 NUMA=$3 BIN=$4 NORM=$5 FUSE=$6 PASS=$7 fi if [ "${GREP}" ] && [ "${SORT}" ] && [ "${CUT}" ] && [ "${TR}" ] && [ "${WC}" ]; then if [ "$(command -v lscpu)" ]; then NS=$(lscpu | ${GREP} -m1 "Socket(s)" | ${TR} -d " " | ${CUT} -d: -f2) if [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(lscpu | ${GREP} -m1 "Core(s) per socket" | ${TR} -d " " | ${CUT} -d: -f2))) NT=$((NC*$(lscpu | ${GREP} -m1 "Thread(s) per core" | ${TR} -d " " | ${CUT} -d: 
-f2))) elif [ -e /proc/cpuinfo ]; then NS=$(${GREP} "physical id" /proc/cpuinfo | ${SORT} -u | ${WC} -l | ${TR} -d " ") if [ "" = "${NS}" ] || [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(${GREP} -m1 "cpu cores" /proc/cpuinfo | ${TR} -d " " | ${CUT} -d: -f2))) NT=$(${GREP} "core id" /proc/cpuinfo | ${WC} -l | ${TR} -d " ") elif [ "Darwin" = "$(uname)" ]; then NS=$(sysctl hw.packages | ${CUT} -d: -f2 | ${TR} -d " ") NC=$(sysctl hw.physicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") NT=$(sysctl hw.logicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") fi if [ "${NC}" ] && [ "${NT}" ]; then HT=$((NT/NC)) else NS=1 NC=1 NT=1 HT=1 fi if [ "$(command -v numactl)" ]; then NN=$(numactl -H | ${GREP} "available:" | ${CUT} -d' ' -f2) else NN=${NS} fi fi CPUFLAGS=$(if [ "${GREP}" ] && [ "${CUT}" ] && [ -e /proc/cpuinfo ]; then ${GREP} -m1 flags /proc/cpuinfo | ${CUT} -d: -f2- || true; fi) if [ "${GREP}" ] && [ "$(echo "${CPUFLAGS}" | ${GREP} -o avx512er)" ]; then if [ "0" != "$((0>NUMA))" ] && [ "0" != "$((NS #include #include #include #include #if defined(_OPENMP) # include #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *naive_input, *naive_output, *naive_input_add, *naive_delinput_add, *naive_delinput, *naive_deloutput; float *naive_input_pad, *naive_output_pad, *naive_input_add_pad, *naive_delinput_add_pad, *naive_delinput_pad, *naive_deloutput_pad; float *naive_libxsmm_output, *naive_libxsmm_delinput, *naive_libxsmm_delinput_add; float *naive_beta, *naive_gamma, *naive_delbeta, *naive_delgamma, *naive_expectval, *naive_rcpstddev, *naive_variance; float *input_libxsmm, *output_libxsmm, *input_add_libxsmm, *delinput_libxsmm, *deloutput_libxsmm, *delinput_add_libxsmm; float *beta_libxsmm, 
*gamma_libxsmm, *delbeta_libxsmm, *delgamma_libxsmm, *expectval_libxsmm, *rcpstddev_libxsmm, *variance_libxsmm; unsigned char* relumask_libxsmm; int ifhp, ifwp, ofhp, ofwp, ofh, ofw; int stride_h, stride_w; naive_fusedgroupnorm_t naive_param; void* scratch; size_t scratch_size = 0; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int ifw = 14; /* input width, "W" */ int ifh = 20; /* input height, "H" */ int nImg = 32; /* mini-batch size, "N" */ int nFm = 256; /* number of input feature maps, "C" */ int nG = 32; int stride = 1; /* stride when accessing inputs */ int pad_h_in = 0; /* padding mode */ int pad_w_in = 0; /* padding mode */ int pad_h_out = 0; /* padding mode */ int pad_w_out = 0; /* padding mode */ int fuse_type = 0; /* 0: nothing fused, 1: relu fused, 2: elementwise fused, 3: relu and elementwise fused */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */ char format = 'L'; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 
1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double gb = 0.0; double gib = 0.0; int i; int relu_no_match; libxsmm_dnn_fusedgroupnorm_desc fusedgroupnorm_desc; libxsmm_dnn_fusedgroupnorm* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_delinput; libxsmm_dnn_tensor* libxsmm_output; libxsmm_dnn_tensor* libxsmm_deloutput; libxsmm_dnn_tensor* libxsmm_input_add; libxsmm_dnn_tensor* libxsmm_delinput_add; libxsmm_dnn_tensor* libxsmm_beta; libxsmm_dnn_tensor* libxsmm_gamma; libxsmm_dnn_tensor* libxsmm_delbeta; libxsmm_dnn_tensor* libxsmm_delgamma; libxsmm_dnn_tensor* libxsmm_expectval; libxsmm_dnn_tensor* libxsmm_rcpstddev; libxsmm_dnn_tensor* libxsmm_variance; libxsmm_dnn_tensor* libxsmm_relumask; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters inpWidth inpHeight nImg nFm nG pad_w_in pad_h_in pad_w_out pad_h_out stride type format\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) ifw = atoi(argv[i++]); if (argc > i) ifh = atoi(argv[i++]); if (argc > i) nImg = atoi(argv[i++]); if (argc > i) nFm = atoi(argv[i++]); if (argc > i) nG = atoi(argv[i++]); if (argc > i) pad_w_in = atoi(argv[i++]); if (argc > i) pad_h_in = atoi(argv[i++]); if (argc > i) pad_w_out = atoi(argv[i++]); if (argc > i) pad_h_out = atoi(argv[i++]); if (argc > i) stride = atoi(argv[i++]); if (argc > i) fuse_type = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (type != 'A' && type != 'F' && type != 'B') { 
printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only)\n"); return -1; } if ((fuse_type < 0) || (fuse_type > 5)) { printf("fuse type needs to be 0, 1, 2, 3, 4 or 5\n"); return -1; } stride_w = stride; stride_h = stride; /* deriving some values for naive code */ ofh = ifh/stride_h; ofw = ifw/stride_w; ifhp = ifh + 2 * pad_h_in; ifwp = ifw + 2 * pad_w_in; ofhp = ofh + 2 * pad_h_out; ofwp = ofw + 2 * pad_w_out; /* set struct for naive convolution */ naive_param.N = nImg; naive_param.C = nFm; naive_param.G = nG; naive_param.H = ifh; naive_param.W = ifw; naive_param.stride_h = stride_h; naive_param.stride_w = stride_w; naive_param.fuse_type = fuse_type; #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: W:%d H:%d N:%d C:%d P:%d Q:%d STRIDE:%d\n", ifw, ifh, nImg, nFm, ofh, ofw, stride); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf(" InImg %dx%d Padded (%dx%d)\n", ifh, ifw, ifhp, ifwp); printf("OutImg %dx%d Padded (%dx%d)\n", ofh, ofw, ofhp, ofwp); printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nFm*ifhp*ifwp*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nFm*ofhp*ofwp*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nFm*ifhp*ifwp* sizeof(float))/(1024.0*1024.0) ); printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nFm*ofhp*ofwp* sizeof(float))/(1024.0*1024.0) ); #if defined(USE_OVERWRITE) printf("Using Overwrite Option\n"); #endif /* allocate data */ naive_input = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_input_add = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh 
*ifw *sizeof(float), 2097152); naive_delinput = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_delinput_add = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_output = (float*)libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); naive_deloutput = (float*)libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); naive_input_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_input_add_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_delinput_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_delinput_add_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_output_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_deloutput_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_libxsmm_output = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_libxsmm_delinput = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_libxsmm_delinput_add = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); input_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); delinput_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); input_add_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); delinput_add_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); output_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); deloutput_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_beta = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_gamma = (float*)libxsmm_aligned_malloc( nFm* 
sizeof(float), 2097152); naive_delbeta = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_delgamma = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); naive_expectval = (float*)libxsmm_aligned_malloc( nImg*nG* sizeof(float), 2097152); naive_rcpstddev = (float*)libxsmm_aligned_malloc( nImg*nG* sizeof(float), 2097152); naive_variance = (float*)libxsmm_aligned_malloc( nImg*nG* sizeof(float), 2097152); beta_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); gamma_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); delbeta_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); delgamma_libxsmm = (float*)libxsmm_aligned_malloc( nFm* sizeof(float), 2097152); expectval_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nG* sizeof(float), 2097152); rcpstddev_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nG* sizeof(float), 2097152); variance_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nG* sizeof(float), 2097152); relumask_libxsmm = (unsigned char*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(unsigned char), 2097152); /* initialize data */ init_buf( naive_input, nImg*nFm*ifh*ifw, 0, 0 ); copy_internal_nchw( naive_input_pad , naive_input, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in ); init_buf( naive_delinput, nImg*nFm*ifh*ifw, 0, 0 ); copy_internal_nchw( naive_delinput_pad, naive_delinput, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in ); init_buf( naive_input_add, nImg*nFm*ifh*ifw, 0, 0 ); copy_internal_nchw( naive_input_add_pad, naive_input_add, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in ); init_buf( naive_delinput_add, nImg*nFm*ifh*ifw, 0, 0 ); copy_internal_nchw( naive_delinput_add_pad, naive_delinput_add, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in ); init_buf( naive_output, nImg*nFm*ofh*ofw, 0, 0 ); copy_internal_nchw( naive_output_pad, naive_output, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out ); init_buf( naive_deloutput, nImg*nFm*ofh*ofw, 0, 0 ); copy_internal_nchw( naive_deloutput_pad, 
naive_deloutput, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out ); set_zeropad_nchw(naive_input_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_delinput_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_input_add_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_delinput_add_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_output_pad, nImg, nFm, ofhp, ofwp, pad_h_out, pad_w_out); set_zeropad_nchw(naive_deloutput_pad, nImg, nFm, ofhp, ofwp, pad_h_out, pad_w_out); init_buf(naive_beta, nFm, 0, 0); init_buf(naive_gamma, nFm, 0, 0); init_buf(naive_delbeta, nFm, 0, 0); init_buf(naive_delgamma, nFm, 0, 0); init_buf(naive_expectval, nImg*nG, 0, 0); init_buf(naive_rcpstddev, nImg*nG, 0, 0); init_buf(naive_variance, nImg*nG, 0, 0); copy_buf(naive_beta, beta_libxsmm, nFm); copy_buf(naive_gamma, gamma_libxsmm, nFm); copy_buf(naive_delbeta, delbeta_libxsmm, nFm); copy_buf(naive_delgamma, delgamma_libxsmm, nFm); copy_buf(naive_expectval, expectval_libxsmm, nImg*nG); copy_buf(naive_rcpstddev, rcpstddev_libxsmm, nImg*nG); copy_buf(naive_variance, variance_libxsmm, nImg*nG); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... #\n"); printf("##########################################\n"); if (type == 'A' || type == 'F') { naive_fusedgroupnorm_fp(&naive_param, naive_input, naive_output, naive_input_add, naive_beta, naive_gamma, naive_expectval, naive_rcpstddev, naive_variance); } if (type == 'A' || type == 'B') { naive_fusedgroupnorm_bp(&naive_param, naive_input, naive_delinput, naive_output, naive_deloutput, naive_delinput_add, naive_beta, naive_delbeta, naive_gamma, naive_delgamma, naive_expectval, naive_rcpstddev, naive_variance); } printf("##########################################\n"); printf("# Computing Reference ... 
done #\n"); printf("##########################################\n"); } if (format == 'A' || format == 'L') { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ fusedgroupnorm_desc.N = nImg; fusedgroupnorm_desc.G = nG; fusedgroupnorm_desc.C = nFm; fusedgroupnorm_desc.H = ifh; fusedgroupnorm_desc.W = ifw; fusedgroupnorm_desc.u = stride_h; fusedgroupnorm_desc.v = stride_w; fusedgroupnorm_desc.pad_h_in = pad_h_in; fusedgroupnorm_desc.pad_w_in = pad_w_in; fusedgroupnorm_desc.pad_h_out = pad_h_out; fusedgroupnorm_desc.pad_w_out = pad_w_out; fusedgroupnorm_desc.threads = nThreads; fusedgroupnorm_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; fusedgroupnorm_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; fusedgroupnorm_desc.datatype_stats = LIBXSMM_DNN_DATATYPE_F32; fusedgroupnorm_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fusedgroupnorm_desc.fuse_order = LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU; if ( fuse_type == 0 ) { fusedgroupnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDGN_OPS_GN; } else if ( fuse_type == 1 ) { fusedgroupnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDGN_OPS_GN_RELU; } else if ( fuse_type == 2 ) { fusedgroupnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDGN_OPS_GN_ELTWISE; } else if ( fuse_type == 3 ) { fusedgroupnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDGN_OPS_GN_ELTWISE_RELU; } else if ( fuse_type == 4 ) { fusedgroupnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDGN_OPS_GN_RELU_WITH_MASK; } else if ( fuse_type == 5 ) { fusedgroupnorm_desc.fuse_ops = LIBXSMM_DNN_FUSEDGN_OPS_GN_ELTWISE_RELU_WITH_MASK; } else { /* shouldn't happen */ return -1; } printf("fuse type is: %i\n", fuse_type); libxsmm_handle = libxsmm_dnn_create_fusedgroupnorm( fusedgroupnorm_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers */ libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT, 
&status ); CHKERR_LIBXSMM_DNN( status ); printf("inner activation blocking: %i\n", libxsmm_layout->dim_size[0] ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT_ADD, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input_add = libxsmm_dnn_link_tensor( libxsmm_layout, input_add_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT_ADD, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput_add = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_add_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = 
libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_BETA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_beta = libxsmm_dnn_link_tensor( libxsmm_layout, beta_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_BETA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delbeta = libxsmm_dnn_link_tensor( libxsmm_layout, delbeta_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_gamma = libxsmm_dnn_link_tensor( libxsmm_layout, gamma_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delgamma = libxsmm_dnn_link_tensor( libxsmm_layout, delgamma_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_CHANNEL_EXPECTVAL, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_expectval = libxsmm_dnn_link_tensor( libxsmm_layout, expectval_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_CHANNEL_RCPSTDDEV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_rcpstddev = libxsmm_dnn_link_tensor( libxsmm_layout, rcpstddev_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); 
libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_CHANNEL_VARIANCE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_variance = libxsmm_dnn_link_tensor( libxsmm_layout, variance_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RELU_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_relumask = libxsmm_dnn_link_tensor( libxsmm_layout, relumask_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ /* we can also use the layout functions and set the data on our own external to the library */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_output, (void*)naive_output_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input_add, (void*)naive_input_add_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delinput, (void*)naive_delinput_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_deloutput, (void*)naive_deloutput_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delinput_add, (void*)naive_delinput_add_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_delinput, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_deloutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_input_add, LIBXSMM_DNN_REGULAR_INPUT_ADD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_delinput_add, LIBXSMM_DNN_GRADIENT_INPUT_ADD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_beta, LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_gamma, LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_delbeta, LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_delgamma, LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_expectval, LIBXSMM_DNN_CHANNEL_EXPECTVAL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_rcpstddev, LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_variance, LIBXSMM_DNN_CHANNEL_VARIANCE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_tensor( libxsmm_handle, libxsmm_relumask, LIBXSMM_DNN_RELU_MASK ) ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_fusedgroupnorm_get_scratch_size( libxsmm_handle, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_bind_scratch( libxsmm_handle, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ init_buf( (float*)scratch, scratch_size/4, 0, 0 ); if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD 
(custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); copy_internal_nchw( naive_output_pad, naive_output, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out); /* compare */ printf("rcpstddev:\n"); libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nG, 1, naive_rcpstddev, rcpstddev_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); printf("variance:\n"); libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nG, 1, naive_variance, variance_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); printf("expected value:\n"); libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nG, 1, naive_expectval, expectval_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); 
printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); printf("output:\n"); libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ofhp*ofwp, 1, naive_output_pad, naive_libxsmm_output, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); /* let's check ReLU positions */ relu_no_match = 0; for ( i = 0; i < nImg*nFm*ofhp*ofwp; ++i ) { if ( (naive_output_pad[i] == 0.0f && naive_libxsmm_output[i] != 0.0f) || (naive_output_pad[i] != 0.0f && naive_libxsmm_output[i] == 0.0f) ) { relu_no_match++; } } printf("ReLU mismatch count: %i\n", relu_no_match ); } if ( (type == 'A' || type == 'B') && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput, (void*)naive_libxsmm_delinput, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput_add, (void*)naive_libxsmm_delinput_add, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); copy_internal_nchw( naive_delinput_pad, naive_delinput, nImg, 
nFm, ifh, ifw, pad_h_in, pad_w_in); copy_internal_nchw( naive_delinput_add_pad, naive_delinput_add, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in); /* compare */ printf("delinput_add:\n"); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ifhp*ifwp, 1, naive_delinput_add_pad, naive_libxsmm_delinput_add, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); printf("delbeta:\n"); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nFm, 1, naive_delbeta, delbeta_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); printf("delgamma:\n"); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nFm, 1, naive_delgamma, delgamma_libxsmm, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); printf("delinput:\n"); libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ifhp*ifwp, 1, naive_delinput_pad, naive_libxsmm_delinput, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : 
%.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fusedgroupnorm_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gb = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + ((double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1000*1000*1000); gib = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + ((double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1024*1024*1024); printf("GB = %.5g\n", gb/(double)iters); printf("GiB = %.5g\n", gib/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GB/s = %.5g\n", gb/l_total); printf("GiB/s = %.5g\n", gib/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nFm, ifw, ifh, stride, pad_w_in, pad_h_in, pad_w_out, pad_h_out, ((double)(l_total/iters)), gb/l_total, gib/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } if ( (type == 'A' || type == 'B') ) { printf("##########################################\n"); printf("# Performance - BWD 
(custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_fusedgroupnorm_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gb = (2.0*(double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1000*1000*1000); gib = (2.0*(double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1024*1024*1024); printf("GB = %.5g\n", gb/(double)iters); printf("GiB = %.5g\n", gib/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GB/s = %.5g\n", gb/l_total); printf("GiB/s = %.5g\n", gib/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nFm, ifw, ifh, stride, pad_w_in, pad_h_in, pad_w_out, pad_h_out, ((double)(l_total/iters)), gb/l_total, gib/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst, norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_scratch( libxsmm_handle ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT_ADD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT_ADD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_CHANNEL_EXPECTVAL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_CHANNEL_VARIANCE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedgroupnorm_release_tensor( libxsmm_handle, LIBXSMM_DNN_RELU_MASK) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_deloutput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input_add ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delinput_add ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_beta ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delbeta ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_gamma ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delgamma ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_expectval ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_rcpstddev ) ); 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_variance ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_relumask ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_fusedgroupnorm( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(naive_input); libxsmm_free(naive_input_add); libxsmm_free(naive_output); libxsmm_free(naive_delinput); libxsmm_free(naive_delinput_add); libxsmm_free(naive_deloutput); libxsmm_free(naive_input_pad); libxsmm_free(naive_input_add_pad); libxsmm_free(naive_output_pad); libxsmm_free(naive_delinput_pad); libxsmm_free(naive_delinput_add_pad); libxsmm_free(naive_deloutput_pad); libxsmm_free(naive_beta); libxsmm_free(naive_gamma); libxsmm_free(naive_delbeta); libxsmm_free(naive_delgamma); libxsmm_free(naive_expectval); libxsmm_free(naive_rcpstddev); libxsmm_free(naive_variance); libxsmm_free(naive_libxsmm_output); libxsmm_free(naive_libxsmm_delinput); libxsmm_free(naive_libxsmm_delinput_add); libxsmm_free(input_libxsmm); libxsmm_free(input_add_libxsmm); libxsmm_free(output_libxsmm); libxsmm_free(delinput_libxsmm); libxsmm_free(delinput_add_libxsmm); libxsmm_free(deloutput_libxsmm); libxsmm_free(beta_libxsmm); libxsmm_free(gamma_libxsmm); libxsmm_free(delbeta_libxsmm); libxsmm_free(delgamma_libxsmm); libxsmm_free(expectval_libxsmm); libxsmm_free(rcpstddev_libxsmm); libxsmm_free(variance_libxsmm); libxsmm_free(relumask_libxsmm); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 
1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/fusedgndriver/run_resnet50.sh000077500000000000000000000117201415223013700247540ustar00rootroot00000000000000#!/usr/bin/env bash set -eo pipefail UNAME=$(command -v uname) SORT=$(command -v sort) GREP=$(command -v grep) CUT=$(command -v cut) WC=$(command -v wc) TR=$(command -v tr) if [ "" = "${CHECK}" ] || [ "0" = "${CHECK}" ]; then if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=64; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1000; fi else # check if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=64; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1; fi fi if [ $# -ne 6 ] then echo "Usage: $(basename $0) mb iters numa (1-mcdram/0-DDR) prec (f32,bf16) FUSE (0,1,2,3) PASS ('A'-ALL/'F'-FP/'B'-BP); using default values; using default values: 64 1000 1 f32 0 A" MB=${CHECK_DNN_MB} ITERS=${CHECK_DNN_ITERS} NUMA=-1 BIN=f32 FUSE=0 PASS="A" else MB=$1 ITERS=$2 NUMA=$3 BIN=$4 FUSE=$5 PASS=$6 fi if [ "${GREP}" ] && [ "${SORT}" ] && [ "${CUT}" ] && [ "${TR}" ] && [ "${WC}" ]; then if [ "$(command -v lscpu)" ]; then NS=$(lscpu | ${GREP} -m1 "Socket(s)" | ${TR} -d " " | ${CUT} -d: -f2) if [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(lscpu | ${GREP} -m1 "Core(s) per socket" | ${TR} -d " " | ${CUT} -d: -f2))) NT=$((NC*$(lscpu | ${GREP} -m1 "Thread(s) per core" | ${TR} -d " " | ${CUT} -d: -f2))) elif [ -e /proc/cpuinfo ]; then NS=$(${GREP} "physical id" /proc/cpuinfo | ${SORT} -u | ${WC} -l | ${TR} -d " ") if [ "" = "${NS}" ] || [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(${GREP} -m1 "cpu cores" /proc/cpuinfo | ${TR} -d " " | ${CUT} -d: -f2))) NT=$(${GREP} "core id" /proc/cpuinfo | ${WC} -l | ${TR} -d " ") 
elif [ "Darwin" = "$(uname)" ]; then NS=$(sysctl hw.packages | ${CUT} -d: -f2 | ${TR} -d " ") NC=$(sysctl hw.physicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") NT=$(sysctl hw.logicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") fi if [ "${NC}" ] && [ "${NT}" ]; then HT=$((NT/NC)) else NS=1 NC=1 NT=1 HT=1 fi if [ "$(command -v numactl)" ]; then NN=$(numactl -H | ${GREP} "available:" | ${CUT} -d' ' -f2) else NN=${NS} fi fi CPUFLAGS=$(if [ "${GREP}" ] && [ "${CUT}" ] && [ -e /proc/cpuinfo ]; then ${GREP} -m1 flags /proc/cpuinfo | ${CUT} -d: -f2- || true; fi) if [ "${GREP}" ] && [ "$(echo "${CPUFLAGS}" | ${GREP} -o avx512er)" ]; then if [ "0" != "$((0>NUMA))" ] && [ "0" != "$((NS #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *wigold, *wcgold, *wfgold, *rigold, *rcgold, *rfgold, *bigold, *bcgold, *bfgold; float *xgoldt, *hpgold, *hgoldt; float *dwgold, *drgold, *dbgold; float *dxgoldt, *dhpgold, *dhgoldt; float *igoldt, *cgoldt, *fgoldt, *ogoldt; float *xt, *hp, *w, *r, *b, *ht; float *it, *ct, *ft, *ot; float *dxt, *dhp, *dw, *dr, *db, *dht; float *scratch_bu, *dwtest, *drtest; void *scratch, *internalstate; size_t scratch_size = 0, internalstate_size = 0; int iters = 10; /* repetitions of benchmark */ int pass = 0; /* pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD */ int N = 168; /* size of mini-batch */ int C = 512; /* number of inputs */ int K = 256; /* number of outputs */ int t = 50; /* number of time steps (>= 1) */ int bn = 24; int bc 
= 64; int bk = 64; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 0/*disabled by default*/ : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double flops = 0.0; const double tflops = 12; /* transcendental flops */ int j; libxsmm_dnn_rnncell_desc grucell_desc; libxsmm_dnn_rnncell* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_hidden_state_prev; libxsmm_dnn_tensor* libxsmm_weight; libxsmm_dnn_tensor* libxsmm_recur_weight; libxsmm_dnn_tensor* libxsmm_bias; libxsmm_dnn_tensor* libxsmm_hidden_state; libxsmm_dnn_tensor* libxsmm_i; libxsmm_dnn_tensor* libxsmm_c; libxsmm_dnn_tensor* libxsmm_f; libxsmm_dnn_tensor* libxsmm_o; libxsmm_dnn_tensor* libxsmm_dinput; libxsmm_dnn_tensor* libxsmm_dhidden_state_prev; libxsmm_dnn_tensor* libxsmm_dweight; libxsmm_dnn_tensor* libxsmm_drecur_weight; libxsmm_dnn_tensor* libxsmm_dbias; libxsmm_dnn_tensor* libxsmm_dhidden_state; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd_w, norms_upd_r, norms_upd_b, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd_w); libxsmm_matdiff_clear(&norms_upd_r); libxsmm_matdiff_clear(&norms_upd_b); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: ./grudriver [reps] [pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD] [N] [C] [K] [time_steps > 0]\n\n"); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ j = 1; if (argc > j) iters = atoi(argv[j++]); if (argc > j) pass = atoi(argv[j++]); if (argc > j) N = atoi(argv[j++]); if (argc > j) C = atoi(argv[j++]); if (argc > j) K = atoi(argv[j++]); if (argc > j) t = atoi(argv[j++]); if 
(argc > j) bn = atoi(argv[j++]); if (argc > j) bc = atoi(argv[j++]); if (argc > j) bk = atoi(argv[j++]); if (t <= 0) { printf("time_steps %d should be greater than or equal to 1\n\n", t); return 0; } if (!(pass == 0 || pass == 1 || pass == 2 || pass == 3)) { printf("Unknown pass: %d, valid arguments for pass = {0(FWD), 1(BWD), 2(UPD), 3(BWD+UPD)\n\n", pass); return 0; } #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d C:%d K:%d T:%d\n", N, C, K, t); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf("SIZE Weight (MB): %10.2f MiB\n", (double)(C*K*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (MB): %10.2f MiB\n", (double)(N*C*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Hidden State: %10.2f MiB\n", (double)(K*N*sizeof(float))/(1024.0*1024.0) ); /* allocate data */ xgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); hpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wigold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wcgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wfgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); rigold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rcgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rfgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); bigold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bcgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bfgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); hgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); igoldt 
= (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); cgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); fgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ogoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dxgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); dhpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dwgold = (float*)libxsmm_aligned_malloc(C*K*3*sizeof(float), 2097152); drgold = (float*)libxsmm_aligned_malloc(K*K*3*sizeof(float), 2097152); dbgold = (float*)libxsmm_aligned_malloc(K*3*sizeof(float), 2097152); dhgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); scratch_bu = (float*)libxsmm_aligned_malloc(K*N*6*sizeof(float), 2097152); xt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); hp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); w = (float*)libxsmm_aligned_malloc(C*K*3*sizeof(float), 2097152); r = (float*)libxsmm_aligned_malloc(K*K*3*sizeof(float), 2097152); b = (float*)libxsmm_aligned_malloc(K*3*sizeof(float), 2097152); ht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); it = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ct = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ft = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ot = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dxt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); dhp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dw = (float*)libxsmm_aligned_malloc(C*K*3*sizeof(float), 2097152); dr = (float*)libxsmm_aligned_malloc(K*K*3*sizeof(float), 2097152); db = (float*)libxsmm_aligned_malloc(K*3*sizeof(float), 2097152); dht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dwtest = (float*)libxsmm_aligned_malloc(C*K*3*sizeof(float), 2097152); drtest = (float*)libxsmm_aligned_malloc(K*K*3*sizeof(float), 2097152); 
LIBXSMM_VLA_DECL(2, float, xgold, xgoldt, N * C); LIBXSMM_VLA_DECL(2, float, hgold, hgoldt, N * K); /*LIBXSMM_VLA_DECL(2, float, igold, igoldt, N * K);*/ /*LIBXSMM_VLA_DECL(2, float, cgold, cgoldt, N * K);*/ /*LIBXSMM_VLA_DECL(2, float, fgold, fgoldt, N * K);*/ /*LIBXSMM_VLA_DECL(2, float, ogold, ogoldt, N * K);*/ /*LIBXSMM_VLA_DECL(2, float, dxgold, dxgoldt, N * C);*/ LIBXSMM_VLA_DECL(2, float, dhgold, dhgoldt, N * K); LIBXSMM_VLA_DECL(2, float, h, ht, N * K); /* initialize data */ /* FWD */ for (j = 0; j < t; ++j) { LIBXSMM_MATINIT_OMP(float, 24, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), N, C, N, 1.0); } LIBXSMM_MATINIT_OMP(float, 24, hpgold, N, K, N, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wigold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wcgold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wfgold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rigold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rcgold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rfgold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bigold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bcgold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bfgold, 1, K, 1, 1.0); zero_buf(hgoldt, N*K*t); /* BWD/UPD */ for (j = 0; j < t; ++j) { LIBXSMM_MATINIT_OMP(float, 24, &LIBXSMM_VLA_ACCESS(2, dhgold, j, 0, K * N), N, K, N, 1.0); } zero_buf(dxgoldt, N*C*t); zero_buf(dhpgold, K*N); zero_buf(dwgold, C*K*3); zero_buf(drgold, K*K*3); zero_buf(dbgold, K*3); /* first touch LIBXSMM */ zero_buf(xt, N*C*t); zero_buf(hp, K*N); zero_buf(w, C*K*3); zero_buf(r, K*K*3); zero_buf(b, K*3); zero_buf(ht, N*K*t); zero_buf(it, K*N*t); zero_buf(ct, K*N*t); zero_buf(ft, K*N*t); zero_buf(ot, K*N*t); zero_buf(dxt, N*C*t); zero_buf(dhp, K*N); zero_buf(dw, C*K*3); zero_buf(dr, K*K*3); zero_buf(db, K*3); zero_buf(dht, K*N*t); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... 
#\n"); printf("##########################################\n"); gru_ref_fwd( N, C, K, t, wigold, wcgold, wfgold, rigold, rcgold, rfgold, bigold, bcgold, bfgold, xgoldt, hpgold, hgoldt, igoldt, cgoldt, fgoldt, ogoldt ); gru_ref_bwd_upd( N, C, K, t, xgoldt, hpgold, hgoldt, igoldt, cgoldt, fgoldt, ogoldt, wigold, wcgold, wfgold, rigold, rcgold, rfgold, dhgoldt, dwgold, drgold, dbgold, dxgoldt, dhpgold, scratch_bu ); printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } if (1 /* format == 'A' || format == 'L' */) { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ grucell_desc.threads = nThreads; grucell_desc.N = N; grucell_desc.C = C; grucell_desc.K = K; grucell_desc.max_T = t; grucell_desc.bn = bn; grucell_desc.bc = bc; grucell_desc.bk = bk; grucell_desc.cell_type = LIBXSMM_DNN_RNNCELL_GRU; grucell_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; grucell_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; grucell_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NC; grucell_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CK; libxsmm_handle = libxsmm_dnn_create_rnncell( grucell_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, xt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, hp, &status ); CHKERR_LIBXSMM_DNN( 
status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight = libxsmm_dnn_link_tensor( libxsmm_layout, w, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, r, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias = libxsmm_dnn_link_tensor( libxsmm_layout, b, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, ht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_i = libxsmm_dnn_link_tensor( libxsmm_layout, it, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_c = libxsmm_dnn_link_tensor( libxsmm_layout, ct, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = 
libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_f = libxsmm_dnn_link_tensor( libxsmm_layout, ft, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_o = libxsmm_dnn_link_tensor( libxsmm_layout, ot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = libxsmm_dnn_link_tensor( libxsmm_layout, dxt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, dhp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dweight = libxsmm_dnn_link_tensor( libxsmm_layout, dw, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_drecur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, dr, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS, &status ); 
CHKERR_LIBXSMM_DNN( status ); libxsmm_dbias = libxsmm_dnn_link_tensor( libxsmm_layout, db, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, dht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ matrix_copy(N*C*t, xgoldt, xt); matrix_copy(K*N, hpgold, hp); convert_ck_c3k(C, K, wigold, w); convert_ck_c3k(C, K, wcgold, &(w[K])); convert_ck_c3k(C, K, wfgold, &(w[2*K])); convert_ck_c3k(K, K, rigold, r); convert_ck_c3k(K, K, rcgold, &(r[K])); convert_ck_c3k(K, K, rfgold, &(r[2*K])); matrix_copy(K, bigold, b); matrix_copy(K, bcgold, &(b[K])); matrix_copy(K, bfgold, &(b[2*K])); matrix_copy(K*N*t, dhgoldt, dht); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state_prev, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_i, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_c, LIBXSMM_DNN_RNN_INTERNAL_CI ) 
); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_f, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_o, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state_prev, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dweight, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_drecur_weight, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dbias, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); /* let's allocate and bind scratch */ if (pass == 0) { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch ) ); } else { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); } zero_buf( (float*)scratch, scratch_size/4 ); /* let's allocate and bind internalstate */ if (pass == 0) { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = libxsmm_aligned_malloc( 
internalstate_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, internalstate ) ); } else { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = libxsmm_aligned_malloc( internalstate_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, internalstate ) ); } zero_buf( (float*)internalstate, internalstate_size/4 ); if ((pass == 0) && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, K*N, 1, &LIBXSMM_VLA_ACCESS(2, hgold, t-1, 0, K * N), &LIBXSMM_VLA_ACCESS(2, h, t-1, 0, K * N), 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } else { /* We need to always run FWD pass once to populate i, c, f, o, h */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } } if ( (pass == 1) && 
LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, dxgoldt, dxt, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ( (pass == 2) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } convert_ck_c3k(C, K, &(dwgold[0]), &(dwtest[0])); convert_ck_c3k(C, K, &(dwgold[C*K]), &(dwtest[K])); convert_ck_c3k(C, K, &(dwgold[2*C*K]), &(dwtest[2*K])); convert_ck_c3k(K, K, &(drgold[0]), &(drtest[0])); convert_ck_c3k(K, K, &(drgold[K*K]), &(drtest[K])); convert_ck_c3k(K, K, &(drgold[2*K*K]), &(drtest[2*K])); /* compare */ libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*3, 1, dwtest, dw, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); 
printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*3, 1, drtest, dr, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_r.l1_ref); printf("L1 test : %.25g\n", norms_upd_r.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_r.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_r.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_r.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_r.linf_rel); printf("Check-norm : %.24f\n", norms_upd_r.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_r); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K*3, 1, dbgold, db, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( (pass == 3) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } convert_ck_c3k(C, K, &(dwgold[0]), &(dwtest[0])); convert_ck_c3k(C, K, &(dwgold[C*K]), &(dwtest[K])); 
convert_ck_c3k(C, K, &(dwgold[2*C*K]), &(dwtest[2*K])); convert_ck_c3k(K, K, &(drgold[0]), &(drtest[0])); convert_ck_c3k(K, K, &(drgold[K*K]), &(drtest[K])); convert_ck_c3k(K, K, &(drgold[2*K*K]), &(drtest[2*K])); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, dxgoldt, dxt, 0, 0); printf("Delta input\n"); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*3, 1, dwtest, dw, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*3, 1, drtest, dr, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_r.l1_ref); printf("L1 test : %.25g\n", norms_upd_r.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_r.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_r.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_r.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_r.linf_rel); printf("Check-norm : %.24f\n", norms_upd_r.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_r); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K*3, 1, dbgold, db, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : 
%.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( pass == 0 ) { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (((2.0 * K * N * C) + (2.0 * K * N * K) + (2.0 * K * N) + (tflops * K * N)) * 2.0 + (K * N) + (2.0 * K * N * C) + (2.0 * K * N * K) + (tflops * K * N) + 4.0 * (K * N)) * (double)t * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, bn, bc, bk, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 1 ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, 
LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* d3 = djdh + d23 (delta) */ flops += 2.0 * K * N; /* d4 = (1 - z).d3 */ flops += K * N; /* d5 = d3.h */ flops += K * N; /* d6 = -d5 */ flops += K * N; /* d7 = d3.g */ flops += K * N; /* d8 = d3.z */ flops += K * N; /* d9 = d7 + d8 */ flops += 3.0 * K * N; /* d10 = d8.tanh'(g) */ flops += 3.0 * K * N; /* d11 = d9.sig'(z) */ flops += (2.0 * K * K * N + K * K); /* d13 = Wg^T * d10 (including transpose) */ flops += (2.0 * K * K * N + K * K); /* d15 = Wz^T * d11 (including transpose) */ flops += K * N; /* d16 = d13.z */ flops += K * N; /* d17 = d13.r */ flops += 3.0 * K * N; /* d18 = d16.sig'(r) */ flops += K * N; /* d19 = d17 + d4 */ flops += (2.0 * K * K * N + K * K); /* d21 = Wr^T * d18 (including transpose) */ flops += K * N; /* d22 = d21 + d15 */ flops += K * N; /* d23 = d19 + d22 */ flops += (2.0 * K * C * N + K * C); /* d12 = Ug^T * d10 (including transpose) */ flops += (2.0 * K * C * N + K * C); /* d14 = Uz^T * d11 (including transpose) */ flops += (2.0 * K * C * N + K * C); /* d20 = Ur^T * d18 (including transpose) */ flops += 2.0 * K * N; /* djdx = d12 + d14 + d20 */ flops *= t; /* for t time steps */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, bn, bc, bk, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 2 ) { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 
0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* d3 = djdh + d23 (delta) */ flops += 2.0 * K * N; /* d4 = (1 - z).d3 */ flops += K * N; /* d5 = d3.h */ flops += K * N; /* d6 = -d5 */ flops += K * N; /* d7 = d3.g */ flops += K * N; /* d8 = d3.z */ flops += K * N; /* d9 = d7 + d8 */ flops += 3.0 * K * N; /* d10 = d8.tanh'(g) */ flops += 3.0 * K * N; /* d11 = d9.sig'(z) */ flops += (2.0 * K * K * N + K * K); /* d13 = Wg^T * d10 (including transpose) */ flops += (2.0 * K * K * N + K * K); /* d15 = Wz^T * d11 (including transpose) */ flops += K * N; /* d16 = d13.z */ flops += K * N; /* d17 = d13.r */ flops += 3.0 * K * N; /* d18 = d16.sig'(r) */ flops += K * N; /* d19 = d17 + d4 */ flops += (2.0 * K * K * N + K * K); /* d21 = Wr^T * d18 (including transpose) */ flops += K * N; /* d22 = d21 + d15 */ flops += K * N; /* d23 = d19 + d22 */ flops += (2.0 * K * N * K + K * N + K * K); /* djdwr = djdwr + d18 * h^T */ flops += (2.0 * K * N * K + K * N + K * K); /* djdwz = djdwz + d11 * h^T */ flops += (2.0 * K * N * K + 2.0 * K * N + K * K); /* djdwg = djdwg + d10 * (h.r)^T */ flops += (2.0 * K * N * C + C * N + K * C); /* djdur = djdur + d18 * x^T */ flops += (2.0 * K * N * C + C * N + K * C); /* djduz = djduz + d11 * x^T */ flops += (2.0 * K * N * C + C * N + K * C); /* djdug = djdug + d10 * x^T */ flops += K * N; /* djdbr = djdbr + d18 */ flops += K * N; /* djdbz = djdbz + d11 */ flops += K * N; /* djdbg = djdbg + d10 */ flops *= t; /* for t time steps */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,WU,%s,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, bn, bc, bk, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 
3 ) { printf("##########################################\n"); printf("# Performance - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* d3 = djdh + d23 (delta) */ flops += 2.0 * K * N; /* d4 = (1 - z).d3 */ flops += K * N; /* d5 = d3.h */ flops += K * N; /* d6 = -d5 */ flops += K * N; /* d7 = d3.g */ flops += K * N; /* d8 = d3.z */ flops += K * N; /* d9 = d7 + d8 */ flops += 3.0 * K * N; /* d10 = d8.tanh'(g) */ flops += 3.0 * K * N; /* d11 = d9.sig'(z) */ flops += (2.0 * K * K * N + K * K); /* d13 = Wg^T * d10 (including transpose) */ flops += (2.0 * K * K * N + K * K); /* d15 = Wz^T * d11 (including transpose) */ flops += K * N; /* d16 = d13.z */ flops += K * N; /* d17 = d13.r */ flops += 3.0 * K * N; /* d18 = d16.sig'(r) */ flops += K * N; /* d19 = d17 + d4 */ flops += (2.0 * K * K * N + K * K); /* d21 = Wr^T * d18 (including transpose) */ flops += K * N; /* d22 = d21 + d15 */ flops += K * N; /* d23 = d19 + d22 */ flops += (2.0 * K * C * N + K * C); /* d12 = Ug^T * d10 (including transpose) */ flops += (2.0 * K * C * N + K * C); /* d14 = Uz^T * d11 (including transpose) */ flops += (2.0 * K * C * N + K * C); /* d20 = Ur^T * d18 (including transpose) */ flops += 2.0 * K * N; /* djdx = d12 + d14 + d20 */ flops += (2.0 * K * N * K + K * N + K * K); /* djdwr = djdwr + d18 * h^T */ flops += (2.0 * K * N * K + K * N + K * K); /* djdwz = djdwz + d11 * h^T */ flops += (2.0 * K * N * K + 2.0 * K * N + K * K); /* djdwg = djdwg + d10 * (h.r)^T */ flops += (2.0 * K * N * C + C * N + K * C); /* djdur 
= djdur + d18 * x^T */ flops += (2.0 * K * N * C + C * N + K * C); /* djduz = djduz + d11 * x^T */ flops += (2.0 * K * N * C + C * N + K * C); /* djdug = djdug + d10 * x^T */ flops += K * N; /* djdbr = djdbr + d18 */ flops += K * N; /* djdbz = djdbz + d11 */ flops += K * N; /* djdbg = djdbg + d10 */ flops *= t; /* for t time steps */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP+WU,%s,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, bn, bc, bk, ((double)(l_total/iters)), (flops*1e-9)/l_total); } /* clean-up */ if (pass == 0) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); } else { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); } libxsmm_free(scratch); libxsmm_free(internalstate); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I ) ); 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_recur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_i ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_c ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_f ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_o ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dweight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_drecur_weight ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_destroy_tensor( libxsmm_dbias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_rnncell( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(xgoldt); libxsmm_free(hpgold); libxsmm_free(wigold); libxsmm_free(wcgold); libxsmm_free(wfgold); libxsmm_free(rigold); libxsmm_free(rcgold); libxsmm_free(rfgold); libxsmm_free(bigold); libxsmm_free(bcgold); libxsmm_free(bfgold); libxsmm_free(hgoldt); libxsmm_free(igoldt); libxsmm_free(cgoldt); libxsmm_free(fgoldt); libxsmm_free(ogoldt); libxsmm_free(dxgoldt); libxsmm_free(dhpgold); libxsmm_free(dwgold); libxsmm_free(drgold); libxsmm_free(dbgold); libxsmm_free(dhgoldt); libxsmm_free(xt); libxsmm_free(hp); libxsmm_free(w); libxsmm_free(r); libxsmm_free(b); libxsmm_free(ht); libxsmm_free(it); libxsmm_free(ct); libxsmm_free(ft); libxsmm_free(ot); libxsmm_free(dxt); libxsmm_free(dhp); libxsmm_free(dw); libxsmm_free(dr); libxsmm_free(db); libxsmm_free(dht); libxsmm_free(dwtest); libxsmm_free(drtest); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/grudriver/grudriver_nc_ck.sh000077500000000000000000000070451415223013700247400ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.), Kunal Banerjee (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi 
ITERS=100 CHKVAL=1 export OMP_NUM_THREADS=28 export KMP_AFFINITY=granularity=fine,compact,1,0 echo "GRU FWD" CHECK=${CHKVAL} ./grudriver_nc_ck ${ITERS} 0 168 256 256 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_ck ${ITERS} 0 168 512 512 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_ck ${ITERS} 0 168 1024 1024 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_ck ${ITERS} 0 168 2048 2048 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_ck ${ITERS} 0 168 4096 4096 50 24 64 64 wait echo "GRU BWD+UPD" CHECK=${CHKVAL} ./grudriver_nc_ck ${ITERS} 3 168 256 256 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_ck ${ITERS} 3 168 512 512 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_ck ${ITERS} 3 168 1024 1024 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_ck ${ITERS} 3 168 2048 2048 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_ck ${ITERS} 3 168 4096 4096 50 24 64 64 wait echo "GRU performance done" echo "" libxsmm-1.17/samples/deeplearning/grudriver/grudriver_nc_ck.vcxproj000066400000000000000000000551041415223013700260150ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 grudriver {1066873D-98A8-4133-9386-5D1BC113290E} 10.0 Application Disabled Disabled Sequential v142 Application true true Disabled Disabled Sequential v142 Application true true Disabled Disabled Sequential v142 Application Disabled Disabled Sequential v142 true Application true Disabled Disabled Sequential v142 true Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/grudriver/grudriver_nc_kcck.c000066400000000000000000001327751415223013700250740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.) 
******************************************************************************/ #include #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *wigold, *wcgold, *wfgold, *rigold, *rcgold, *rfgold, *bigold, *bcgold, *bfgold; float *xgoldt, *hpgold, *hgoldt; float *dwgold, *drgold, *dbgold; float *dxgoldt, *dhpgold, *dhgoldt; float *igoldt, *cgoldt, *fgoldt, *ogoldt; float *xt, *hp, *w, *r, *b, *ht; float *it, *ct, *ft, *ot; float *dxt, *dhp, *dw, *dr, *db, *dht; float *scratch_bu, *dwtest, *drtest, *w_tmp, *r_tmp; void *scratch, *internalstate; size_t scratch_size = 0, internalstate_size = 0; int iters = 10; /* repetitions of benchmark */ int pass = 0; /* pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD */ int N = 168; /* size of mini-batch */ int C = 512; /* number of inputs */ int K = 256; /* number of outputs */ int t = 50; /* number of time steps (>= 1) */ int bn = 24; int bc = 64; int bk = 64; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 
0/*disabled by default*/ : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double flops = 0.0; const double tflops = 12; /* transcendental flops */ int j; libxsmm_dnn_rnncell_desc grucell_desc; libxsmm_dnn_rnncell* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_hidden_state_prev; libxsmm_dnn_tensor* libxsmm_weight; libxsmm_dnn_tensor* libxsmm_recur_weight; libxsmm_dnn_tensor* libxsmm_bias; libxsmm_dnn_tensor* libxsmm_hidden_state; libxsmm_dnn_tensor* libxsmm_i; libxsmm_dnn_tensor* libxsmm_c; libxsmm_dnn_tensor* libxsmm_f; libxsmm_dnn_tensor* libxsmm_o; libxsmm_dnn_tensor* libxsmm_dinput; libxsmm_dnn_tensor* libxsmm_dhidden_state_prev; libxsmm_dnn_tensor* libxsmm_dweight; libxsmm_dnn_tensor* libxsmm_drecur_weight; libxsmm_dnn_tensor* libxsmm_dbias; libxsmm_dnn_tensor* libxsmm_dhidden_state; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd_w, norms_upd_r, norms_upd_b, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd_w); libxsmm_matdiff_clear(&norms_upd_r); libxsmm_matdiff_clear(&norms_upd_b); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: ./grudriver [reps] [pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD] [N] [C] [K] [time_steps > 0]\n\n"); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ j = 1; if (argc > j) iters = atoi(argv[j++]); if (argc > j) pass = atoi(argv[j++]); if (argc > j) N = atoi(argv[j++]); if (argc > j) C = atoi(argv[j++]); if (argc > j) K = atoi(argv[j++]); if (argc > j) t = atoi(argv[j++]); if (argc > j) bn = atoi(argv[j++]); if (argc > j) bc = atoi(argv[j++]); if (argc > j) bk = atoi(argv[j++]); if (t <= 
0) { printf("time_steps %d should be greater than or equal to 1\n\n", t); return 0; } if (!(pass == 0 || pass == 1 || pass == 2 || pass == 3)) { printf("Unknown pass: %d, valid arguments for pass = {0(FWD), 1(BWD), 2(UPD), 3(BWD+UPD)\n\n", pass); return 0; } #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d C:%d K:%d T:%d\n", N, C, K, t); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf("SIZE Weight (MB): %10.2f MiB\n", (double)(C*K*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (MB): %10.2f MiB\n", (double)(N*C*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Hidden State: %10.2f MiB\n", (double)(K*N*sizeof(float))/(1024.0*1024.0) ); /* allocate data */ xgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); hpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wigold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wcgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wfgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); rigold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rcgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rfgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); bigold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bcgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bfgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); hgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); igoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); cgoldt = 
(float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); fgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ogoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dxgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); dhpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dwgold = (float*)libxsmm_aligned_malloc(C*K*3*sizeof(float), 2097152); drgold = (float*)libxsmm_aligned_malloc(K*K*3*sizeof(float), 2097152); dbgold = (float*)libxsmm_aligned_malloc(K*3*sizeof(float), 2097152); dhgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); scratch_bu = (float*)libxsmm_aligned_malloc(K*N*6*sizeof(float), 2097152); xt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); hp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); w = (float*)libxsmm_aligned_malloc(C*K*3*sizeof(float), 2097152); r = (float*)libxsmm_aligned_malloc(K*K*3*sizeof(float), 2097152); b = (float*)libxsmm_aligned_malloc(K*3*sizeof(float), 2097152); ht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); it = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ct = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ft = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ot = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dxt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); dhp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dw = (float*)libxsmm_aligned_malloc(C*K*3*sizeof(float), 2097152); dr = (float*)libxsmm_aligned_malloc(K*K*3*sizeof(float), 2097152); db = (float*)libxsmm_aligned_malloc(K*3*sizeof(float), 2097152); dht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dwtest = (float*)libxsmm_aligned_malloc(C*K*3*sizeof(float), 2097152); drtest = (float*)libxsmm_aligned_malloc(K*K*3*sizeof(float), 2097152); w_tmp = (float*)libxsmm_aligned_malloc(C*K*3*sizeof(float), 2097152); r_tmp = 
(float*)libxsmm_aligned_malloc(K*K*3*sizeof(float), 2097152); LIBXSMM_VLA_DECL(2, float, xgold, xgoldt, N * C); LIBXSMM_VLA_DECL(2, float, hgold, hgoldt, N * K); /*LIBXSMM_VLA_DECL(2, float, igold, igoldt, N * K);*/ /*LIBXSMM_VLA_DECL(2, float, cgold, cgoldt, N * K);*/ /*LIBXSMM_VLA_DECL(2, float, fgold, fgoldt, N * K);*/ /*LIBXSMM_VLA_DECL(2, float, ogold, ogoldt, N * K);*/ /*LIBXSMM_VLA_DECL(2, float, dxgold, dxgoldt, N * C);*/ LIBXSMM_VLA_DECL(2, float, dhgold, dhgoldt, N * K); LIBXSMM_VLA_DECL(2, float, h, ht, N * K); /* initialize data */ /* FWD */ for (j = 0; j < t; ++j) { LIBXSMM_MATINIT_OMP(float, 24, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), N, C, N, 1.0); } LIBXSMM_MATINIT_OMP(float, 24, hpgold, N, K, N, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wigold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wcgold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wfgold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rigold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rcgold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rfgold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bigold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bcgold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bfgold, 1, K, 1, 1.0); zero_buf(hgoldt, N*K*t); /* BWD/UPD */ for (j = 0; j < t; ++j) { LIBXSMM_MATINIT_OMP(float, 24, &LIBXSMM_VLA_ACCESS(2, dhgold, j, 0, K * N), N, K, N, 1.0); } zero_buf(dxgoldt, N*C*t); zero_buf(dhpgold, K*N); zero_buf(dwgold, C*K*3); zero_buf(drgold, K*K*3); zero_buf(dbgold, K*3); /* first touch LIBXSMM */ zero_buf(xt, N*C*t); zero_buf(hp, K*N); zero_buf(w, C*K*3); zero_buf(r, K*K*3); zero_buf(b, K*3); zero_buf(ht, N*K*t); zero_buf(it, K*N*t); zero_buf(ct, K*N*t); zero_buf(ft, K*N*t); zero_buf(ot, K*N*t); zero_buf(dxt, N*C*t); zero_buf(dhp, K*N); zero_buf(dw, C*K*3); zero_buf(dr, K*K*3); zero_buf(db, K*3); zero_buf(dht, K*N*t); zero_buf(w_tmp, C*K*3); zero_buf(r_tmp, K*K*3); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# 
Computing Reference ... #\n"); printf("##########################################\n"); gru_ref_fwd( N, C, K, t, wigold, wcgold, wfgold, rigold, rcgold, rfgold, bigold, bcgold, bfgold, xgoldt, hpgold, hgoldt, igoldt, cgoldt, fgoldt, ogoldt ); gru_ref_bwd_upd( N, C, K, t, xgoldt, hpgold, hgoldt, igoldt, cgoldt, fgoldt, ogoldt, wigold, wcgold, wfgold, rigold, rcgold, rfgold, dhgoldt, dwgold, drgold, dbgold, dxgoldt, dhpgold, scratch_bu ); printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } if (1 /* format == 'A' || format == 'L' */) { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ grucell_desc.threads = nThreads; grucell_desc.N = N; grucell_desc.C = C; grucell_desc.K = K; grucell_desc.max_T = t; grucell_desc.bn = bn; grucell_desc.bc = bc; grucell_desc.bk = bk; grucell_desc.cell_type = LIBXSMM_DNN_RNNCELL_GRU; grucell_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; grucell_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; grucell_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NC; grucell_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED; libxsmm_handle = libxsmm_dnn_create_rnncell( grucell_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, xt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, hp, 
&status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight = libxsmm_dnn_link_tensor( libxsmm_layout, w, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, r, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias = libxsmm_dnn_link_tensor( libxsmm_layout, b, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, ht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_i = libxsmm_dnn_link_tensor( libxsmm_layout, it, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_c = libxsmm_dnn_link_tensor( libxsmm_layout, ct, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = 
libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_f = libxsmm_dnn_link_tensor( libxsmm_layout, ft, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_o = libxsmm_dnn_link_tensor( libxsmm_layout, ot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = libxsmm_dnn_link_tensor( libxsmm_layout, dxt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, dhp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dweight = libxsmm_dnn_link_tensor( libxsmm_layout, dw, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_drecur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, dr, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS, &status ); 
CHKERR_LIBXSMM_DNN( status ); libxsmm_dbias = libxsmm_dnn_link_tensor( libxsmm_layout, db, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, dht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ matrix_copy(N*C*t, xgoldt, xt); matrix_copy(K*N, hpgold, hp); convert_ck_c3k(C, K, wigold, w_tmp); convert_ck_c3k(C, K, wcgold, &(w_tmp[K])); convert_ck_c3k(C, K, wfgold, &(w_tmp[2*K])); convert_ck_c3k(K, K, rigold, r_tmp); convert_ck_c3k(K, K, rcgold, &(r_tmp[K])); convert_ck_c3k(K, K, rfgold, &(r_tmp[2*K])); matrix_copy_CK_to_KCCK(w_tmp, w, C, 3*K, bc, bk); matrix_copy_CK_to_KCCK(r_tmp, r, K, 3*K, bk, bk); matrix_copy(K, bigold, b); matrix_copy(K, bcgold, &(b[K])); matrix_copy(K, bfgold, &(b[2*K])); matrix_copy(K*N*t, dhgoldt, dht); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state_prev, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_i, 
LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_c, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_f, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_o, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state_prev, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dweight, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_drecur_weight, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dbias, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); /* let's allocate and bind scratch */ if (pass == 0) { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch ) ); } else { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); } zero_buf( (float*)scratch, scratch_size/4 ); /* let's allocate and bind internalstate */ if (pass == 0) { internalstate_size = 
libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = libxsmm_aligned_malloc( internalstate_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, internalstate ) ); } else { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = libxsmm_aligned_malloc( internalstate_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, internalstate ) ); } zero_buf( (float*)internalstate, internalstate_size/4 ); if ((pass == 0) && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, K*N, 1, &LIBXSMM_VLA_ACCESS(2, hgold, t-1, 0, K * N), &LIBXSMM_VLA_ACCESS(2, h, t-1, 0, K * N), 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } else { /* We need to always run FWD pass once to populate i, c, f, o, h */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else 
const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } } if ( (pass == 1) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, dxgoldt, dxt, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ( (pass == 2) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } matrix_copy_KCCK_to_CK(dw, w_tmp, C, 3*K, bc, bk); matrix_copy_KCCK_to_CK(dr, r_tmp, K, 3*K, bk, bk); /* convert_c3k_3ck(C, K, w_tmp, dwtest); convert_c3k_3ck(K, K, r_tmp, drtest); */ convert_ck_c3k(C, K, &(dwgold[0]), &(dwtest[0])); convert_ck_c3k(C, K, &(dwgold[C*K]), &(dwtest[K])); convert_ck_c3k(C, K, &(dwgold[2*C*K]), &(dwtest[2*K])); convert_ck_c3k(K, K, &(drgold[0]), 
&(drtest[0])); convert_ck_c3k(K, K, &(drgold[K*K]), &(drtest[K])); convert_ck_c3k(K, K, &(drgold[2*K*K]), &(drtest[2*K])); /* compare */ /*libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*3, 1, dwtest, dw, 0, 0);*/ libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*3, 1, dwtest, w_tmp, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); /*libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*3, 1, drtest, dr, 0, 0);*/ libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*3, 1, drtest, r_tmp, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_r.l1_ref); printf("L1 test : %.25g\n", norms_upd_r.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_r.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_r.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_r.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_r.linf_rel); printf("Check-norm : %.24f\n", norms_upd_r.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_r); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K*3, 1, dbgold, db, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( (pass == 3) && LIBXSMM_NEQ(0, check) ) { 
printf("##########################################\n"); printf("# Correctness - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } matrix_copy_KCCK_to_CK(dw, w_tmp, C, 3*K, bc, bk); matrix_copy_KCCK_to_CK(dr, r_tmp, K, 3*K, bk, bk); convert_ck_c3k(C, K, &(dwgold[0]), &(dwtest[0])); convert_ck_c3k(C, K, &(dwgold[C*K]), &(dwtest[K])); convert_ck_c3k(C, K, &(dwgold[2*C*K]), &(dwtest[2*K])); convert_ck_c3k(K, K, &(drgold[0]), &(drtest[0])); convert_ck_c3k(K, K, &(drgold[K*K]), &(drtest[K])); convert_ck_c3k(K, K, &(drgold[2*K*K]), &(drtest[2*K])); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, dxgoldt, dxt, 0, 0); printf("Delta input\n"); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); /*libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*3, 1, dwtest, dw, 0, 0);*/ libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*3, 1, dwtest, w_tmp, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, 
&norms_upd_w); /*libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*3, 1, drtest, dr, 0, 0);*/ libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*3, 1, drtest, r_tmp, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_r.l1_ref); printf("L1 test : %.25g\n", norms_upd_r.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_r.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_r.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_r.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_r.linf_rel); printf("Check-norm : %.24f\n", norms_upd_r.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_r); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K*3, 1, dbgold, db, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( pass == 0 ) { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (((2.0 * K * N * C) + (2.0 * K * N * K) + (2.0 * K * N) + (tflops * K * N)) * 2.0 + (K * N) + (2.0 * K * N * C) + (2.0 * K * N * K) + (tflops * K * N) + 4.0 * (K * N)) * (double)t * (double)iters; 
printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, bn, bc, bk, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 1 ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* d3 = djdh + d23 (delta) */ flops += 2.0 * K * N; /* d4 = (1 - z).d3 */ flops += K * N; /* d5 = d3.h */ flops += K * N; /* d6 = -d5 */ flops += K * N; /* d7 = d3.g */ flops += K * N; /* d8 = d3.z */ flops += K * N; /* d9 = d7 + d8 */ flops += 3.0 * K * N; /* d10 = d8.tanh'(g) */ flops += 3.0 * K * N; /* d11 = d9.sig'(z) */ flops += (2.0 * K * K * N + K * K); /* d13 = Wg^T * d10 (including transpose) */ flops += (2.0 * K * K * N + K * K); /* d15 = Wz^T * d11 (including transpose) */ flops += K * N; /* d16 = d13.z */ flops += K * N; /* d17 = d13.r */ flops += 3.0 * K * N; /* d18 = d16.sig'(r) */ flops += K * N; /* d19 = d17 + d4 */ flops += (2.0 * K * K * N + K * K); /* d21 = Wr^T * d18 (including transpose) */ flops += K * N; /* d22 = d21 + d15 */ flops += K * N; /* d23 = d19 + d22 */ flops += (2.0 * K * C * N + K * C); /* d12 = Ug^T * d10 (including transpose) */ flops += (2.0 * K * C * N + K * C); /* d14 = Uz^T * d11 (including transpose) */ flops += (2.0 * K * C * N + K * C); /* d20 = Ur^T * d18 (including transpose) */ flops += 2.0 * 
K * N; /* djdx = d12 + d14 + d20 */ flops *= t; /* for t time steps */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, bn, bc, bk, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 2 ) { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* d3 = djdh + d23 (delta) */ flops += 2.0 * K * N; /* d4 = (1 - z).d3 */ flops += K * N; /* d5 = d3.h */ flops += K * N; /* d6 = -d5 */ flops += K * N; /* d7 = d3.g */ flops += K * N; /* d8 = d3.z */ flops += K * N; /* d9 = d7 + d8 */ flops += 3.0 * K * N; /* d10 = d8.tanh'(g) */ flops += 3.0 * K * N; /* d11 = d9.sig'(z) */ flops += (2.0 * K * K * N + K * K); /* d13 = Wg^T * d10 (including transpose) */ flops += (2.0 * K * K * N + K * K); /* d15 = Wz^T * d11 (including transpose) */ flops += K * N; /* d16 = d13.z */ flops += K * N; /* d17 = d13.r */ flops += 3.0 * K * N; /* d18 = d16.sig'(r) */ flops += K * N; /* d19 = d17 + d4 */ flops += (2.0 * K * K * N + K * K); /* d21 = Wr^T * d18 (including transpose) */ flops += K * N; /* d22 = d21 + d15 */ flops += K * N; /* d23 = d19 + d22 */ flops += (2.0 * K * N * K + K * N + K * K); /* djdwr = djdwr + d18 * h^T */ flops += (2.0 * K * N * K + K * N + K * K); /* djdwz = djdwz + d11 * h^T */ flops += (2.0 * K * 
N * K + 2.0 * K * N + K * K); /* djdwg = djdwg + d10 * (h.r)^T */ flops += (2.0 * K * N * C + C * N + K * C); /* djdur = djdur + d18 * x^T */ flops += (2.0 * K * N * C + C * N + K * C); /* djduz = djduz + d11 * x^T */ flops += (2.0 * K * N * C + C * N + K * C); /* djdug = djdug + d10 * x^T */ flops += K * N; /* djdbr = djdbr + d18 */ flops += K * N; /* djdbz = djdbz + d11 */ flops += K * N; /* djdbg = djdbg + d10 */ flops *= t; /* for t time steps */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,WU,%s,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, bn, bc, bk, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 3 ) { printf("##########################################\n"); printf("# Performance - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM GRU for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* d3 = djdh + d23 (delta) */ flops += 2.0 * K * N; /* d4 = (1 - z).d3 */ flops += K * N; /* d5 = d3.h */ flops += K * N; /* d6 = -d5 */ flops += K * N; /* d7 = d3.g */ flops += K * N; /* d8 = d3.z */ flops += K * N; /* d9 = d7 + d8 */ flops += 3.0 * K * N; /* d10 = d8.tanh'(g) */ flops += 3.0 * K * N; /* d11 = d9.sig'(z) */ flops += (2.0 * K * K * N + K * K); /* d13 = Wg^T * d10 (including transpose) */ flops += (2.0 * K * K * N + K * K); /* d15 = Wz^T * d11 (including transpose) */ flops += K * N; /* d16 = d13.z */ flops += K * N; /* d17 = d13.r */ flops += 3.0 * K * 
N; /* d18 = d16.sig'(r) */ flops += K * N; /* d19 = d17 + d4 */ flops += (2.0 * K * K * N + K * K); /* d21 = Wr^T * d18 (including transpose) */ flops += K * N; /* d22 = d21 + d15 */ flops += K * N; /* d23 = d19 + d22 */ flops += (2.0 * K * C * N + K * C); /* d12 = Ug^T * d10 (including transpose) */ flops += (2.0 * K * C * N + K * C); /* d14 = Uz^T * d11 (including transpose) */ flops += (2.0 * K * C * N + K * C); /* d20 = Ur^T * d18 (including transpose) */ flops += 2.0 * K * N; /* djdx = d12 + d14 + d20 */ flops += (2.0 * K * N * K + K * N + K * K); /* djdwr = djdwr + d18 * h^T */ flops += (2.0 * K * N * K + K * N + K * K); /* djdwz = djdwz + d11 * h^T */ flops += (2.0 * K * N * K + 2.0 * K * N + K * K); /* djdwg = djdwg + d10 * (h.r)^T */ flops += (2.0 * K * N * C + C * N + K * C); /* djdur = djdur + d18 * x^T */ flops += (2.0 * K * N * C + C * N + K * C); /* djduz = djduz + d11 * x^T */ flops += (2.0 * K * N * C + C * N + K * C); /* djdug = djdug + d10 * x^T */ flops += K * N; /* djdbr = djdbr + d18 */ flops += K * N; /* djdbz = djdbz + d11 */ flops += K * N; /* djdbg = djdbg + d10 */ flops *= t; /* for t time steps */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP+WU,%s,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, bn, bc, bk, ((double)(l_total/iters)), (flops*1e-9)/l_total); } /* clean-up */ if (pass == 0) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); } else { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); } libxsmm_free(scratch); 
libxsmm_free(internalstate); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_weight ) 
); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_recur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_i ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_c ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_f ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_o ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dweight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_drecur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dbias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_rnncell( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(xgoldt); libxsmm_free(hpgold); libxsmm_free(wigold); libxsmm_free(wcgold); libxsmm_free(wfgold); libxsmm_free(rigold); libxsmm_free(rcgold); libxsmm_free(rfgold); libxsmm_free(bigold); libxsmm_free(bcgold); libxsmm_free(bfgold); libxsmm_free(hgoldt); libxsmm_free(igoldt); libxsmm_free(cgoldt); libxsmm_free(fgoldt); libxsmm_free(ogoldt); libxsmm_free(dxgoldt); libxsmm_free(dhpgold); libxsmm_free(dwgold); libxsmm_free(drgold); libxsmm_free(dbgold); libxsmm_free(dhgoldt); libxsmm_free(xt); libxsmm_free(hp); libxsmm_free(w); libxsmm_free(r); libxsmm_free(b); libxsmm_free(ht); libxsmm_free(it); libxsmm_free(ct); libxsmm_free(ft); libxsmm_free(ot); libxsmm_free(dxt); libxsmm_free(dhp); libxsmm_free(dw); libxsmm_free(dr); libxsmm_free(db); libxsmm_free(dht); libxsmm_free(dwtest); libxsmm_free(drtest); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 
1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/grudriver/grudriver_nc_kcck.sh000077500000000000000000000070711415223013700252550ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.), Kunal Banerjee (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi ITERS=100 CHKVAL=1 export OMP_NUM_THREADS=28 export KMP_AFFINITY=granularity=fine,compact,1,0 echo "GRU FWD" CHECK=${CHKVAL} ./grudriver_nc_kcck ${ITERS} 0 168 256 256 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_kcck ${ITERS} 0 168 512 512 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_kcck ${ITERS} 0 168 1024 1024 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_kcck ${ITERS} 0 168 2048 2048 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_kcck ${ITERS} 0 168 4096 4096 50 24 64 64 wait echo "GRU BWD+UPD" CHECK=${CHKVAL} ./grudriver_nc_kcck ${ITERS} 3 168 256 256 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_kcck ${ITERS} 3 168 512 512 50 24 64 
64 wait CHECK=${CHKVAL} ./grudriver_nc_kcck ${ITERS} 3 168 1024 1024 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_kcck ${ITERS} 3 168 2048 2048 50 24 64 64 wait CHECK=${CHKVAL} ./grudriver_nc_kcck ${ITERS} 3 168 4096 4096 50 24 64 64 wait echo "GRU performance done" echo "" libxsmm-1.17/samples/deeplearning/gxm/000077500000000000000000000000001415223013700200075ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/gxm/Makefile000066400000000000000000000167011415223013700214540ustar00rootroot00000000000000############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Sasikanth Avancha, Dhiraj Kalamkar, Alexander Heinecke (Intel Corp.) ############################################################################### PROJECT := gxm CONFIG_FILE := Makefile.config include $(CONFIG_FILE) BUILD_DIR_LINK := $(BUILD_DIR) ifeq ($(RELEASE_BUILD_DIR),) RELEASE_BUILD_DIR := .$(BUILD_DIR)_release endif ifeq ($(DEBUG_BUILD_DIR),) DEBUG_BUILD_DIR := .$(BUILD_DIR)_debug endif DEBUG ?= 0 ifeq ($(DEBUG), 1) BUILD_DIR := $(DEBUG_BUILD_DIR) OTHER_BUILD_DIR := $(RELEASE_BUILD_DIR) else BUILD_DIR := $(RELEASE_BUILD_DIR) OTHER_BUILD_DIR := $(DEBUG_BUILD_DIR) endif # All of the directories containing code. SRC_DIRS := $(shell find * -type d -exec bash -c "find {} -maxdepth 1 \ \( -name '*.cpp' -o -name '*.proto' \) | grep -q ." \; -print) BINARY_NAME := $(PROJECT) BIN_BUILD_DIR := $(BUILD_DIR)/bin FULL_BIN_NAME := $(BIN_BUILD_DIR)/$(BINARY_NAME) # Source files # CXX_SRCS are the source files excluding the test ones. CXX_SRCS := $(shell find src/ ! 
-name "test_*.cpp" -name "*.cpp" | sort) #$(info CXX_SRCS = $(CXX_SRCS)) # BUILD_INCLUDE_DIR contains any generated header files we want to include. BUILD_INCLUDE_DIR := $(BUILD_DIR) # PROTO_SRCS are the protocol buffer definitions PROTO_SRC_DIR := proto PROTO_SRCS := $(wildcard $(PROTO_SRC_DIR)/*.proto) # PROTO_BUILD_DIR will contain the .cc and obj files generated from # PROTO_SRCS; PROTO_BUILD_INCLUDE_DIR will contain the .h header files PROTO_BUILD_DIR := $(BUILD_DIR)/$(PROTO_SRC_DIR) PROTO_BUILD_INCLUDE_DIR := $(BUILD_DIR)/proto NONGEN_CXX_SRCS := $(shell find \ src/ \ include/ \ -name "*.cpp" -or -name "*.hpp") # Generated files # The generated files for protocol buffers PROTO_GEN_HEADER_SRCS := $(addprefix $(PROTO_BUILD_DIR)/, \ $(call qndir,${PROTO_SRCS:.proto=.pb.h})) PROTO_GEN_HEADER := $(addprefix $(PROTO_BUILD_INCLUDE_DIR)/, \ $(call qndir,${PROTO_SRCS:.proto=.pb.h})) PROTO_GEN_CC := $(addprefix $(BUILD_DIR)/, ${PROTO_SRCS:.proto=.pb.cc}) # Source file objects CXX_OBJS := $(addprefix $(BUILD_DIR)/, ${CXX_SRCS:.cpp=.o}) PROTO_OBJS := ${PROTO_GEN_CC:.cc=.o} OBJS := $(PROTO_OBJS) $(CXX_OBJS) # Output files for automatic dependency generation DEPS := ${CXX_OBJS:.o=.d} # Compiler warning locations WARNS_EXT := warnings.txt CXX_WARNS := $(addprefix $(BUILD_DIR)/, ${CXX_SRCS:.cpp=.o.$(WARNS_EXT)}) ALL_WARNS := $(ALL_CXX_WARNS) EMPTY_WARN_REPORT := $(BUILD_DIR)/.$(WARNS_EXT) NONEMPTY_WARN_REPORT := $(BUILD_DIR)/$(WARNS_EXT) # include and lib directories INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include LIBRARIES += glog gflags protobuf m USE_OPENCV ?= 1 ifeq ($(USE_OPENCV), 1) LIBRARIES += opencv_core opencv_highgui opencv_imgproc ifeq ($(OPENCV_VERSION), 3) LIBRARIES += opencv_imgcodecs endif endif WARNINGS := -Wall -Wno-sign-compare # build directories LIB_BUILD_DIR := ./lib ALL_BUILD_DIRS := $(sort $(BUILD_DIR) $(addprefix $(BUILD_DIR)/, $(SRC_DIRS)) \ $(LIB_BUILD_DIR) $(BIN_BUILD_DIR) $(PROTO_BUILD_INCLUDE_DIR)) # build # Determine platform UNAME := 
$(shell uname -s) ifeq ($(UNAME), Linux) LINUX := 1 endif # Linux ifeq ($(LINUX), 1) LIBRARIES += stdc++ endif # Custom compiler ifdef CUSTOM_CXX CXX := $(CUSTOM_CXX) endif # Architecture ifeq ($(ARCH), avx2) COMMON_FLAGS += -xCORE-AVX2 endif ifeq ($(ARCH), avx512_common) COMMON_FLAGS += -xCOMMON-AVX512 #CXXFLAGS += -DUSE_NTS_SPLIT -DUSE_NTS_BN -DUSE_BLOCKING_BN endif ifeq ($(ARCH), avx512_mic) COMMON_FLAGS += -xMIC-AVX512 #CXXFLAGS += -DUSE_NTS_SPLIT -DUSE_NTS_BN endif # Debugging ifeq ($(DEBUG), 1) COMMON_FLAGS += -g -O0 else COMMON_FLAGS += -DNDEBUG -O2 -ip -ipo #-qopt-report-phase=vec -qopt-report=2 endif # configure IO libraries ifeq ($(USE_OPENCV), 1) COMMON_FLAGS += -DUSE_OPENCV endif # CPU-only configuration ifeq ($(CPU_ONLY), 1) OBJS := $(PROTO_OBJS) $(CXX_OBJS) ALL_WARNS := $(ALL_CXX_WARNS) COMMON_FLAGS += -DCPU_ONLY endif ifeq ($(OPENMP), 1) COMMON_FLAGS += -qopenmp endif # BLAS configuration (default = mkl) ifeq ($(BLAS), openblas) # OpenBLAS LIBRARIES += openblas BLAS_INCLUDE = $(OPENBLAS)/include BLAS_LIB = $(OPENBLAS)/lib INCLUDE_DIRS += $(BLAS_INCLUDE) LIBRARY_DIRS += $(BLAS_LIB) endif ifeq ($(BLAS), mkl) COMMON_FLAGS += -mkl endif INCLUDE_DIRS += $(GXM_LIBRARY_PATH)/include LIBRARY_DIRS += $(GXM_LIBRARY_PATH)/lib LIBRARY_DIRS += $(LIB_BUILD_DIR) # libxsmm paths LIBRARIES += xsmm xsmmext INCLUDE_DIRS += $(LIBXSMM_PATH)/include LIBRARY_DIRS += $(LIBXSMM_PATH)/lib #MLSL paths ifeq ($(MLSL), 1) LIBRARIES += mlsl INCLUDE_DIRS += $(MLSL_ROOT)/intel64/include LIBRARY_DIRS += $(MLSL_ROOT)/intel64/lib CXXFLAGS += -DUSE_MLSL CXX = mpiicpc endif ifeq ($(MLSL_WITH_BF16), 1) CXXFLAGS += -DBF16_MLSL endif ifeq ($(LMDB), 1) LIBRARIES += lmdb LIBRARY_DIRS += $(GXM_LIBRARY_PATH)/lib INCLUDE_DIRS += $(GXM_LIBRARY_PATH)/include CXXFLAGS += -DUSE_LMDB endif # Optimized Buffer allocation for BP ifeq ($(BPOPT_ALLOC), 1) CXXFLAGS += -DUSE_OPTBP_ALLOC endif ifeq ($(NUMA_ON), 1) CXXFLAGS += -DUSE_NUMA endif ifeq ($(XSMM_TIMING), 1) CXXFLAGS += -DUSE_XSMM_TIMING endif # 
Return all ifeq ($(RETURN_NC), 1) CXXFLAGS += -DRETURNALL endif # Timing per layer ifeq ($(TIME), 1) CXXFLAGS += -DTIMING endif # Stats for layer activations/weights ifeq ($(STATS), 1) CXXFLAGS += -DGETSTATS endif ifeq ($(DUMP_ACT), 1) CXXFLAGS += -DDUMP_ACT_DATA endif ifeq ($(DUMP_WT), 1) CXXFLAGS += -DDUMP_WT_DATA endif ifeq ($(CANCHECK), 1) CXXFLAGS += -DCANARY_CHECK endif ifeq ($(FP32_BU), 1) CXXFLAGS += -DCHECK_BLOWUP_FP32 endif # Complete build flags. COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) CXXFLAGS += $(COMMON_FLAGS) $(WARNINGS) -std=c++11 LINKFLAGS += $(COMMON_FLAGS) $(WARNINGS) LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \ $(foreach library,$(LIBRARIES),-l$(library)) # build targets all: bin bin: $(FULL_BIN_NAME) $(BUILD_DIR_LINK): $(BUILD_DIR)/.linked $(BUILD_DIR)/.linked: @ mkdir -p $(BUILD_DIR) @ $(RM) $(OTHER_BUILD_DIR)/.linked @ $(RM) -r $(BUILD_DIR_LINK) @ ln -s $(BUILD_DIR) $(BUILD_DIR_LINK) @ touch $@ $(ALL_BUILD_DIRS): | $(BUILD_DIR_LINK) @ mkdir -p $@ $(FULL_BIN_NAME): $(OBJS) $(CXX) -o $@ $(OBJS) $(LINKFLAGS) $(LDFLAGS) $(BUILD_DIR)/%.o: %.cpp | $(ALL_BUILD_DIRS) @ echo CXX $< $(CXX) $< $(CXXFLAGS) -c -o $@ 2> $@.$(WARNS_EXT) \ || (cat $@.$(WARNS_EXT); exit 1) @ cat $@.$(WARNS_EXT) $(PROTO_BUILD_DIR)/%.pb.o: $(PROTO_BUILD_DIR)/%.pb.cc $(PROTO_GEN_HEADER) \ | $(PROTO_BUILD_DIR) @ echo CXX $< $(CXX) $< $(CXXFLAGS) -c -o $@ 2> $@.$(WARNS_EXT) \ || (cat $@.$(WARNS_EXT); exit 1) @ cat $@.$(WARNS_EXT) proto: $(PROTO_GEN_CC) $(PROTO_GEN_HEADER) $(PROTO_BUILD_DIR)/%.pb.cc $(PROTO_BUILD_DIR)/%.pb.h : \ $(PROTO_SRC_DIR)/%.proto | $(PROTO_BUILD_DIR) @ echo PROTOC $< protoc --proto_path=$(PROTO_SRC_DIR) --cpp_out=$(PROTO_BUILD_DIR) $< clean: @- $(RM) -rf $(ALL_BUILD_DIRS) @- $(RM) -rf $(OTHER_BUILD_DIR) @- $(RM) -rf $(BUILD_DIR_LINK) -include $(DEPS) 
libxsmm-1.17/samples/deeplearning/gxm/Makefile.config000066400000000000000000000042671415223013700227240ustar00rootroot00000000000000############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Sasikanth Avancha, Dhiraj Kalamkar, Alexander Heinecke (Intel Corp.) ############################################################################### # Graph execution Model (GxM) configuration CPU_ONLY := 1 # Compiler CUSTOM_CXX := icpc USE_OPENCV := 1 # BLAS choice: # mkl for MKL #BLAS := openblas BLAS := mkl # Use OpenMP OPENMP := 1 # Architecture ARCH := avx512_common # Path to libraries: glog, gflags, protobuf, opencv, lmdb GXM_LIBRARY_PATH := /swtools/caffe_deps # Path to libxsmm library for compute LIBXSMM_PATH := ../../.. 
BUILD_DIR := build # Use LMDB; this flag is one always to indicate that the # only supported db type is LMBD (e.g., not LEVELDB) LMDB := 1 # Set MLSL flag for multi-node runs; Clear for single-node runs MLSL := 0 # BF16 support in MLSL MLSL_WITH_BF16 := 0 # Set to 1 for debugging -- will compile with -O0 # turns on debug prints DEBUG := 0 # Return without compute to measure overhead of framework RETURN_NC := 0 # Allocate re-usable memory size in BackPropagation # Useful for MCDRAM-based CPU BPOPT_ALLOC := 0 #Turn on NUMA Init; Always on NUMA_ON := 1 #LIBXSMM Options; to time LIBXSMM layers XSMM_TIMING := 0 # Fine-grained timing; to time all layers TIME := 0 # Layer Summaries for debugging STATS := 0 # Dump activations from each layer for debugging DUMP_ACT := 0 # Dump weights from relevant layers for debugging DUMP_WT := 0 # Check canary values between tensors to ensure no overflows CANCHECK := 0 # Flag to check if first K tensor values have gone to # NaN or Inf due to bugs in code and stop execution FP32_BU := 0 libxsmm-1.17/samples/deeplearning/gxm/README.md000066400000000000000000000065261415223013700212770ustar00rootroot00000000000000# Deep Learning with GxM ## Compiling and Building GxM 1. Install Pre-requisite Libraries: Google logging module (glog), gflags, Google's data interchange format (Protobuf), OpenCV, LMDB 2. In Makefile.config, set GXM_LIBRARY_PATH variable to the path containing above libraries 3. In Makefile.config, set LIBXSMM_PATH variable to the path containing LIBXSMM library 4. Set/clear other flags in Makefile.config as required (see associated comments in Makefile.config) 5. source setup_env.sh 6. make clean; make ## Running GxM The network topology definitions directory is "model_zoo". Currently, it contains definitions for AlexNet (without LRN), ResNet-50, Inception v3 along with CIFAR10 and MNIST as simple test definitions. Each topology definition is in a .prototxt file. 
ResNet-50 can run with "dummy data", raw JPEG image data or with LMDB. Filenames indicate the data source along with the minibatch size. Inception v3 runs only with compressed LMDB data. The hyperparameter definitions for each topology are also in the corresponding directory under "model_zoo" in a .prototxt file with the suffix "solver". For a single-node, this file is called solver.prototxt. For multi-node the filename also contains the global minibatch size (=single node minibatch size x number of nodes);, e.g., solver_896.prototxt contains hyperparameters for MB=56 per node and 16 nodes. The "solver*" file also contains a flag that specifies whether to start execution from a checkpoint (and thus read load weights from the "./weights" directory) or from scratch; by default execution starts from scratch. Optimal parallelization of Convolutional layers in LIBXSMM happens when the number of OpenMP threads = MiniBatch. Therefore, on Xeon ```bash export OMP_NUM_THREADS= export KMP_AFFINITY=compact,granularity=fine,1,0 ``` The command line for a training run is: ```bash ./build/bin/gxm train ``` For example: ```bash ./build/bin/gxm train model_zoo/resnet/1_resnet50_dummy_56.prototxt model_zoo/resnet/solver.prototxt ``` ## Preping on RHEL 8.0 / CentOS 8.0 ```bash dnf install protobuf wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/protobuf-compiler-3.5.0-7.el8.x86_64.rpm dnf install protobuf-compiler-3.5.0-7.el8.x86_64.rpm wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/protobuf-devel-3.5.0-7.el8.x86_64.rpm dnf install protobuf-devel-3.5.0-7.el8.x86_64.rpm dnf install lmdb dnf install lmdb-devel wget http://repo.okay.com.mx/centos/8/x86_64/release/opencv-devel-3.4.1-9.el8.x86_64.rpm wget http://repo.okay.com.mx/centos/8/x86_64/release/opencv-3.4.1-9.el8.x86_64.rpm dnf install opencv-3.4.1-9.el8.x86_64.rpm dnf install opencv-devel-3.4.1-9.el8.x86_64.rpm wget 
http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/gflags-2.1.2-6.el8.x86_64.rpm wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/gflags-devel-2.1.2-6.el8.x86_64.rpm dnf install gflags-2.1.2-6.el8.x86_64.rpm dnf install gflags-devel-2.1.2-6.el8.x86_64.rpm wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/glog-devel-0.3.5-3.el8.x86_64.rpm wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/glog-0.3.5-3.el8.x86_64.rpm dnf install glog-0.3.5-3.el8.x86_64.rpm dnf install glog-devel-0.3.5-3.el8.x86_64.rpm ``` Make sure that the makefile follows the OpenCV Ver 3 path! libxsmm-1.17/samples/deeplearning/gxm/image_split_launch.sh000077500000000000000000000135451415223013700242050ustar00rootroot00000000000000#!/usr/bin/env sh # launch.sh [ARCH] [NUM_NODES] [TOPOLOGY] [MLSL_VER] # for example: launch.sh knl 2 googlenet ep #trap "set +x; sleep 1; set -x" DEBUG CUR_DIR=$(dirname `which $0`) if [ -z $1 ] || [ -z $2 ] || [ -z $3 ]; then echo "use: launch.sh [ARCH] [NUM_PROCS] [TOPOLOGY] [MLSL_VER](optional)" exit 1 fi arch_=$1 numprocs=$2 topo=$3 mlslver=ep #${4:-"ep"} TRAIN_VAL_PROTOTXT=`readlink -f $4` SOLVER_PROTOTXT=`readlink -f $5` checkpoint=$6 export EPLIB_SHM_SIZE_GB=10 export MLSL_HEAP_SIZE_GB=10 mcdram=1 export CLUSTER=endv echo "Running $topo topology with mlsl $mlslver on $arch_ in $numprocs processes on $CLUSTER cluster" #source ${CUR_DIR}/setup_env.sh #${CUR_DIR}/split-train-solver $arch_ $numprocs $topo if [ ! -d "${CUR_DIR}/$arch_" ]; then mkdir ${CUR_DIR}/$arch_ fi # Create a new directory for each run and copy the required input files work_dir="${CUR_DIR}/${arch_}/${arch_}_${numprocs}_${topo}" if [ $checkpoint == 0 ]; then rm -rf $work_dir mkdir -p $work_dir fi cd $work_dir if [ $checkpoint == 0 ]; then if [ ! -d weights ]; then mkdir -p weights fi if [ ! -d weights30 ]; then mkdir -p weights30 fi if [ ! -d weights60 ]; then mkdir -p weights60 fi if [ ! 
-d weights80 ]; then mkdir -p weights80 fi fi export WORK_DIR=$work_dir # Store all node names in an array. # Later we go thru this array # Note: PBS_NODEFILE is set by lsf, based on the parameters we pass # in bsub. For instance, if we request 2 nodes with bsub, PBS_NODEFILE contains two host names if [ "$CLUSTER" == "endv" ]; then cat $PBS_NODEFILE | uniq|sort > ${CUR_DIR}/hostfile elif [ "$CLUSTER" == "pcl" ]; then scontrol show hostnames > ${CUR_DIR}/hostfile fi if [ ! -f "${CUR_DIR}/hostfile" ]; then echo "Create hostfile at first" exit 1 fi # Names to configfile, binary (executable) files # cfile=${WORK_DIR}/nodeconfig.txt GXM_PATH=${CUR_DIR} xeonbin="${GXM_PATH}/build/bin/gxm train" cpuhostfile=${CUR_DIR}/hostfile nodenames=( `cat ${cpuhostfile}` ) # EPLIB configuration if [ ${arch_} == skx ]; then numservers=2 listep=6,34 elif [ ${arch_} == clx ]; then numservers=2 listep=6,34 elif [ ${arch_} == clxap ]; then numservers=4 listep=6,30,54,78 elif [ ${arch_} == knl ]; then numservers=2 listep=6,7,8,9,10,11,12,13 elif [ ${arch_} == knm ]; then numservers=2 listep=6,7,8,9,10,11,12,13 fi threadspercore=1 ppncpu=1 maxcores=`cpuinfo | grep "Processors(CPUs)" | awk '{print $3}'` maxcores=`cpuinfo | grep "Cores :" | awk '{print $3}'` load_bal_threads=0 numthreads=$(((maxcores-numservers-load_bal_threads)*threadspercore)) #numthreads=32 # MLSL configuration export MLSL_LOG_LEVEL=1 export MLSL_NUM_SERVERS=${numservers} export MLSL_SERVER_AFFINITY="${listep}" # PSM2 configuration export PSM2_MQ_RNDV_HFI_WINDOW=2097152 # to workaround PSM2 bug in IFS 10.2 and 10.3 export PSM2_IDENTIFY=1 # for debug export HFI_NO_CPUAFFINITY=1 # IMPI configuration export I_MPI_FABRICS=tmi export I_MPI_TMI_PROVIDER=psm2 export I_MPI_FALLBACK=0 export I_MPI_DYNAMIC_CONNECTION=0 export I_MPI_SCALABLE_OPTIMIZATION=0 export I_MPI_PIN_MODE=lib export I_MPI_PIN_DOMAIN=node export I_MPI_DEBUG=6 #export MKL_CBWR=AUTO # Produce the configuration file for mpiexec. 
Each line of the config file contains a # host, enviornment, binary name. rm -f $cfile node_id=0 numnodes=( `cat ${cpuhostfile} | grep -v ^$ | wc -l` ) max_ppn=$((numprocs/numnodes)) numthreads_per_proc=$((numthreads/max_ppn)) #MPIEXECARGS=" -np ${numnodes} -ppn $max_ppn -genv MLSL_NUM_SERVERS ${numservers} -genv MLSL_SERVER_AFFINITY \"${listep}\" -genv OMP_NUM_THREADS ${numthreads_per_proc} -genv KMP_AFFINITY \"$affinitystr\" " # OMP configuration if [ "$threadspercore" == "1" ]; then if [ "$numservers" == "0" ]; then affinitystr="proclist=[0-$((maxcores-1))],granularity=thread,explicit" elif [ "$numservers" == "2" ]; then affinitystr="proclist=[0-5,7-33,35-55],granularity=thread,explicit" #affinitystr="proclist=[0-5,7-16,28-33,35-44],granularity=thread,explicit" elif [ "$numservers" == "1" ]; then affinitystr="proclist=[0-5,7-27],granularity=thread,explicit" elif [ "$numservers" == "4" ]; then affinitystr="proclist=[0-5,7-29,31-53,55-77,79-95],granularity=thread,explicit" fi else affinitystr="proclist=[0-5,$((5+numservers+1))-$((maxcores-1)),$((maxcores))-$((maxcores+5)),$((maxcores+5+numservers+1))-$((2*maxcores-1))],granularity=thread,explicit" fi export KMP_AFFINITY=$affinitystr echo THREAD SETTINGS: Affinity $affinitystr Threads $numthreads Placement $KMP_PLACE_THREADS MPIEXECARGS=" -np ${numnodes} -ppn $max_ppn -genv OMP_NUM_THREADS ${numthreads_per_proc} -genv KMP_AFFINITY \"$affinitystr\" " mkdir -p ${WORK_DIR}/${arch_}_${numprocs}_${topo} if [ ${arch_} == skx ]; then export NUMACTLCMD= elif [ ${arch_} == skxbf16 ]; then export NUMACTLCMD= elif [ ${arch_} == clx ]; then export NUMACTLCMD= elif [ ${arch_} == clxap ]; then export NUMACTLCMD= elif [ ${arch_} == knl ]; then export NUMACTLCMD="numactl --preferred=$mcdram" elif [ ${arch_} == knm ]; then export NUMACTLCMD="numactl --preferred=$mcdram" fi cd $WORK_DIR if [ "$mlslver" == "ep" ]; then for host in `cat $cpuhostfile`; do ssh -n $host "rm -rf /dev/shm/*shm*; killall -q mpiexec.hydra pcldnn_server; 
killall -q mpiexec.hydra ep_server; for j in \$(ipcs -a | awk '{print \$1}' | grep -v '\-' | grep -v 'key'); do ipcrm -M \${j} > /dev/null 2&>1; done" & done wait fi if [ "$mlslver" == "nompi" ]; then echo "nompi" $NUMACTLCMD $xeonbin ${TRAIN_VAL_PROTOTXT} ${SOLVER_PROTOTXT} else echo "mpiexec" echo GLOG_minloglevel=0 mpiexec.hydra -l $MPIEXECARGS -hostfile $cpuhostfile $NUMACTLCMD $xeonbin ${TRAIN_VAL_PROTOTXT} ${SOLVER_PROTOTXT} GLOG_minloglevel=0 mpiexec.hydra -l $MPIEXECARGS -hostfile $cpuhostfile $NUMACTLCMD $xeonbin ${TRAIN_VAL_PROTOTXT} ${SOLVER_PROTOTXT} 2>&1 | tee -a outputCluster.txt fi libxsmm-1.17/samples/deeplearning/gxm/include/000077500000000000000000000000001415223013700214325ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/gxm/include/Accuracy.hpp000066400000000000000000000051531415223013700237010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" using namespace std; using namespace gxm; class AccuracyParams : public NNParams { public: AccuracyParams(void) {} virtual ~AccuracyParams(void) {} void set_axis(int axis) { axis_ = axis; } int get_axis() { return axis_; } void set_top_k(int top_k) { top_k_ = top_k; } int get_top_k() { return top_k_; } protected: int axis_, top_k_; }; static MLParams* parseAccuracyParams(NodeParameter* np) { AccuracyParams *p = new AccuracyParams(); AccuracyParameter ap = np->accuracy_param(); // Set name of node assert(!np->name().empty()); p->set_node_name(np->name()); //Set node type (Convolution, FullyConnected, etc) assert(!np->type().empty()); p->set_node_type(np->type()); //Set tensor names //Set tensor names for(int i=0; ibottom_size(); i++) { assert(!np->bottom(i).empty()); p->set_bottom_names(np->bottom(i)); } //Set Mode for the node assert((np->mode() == TRAIN) || (np->mode() == TEST)); p->set_mode(np->mode()); int axis = ap.axis(); p->set_axis(axis); int top_k = ap.top_k(); p->set_top_k(top_k); return p; } class AccuracyNode : public NNNode { public: AccuracyNode(AccuracyParams* p, MLEngine* e); virtual ~AccuracyNode(void) {} protected: void forwardPropagate(); vector tenBot_; vector tenBotData_; string node_name_, node_type_; Shape ts_; int top_k_, train_batch_count_, test_batch_count_; double avg_train_acc_, avg_test_acc_; MLEngine *eptr_; #if 1 vector max_val; vector max_id; vector > bot_data_vec; #endif void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } }; libxsmm-1.17/samples/deeplearning/gxm/include/Concat.hpp000066400000000000000000000064111415223013700233540ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. 
* * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include "assert.h" #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" #include "ConcatImpl.hpp" #include "ConcatXSMM.hpp" using namespace std; using namespace gxm; class ConcatParams : public NNParams { public: ConcatParams(void) {} virtual ~ConcatParams(void) {} void set_concat_axis(int axis) {axis_ = axis; } int get_concat_axis() { return axis_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } void set_compute_engine(int ce) { compute_engine_ = ce; } int get_compute_engine() { return compute_engine_; } void set_algo_type(int at) { algotype_ = at; } int get_algo_type() { return algotype_; } protected: int axis_, data_type_, compute_engine_, algotype_; }; static MLParams* parseConcatParams(NodeParameter* np) { ConcatParams *cp = new ConcatParams(); // Set name of node string str = np->name(); assert(!str.empty()); cp->set_node_name(str); //Set node type (Convolution, FullyConnected, etc) str = np->type(); assert(!str.empty()); cp->set_node_type(str); //Set tensor names for(int i=0; ibottom_size(); i++) cp->set_bottom_names(np->bottom(i)); assert(np->top_size() == 1); assert(!np->top(0).empty()); cp->set_top_names(np->top(0)); //Set Mode for the node assert((np->mode() == TRAIN) || (np->mode() == TEST)); cp->set_mode(np->mode()); //Set backprop needed/not needed flag for this node cp->set_bprop_flag(np->propagate_down()); ConcatParameter pcp = np->concat_param(); cp->set_concat_axis(pcp.axis()); 
cp->set_data_type(pcp.data_type()); cp->set_compute_engine(pcp.engine()); cp->set_algo_type(pcp.algotype()); return cp; } class ConcatNode : public NNNode { public: ConcatNode(ConcatParams *p, MLEngine* e); virtual ~ConcatNode(void) {} protected: void forwardPropagate(); void backPropagate(); void configure(int engine); void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } Tensor* tenTop_; // Output tensor pointer vector tenBot_; // Input tensor pointer ConcatImplParams gparams_; vector tenBotDiff_, tenBotData_; // Data & Gradients with respect to input TensorBuf *tenTopData_, *tenTopDiff_; // Output data and gradients with respect to output Shape ts_; vector bot_cengine_; int count_ = 0; ConcatImpl *impl=NULL; }; libxsmm-1.17/samples/deeplearning/gxm/include/ConcatImpl.hpp000066400000000000000000000045701415223013700242020ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include #include #include "common.hpp" #include "check.hpp" #include "Tensor.hpp" typedef struct { int nOutput; vector nInput; int bdims; int tdims; int iHeight; int iWidth; int oHeight; int oWidth; int batch_size; int axis; int algType; int data_type; int num_threads; } ConcatImplParams; class ConcatImpl { protected: ConcatImplParams *gp; int engine; TensorLayoutType top_layout_type; vector gbot_layout_type; void *top_layout; vector gbot_layout; int top_compute_engine=-1; vector bot_compute_engine; string next_ntype, nname; public: ConcatImpl(ConcatImplParams* gp_, int engine_): gp(gp_), engine(engine_) {} void set_top_compute_engine(int e) { top_compute_engine = e;} void set_bot_compute_engine(int e) { bot_compute_engine.push_back(e);} void set_next_node_type(string s) { next_ntype = s; } void set_node_name(string s) { nname = s; } virtual void forwardPropagate(vector& inp, TensorBuf *outp, int tid) = 0; virtual void backPropagate(TensorBuf* deloutp, vector& delinp, int tid) = 0; virtual void forwardPropagate(vector& inp, TensorBuf* outp) { switch(engine) { case XSMM: forwardPropagate(inp, outp, 0); break; } } virtual void backPropagate(TensorBuf* deloutp, vector& delinp) { switch(engine) { case XSMM: backPropagate(deloutp, delinp, 0); break; } } }; libxsmm-1.17/samples/deeplearning/gxm/include/ConcatXSMM.hpp000066400000000000000000000027331415223013700240640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include "ConcatImpl.hpp" #include "libxsmm.h" class ConcatXSMM : public ConcatImpl { public: ConcatXSMM(ConcatImplParams* gp, int engine) : ConcatImpl(gp, engine) { top_layout_type = LIBXSMM_CUSTOM_LAYOUT; top_layout = NULL; for(int n=0; nnInput.size(); n++) { gbot_layout_type.push_back(LIBXSMM_CUSTOM_LAYOUT); gbot_layout.push_back(NULL); } } void forwardPropagate(vector& inp, TensorBuf* outp, int tid); void backPropagate(TensorBuf* deloutp, vector& delinp, int tid); void convert_NCHW_to_NCHWV(float*, int, int, int, int, float*); void convert_NCHWV_to_NCHW(float*, int, int, int, int, float*); }; libxsmm-1.17/samples/deeplearning/gxm/include/Config.hpp000066400000000000000000000022541415223013700233530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include "proto/gxm.pb.h" using namespace std; using namespace gxm; bool parseMachineConfig(const string mcFile, MachineParameter* param); bool parseMLConfig(const string mlFile, NTGParameter* param); //bool parseStrategyConfig(const string& strategyFile, StrategyParameter* param); // Read saved tunning parameters bool parseSolverConfig(const string solverFile, SolverParameter* param); libxsmm-1.17/samples/deeplearning/gxm/include/Conv.hpp000066400000000000000000000275711415223013700230640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include "assert.h" #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "Solver.hpp" #include "proto/gxm.pb.h" #include "ConvImpl.hpp" #include "ConvXSMM.hpp" using namespace std; using namespace gxm; class ConvParams : public NNParams { public: ConvParams(void) {} virtual ~ConvParams(void) {} void set_kernel_dims(int kdims, int ksize) { for(int i=0; ikernel_dim_.push_back(ksize); } void set_kernel_dims(int kh, int kw, int kd) { this->kernel_dim_.push_back(kh); this->kernel_dim_.push_back(kw); this->kernel_dim_.push_back(kd); } vector& get_kernel_dims() { return kernel_dim_; } void set_strides(int sdims, int stride) { for(int i=0; istrides_.push_back(stride); } void set_strides(int sh, int sw, int sd) { this->strides_.push_back(sh); this->strides_.push_back(sw); this->strides_.push_back(sd); } vector& get_strides() { return strides_; } void set_pads(int pdims, int pad) { for(int i=0; ipads_.push_back(pad); } void set_pads(int ph, int pw, int pd) { this->pads_.push_back(ph); this->pads_.push_back(pw); this->pads_.push_back(pd); } vector& get_pads() { return pads_; } void set_output_pads(int pdims, int pad) { for(int i=0; iopads_.push_back(pad); } void set_output_pads(int ph, int pw, int pd) { this->opads_.push_back(ph); this->opads_.push_back(pw); this->opads_.push_back(pd); } vector& get_output_pads() { return opads_; } void set_group(int g) { this->group_ = g;} int get_group() { return this->group_; } void set_nOutput(int num_output) { this->nOutput_ = num_output; } int get_output() { return nOutput_; } void set_weight_filler_type(string ftype) { wfiller_type_ = ftype; } string get_weight_filler_type() { return wfiller_type_; } void set_std(float s) { std_ = s; } float get_std() { return std_; } void set_variance_norm(int v) { variance_norm_ = v; } int get_variance_norm() { return variance_norm_; } void 
set_bias_filler_type(string ftype) { bfiller_type_ = ftype; } string get_bias_filler_type() { return bfiller_type_; } void set_value(float v) { value_ = v; } float get_value() { return value_; } void set_fused_relu(bool relu) { relu_ = relu; } bool get_fused_relu() { return relu_; } void set_bwd_relu(bool br) { bwd_relu_ = br; } bool get_bwd_relu() { return bwd_relu_; } void set_bias_term(bool bias) { bias_term_ = bias; } bool get_bias_term() { return bias_term_; } void set_compute_stats(bool s) { compute_stats_ = s; } bool get_compute_stats() { return compute_stats_; } void set_physical_padding(bool p) { phys_pad_ = p; } bool get_physical_padding() { return phys_pad_; } void set_compute_engine(int ce) { compute_engine_ = ce; } int get_compute_engine() { return compute_engine_; } void set_algo_type(int at) { algotype_ = at; } int get_algo_type() { return algotype_; } void set_global_params(vector psv) { for(int i=0; i& get_lr_mult() { return lr_mult_; } const vector& get_decay_mult() { return decay_mult_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } protected: vector kernel_dim_; // Order of dimensions is Height, Width, Depth (for 3D Conv) vector strides_; // Order follows kernel dimension vector pads_, opads_; // Order follows kernel dimension int nOutput_; // Number of output feature maps string wfiller_type_, bfiller_type_; float std_, value_; bool relu_, bwd_relu_, bias_term_, compute_stats_; bool phys_pad_; int group_, compute_engine_, algotype_; int variance_norm_, data_type_; vector lr_mult_, decay_mult_; }; static MLParams* parseConvParams(NodeParameter* np) { ConvParams* cp = new ConvParams(); // Set name of node string str = np->name(); assert(!str.empty()); cp->set_node_name(str); //Set node type (Convolution, FullyConnected, etc) str = np->type(); assert(!str.empty()); cp->set_node_type(str); //Set tensor names assert(np->bottom_size() == 1); assert(!np->bottom(0).empty()); 
cp->set_bottom_names(np->bottom(0)); for(int i=0; itop_size(); i++) { assert(!np->top(i).empty()); cp->set_top_names(np->top(i)); } //Set Mode for the node assert((np->mode() == TRAIN) || (np->mode() == TEST)); cp->set_mode(np->mode()); //Set backprop needed/not needed flag for this node cp->set_bprop_flag(np->propagate_down()); vector psv; for(int i=0; iparam_size(); i++) psv.push_back(np->param(i)); cp->set_global_params(psv); ConvolutionParameter pcp = np->convolution_param(); bool bias_term = pcp.bias_term(); int kdims = pcp.kernel_size_size(); switch(kdims) { int kh, kw, kd; case 0: kh = pcp.kernel_h(); kw = pcp.kernel_w(); if(pcp.ndims() == 3) kd = pcp.kernel_d(); else kd = 0; assert((kh > 0) && (kw > 0)); cp->set_kernel_dims(kh, kw, kd); break; case 1: kh = pcp.kernel_size(0); if(pcp.ndims() == 2) cp->set_kernel_dims(kh, kh, 0); else if(pcp.ndims() == 3) cp->set_kernel_dims(kh, kh, kh); break; case 2: kh = pcp.kernel_size(0); kw = pcp.kernel_size(1); assert(pcp.ndims() == 2); cp->set_kernel_dims(kh, kw, 0); break; case 3: kh = pcp.kernel_size(0); kw = pcp.kernel_size(1); kd = pcp.kernel_size(2); assert(pcp.ndims() == 3); cp->set_kernel_dims(kh, kw, kd); break; } // strides int sdims = pcp.stride_size(); switch(sdims) { int sh, sw, sd; case 0: sh = pcp.stride_h(); sw = pcp.stride_w(); if(pcp.ndims() == 3) sd = pcp.stride_d(); else sd = 0; assert((sh > 0) && (sw > 0)); cp->set_strides(sh, sw, sd); break; case 1: sh = pcp.stride(0); if(pcp.ndims() == 2) cp->set_strides(sh, sh, 0); else if(pcp.ndims() == 3) cp->set_strides(sh, sh, sh); break; case 2: sh = pcp.stride(0); sw = pcp.stride(1); assert(pcp.ndims() == 2); cp->set_strides(sh, sw, 0); break; case 3: sh = pcp.stride(0); sw = pcp.stride(1); sd = pcp.stride(2); assert(pcp.ndims() == 3); cp->set_strides(sh, sw, sd); break; } // pads int pdims = pcp.pad_size(); switch(pdims) { int ph, pw, pd; case 0: ph = pcp.pad_h(); pw = pcp.pad_w(); if(pcp.ndims() == 3) pd = pcp.pad_d(); else pd = 0; cp->set_pads(ph, pw, 
pd); break; case 1: ph = pcp.pad(0); if(pcp.ndims() == 2) cp->set_pads(ph, ph, 0); else if(pcp.ndims() == 3) cp->set_pads(ph, ph, ph); break; case 2: ph = pcp.pad(0); pw = pcp.pad(1); assert(pcp.ndims() == 2); cp->set_pads(ph, pw, 0); break; case 3: ph = pcp.pad(0); pw = pcp.pad(1); pd = pcp.pad(2); assert(pcp.ndims() == 3); cp->set_pads(ph, pw, pd); break; } // output pads int opdims = pcp.opad_size(); switch(opdims) { int oph, opw, opd; case 0: oph = pcp.opad_h(); opw = pcp.opad_w(); if(pcp.ndims() == 3) opd = pcp.opad_d(); else opd = 0; cp->set_output_pads(oph, opw, opd); break; case 1: oph = pcp.opad(0); if(pcp.ndims() == 2) cp->set_output_pads(oph, oph, 0); else if(pcp.ndims() == 3) cp->set_output_pads(oph, oph, oph); break; case 2: oph = pcp.opad(0); opw = pcp.opad(1); assert(pcp.ndims() == 2); cp->set_output_pads(oph, opw, 0); break; case 3: oph = pcp.opad(0); opw = pcp.opad(1); opd = pcp.opad(2); assert(pcp.ndims() == 3); cp->set_output_pads(oph, opw, opd); break; } if(pcp.group() > 1) cp->set_group(pcp.group()); else cp->set_group(1); int nOutput = pcp.num_output(); cp->set_nOutput(nOutput); FillerParameter wp = pcp.weight_filler(); cp->set_weight_filler_type(wp.type()); cp->set_std(wp.std()); cp->set_variance_norm(wp.variance_norm()); cp->set_bias_term(bias_term); if(bias_term) { FillerParameter bp = pcp.bias_filler(); cp->set_bias_filler_type(bp.type()); cp->set_value(bp.value()); } cp->set_fused_relu(pcp.fusedrelu()); cp->set_bwd_relu(pcp.bwd_relu()); cp->set_compute_stats(pcp.compute_stats()); cp->set_physical_padding(pcp.physical_padding()); cp->set_data_type(pcp.data_type()); cp->set_compute_engine(pcp.engine()); cp->set_algo_type(pcp.algotype()); return cp; } class ConvNode : public NNNode { public: ConvNode(ConvParams* p, MLEngine* e); virtual ~ConvNode(void) {} string get_weight_filler_type() { return wfiller_type_; } float get_std() { return std_; } string get_bias_filler_type() { return bfiller_type_; } float get_value() { return value_; } void 
fillWeightBuffers(TensorBuf* tBuf, int buftype, long long int size); void fillWeightMultipliers(float* lr_mult, float* decay_mult, long long int bytes); void fillBiasBuffers(TensorBuf* tBuf, int buftype, long long int size); void fillBiasMultipliers(float* lr_mult, float* decay_mult, long long int bytes); void Checkpoint(TensorBuf* tBuf, string name, string format); void convert_bf16_f32(libxsmm_bfloat16* in, float* out, int len); void convert_f32_bf16(float* in, libxsmm_bfloat16* out, int len); protected: void forwardPropagate(); void backPropagate(); void weightUpdate(); void solverStep(); void configure(int engine); void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } Tensor *tenTop_, *tenBot_, *tenWeight_, *tenBias_; ConvImplParams gparams_; TensorBuf *tenBotDiff_, *tenBotData_; // Data & Gradients with respect to input TensorBuf *tenTopData_; TensorBuf *tenTopDiff_; // Output data TensorBuf *tenWeightDiff_, *tenWeightData_, *tenWeightInc_; // Weight gradients, data, increments TensorBuf *tenBiasData_, *tenBiasDiff_, *tenBiasInc_; // Bias data, gradients, increments TensorBuf *tenScratchData_; Shape ts_, ws_; string wfiller_type_, bfiller_type_; string weight_, bias_, mean_, mean2_; int variance_norm_; float std_, value_; int bot_cengine_; int count_, in_dtype, out_dtype; vector lr_mult_, decay_mult_; bool first_fp = true, first_bp=true; bool compute_stats_; libxsmm_bfloat16* bf16_wt_ptr=NULL; float *cbptr, *stptr=NULL, *dwptr=NULL; ConvImpl *impl=NULL; SolverNode *solver_; MLEngine* eptr_; }; libxsmm-1.17/samples/deeplearning/gxm/include/ConvImpl.hpp000066400000000000000000000063121415223013700236740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include #include #include "common.hpp" #include "check.hpp" #include "Tensor.hpp" typedef struct { string node_name; int nInput, nOutput; int batch_size; int iHeight, iWidth, iDepth; int oHeight, oWidth, oDepth; int ipad_h, ipad_w, ipad_d; int opad_h, opad_w, opad_d; int pad_h, pad_w, pad_d; int stride_h, stride_w, stride_d; int kh, kw, kd; int group; bool bias_term, compute_stats; bool relu, bwd_relu, physical_padding; int algType; int bdims, tdims, wdims, bidims; int in_data_type, out_data_type; int num_threads; int num_numa_nodes; } ConvImplParams; class ConvImpl { protected: ConvImplParams *gp; int engine; TensorLayoutType top_layout_type, gbot_layout_type; void *top_layout, *gbot_layout; int top_compute_engine=-1; int bot_compute_engine=-1; string nname; TensorBuf* scratchp; public: ConvImpl(ConvImplParams* gp_, int engine_): gp(gp_), engine(engine_) {} void set_top_compute_engine(int e) { top_compute_engine = e;} void set_bot_compute_engine(int e) { bot_compute_engine = e;} void set_node_name(string s) { nname = s; } void set_scratch_buffer(TensorBuf* sb) { scratchp = sb; } virtual void forwardPropagate(TensorBuf *inp, TensorBuf *weightp, TensorBuf* hweightp, TensorBuf *biasp, TensorBuf *outp, int tid) = 0; virtual void backPropagate(TensorBuf* inp, TensorBuf *deloutp, TensorBuf* weightp, TensorBuf *delinp, int tid) = 0; virtual void weightUpdate(TensorBuf *inp, TensorBuf *deloutp, TensorBuf *delweightp, TensorBuf *delbiasp, int tid) = 0; virtual void dumpBuffer(TensorBuf*, void*) {} virtual void forwardPropagate(TensorBuf *inp, TensorBuf* weightp, TensorBuf* hweightp, TensorBuf* biasp, TensorBuf *outp) { switch(engine) 
{ case XSMM: forwardPropagate(inp, weightp, hweightp, biasp, outp, 0); break; } } virtual void backPropagate(TensorBuf* inp, TensorBuf* weightp, TensorBuf *deloutp, TensorBuf *delinp) { switch(engine) { case XSMM: backPropagate(inp, weightp, deloutp, delinp, 0); break; } } virtual void weightUpdate(TensorBuf *inp, TensorBuf *deloutp, TensorBuf *delweightp, TensorBuf *delbiasp) { switch(engine) { case XSMM: weightUpdate(inp, deloutp, delweightp, delbiasp, 0); break; } } }; libxsmm-1.17/samples/deeplearning/gxm/include/ConvXSMM.hpp000066400000000000000000000055121415223013700235600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include #include "common.hpp" #include "check.hpp" #include "ConvImpl.hpp" #include "libxsmm.h" #define VLEN 16 #define CHKERR_LIBXSMM_DNN(A) if ( A != LIBXSMM_DNN_SUCCESS )\ {\ fprintf(stdout, "%s, %s\n", gp->node_name.c_str(), libxsmm_dnn_get_error(A) );\ fflush(stdout);\ } class ConvXSMM : public ConvImpl { protected: ConvImpl *gp_; libxsmm_dnn_conv_desc conv_desc; libxsmm_dnn_layer* libxsmm_handle[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_input[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_output[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_filter[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_checkpoint_filter = NULL; libxsmm_dnn_tensor* libxsmm_checkpoint_history_filter = NULL; libxsmm_dnn_tensor* libxsmm_delinput[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_deloutput[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_delfilter[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_temp = NULL; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; ConvImplParams *cp; float *dinptr, *dwtptr; bool updated_scratch_fwd=false, updated_scratch_bwd=false, updated_scratch_upd=false; void *in_ptr[NUM_NUMA_NODES] = {NULL}, *wt_ptr[NUM_NUMA_NODES]={NULL}, *hwt_ptr=NULL; void *out_ptr[NUM_NUMA_NODES] = {NULL}, *f32_wt_ptr[NUM_NUMA_NODES]={NULL}; void *din_ptr[NUM_NUMA_NODES] = {NULL}, *dout_ptr[NUM_NUMA_NODES] = {NULL}; void *scratch[NUM_NUMA_NODES]={NULL}; int prev_scratch_size = 0; public: ConvXSMM(ConvImplParams *gp, int engine); virtual ~ConvXSMM(void) {} void forwardPropagate(TensorBuf *inp, TensorBuf* weightp, TensorBuf* hweightp, TensorBuf* biasp, TensorBuf *outp, int tid); void backPropagate(TensorBuf *inp, TensorBuf* weightp, TensorBuf *deloutp, TensorBuf *delinp, int tid); void weightUpdate(TensorBuf *inp, TensorBuf *deloutp, TensorBuf *delweightp, TensorBuf *delbiasp, int tid); void 
dumpBuffer(TensorBuf *wt, void* temp); }; libxsmm-1.17/samples/deeplearning/gxm/include/Dropout.hpp000066400000000000000000000067311415223013700236060ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" #include "check.hpp" #include "io.hpp" using namespace std; using namespace gxm; typedef struct { int nInput, nOutput; int iHeight, iWidth; int oHeight, oWidth; int batch_size; int algType, data_type; int num_threads; }DropoutImplParams; class DropoutParams : public NNParams { public: DropoutParams(void) {} virtual ~DropoutParams(void) {} void set_dropout_ratio(float r) { dropout_ratio_ = r; } float get_dropout_ratio() { return dropout_ratio_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } void set_compute_engine(int ce) { compute_engine_ = ce; } int get_compute_engine() { return compute_engine_; } void set_algo_type(int at) { algotype_ = at; } int get_algo_type() { return algotype_; } protected: float dropout_ratio_; int compute_engine_, algotype_, data_type_; }; static MLParams* parseDropoutParams(NodeParameter* np) { DropoutParams* dp = new DropoutParams(); // Set name of node string str = np->name(); assert(!str.empty()); dp->set_node_name(str); //Set node type (Dropout) str = np->type(); assert(!str.empty()); dp->set_node_type(str); //Set tensor names 
assert(np->bottom_size() == 1); assert(!np->bottom(0).empty()); dp->set_bottom_names(np->bottom(0)); assert(np->top_size() == 1); assert(!np->top(0).empty()); dp->set_top_names(np->top(0)); //Set Mode for the node assert((np->mode() == TRAIN) || (np->mode() == TEST)); dp->set_mode(np->mode()); //Set backprop needed/not needed flag for this node dp->set_bprop_flag(np->propagate_down()); DropoutParameter p = np->dropout_param(); dp->set_dropout_ratio(p.dropout_ratio()); dp->set_data_type(p.data_type()); dp->set_compute_engine(p.engine()); dp->set_algo_type(p.algotype()); return dp; } class DropoutNode : public NNNode { public: DropoutNode(DropoutParams* p, MLEngine* e); virtual ~DropoutNode(void) {} protected: void forwardPropagate(); void backPropagate(); void configure(int engine); void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } Tensor *tenTop_; // Output tensor pointer Tensor *tenBot_; // Input tensor pointer void *tenMask_; TensorBuf *tenBotDiff_, *tenBotData_; // Data & Gradients with respect to input TensorBuf *tenTopData_; // Output buffer unsigned int *seeds; // Mask and seeds buffers Shape ts_; float threshold_, scale_; unsigned int uint_threshold_; MLEngine* eptr_; DropoutImplParams gparams_; }; libxsmm-1.17/samples/deeplearning/gxm/include/DummyData.hpp000066400000000000000000000126621415223013700240370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" #include "check.hpp" #include "libxsmm.h" using namespace std; using namespace gxm; class DummyDataParams: public NNParams { public: DummyDataParams(void) {} virtual ~DummyDataParams(void) {} void set_lookahead(int l) { this->lookahead_ = l; } void set_chunk(int chunk) { this->chunk_ = chunk; } void set_shape_zero() { shape_.ndims = 0; for(int i=0; idummy_data_param()); // Set name of node assert(!np->name().empty()); p->set_node_name(np->name()); //Set node type (Convolution, FullyConnected, etc) assert(!np->type().empty()); p->set_node_type(np->type()); //Set tensor names assert(np->bottom_size() == 0); for(int i=0; itop_size(); i++) { assert(!np->top(i).empty()); p->set_top_names(np->top(i)); } //Set Mode for the node assert((np->mode() == TRAIN) || (np->mode() == TEST)); p->set_mode(np->mode()); p->set_bprop_flag(np->propagate_down()); if(ddp != NULL) { TensorShape s = ddp->shape(0); int ndims = s.dim_size(); for(int i=0; i 0); p->set_shape_zero(); if(ndims == 1) p->set_shape(s.dim(0)); else if(ndims == 2) { if(s.dim(1) > 3) { p->set_shape(s.dim(1)); p->set_num_train_files(s.dim(0)); p->set_num_test_files(s.dim(0)); } else p->set_shape(s.dim(0), s.dim(1)); } else if(ndims == 4) p->set_shape(s.dim(0), s.dim(1), s.dim(2), s.dim(3)); else if(ndims == 5) { p->set_shape(s.dim(1), s.dim(2), s.dim(3), s.dim(4)); p->set_num_train_files(s.dim(0)); p->set_num_test_files(s.dim(0)); } p->set_pad_h(ddp->pad_h()); p->set_pad_w(ddp->pad_w()); FillerParameter dfp = ddp->data_filler(0); p->set_filler_type(dfp.type()); if(dfp.value()) p->set_filler_val(dfp.value()); else p->set_filler_val(0.0f); p->set_data_type(ddp->data_type()); p->set_compute_engine(ddp->engine()); } return p; } class DummyDataNode : public NNNode { public: DummyDataNode(DummyDataParams* p, 
MLEngine* e); virtual ~DummyDataNode(void) {} void fillData(float* ptr, long long int size); void fillData(float *inptr, libxsmm_bfloat16* outptr, long long int size); void fillData(int* ptr, long long int size); protected: Tensor *tenBot_; vector tenTop_; vector tenTopData_; string node_name_, node_type_; string filler_type_; float filler_val_; int pad_h_, pad_w_; int global_batch_size_, num_machines_; Shape* ts_; bool first_fp=true; void forwardPropagate(); }; libxsmm-1.17/samples/deeplearning/gxm/include/Eltwise.hpp000066400000000000000000000063621415223013700235660ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include "assert.h" #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" #include "EltwiseImpl.hpp" #include "EltwiseXSMM.hpp" using namespace std; using namespace gxm; class EltwiseParams : public NNParams { public: EltwiseParams(void) {} virtual ~EltwiseParams(void) {} void set_op(int op) {op_ = op; } int get_op() { return op_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } void set_compute_engine(int ce) { compute_engine_ = ce; } int get_compute_engine() { return compute_engine_; } void set_algo_type(int at) { algotype_ = at; } int get_algo_type() { return algotype_; } protected: int op_, compute_engine_, algotype_, data_type_; }; static MLParams* parseEltwiseParams(NodeParameter* np) { EltwiseParams *ep = new EltwiseParams(); // Set name of node string str = np->name(); assert(!str.empty()); ep->set_node_name(str); //Set node type (Convolution, FullyConnected, etc) str = np->type(); assert(!str.empty()); ep->set_node_type(str); //Set tensor names for(int i=0; ibottom_size(); i++) ep->set_bottom_names(np->bottom(i)); assert(np->top_size() == 1); assert(!np->top(0).empty()); ep->set_top_names(np->top(0)); //Set Mode for the node assert((np->mode() == TRAIN) || (np->mode() == TEST)); ep->set_mode(np->mode()); //Set backprop needed/not needed flag for this node ep->set_bprop_flag(np->propagate_down()); EltwiseParameter pep = np->eltwise_param(); ep->set_op(pep.op()); ep->set_data_type(pep.data_type()); ep->set_compute_engine(pep.engine()); ep->set_algo_type(pep.algotype()); return ep; } class EltwiseNode : public NNNode { public: EltwiseNode(EltwiseParams *p, MLEngine* e); virtual ~EltwiseNode(void) {} protected: void forwardPropagate(); void backPropagate(); void configure(int engine); void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } Tensor* 
tenTop_; // Output tensor pointer vector tenBot_; // Input tensor pointer EltwiseImplParams gparams_; vector tenBotDiff_, tenBotData_; // Data & Gradients with respect to input TensorBuf *tenTopData_, *tenTopDiff_; // Output data and gradients with respect to output Shape ts_; vector bot_cengine_; int count_ = 0; EltwiseImpl *impl=NULL; }; libxsmm-1.17/samples/deeplearning/gxm/include/EltwiseImpl.hpp000066400000000000000000000045731415223013700244120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include #include #include "common.hpp" #include "check.hpp" #include "Tensor.hpp" typedef struct { int nOutput; vector nInput; int bdims; int tdims; int iHeight; int iWidth; int oHeight; int oWidth; int batch_size; int op; int algType; int data_type; int num_threads; } EltwiseImplParams; class EltwiseImpl { protected: EltwiseImplParams *gp; int engine; TensorLayoutType top_layout_type; vector gbot_layout_type; void *top_layout; vector gbot_layout; int top_compute_engine=-1; vector bot_compute_engine; string next_ntype, nname; public: EltwiseImpl(EltwiseImplParams* gp_, int engine_): gp(gp_), engine(engine_) {} void set_top_compute_engine(int e) { top_compute_engine = e;} void set_bot_compute_engine(int e) { bot_compute_engine.push_back(e);} void set_next_node_type(string s) { next_ntype = s; } void set_node_name(string s) { nname = s; } virtual void forwardPropagate(vector& inp, TensorBuf *outp, int tid) = 0; 
virtual void backPropagate(TensorBuf* deloutp, vector& delinp, int tid) = 0; virtual void forwardPropagate(vector& inp, TensorBuf* outp) { switch(engine) { case XSMM: forwardPropagate(inp, outp, 0); break; } } virtual void backPropagate(TensorBuf* deloutp, vector& delinp) { switch(engine) { case XSMM: backPropagate(deloutp, delinp, 0); break; } } }; libxsmm-1.17/samples/deeplearning/gxm/include/EltwiseXSMM.hpp000066400000000000000000000027411415223013700242700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include "EltwiseImpl.hpp" #include "libxsmm.h" class EltwiseXSMM : public EltwiseImpl { public: EltwiseXSMM(EltwiseImplParams* gp, int engine) : EltwiseImpl(gp, engine) { top_layout_type = LIBXSMM_CUSTOM_LAYOUT; top_layout = NULL; for(int n=0; nnInput.size(); n++) { gbot_layout_type.push_back(LIBXSMM_CUSTOM_LAYOUT); gbot_layout.push_back(NULL); } } void forwardPropagate(vector& inp, TensorBuf* outp, int tid); void backPropagate(TensorBuf* deloutp, vector& delinp, int tid); void convert_NCHW_to_NCHWV(float*, int, int, int, int, float*); void convert_NCHWV_to_NCHW(float*, int, int, int, int, float*); }; libxsmm-1.17/samples/deeplearning/gxm/include/Engine.fwd.hpp000066400000000000000000000020211415223013700241220ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. 
* * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once class MLEngine; class ConvNode; class ConvParams; class FusedBNormNode; class FCNode; class FCparams; class PoolingNode; class PoolingParams; class ImageDataNode; class ImageDataParams; class SolverNode; class SolverParams; class NNNode; class NNParams; libxsmm-1.17/samples/deeplearning/gxm/include/Engine.hpp000066400000000000000000000153641415223013700233610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include #include #include #include #include #include #include "proto/gxm.pb.h" #include "Engine.fwd.hpp" #include "MLNode.fwd.hpp" #include "Config.hpp" #include "Task.hpp" #include "common.hpp" #include "Solver.hpp" #include "libxsmm.h" #ifdef USE_MLSL #include "mpi.h" #endif using namespace std; using namespace gxm; extern int iter; #ifdef USE_MLSL #include "mlsl.hpp" //using namespace MLSL; #endif #define TRAIN 0 #define VAL 1 #define TEST 2 #define START_GUARD_BAND 64 #define END_GUARD_BAND 64 #define CANARY 0x7F #define NDIFFS 10 struct dupChecker_ { inline dupChecker_() : tmpSet() {} inline bool operator()(Task *t) { return tmpSet.insert(t).second; } private: std::set tmpSet; }; class MLEngine { protected: NTGParameter ntgparam_; NodeParameter np_; SolverParameter sparam_; #ifdef USE_MLSL MLSL::Distribution *data_parallelism; MLSL::Session *session_; #endif vector ntg_; list etg_[3]; // 0 - Training, 1 - Validation, 2 - testing SolverParams *solverParams_; SolverNode* solver_; Tensor* tenScratch_; TensorBuf* tenScratchBuf_; struct TensorPair { string name; Tensor* t; }; typedef list TensorList; typedef TensorList::iterator Iter; typedef map Tmap; Tmap inTensorMap_, outTensorMap_, weightTensorMap_, biasTensorMap_, statsTensorMap_; TensorList defTList_, inTList_, outTList_, wTList_, biasTList_, statsTList_; bool inferenceOnly_, load_from_checkpoint_; string checkpoint_dir_, checkpoint_format_; int num_epochs_, exec_mode_, current_epoch_, current_batch_; int data_type_; int num_machines_, num_machine_groups_, num_threads_; int batch_size_, num_train_batches_, num_test_batches_, num_test_views_; int global_node_id_; float lr_, *wt_lr_mult_[NUM_NUMA_NODES], *wt_decay_mult_[NUM_NUMA_NODES]; float *bias_lr_mult_[NUM_NUMA_NODES], *bias_decay_mult_[NUM_NUMA_NODES]; float scf_=0; void *input_buf_=NULL; void *fact_buf_=NULL, *bact_buf_=NULL, *wbuf_=NULL; void 
*weight_buf_[NUM_NUMA_NODES]={NULL}, *wdiff_buf_[NUM_NUMA_NODES]={NULL}; void *winc_buf_[NUM_NUMA_NODES]={NULL}, *lpweight_buf_[NUM_NUMA_NODES]={NULL}; void *lpwdiff_buf_[NUM_NUMA_NODES]={NULL}; #if 1 void *bias_buf_[NUM_NUMA_NODES]={NULL}, *bidiff_buf_[NUM_NUMA_NODES]={NULL}; void *biinc_buf_[NUM_NUMA_NODES]={NULL}, *stats_buf_[NUM_NUMA_NODES]={NULL}; #else void *bias_buf_=NULL, *bidiff_buf_=NULL; void *biinc_buf_=NULL, *stats_buf_=NULL; #endif int total_weights_, total_biases_, orig_total_weights_; void *scratch[NUM_NUMA_NODES]={NULL}; vector input_can_ptr; vector fact_can_ptr, bact_can_ptr; vector wt_can_ptr, wdiff_can_ptr, winc_can_ptr; vector bias_can_ptr, stats_can_ptr, bidiff_can_ptr, biinc_can_ptr; #ifdef USE_MLSL vector wtgrad_comms_vec, bias_grad_comms_vec, combo_grad_comms_vec; #endif int ic, fac, bac, wtc, wdc, wic, bic, sic, bidc, biic; void create_schedule(int); void optimize_schedule(int); void allocate_tensor_memory(Tensor*, int, void*); void clear_history(TensorList); int find_in_nodeTypeList(string); void checkpoint(TensorList L, int); void read_checkpoint_file(TensorBuf*, string, string); void load_checkpoint(TensorList, int, string); void canary_check(void*, vector&, int); void allocate_memory(string, TensorList, int, vector&, int*, long long int*); void* allocate_gradient_tensor(TensorList, int, int, long long int); void insertSplitNodes(NTGParameter& p, NTGParameter* ps); void convert_f32_bf16(float* in, libxsmm_bfloat16* out, int len, int numa_node); void convert_f32_bf16(float** in, libxsmm_bfloat16** out, int len); void convert_bf16_f32(libxsmm_bfloat16* in, float* out, int len); void waitForComms(string); public: MLEngine() {} virtual ~MLEngine() {} void create(int mode, string ntgConfig, string solverConfig); bool register_tensor(string name, int type, Tensor* t); Tensor* get_tensor(string name, int type); void execute_on_thread(int num_threads, MLNode* node, void (*fname)(int tid)); void set_global_strategy(MachineParameter* mparam); 
void run(int mode); SolverNode* getSolver() { return solver_; } TensorBuf* getScratchBuffer() { return tenScratchBuf_; } bool is_inference_only() { return inferenceOnly_; } int get_num_threads() { return num_threads_; } int get_num_machines() { return num_machines_; } int get_num_machine_groups() { return num_machine_groups_; } int get_num_epochs() { return num_epochs_;} int get_current_epoch() { return current_epoch_; } int get_current_batch() { return current_batch_; } int get_execution_mode() { return exec_mode_; } int get_global_node_id() { return global_node_id_; } int get_num_train_batches() { return num_train_batches_; } int get_num_test_batches() { return num_test_batches_; } int get_num_test_views() {return num_test_views_; } int get_batch_size() { return batch_size_; } float get_scaling_factor() { return scf_; } #ifdef USE_MLSL vector& get_wtgrad_comms_vec() { return wtgrad_comms_vec; } vector& get_bias_grad_comms_vec() { return bias_grad_comms_vec; } vector& get_combo_grad_comms_vec() { return combo_grad_comms_vec; } #endif void set_batch_size(int b) {batch_size_ = b; } void set_num_train_batches(int ntrainb) {num_train_batches_ = ntrainb; } void set_num_test_batches(int ntestb) {num_test_batches_ = ntestb; } void set_num_test_views(int ntestv) {num_test_views_ = ntestv; } void set_learning_rate(float lr) { lr_ = lr; } void set_scaling_factor(float scf) { scf_ = scf; } #ifdef USE_MLSL MLSL::Distribution* get_distribution() { return data_parallelism; } MLSL::Session *get_session() { return session_; } #endif }; libxsmm-1.17/samples/deeplearning/gxm/include/FCImpl.hpp000066400000000000000000000056101415223013700232570ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include #include "common.hpp" #include "check.hpp" #include "Tensor.hpp" typedef struct { string node_name; int nInput, nOutput; int batch_size; int iHeight, iWidth; int oHeight, oWidth; int kh, kw; bool bias_term; int in_data_type, out_data_type; int algType; int num_numa_nodes; int num_threads; } FCImplParams; class FCImpl { protected: FCImplParams* gp; int engine; TensorLayoutType bot_layout_type, top_layout_type, gbot_layout_type; void *bot_layout=NULL, *top_layout=NULL, *gbot_layout=NULL; int top_compute_engine=-1; int bot_compute_engine=-1; string nname; TensorBuf* scratchp; public: FCImpl(FCImplParams* gp_, int engine_): gp(gp_), engine(engine_) {} void set_top_compute_engine(int e) { top_compute_engine = e;} void set_bot_compute_engine(int e) { bot_compute_engine = e;} void set_node_name(string s) { nname = s; } void set_scratch_buffer(TensorBuf* sb) { scratchp = sb; } virtual void forwardPropagate(TensorBuf *inp, TensorBuf* weightp, TensorBuf *hweightp, TensorBuf* biasp, TensorBuf *outp, int tid) = 0; virtual void backPropagate(TensorBuf *deloutp, TensorBuf* weightp, TensorBuf *delinp, int tid) = 0; virtual void weightUpdate(TensorBuf *deloutp, TensorBuf *inp, TensorBuf *delweightp, TensorBuf *delbiasp, int tid) = 0; virtual void forwardPropagate(TensorBuf *inp, TensorBuf* weightp, TensorBuf *hweightp, TensorBuf* biasp, TensorBuf *outp) { switch(engine) { case XSMM: forwardPropagate(inp, weightp, hweightp, biasp, outp, 0); break; } } virtual void backPropagate(TensorBuf *deloutp, TensorBuf *weightp, TensorBuf *delinp) { switch(engine) { case XSMM: backPropagate(deloutp, weightp, delinp, 0); break; } } virtual void 
weightUpdate(TensorBuf *deloutp, TensorBuf *inp, TensorBuf *delweightp, TensorBuf *delbiasp) { switch(engine) { case XSMM: weightUpdate(deloutp, inp, delweightp, delbiasp, 0); break; } } }; libxsmm-1.17/samples/deeplearning/gxm/include/FCXSMM.hpp000066400000000000000000000047311415223013700231450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include "FCImpl.hpp" #include "libxsmm.h" #define VLEN 16 #define CHKERR_LIBXSMM_DNN(A) if ( A != LIBXSMM_DNN_SUCCESS )\ {\ fprintf(stdout, "%s, %s\n", gp->node_name.c_str(), libxsmm_dnn_get_error(A) );\ fflush(stdout);\ } class FCXSMM : public FCImpl { protected: FCImpl *gp_; libxsmm_dnn_fullyconnected_desc fullyconnected_desc; libxsmm_dnn_fullyconnected* libxsmm_handle[NUM_NUMA_NODES]; libxsmm_dnn_tensor* libxsmm_input[NUM_NUMA_NODES]={NULL}; libxsmm_dnn_tensor* libxsmm_delinput[NUM_NUMA_NODES]={NULL}; libxsmm_dnn_tensor* libxsmm_output[NUM_NUMA_NODES]={NULL}; libxsmm_dnn_tensor* libxsmm_deloutput[NUM_NUMA_NODES]={NULL}; libxsmm_dnn_tensor* libxsmm_filter[NUM_NUMA_NODES]={NULL}; libxsmm_dnn_tensor* libxsmm_checkpoint_filter=NULL; libxsmm_dnn_tensor* libxsmm_checkpoint_history_filter=NULL; libxsmm_dnn_tensor* libxsmm_delfilter[NUM_NUMA_NODES]={NULL}; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; bool updated_scratch_fwd=false, updated_scratch_bwd=false, updated_scratch_upd=false; void 
*scratch[NUM_NUMA_NODES]={NULL}; int prev_scratch_size = 0; public: FCXSMM(FCImplParams* gp, int engine); virtual ~FCXSMM(void) {} bool firstTimeFwd=true, firstTimeBwd=true; void forwardPropagate(TensorBuf *inp, TensorBuf *weightp, TensorBuf *hweightp, TensorBuf *biasp, TensorBuf *outp, int tid); void backPropagate(TensorBuf *deloutp, TensorBuf* weightp, TensorBuf *delinp, int tid); void weightUpdate(TensorBuf *deloutp, TensorBuf *inp, TensorBuf *delweightp, TensorBuf *delbiasp, int tid); }; libxsmm-1.17/samples/deeplearning/gxm/include/FullyConnected.hpp000066400000000000000000000157721415223013700250750ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include "assert.h" #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Solver.hpp" #include "common.hpp" #include "io.hpp" #include "proto/gxm.pb.h" #include "FCImpl.hpp" #include "FCXSMM.hpp" using namespace std; using namespace gxm; class FCParams : public NNParams { public: FCParams(void) {} virtual ~FCParams(void) {} void set_nOutput(int num_output) { this->nOutput_ = num_output; } int get_output() { return nOutput_; } void set_activation_filler_type(string ftype) { afiller_type_ = ftype; } string get_activation_filler_type() { return afiller_type_; } void set_weight_filler_type(string ftype) { wfiller_type_ = ftype; } string get_weight_filler_type() { return wfiller_type_; } void set_std(float s) { std_ = s; } float get_std() { return std_; } void set_variance_norm(int v) { variance_norm_ = v; } int get_variance_norm() { return variance_norm_; } void set_bias_filler_type(string ftype) { bfiller_type_ = ftype; } string get_bias_filler_type() { return bfiller_type_; } void set_bias_term(bool bias) { bias_term_ = bias; } bool get_bias_term() { return bias_term_; } void set_value(float v) { value_ = v; } float get_value() { return value_; } void set_timeSteps(int nt) { this->timesteps_ = nt; } void set_transpose_flag(bool xpose) { transpose_ = xpose; } bool get_transpose_flag() { return transpose_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } void set_compute_engine(int ce) { compute_engine_ = ce; } int get_compute_engine() { return compute_engine_; } void set_algo_type(int at) { algotype_ = at; } int get_algo_type() { return algotype_; } void set_global_params(vector psv) { for(int i=0; i& get_lr_mult() { return lr_mult_; } const vector& get_decay_mult() { return decay_mult_; } protected: int nOutput_, data_type_; int timesteps_, compute_engine_, algotype_; int variance_norm_; bool 
transpose_; string wfiller_type_, bfiller_type_, afiller_type_; float std_, value_; bool bias_term_; vector lr_mult_, decay_mult_; }; static MLParams* parseFCParams(NodeParameter* np) { FCParams* fcp = new FCParams(); // Set name of node assert(!np->name().empty()); fcp->set_node_name(np->name()); //Set node type (Convolution, FullyConnected, etc) assert(!np->type().empty()); fcp->set_node_type(np->type()); //Set tensor names assert(np->bottom_size() == 1); assert(!np->bottom(0).empty()); fcp->set_bottom_names(np->bottom(0)); assert(np->top_size() == 1); assert(!np->top(0).empty()); fcp->set_top_names(np->top(0)); //Set Mode for the node assert((np->mode() == TRAIN) || (np->mode() == TEST)); fcp->set_mode(np->mode()); //Set backprop needed/not needed flag for this node fcp->set_bprop_flag(np->propagate_down()); // Set global parameters such as learning rate multiplier etc. vector psv; for(int i=0; iparam_size(); i++) psv.push_back(np->param(i)); fcp->set_global_params(psv); FullyConnectedParameter pfcp = np->fc_param(); int num_output = pfcp.num_output(); fcp->set_nOutput(num_output); FillerParameter wp = pfcp.weight_filler(); fcp->set_weight_filler_type(wp.type()); fcp->set_std(wp.std()); fcp->set_variance_norm(wp.variance_norm()); bool bias_term = pfcp.bias_term(); fcp->set_bias_term(bias_term); if(bias_term) { FillerParameter bp = pfcp.bias_filler(); fcp->set_bias_filler_type(bp.type()); fcp->set_value(bp.value()); } bool xpose = pfcp.transpose(); if(xpose) fcp->set_transpose_flag(xpose); bool activation_term = pfcp.activation_term(); if(activation_term) { FillerParameter ap = pfcp.activation_filler(); fcp->set_activation_filler_type(ap.type()); fcp->set_value(ap.value()); } int nt = pfcp.num_timesteps(); fcp->set_timeSteps(nt); fcp->set_data_type(pfcp.data_type()); fcp->set_compute_engine(pfcp.engine()); fcp->set_algo_type(pfcp.algotype()); return fcp; } class FCNode: public NNNode { public: FCNode(FCParams *p, MLEngine* e); virtual ~FCNode(void) {} string 
get_weight_filler_type() { return wfiller_type_; } float get_std() { return std_; } string get_bias_filler_type() { return bfiller_type_; } float get_value() { return value_; } void fillWeightBuffers(TensorBuf* tBuf, int buftype, long long int size); void fillWeightMultipliers(float* lr_mult, float* decay_mult, long long int bytes); void fillBiasBuffers(TensorBuf* tBuf, int buftype, long long int size); void fillBiasMultipliers(float *lr_mult, float *decay_mult, long long int bytes); void Checkpoint(TensorBuf *ptr, string name, string format); void convert_bf16_f32(libxsmm_bfloat16*, float*, int); void convert_f32_bf16(float*, libxsmm_bfloat16*, int); protected: void forwardPropagate(); void backPropagate(); void weightUpdate(); void solverStep(); void truncate_mask_fp32_bfp16(float* in, float* out, unsigned int len); void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } void configure(int engine); Tensor *tenTop_=NULL; // Output tensor pointer Tensor *tenBot_=NULL; // Input tensor pointer Tensor *tenWeight_=NULL; // Weight tensor pointer Tensor *tenBias_=NULL; FCImplParams gparams_; TensorBuf *tenBotDiff_=NULL, *tenBotData_=NULL; TensorBuf *tenTopData_=NULL, *tenTopDiff_=NULL; TensorBuf *tenWeightDiff_=NULL, *tenWeightData_=NULL, *tenWeightInc_=NULL; TensorBuf *tenBiasData_=NULL, *tenBiasDiff_=NULL, *tenBiasInc_=NULL; TensorBuf *tenScratchData_=NULL; Shape bs_, ts_, ws_; int bot_cengine_; int count_; string wfiller_type_, bfiller_type_; string weight_, bias_; float std_, value_; int variance_norm_; float *stptr=NULL, cbptr[16]; int in_dtype, out_dtype; float *dwptr=NULL; vector lr_mult_, decay_mult_; FCImpl* impl; SolverNode* solver_; MLEngine* eptr_; }; libxsmm-1.17/samples/deeplearning/gxm/include/FusedBNorm.hpp000066400000000000000000000171661415223013700241620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. 
* * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" #include "fillers.hpp" #include "FusedBNormImpl.hpp" #include "FusedBNormXSMM.hpp" using namespace std; using namespace gxm; class FusedBNormParams : public NNParams { public: FusedBNormParams(void) {} virtual ~FusedBNormParams(void) {} void set_strides(int sdims, int stride) { for(int i=0; istrides_.push_back(stride); } void set_strides(int sh, int sw, int sd) { this->strides_.push_back(sh); this->strides_.push_back(sw); this->strides_.push_back(sd); } vector& get_strides() { return strides_; } void set_pads(int pdims, int pad) { for(int i=0; ipads_.push_back(pad); } void set_pads(int ph, int pw, int pd) { this->pads_.push_back(ph); this->pads_.push_back(pw); this->pads_.push_back(pd); } vector& get_pads() { return pads_; } void set_ipads(int ipdims, int ipad) { for(int i=0; i& get_ipads() { return ipads_; } void set_lr_mult(float lr) {lr_mult_ = lr;} float get_lr_mult() { return lr_mult_; } void set_decay_mult(float decay) { decay_mult_ = decay;} float get_decay_mult() { return decay_mult_; } void set_eps(float eps) { eps_ = eps; } float get_eps() { return eps_; } void set_mmf(float mmf) { mmf_ = mmf; } float get_mmf() { return mmf_; } void set_global_stats_flag(bool s) { use_global_stats_ = s; } bool get_global_stats_flag() { return use_global_stats_; } void set_relu(bool r) { relu_ = r; } bool get_relu() { return relu_; } void set_bwd_relu(bool br) { brelu_ = br; } bool get_bwd_relu() { return brelu_; } void 
set_eltwise(bool e) { eltwise_ = e; } bool get_eltwise() { return eltwise_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } void set_compute_engine(int ce) { compute_engine_ = ce; } int get_compute_engine() { return compute_engine_; } void set_algo_type(int at) { algotype_ = at; } int get_algo_type() { return algotype_; } protected: vector strides_; vector pads_, ipads_; bool relu_, brelu_, eltwise_, use_global_stats_; float eps_, mmf_, lr_mult_, decay_mult_; int compute_engine_, algotype_, data_type_; }; static MLParams* parseFusedBNormParams(NodeParameter* np) { FusedBNormParams* fbnp = new FusedBNormParams(); // Set name of node string str = np->name(); assert(!str.empty()); fbnp->set_node_name(str); //Set node type (FusedBNorm) str = np->type(); assert(!str.empty()); fbnp->set_node_type(str); //Set tensor names for(int i=0; ibottom_size(); i++) { assert(!np->bottom(i).empty()); fbnp->set_bottom_names(np->bottom(i)); } assert(!np->top(0).empty()); fbnp->set_top_names(np->top(0)); //Set Mode for the node assert((np->mode() == TRAIN) || (np->mode() == TEST)); fbnp->set_mode(np->mode()); //Set backprop needed/not needed flag for this node fbnp->set_bprop_flag(np->propagate_down()); FusedBNormParameter p = np->fused_bnorm_param(); int sdims = p.stride_size(); switch(sdims) { int sh, sw, sd=0; case 0: sh = p.stride_h(); sw = p.stride_w(); assert((sh > 0) && (sw > 0)); fbnp->set_strides(sh, sw, sd); break; case 1: sh = p.stride(0); fbnp->set_strides(sh, sh, sd); break; case 2: sh = p.stride(0); sw = p.stride(1); fbnp->set_strides(sh, sw, sd); break; case 3: sh = p.stride(0); sw = p.stride(1); sd = p.stride(2); fbnp->set_strides(sh, sw, sd); break; } // pads int pdims = p.pad_size(); switch(pdims) { int ph, pw, pd=0; case 0: ph = p.pad_h(); pw = p.pad_w(); fbnp->set_pads(ph, pw, pd); break; case 1: ph = p.pad(0); fbnp->set_pads(ph, ph, pd); break; case 2: ph = p.pad(0); pw = p.pad(1); fbnp->set_pads(ph, pw, pd); break; case 3: 
ph = p.pad(0); pw = p.pad(1); pd = p.pad(2); fbnp->set_pads(ph, pw, pd); break; } // input pads int ipdims = p.ipad_size(); switch(ipdims) { int iph, ipw, ipd=0; case 0: iph = p.ipad_h(); ipw = p.ipad_w(); fbnp->set_ipads(iph, ipw, ipd); break; case 1: iph = p.ipad(0); fbnp->set_ipads(iph, iph, ipd); break; case 2: iph = p.ipad(0); ipw = p.ipad(1); fbnp->set_ipads(iph, ipw, ipd); break; case 3: iph = p.ipad(0); ipw = p.ipad(1); ipd = p.ipad(2); fbnp->set_ipads(iph, ipw, ipd); break; } fbnp->set_lr_mult(p.lr_mult()); fbnp->set_decay_mult(p.decay_mult()); fbnp->set_mmf(p.mmf()); fbnp->set_eps(p.eps()); fbnp->set_global_stats_flag(p.use_global_stats()); fbnp->set_relu(p.relu()); fbnp->set_bwd_relu(p.bwd_relu()); fbnp->set_eltwise(p.eltwise()); fbnp->set_data_type(p.data_type()); fbnp->set_compute_engine(p.engine()); fbnp->set_algo_type(p.algotype()); return fbnp; } class FusedBNormNode : public NNNode { public: FusedBNormNode(FusedBNormParams* p, MLEngine* e); void Checkpoint(TensorBuf *tBuf, string name, string format); void fillBuffer(TensorBuf *tBuf, int buftype, long long int bytes); void fillBiasMultipliers(float* lr_mult, float* decay_mult, long long int bytes); void convert_bf16_f32(libxsmm_bfloat16* in, float* out, int len); virtual ~FusedBNormNode(void) {} protected: void forwardPropagate(); void backPropagate(); void weightUpdate(); void solverStep(); void configure(int engine); void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } Tensor* tenTop_; vector tenBot_; Tensor *tenScale_, *tenShift_; Tensor *tenMean_, *tenVar_; FusedBNormImplParams gparams_; vector tenBotDiff_, tenBotData_; TensorBuf *tenTopData_, *tenTopDiff_; // Output data TensorBuf *tenScaleData_, *tenScaleDiff_; TensorBuf *tenShiftData_, *tenShiftDiff_; TensorBuf *tenScaleInc_, *tenShiftInc_; TensorBuf *tenMeanData_, *tenVarData_; TensorBuf *tenScratchData_; float *gmean_, *gvar_, eps, lr_mult_, decay_mult_; float *stptr=NULL,*cbptr; string scale_, shift_, mean_, var_; bool 
first_fp=true, first_bp=true; int count_, in_dtype, out_dtype; float scf_=0; vector bot_cengine_; Shape ts_; FusedBNormImpl *impl=NULL; SolverNode *solver_; MLEngine* eptr_; }; libxsmm-1.17/samples/deeplearning/gxm/include/FusedBNormImpl.hpp000066400000000000000000000054211415223013700247730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include #include "common.hpp" #include "check.hpp" #include "Tensor.hpp" typedef struct { string node_name; int bdims, tdims; vector nInput; int nOutput; int pad_h, pad_w; int ipad_h, ipad_w; int stride_h, stride_w; int iHeight, iWidth; int oHeight, oWidth; int batch_size; float eps, mmf; bool relu, bwd_relu; bool eltwise, use_global_stats; string exec_mode; int algType; int in_data_type, out_data_type; int num_threads; int num_numa_nodes; }FusedBNormImplParams; class FusedBNormImpl { protected: FusedBNormImplParams *gp; int engine; TensorLayoutType bot_layout_type, top_layout_type, gbot_layout_type; void *bot_layout, *top_layout, *gbot_layout; int top_compute_engine=-1; int bot_compute_engine=-1; bool use_global_stats; string nname; TensorBuf* scratchp; float scaling_factor_; public: FusedBNormImpl(FusedBNormImplParams* gp_, int engine_): gp(gp_), engine(engine_) {} void set_top_compute_engine(int e) { top_compute_engine = e;} void set_bot_compute_engine(int e) { bot_compute_engine = e;} void set_node_name(string s) { nname = s; } void set_scratch_buffer(TensorBuf* sb) { 
scratchp = sb; } void set_global_stats(bool s) { use_global_stats = s; } void set_scaling_factor(float s) { scaling_factor_ = s; } // Assume external threading, e.g., #pragma omp virtual void forwardPropagate(vector inp, TensorBuf* gammap, TensorBuf* betap, TensorBuf* gmeanp, TensorBuf* gvarp, TensorBuf *outp, int tid) { switch(engine) { case XSMM: forwardPropagate(inp, gammap, betap, gmeanp, gvarp, outp, tid); break; } } virtual void backPropagate(TensorBuf *deloutp, TensorBuf *delgammap, TensorBuf *delbetap, vector delinp, int tid) { switch(engine) { case XSMM: backPropagate(deloutp, delgammap, delbetap, delinp, tid); break; } } }; libxsmm-1.17/samples/deeplearning/gxm/include/FusedBNormXSMM.hpp000066400000000000000000000073621415223013700246640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include "FusedBNormImpl.hpp" #include "check.hpp" #include "libxsmm.h" #define CHKERR_LIBXSMM_DNN(A) if ( A != LIBXSMM_DNN_SUCCESS )\ {\ fprintf(stdout, "%s, %s\n", gp->node_name.c_str(), libxsmm_dnn_get_error(A) );\ fflush(stdout);\ } class FusedBNormXSMM : public FusedBNormImpl { protected: FusedBNormImpl *gp_; libxsmm_dnn_fusedbatchnorm_desc fusedbn_desc_train[2]; libxsmm_dnn_fusedbatchnorm_desc fusedbn_desc_test; libxsmm_dnn_fusedbatchnorm* libxsmm_handle_train[2][NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_fusedbatchnorm* libxsmm_handle_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_input_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_input_add_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_output_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_relumask_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_expectval_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_stddev_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_variance_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_gamma_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_beta_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_input_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_input_add_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_output_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_relumask_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_expectval_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_stddev_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_variance_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_gamma_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_beta_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_delinput[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_delinput_add[NUM_NUMA_NODES] = 
{NULL}; libxsmm_dnn_tensor* libxsmm_deloutput[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_delgamma[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_delbeta[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; float *bexpect[NUM_NUMA_NODES]={NULL}, *bstddev[NUM_NUMA_NODES]={NULL}, *bvariance[NUM_NUMA_NODES]={NULL}; void *relu_mask[NUM_NUMA_NODES]={NULL}; void *scratch=NULL; bool updated_scratch_fwd=false, updated_scratch_bwd=false; int nBlocksFm, ofmblock; float *sumscratch=NULL; public: FusedBNormXSMM(FusedBNormImplParams* gp, int engine); virtual ~FusedBNormXSMM(void) {} // Assume external threading, e.g., #pragma omp void forwardPropagate(vector inp, TensorBuf* gammap, TensorBuf* betap, TensorBuf *gmeanp, TensorBuf *gvarp, TensorBuf *outp, int tid); void backPropagate(TensorBuf *deloutp, TensorBuf *delgammap, TensorBuf *delbetap, vector delinp, int tid); }; libxsmm-1.17/samples/deeplearning/gxm/include/FusedConvBN.hpp000066400000000000000000000302531415223013700242620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include "assert.h" #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "Solver.hpp" #include "proto/gxm.pb.h" #include "FusedConvBNImpl.hpp" #include "FusedConvBNXSMM.hpp" #define VLEN 16 using namespace std; using namespace gxm; class FusedConvBNParams : public NNParams { public: FusedConvBNParams(void) {} virtual ~FusedConvBNParams(void) {} void set_kernel_dims(int kdims, int ksize) { for(int i=0; i& get_kernel_dims() { return kernel_dim_; } void set_bn_strides(int sdims, int stride) { for(int i=0; i& get_bn_strides() { return bn_strides_; } void set_c_strides(int sdims, int stride) { for(int i=0; i& get_c_strides() { return c_strides_; } void set_bot_pads(int pdims, int pad) { for(int i=0; i& get_bot_pads() { return bot_pads_; } void set_top_pads(int pdims, int pad) { for(int i=0; i& get_top_pads() { return top_pads_; } void set_mid_pads(int pdims, int pad) { for(int i=0; i& get_mid_pads() { return mid_pads_; } void set_group(int g) { this->group_ = g;} int get_group() { return this->group_; } void set_nOutput(int num_output) { this->nOutput_ = num_output; } int get_output() { return nOutput_; } void set_weight_filler_type(string ftype) { wfiller_type_ = ftype; } string get_weight_filler_type() { return wfiller_type_; } void set_std(float s) { std_ = s; } float get_std() { return std_; } void set_variance_norm(int v) { variance_norm_ = v; } int get_variance_norm() { return variance_norm_; } void set_eps(float eps) { eps_ = eps; } float get_eps() { return eps_; } void set_mmf(float mmf) { mmf_ = mmf; } float get_mmf() { return mmf_; } void set_global_stats_flag(bool s) { use_global_stats_ = s; } bool get_global_stats_flag() { return use_global_stats_; } void set_eltwise(bool e) { eltwise_ = e; } bool get_eltwise() { return eltwise_; } void set_relu_fwd(bool relu_fwd) { relu_fwd_ = relu_fwd; } bool 
get_relu_fwd() { return relu_fwd_; } void set_relu_bwd(bool br) { relu_bwd_ = br; } bool get_relu_bwd() { return relu_bwd_; } void set_physical_padding(bool p) { phys_pad_ = p; } bool get_physical_padding() { return phys_pad_; } void set_compute_engine(int ce) { compute_engine_ = ce; } int get_compute_engine() { return compute_engine_; } void set_algo_type(int at) { algotype_ = at; } int get_algo_type() { return algotype_; } void set_global_params(vector psv) { for(int i=0; i& get_lr_mult() { return lr_mult_; } const vector& get_decay_mult() { return decay_mult_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } protected: vector kernel_dim_; vector c_strides_, bn_strides_; vector bot_pads_, mid_pads_, top_pads_; int nOutput_; string wfiller_type_; float std_, eps_, mmf_; bool relu_fwd_, relu_bwd_; bool phys_pad_, use_global_stats_, eltwise_; int group_, compute_engine_, algotype_; int variance_norm_, data_type_; vector lr_mult_, decay_mult_; }; static MLParams* parseFusedConvBNParams(NodeParameter* np) { FusedConvBNParams* fcbnp = new FusedConvBNParams(); // Set name of node string str = np->name(); assert(!str.empty()); fcbnp->set_node_name(str); //Set node type (Convolution, FullyConnected, etc) str = np->type(); assert(!str.empty()); fcbnp->set_node_type(str); //Set tensor names for(int i=0; ibottom_size(); i++) { assert(!np->bottom(i).empty()); fcbnp->set_bottom_names(np->bottom(i)); } for(int i=0; itop_size(); i++) { assert(!np->top(i).empty()); fcbnp->set_top_names(np->top(i)); } //Set Mode for the node assert((np->mode() == TRAIN) || (np->mode() == TEST)); fcbnp->set_mode(np->mode()); //Set backprop needed/not needed flag for this node fcbnp->set_bprop_flag(np->propagate_down()); vector psv; for(int i=0; iparam_size(); i++) psv.push_back(np->param(i)); fcbnp->set_global_params(psv); FusedConvBNParameter pcp = np->fused_conv_bn_param(); int kdims = pcp.kernel_size_size(); switch(kdims) { int kh, kw; case 0: kh = 
pcp.kernel_h(); kw = pcp.kernel_w(); assert((kh > 0) && (kw > 0)); fcbnp->set_kernel_dims(kh, kw, 0); break; case 1: kh = pcp.kernel_size(0); fcbnp->set_kernel_dims(kh, kh, 0); break; case 2: kh = pcp.kernel_size(0); kw = pcp.kernel_size(1); assert(pcp.ndims() == 2); fcbnp->set_kernel_dims(kh, kw, 0); break; default: printf("illegal kernel dimension size\n"); break; } // conv strides int sdims = pcp.c_stride_size(); switch(sdims) { int sh, sw, sd; case 0: sh = pcp.c_stride_h(); sw = pcp.c_stride_w(); assert((sh > 0) && (sw > 0)); fcbnp->set_c_strides(sh, sw, 0); break; case 1: sh = pcp.c_stride(0); fcbnp->set_c_strides(sh, sh, 0); break; case 2: sh = pcp.c_stride(0); sw = pcp.c_stride(1); assert(pcp.ndims() == 2); fcbnp->set_c_strides(sh, sw, 0); break; } // bn strides sdims = pcp.bn_stride_size(); switch(sdims) { int sh, sw, sd; case 0: sh = pcp.bn_stride_h(); sw = pcp.bn_stride_w(); assert((sh > 0) && (sw > 0)); fcbnp->set_bn_strides(sh, sw, 0); break; case 1: sh = pcp.bn_stride(0); fcbnp->set_bn_strides(sh, sh, 0); break; case 2: sh = pcp.bn_stride(0); sw = pcp.bn_stride(1); assert(pcp.ndims() == 2); fcbnp->set_bn_strides(sh, sw, 0); break; } // input pads int pdims = pcp.ipad_size(); switch(pdims) { int ph, pw, pd; case 0: ph = pcp.ipad_h(); pw = pcp.ipad_w(); fcbnp->set_bot_pads(ph, pw, 0); break; case 1: ph = pcp.ipad(0); fcbnp->set_bot_pads(ph, ph, 0); break; case 2: ph = pcp.ipad(0); pw = pcp.ipad(1); assert(pcp.ndims() == 2); fcbnp->set_bot_pads(ph, pw, 0); break; } // middle pads pdims = pcp.mpad_size(); switch(pdims) { int ph, pw, pd; case 0: ph = pcp.mpad_h(); pw = pcp.mpad_w(); fcbnp->set_mid_pads(ph, pw, 0); break; case 1: ph = pcp.mpad(0); fcbnp->set_mid_pads(ph, ph, 0); break; case 2: ph = pcp.mpad(0); pw = pcp.mpad(1); assert(pcp.ndims() == 2); fcbnp->set_mid_pads(ph, pw, 0); break; } // output pads int opdims = pcp.opad_size(); switch(opdims) { int oph, opw, opd; case 0: oph = pcp.opad_h(); opw = pcp.opad_w(); fcbnp->set_top_pads(oph, opw, 0); 
break; case 1: oph = pcp.opad(0); fcbnp->set_top_pads(oph, oph, 0); break; case 2: oph = pcp.opad(0); opw = pcp.opad(1); assert(pcp.ndims() == 2); fcbnp->set_top_pads(oph, opw, 0); break; } if(pcp.group() > 1) fcbnp->set_group(pcp.group()); else fcbnp->set_group(1); int nOutput = pcp.num_output(); fcbnp->set_nOutput(nOutput); fcbnp->set_mmf(pcp.mmf()); fcbnp->set_eps(pcp.eps()); fcbnp->set_global_stats_flag(pcp.use_global_stats()); fcbnp->set_relu_fwd(pcp.relu_fwd()); fcbnp->set_relu_bwd(pcp.relu_bwd()); FillerParameter wp = pcp.weight_filler(); fcbnp->set_weight_filler_type(wp.type()); fcbnp->set_std(wp.std()); fcbnp->set_variance_norm(wp.variance_norm()); fcbnp->set_eltwise(pcp.eltwise()); fcbnp->set_physical_padding(pcp.physical_padding()); fcbnp->set_data_type(pcp.data_type()); fcbnp->set_compute_engine(pcp.engine()); fcbnp->set_algo_type(pcp.algotype()); return fcbnp; } class FusedConvBNNode : public NNNode { public: FusedConvBNNode(FusedConvBNParams* p, MLEngine* e); virtual ~FusedConvBNNode(void) {} string get_weight_filler_type() { return wfiller_type_; } float get_std() { return std_; } void fillWeightBuffers(TensorBuf* tBuf, int buftype, long long int size); void fillBuffer(TensorBuf* tBuf, int buftype, long long int size); void fillWeightMultipliers(float* lr_mult, float* decay_mult, long long int bytes); void fillBiasMultipliers(float* lr_mult, float* decay_mult, long long int bytes); void Checkpoint(TensorBuf* tBuf, string name, string format); void convert_f32_bf16(float* in, libxsmm_bfloat16* out, int len); void convert_bf16_f32(libxsmm_bfloat16* in, float* out, int len); protected: void forwardPropagate(); void backPropagate(); void weightUpdate(); void solverStep(); void configure(int engine); void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } vector tenBot_; Tensor *tenMid_, *tenTop_, *tenWeight_, *tenScale_, *tenShift_, *tenMean_, *tenVar_; FusedConvBNImplParams gparams_; vector tenBotDiff_, tenBotData_; // Data & Gradients with respect 
to input TensorBuf *tenMidData_, *tenTopData_; TensorBuf *tenMidDiff_=NULL, *tenTopDiff_; TensorBuf *tenWeightDiff_, *tenWeightData_, *tenWeightInc_; // Weight gradients, data, increments TensorBuf *tenScaleData_, *tenScaleDiff_, *tenScaleInc_; // Gamma data, gradients, increments TensorBuf *tenShiftData_, *tenShiftDiff_, *tenShiftInc_; // Beta data, gradients, increments TensorBuf *tenMeanData_, *tenVarData_; // Mean, variance data TensorBuf *tenScratchData_; int in_dtype, out_dtype; Shape ts_, ws_, ms_; string wfiller_type_; string weight_, scale_, shift_, mean_, var_; int variance_norm_; float std_, *cbptr=NULL, *stptr=NULL, scf_=0; float* dwptr=NULL; int bot_cengine_; int count_; vector lr_mult_, decay_mult_; bool first_fp = true, first_bp=true, first_upd=true; FusedConvBNImpl *impl=NULL; SolverNode *solver_; MLEngine* eptr_; }; libxsmm-1.17/samples/deeplearning/gxm/include/FusedConvBNImpl.hpp000066400000000000000000000076731415223013700251160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include #include #include "common.hpp" #include "check.hpp" #include "Tensor.hpp" typedef struct { string node_name, node_type; vector nInput; int nOutput; int batch_size; int iHeight, iWidth, mHeight, mWidth, oHeight, oWidth; int ipad_h, ipad_w, mpad_h, mpad_w, opad_h, opad_w; int c_stride_h, c_stride_w, bn_stride_h, bn_stride_w; int kh, kw, kd; int group; float eps, mmf; bool use_global_stats, eltwise, split, bprop; bool relu_fwd, relu_bwd; bool physical_padding; int algType; int bdims, mdims, tdims, wdims; int in_data_type, out_data_type; int num_numa_nodes; int num_threads; string exec_mode; } FusedConvBNImplParams; class FusedConvBNImpl { protected: FusedConvBNImplParams *gp; int engine; TensorLayoutType top_layout_type; TensorLayoutType gbot_layout_type; void *top_layout, *gbot_layout; vector top_compute_engine, bot_compute_engine; bool use_global_stats; string nname; TensorBuf* scratchp; float scaling_factor_; public: FusedConvBNImpl(FusedConvBNImplParams* gp_, int engine_): gp(gp_), engine(engine_) {} void set_top_compute_engine(int e) { top_compute_engine.push_back(e);} void set_bot_compute_engine(int e) { bot_compute_engine.push_back(e);} void set_node_name(string s) { nname = s; } void set_scratch_buffer(TensorBuf* sb) { scratchp = sb; } void set_global_stats(bool s) { use_global_stats = s; } void set_scaling_factor(float s) { scaling_factor_ = s; } virtual void forwardPropagate(vector& inp, TensorBuf* weightp, TensorBuf *hweightp, TensorBuf* midp, TensorBuf* gammap, TensorBuf* betap, TensorBuf *gmeanp, TensorBuf *gvarp, TensorBuf *outp, int tid) = 0; virtual void backPropagate(TensorBuf *deloutp, TensorBuf* weightp, TensorBuf* delgammap, TensorBuf* delbetap, TensorBuf *delmidp, vector& delinp, int tid) = 0; virtual void weightUpdate(TensorBuf *inp, TensorBuf *deloutp, TensorBuf *delmidp, TensorBuf *delweightp, TensorBuf *delgammap, TensorBuf 
*delbetap, int tid) = 0; virtual void dumpBuffer(TensorBuf*, void*) {} virtual void forwardPropagate(vector& inp, TensorBuf* weightp, TensorBuf* hweightp, TensorBuf* midp, TensorBuf* gammap, TensorBuf* betap, TensorBuf *gmeanp, TensorBuf *gvarp, TensorBuf *outp) { switch(engine) { case XSMM: forwardPropagate(inp, weightp, hweightp, midp, gammap, betap, gmeanp, gvarp, outp, 0); break; } } virtual void backPropagate(TensorBuf *deloutp, TensorBuf* weightp, TensorBuf* delgammap, TensorBuf* delbetap, TensorBuf *delmidp, vector& delinp) { switch(engine) { case XSMM: backPropagate(deloutp, weightp, delgammap, delbetap, delmidp, delinp, 0); break; } } virtual void weightUpdate(TensorBuf *inp, TensorBuf *deloutp, TensorBuf *delmidp, TensorBuf *delweightp, TensorBuf *delgammap, TensorBuf *delbetap) { switch(engine) { case XSMM: weightUpdate(inp, delmidp, deloutp, delweightp, delgammap, delbetap, 0); break; } } }; libxsmm-1.17/samples/deeplearning/gxm/include/FusedConvBNXSMM.hpp000066400000000000000000000125571415223013700247760ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include #include "common.hpp" #include "check.hpp" #include "FusedConvBNImpl.hpp" #include "libxsmm.h" #define VLEN 16 #define CHKERR_LIBXSMM_DNN(A) if ( A != LIBXSMM_DNN_SUCCESS )\ {\ fprintf(stdout, "%s, %s\n", gp->node_name.c_str(), libxsmm_dnn_get_error(A) );\ fflush(stdout);\ } #define CHKERR_LIBXSMM_DNN_CREATE(t, A) if ( A != LIBXSMM_DNN_SUCCESS )\ {\ fprintf(stdout, "Creating tensor %s in %s, %s\n", t, gp->node_name.c_str(), libxsmm_dnn_get_error(A) );\ fflush(stdout);\ } #define CHKERR_LIBXSMM_DNN_LINK(t, A) if ( A != LIBXSMM_DNN_SUCCESS )\ {\ fprintf(stdout, "Linking tensor %s in %s, %s\n", t, gp->node_name.c_str(), libxsmm_dnn_get_error(A) );\ fflush(stdout);\ } #define CHKERR_LIBXSMM_DNN_BIND(t, A) if ( A != LIBXSMM_DNN_SUCCESS )\ {\ fprintf(stdout, "Binding tensor %s in %s, %s\n", t, gp->node_name.c_str(), libxsmm_dnn_get_error(A) );\ fflush(stdout);\ } class FusedConvBNXSMM : public FusedConvBNImpl { protected: FusedConvBNImpl *gp_; libxsmm_dnn_conv_desc conv_desc; libxsmm_dnn_fusedbatchnorm_desc fusedbn_desc_train; libxsmm_dnn_fusedbatchnorm_desc fusedbn_desc_test; libxsmm_dnn_layer* libxsmm_handle_conv[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_fusedbatchnorm* libxsmm_handle_bn_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_fusedbatchnorm* libxsmm_handle_bn_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_input[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_input_bntrain[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_input_add_bntrain[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_input_bntest[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_input_add_bntest[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_middle[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_output_bntrain[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_output_bntest[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* 
libxsmm_relumask_bntrain[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_relumask_bntest[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_filter[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_checkpoint_filter = NULL; libxsmm_dnn_tensor* libxsmm_checkpoint_history_filter = NULL; libxsmm_dnn_tensor* libxsmm_temp = NULL; libxsmm_dnn_tensor* libxsmm_delinput[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_delinput_add[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_deloutput[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_delmiddle_bn[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_delmiddle_conv[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_delfilter[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_expectval_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_stddev_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_expectval_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_stddev_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_variance_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_variance_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_gamma_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_gamma_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_beta_train[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_beta_test[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_delgamma[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_delbeta[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; FusedConvBNImplParams *cp; bool updated_scratch_fwd=false, updated_scratch_bwd=false, updated_scratch_upd=false; void *bexpect[NUM_NUMA_NODES]={NULL}, *bstddev[NUM_NUMA_NODES]={NULL}, *bvariance[NUM_NUMA_NODES]={NULL}; void *relu_mask[NUM_NUMA_NODES]={NULL}; void *gexp_test=NULL, *gvar_test=NULL; int prev_scratch_size = 0; public: FusedConvBNXSMM(FusedConvBNImplParams *gp, int engine); virtual 
~FusedConvBNXSMM(void) {} void forwardPropagate(vector& inp, TensorBuf* weightp, TensorBuf *hweightp, TensorBuf* midp, TensorBuf* gammap, TensorBuf* betap, TensorBuf* gmeanp, TensorBuf* gvarp, TensorBuf* outp, int tid); void backPropagate(TensorBuf* deloutp, TensorBuf* weightp, TensorBuf* delgammap, TensorBuf* delbetap, TensorBuf* delmidp, vector& delinp, int tid); void weightUpdate(TensorBuf*, TensorBuf*, TensorBuf*, TensorBuf*, TensorBuf*, TensorBuf*, int tid); void dumpBuffer(TensorBuf *wt, void* temp); }; libxsmm-1.17/samples/deeplearning/gxm/include/ImageData.hpp000066400000000000000000000234201415223013700237600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include #include #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" #include "Shape.h" #include "ImageDataImpl.hpp" #include "ImageDataRGBFlat.hpp" #include "common.hpp" #define SQUARE 1 #define RECT 2 #define RGB 3 #define GRAY 1 using namespace std; using namespace gxm; class ImageDataParams : public NNParams { public: ImageDataParams(void) {} virtual ~ImageDataParams(void) {} void set_transform_params(bool mirror, bool vignette, bool color_bump) { mirror_ = mirror; vignette_ = vignette; color_bump_ = color_bump; } bool get_mirror() { return mirror_; } bool get_vignette() { return vignette_; } bool get_color_bump() { return color_bump_; } void set_train_source_path(string source_name) { train_source_ = source_name; } string get_train_source_path() { return train_source_; } void set_test_source_path(string source_name) { test_source_ = source_name; } string get_test_source_path() { return test_source_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } void set_label_data_type(int t) { label_dtype_ = t; } int get_label_data_type() { return label_dtype_; } void set_batch_size(int batch) { batch_size_ = batch; } int get_batch_size() { return batch_size_; } void set_lookahead(int l) { lookahead_ = l; } int get_lookahead() { return lookahead_; } void set_num_train_files(int ntrain) { num_train_files_ = ntrain; } int get_num_train_files() { return num_train_files_; } void set_train_img_info(string s) { train_img_info_ = s; } string get_train_img_info() { return train_img_info_; } void set_num_test_files(int ntest) { num_test_files_ = ntest; } int get_num_test_files() { return num_test_files_; } void set_test_img_info(string s) { test_img_info_ = s; } string get_test_img_info() { return test_img_info_; } void set_mean_values(int channels, float mean_val) { 
for(int i=0; i& get_mean_values() { return mean_values_; } void set_scale_values(int channels, float std_val) { for(int i=0; i& get_scale_values() { return scale_values_; } void set_jitters(int j) { jitters_.push_back(j); } vector& get_jitters() { return jitters_; } void set_channels(int c) { channels_ = c; } int get_channels() { return channels_; } void set_crop_image(bool crop) { crop_image_ = crop; } bool get_crop_image() { return crop_image_; } void set_crop_sizes(int s, int v1, int v2) { if(s == SQUARE) { for(int i=0; i& get_crop_sizes() { return crop_sizes_; } void set_orig_sizes(int s, int v1, int v2) { if(s == SQUARE) { for(int i=0; i& get_orig_sizes() { return orig_sizes_; } void set_num_test_views(int nt) { test_views_ = nt; } int get_num_test_views() { return test_views_; } protected: vector jitters_, crop_sizes_, orig_sizes_; bool crop_image_; vector mean_values_, scale_values_; int batch_size_, channels_, lookahead_; int num_train_files_, num_test_files_; int data_type_, label_dtype_, test_views_; float mean_, std; string train_source_, test_source_, train_img_info_, test_img_info_; bool mirror_, vignette_, color_bump_; }; static MLParams* parseImageDataParams(NodeParameter* np) { ImageDataParams* itp = new ImageDataParams(); DataParameter dp = np->data_param(); ImageTransformParameter pitp = np->data_param().image_xform_param(); // Set name of node assert(!np->name().empty()); itp->set_node_name(np->name()); //Set node type (Convolution, FullyConnected, etc) assert(!np->type().empty()); itp->set_node_type(np->type()); //Set tensor names assert(np->bottom_size() == 0); for(int i=0; itop_size(); i++) { assert(!np->top(i).empty()); itp->set_top_names(np->top(i)); } //Set backprop needed/not needed flag for this node itp->set_bprop_flag(np->propagate_down()); //Set Mode for the node int mode = np->mode(); assert((mode == TRAIN) || (mode == TEST)); itp->set_mode(mode); // Get data source path assert(!(dp.train_source()).empty()); 
itp->set_train_source_path(dp.train_source()); assert(!(dp.test_source()).empty()); itp->set_test_source_path(dp.test_source()); // Get batch size assert(dp.batch_size() > 0); itp->set_batch_size(dp.batch_size()); // Get lookahead assert(dp.lookahead() > 0); itp->set_lookahead(dp.lookahead()); // Get data types itp->set_data_type(dp.data_type()); itp->set_label_data_type(dp.label_data_type()); // Get number of input files if((mode == TRAIN)) { assert((dp.num_train_files() > 0) && (dp.num_test_files() > 0)); itp->set_num_train_files(dp.num_train_files()); itp->set_num_test_files(dp.num_test_files()); itp->set_test_img_info(dp.test_data_info()); itp->set_num_test_views(pitp.test_views()); // Get data info file names itp->set_train_img_info(dp.train_data_info()); } else if(mode == TEST) { assert(dp.num_test_files() > 0); itp->set_num_test_files(dp.num_test_files()); itp->set_test_img_info(dp.test_data_info()); itp->set_num_test_views(pitp.test_views()); } // If cropping is turned on, set the crop size if(pitp.crop_image() == false) itp->set_crop_image(false); else { itp->set_crop_image(true); int cdims = pitp.crop_size_size(); if(cdims > 0) itp->set_crop_sizes(SQUARE, 2, pitp.crop_size(0)); else { int ch = pitp.crop_h(); int cw = pitp.crop_w(); assert((ch > 0) && (cw > 0)); itp->set_crop_sizes(RECT, ch, cw); } } int odims = pitp.orig_size_size(); if(odims > 0) itp->set_orig_sizes(SQUARE, 2, pitp.orig_size(0)); else { int oh = pitp.orig_h(); int ow = pitp.orig_w(); assert((oh > 0) && (ow > 0)); itp->set_orig_sizes(RECT, oh, ow); } int channels = pitp.channels(); bool force_color = pitp.force_color(); bool force_gray = pitp.force_gray(); if(force_color) itp->set_channels(RGB); else if(force_gray) itp->set_channels(GRAY); else itp->set_channels(channels); if(pitp.mean_values_size() > 1) itp->set_mean_values(pitp.mean_values(0), pitp.mean_values(1), pitp.mean_values(2)); else itp->set_mean_values(channels, pitp.mean_values(0)); if(pitp.scale_values_size() > 1) 
itp->set_scale_values(pitp.scale_values(0), pitp.scale_values(1), pitp.scale_values(2)); else itp->set_scale_values(channels, pitp.scale_values(0)); bool mirror = pitp.mirror(); bool vignette = pitp.vignette(); bool color_bump = pitp.color_bump(); itp->set_transform_params(mirror, vignette, color_bump); for(int i=0; iset_jitters(pitp.jitters(i)); return itp; } class ImageDataNode : public NNNode { public: ImageDataNode(ImageDataParams* p, MLEngine* e); ~ImageDataNode() {} protected: vector tenTop_; vector tenTopData_; int t_files_, v_files_, n_files_; int tfiles_per_mc_, vfiles_per_mc_; int current_epoch_, ctrain_pf_mb_, ctest_pf_mb_; int ctrain_proc_mb_, ctest_proc_mb_, curr_test_view_; int train_batches_, test_batches_; bool full_train_prefetch_, full_test_prefetch_; unsigned char *tempbuf_; DataImplParams gparams_; int* r_offset, *c_offset, *augmentation; MLEngine* eptr; string train_source_path_, test_source_path_; int num_epochs_, batch_size_, global_batch_size_; int num_train_files_, num_test_files_, num_machines_; int global_node_id_; int max_ep_, io_ep_, iopass_; vector train_imginfo_index_, test_imginfo_index_; vector labels_; AugmentParams ap; vector jitters_; vector train_list_; vector test_list_; void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } void configure(int dataType); void setupTrainIndices(); void setupTestIndices(); void createImageList(vector&, string, int); #if 0 MPI_Request *req_; #endif ImageDataImpl *impl; void forwardPropagate(); }; libxsmm-1.17/samples/deeplearning/gxm/include/ImageDataImpl.hpp000066400000000000000000000036331415223013700246060ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include #include #include #include #include "Engine.hpp" #include "io.hpp" #define RGB_FLATFILE 0 #define JPEG_FLATFILE 1 #define RGB_LMDB 2 #define JPEG_LMDB 3 using namespace std; typedef struct { bool mirror; bool vignette; bool color_bump; } AugmentParams; typedef struct { string name; int height; int width; int length; int label; } ImageInfo; typedef struct { int batch_size; int channels; vector orig_sizes; vector crop_sizes; vector mean_values; vector scale_values; int test_views; int lookahead; int threads; int exec_mode; } DataImplParams; class ImageDataImpl { protected: DataImplParams *gp; AugmentParams *ap; unsigned int* tenSeeds_; public: ImageDataImpl(DataImplParams *gp_, AugmentParams *ap_): gp(gp_), ap(ap_) { tenSeeds_ = new unsigned int[gp->threads*16]; initSeeds(tenSeeds_, gp->threads); } virtual void forwardPropagate(unsigned char *inp, float *outp) = 0; virtual void forwardPropagate(unsigned char *inp, int test_view, float *outp) = 0; }; libxsmm-1.17/samples/deeplearning/gxm/include/ImageDataRGBFlat.hpp000066400000000000000000000026341415223013700251260ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include "ImageDataImpl.hpp" class ImageDataRGBFlat : public ImageDataImpl { protected: int *r_offset, *c_offset, *augmentation; public: ImageDataRGBFlat(DataImplParams *gp, AugmentParams *ap) : ImageDataImpl(gp, ap) { r_offset = new int[gp->batch_size]; c_offset = new int[gp->batch_size]; augmentation = new int[gp->batch_size]; } void forwardPropagate(unsigned char *inp, float *outp); void forwardPropagate(unsigned char *inp, int test_view, float *outp); void processTrainMinibatch(unsigned char *inp, float *outp); void processTestMinibatch(unsigned char *inp, int tv, float *outp); }; libxsmm-1.17/samples/deeplearning/gxm/include/JitterData.hpp000066400000000000000000000317761415223013700242140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include #include #include #include #include #ifdef USE_OPENCV #include #include #include #endif #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" #include "Shape.h" #include "io.hpp" #include "check.hpp" #include "common.hpp" #define SQUARE 1 #define RECT 2 #define RGB 3 #define GRAY 1 using namespace std; using namespace gxm; typedef struct { bool mirror; bool vignette; bool color_bump; } JitterAugmentParams; typedef struct { int batch_size; int channels; vector orig_sizes; vector crop_sizes; int pad_w, pad_h; vector mean_values; vector scale_values; int scalejittering_min, scalejittering_max; int test_smaller_side; float min_percent_area, max_percent_area; float min_aspect_ratio, max_aspect_ratio; bool shuffle; string mean_file; int test_views; int lookahead; int threads; int exec_mode; } JitterDataImplParams; class JitterDataParams : public NNParams { public: JitterDataParams(void) {} virtual ~JitterDataParams(void) {} void set_transform_params(bool mirror, bool vignette, bool color_bump) { mirror_ = mirror; vignette_ = vignette; color_bump_ = color_bump; } bool get_mirror() { return mirror_; } bool get_vignette() { return vignette_; } bool get_color_bump() { return color_bump_; } void set_train_source_path(string source_name) { train_source_ = source_name; } string get_train_source_path() { return train_source_; } void set_test_source_path(string source_name) { test_source_ = source_name; } string get_test_source_path() { return test_source_; } void set_train_list_path(string tr) { train_list_ = tr; } string get_train_list_path() { return train_list_; } void set_test_list_path(string te) { test_list_ = te; } string get_test_list_path() { return test_list_; } void set_numsplits(int s) {numsplits_ = s; } int get_numsplits() { return numsplits_; } void set_shuffle_flag(bool f) {shuffle_ = f; } 
bool get_shuffle_flag() { return shuffle_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } void set_label_data_type(int t) { label_dtype_ = t; } int get_label_data_type() { return label_dtype_; } void set_batch_size(int batch) { batch_size_ = batch; } int get_batch_size() { return batch_size_; } void set_lookahead(int l) { lookahead_ = l; } int get_lookahead() { return lookahead_; } void set_num_train_files(int ntrain) { num_train_files_ = ntrain; } int get_num_train_files() { return num_train_files_; } void set_num_test_files(int ntest) { num_test_files_ = ntest; } int get_num_test_files() { return num_test_files_; } void set_mean_values(int channels, float mean_val) { for(int i=0; i& get_mean_values() { return mean_values_; } void set_mean_file(string n) { mean_file_ = n; } string get_mean_file() { return mean_file_; } void set_scale_values(int channels, float std_val) { for(int i=0; i& get_scale_values() { return scale_values_; } void set_channels(int c) { channels_ = c; } int get_channels() { return channels_; } void set_crop_image(bool crop) { crop_image_ = crop; } bool get_crop_image() { return crop_image_; } void set_crop_sizes(int s, int v1, int v2) { if(s == SQUARE) { for(int i=0; i& get_crop_sizes() { return crop_sizes_; } void set_physical_padding(bool p) {phys_pad_ = p; } bool get_physical_padding() { return phys_pad_; } void set_pad_h(int h) { pad_h_ = h; } int get_pad_h() {return pad_h_; } void set_pad_w(int w) { pad_w_ = w; } int get_pad_w() {return pad_w_; } void set_orig_sizes(int s, int v1, int v2) { if(s == SQUARE) { for(int i=0; i& get_orig_sizes() { return orig_sizes_; } void set_num_test_views(int nt) { test_views_ = nt; } int get_num_test_views() { return test_views_; } void set_scale_jitters(int sjmin, int sjmax) { sjmin_ = sjmin; sjmax_ = sjmax; } int get_jitter_min() { return sjmin_; } int get_jitter_max() { return sjmax_; } void set_percent_areas(float amin, float amax) { pc_amin_ = amin; 
pc_amax_ = amax; } float get_percent_min_area() { return pc_amin_; } float get_percent_max_area() { return pc_amax_; } void set_aspect_ratios(float armin, float armax) { ar_min_ = armin; ar_max_ = armax; } float get_min_aspect_ratio() { return ar_min_; } float get_max_aspect_ratio() { return ar_max_; } void set_test_smaller_side(int s) { test_smaller_side_ = s; } int get_test_smaller_side() {return test_smaller_side_; } void set_compute_engine(int e) {compute_engine_ = e; } int get_compute_engine() {return compute_engine_; } protected: vector crop_sizes_, orig_sizes_; int pad_h_, pad_w_; bool crop_image_, phys_pad_; vector mean_values_, scale_values_; int batch_size_, channels_, lookahead_, numsplits_; int num_train_files_, num_test_files_; int sjmin_, sjmax_, test_smaller_side_; int data_type_, label_dtype_, test_views_; int compute_engine_; float mean_, std, pc_amin_, pc_amax_; float ar_min_, ar_max_; string train_source_, test_source_; string mean_file_, train_list_, test_list_; bool mirror_, vignette_, color_bump_, shuffle_; }; static MLParams* parseJitterDataParams(NodeParameter* np) { JitterDataParams* jp = new JitterDataParams(); DataParameter dp = np->data_param(); ImageTransformParameter itp = np->data_param().image_xform_param(); // Set name of node assert(!np->name().empty()); jp->set_node_name(np->name()); //Set node type (Convolution, FullyConnected, etc) assert(!np->type().empty()); jp->set_node_type(np->type()); //Set tensor names assert(np->bottom_size() == 0); for(int i=0; itop_size(); i++) { assert(!np->top(i).empty()); jp->set_top_names(np->top(i)); } //Set backprop needed/not needed flag for this node jp->set_bprop_flag(np->propagate_down()); //Set Mode for the node int mode = np->mode(); assert((mode == TRAIN) || (mode == TEST)); jp->set_mode(mode); // Get data source path assert(!(dp.train_source()).empty()); jp->set_train_source_path(dp.train_source()); assert(!(dp.test_source()).empty()); jp->set_test_source_path(dp.test_source()); // Get 
data list path assert(!(dp.train_list()).empty()); jp->set_train_list_path(dp.train_list()); assert(!(dp.test_list()).empty()); jp->set_test_list_path(dp.test_list()); // Get number of splits jp->set_numsplits(dp.numsplits()); // Get shuffle flag jp->set_shuffle_flag(dp.shuffle()); // Get batch size assert(dp.batch_size() > 0); jp->set_batch_size(dp.batch_size()); // Get lookahead assert(dp.lookahead() > 0); jp->set_lookahead(dp.lookahead()); // Get data types jp->set_data_type(dp.data_type()); jp->set_label_data_type(dp.label_data_type()); // Get number of input files if((mode == TRAIN)) { assert((dp.num_train_files() > 0) && (dp.num_test_files() > 0)); jp->set_num_train_files(dp.num_train_files()); jp->set_num_test_files(dp.num_test_files()); jp->set_num_test_views(itp.test_views()); } else if(mode == TEST) { assert(dp.num_test_files() > 0); jp->set_num_test_files(dp.num_test_files()); jp->set_num_test_views(itp.test_views()); } // If cropping is turned on, set the crop size if(itp.crop_image() == false) jp->set_crop_image(false); else { jp->set_crop_image(true); int cdims = itp.crop_size_size(); if(cdims > 0) jp->set_crop_sizes(SQUARE, 2, itp.crop_size(0)); else { int ch = itp.crop_h(); int cw = itp.crop_w(); assert((ch > 0) && (cw > 0)); jp->set_crop_sizes(RECT, ch, cw); } } int odims = itp.orig_size_size(); if(odims > 0) jp->set_orig_sizes(SQUARE, 2, itp.orig_size(0)); else { int oh, ow; if(itp.orig_h() > 0) oh = itp.orig_h(); if(itp.orig_w() > 0) ow = itp.orig_w(); jp->set_orig_sizes(RECT, oh, ow); } jp->set_pad_h(itp.pad_h()); jp->set_pad_w(itp.pad_w()); jp->set_physical_padding(itp.physical_padding()); int channels = itp.channels(); bool force_color = itp.force_color(); bool force_gray = itp.force_gray(); if(force_color) jp->set_channels(RGB); else if(force_gray) jp->set_channels(GRAY); else jp->set_channels(channels); if(itp.mean_values_size() > 1) jp->set_mean_values(itp.mean_values(0), itp.mean_values(1), itp.mean_values(2)); else 
if(itp.mean_values_size() > 0) jp->set_mean_values(channels, itp.mean_values(0)); else if(itp.mean_file().size() > 0) jp->set_mean_file(itp.mean_file()); else jp->set_mean_values(channels, 0); if(itp.scale_values_size() > 1) jp->set_scale_values(itp.scale_values(0), itp.scale_values(1), itp.scale_values(2)); else if(itp.scale_values_size() > 0) jp->set_scale_values(channels, itp.scale_values(0)); else jp->set_scale_values(channels, 1); bool mirror = itp.mirror(); bool vignette = itp.vignette(); bool color_bump = itp.color_bump(); jp->set_transform_params(mirror, vignette, color_bump); jp->set_scale_jitters(itp.scalejittering_min(), itp.scalejittering_max()); jp->set_percent_areas(itp.min_percent_area(), itp.max_percent_area()); jp->set_aspect_ratios(itp.min_aspect_ratio(), itp.max_aspect_ratio()); jp->set_test_smaller_side(itp.test_smaller_side()); jp->set_compute_engine(dp.engine()); return jp; } class JitterDataNode : public NNNode { public: JitterDataNode(JitterDataParams* p, MLEngine* e); ~JitterDataNode() {} protected: vector tenTop_; vector tenTopData_; int t_files_, v_files_, n_files_; int current_epoch_, ctrain_pf_mb_, ctest_pf_mb_; int ctrain_proc_mb_, ctest_proc_mb_, curr_test_view_; int train_batches_, test_batches_, numsplits_, duplicates_; bool full_train_prefetch_, full_test_prefetch_; long long int *r_offset, *c_offset; double *drand1, *drand2, *drand3; int *augmentation; float* mean_data_; bool first_fp=true; vector < vector > tempbuf_, cropbuf_; vector < vector > labels_; JitterDataImplParams gparams_; MLEngine* eptr_; string train_source_path_, test_source_path_, train_list_path_, test_list_path_; vector > train_list_, test_list_; vector train_file_index_, test_file_index_; vector train_list_per_mc_, test_list_per_mc_; int num_epochs_, batch_size_, global_batch_size_; int num_train_files_, num_test_files_, num_machines_; int train_files_, test_files_, train_files_per_mc_, test_files_per_mc_; int global_node_id_, ridx_; void* bf16_img=NULL; 
JitterAugmentParams ap; vector jitters_; void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } void forwardPropagate(); void cropTorch(const cv::Mat&, cv::Mat&, int*, int*); void cropVGG(const cv::Mat&, cv::Mat&, int*, int*); void imageTransform(vector&, float*); void setupTrainIndices(); void setupTestIndices(); void convert_f32_bf16(float* in, libxsmm_bfloat16* out, unsigned int len); }; libxsmm-1.17/samples/deeplearning/gxm/include/LMDBData.hpp000066400000000000000000000252621415223013700234620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include #include #include #include #include #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" #include "Shape.h" #include "db_lmdb.hpp" #include "io.hpp" #include "check.hpp" #define SQUARE 1 #define RECT 2 #define RGB 3 #define GRAY 1 using namespace std; using namespace gxm; typedef struct { bool mirror; bool vignette; bool color_bump; } LMDBAugmentParams; typedef struct { int batch_size; int channels; vector orig_sizes; vector crop_sizes; vector mean_values; vector scale_values; string mean_file; int test_views; int lookahead; int threads; int exec_mode; } LMDBDataImplParams; class LMDBDataParams : public NNParams { public: LMDBDataParams(void) {} virtual ~LMDBDataParams(void) {} void set_transform_params(bool mirror, bool vignette, bool color_bump) { mirror_ = mirror; vignette_ = vignette; color_bump_ = color_bump; } bool get_mirror() { return mirror_; } bool get_vignette() { return vignette_; } bool get_color_bump() { return color_bump_; } void set_train_source_path(string source_name) { train_source_ = source_name; } string get_train_source_path() { return train_source_; } void set_test_source_path(string source_name) { test_source_ = source_name; } string get_test_source_path() { return test_source_; } void set_split_db_flag(bool f) {split_db_ = f; } bool get_split_db_flag() { return split_db_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } void set_label_data_type(int t) { label_dtype_ = t; } int get_label_data_type() { return label_dtype_; } void set_batch_size(int batch) { batch_size_ = batch; } int get_batch_size() { return batch_size_; } void set_lookahead(int l) { lookahead_ = l; } int get_lookahead() { return lookahead_; } void set_num_train_files(int ntrain) { num_train_files_ = ntrain; } int get_num_train_files() { return 
num_train_files_; } void set_train_img_info(string s) { train_img_info_ = s; } string get_train_img_info() { return train_img_info_; } void set_num_test_files(int ntest) { num_test_files_ = ntest; } int get_num_test_files() { return num_test_files_; } void set_test_img_info(string s) { test_img_info_ = s; } string get_test_img_info() { return test_img_info_; } void set_mean_values(int channels, float mean_val) { for(int i=0; i& get_mean_values() { return mean_values_; } void set_mean_file(string n) { mean_file_ = n; } string get_mean_file() { return mean_file_; } void set_scale_values(int channels, float std_val) { for(int i=0; i& get_scale_values() { return scale_values_; } void set_jitters(int j) { jitters_.push_back(j); } vector& get_jitters() { return jitters_; } void set_channels(int c) { channels_ = c; } int get_channels() { return channels_; } void set_crop_image(bool crop) { crop_image_ = crop; } bool get_crop_image() { return crop_image_; } void set_crop_sizes(int s, int v1, int v2) { if(s == SQUARE) { for(int i=0; i& get_crop_sizes() { return crop_sizes_; } void set_orig_sizes(int s, int v1, int v2) { if(s == SQUARE) { for(int i=0; i& get_orig_sizes() { return orig_sizes_; } void set_num_test_views(int nt) { test_views_ = nt; } int get_num_test_views() { return test_views_; } void set_compute_engine(int e) {compute_engine_ = e; } int get_compute_engine() {return compute_engine_; } protected: vector jitters_, crop_sizes_, orig_sizes_; bool crop_image_, split_db_; vector mean_values_, scale_values_; int batch_size_, channels_, lookahead_; int num_train_files_, num_test_files_; int data_type_, label_dtype_, test_views_; int compute_engine_; float mean_, std; string train_source_, test_source_, train_img_info_, test_img_info_; string mean_file_; bool mirror_, vignette_, color_bump_; }; static MLParams* parseLMDBDataParams(NodeParameter* np) { LMDBDataParams* itp = new LMDBDataParams(); DataParameter dp = np->data_param(); ImageTransformParameter pitp = 
np->data_param().image_xform_param(); // Set name of node assert(!np->name().empty()); itp->set_node_name(np->name()); //Set node type (Convolution, FullyConnected, etc) assert(!np->type().empty()); itp->set_node_type(np->type()); //Set tensor names assert(np->bottom_size() == 0); for(int i=0; itop_size(); i++) { assert(!np->top(i).empty()); itp->set_top_names(np->top(i)); } //Set backprop needed/not needed flag for this node itp->set_bprop_flag(np->propagate_down()); //Set Mode for the node int mode = np->mode(); assert((mode == TRAIN) || (mode == TEST)); itp->set_mode(mode); // Get data source path assert(!(dp.train_source()).empty()); itp->set_train_source_path(dp.train_source()); assert(!(dp.test_source()).empty()); itp->set_test_source_path(dp.test_source()); // Get split db flag itp->set_split_db_flag(dp.split_db()); // Get batch size assert(dp.batch_size() > 0); itp->set_batch_size(dp.batch_size()); // Get lookahead assert(dp.lookahead() > 0); itp->set_lookahead(dp.lookahead()); // Get data types itp->set_data_type(dp.data_type()); itp->set_label_data_type(dp.label_data_type()); // Get number of input files if((mode == TRAIN)) { assert((dp.num_train_files() > 0) && (dp.num_test_files() > 0)); itp->set_num_train_files(dp.num_train_files()); itp->set_num_test_files(dp.num_test_files()); itp->set_num_test_views(pitp.test_views()); } else if(mode == TEST) { assert(dp.num_test_files() > 0); itp->set_num_test_files(dp.num_test_files()); itp->set_num_test_views(pitp.test_views()); } // If cropping is turned on, set the crop size if(pitp.crop_image() == false) itp->set_crop_image(false); else { itp->set_crop_image(true); int cdims = pitp.crop_size_size(); if(cdims > 0) itp->set_crop_sizes(SQUARE, 2, pitp.crop_size(0)); else { int ch = pitp.crop_h(); int cw = pitp.crop_w(); assert((ch > 0) && (cw > 0)); itp->set_crop_sizes(RECT, ch, cw); } } int odims = pitp.orig_size_size(); if(odims > 0) itp->set_orig_sizes(SQUARE, 2, pitp.orig_size(0)); else { int oh, ow; 
if(pitp.orig_h() > 0) oh = pitp.orig_h(); if(pitp.orig_w() > 0) ow = pitp.orig_w(); itp->set_orig_sizes(RECT, oh, ow); } int channels = pitp.channels(); bool force_color = pitp.force_color(); bool force_gray = pitp.force_gray(); if(force_color) itp->set_channels(RGB); else if(force_gray) itp->set_channels(GRAY); else itp->set_channels(channels); if(pitp.mean_values_size() > 1) itp->set_mean_values(pitp.mean_values(0), pitp.mean_values(1), pitp.mean_values(2)); else if(pitp.mean_values_size() > 0) itp->set_mean_values(channels, pitp.mean_values(0)); else if(pitp.mean_file().size() > 0) itp->set_mean_file(pitp.mean_file()); else itp->set_mean_values(channels, 0); if(pitp.scale_values_size() > 1) itp->set_scale_values(pitp.scale_values(0), pitp.scale_values(1), pitp.scale_values(2)); else if(pitp.scale_values_size() > 0) itp->set_scale_values(channels, pitp.scale_values(0)); else itp->set_scale_values(channels, 1); bool mirror = pitp.mirror(); bool vignette = pitp.vignette(); bool color_bump = pitp.color_bump(); itp->set_transform_params(mirror, vignette, color_bump); for(int i=0; iset_jitters(pitp.jitters(i)); itp->set_compute_engine(dp.engine()); return itp; } class LMDBDataNode : public NNNode { public: LMDBDataNode(LMDBDataParams* p, MLEngine* e); ~LMDBDataNode() {} protected: vector tenTop_; vector tenTopData_; int t_files_, v_files_, n_files_; int tfiles_per_mc_, vfiles_per_mc_; int current_epoch_, ctrain_pf_mb_, ctest_pf_mb_; int ctrain_proc_mb_, ctest_proc_mb_, curr_test_view_; int train_batches_, test_batches_; bool full_train_prefetch_, full_test_prefetch_, split_db_; unsigned int* tenSeeds_; int *r_offset, *c_offset, *augmentation; float* mean_data_; LMDB* train_lmdb_, *test_lmdb_; LMDBCursor* train_cursor_, *test_cursor_; vector < vector > tempbuf_; LMDBDataImplParams gparams_; MLEngine* eptr; string train_source_path_, test_source_path_; int num_epochs_, batch_size_, global_batch_size_; int num_train_files_, num_test_files_, num_machines_; int 
global_node_id_; int max_ep_, io_ep_, iopass_; size_t node_id_, num_nodes_; LMDBAugmentParams ap; vector jitters_; void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } void configure(); void forwardPropagate(); void trainImageTransform(vector&, float*); void testImageTransform(vector&, int, float*); }; libxsmm-1.17/samples/deeplearning/gxm/include/MLNode.fwd.hpp000066400000000000000000000014401415223013700240370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once class MLNode;libxsmm-1.17/samples/deeplearning/gxm/include/MLNode.hpp000066400000000000000000000031011415223013700232540ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include "Params.hpp" #include "MLNode.fwd.hpp" #include "Engine.fwd.hpp" using namespace std; using namespace gxm; class MLNode { protected: public: MLNode(MLParams* p, MLEngine* e) {} virtual ~MLNode(void) {} virtual void createStrategy(int) {} virtual int executeTask(int) {return 0;} virtual void enqueTask(int pos) {} virtual void createCheckPoint() {} virtual void restoreCheckPoint() {} virtual void createPersistentTask() {} }; // Constructor should create Tensors for its output and internal buffers and assign type to it template MLNode *CreateMLNode(MLParams *param, MLEngine *engine) { NType *obj = new NType(dynamic_cast(param), engine); return dynamic_cast(obj); } libxsmm-1.17/samples/deeplearning/gxm/include/Node.fwd.hpp000066400000000000000000000014401415223013700236060ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once class NNNode;libxsmm-1.17/samples/deeplearning/gxm/include/Node.hpp000066400000000000000000000116551415223013700230400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include #include #include #include "Params.hpp" #include "MLNode.hpp" #include "Engine.fwd.hpp" #include "Task.hpp" #include "proto/gxm.pb.h" using namespace std; using namespace gxm; #ifdef USE_MLSL #include "mlsl.hpp" #endif class NNParams : public MLParams { protected: vector top_; vector bottom_; string nname_; string type_; int mode_; bool bp_flag_; public: NNParams(void) {} virtual ~NNParams(void) {} void set_top_names(string name) { top_.push_back(name); } void set_bottom_names(string name) { bottom_.push_back(name); } void set_node_name(string nname) { nname_ = nname; } void set_node_type(string type) {type_ = type; } void set_mode(int mode) { mode_ = mode; } void set_bprop_flag(bool flag) { bp_flag_ = flag; } string get_node_name() { return nname_; } vector& get_top_names() { return top_; } vector& get_bottom_names() { return bottom_; } string get_node_type() { return type_; } int get_mode() { return mode_; } bool get_bprop_flag() { return bp_flag_; } }; class NNNode : public MLNode { public: NNNode(NNParams* p, MLEngine* e) : MLNode(p, e) { for(int i = 0; i < 4; i++) tBasic_[i] = NULL; } virtual ~NNNode(void) { for(int i = 0; i < 4; i++) if(tBasic_[i] != NULL) { delete tBasic_[i]; tBasic_[i] = NULL; } } void createTasks(list, int) {} virtual void createStrategy(int) {} virtual void forwardPropagate() {} virtual void backPropagate() {} virtual void weightUpdate() {} virtual void solverStep() {} int executeTask(int taskId) { if(taskId == 0) { forwardPropagate(); } else if(taskId == 1) { backPropagate(); } else if(taskId == 2) { weightUpdate(); } else if(taskId == 3) { solverStep(); } return 0; } void enqueTask(int pos) 
{} virtual void createPersistentTask() {} void setNextNode(NNNode* next) { //check if next is already in the nextNodes list if(std::find(nextNodes_.begin(), nextNodes_.end(), next) == nextNodes_.end()) { nextNodes_.push_back(next); next->prevNodes_.push_back(this); } } void setPrevNode(NNNode* prev) { //check if prev is already in the prevNodes list if(std::find(prevNodes_.begin(), prevNodes_.end(), prev) == prevNodes_.end()) { prevNodes_.push_back(prev); prev->nextNodes_.push_back(this); } } Task *getBasicTask(int type) { int index = -1; if(type == 0 || (type == 1 && bp_flag_) || (type > 1 && has_weights_)) index = type; if(index != -1) { if(tBasic_[index] == NULL) tBasic_[index] = new Task(this, -1, type); return tBasic_[index]; } return NULL; } void createNNGraph(int mode); void setNodeType(string type) { ntype_ = type; } string getNodeType() { return ntype_; } string getNodeName() { return nname_; } int getMode() { return mode_; } int getNumPrevNodes() { return prevNodes_.size(); } int getNumNextNodes() { return nextNodes_.size(); } NNNode* getPrevNode(int i) { if(prevNodes_.size() > 0) return prevNodes_[i]; else return NULL; } NNNode* getNextNode(int i) { if(nextNodes_.size() > 0) return nextNodes_[i]; else return NULL; } int get_num_tops() { return top_.size(); } void set_top_compute_engine(int e) { top_compute_engine_ = e; } int get_bot_compute_engine() { return bot_compute_engine_; } void set_next_node_type(string s) {next_ntype_ = s;} void refineTask(){} virtual void createCheckPoint() {} virtual void restoreCheckPoint() {} protected: string nname_, ntype_, next_ntype_; vector top_; vector bottom_; int mode_; bool bp_flag_; bool has_weights_; vector prevNodes_; vector nextNodes_; int top_compute_engine_, bot_compute_engine_; #ifdef USE_MLSL MLSL::Operation* op_; #endif // 0-Forw, 1-Back, 2-WGrad, 3-Solver Task *tBasic_[4]; }; 
libxsmm-1.17/samples/deeplearning/gxm/include/Params.hpp000066400000000000000000000017661415223013700234000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include "Shape.h" #include "proto/gxm.pb.h" using namespace std; using namespace gxm; class MLParams { protected: public: MLParams(void) {} virtual ~MLParams(void) {} }; libxsmm-1.17/samples/deeplearning/gxm/include/Pooling.hpp000066400000000000000000000156661415223013700235700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include "assert.h" #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "proto/gxm.pb.h" #include "common.hpp" #include "PoolingImpl.hpp" #include "PoolingXSMM.hpp" using namespace std; using namespace gxm; class PoolingParams : public NNParams { public: PoolingParams(void) {} ~PoolingParams(void) {} void set_kernel_dims(int kdims, int ksize) { for(int i=0; i& get_kernel_dims() { return kernel_dim_; } void set_strides(int sdims, int stride) { for(int i=0; i& get_strides() { return strides_; } void set_pads(int pdims, int pad) { for(int i=0; i& get_pads() { return pads_; } void set_pool_mode(int m) { pool_mode_ = m; } int get_pool_mode() { return pool_mode_; } void set_compute_engine(int ce) { compute_engine_ = ce; } int get_compute_engine() { return compute_engine_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } void set_algo_type(int at) { algotype_ = at; } int get_algo_type() { return algotype_; } protected: vector kernel_dim_; // Order of dimensions is Height, Width, Depth (for 3D Pooling) vector strides_; // Order follows kernel dimension vector pads_; // Order follows kernel dimension int pool_mode_, compute_engine_, algotype_, data_type_; }; static MLParams* parsePoolingParams(NodeParameter* np) { PoolingParams* pp = new PoolingParams(); // Set name of node assert(!np->name().empty()); pp->set_node_name(np->name()); //Set node type (Convolution, FullyConnected, etc) assert(!np->type().empty()); pp->set_node_type(np->type()); //Set tensor names assert(np->bottom_size() == 1); assert(!np->bottom(0).empty()); pp->set_bottom_names(np->bottom(0)); assert(np->top_size() == 1); assert(!np->top(0).empty()); pp->set_top_names(np->top(0)); //Set Mode for the node assert((np->mode() == TRAIN) || (np->mode() == TEST)); pp->set_mode(np->mode()); //Set backprop needed/not needed flag for this node 
pp->set_bprop_flag(np->propagate_down()); // kernel dimensions PoolingParameter ppp = np->pooling_param(); int kdims = ppp.kernel_size_size(); switch(kdims) { int kh, kw, kd; case 0: kh = ppp.kernel_h(); kw = ppp.kernel_w(); if(ppp.ndims() == 3) kd = ppp.kernel_d(); else kd = 0; assert((kh > 0) && (kw > 0)); pp->set_kernel_dims(kh, kw, kd); break; case 1: kh = ppp.kernel_size(0); if(ppp.ndims() == 2) pp->set_kernel_dims(kh, kh, 0); else if(ppp.ndims() == 3) pp->set_kernel_dims(kh, kh, kh); break; case 2: kh = ppp.kernel_size(0); kw = ppp.kernel_size(1); assert(ppp.ndims() == 2); pp->set_kernel_dims(kh, kw, 0); break; case 3: kh = ppp.kernel_size(0); kw = ppp.kernel_size(1); kd = ppp.kernel_size(2); assert(ppp.ndims() == 3); pp->set_kernel_dims(kh, kw, kd); break; } // strides int sdims = ppp.stride_size(); switch(sdims) { int sh, sw, sd; case 0: sh = ppp.stride_h(); sw = ppp.stride_w(); if(ppp.ndims() == 3) sd = ppp.stride_d(); else sd = 0; assert((sh > 0) && (sw > 0)); pp->set_strides(sh, sw, sd); break; case 1: sh = ppp.stride(0); if(ppp.ndims() == 2) pp->set_strides(sh, sh, 0); else if(ppp.ndims() == 3) pp->set_strides(sh, sh, sh); break; case 2: sh = ppp.stride(0); sw = ppp.stride(1); assert(ppp.ndims() == 2); pp->set_strides(sh, sw, 0); break; case 3: sh = ppp.stride(0); sw = ppp.stride(1); sd = ppp.stride(2); assert(ppp.ndims() == 3); pp->set_strides(sh, sw, sd); break; } // pads int pdims = ppp.pad_size(); switch(pdims) { int ph, pw, pd; case 0: ph = ppp.pad_h(); pw = ppp.pad_w(); if(ppp.ndims() == 3) pd = ppp.pad_d(); else pd = 0; pp->set_pads(ph, pw, pd); break; case 1: ph = ppp.pad(0); if(ppp.ndims() == 2) pp->set_pads(ph, ph, 0); else if(ppp.ndims() == 3) pp->set_pads(ph, ph, ph); break; case 2: ph = ppp.pad(0); pw = ppp.pad(1); assert(ppp.ndims() == 2); pp->set_pads(ph, pw, 0); break; case 3: ph = ppp.pad(0); pw = ppp.pad(1); pd = ppp.pad(2); assert(ppp.ndims() == 3); pp->set_pads(ph, pw, pd); break; } pp->set_pool_mode(ppp.pool()); 
pp->set_data_type(ppp.data_type()); pp->set_compute_engine(ppp.engine()); pp->set_algo_type(ppp.algotype()); return pp; } class PoolingNode : public NNNode { public: PoolingNode(PoolingParams* p, MLEngine* e); virtual ~PoolingNode(void) {} protected: void forwardPropagate(); void backPropagate(); void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } void configure(int engine); void convert_bf16_f32(libxsmm_bfloat16* in, float* out, int len); Tensor* tenTop_; // Output tensor pointer Tensor* tenBot_; // Input tensor pointer int* tenMask_; PoolImplParams gparams_; TensorBuf *tenBotDiff_, *tenBotData_; TensorBuf *tenTopData_, *tenTopDiff_; TensorBuf *tenScratchData_; Shape ts_; int count_, in_dtype, out_dtype; int bot_cengine_; bool first_fp=true; float *stptr=NULL, cbptr[16]; PoolImpl* impl; MLEngine* eptr_; }; libxsmm-1.17/samples/deeplearning/gxm/include/PoolingImpl.hpp000066400000000000000000000053251415223013700244010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include #include #include "common.hpp" #include "check.hpp" #include "Tensor.hpp" typedef struct { string node_name; int bdims, tdims; int nInput, nOutput; int batch_size; int in_data_type, out_data_type; int iHeight, iWidth, iDepth; int oHeight, oWidth, oDepth; int ipad_h, ipad_w, ipad_d; int opad_h, opad_w, opad_d; int pad_h, pad_w, pad_d; int stride_h, stride_w, stride_d; int kh, kw, kd; int pool_mode, data_type; int algType; int num_threads; } PoolImplParams; enum PoolFuncType {MAX, AVE}; class PoolImpl { protected: PoolImplParams *gp; int engine; TensorLayoutType bot_layout_type, top_layout_type, gbot_layout_type; void *bot_layout=NULL, *top_layout=NULL, *gbot_layout=NULL; int top_compute_engine=-1; int bot_compute_engine=-1; string next_ntype, nname; TensorBuf* scratchp; public: PoolImpl(PoolImplParams* gp_, int engine_) : gp(gp_), engine(engine_) {} void set_top_compute_engine(int e) { top_compute_engine = e;} void set_bot_compute_engine(int e) { bot_compute_engine = e;} void set_next_node_type(string s) { next_ntype = s; } void set_node_name(string s) { nname = s; } void set_scratch_buffer(TensorBuf* sb) { scratchp = sb; } // Assume external threading, e.g., #pragma omp virtual void forwardPropagate(TensorBuf *inp, TensorBuf *outp, int *maskp, int tid) = 0; virtual void backPropagate(TensorBuf *deloutp, int *maskp, TensorBuf *delinp, int tid) = 0; virtual void forwardPropagate(TensorBuf *inp, TensorBuf *outp, int *maskp) { switch(engine) { case XSMM: forwardPropagate(inp, outp, maskp, 0); break; } } virtual void backPropagate(TensorBuf *deloutp, int *maskp, TensorBuf *delinp) { switch(engine) { case XSMM: backPropagate(deloutp, maskp, delinp, 0); break; } } }; 
libxsmm-1.17/samples/deeplearning/gxm/include/PoolingXSMM.hpp000066400000000000000000000040441415223013700242610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include "PoolingImpl.hpp" #include "libxsmm.h" #include "check.hpp" #define CHKERR_LIBXSMM_DNN(A) if ( A != LIBXSMM_DNN_SUCCESS )\ {\ fprintf(stdout, "%s, %s\n", gp->node_name.c_str(), libxsmm_dnn_get_error(A) );\ fflush(stdout);\ } class PoolXSMM : public PoolImpl { protected: PoolImpl *gp_; libxsmm_dnn_pooling_desc pooling_desc; libxsmm_dnn_pooling* libxsmm_handle[NUM_NUMA_NODES]; libxsmm_dnn_tensor* libxsmm_input[NUM_NUMA_NODES] = {NULL}; libxsmm_dnn_tensor* libxsmm_delinput[NUM_NUMA_NODES]={NULL}; libxsmm_dnn_tensor* libxsmm_output[NUM_NUMA_NODES]={NULL}; libxsmm_dnn_tensor* libxsmm_deloutput[NUM_NUMA_NODES]={NULL}; libxsmm_dnn_tensor* libxsmm_mask[NUM_NUMA_NODES]={NULL}; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; bool updated_scratch_fwd=false, updated_scratch_bwd=false; void *scratch=NULL; int prev_scratch_size = 0; public: PoolXSMM(PoolImplParams* gp, int engine); virtual ~PoolXSMM(void) {} // Assume external threading, e.g., #pragma omp void forwardPropagate(TensorBuf *inp, TensorBuf *outp, int *maskp, int tid); void backPropagate(TensorBuf *deloutp, int *maskp, TensorBuf *delinp, int tid); }; 
libxsmm-1.17/samples/deeplearning/gxm/include/ReLU.hpp000066400000000000000000000063061415223013700227570ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" #include "ReLUImpl.hpp" #include "ReLUXSMM.hpp" using namespace std; using namespace gxm; class ReLUParams : public NNParams { public: ReLUParams(void) {} virtual ~ReLUParams(void) {} void set_negative_slope(float s) { neg_slope_ = s; } float get_negative_slope() { return neg_slope_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } void set_compute_engine(int ce) { compute_engine_ = ce; } int get_compute_engine() { return compute_engine_; } void set_algo_type(int at) { algotype_ = at; } int get_algo_type() { return algotype_; } protected: float neg_slope_; int compute_engine_, algotype_, data_type_; }; static MLParams* parseReLUParams(NodeParameter* np) { ReLUParams* rp = new ReLUParams(); // Set name of node string str = np->name(); assert(!str.empty()); rp->set_node_name(str); //Set node type (ReLU) str = np->type(); assert(!str.empty()); rp->set_node_type(str); //Set tensor names assert(np->bottom_size() == 1); assert(!np->bottom(0).empty()); rp->set_bottom_names(np->bottom(0)); assert(np->top_size() == 1); assert(!np->top(0).empty()); rp->set_top_names(np->top(0)); //Set Mode for the node 
assert((np->mode() == TRAIN) || (np->mode() == TEST)); rp->set_mode(np->mode()); //Set backprop needed/not needed flag for this node rp->set_bprop_flag(np->propagate_down()); ReLUParameter p = np->relu_param(); rp->set_negative_slope(p.negative_slope()); rp->set_data_type(p.data_type()); rp->set_compute_engine(p.engine()); rp->set_algo_type(p.algotype()); return rp; } class ReLUNode : public NNNode { public: ReLUNode(ReLUParams* p, MLEngine* e); virtual ~ReLUNode(void) {} protected: void forwardPropagate(); void backPropagate(); void configure(int engine); void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } Tensor* tenTop_; // Output tensor pointer Tensor* tenBot_; // Input tensor pointer ReLUImplParams gparams_; TensorBuf *tenBotDiff_, *tenBotData_; // Data & Gradients with respect to input TensorBuf *tenTopData_, *tenTopDiff_; // Output data int count_; int bot_cengine_; Shape ts_; ReLUImpl *impl; MLEngine* eptr_; }; libxsmm-1.17/samples/deeplearning/gxm/include/ReLUImpl.hpp000066400000000000000000000043661415223013700236050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include #include "common.hpp" #include "check.hpp" #include "Tensor.hpp" typedef struct { string node_name; int bdims, tdims; int nInput, nOutput; int iDepth, iHeight, iWidth; int oDepth, oHeight, oWidth; int batch_size; float negative_slope; int data_type; int algType; int num_threads; }ReLUImplParams; class ReLUImpl { protected: ReLUImplParams *gp; int engine; TensorLayoutType bot_layout_type, top_layout_type, gbot_layout_type; void *bot_layout, *top_layout, *gbot_layout; int top_compute_engine=-1; int bot_compute_engine=-1; public: ReLUImpl(ReLUImplParams* gp_, int engine_): gp(gp_), engine(engine_) {} void set_top_compute_engine(int e) { top_compute_engine = e;} void set_bot_compute_engine(int e) { bot_compute_engine = e;} // Assume external threading, e.g., #pragma omp virtual void forwardPropagate(TensorBuf *inp, TensorBuf *outp, int tid) = 0; virtual void backPropagate(TensorBuf* inp, TensorBuf *deloutp, TensorBuf *delinp, int tid) = 0; virtual void forwardPropagate(TensorBuf *inp, TensorBuf *outp) { switch(engine) { case XSMM: forwardPropagate(inp, outp, 0); break; } } virtual void backPropagate(TensorBuf* inp, TensorBuf *deloutp, TensorBuf *delinp) { switch(engine) { case XSMM: backPropagate(inp, deloutp, delinp, 0); break; } } }; libxsmm-1.17/samples/deeplearning/gxm/include/ReLUXSMM.hpp000066400000000000000000000024631415223013700234640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include "ReLUImpl.hpp" #include "check.hpp" class ReLUXSMM : public ReLUImpl { protected: public: ReLUXSMM(ReLUImplParams* gp, int engine) : ReLUImpl(gp, engine) { top_layout_type = LIBXSMM_CUSTOM_LAYOUT; top_layout = NULL; gbot_layout_type = LIBXSMM_CUSTOM_LAYOUT; gbot_layout = NULL; } // Assume external threading, e.g., #pragma omp void forwardPropagate(TensorBuf *inp, TensorBuf *outp, int tid); void backPropagate(TensorBuf *inp, TensorBuf *deloutp, TensorBuf *delinp, int tid); }; libxsmm-1.17/samples/deeplearning/gxm/include/Shape.h000066400000000000000000000017461415223013700226530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #define MAX_DIMS 8 typedef struct { int ndims; // Number of dimensions in tensor int dims[MAX_DIMS]; //Logical dimensions: for activations assume N,FM,H,W; for weight tensor assume OFM,IFM,KH,KW } Shape; libxsmm-1.17/samples/deeplearning/gxm/include/SoftmaxLoss.hpp000066400000000000000000000063621415223013700244340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" #include "SoftmaxLossImpl.hpp" #include "SoftmaxLossLoop.hpp" using namespace std; using namespace gxm; class SoftmaxLossParams : public NNParams { public: SoftmaxLossParams(void) {} virtual ~SoftmaxLossParams(void) {} void set_axis(int axis) { axis_ = axis; } int get_axis() { return axis_; } void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } void set_loss_weight(vector l) { for(int i=0; i& get_loss_weight() { return loss_weight_; } protected: int axis_, data_type_; vector loss_weight_; }; static MLParams* parseSoftmaxParams(NodeParameter* np) { SoftmaxLossParams *p = new SoftmaxLossParams(); SoftmaxParameter sp = np->softmax_param(); // Set name of node assert(!np->name().empty()); p->set_node_name(np->name()); //Set node type (Convolution, FullyConnected, etc) assert(!np->type().empty()); p->set_node_type(np->type()); //Set tensor names //Set tensor names for(int i=0; ibottom_size(); i++) { assert(!np->bottom(i).empty()); p->set_bottom_names(np->bottom(i)); } assert(np->top_size() == 1); assert(!np->top(0).empty()); p->set_top_names(np->top(0)); //Set Mode for the node assert((np->mode() == TRAIN) || (np->mode() == TEST)); p->set_mode(np->mode()); p->set_bprop_flag(np->propagate_down()); int axis = sp.axis(); p->set_axis(axis); p->set_data_type(sp.data_type()); vector lw; for(int i=0; iloss_weight_size(); i++) lw.push_back(np->loss_weight(i)); p->set_loss_weight(lw); return p; } class SoftmaxLossNode : public NNNode { public: SoftmaxLossNode(SoftmaxLossParams* p, MLEngine* e); virtual 
~SoftmaxLossNode(void) {} void configure(int smaxtype); protected: vector tenBot_; Tensor *tenTop_; TensorBuf *tenTopData_, *tenBotDiff_; vector tenBotData_; string node_name_, node_type_; SMaxLossImplParams gparams_; Shape ts_; vector loss_weight_; float test_loss_; size_t node_id_, num_nodes_; void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } void forwardPropagate(); void backPropagate(); SMaxLossImpl* impl; MLEngine* eptr_; }; libxsmm-1.17/samples/deeplearning/gxm/include/SoftmaxLossImpl.hpp000066400000000000000000000030641415223013700252520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include #include #include "check.hpp" #include "Tensor.hpp" #include "common.hpp" using namespace std; typedef struct { string node_name; int nInput, nOutput; int batch_size; int nBInput, nBOutput; int iBlock, oBlock; float loss; float loss_weight; int num_threads; } SMaxLossImplParams; class SMaxLossImpl { protected: SMaxLossImplParams *gp; size_t num_nodes; public: SMaxLossImpl(SMaxLossImplParams* gp_): gp(gp_) {} void set_num_nodes(size_t n) { num_nodes = n; } size_t get_num_nodes() { return num_nodes; } virtual void forwardPropagate(TensorBuf *inp, TensorBuf *label, TensorBuf *outp) = 0; virtual void backPropagate(TensorBuf *outp, TensorBuf *label, TensorBuf *delinp) = 0; }; libxsmm-1.17/samples/deeplearning/gxm/include/SoftmaxLossLoop.hpp000066400000000000000000000021731415223013700252620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include "SoftmaxLossImpl.hpp" class SMaxLossLoop : public SMaxLossImpl { public: SMaxLossLoop(SMaxLossImplParams* gp) : SMaxLossImpl(gp) {} // Assume external threading, e.g., #pragma omp void forwardPropagate(TensorBuf *inp, TensorBuf* label, TensorBuf *outp); void backPropagate(TensorBuf *outp, TensorBuf* label, TensorBuf *delinp); }; libxsmm-1.17/samples/deeplearning/gxm/include/SoftmaxWithLoss.hpp000066400000000000000000000075311415223013700252670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" using namespace std; using namespace gxm; class SoftmaxLossNode : public NNNode { public: SoftmaxLossNode(SoftmaxLossParams* p, MLEngine* e) : NNNode(p, e) { NNNode::nname_ = p->get_node_name(); NNNode::ntype_ = p->get_node_type(); NNNode::mode_ = p->get_mode(); NNNode::top_ = p->get_top_name(); NNNode::bottom_ = p->get_bottom_name(); NNNode::has_weights_ = false; NNNode::bp_flag_ = true; //Create output tensor this->tenTop_ = new Tensor(NNNode::top_); assert(this->tenTop_ != NULL); this->tenTop_->setOwner(this); tenTopData_ = tenTop_->getBuf(DATA); this->tenBot_ = e->get_tensor(NNNode::bottom_); assert(this->tenBot_ != NULL); this->setPrevNode((NNNode*)this->tenBot_->getOwner()); tenBotData_ = tenBot_->getBuf(DATA); //Output tensor data type = input tensor data type int dtype = this->tenBot_->getBufDataType(DATA); this->tenTop_->setBufDataType(DATA, dtype); Shape* bs = this->tenBot_->getShape(); assert(bs->ndims <= MAX_DIMS); shape_setzero(&ts_); ts_.ndims = 1; ts_.dims[0] = 1; tenTop_->setShape(&ts_); long long int size = 1; for(int i=0; itenTop_->setDataBufferSize(DATA, size); // Register output tensor in tensorMap bool inserted = e->register_tensor(NNNode::top_, this->tenTop_); if(!inserted) printf("Warning: Tensor %s already registered\n",NNNode::top_.c_str()); if(!e->is_inference_only()) { if(NNNode::bp_flag_) { tenBotDiff_ = tenBot_->addBuf(); tenBotDiff_->setDataType(dtype); size = 1; for(int i=0; indims; i++) size = size*bs->dims[i]; if(dtype == DT_FLOAT) size = size*sizeof(float); else if(dtype == DT_INT) size = size*sizeof(int); // Set the size of the input-gradient buffer tenBotDiff_->setBufferSize(size); } } } virtual ~SoftmaxLossNode(void) {} void createTasks(list, int); void createPersistentTask(); void createStrategy(int); void 
enqueTask(int pos); void createCheckPoint(); void restoreCheckPoint(); protected: Tensor *tenBot_, *tenTop_; TensorBuf *tenTopData_, *tenBotData_, *tenBotDiff_; string node_name_, node_type_; Shape ts_; void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } void forwardPropagate(); void backPropagate(); void weightUpdate(); void solverStep(); }; libxsmm-1.17/samples/deeplearning/gxm/include/Solver.hpp000066400000000000000000000134061415223013700234210ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include "assert.h" #include "MLNode.hpp" #include "Engine.hpp" #include #include "libxsmm.h" #include "check.hpp" using namespace std; class SolverParams : public MLParams { public: SolverParams(void){} virtual ~SolverParams(void) {} void setLRPolicy(string p) {lr_policy_ = p;} string getLRPolicy() { return lr_policy_; } void setGamma(float g) { gamma_ = g; } float getGamma() { return gamma_; } void setPower(float p) { power_ = p; } float getPower() { return power_; } void setStepSize(int s) { step_size_ = s; } int getStepSize() { return step_size_; } void setMaxIter(int i) { max_iter_ = i; } int getMaxIter() { return max_iter_; } void setLearningRate(float lr) { lr_.push_back(lr); } float getLearningRate() { return lr_[0]; } void setLearningRates(vector lr) { for(int i=0; i& getLearningRates() const { return lr_; } void setWarmupLR(float lr) { warmup_lr_.push_back(lr); } float getWarmupLR() { return warmup_lr_[0]; 
} void setMomentum(float m) { momentum_.push_back(m); } float getMomentum() { return momentum_[0]; } void setMomentums(vector m) { for(int i=0; i& getMomentums() const { return momentum_; } void setWeightDecay(float d) { decay_.push_back(d); } float getWeightDecay() { return decay_[0]; } void setWeightDecays(vector d) { for(int i=0; i& getWeightDecays() const { return decay_; } void setLRChangeEpochs(vector e) { for(int i=0; i& getLRChangeEpochs() const { return lrcepochs_; } void setStepValues(vector s) { stepvalues_.resize(s.size()); for(int i=0; i& getStepValues() const { return stepvalues_; } void setWarmupEpochs(int we) { warmup_epochs_ = we; } int getWarmupEpochs() { return warmup_epochs_; } void setEpochs(int e) { epochs_ = e; } int getEpochs() { return epochs_; } void setTestEpoch(int te) { test_epoch_ = te; } int getTestEpoch() { return test_epoch_; } void setSolverType(string s) { solver_type_ = s; } string getSolverType() { return solver_type_; } void setGlobalFlag(bool g) { global_ = g; } bool getGlobalFlag() { return global_; } void setDataType(int t) { data_type_ = t; } int getDataType() { return data_type_; } protected: vector lr_, momentum_, decay_, warmup_lr_; vector lrcepochs_, stepvalues_; int epochs_, test_epoch_, step_size_, max_iter_; string solver_type_, lr_policy_; float gamma_, power_; int warmup_epochs_, data_type_; bool global_; }; static SolverParams* parseSolverParams(SolverParameter* p) { SolverParams* sp = new SolverParams(); vector temp; vector itemp; string policy = p->lr_policy(); sp->setLRPolicy(policy); sp->setLearningRate(p->learning_rate(0)); sp->setWarmupLR(p->warmup_lr(0)); sp->setMomentum(p->momentum(0)); sp->setWeightDecay(p->weight_decay(0)); sp->setPower(p->power()); sp->setGamma(p->gamma()); sp->setStepSize(p->stepsize()); sp->setMaxIter(p->max_iter()); if(p->step_values_size() > 0) { itemp.resize(p->step_values_size()); for(int i=0; istep_values(i); sp->setStepValues(itemp); } sp->setWarmupEpochs(p->warmup_epochs()); 
assert(p->max_epochs() >= 1); sp->setEpochs(p->max_epochs()); assert(p->test_epoch() >= 1); sp->setTestEpoch(p->test_epoch()); sp->setSolverType(p->type()); sp->setDataType(p->data_type()); sp->setGlobalFlag(p->global()); return sp; } class SolverNode : public MLNode { public: SolverNode(SolverParams* p, MLEngine* e); virtual ~SolverNode(void) {} void applyUpdate(float**, float**, void**, int, float, float, string); void applyUpdate(float*, float*, void*, int, float, float, string); void applyUpdate(float*, float*, void*, int, float*, float*, string); void applyUpdate(float**, float**, void**, int, float**, float**, string); void convert_bf16_f32(libxsmm_bfloat16**, float**, int); void convert_bf16_f32(libxsmm_bfloat16*, float*, int); bool getGlobalFlag() { return global_; } protected: vector lr_, momentum_, decay_; vector lrcepochs_, stepvalues_; int epochs_, test_epoch_, step_size_, max_iter_; int stepidx_, warmup_max_epoch_; int data_type_; bool global_; string solver_type_, lr_policy_; map> hpmap_; float base_lr_, lrval_, mval_, decayval_; float gamma_, power_, warmup_lr_; float mc_, mc1_, mc2_, prev_lrval_=-1, prev_lrval_1_=-1; float *tmp_grad[NUM_NUMA_NODES]={NULL}; MLEngine *eptr_; }; libxsmm-1.17/samples/deeplearning/gxm/include/Split.hpp000066400000000000000000000060541415223013700232430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include #include "assert.h" #include "Node.hpp" #include "Engine.hpp" #include "Params.hpp" #include "Tensor.hpp" #include "proto/gxm.pb.h" #include "SplitImpl.hpp" #include "SplitLoop.hpp" using namespace std; using namespace gxm; class SplitParams : public NNParams { public: SplitParams(void) {} virtual ~SplitParams(void) {} void set_data_type(int t) { data_type_ = t; } int get_data_type() { return data_type_; } void set_compute_engine(int ce) { compute_engine_ = ce; } int get_compute_engine() { return compute_engine_; } protected: int compute_engine_, data_type_; }; static MLParams* parseSplitParams(NodeParameter* np) { SplitParams* sp = new SplitParams(); // Set name of node string str = np->name(); assert(!str.empty()); sp->set_node_name(str); //Set node type (Convolution, FullyConnected, etc) str = np->type(); assert(!str.empty()); sp->set_node_type(str); //Set tensor names assert(np->bottom_size() == 1); assert(!np->bottom(0).empty()); sp->set_bottom_names(np->bottom(0)); for(int i=0; itop_size(); i++) sp->set_top_names(np->top(i)); //Set Mode for the node assert((np->mode() == TRAIN) || (np->mode() == TEST)); sp->set_mode(np->mode()); //Set backprop needed/not needed flag for this node sp->set_bprop_flag(np->propagate_down()); SplitParameter psp = np->split_param(); sp->set_data_type(psp.data_type()); sp->set_compute_engine(psp.engine()); return sp; } class SplitNode : public NNNode { public: SplitNode(SplitParams* p, MLEngine* e); virtual ~SplitNode(void) {} protected: void forwardPropagate(); void backPropagate(); void configure(int engine); void convert_bf16_f32(libxsmm_bfloat16*, float*, int); void shape_setzero(Shape* s) { for(int i=0; idims[i] = 0; } vectortenTop_; Tensor *tenBot_; vector tenTopData_, tenTopDiff_; TensorBuf *tenBotData_, *tenBotDiff_; int bot_cengine_; int count_, in_dtype, out_dtype; float *stptr=NULL, cbptr[16]; SplitImplParams 
gparams_; SplitImpl *impl=NULL; MLEngine* eptr_; }; libxsmm-1.17/samples/deeplearning/gxm/include/SplitImpl.hpp000066400000000000000000000043301415223013700240600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include #include #include "common.hpp" #include "check.hpp" #include "Tensor.hpp" typedef struct { int bdims, tdims; int nInput; vector nOutput; int batch_size; int iHeight, iWidth, iDepth; int oHeight, oWidth, oDepth; int stride_h, stride_w, stride_d; int in_data_type, out_data_type; int num_threads; } SplitImplParams; class SplitImpl { protected: SplitImplParams *gp; int engine; TensorLayoutType top_layout_type, gbot_layout_type; void *top_layout, *gbot_layout; int bot_compute_engine=-1; vector top_compute_engine; public: SplitImpl(SplitImplParams* gp_, int engine_) : gp(gp_), engine(engine_) {} void set_top_compute_engine(int e) { top_compute_engine.push_back(e);} void set_bot_compute_engine(int e) { bot_compute_engine = e;} virtual void forwardPropagate(TensorBuf *inp, vector& outp, int tid) = 0; virtual void backPropagate(vector& deloutp, TensorBuf *delinp, int tid) = 0; virtual void forwardPropagate(TensorBuf *inp, vector& outp) { switch(engine) { case XSMM: forwardPropagate(inp, outp, 0); break; } } virtual void backPropagate(vector& deloutp, TensorBuf* delinp) { switch(engine) { case XSMM: backPropagate(deloutp, delinp, 0); break; } } }; 
libxsmm-1.17/samples/deeplearning/gxm/include/SplitLoop.hpp000066400000000000000000000023451415223013700240740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include "SplitImpl.hpp" class SplitLoop : public SplitImpl { public: SplitLoop(SplitImplParams* gp, int engine) : SplitImpl(gp, engine) { top_layout_type = NCHWV; top_layout = NULL; gbot_layout_type = NCHWV; gbot_layout = NULL; } void forwardPropagate(TensorBuf *inpb, vector& outpb, int tid); void backPropagate(vector& deloutpb, TensorBuf *delinpb, int tid); }; libxsmm-1.17/samples/deeplearning/gxm/include/Task.hpp000066400000000000000000000062401415223013700230470ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #pragma once #include #include "MLNode.hpp" #include #include "Tensor.hpp" #define BASIC_TASK_FORW 0 #define BASIC_TASK_BACK 1 #define BASIC_TASK_WGRAD 2 #define BASIC_TASK_SOLVE 3 #define CUSTOM_TASK_START 100 using namespace std; using namespace gxm; class Task { protected: MLNode *node_; int taskId_; int basicTaskId_; int minBin_, maxBin_; vector inputs_; vector outputs_; vector subTasks_; Task *parent_; public: Task(MLNode* n, int taskId, int basicTaskId) { this->node_ = n; this->taskId_ = taskId; this->basicTaskId_ = basicTaskId; this->minBin_ = 0; this->maxBin_ = 0; parent_ = NULL; } virtual ~Task(void) {} Task *createSubTask(int taskId) { Task *subTask = new Task(this->node_, taskId, basicTaskId_); this->subTasks_.push_back(subTask); subTask->parent_ = this; return subTask; } bool addForwDep(Task *dest) { if(dest == NULL) return false; // add only if task is not in the list if(std::find(outputs_.begin(), outputs_.end(), dest) == outputs_.end()) { this->outputs_.push_back(dest); if(std::find(dest->inputs_.begin(), dest->inputs_.end(), this) == dest->inputs_.end()) dest->inputs_.push_back(this); return true; } else return false; } bool addBackDep(Task *src) { if(src == NULL) return false; // add only if task is not in the list if(std::find(inputs_.begin(), inputs_.end(), src) == inputs_.end()) { this->inputs_.push_back(src); if(std::find(src->outputs_.begin(), src->outputs_.end(), this) == src->outputs_.end()) src->outputs_.push_back(this); return true; } else return false; } vector& getForwDepTasks() { return this->outputs_; } vector& getBackDepTasks() { return this->inputs_; } void setMinBin(int bin) { minBin_ = bin; } void setMaxBin(int bin) { maxBin_ = bin; } int getMinBin() { return minBin_; } int getMaxBin() { return maxBin_; } int getBasicTaskId() {return basicTaskId_; } int getTaskId() {return taskId_; } MLNode* getNode() { return node_; } void invoke() { 
node_->executeTask(basicTaskId_); } inline int numInputs() { return inputs_.size(); } inline int numOutputs() { return outputs_.size(); } }; libxsmm-1.17/samples/deeplearning/gxm/include/Tensor.hpp000066400000000000000000000132731415223013700234230ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include #include "MLNode.fwd.hpp" #include "Shape.h" using namespace std; enum TensorDataType {DT_FLOAT, DT_INT, DT_BF16, DT_INT16, DT_DFP8, DT_INT8}; enum TensorBufType {DATA, DIFF, HISTORY, PRIVATE}; //also used as indices into tBuf_; should change enum TensorType {INPUT, LABEL, ACT, INPUT_ACT, ACT_LABEL, CONVWEIGHT, CONVBIAS, FCWEIGHT, FCBIAS, BNORMSCALE, BNORMSHIFT, BNORMMEAN, BNORMVAR}; enum TensorLayoutType {NCHW, NHWC, NCHWV, KCRS, RSCK, LIBXSMM_CUSTOM_LAYOUT, NUM_LAYOUTS}; class Tensor; class TensorBuf { protected: Tensor *tensor_; void *buf_; // Pointer to buffer void *lpbuf_; // Pointer to LP object void *prv_buf_; void *lp_prv_buf_; void **bufptr_; void **lpbufptr_; TensorLayoutType layout_type_; void *layout_; int offset_; int dType_; // Data type for this buffer int bType_; // Type of buffer (DATA/DIFF/HISTORY) long long int size_; // Size of this buffer int bin_; // Bin number assigned to this buffer public: TensorBuf(Tensor* tensor, int dtype = DT_FLOAT, int size = 0) : tensor_(tensor) { buf_ = NULL; lpbuf_ = NULL; prv_buf_ = NULL; lp_prv_buf_ = NULL; bufptr_ = NULL; lpbufptr_ = NULL; layout_type_ = 
NCHW; layout_ = NULL; offset_ = 0; dType_ = dtype; size_ = size; bin_ = 0; } Tensor* getTensor() { return tensor_; } void setBin(int bin) { bin_ = bin; } void setDataType(int t) { dType_ = t; } int getDataType() { return dType_; } void setBufferType(int t) { bType_ = t; } int getBufferType() { return bType_; } void setBufferSize(long long int size) { size_ = size; } long long int getBufferSize() { return size_; } void setOffset(int offset) { offset_ = offset; } int getOffset() { return offset_; } int getBin() { return bin_; } void setBuffer(void* bptr) { buf_ = bptr; } void* getBuffer() { return buf_; } void setLPBuffer(void* bptr) { lpbuf_ = bptr; } void* getLPBuffer() { return lpbuf_; } void setPrivBuffer(void* bptr) { prv_buf_ = bptr; } void* getPrivBuffer() { return prv_buf_; } void setLPPrivBuffer(void* bptr) { lp_prv_buf_ = bptr; } void* getLPPrivBuffer() { return lp_prv_buf_; } void setBufferPtr(void** bptr) { bufptr_ = bptr; } void** getBufferPtr() { return bufptr_; } void setLPBufferPtr(void** bptr) { lpbufptr_ = bptr; } void** getLPBufferPtr() { return lpbufptr_; } void setLayoutType(TensorLayoutType lt) { layout_type_ = lt; } TensorLayoutType getLayoutType() { return layout_type_; } void setLayout(void *layptr) { layout_ = layptr; } void* getLayout() { return layout_; } }; class Tensor { protected: string name_; Shape shape_; // Base logical shape of this tensor vector tBuf_; // Structure holding pointer to buffer, its size, type and bin TensorType tType_; // Type of this tensor (Activation, Weight etc) MLNode *owner_; void *layout_; // Layout for this tensor (applies to all buffers) TensorLayoutType layout_type_; // Layout type for this tensor (applies to all buffers) public: Tensor(string name) { this->name_ = name; tBuf_.push_back(new TensorBuf(this)); // Assume that tBuf_[0] is always the foward pass buffer layout_ = NULL; layout_type_ = NCHW; } virtual ~Tensor(void) {} MLNode *getOwner() { return owner_; } void setOwner(MLNode *owner) { owner_ = 
owner; } TensorBuf *addBuf(int dtype = DT_FLOAT, int size = 0) { TensorBuf *tb = new TensorBuf(this, dtype, size); this->tBuf_.push_back(tb); return tb; } void setShape(Shape* shape) { assert(shape->ndims <= MAX_DIMS); shape_.ndims = shape->ndims; for(int i=0; indims; i++) shape_.dims[i] = shape->dims[i]; for(int i=shape->ndims; isetDataType(tdt); } int getBufDataType(int bufId) { return this->tBuf_[bufId]->getDataType(); } string getTensorName() { return name_; } int getNumDataBuffers() { return tBuf_.size(); } TensorBuf *getBuf(int bufId) { if(bufId < tBuf_.size()) return this->tBuf_[bufId]; else return NULL; } void setDataBuffer(int bufId, void* ptr) { this->tBuf_[bufId]->setBuffer(ptr); } void* getDataBuffer(int bufId) { return this->tBuf_[bufId]->getBuffer(); } void setDataBufferSize(int bufId, long long int size) { this->tBuf_[bufId]->setBufferSize(size); } long long int getDataBufferSize(int bufId) { return this->tBuf_[bufId]->getBufferSize(); } int getBufBin(int bufId) { return this->tBuf_[bufId]->getBin(); } }; libxsmm-1.17/samples/deeplearning/gxm/include/TypeList.hpp000066400000000000000000000020251415223013700237170ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ typedef MLParams *(*ParseFunc)(NodeParameter* np); typedef MLNode *(*CreateFunc)(MLParams* p, MLEngine* e); typedef struct TypeList_{ std::string typeName; ParseFunc parse; CreateFunc create; } TypeList; extern TypeList nodeTypes[]; extern const int numTypes; libxsmm-1.17/samples/deeplearning/gxm/include/check.hpp000066400000000000000000000024701415223013700232230ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include #include "libxsmm.h" void check_physical_pad(const char *s, float *tensor, int nImg, int nBfm, int fh, int fw, int ifm, int iph, int ipw ); void check_physical_pad(const char *s, libxsmm_bfloat16 *tensor, int nImg, int nBfm, int fh, int fw, int ifm, int iph, int ipw ); void MeanOfLayer(char *s, float *array, int size); void MeanOfLayer(char *s, double *array, int size); void MeanOfLayer(char *s, int *array, int size); void MeanOfLayer(char *s, libxsmm_bfloat16 *array, int size); libxsmm-1.17/samples/deeplearning/gxm/include/common.hpp000066400000000000000000000060571415223013700234430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #ifdef USE_MLSL #include "mlsl.hpp" #endif #include #define CHECK_ERR(f, err) do { \ (err) = (f); \ if ((err) != E_SUCCESS) { \ printf("[%s:%d] err (%d)\n", __FILE__, __LINE__, err); \ exit(-1); \ } \ } while(0) #define MIN_VAL -FLT_MAX #define STATFREQ 1 #define LOOP 0 #define XSMM 1 #define ELSUM 0 #define ELPROD 1 #define ELMAX 2 #define AUTO 0 #define DIRECT 1 #define NUM_NUMA_NODES 2 #define ALIGN_SIZE(x, a) ~(a-1) & (x + a - 1); #define _FIXUP_INPUT_CODE_QNAN 0 #define _FIXUP_INPUT_CODE_SNAN 1 #define _FIXUP_INPUT_CODE_NINF 4 #define _FIXUP_INPUT_CODE_PINF 5 #define _FIXUP_OUTPUT_CODE_COPY_INPUT 1 #define _FIXUP_OUTPUT_CODE_QNAN_INPUT 2 #define ENCODE_FIXUP_SELECTOR(input,output) ((output) << (4*(input))) static const int gxm_selector_int32 = ENCODE_FIXUP_SELECTOR(_FIXUP_INPUT_CODE_SNAN, _FIXUP_OUTPUT_CODE_QNAN_INPUT) | /* Qnan input to Qnan output (presenrving input bits 0..21) */ ENCODE_FIXUP_SELECTOR(_FIXUP_INPUT_CODE_QNAN, _FIXUP_OUTPUT_CODE_QNAN_INPUT) | /* Snan input to Qnan output (presenrving input bits 0..21) */ ENCODE_FIXUP_SELECTOR(_FIXUP_INPUT_CODE_NINF, _FIXUP_OUTPUT_CODE_COPY_INPUT) | /* Neg Inf input copied to output */ ENCODE_FIXUP_SELECTOR(_FIXUP_INPUT_CODE_PINF, _FIXUP_OUTPUT_CODE_COPY_INPUT); /* Pos Inf input copied to output */ static __m512 gxm_fp32_to_bfp16_rne_adjustment_avx512f(__m512 vfp32) { const __m512i vrne_even = _mm512_set1_epi32(0x00007fff); const __m512i one = _mm512_set1_epi32(1); const __m512i selector = _mm512_set1_epi32(gxm_selector_int32); __m512i vfp32_as_int = _mm512_castps_si512(vfp32); __m512i odd = _mm512_and_si512(_mm512_srli_epi32(vfp32_as_int, 16), one); __m512i rounding_factor = 
_mm512_add_epi32(vrne_even, odd); vfp32_as_int = _mm512_add_epi32(vfp32_as_int, rounding_factor); return _mm512_fixupimm_ps(_mm512_castsi512_ps(vfp32_as_int), vfp32, selector, 0); } static __m256i gxm_fp32_to_bfp16_truncate_avx512f(__m512 vfp32) { __m512i vbfp16_32 = _mm512_srai_epi32(_mm512_castps_si512(vfp32), 16); return _mm512_cvtepi32_epi16(vbfp16_32); } static __m512 gxm_bfp16_to_fp32_avx512f(__m256i vbfp16) { __m512i vbfp16_32 = _mm512_cvtepi16_epi32(vbfp16); return _mm512_castsi512_ps(_mm512_slli_epi32(vbfp16_32, 16)); } libxsmm-1.17/samples/deeplearning/gxm/include/db.hpp000066400000000000000000000027451415223013700225400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #ifndef _DB_HPP_ #define _DB_HPP_ #include #include using namespace std; class Cursor { public: Cursor() { } virtual ~Cursor() { } virtual void SeekToFirst() = 0; virtual void Next(int skip = 0) = 0; virtual string key() = 0; virtual string value() = 0; virtual int count() = 0; virtual std::pair valuePointer() = 0; virtual bool valid() = 0; //DISABLE_COPY_AND_ASSIGN(Cursor); }; class DB { public: DB() { } virtual ~DB() { } virtual void Open(const string& source)= 0; virtual void Close() = 0; virtual Cursor* NewCursor() = 0; //DISABLE_COPY_AND_ASSIGN(DB); }; //DB* GetDB(DataParameter::DB backend); DB* GetDB(const string& backend); #endif // _DB_HPP_ libxsmm-1.17/samples/deeplearning/gxm/include/db_lmdb.hpp000066400000000000000000000063561415223013700235400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #ifdef USE_LMDB #ifndef _LMDB_HPP_ #define _LMDB_HPP_ #include #include #include #include "lmdb.h" #include "db.hpp" inline void MDB_CHECK(int mdb_status) { //CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status); if(mdb_status != MDB_SUCCESS) { printf("MDB Error: %s\n",mdb_strerror(mdb_status)); exit(1); } } class LMDBCursor : public Cursor { public: explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor, int count) : mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false), count_(count) { SeekToFirst(); } virtual ~LMDBCursor() { mdb_cursor_close(mdb_cursor_); mdb_txn_abort(mdb_txn_); } virtual void SeekToFirst() { Seek(MDB_FIRST); } virtual void Next(int skip = 0) { Seek(MDB_NEXT, skip); } virtual string key() { return string(static_cast(mdb_key_.mv_data), mdb_key_.mv_size); } virtual string value() { return string(static_cast(mdb_value_.mv_data), mdb_value_.mv_size); } virtual int count() { return count_; } virtual std::pair valuePointer() { return std::make_pair(mdb_value_.mv_data, mdb_value_.mv_size); } virtual bool valid() { return valid_; } private: void Seek(MDB_cursor_op op, int skip = 0) { int mdb_status = MDB_SUCCESS; if(op == MDB_NEXT) for(int i = 0; i < skip; i++) { mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, 0, MDB_NEXT); if (mdb_status == MDB_NOTFOUND) { mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, 0, MDB_FIRST); //printf("LMDB wrap around\n"); } if (mdb_status != MDB_SUCCESS) break; } if(mdb_status == MDB_SUCCESS) mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op); if (mdb_status == MDB_NOTFOUND) { mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_FIRST); //printf("LMDB wrap around\n"); //valid_ = false; } MDB_CHECK(mdb_status); valid_ = true; } MDB_txn* mdb_txn_; MDB_cursor* mdb_cursor_; MDB_val mdb_key_, mdb_value_; bool valid_; int count_; }; class LMDB : public DB { public: LMDB() : mdb_env_(NULL) { 
} virtual ~LMDB() { Close(); } virtual void Open(const string& sourc); virtual void Close() { if (mdb_env_ != NULL) { mdb_dbi_close(mdb_env_, mdb_dbi_); mdb_env_close(mdb_env_); mdb_env_ = NULL; } } virtual LMDBCursor* NewCursor(); private: MDB_env* mdb_env_; MDB_dbi mdb_dbi_; }; #endif #endif libxsmm-1.17/samples/deeplearning/gxm/include/fillers.hpp000066400000000000000000000024371415223013700236110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include #include #include #include #include #include "Tensor.hpp" using namespace std; void Uniform(const float lower, const float upper, int n, float *ptr); void Gaussian(float mean, float stddev, int n, float *ptr); void initBuffer(void*, int vnorm, int fanin, int fanout, long long int, string, float std=0); void initConstantBuffer(void*, long long int, string, float); void initConstantBuffer(void*, long long int, string, short); libxsmm-1.17/samples/deeplearning/gxm/include/io.hpp000066400000000000000000000040141415223013700225510ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #pragma once #include #include #include #include #include #include #include #ifdef USE_OPENCV #include #include #include #include #endif // USE_OPENCV #include "proto/gxm.pb.h" using namespace std; using namespace gxm; using google::protobuf::io::FileInputStream; using google::protobuf::io::FileOutputStream; using google::protobuf::io::ZeroCopyInputStream; using google::protobuf::io::CodedInputStream; using google::protobuf::io::ZeroCopyOutputStream; using google::protobuf::io::CodedOutputStream; using google::protobuf::Message; bool ReadProtoFromText(string, Message*); bool ReadProtoFromBinary(string, Message*); void WriteProtoToText(const Message&, string); void ReadNWriteMeanFile(string, Message*, string); void initSeeds(unsigned int*, int); void CVMatToDatum(const cv::Mat& cv_img, Datum* datum); bool DecodeDatum(Datum* datum, bool is_color); bool DecodeDatumNative(Datum* datum); cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color); cv::Mat DecodeDatumToCVMatNative(const Datum& datum); libxsmm-1.17/samples/deeplearning/gxm/parse.sh000077500000000000000000000121541415223013700214630ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Evangelos Georganas, Alexander Heinecke (Intel Corp.) 
############################################################################### #Usage: ./parse.sh output_log_file batch_id NEXT_BID=$(($2+1)) N_LINES=$(cat $1 | grep -A 10000 'Executing batch number '$2'' | grep -B 10000 'Executing batch number '$NEXT_BID''| wc -l | tr -d ' ') N_LINES=$(($N_LINES-1)) echo "================================================" echo "************ Convolutions timings **************" echo "LIBXSMM Conv FWD time in ms" FP_TIME=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'XSMM-CONV-FP mb' | cut -d "=" -f2 | cut -d 'm' -f1 | paste -sd+ | bc) FP_TIME=$(echo "${FP_TIME}" | bc) echo $FP_TIME echo "LIBXSMM Conv BWD time in ms" BP_TIME=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'XSMM-CONV-BP mb' | cut -d "=" -f2 | cut -d 'm' -f1 | paste -sd+ | bc) BP_TIME=$(echo "${BP_TIME}" | bc) echo $BP_TIME echo "LIBXSMM Conv UPD time in ms" WU_TIME=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'XSMM-CONV-WU mb' | cut -d "=" -f2 | cut -d 'm' -f1 | paste -sd+ | bc) WU_TIME=$(echo "${WU_TIME}" | bc) echo $WU_TIME echo "LIBXSMM Conv total time in ms for minibatch $2" TOTAL_TIME=$(echo "${FP_TIME}+${BP_TIME}+${WU_TIME}" | bc) echo $TOTAL_TIME echo "------------------------------------------------" echo "Conv FWD time in ms" FP_TIME=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'conv' | grep -v 'pool' | grep "task 0" | awk -F" " '{print $7}' | paste -sd+ | bc) echo $FP_TIME echo "Conv BWD time in ms" BP_TIME=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'conv' | grep -v 'pool' | grep "task 1" | grep -v "split" | awk -F" " '{print $7}' | paste -sd+ | bc) echo $BP_TIME echo "Conv UPD time in ms" WU_TIME=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'conv' | grep -v 'pool' | grep "task 2" | grep -v "split" | awk -F" " '{print $7}' | paste -sd+ | bc) echo $WU_TIME echo "================================================" echo "************ 
Batch norm timings ****************" echo "LIBXSMM BN FWD time in ms" BN_F=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'XSMM-BN-FP mb' | cut -d "=" -f2 | cut -d 'm' -f1 | paste -sd+ | bc) BN_F=$(echo "${BN_F}" | bc) echo $BN_F echo "LIBXSMM BN BWD time in ms" BN_B=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'XSMM-BN-BP mb' | cut -d "=" -f2 | cut -d 'm' -f1 | paste -sd+ | bc) BN_B=$(echo "${BN_B}" | bc) echo $BN_B echo "LIBXSMM BN total time in ms for minibatch $2" TOTAL_TIME=$(echo "${BN_F}+${BN_B}" | bc) echo $TOTAL_TIME echo "------------------------------------------------" echo "BN FWD time in ms" BN_F=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'bn' | grep "task 0" | awk -F" " '{print $7}' | paste -sd+ | bc) echo $BN_F echo "BN BWD time in ms" BN_B=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'bn' | grep "task 1" | grep -v "split" | awk -F" " '{print $7}' | paste -sd+ | bc) echo $BN_B echo "================================================" echo "************ Split timings *********************" echo "Split BWD time in ms" SPLIT_B=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'bn' | grep "task 1" | grep "split" | awk -F" " '{print $7}' | paste -sd+ | bc) echo $SPLIT_B echo "================================================" echo "************ Pool timings **********************" echo "Pool FWD time in ms" POOL_F=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'pool' | grep "task 0" | awk -F" " '{print $7}' | paste -sd+ | bc) echo $POOL_F echo "POOL BWD time in ms" POOL_B=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'pool' | grep "task 1" | awk -F" " '{print $7}' | paste -sd+ | bc) echo $POOL_B echo "========================================" echo "************ FC timings ****************" echo "FC time in ms" FC=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep 'fc' | awk -F" " '{print $7}' | 
paste -sd+ | bc) echo $FC echo "========================================" echo "************ SGD timings ****************" echo "SGD time in ms" SGD=$(cat $1 | grep -A ${N_LINES} 'Executing batch number '$2'' | grep "task 3" | awk -F" " '{print $7}' | paste -sd+ | bc) echo $SGD echo "=========================" echo "Total time in ms for minibatch $2" TOTAL_TIME=$(echo "${FP_TIME}+${BP_TIME}+${WU_TIME}+${BN_F}+${BN_B}+${SPLIT_B}+${FC}+${SGD}+${POOL_F}+${POOL_B}" | bc) echo $TOTAL_TIME libxsmm-1.17/samples/deeplearning/gxm/proto/000077500000000000000000000000001415223013700211525ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/gxm/proto/gxm.proto000066400000000000000000001030211415223013700230270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ syntax = "proto2"; package gxm; enum Mode { TRAIN = 0; VAL = 1; TEST = 2; } enum DataType { FLOAT = 0; INT = 1; BF16 = 2; FP16 = 3; INT16 = 4; INT8 = 5; } enum ComputeEngine { LOOP = 0; XSMM = 1; } enum AlgorithmType { AUTO = 0; DIRECT = 1; } //Machine parameters message MachineParameter { optional int32 num_machine_groups = 1 [default = 1]; optional int32 num_machines = 2 [default = 1]; optional int32 num_cores_per_machine = 3; optional int32 num_threads_per_core = 4 [default = 1]; optional int32 SIMD_width = 5; optional uint32 max_endpoints = 6 [default = 8]; optional uint32 io_endpoints = 7 [default = 2]; } // Tensor shape. 
message TensorShape { repeated int32 dim = 1 [packed = true]; } message TensorProto { repeated float data = 1 [packed = true]; } message TensorProtoVector { repeated TensorProto tensors = 1; } message Datum { optional int32 channels = 1; optional int32 height = 2; optional int32 width = 3; // the actual image data, in bytes optional bytes data = 4; optional int32 label = 5; // Optionally, the datum could also hold float data. repeated float float_data = 6; // If true data contains an encoded image that need to be decoded optional bool encoded = 7 [default = false]; } message FillerParameter { // The filler type. optional string type = 1 [default = 'constant']; optional float value = 2 [default = 0]; // the value in constant filler optional float min = 3 [default = 0]; // the min value in uniform filler optional float max = 4 [default = 1]; // the max value in uniform filler optional float mean = 5 [default = 0]; // the mean value in Gaussian filler optional float std = 6 [default = 1]; // the std value in Gaussian filler // The expected number of non-zero output weights for a given input in // Gaussian filler -- the default -1 means don't perform sparsification. optional int32 sparse = 7 [default = -1]; // Normalize the filler variance by fan_in, fan_out, or their average. // Applies to 'xavier' and 'msra' fillers. enum VarianceNorm { FAN_IN = 0; FAN_OUT = 1; AVERAGE = 2; } optional VarianceNorm variance_norm = 8 [default = FAN_IN]; } message NTGParameter { optional string name = 1; // consider giving the network a name optional string net = 2; // Proto file for net optional NTGParameter test_net_param = 3; // Inline test net params optional NetState state = 4; // State of NTG // The nodes that make up the graph. Each of their configurations, including // connectivity and behavior, is specified as a NodeParameter. repeated NodeParameter node = 100; // ID 100 so nodes are printed last. 
} // NOTE // Update the next available ID when you add a new SolverParameter field. // // SolverParameter next available ID: 24 (last added: warmup_lr) message SolverParameter { // The number of epochs for each net. optional int32 max_epochs = 1; // The number of epochs to test after. optional int32 test_epoch = 2 [default = 1]; repeated int32 lr_change_epochs = 3; // epochs at which learning rate changes repeated float learning_rate = 4; // The base learning rate repeated float warmup_lr = 23; // The base learning rate repeated float momentum = 5; // The momentum value. repeated float weight_decay = 6; // The weight decay. // The learning rate decay policy. The currently implemented learning rate // policies are as follows: // - fixed: always return base_lr. // - pcl_dnn: fixed sequence of learning rates // - step: return base_lr * gamma ^ (floor(iter / step)) // - exp: return base_lr * gamma ^ iter // - inv: return base_lr * (1 + gamma * iter) ^ (- power) // - multistep: similar to step but it allows non uniform steps defined by // stepvalue // - poly: the effective learning rate follows a polynomial decay, to be // zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) // - sigmoid: the effective learning rate follows a sigmod decay // return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) // // where base_lr, max_iter, gamma, step, stepvalue and power are defined // in the solver parameter protocol buffer, and iter is the current iteration. optional string lr_policy = 13; optional float gamma = 14; // The parameter to compute the learning rate. optional float power = 15; // The parameter to compute the learning rate. 
// the stepsize for learning rate policy "step" optional int32 stepsize = 16; optional int32 max_iter = 17; repeated int32 step_values = 19; optional bool load_checkpoint = 7 [default = false]; // The snapshot interval optional string checkpoint_dir = 8; optional string checkpoint_format = 18; // type of the solver optional string type = 9 [default = "SGD"]; // numerical stability for RMSProp, AdaGrad and AdaDelta and Adam optional float delta = 10 [default = 1e-8]; // parameters for the Adam solver optional float momentum2 = 11 [default = 0.999]; // RMSProp decay value // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) optional float rms_decay = 12; optional int32 warmup_epochs = 20 [default = 5]; optional bool global = 21 [default = false]; optional DataType data_type = 22 [default = FLOAT]; } // A message that stores the solver snapshots message SolverState { optional int32 epoch = 1; // The current iteration optional string learned_net = 2; // The file that stores the learned net. repeated TensorProto history = 3; // The history for sgd solvers optional int32 current_step = 4 [default = 0]; // The current step for learning rate } message NetState { optional Mode mode = 1 [default = TRAIN]; optional int32 level = 2 [default = 0]; repeated string stage = 3; } message NetStateRule { // Set mode to require the NetState have a particular mode (TRAIN or TEST) // to meet this rule. optional Mode mode = 1; // Set the minimum and/or maximum levels in which the node should be used. // Leave undefined to meet the rule regardless of level. optional int32 min_level = 2; optional int32 max_level = 3; // Customizable sets of stages to include or exclude. // The net must have ALL of the specified stages and NONE of the specified // "not_stage"s to meet the rule. // (Use multiple NetStateRules to specify conjunctions of stages.) 
repeated string stage = 4; repeated string not_stage = 5; } // Specifies training parameters (multipliers on global learning constants, // and the name and other settings used for weight sharing). message ParamSpec { // The names of the parameter blobs -- useful for sharing parameters among // layers, but never required otherwise. To share a parameter between two // layers, give it a (non-empty) name. optional string name = 1; // Whether to require shared weights to have the same shape, or just the same // count -- defaults to STRICT if unspecified. optional DimCheckMode share_mode = 2; enum DimCheckMode { // STRICT (default) requires that num, channels, height, width each match. STRICT = 0; // PERMISSIVE requires only the count (num*channels*height*width) to match. PERMISSIVE = 1; } // The multiplier on the global learning rate for this parameter. optional float lr_mult = 3 [default = 1.0]; // The multiplier on the global weight decay for this parameter. optional float decay_mult = 4 [default = 1.0]; } // // NodeParameter message NodeParameter { optional string name = 1; // the node name optional string type = 2; // the node type repeated string bottom = 3; // the name of each bottom tensor repeated string top = 4; // the name of each top tensor // The train / test mode for computation. optional Mode mode = 5; // The amount of weight to assign each top blob in the objective. // Each node assigns a default value, usually of either 0 or 1, // to each top blob. repeated float loss_weight = 6; // The tensors containing the numeric parameters of the node. repeated TensorProto tensors = 7; // Specifies on which bottoms the backpropagation should be skipped. // The size must be either 0 or equal to the number of bottoms. optional bool propagate_down = 8 [default = true]; // Rules controlling whether and when a node is included in the network, // based on the current NetState. You may specify a non-zero number of rules // to include OR exclude, but not both. 
If no include or exclude rules are // specified, the node is always included. If the current NetState meets // ANY (i.e., one or more) of the specified rules, the node is // included/excluded. repeated NetStateRule include = 9; repeated NetStateRule exclude = 10; // Allow recursive definition of a Node optional NodeParameter node = 11; // Specifies training parameters (multipliers on global learning constants, // and the name and other settings used for weight sharing). repeated ParamSpec param = 12; // Parameters for data pre-processing. // Parameters shared by loss nodes. optional LossParameter loss_param = 101; // Node type-specific parameters. // // Note: certain nodess may have more than one computational engine // for their implementation. These nodess include an Engine type and // engine parameter for selecting the implementation. Last ID = 123 // The default for the engine is set by the ENGINE switch at compile-time. optional AccuracyParameter accuracy_param = 102; optional ArgMaxParameter argmax_param = 103; optional ConcatParameter concat_param = 104; optional ConvolutionParameter convolution_param = 105; optional DataParameter data_param = 106; optional DropoutParameter dropout_param = 107; optional DummyDataParameter dummy_data_param = 108; optional EltwiseParameter eltwise_param = 109; optional EmbedParameter embed_param = 110; optional FullyConnectedParameter fc_param = 111; optional FusedBNormParameter fused_bnorm_param = 112; optional FusedConvBNParameter fused_conv_bn_param = 123; optional LogParameter log_param = 113; optional PoolingParameter pooling_param = 114; optional PReLUParameter prelu_param = 115; optional ReLUParameter relu_param = 116; optional SoftmaxParameter softmax_param = 117; optional SPPParameter spp_param = 118; optional SplitParameter split_param = 119; optional TanHParameter tanh_param = 120; optional ThresholdParameter threshold_param = 121; optional WindowDataParameter window_data_param = 122; } // Messages that store 
parameters used by individual nodes types follow, in // alphabetical order. message AccuracyParameter { // When computing accuracy, count as correct by comparing the true label to // the top k scoring classes. By default, only compare to the top scoring // class (i.e. argmax). optional uint32 top_k = 1 [default = 1]; // The "label" axis of the prediction blob, whose argmax corresponds to the // predicted label -- may be negative to index from the end (e.g., -1 for the // last axis). For example, if axis == 1 and the predictions are // (N x C x H x W), the label blob is expected to contain N*H*W ground truth // labels with integer values in {0, 1, ..., C-1}. optional int32 axis = 2 [default = 1]; // If specified, ignore instances with the given label. optional int32 ignore_label = 3; } message ArgMaxParameter { // If true produce pairs (argmax, maxval) optional bool out_max_val = 1 [default = false]; optional uint32 top_k = 2 [default = 1]; // The axis along which to maximise -- may be negative to index from the // end (e.g., -1 for the last axis). // By default ArgMaxnodes maximizes over the flattened trailing dimensions // for each index of the first / num dimension. optional int32 axis = 3; } message ConcatParameter { // The axis along which to concatenate -- may be negative to index from the // end (e.g., -1 for the last axis). Other axes must have the // same dimension for all the bottom blobs. // By default, Concatnodes concatenates blobs along the "channels" axis (1). 
optional int32 axis = 1 [default = 1]; optional ComputeEngine engine = 2 [default = XSMM]; optional AlgorithmType algotype = 3 [default = DIRECT]; optional DataType data_type = 4 [default = FLOAT]; } message ConvolutionParameter { optional uint32 num_output = 1; // The number of outputs for the nodes optional bool bias_term = 2 [default = true]; // whether to have bias terms // Pad, kernel size, and stride are all given as a single value for equal // dimensions in all spatial dimensions, or once per spatial dimension. repeated uint32 pad = 3; // The padding size; defaults to 0 repeated uint32 opad = 4; // The padding size; defaults to 0 repeated uint32 kernel_size = 5; // The kernel size optional uint32 group = 6 [default = 1]; // The group size for group conv repeated uint32 stride = 7; // The stride; defaults to 1 optional FillerParameter weight_filler = 8; // The filler for the weight optional FillerParameter bias_filler = 9; // The filler for the bias // Factor used to dilate the kernel, (implicitly) zero-filling the resulting // holes. (Kernel dilation is sometimes referred to by its use in the // algorithme à trous from Holschneider et al. 1987.) repeated uint32 dilation = 10; // The dilation; defaults to 1 // For 2D convolution only, the *_h and *_w versions may also be used to // specify both spatial dimensions. 
optional uint32 pad_h = 11 [default = 0]; // The padding height (2D only) optional uint32 pad_w = 12 [default = 0]; // The padding width (2D only) optional uint32 pad_d = 13 [default = 0]; // The padding width (2D only) optional uint32 kernel_h = 14; // The kernel height (2D only) optional uint32 kernel_w = 15; // The kernel width (2D only) optional uint32 kernel_d = 16; // The kernel width (2D only) optional uint32 stride_h = 17 [default = 1]; // The stride height (2D only) optional uint32 stride_w = 18 [default = 1]; // The stride width (2D only) optional uint32 stride_d = 19 [default = 1]; // The stride width (2D only) optional uint32 opad_h = 20 [default = 0]; // The padding height (2D only) optional uint32 opad_w = 21 [default = 0]; // The padding width (2D only) optional uint32 opad_d = 22 [default = 0]; // The padding width (2D only) optional uint32 ndims = 23 [default = 2]; optional ComputeEngine engine = 24 [default = XSMM]; optional AlgorithmType algotype = 25 [default = DIRECT]; optional bool fusedReLU = 26 [default = false]; optional bool bwd_relu = 27 [default = false]; optional bool compute_stats = 28 [default = false]; optional bool physical_padding = 29 [default = false]; optional DataType data_type = 30 [default = FLOAT]; } message DataParameter { enum DB { FLATFILE = 0; LEVELDB = 1; LMDB = 2; } // Specify the data source. optional string train_source = 1; optional string test_source = 15; // Specify the batch size. optional uint32 batch_size = 2; optional DB backend = 3 [default = FLATFILE]; // Specify the lookahead i.e., #minibatches to prefetch. 
optional uint32 lookahead = 4 [default = 2]; optional string root_folder = 5 [default = ""]; optional uint32 num_train_files = 6; optional uint32 num_test_files = 7; optional string train_data_info = 8; optional string test_data_info = 9; optional ImageTransformParameter image_xform_param = 10; // optional SpeechTransformParameter speech_xform_param = 11; // optional TextTransformParameter text_xform_param = 12; optional DataType data_type = 13 [default = FLOAT]; optional DataType label_data_type = 14 [default = INT]; optional bool split_db = 16 [default = false]; optional string train_list = 17; optional string test_list = 18; optional int32 numsplits = 19; optional bool shuffle = 20 [default = false]; optional ComputeEngine engine = 21 [default = XSMM]; } message DropoutParameter { optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio optional ComputeEngine engine = 2 [default = XSMM]; optional AlgorithmType algotype = 3 [default = DIRECT]; optional DataType data_type = 4 [default = FLOAT]; } message DummyDataParameter { repeated FillerParameter data_filler = 1; repeated TensorShape shape = 2; optional int32 pad_h = 3 [default = 0]; optional int32 pad_w = 4 [default = 0]; optional ComputeEngine engine = 5 [default = XSMM]; optional DataType data_type = 6 [default = FLOAT]; } message EltwiseParameter { enum EltwiseOp { ELSUM = 0; ELPROD = 1; ELMAX = 2; } optional EltwiseOp op = 1 [default = ELSUM]; // element-wise operation, default is SUM optional float coeff = 2; // blob-wise coefficient for SUM operation // Whether to use an asymptotically slower (for >2 inputs) but stabler method // of computing the gradient for the PROD operation. (No effect for SUM op.) 
optional bool stable_prod_grad = 3 [default = true]; optional ComputeEngine engine = 4 [default = XSMM]; optional AlgorithmType algotype = 5 [default = DIRECT]; optional DataType data_type = 6 [default = FLOAT]; } // Message that stores parameters used by Embednodes message EmbedParameter { optional uint32 num_output = 1; // The number of outputs for the nodes // The input is given as integers to be interpreted as one-hot // vector indices with dimension num_input. Hence num_input should be // 1 greater than the maximum possible input value. optional uint32 input_dim = 2; optional bool bias_term = 3 [default = true]; // Whether to use a bias term optional FillerParameter weight_filler = 4; // The filler for the weight optional FillerParameter bias_filler = 5; // The filler for the bias } message FullyConnectedParameter { optional uint32 num_output = 1; // The number of outputs for the node optional bool bias_term = 2 [default = true]; // whether to have bias terms optional FillerParameter weight_filler = 3; // The filler for the weight optional FillerParameter bias_filler = 4; // The filler for the bias // The first axis to be lumped into a single inner product computation; // all preceding axes are retained in the output. // May be negative to index from the end (e.g., -1 for the last axis). optional int32 axis = 5 [default = 1]; // Specify whether to transpose the weight matrix or not. // If transpose == true, any operations will be performed on the transpose // of the weight matrix. The weight matrix itself is not going to be transposed // but rather the transfer flag of operations will be toggled accordingly. 
optional bool transpose = 6 [default = false]; optional int32 num_timesteps = 7 [default = 0]; optional bool activation_term = 8 [default = false]; optional FillerParameter activation_filler = 9; // block output in timestep t=0 for RNN/LSTM optional ComputeEngine engine = 10 [default = XSMM]; optional AlgorithmType algotype = 11 [default = DIRECT]; optional DataType data_type = 12 [default = FLOAT]; } message FusedBNormParameter { // If false, accumulate global mean/variance values via a moving average. If // true, use those accumulated values instead of computing mean/variance // across the batch. optional bool use_global_stats = 1; // How much does the moving average decay each iteration? optional float mmf = 2 [default = .999]; // Small value to add to the variance estimate so that we don't divide by zero. optional float eps = 3 [default = 1e-7]; optional bool relu = 4 [default = true]; optional bool bwd_relu = 20 [default = true]; optional bool eltwise = 5 [default = false]; repeated uint32 pad = 6; // The padding size; defaults to 0 optional uint32 pad_h = 7 [default = 0]; // The padding height (2D only) optional uint32 pad_w = 8 [default = 0]; // The padding width (2D only) optional uint32 pad_d = 9 [default = 0]; // The padding width (2D only) repeated uint32 ipad = 10; // The padding size; defaults to 0 optional uint32 ipad_h = 11 [default = 0]; // The padding height (2D only) optional uint32 ipad_w = 12 [default = 0]; // The padding width (2D only) optional uint32 ipad_d = 13 [default = 0]; // The padding width (2D only) repeated uint32 stride = 14; // The stride; defaults to 1 optional uint32 stride_h = 15 [default = 1]; // The stride height (2D only) optional uint32 stride_w = 16 [default = 1]; // The stride width (2D only) optional uint32 stride_d = 17 [default = 1]; // The stride width (2D only) optional float lr_mult = 21 [default = 1.0]; optional float decay_mult = 22 [default = 0.0]; optional ComputeEngine engine = 18 [default = XSMM]; optional 
AlgorithmType algotype = 19 [default = DIRECT]; optional DataType data_type = 23 [default = FLOAT]; } message FusedConvBNParameter { optional uint32 num_output = 1; // The number of outputs for the nodes // Pad, kernel size, and stride are all given as a single value for equal // dimensions in all spatial dimensions, or once per spatial dimension. repeated uint32 ipad = 2; // The padding size; defaults to 0 repeated uint32 mpad = 3; // The padding size; defaults to 0 repeated uint32 opad = 4; // The padding size; defaults to 0 repeated uint32 kernel_size = 5; // The kernel size optional uint32 group = 6 [default = 1]; // The group size for group conv repeated uint32 c_stride = 7; // The stride; defaults to 1 repeated uint32 bn_stride = 8; // The stride; defaults to 1 optional FillerParameter weight_filler = 9; // The filler for the weight // Factor used to dilate the kernel, (implicitly) zero-filling the resulting // holes. (Kernel dilation is sometimes referred to by its use in the // algorithme à trous from Holschneider et al. 1987.) repeated uint32 dilation = 10; // The dilation; defaults to 1 // For 2D convolution only, the *_h and *_w versions may also be used to // specify both spatial dimensions. 
optional uint32 ipad_h = 11 [default = 0]; // The padding height (2D only) optional uint32 ipad_w = 12 [default = 0]; // The padding width (2D only) optional uint32 mpad_h = 13 [default = 0]; // The padding width (2D only) optional uint32 mpad_w = 14 [default = 0]; // The padding width (2D only) optional uint32 opad_h = 15 [default = 0]; // The padding width (2D only) optional uint32 opad_w = 16 [default = 0]; // The padding width (2D only) optional uint32 kernel_h = 17; // The kernel height (2D only) optional uint32 kernel_w = 18; // The kernel width (2D only) optional uint32 c_stride_h = 19 [default = 1]; // The stride height (2D only) optional uint32 c_stride_w = 20 [default = 1]; // The stride width (2D only) optional uint32 bn_stride_h = 21 [default = 1]; // The stride width (2D only) optional uint32 bn_stride_w = 22 [default = 1]; // The stride width (2D only) optional uint32 ndims = 23 [default = 2]; optional bool relu_fwd = 24 [default = true]; optional bool relu_bwd = 25 [default = true]; optional bool physical_padding = 26 [default = false]; // If false, accumulate global mean/variance values via a moving average. If // true, use those accumulated values instead of computing mean/variance // across the batch. optional bool use_global_stats = 27; // How much does the moving average decay each iteration? optional float mmf = 28 [default = .999]; // Small value to add to the variance estimate so that we don't divide by zero. optional float eps = 29 [default = 1e-7]; optional bool eltwise = 30 [default = false]; optional ComputeEngine engine = 31 [default = XSMM]; optional AlgorithmType algotype = 32 [default = DIRECT]; optional DataType data_type = 33 [default = FLOAT]; } // Message that stores parameters used to apply transformation // to the image node's data message ImageTransformParameter { // For image data pre-processing, we can do simple scaling and subtracting the // data mean, if provided. 
Note that the mean subtraction is always carried // out before scaling. // Single scale for baseline optional uint32 scale = 1 [default = 256]; // Multiple scales for scale-jittering repeated uint32 jitters = 2; // Specify if we want to randomly mirror data. optional bool mirror = 3 [default = false]; // Specify the base crop size for a(n) square image (repeated for height, width) repeated uint32 crop_size = 4; // Specify base crop height for an image (not necessarily square) optional uint32 crop_h = 5 [default = 227]; // Specify base crop width for an image (not necessarily square) optional uint32 crop_w = 6 [default = 227]; // Specify if we would like to randomly crop an image. optional bool crop_image = 7 [default = false]; // mean_file and mean_value cannot be specified at the same time optional string mean_file = 8; // if specified can be repeated once (would subtract it from all the channels) // or can be repeated the same number of times as channels // (would subtract them from the corresponding channel) repeated float mean_values = 9; // Force the decoded image to have 3 color channels. optional bool force_color = 10 [default = false]; // Force the decoded image to have 1 color channels. optional bool force_gray = 11 [default = false]; // Assume that input images have 3 channels (i.e., they are color images) by default. 
Set to 1 for grayscale optional uint32 channels = 12 [default = 3]; // Random Vignette augmentation optional bool vignette = 13 [default = false]; optional bool color_bump = 14 [default = false]; // Specify the base original size for a(n) square image (repeated for height, width) repeated uint32 orig_size = 15; // Specify base original height for an image (not necessarily square) optional uint32 orig_h = 16 [default = 256]; // Specify base original width for an image (not necessarily square) optional uint32 orig_w = 17 [default = 256]; // Specify the number of crops for testing optional uint32 test_views = 18 [default = 1]; //Specify standard deviation for normalizing convnet repeated float scale_values = 19; optional float scalejittering_min = 20 [default = 256]; optional float scalejittering_max = 21 [default = 512]; optional float min_percent_area = 22 [default = 0.08]; optional float max_percent_area = 23 [default = 1]; optional float min_aspect_ratio = 24 [default = 0.75]; optional float max_aspect_ratio = 25 [default = 1.3333]; optional int32 test_smaller_side = 26 [default = 256]; optional int32 pad_h = 27 [default = 0]; optional int32 pad_w = 28 [default = 0]; optional bool physical_padding = 29 [default = false]; } // Message that stores parameters used by Lognodes message LogParameter { // Lognodes computes outputs y = log_base(shift + scale * x), for base > 0. // Or if base is set to the default (-1), base is set to e, // so y = ln(shift + scale * x) = log_e(shift + scale * x) optional float base = 1 [default = -1.0]; optional float scale = 2 [default = 1.0]; optional float shift = 3 [default = 0.0]; } // Message that stores parameters shared by loss nodess message LossParameter { // If specified, ignore instances with the given label. optional int32 ignore_label = 1; // How to normalize the loss for loss nodess that aggregate across batches, // spatial dimensions, or other dimensions. Currently only implemented in // SoftmaxWithLoss nodes. 
enum NormalizationMode { // Divide by the number of examples in the batch times spatial dimensions. // Outputs that receive the ignore label will NOT be ignored in computing // the normalization factor. FULL = 0; // Divide by the total number of output locations that do not take the // ignore_label. If ignore_label is not set, this behaves like FULL. VALID = 1; // Divide by the batch size. BATCH_SIZE = 2; // Do not normalize the loss. NONE = 3; } optional NormalizationMode normalization = 3 [default = VALID]; // Deprecated. Ignored if normalization is specified. If normalization // is not specified, then setting this to false will be equivalent to // normalization = BATCH_SIZE to be consistent with previous behavior. optional bool normalize = 2; } message PoolingParameter { enum PoolMethod { MAX = 0; AVE = 1; STOCHASTIC = 2; } optional PoolMethod pool = 1 [default = MAX]; // The pooling method // Pad, kernel size, and stride are all given as a single value for equal // dimensions in height and width or as Y, X pairs. 
repeated uint32 kernel_size = 2; // The kernel size (square) optional uint32 kernel_h = 3; // The kernel height optional uint32 kernel_w = 4; // The kernel width optional uint32 kernel_d = 5; // The kernel depth (for 3D) repeated uint32 stride = 6; // The stride (equal in Y, X) optional uint32 stride_h = 7 [default = 1]; // The stride height optional uint32 stride_w = 8 [default = 1]; // The stride width optional uint32 stride_d = 9 [default = 1]; // The stride depth (for 3D) repeated uint32 pad = 10; // The padding size (equal in Y, X) optional uint32 pad_h = 13 [default = 0]; // The padding height optional uint32 pad_w = 14 [default = 0]; // The padding width optional uint32 pad_d = 15 [default = 0]; // The padding depth (for 3D) optional uint32 ndims = 17 [default = 2]; optional ComputeEngine engine = 11 [default = XSMM]; // If global_pooling then it will pool over the size of the bottom by doing // kernel_h = bottom->height and kernel_w = bottom->width optional bool global_pooling = 12 [default = false]; optional AlgorithmType algotype = 16 [default = DIRECT]; optional DataType data_type = 18 [default = FLOAT]; } // Message that stores parameters used by ReLUnodes message ReLUParameter { // Allow non-zero slope for negative inputs to speed up optimization // Described in: // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities // improve neural network acoustic models. In ICML Workshop on Deep Learning // for Audio, Speech, and Language Processing. optional float negative_slope = 1 [default = 0]; optional ComputeEngine engine = 2 [default = XSMM]; optional AlgorithmType algotype = 3 [default = DIRECT]; optional DataType data_type = 4 [default = FLOAT]; } // Message that stores parameters used by Softmaxnodes, SoftmaxWithLossnodes message SoftmaxParameter { optional ComputeEngine engine = 1 [default = XSMM]; // The axis along which to perform the softmax -- may be negative to index // from the end (e.g., -1 for the last axis). 
// Any other axes will be evaluated as independent softmaxes. optional int32 axis = 2 [default = 1]; optional AlgorithmType algotype = 3 [default = DIRECT]; optional DataType data_type = 4 [default = FLOAT]; } message SplitParameter { optional ComputeEngine engine = 1 [default = XSMM]; optional AlgorithmType algotype = 2 [default = DIRECT]; optional DataType data_type = 3 [default = FLOAT]; } message TanHParameter { optional ComputeEngine engine = 1 [default = XSMM]; optional AlgorithmType algotype = 2 [default = DIRECT]; } // Message that stores parameters used by Thresholdnodes message ThresholdParameter { optional float threshold = 1 [default = 0]; // Strictly positive values } message WindowDataParameter { // Specify the data source. optional string source = 1; // For data pre-processing, we can do simple scaling and subtracting the // data mean, if provided. Note that the mean subtraction is always carried // out before scaling. optional float scale = 2 [default = 1]; optional string mean_file = 3; // Specify the batch size. optional uint32 batch_size = 4; // Specify if we would like to randomly crop an image. optional uint32 crop_size = 5 [default = 0]; // Specify if we want to randomly mirror data. 
optional bool mirror = 6 [default = false]; // Foreground (object) overlap threshold optional float fg_threshold = 7 [default = 0.5]; // Background (non-object) overlap threshold optional float bg_threshold = 8 [default = 0.5]; // Fraction of batch that should be foreground objects optional float fg_fraction = 9 [default = 0.25]; // Amount of contextual padding to add around a window // (used only by the window_data_nodes) optional uint32 context_pad = 10 [default = 0]; // Mode for cropping out a detection window // warp: cropped window is warped to a fixed size and aspect ratio // square: the tightest square around the window is cropped optional string crop_mode = 11 [default = "warp"]; // cache_images: will load all images in memory for faster access optional bool cache_images = 12 [default = false]; // append root_folder to locate images optional string root_folder = 13 [default = ""]; } message SPPParameter { enum PoolMethod { MAX = 0; AVE = 1; STOCHASTIC = 2; } optional uint32 pyramid_height = 1; optional PoolMethod pool = 2 [default = MAX]; // The pooling method optional ComputeEngine engine = 3 [default = XSMM]; optional AlgorithmType algotype = 4 [default = DIRECT]; } message PReLUParameter { // Parametric ReLU described in K. He et al, Delving Deep into Rectifiers: // Surpassing Human-Level Performance on ImageNet Classification, 2015. // Initial value of a_i. Default is a_i=0.25 for all i. optional FillerParameter filler = 1; // Whether or not slope paramters are shared across channels. 
optional bool channel_shared = 2 [default = false]; } libxsmm-1.17/samples/deeplearning/gxm/setup_env.sh000077500000000000000000000017261415223013700223640ustar00rootroot00000000000000#!/usr/bin/env bash LIB_PATH=/swtools/caffe_deps/lib export LD_LIBRARY_PATH=${LIB_PATH}:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=`pwd`/build/lib:$LD_LIBRARY_PATH export BIN_PATH=/swtools/caffe_deps/bin export PATH=${BIN_PATH}:$PATH source /swtools/intel/compilers_and_libraries_2019.4.243/linux/bin/compilervars.sh intel64 #source /swtools/intel/compilers_and_libraries_2019.3.199/linux/mpi/intel64/bin/mpivars.sh source /swtools/intel/compilers_and_libraries_2019.4.243/linux/tbb/bin/tbbvars.sh intel64 source /swtools/intel/impi/2017.3.196/bin64/mpivars.sh export MLSL_ROOT=/nfs_home/savancha/MLSL/_install if [ -z "${I_MPI_ROOT}" ] then export I_MPI_ROOT="${MLSL_ROOT}" fi if [ -z "${PATH}" ] then export PATH="${MLSL_ROOT}/intel64/bin" else export PATH="${MLSL_ROOT}/intel64/bin:${PATH}" fi if [ -z "${LD_LIBRARY_PATH}" ] then export LD_LIBRARY_PATH="${MLSL_ROOT}/intel64/lib" else export LD_LIBRARY_PATH="${MLSL_ROOT}/intel64/lib:${LD_LIBRARY_PATH}" fi libxsmm-1.17/samples/deeplearning/gxm/src/000077500000000000000000000000001415223013700205765ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/gxm/src/Accuracy.cpp000066400000000000000000000104431415223013700230360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #include #include "Accuracy.hpp" #ifdef USE_MLSL #include "mpi.h" #endif AccuracyNode::AccuracyNode(AccuracyParams* p, MLEngine* e) : NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); top_ = p->get_top_names(); bottom_ = p->get_bottom_names(); has_weights_ = false; bp_flag_ = p->get_bprop_flag(); tenBot_.resize(bottom_.size()); tenBotData_.resize(bottom_.size()); for(int i=0; iget_tensor(bottom_[i], LABEL); else tenBot_[i] = e->get_tensor(bottom_[i], ACT); assert(tenBot_[i] != NULL); setPrevNode((NNNode*)tenBot_[i]->getOwner()); tenBotData_[i] = tenBot_[i]->getBuf(DATA); } // Get input tensor shape (bottom) Shape* bs = tenBot_[0]->getShape(); assert(bs->ndims <= MAX_DIMS); shape_setzero(&ts_); ts_.ndims = 2; ts_.dims[0] = bs->dims[0]; // minibatch ts_.dims[1] = bs->dims[1]; // num output = num_input top_k_ = p->get_top_k(); max_val.resize(top_k_ + 1); max_id.resize(top_k_ + 1); eptr_ = e; train_batch_count_ = 0; test_batch_count_ = 0; avg_train_acc_ = 0; avg_test_acc_ = 0; } void AccuracyNode::forwardPropagate() { #ifdef RETURNALL return; #endif float* bot = (float*)(tenBotData_[0]->getBuffer()); int* label = (int*)(tenBotData_[1]->getBuffer()); #ifdef DEBUG printf("Executing FP %s: input %p, label %p\n",NNNode::nname_.c_str(), bot, label); #endif int accuracy = 0; int count = 0; for(int img=0; img= prob_true_class; if(num_better_predictions < top_k_) accuracy++; count++; } #ifdef USE_MLSL size_t num_nodes = MLSL::Environment::GetEnv().GetProcessCount(); size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else size_t num_nodes = 1; size_t node_id = 0; #endif if(eptr_->get_execution_mode() == TRAIN) { avg_train_acc_ += (double)accuracy/(double)count; train_batch_count_++; if(train_batch_count_ == eptr_->get_num_train_batches()) { avg_train_acc_ = avg_train_acc_/(double)train_batch_count_; #ifdef USE_MLSL MPI_Allreduce(MPI_IN_PLACE, 
&avg_train_acc_, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); avg_train_acc_ = avg_train_acc_/num_nodes; if(node_id == 0) printf("Top-%d Minibatch training accuracy = %f\n", top_k_, avg_train_acc_); #else printf("Top-%d Minibatch training accuracy = %f\n", top_k_, avg_train_acc_); #endif train_batch_count_ = 0; avg_train_acc_ = 0; } } else if(eptr_->get_execution_mode() == TEST || eptr_->get_execution_mode() == VAL) { avg_test_acc_ += (double)accuracy/(double)count; test_batch_count_++; if(test_batch_count_ == eptr_->get_num_test_batches()*eptr_->get_num_test_views()) { avg_test_acc_ = avg_test_acc_/(double)test_batch_count_; #ifdef USE_MLSL MPI_Allreduce(MPI_IN_PLACE, &avg_test_acc_, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); avg_test_acc_ = avg_test_acc_/num_nodes; if(node_id == 0) printf("Top-%d Minibatch testing accuracy = %f\n", top_k_, avg_test_acc_); #else printf("Top-%d Minibatch testing accuracy = %f\n", top_k_, avg_test_acc_); #endif test_batch_count_ = 0; avg_test_acc_ = 0; } } } libxsmm-1.17/samples/deeplearning/gxm/src/Concat.cpp000066400000000000000000000162711415223013700225200ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #include #include "Concat.hpp" using namespace std; using namespace gxm; ConcatNode::ConcatNode(ConcatParams* p, MLEngine* e) : NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); bottom_ = p->get_bottom_names(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = false; bot_compute_engine_ = p->get_compute_engine(); assert(top_.size() == 1); tenTop_ = new Tensor(top_[0]); assert(tenTop_ != NULL); tenTop_->setOwner(this); tenTop_->setType(ACT); tenTopData_ = tenTop_->getBuf(DATA); tenTopData_->setBufferType(DATA); #ifdef DEBUG printf("bottom name %s\n",bottom_[0].c_str()); #endif Shape ts; shape_setzero(&ts); tenBot_.resize(bottom_.size()); bot_cengine_.resize(bottom_.size()); tenBotData_.resize(bottom_.size()); for(int i=0; iget_tensor(bottom_[i], ACT); assert(tenBot_[i] != NULL); NNNode *pnn = (NNNode*)tenBot_[i]->getOwner(); setPrevNode(pnn); pnn->set_top_compute_engine(p->get_compute_engine()); bot_cengine_[i] = pnn->get_bot_compute_engine(); tenBotData_[i] = tenBot_[i]->getBuf(DATA); } // number of concats gparams_.nInput.resize(bottom_.size()); tenBotDiff_.resize(bottom_.size()); int dtype = p->get_data_type(); for(int i=0; igetShape(); assert(bs->ndims <= MAX_DIMS); ts.dims[1] += bs->dims[1]; gparams_.nInput[i] = bs->dims[1]; if(!e->is_inference_only()) { if(NNNode::bp_flag_) { tenBotDiff_[i] = tenBot_[i]->addBuf(); // DIFF type and index tenBotDiff_[i]->setDataType(dtype); tenBotDiff_[i]->setBufferType(DIFF); long long int bsize = 1; for(int s=0; sndims; s++) bsize = bsize*bs->dims[s]; if(dtype == DT_FLOAT) bsize = bsize*sizeof(float); else if(dtype == DT_INT16) bsize = bsize*sizeof(short int); else if(dtype == DT_INT) bsize = bsize*sizeof(int); // Set the size of the input-gradient buffer tenBotDiff_[i]->setBufferSize(bsize); } } else tenBotDiff_[i] = NULL; } //Output tensor data type = input tensor data type 
tenTopData_->setDataType(dtype); Shape *bs = tenBot_[0]->getShape(); ts.ndims = bs->ndims; ts.dims[0] = bs->dims[0]; ts.dims[2] = bs->dims[2]; ts.dims[3] = bs->dims[3]; tenTop_->setShape(&ts); long long int tsize = 1; for(int s=0; ssetBufferSize(tsize); // Register output tensor in tensor map bool inserted = e->register_tensor(NNNode::top_[0], ACT, tenTop_); if(!inserted) printf("Warning: Tensor %s already registered\n",NNNode::top_[0].c_str()); gparams_.bdims = bs->ndims; gparams_.tdims = ts.ndims; gparams_.batch_size = ts.dims[0]; gparams_.nOutput = ts.dims[1]; gparams_.iHeight = bs->dims[2]; gparams_.iWidth = bs->dims[3]; gparams_.oHeight = ts.dims[2]; gparams_.oWidth = ts.dims[3]; gparams_.data_type = dtype; gparams_.algType = p->get_algo_type(); gparams_.num_threads = e->get_num_threads(); #ifdef GETSTATS count_ = 0; #endif configure(p->get_compute_engine()); } void ConcatNode::configure(int engine) { switch(engine) { case XSMM: impl = new ConcatXSMM(&gparams_, engine); break; } } void ConcatNode::forwardPropagate() { #ifdef DEBUG float* bot; float* top = (float*)(tenTopData_->getBuffer()); for(int i=0; igetBuffer()); printf("Executing FP %s: input %p, output %p\n",NNNode::nname_.c_str(), bot, top); } #endif for(int i=0; iset_bot_compute_engine(bot_cengine_[i]); impl->set_top_compute_engine(top_compute_engine_); impl->set_next_node_type(next_ntype_); impl->set_node_name(nname_); impl->forwardPropagate(tenBotData_, tenTopData_); #ifdef CHECK_BLOWUP_FP32 float* ptr = (float*)tenTopData_->getBuffer(); for(int i=0; i<16; i++) { if(isnan(ptr[i]) || isinf(ptr[i])) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } #endif #ifdef GETSTATS #ifdef USE_MLSL size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else size_t node_id = 0; #endif if(node_id==0 && count_ % STATFREQ == 0) { float* p, *pp, *ptr; int size; for(int i=0; igetBuffer(); pp = (float*)tenBotData_[i]->getPrivBuffer(); ptr = (pp == NULL) ? 
p : pp; size = tenBotData_[i]->getBufferSize()/sizeof(float); MeanOfLayer((char*)bottom_[i].c_str(), ptr, size); } p = (float*)tenTopData_->getBuffer(); pp = (float*)tenTopData_->getPrivBuffer(); ptr = (pp == NULL) ? p : pp; size = tenTopData_->getBufferSize()/sizeof(float); MeanOfLayer((char*)top_[0].c_str(), ptr, size); } #endif } void ConcatNode::backPropagate() { tenTopDiff_ = tenTop_->getBuf(DIFF); #ifdef DEBUG for(int i=0; igetBuffer(), tenBotDiff_[i]->getBuffer()); #endif impl->backPropagate(tenTopDiff_, tenBotDiff_); #ifdef CHECK_BLOWUP_FP32 float* ptr = (float*)tenTopDiff_->getBuffer(); for(int i=0; i<16; i++) { if(isnan(ptr[i]) || isinf(ptr[i])) { printf("Warning! %s layer BP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } #endif #ifdef GETSTATS #ifdef USE_MLSL size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else size_t node_id = 0; #endif if(node_id==0 && count_ % STATFREQ == 0) { float* p, *pp, *ptr; p = (float*)tenTopDiff_->getBuffer(); pp = (float*)tenTopDiff_->getPrivBuffer(); ptr = (pp == NULL) ? p : pp; int size = tenTopDiff_->getBufferSize()/sizeof(float); MeanOfLayer((char*)top_[0].c_str(), ptr, size); for(int i=0; igetBuffer(); pp = (float*)tenBotDiff_[i]->getPrivBuffer(); ptr = (pp == NULL) ? p : pp; size = tenBotDiff_[i]->getBufferSize()/sizeof(float); MeanOfLayer((char*)bottom_[i].c_str(), ptr, size); } count_++; } #endif } libxsmm-1.17/samples/deeplearning/gxm/src/ConcatXSMM.cpp000066400000000000000000000066521415223013700232270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #include #include #include "ConcatXSMM.hpp" #define VLEN 16 void ConcatXSMM::forwardPropagate(vector& inpb, TensorBuf *outpb, int tid) { float *outp = (float*)outpb->getBuffer(); int nImg = gp->batch_size; int nOfm = gp->nOutput; int nBOfm = gp->nOutput/VLEN; int rem = 0; int ifh = gp->iHeight; int ifw = gp->iWidth; int ofh = gp->oHeight; int ofw = gp->oWidth; bool needs_conversion = false; int threads = gp->num_threads; __assume_aligned(outp, 64); float (* __restrict output)[nBOfm][ofh][ofw][VLEN] = (float (*)[*][*][*][VLEN])outp; #ifdef _OPENMP #pragma omp parallel for #endif for(int img=0; img < nImg; img++) { int ofm = 0; for(int b=0; bnInput[b]/VLEN; float *inp __attribute__((aligned(64))); inp = (float*)inpb[b]->getBuffer(); float (* __restrict input )[nBIfm][ifh][ifw][VLEN] = (float (*)[*][*][*][VLEN])inp; for(int ifm=0; ifm < nBIfm; ifm++) { for(int h=0; h < ifh; h++) { for(int w=0; w < ifw; w++) { #pragma omp simd #pragma vector aligned #pragma vector nontemporal for(int v=0; v < VLEN; v++) { output[img][ofm][h][w][v] = input[img][ifm][h][w][v]; } } } ofm++; } } } outpb->setLayoutType(LIBXSMM_CUSTOM_LAYOUT); } void ConcatXSMM::backPropagate(TensorBuf *deloutpb, vector& delinpb, int tid) { float *deloutp = (float*)deloutpb->getBuffer(); int nImg = gp->batch_size; int nOfm = gp->nOutput; int nBOfm = gp->nOutput/VLEN; int ifh = gp->iHeight; int ifw = gp->iWidth; int ofh = gp->oHeight; int ofw = gp->oWidth; int rem = 0; int threads = gp->num_threads; __assume_aligned(deloutp, 64); float (* __restrict del_output)[nBOfm][ofh][ofw][VLEN] = (float (*)[*][*][*][VLEN])deloutp; #ifdef _OPENMP #pragma omp parallel for #endif for(int img=0; img < nImg; img++) { 
int ofm = 0; for(int b=0; bnInput[b]/VLEN; float *delinp __attribute__((aligned(64))); delinp = (float*)delinpb[b]->getBuffer(); float (* __restrict del_input)[nBIfm][ifh][ifw][VLEN] = (float (*)[*][*][*][VLEN])delinp; for(int ifm=0; ifm < nBIfm; ifm++) { for(int h=0; h < ifh; h++) { for(int w=0; w < ifw; w++) { #pragma omp simd #pragma vector aligned #pragma vector nontemporal for(int v=0; v < VLEN; v++) { del_input[img][ifm][h][w][v] = del_output[img][ofm][h][w][v]; } } } ofm++; } } } for(int b=0; bsetLayoutType(LIBXSMM_CUSTOM_LAYOUT); } libxsmm-1.17/samples/deeplearning/gxm/src/Conv.cpp000066400000000000000000000777741415223013700222350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include #include "Conv.hpp" #include "fillers.hpp" #ifdef USE_MLSL #include "mpi.h" #endif using namespace std; using namespace gxm; ConvNode::ConvNode(ConvParams* p, MLEngine* e): NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); bottom_ = p->get_bottom_names(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = true; compute_stats_ = p->get_compute_stats(); bot_compute_engine_ = p->get_compute_engine(); assert((bottom_.size() == 1) && (top_.size() == 1)); bool bias_term = p->get_bias_term(); tenTop_ = new Tensor(top_[0]); assert(tenTop_ != NULL); tenTop_->setOwner(this); tenTop_->setType(ACT); tenTopData_ = tenTop_->getBuf(DATA); tenTopData_->setBufferType(DATA); #ifndef NDEBUG printf("bottom name %s\n",bottom_[0].c_str()); #endif if(bottom_[0] == "data") tenBot_ = e->get_tensor(bottom_[0], INPUT); else tenBot_ = e->get_tensor(bottom_[0], ACT); assert(tenBot_ != NULL); NNNode *pnn = (NNNode*)tenBot_->getOwner(); setPrevNode(pnn); mode_ = pnn->getMode(); pnn->set_top_compute_engine(p->get_compute_engine()); bot_cengine_ = pnn->get_bot_compute_engine(); tenBotData_ = tenBot_->getBuf(DATA); out_dtype = p->get_data_type(); in_dtype = tenBotData_->getDataType(); tenTopData_->setDataType(out_dtype); // Get input tensor shape (bottom) Shape* bs = tenBot_->getShape(); assert(bs->ndims <= MAX_DIMS); // Create shape of output tensor (top) vector vd = p->get_kernel_dims(); vector ovp = p->get_output_pads(); vector vp = p->get_pads(); vector vs = p->get_strides(); assert((vd.size() == vp.size()) && (vd.size() == vs.size()) && (vs.size() == ovp.size())); shape_setzero(&ts_); ts_.ndims = bs->ndims; // Number of dimensions ts_.dims[0] = bs->dims[0]; // Minibatch size ts_.dims[1] = p->get_output(); // Num output feature maps ts_.dims[2] = (bs->dims[2] - vd[0] + 2*vp[0])/vs[0] + 1; // Height ts_.dims[3] = (bs->dims[3] - vd[1] + 2*vp[1])/vs[1] + 1; // 
Width tenTop_->setShape(&ts_); long long int tsize; int telem = ts_.dims[0] * ts_.dims[1] * (ts_.dims[2] + 2*ovp[0]) * (ts_.dims[3] + 2*ovp[1]); // Buffer space for sum and sum^2 int tstats=0; if(compute_stats_) tstats = 2*ts_.dims[0]*ts_.dims[1]; if(out_dtype == DT_FLOAT) tsize = telem*sizeof(float) + tstats*sizeof(float); else if(out_dtype == DT_BF16) tsize = telem*sizeof(libxsmm_bfloat16) + tstats*sizeof(float); tenTopData_->setBufferSize(tsize); // Create FP weight tensor weight_ = top_[0] + "_wt"; tenWeight_ = new Tensor(weight_); assert(tenWeight_ != NULL); tenWeight_->setOwner(this); tenWeight_->setType(CONVWEIGHT); shape_setzero(&ws_); ws_.ndims = ts_.ndims; // Number of dimesions ws_.dims[0] = ts_.dims[1]; // Num output feature maps (from top tensor) ws_.dims[1] = bs->dims[1]; // Num input feature maps (from bottom tensor) ws_.dims[2] = vd[0]; // Kernel height if(ts_.ndims == 4) { ws_.dims[3] = vd[1]; // Kernel width } else if(ts_.ndims == 5) { ws_.dims[3] = vd[1]; ws_.dims[4] = vd[2]; } tenWeight_->setShape(&ws_); tenWeight_->setBufDataType(DATA, DT_FLOAT); tenWeightData_ = tenWeight_->getBuf(DATA); tenWeightData_->setBufferType(DATA); int welem = 1; long long int wsize; for(int i=0; isetBufferSize(wsize); wfiller_type_ = p->get_weight_filler_type(); variance_norm_ = p->get_variance_norm(); std_ = p->get_std(); lr_mult_ = p->get_lr_mult(); decay_mult_ = p->get_decay_mult(); // Create bias tensor long long int bisize; Shape bis; { if(bias_term) { bias_ = top_[0] + "_bias"; tenBias_ = new Tensor(bias_); assert(tenBias_ != NULL); tenBias_->setOwner(this); tenBias_->setType(CONVBIAS); shape_setzero(&bis); bis.ndims = 1; bis.dims[0] = ts_.dims[1]; tenBias_->setShape(&bis); tenBiasData_ = tenBias_->getBuf(DATA); tenBiasData_->setDataType(DT_FLOAT); tenBiasData_->setBufferType(DATA); bisize = bis.dims[0]; bisize = bisize*sizeof(float); // Biases are always in FP32 tenBiasData_->setBufferSize(bisize); bfiller_type_ = p->get_bias_filler_type(); value_ = 
p->get_value(); } } if(!e->is_inference_only()) { if(bp_flag_) { tenBotDiff_ = tenBot_->addBuf(); // DIFF type and index tenBotDiff_->setDataType(in_dtype); tenBotDiff_->setBufferType(DIFF); long long int bsize = bs->dims[0] * bs->dims[1] * (bs->dims[2] + 2*vp[0]) * (bs->dims[3] + 2*vp[1]); if((in_dtype == DT_FLOAT && out_dtype == DT_FLOAT) || (in_dtype == DT_BF16 && out_dtype == DT_FLOAT)) bsize = bsize*sizeof(float); else if(in_dtype == DT_BF16 && out_dtype == DT_BF16) bsize = bsize*sizeof(libxsmm_bfloat16); // Set the size of the input-gradient buffer tenBotDiff_->setBufferSize(bsize); } if(has_weights_) { tenWeightDiff_ = tenWeight_->addBuf(); // DIFF type and index tenWeightDiff_->setBufferType(DIFF); tenWeightInc_ = tenWeight_->addBuf(); // SHARED type and index tenWeightInc_->setBufferType(HISTORY); tenWeightInc_->setDataType(DT_FLOAT); tenWeightInc_->setBufferSize(welem*sizeof(float)); if(in_dtype == DT_FLOAT) { tenWeightDiff_->setDataType(DT_FLOAT); tenWeightDiff_->setBufferSize(welem*sizeof(float)); } else if(in_dtype == DT_BF16) { tenWeightDiff_->setDataType(DT_BF16); #ifdef BF16_MLSL tenWeightDiff_->setBufferSize(welem*sizeof(libxsmm_bfloat16)); #else tenWeightDiff_->setBufferSize(welem*sizeof(float)); #endif } if(bias_term) { tenBiasDiff_ = tenBias_->addBuf(); // DIFF type and index tenBiasDiff_->setDataType(DT_FLOAT); tenBiasDiff_->setBufferType(DIFF); tenBiasInc_ = tenBias_->addBuf(); // SHARED type and index tenBiasInc_->setDataType(DT_FLOAT); tenBiasInc_->setBufferType(HISTORY); // Set the size of the weight-gradient buffer and the weight-increment buffer tenBiasDiff_->setBufferSize(bisize); tenBiasInc_->setBufferSize(bisize); } } } else { tenBotDiff_ = NULL; tenWeightDiff_ = NULL; tenWeightInc_ = NULL; tenBiasDiff_ = NULL; tenBiasInc_ = NULL; } // Register output tensor in tensor map bool inserted = e->register_tensor(top_[0], ACT, tenTop_); if(!inserted) printf("Warning: Tensor %s already registered\n",top_[0].c_str()); // Register weight tensor 
in weight tensor map inserted = e->register_tensor(weight_, CONVWEIGHT, tenWeight_); if(!inserted) printf("Warning: Tensor %s already registered\n",weight_.c_str()); // Register bias tensor in bias tensor map if(bias_term) { inserted = e->register_tensor(bias_, CONVBIAS, tenBias_); if(!inserted) printf("Warning: Tensor %s already registered\n",bias_.c_str()); } // Setup parameter structure for convolution computation in library gparams_.bdims = bs->ndims; gparams_.tdims = ts_.ndims; gparams_.wdims = ws_.ndims; gparams_.bidims = bis.ndims; gparams_.node_name = nname_; gparams_.nInput = bs->dims[1]; gparams_.nOutput = ts_.dims[1]; gparams_.batch_size = bs->dims[0]; gparams_.iHeight = bs->dims[2]; gparams_.iWidth = bs->dims[3]; gparams_.oHeight = ts_.dims[2]; gparams_.oWidth = ts_.dims[3]; gparams_.pad_h = vp[0]; gparams_.pad_w = vp[1]; gparams_.physical_padding = p->get_physical_padding(); gparams_.compute_stats = compute_stats_; if(gparams_.physical_padding) { gparams_.ipad_h = vp[0]; gparams_.ipad_w = vp[1]; } else { gparams_.ipad_h = 0; gparams_.ipad_w = 0; } if(gparams_.physical_padding) { gparams_.opad_h = ovp[0]; gparams_.opad_w = ovp[1]; } else { gparams_.opad_h = 0; gparams_.opad_w = 0; } gparams_.group = p->get_group(); gparams_.stride_h = vs[0]; gparams_.stride_w = vs[1]; gparams_.kh = ws_.dims[2]; gparams_.kw = ws_.dims[3]; gparams_.bias_term = bias_term; gparams_.relu = p->get_fused_relu(); gparams_.bwd_relu = p->get_bwd_relu(); gparams_.in_data_type = in_dtype; gparams_.out_data_type = out_dtype; gparams_.algType = p->get_algo_type(); gparams_.num_threads = e->get_num_threads(); // get solver solver_ = e->getSolver(); //get global scratch tensor buffer tenScratchData_ = e->getScratchBuffer(); // get engine eptr_ = e; #ifdef USE_MLSL MLSL::DataType dt = MLSL::DT_FLOAT; MLSL::OperationRegInfo *myRegInfo; MLSL::Session *s = eptr_->get_session(); myRegInfo = s->CreateOperationRegInfo(MLSL::OT_CC); myRegInfo->SetName(nname_.c_str()); 
myRegInfo->AddParameterSet(gparams_.nInput*gparams_.nOutput/gparams_.group, gparams_.kw*gparams_.kh, dt, false); if(bias_term) myRegInfo->AddParameterSet(gparams_.nOutput, 1, dt, false); myRegInfo->Validate(); size_t opIdx = s->AddOperation(myRegInfo, e->get_distribution()); this->op_ = s->GetOperation(opIdx); s->DeleteOperationRegInfo(myRegInfo); e->get_wtgrad_comms_vec().push_back(op_); #endif configure(p->get_compute_engine()); } void ConvNode::fillWeightBuffers(TensorBuf* tBuf, int buftype, long long int size) { void *ptr = tBuf->getBuffer(); #ifdef USE_MLSL unsigned int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else unsigned int node_id = 0; #endif int ic = gparams_.nInput; int oc = gparams_.nOutput; int kh = gparams_.kh; int kw = gparams_.kw; int g = gparams_.group; int fanin = (ic * kh * kw)/g; int fanout = (oc * kh * kw)/g; int welem = ic * oc * kh * kw; if(buftype == DATA) { if(node_id == 0) initBuffer(ptr, variance_norm_, fanin, fanout, welem*sizeof(float), wfiller_type_, std_); #ifdef USE_MLSL MPI_Bcast(ptr, welem, MPI_FLOAT, 0, MPI_COMM_WORLD); #endif } else if(buftype == HISTORY || buftype == DIFF) memset(ptr, 0, size); } void ConvNode::fillWeightMultipliers(float* lr, float* decay, long long int size) { for(int i=0; i < size; i++) { lr[i] = lr_mult_[0]; decay[i] = decay_mult_[0]; } } void ConvNode::fillBiasBuffers(TensorBuf* tBuf, int buftype, long long int size) { void *ptr = tBuf->getBuffer(); if(buftype == DATA) { initConstantBuffer(ptr, size, "CONSTANT", value_); } else memset(ptr, 0, size); } void ConvNode::fillBiasMultipliers(float* lr, float* decay, long long int size) { if(gparams_.bias_term) { for(int i=0; i < size; i++) { lr[i] = lr_mult_[1]; decay[i] = decay_mult_[1]; } } } void ConvNode::Checkpoint(TensorBuf *tBuf, string name, string format) { long long int bytes = tBuf->getBufferSize(); int dtype = tBuf->getDataType(); int buftype = tBuf->getBufferType(); FILE* f; void* ptr; size_t pos; if((name.find("30") == name.npos) && 
(name.find("60") == name.npos) && (name.find("80") == name.npos)) while((pos = name.find("/", 10)) != name.npos) name.replace(pos, 1, 1, '_'); float* p = (float*)tBuf->getBuffer(); bool no_checkpt = false; for(int i=0; i<16; i++) { if(isnan(p[i]) || isinf(p[i])) { no_checkpt = true; printf("Warning! %s Did not checkpoint! Weights are NaNs or Inf\n", nname_.c_str()); break; } } if(!no_checkpt) { if(format == "binary") { f = fopen(name.c_str(), "wb"); if(f != NULL) { #if 0 if(name.find("wt") != name.npos) { ptr = _mm_malloc(bytes, 64); assert(ptr != NULL); impl->dumpBuffer(tBuf, ptr); } else #endif ptr = tBuf->getBuffer(); size_t b = fwrite(ptr, 1, bytes, f); assert((long long int)b == bytes); #if 0 if(name.find("wt") != name.npos) _mm_free(ptr); #endif } else printf("Warning: could not checkpoint to file %s\n",name.c_str()); } else { f = fopen(name.c_str(), "w"); if(f != NULL) { #if 0 if(name.find("wt") != name.npos) { ptr = _mm_malloc(bytes, 64); assert(ptr != NULL); impl->dumpBuffer(tBuf, ptr); } else #endif ptr = tBuf->getBuffer(); for(int i=0; i 0) { for(int i=ntps*jobs; iset_top_compute_engine(top_compute_engine_); impl->set_bot_compute_engine(bot_cengine_); impl->set_node_name(nname_); impl->set_scratch_buffer(tenScratchData_); long long int size = nImg * ofm * ofhp * ofwp; if(first_fp) { if(tenTopData_->getDataType() == DT_FLOAT) { float* ptr = (float*)tenTopData_->getBuffer(); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetDataType() == DT_BF16) { libxsmm_bfloat16* ptr = (libxsmm_bfloat16*)tenTopData_->getBuffer(); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetDataType() == DT_FLOAT) { float* ptr = (float*)tenTopData_->getBuffer(); if(compute_stats_) { float* sptr = ptr + size; /* @TODO move this into Batch Norm/LIBXSMM */ #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; i<2*nImg*ofm; i++) sptr[i] = 0; } } else if(tenTopData_->getDataType() == DT_BF16) { libxsmm_bfloat16* ptr = 
(libxsmm_bfloat16*)tenTopData_->getBuffer(); if(compute_stats_) { libxsmm_bfloat16* sptr = ptr + size; /* @TODO move this into Batch Norm/LIBXSMM */ #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; i<2*nImg*ofm; i++) sptr[i] = 0; } } impl->forwardPropagate(tenBotData_, tenWeightData_, tenWeightInc_, tenBiasData_, tenTopData_); #ifdef CHECK_BLOWUP_FP32 if(out_dtype == DT_FLOAT) { for(int i=0; i<16; i++) { float v = ((float*)tenTopData_->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(out_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)tenTopData_->getBuffer(), cbptr, 10240); for(int i=0; i<10240; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } #endif #ifdef GETSTATS #ifdef USE_MLSL unsigned int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); if(node_id == 0) #endif { if(out_dtype == DT_FLOAT) { float *ptr, *pptr, *p; if(eptr_->get_current_batch() % STATFREQ == 0) { string s = nname_ + "_Inp"; ptr = (float*)tenBotData_->getBuffer(); pptr = (float*)tenBotData_->getPrivBuffer(); p = (pptr == NULL) ? ptr : pptr; MeanOfLayer((char*)s.c_str(), p, nImg*ifm*ifhp*ifwp); s = nname_ + "_Wt"; ptr = (float*)tenWeightData_->getBuffer(); pptr = (float*)tenWeightData_->getPrivBuffer(); p = (pptr == NULL) ? ptr : pptr; MeanOfLayer((char*)s.c_str(), p, ifm*ofm*kh*kw); if(gparams_.bias_term) { s = nname_ + "_Bias"; p = (float*)tenBiasData_->getBuffer(); MeanOfLayer((char*)s.c_str(), p, ofm); } s = nname_ + "_Outp"; ptr = (float*)tenTopData_->getBuffer(); pptr = (float*)tenTopData_->getPrivBuffer(); p = (pptr == NULL) ? 
ptr : pptr; MeanOfLayer((char*)s.c_str(), p, nImg*ofm*ofhp*ofwp); if(compute_stats_) { s = nname_ + "_sump"; int offset = nImg*ofm*ofhp*ofwp*sizeof(float); void* m = (void*)p + offset; MeanOfLayer((char*)s.c_str(), (double*)m, nImg*ofm); s = nname_ + "_sum2p"; void* m2 = (void*)m + nImg*ofm*sizeof(double); MeanOfLayer((char*)s.c_str(), (double*)m2, nImg*ofm); } } } else if(out_dtype == DT_BF16) { if(stptr == NULL) { int os = nImg*ofm*ofhp*ofwp; int is = nImg*ifm*ifhp*ifwp; int ws = ifm*ofm*kh*kw; int m = os < is ? is : os; int msize = m < ws ? ws : m; stptr = (float*)libxsmm_aligned_malloc(msize*sizeof(float), 2097152); } { string s = nname_ + "_Inp"; libxsmm_bfloat16 *ptr; if(tenBotData_->getLPBuffer() != NULL) ptr = (libxsmm_bfloat16*)tenBotData_->getLPBuffer(); else ptr = (libxsmm_bfloat16*)tenBotData_->getBuffer(); convert_bf16_f32(ptr, stptr, nImg*ifm*ifhp*ifwp); MeanOfLayer((char*)s.c_str(), stptr, nImg*ifm*ifhp*ifwp); s = nname_ + "_Wt"; float *fptr = (float*)tenWeightData_->getBuffer(); int w = ifm*ofm*kh*kw; MeanOfLayer((char*)s.c_str(), fptr, w); if(gparams_.bias_term) { s = nname_ + "_Bias"; float *p = (float*)tenBiasData_->getBuffer(); MeanOfLayer((char*)s.c_str(), p, ofm); } s = nname_ + "_Outp"; ptr = (libxsmm_bfloat16*)tenTopData_->getBuffer(); memset(stptr, 0, nImg*ofm*ofhp*ofwp); convert_bf16_f32(ptr, stptr, nImg*ofm*ofhp*ofwp); MeanOfLayer((char*)s.c_str(), stptr, nImg*ofm*ofhp*ofwp); if(compute_stats_) { s = nname_ + "_sump"; int offset = nImg*ofm*ofhp*ofwp*sizeof(float); void* m = (void*)ptr + offset; MeanOfLayer((char*)s.c_str(), (float*)m, nImg*ofm); s = nname_ + "_sum2p"; void* m2 = (void*)m + nImg*ofm*sizeof(float); MeanOfLayer((char*)s.c_str(), (float*)m2, nImg*ofm); } } } } #endif } void ConvNode::backPropagate() { int nImg = gparams_.batch_size; int ifm = gparams_.nInput; int ofm = gparams_.nOutput; int ifh = gparams_.iHeight; int ifhp = ifh + 2*gparams_.ipad_h; int ifw = gparams_.iWidth; int ifwp = ifw + 2*gparams_.ipad_w; int ofh = 
gparams_.oHeight; int ofw = gparams_.oWidth; int ofhp = ofh + 2*gparams_.opad_h; int ofwp = ofw + 2*gparams_.opad_w; int kh = gparams_.kh; int kw = gparams_.kw; #ifdef DEBUG printf("Executing BP %s\n",NNNode::nname_.c_str()); printf("Grad Outputs: %d x %d x %d\n", ofm, ofh, ofw); printf("Grad Inputs: %d x %d x %d\n", ifm, ifh, ifw); printf("Weights: %d x %d x %d x %d\n", ofm, ifm, kh, kw); #endif tenTopDiff_ = tenTop_->getBuf(DIFF); if(first_bp) { long long int size = nImg * ifm * ifhp *ifwp; if((in_dtype == DT_BF16 && out_dtype == DT_FLOAT) || (in_dtype == DT_FLOAT && out_dtype == DT_FLOAT)) { float* ptr = (float*)tenBotDiff_->getBuffer(); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer(); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; ibackPropagate(tenTopData_, tenWeightData_, tenTopDiff_, tenBotDiff_); #ifdef CHECK_BLOWUP_FP32 if(out_dtype == DT_FLOAT) { for(int i=0; i<10240; i++) { float v = ((float*)tenBotDiff_->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! %s layer BP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(out_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)tenBotDiff_->getBuffer(), cbptr, 10240); #ifdef USE_MLSL int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else int node_id = 0; #endif if(node_id == 0) { for(int i=0; i<10240; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! 
%s layer BP activations are NaN or Inf\n", nname_.c_str()); MeanOfLayer((char*)((nname_+"_delin").c_str()), (libxsmm_bfloat16*)tenBotDiff_->getBuffer(), nImg*ifm*ifhp*ifwp); MeanOfLayer((char*)((nname_+"_delout").c_str()), (libxsmm_bfloat16*)tenTopDiff_->getBuffer(), nImg*ofm*ofhp*ofwp); MeanOfLayer((char*)((nname_+"_weight").c_str()), (libxsmm_bfloat16*)tenWeightData_->getLPBuffer(), ofm*ifm*kh*kw); #ifdef USE_MLSL MPI_Finalize(); #endif exit(-1); } } } } #endif #ifdef GETSTATS #ifdef USE_MLSL unsigned int node_id_ = MLSL::Environment::GetEnv().GetProcessIdx(); if(node_id_ == 0) #endif { if(eptr_->get_current_batch() % STATFREQ == 0) { if(in_dtype == DT_FLOAT && out_dtype == DT_FLOAT) { string s = nname_ + "_delOutp"; float *ptr = (float*)tenTopDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, nImg*ofm*ofhp*ofwp); s = nname_ + "_Wt"; ptr = (float*)tenWeightData_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, ifm*ofm*kh*kw); s = nname_ + "_delInp"; ptr = (float*)tenBotDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, nImg*ifm*ifhp*ifwp); } else if(in_dtype == DT_BF16 && out_dtype == DT_BF16) { string s = nname_ + "_delOutp"; libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenTopDiff_->getBuffer(); memset(stptr, 0, nImg*ofm*ofhp*ofwp); convert_bf16_f32(ptr, stptr, nImg*ofm*ofhp*ofwp); MeanOfLayer((char*)s.c_str(), stptr, nImg*ofm*ofhp*ofwp); s = nname_ + "_Wt"; float *fptr = (float*)tenWeightData_->getBuffer(); MeanOfLayer((char*)s.c_str(), fptr, ifm*ofm*kh*kw); s = nname_ + "_delInp"; ptr = (libxsmm_bfloat16*)tenBotDiff_->getBuffer(); memset(stptr, 0, nImg*ifm*ifhp*ifwp); convert_bf16_f32(ptr, stptr, nImg*ifm*ifhp*ifwp); MeanOfLayer((char*)s.c_str(), stptr, nImg*ifm*ifhp*ifwp); } } } #endif } void ConvNode::weightUpdate() { int nImg = gparams_.batch_size; int ifm = gparams_.nInput; int ofm = gparams_.nOutput; int ifh = gparams_.iHeight; int ifw = gparams_.iWidth; int ofh = gparams_.oHeight; int ofw = gparams_.oWidth; int ofhp = ofh + 2*gparams_.opad_h; int 
ofwp = ofw + 2*gparams_.opad_w; int ifhp = ifh + 2*gparams_.ipad_h; int ifwp = ifw + 2*gparams_.ipad_w; int kh = gparams_.kh; int kw = gparams_.kw; #ifdef DEBUG // printf("Executing WU %s: grad_output %p, grad_weights %p, input %p\n",NNNode::nname_.c_str(), gtop, gwt, bot); printf("Executing WU %s\n",NNNode::nname_.c_str()); printf("Grad Outputs: %d x %d x %d\n",ofm, ofh,ofw); printf("Inputs: %d x %d x %d\n",ifm, ifh, ifw); printf("del-Weights: %d x %d x %d x %d\n", ofm, ifm, kh, kw); printf("del-Biases: %d\n", ofm); #endif #ifdef GETSTATS int node_id = 0; #ifdef USE_MLSL node_id = MLSL::Environment::GetEnv().GetProcessIdx(); if(node_id == 0 && eptr_->get_current_batch() % STATFREQ == 0) #else if(eptr_->get_current_batch() % STATFREQ == 0) #endif { if(in_dtype == DT_FLOAT) { string s = nname_ + "_delWt_Bef"; float *ptr = (float*)tenWeightDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, ifm*ofm*kh*kw); } else if(in_dtype == DT_BF16) { string s = nname_ + "_delWt_Bef"; libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenWeightDiff_->getBuffer(); memset(stptr, 0, ifm*ofm*kh*kw); convert_bf16_f32(ptr, stptr, ifm*ofm*kh*kw); MeanOfLayer((char*)s.c_str(), stptr, ifm*ofm*kh*kw); } if(gparams_.bias_term) { string s = nname_ + "_delBias_Bef"; float *p = (float*)tenBiasDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), p, ofm); } } #endif tenTopDiff_ = tenTop_->getBuf(DIFF); impl->weightUpdate(tenBotData_, tenTopDiff_, tenWeightDiff_, tenBiasDiff_); #ifdef CHECK_BLOWUP_FP32 if(out_dtype == DT_FLOAT) { for(int i=0; i<10240; i++) { float v = ((float*)tenWeightDiff_->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! 
%s layer weight-gradients are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(out_dtype == DT_BF16) { #ifdef BF16_MLSL void **wptrptr = tenWeightDiff_->getBufferPtr(); #else void **wptrptr = tenWeightDiff_->getLPBufferPtr(); #endif int offset = tenWeightDiff_->getOffset(); void* bf16_wtdiff = wptrptr[0] + offset*sizeof(libxsmm_bfloat16); convert_bf16_f32((libxsmm_bfloat16*)bf16_wtdiff, cbptr, 10240); #ifdef USE_MLSL int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else int node_id = 0; #endif if(node_id == 0) { for(int i=0; i<10240; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! %s layer weight-gradients are NaN or Inf\n", nname_.c_str()); MeanOfLayer((char*)nname_.c_str(), (libxsmm_bfloat16*)bf16_wtdiff, ofm*ifm*kw*kw); exit(-1); } } } } #endif #ifdef USE_MLSL void *mptr = tenWeightDiff_->getBuffer(); #ifndef BF16_MLSL void *lmptr = tenWeightDiff_->getLPBuffer(); if(in_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)lmptr, (float*)mptr, ifm*ofm*kh*kw); op_->GetParameterSet(0)->StartGradientComm(mptr); } else if(in_dtype == DT_FLOAT) op_->GetParameterSet(0)->StartGradientComm(mptr); #else op_->GetParameterSet(0)->StartGradientComm(mptr); #endif if(gparams_.bias_term) op_->GetParameterSet(1)->StartGradientComm(tenBiasDiff_->getBuffer()); #endif #ifdef GETSTATS #ifdef USE_MLSL node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else node_id = 0; #endif if(node_id == 0) { if(in_dtype == DT_FLOAT) { string s = nname_ + "_Inp"; float *ptr = (float*)tenBotData_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, nImg*ifm*ifhp*ifwp); s = nname_ + "_delOutp"; ptr = (float*)tenTopDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, nImg*ofm*ofhp*ofwp); s = nname_ + "_delWt_Aft"; ptr = (float*)tenWeightDiff_->getBuffer(); float *pptr = (float*)tenWeightDiff_->getPrivBuffer(); float *p = (pptr == NULL) ? 
ptr : pptr; MeanOfLayer((char*)s.c_str(), p, ifm*ofm*kh*kw); } else if(in_dtype == DT_BF16) { string s = nname_ + "_Inp"; libxsmm_bfloat16 *ptr; if(tenBotData_->getLPBuffer() != NULL) ptr = (libxsmm_bfloat16*)tenBotData_->getLPBuffer(); else ptr = (libxsmm_bfloat16*)tenBotData_->getBuffer(); memset(stptr, 0, nImg*ifm*ifhp*ifwp); convert_bf16_f32(ptr, stptr, nImg*ifm*ifhp*ifwp); MeanOfLayer((char*)s.c_str(), stptr, nImg*ifm*ifhp*ifwp); s = nname_ + "_delOutp"; ptr = (libxsmm_bfloat16*)tenTopDiff_->getBuffer(); memset(stptr, 0, nImg*ofm*ofhp*ofwp); convert_bf16_f32(ptr, stptr, nImg*ofm*ofhp*ofwp); MeanOfLayer((char*)s.c_str(), stptr, nImg*ofm*ofhp*ofwp); s = nname_ + "_delWt_Aft"; ptr = (libxsmm_bfloat16*)tenWeightDiff_->getBuffer(); memset(stptr, 0, ifm*ofm*kh*kw); convert_bf16_f32(ptr, stptr, ifm*ofm*kh*kw); MeanOfLayer((char*)s.c_str(), stptr, ifm*ofm*kh*kw); } if(gparams_.bias_term) { string s = nname_ + "_delBias_Aft"; float *p = (float*)tenBiasDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), p, ofm); } } #endif } void ConvNode::solverStep() { #ifdef USE_MLSL int ifm = gparams_.nInput; int ofm = gparams_.nOutput; int kh = gparams_.kh; int kw = gparams_.kw; void *gwt = tenWeightDiff_->getBuffer(); float *gbias; if(gparams_.bias_term) gbias = (float*)(tenBiasDiff_->getBuffer()); int wsize = ifm*ofm*kh*kw; void *mptr = op_->GetParameterSet(0)->WaitGradientComm(); if(in_dtype == DT_FLOAT) { if(mptr != NULL && mptr != gwt) memcpy((void*)gwt, mptr, wsize*sizeof(float)); } else if(in_dtype == DT_BF16) { if(mptr != NULL && mptr != dwptr) memcpy((void*)dwptr, mptr, wsize*sizeof(float)); convert_f32_bf16(dwptr, (libxsmm_bfloat16*)gwt, wsize); } if(gparams_.bias_term) { mptr = op_->GetParameterSet(1)->WaitGradientComm(); if(mptr != NULL && mptr != gbias) memcpy((void*)gbias, mptr, ofm*sizeof(float)); } #endif } 
libxsmm-1.17/samples/deeplearning/gxm/src/ConvXSMM.cpp000066400000000000000000000735411415223013700227260ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar, Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "ConvXSMM.hpp" using namespace std; ConvXSMM::ConvXSMM(ConvImplParams* gp, int engine) : ConvImpl(gp, engine) { conv_desc.N = gp->batch_size/gp->num_numa_nodes; conv_desc.C = gp->nInput; conv_desc.H = gp->iHeight; conv_desc.W = gp->iWidth; conv_desc.K = gp->nOutput; conv_desc.R = gp->kh; conv_desc.S = gp->kw; conv_desc.u = gp->stride_h; conv_desc.v = gp->stride_w; if(gp->physical_padding) { conv_desc.pad_h_in = gp->ipad_h; conv_desc.pad_w_in = gp->ipad_w; } else { conv_desc.pad_h_in = 0; conv_desc.pad_w_in = 0; } conv_desc.pad_w = gp->pad_w; conv_desc.pad_h = gp->pad_h; if(gp->physical_padding) { conv_desc.pad_h_out = gp->opad_h; conv_desc.pad_w_out = gp->opad_w; } else { conv_desc.pad_h_out = 0; conv_desc.pad_w_out = 0; } conv_desc.threads = gp->num_threads/gp->num_numa_nodes; conv_desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; conv_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; conv_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; conv_desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; if(gp->out_data_type == DT_FLOAT) conv_desc.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE; else if(gp->out_data_type == DT_BF16) conv_desc.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE; if(gp->bias_term) printf("GxM Error: Fusion for Bias into convolution is not 
available!\n"); if(gp->relu) printf("GxM Error: Fusion for ReLU into convolution is not available!\n"); if(gp->bias_term && gp->relu) printf("GxM Error: Fusion for ReLU ind Bias into convolution is not available!\n"); if(gp->compute_stats) printf("GxM Error: Fusion for Batch stats into convolution is not available!\n"); if(gp->compute_stats && gp->bwd_relu) printf("GxM Error: Fusion for Batch stats and ReLU into convolution is not available!\n"); if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_FLOAT) { conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; } else if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; } for(int i=0; inum_numa_nodes; i++) { libxsmm_handle[i] = libxsmm_dnn_create_conv_layer( conv_desc, &status ); CHKERR_LIBXSMM_DNN( status ); } top_layout_type = LIBXSMM_CUSTOM_LAYOUT; top_layout = libxsmm_handle; gbot_layout_type = LIBXSMM_CUSTOM_LAYOUT; gbot_layout = libxsmm_handle; } void ConvXSMM::forwardPropagate(TensorBuf *inp, TensorBuf *weightp, TensorBuf *hweightp, TensorBuf *biasp, TensorBuf *outp, int tid) { int nIFM = gp->nInput; int nOFM = gp->nOutput; int nBIfm = nIFM/VLEN; int nBOfm = nOFM/VLEN; int ifh = gp->iHeight; int ifw = gp->iWidth; int ofh = gp->oHeight; int ofw = gp->oWidth; int iph = gp->ipad_h; int ipw = gp->ipad_w; int oph = gp->opad_h; int opw = gp->opad_w; int ifhp = ifh + 2*iph; int ifwp = ifw + 2*ipw; int ofhp = ofh + 2*oph; int ofwp = ofw + 2*opw; int kh = gp->kh; int kw = gp->kw; // Conv input. 
LPBuffer is non-NULL if data layer output is BF16 int imoff = conv_desc.N * conv_desc.C * ifhp * ifwp; if(gp->in_data_type == DT_BF16) { if(inp->getLPBuffer() != NULL) in_ptr[0] = inp->getLPBuffer(); else in_ptr[0] = inp->getBuffer(); imoff = imoff * sizeof(libxsmm_bfloat16); } else if(gp->in_data_type == DT_FLOAT) { in_ptr[0] = inp->getBuffer(); imoff = imoff * sizeof(float); } for(int n=1; nnum_numa_nodes; n++) in_ptr[n] = in_ptr[n-1] + imoff; // Conv Weight void **lptrptr = weightp->getLPBufferPtr(); void **ptrptr = weightp->getBufferPtr(); int offset = weightp->getOffset(); if(gp->in_data_type == DT_BF16) { if(lptrptr != NULL) for(int n=0; nnum_numa_nodes; n++) { wt_ptr[n] = lptrptr[n] + offset*sizeof(libxsmm_bfloat16); f32_wt_ptr[n] = ptrptr[n] + offset*sizeof(float); } } else if(gp->in_data_type == DT_FLOAT) for(int n=0; nnum_numa_nodes; n++) wt_ptr[n] = ptrptr[n] + offset*sizeof(float); void *wt_prv_ptr = NULL; // Conv weight history if(hweightp != NULL) hwt_ptr = hweightp->getBuffer(); else hwt_ptr = NULL; // Conv output out_ptr[0] = outp->getBuffer(); imoff = conv_desc.N * conv_desc.K * ofhp * ofwp; if(gp->out_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) out_ptr[n] = out_ptr[n-1] + imoff; void **sptrptr = scratchp->getBufferPtr(); for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_input[n] == NULL && libxsmm_output[n] == NULL) { libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle[n], LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input[n] = libxsmm_dnn_link_tensor( libxsmm_layout, in_ptr[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_bind_tensor( libxsmm_handle[n], libxsmm_input[n], LIBXSMM_DNN_REGULAR_INPUT ) ); // Conv Output libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle[n], 
LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output[n] = libxsmm_dnn_link_tensor( libxsmm_layout, out_ptr[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_bind_tensor( libxsmm_handle[n], libxsmm_output[n], LIBXSMM_DNN_REGULAR_OUTPUT ) ); } } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_filter[n] == NULL) { libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle[n], LIBXSMM_DNN_REGULAR_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); int welem = gp->nInput * gp->nOutput * gp->kw * gp->kh; if(gp->in_data_type == DT_FLOAT) { libxsmm_filter[n] = libxsmm_dnn_link_tensor( libxsmm_layout, wt_ptr[n], &status ); CHKERR_LIBXSMM_DNN( status ); if(gp->node_name == "conv1" || gp->node_name == "convbn1") { libxsmm_dnn_tensor *tensor = libxsmm_filter[n]; libxsmm_dnn_err_t status; libxsmm_dnn_tensor_datalayout *mylayout = libxsmm_dnn_get_tensor_datalayout (tensor, &status); int i1, i2, i3, i4, i5, i6; int bofm = 0; int bifm = 0; int S = 0; int R = 0; int ifmb = 0; int ofmb = 0; assert( mylayout->num_dims == 6 ); bofm = mylayout->dim_size[0]; bifm = mylayout->dim_size[1]; S = mylayout->dim_size[2]; R = mylayout->dim_size[3]; ifmb = mylayout->dim_size[4]; ofmb = mylayout->dim_size[5]; LIBXSMM_VLA_DECL(6, float, handle_data, (float*)wt_ptr[n], ifmb, R, S, bifm, bofm); for (i1 = 0; i1 < ofmb; ++i1) { for (i2 = 0; i2 < ifmb; ++i2) { for (i3 = 0; i3 < R; ++i3) { for (i4 = 0; i4 < S; ++i4) { for (i5 = 0; i5 < bifm; ++i5) { for (i6 = 0; i6 < bofm; ++i6) { /* set 4th input channel to 0 */ if ( (i6 == 1) && (i5 == 1) ) { LIBXSMM_VLA_ACCESS(6, handle_data, i1, i2, i3, i4, i5, i6, ifmb, R, S, bifm, bofm) = (float)0; } } } } } } } } } else if(gp->in_data_type == DT_BF16) { libxsmm_filter[n] = libxsmm_dnn_link_tensor( libxsmm_layout, wt_ptr[n], &status ); CHKERR_LIBXSMM_DNN( status ); if(gp->node_name == "conv1" || gp->node_name == "convbn1") { libxsmm_dnn_tensor 
*tensor = libxsmm_filter[n]; libxsmm_dnn_err_t status; libxsmm_bfloat16* mydata = (libxsmm_bfloat16*)libxsmm_dnn_get_tensor_data_ptr(tensor, &status); libxsmm_dnn_tensor_datalayout * mylayout = libxsmm_dnn_get_tensor_datalayout (tensor, &status); int i1, i2, i3, i4, i5, i6, i7; int lpb = 0; int bofm = 0; int bifm = 0; int S = 0; int R = 0; int ifmb = 0; int ofmb = 0; /* check for VNNI weights */ assert( mylayout->num_dims == 7 ); lpb = mylayout->dim_size[0]; bofm = mylayout->dim_size[1]; bifm = mylayout->dim_size[2]; S = mylayout->dim_size[3]; R = mylayout->dim_size[4]; ifmb = mylayout->dim_size[5]; ofmb = mylayout->dim_size[6]; LIBXSMM_VLA_DECL(7, libxsmm_bfloat16, handle_data_1, mydata, ifmb, R, S, bifm, bofm, lpb); for (i1 = 0; i1 < ofmb; ++i1) { for (i2 = 0; i2 < ifmb; ++i2) { for (i3 = 0; i3 < R; ++i3) { for (i4 = 0; i4 < S; ++i4) { for (i5 = 0; i5 < bifm; ++i5) { for (i6 = 0; i6 < bofm; ++i6) { for (i7 = 0; i7 < lpb; ++i7) { /* set 4th input channel to 0 */ if ( (i7 == 1) && (i5 == 1) ) { LIBXSMM_VLA_ACCESS(7, handle_data_1, i1, i2, i3, i4, i5, i6, i7, ifmb, R, S, bifm, bofm, lpb) = (libxsmm_bfloat16)0; } } } } } } } } LIBXSMM_VLA_DECL(7, float, handle_data_2, (float*)f32_wt_ptr[n], ifmb, R, S, bifm, bofm, lpb); for (i1 = 0; i1 < ofmb; ++i1) { for (i2 = 0; i2 < ifmb; ++i2) { for (i3 = 0; i3 < R; ++i3) { for (i4 = 0; i4 < S; ++i4) { for (i5 = 0; i5 < bifm; ++i5) { for (i6 = 0; i6 < bofm; ++i6) { for (i7 = 0; i7 < lpb; ++i7) { /* set 4th input channel to 0 */ if ( (i7 == 1) && (i5 == 1) ) { LIBXSMM_VLA_ACCESS(7, handle_data_2, i1, i2, i3, i4, i5, i6, i7, ifmb, R, S, bifm, bofm, lpb) = (float)0; } } } } } } } } } } libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_bind_tensor( libxsmm_handle[n], libxsmm_filter[n], LIBXSMM_DNN_REGULAR_FILTER ) ); } } /* let's allocate (if required) and bind scratch */ if(sptrptr == NULL) { sptrptr = (void**)libxsmm_aligned_malloc(gp->num_numa_nodes*sizeof(void*), 2097152); 
scratchp->setBufferPtr(sptrptr); } if(prev_scratch_size == 0) prev_scratch_size = scratchp->getBufferSize(); if(!updated_scratch_fwd || prev_scratch_size != scratchp->getBufferSize()) { int max_size = 0; for(int n=0; nnum_numa_nodes; n++) { if(sptrptr[n] == NULL) { int mysize = libxsmm_dnn_get_scratch_size( libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); sptrptr[n] = (void*)libxsmm_aligned_malloc(mysize, 2097152); max_size = mysize; #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %d bytes for scratch @ %p\n",nname.c_str(), mysize, sptrptr[n]); } else { int ssize = scratchp->getBufferSize(); int mysize = libxsmm_dnn_get_scratch_size( libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); if(ssize < mysize) { libxsmm_free(sptrptr[n]); sptrptr[n] = (void*)libxsmm_aligned_malloc(mysize, 2097152); max_size = mysize; #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %d bytes for scratch @ %p, prev size was %d bytes\n",nname.c_str(), mysize, sptrptr[n], ssize); } else max_size = ssize; } } scratchp->setBufferSize(max_size); for(int n=0; nnum_numa_nodes; n++) CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_ALL, sptrptr[n] ) ); updated_scratch_fwd = true; prev_scratch_size = scratchp->getBufferSize(); } #if 0 #ifndef NDEBUG /* check physical padding */ if ( (gp->ipad_h > 0 || gp->ipad_w > 0) && (gp->opad_h > 0 || gp->opad_w > 0) ) { } else if ( (gp->ipad_h == 0 || gp->ipad_w == 0) && (gp->opad_h == 0 || gp->opad_w == 0) ) { } else { printf("node %s: conv xsmm forward is partially padded which cannot be :-(\n", nname.c_str()); } if(gp->in_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)in_ptr[0], conv_desc.N, nBIfm, ifh, ifw, VLEN, iph, ipw); else if(gp->in_data_type == DT_BF16) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)in_ptr[0], 
conv_desc.N, nBIfm, ifh, ifw, VLEN, iph, ipw); if(gp->in_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)out_ptr[0], conv_desc.N, nBOfm, ofh, ofw, VLEN, oph, opw); else if(gp->in_data_type == DT_BF16) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)out_ptr[0], conv_desc.N, nBOfm, ofh, ofw, VLEN, oph, opw); #endif #endif #ifdef USE_XSMM_TIMING struct timeval tvsc, tvec; gettimeofday(&tvsc, NULL); #endif #ifdef _OPENMP #pragma omp parallel #endif { #ifdef _OPENMP const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_FWD, n*ntps, tid) ); } #ifdef USE_XSMM_TIMING gettimeofday(&tvec, NULL); double fp_time = (tvec.tv_sec + tvec.tv_usec*1e-6) - (tvsc.tv_sec + tvsc.tv_usec*1e-6); #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif { double gf = (double)gp->batch_size * (double)gp->nInput * (double)gp->nOutput * (double)gp->oHeight * (double)gp->oWidth * (double)gp->kh * (double)gp->kw * 2; if(gp->stride_h == 1 && gp->pad_h == 0) printf("%s XSMM-CONV-FP mb%dic%dih%doc%doh%dkh%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput,gp->iHeight,gp->nOutput,gp->oHeight,gp->kh,fp_time*1000.0, gf/fp_time/1e9); else if(gp->stride_h == 2) printf("%s XSMM-CONV-FP mb%dic%dih%doc%doh%dkh%dsh%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput,gp->iHeight,gp->nOutput,gp->oHeight,gp->kh,gp->stride_h,fp_time*1000.0, gf/fp_time/1e9); else if(gp->pad_h == 1) printf("%s XSMM-CONV-FP mb%dic%dih%doc%doh%dkh%dph%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput,gp->iHeight,gp->nOutput,gp->oHeight,gp->kh,gp->pad_h,fp_time*1000.0, gf/fp_time/1e9); } #endif top_layout_type = LIBXSMM_CUSTOM_LAYOUT; outp->setLayoutType(top_layout_type); outp->setLayout(libxsmm_handle); #if 0 #ifndef NDEBUG /* check 
physical padding */ if(gp->in_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)in_ptr[0], conv_desc.N, nBIfm, ifh, ifw, VLEN, iph, ipw); else if(gp->in_data_type == DT_BF16) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)in_ptr[0], conv_desc.N, nBIfm, ifh, ifw, VLEN, iph, ipw); if(gp->out_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)out_ptr[0], conv_desc.N, nBOfm, ofh, ofw, VLEN, oph, opw); else if(gp->out_data_type == DT_BF16) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)out_ptr[0], conv_desc.N, nBOfm, ofh, ofw, VLEN, oph, opw); #endif #endif } void ConvXSMM::backPropagate(TensorBuf* inp, TensorBuf* weightp, TensorBuf *deloutp, TensorBuf* delinp, int tid) { int nIFM = gp->nInput; int nOFM = gp->nOutput; int nBIfm = nIFM/VLEN; int nBOfm = nOFM/VLEN; int ifh = gp->iHeight; int ifw = gp->iWidth; int ofh = gp->oHeight; int ofw = gp->oWidth; int iph = gp->ipad_h; int ipw = gp->ipad_w; int oph = gp->opad_h; int opw = gp->opad_w; int ifhp = ifh + 2*iph; int ifwp = ifw + 2*ipw; int ofhp = ofh + 2*oph; int ofwp = ofw + 2*opw; int imoff = conv_desc.N * conv_desc.K * ofhp * ofwp; if(gp->out_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); dout_ptr[0] = deloutp->getBuffer(); for(int n=1; nnum_numa_nodes; n++) dout_ptr[n] = dout_ptr[n-1] + imoff; imoff = conv_desc.N * conv_desc.C * ifhp * ifwp; if(gp->in_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->in_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); din_ptr[0] = delinp->getBuffer(); for(int n=1; nnum_numa_nodes; n++) din_ptr[n] = din_ptr[n-1] + imoff; void **sptrptr = scratchp->getBufferPtr(); if(!updated_scratch_bwd) { for(int n=0; nnum_numa_nodes; n++) CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_ALL, sptrptr[n] ) ); updated_scratch_bwd = true; } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_delinput[n] == 
NULL && libxsmm_deloutput[n] == NULL) { libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle[n], LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput[n] = libxsmm_dnn_link_tensor(libxsmm_layout, din_ptr[n], &status ); CHKERR_LIBXSMM_DNN(status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle[n], libxsmm_delinput[n], LIBXSMM_DNN_GRADIENT_INPUT ) ); libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle[n], LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN(status ); libxsmm_deloutput[n] = libxsmm_dnn_link_tensor( libxsmm_layout, dout_ptr[n], &status ); CHKERR_LIBXSMM_DNN(status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_bind_tensor( libxsmm_handle[n], libxsmm_deloutput[n], LIBXSMM_DNN_GRADIENT_OUTPUT ) ); } } #if 0 #ifndef NDEBUG /* check physical padding */ if ( (gp->ipad_h > 0 || gp->ipad_w > 0) && (gp->opad_h > 0 || gp->opad_w > 0) ) { } else if ( (gp->ipad_h == 0 || gp->ipad_w == 0) && (gp->opad_h == 0 || gp->opad_w == 0) ) { } else { printf("node %s: conv xsmm backward is partially padded which cannot be :-(\n", nname.c_str()); } if(gp->out_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)din_ptr[0], conv_desc.N, nBIfm, ifh, ifw, VLEN, iph, ipw); else if(gp->out_data_type == DT_BF16) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)din_ptr[0], conv_desc.N, nBIfm, ifh, ifw, VLEN, iph, ipw); if(gp->in_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)dout_ptr[0], conv_desc.N, nBOfm, ofh, ofw, VLEN, oph, opw); else if(gp->in_data_type == DT_BF16) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)dout_ptr[0], conv_desc.N, nBOfm, ofh, ofw, VLEN, oph, opw); #endif #endif #ifdef USE_XSMM_TIMING struct timeval tvsc, tvec; gettimeofday(&tvsc, NULL); #endif #ifdef _OPENMP #pragma omp parallel #endif { #ifdef _OPENMP const int tid = 
omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_BWD, n*ntps, tid ) ); } #ifdef USE_XSMM_TIMING gettimeofday(&tvec, NULL); double bp_time = (tvec.tv_sec + tvec.tv_usec*1e-6) - (tvsc.tv_sec + tvsc.tv_usec*1e-6); #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif { double gf = (double)gp->batch_size * (double)gp->nInput * (double)gp->nOutput * (double)gp->oHeight * (double)gp->oWidth * (double)gp->kh * (double)gp->kw * 2; if(gp->stride_h == 1 && gp->pad_h == 0) printf("%s XSMM-CONV-BP mb%dic%dih%doc%doh%dkh%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput,gp->iHeight,gp->nOutput,gp->oHeight,gp->kh,bp_time*1000.0, gf/bp_time/1e9); else if(gp->stride_h == 2) printf("%s XSMM-CONV-BP mb%dic%dih%doc%doh%dkh%dsh%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput,gp->iHeight,gp->nOutput,gp->oHeight,gp->kh,gp->stride_h,bp_time*1000.0, gf/bp_time/1e9); else if(gp->pad_h == 1) printf("%s XSMM-CONV-BP mb%dic%dih%doc%doh%dkh%dph%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput,gp->iHeight,gp->nOutput,gp->oHeight,gp->kh,gp->pad_h,bp_time*1000.0, gf/bp_time/1e9); } #endif gbot_layout_type = LIBXSMM_CUSTOM_LAYOUT; delinp->setLayoutType(gbot_layout_type); delinp->setLayout(libxsmm_handle); #if 0 #ifndef NDEBUG /* check physical padding */ if(gp->out_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)din_ptr[0], conv_desc.N, nBIfm, ifh, ifw, VLEN, iph, ipw); else if(gp->out_data_type == DT_BF16) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)din_ptr[0], conv_desc.N, nBIfm, ifh, ifw, VLEN, iph, ipw); if(gp->in_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)dout_ptr[0], conv_desc.N, nBOfm, ofh, ofw, VLEN, oph, opw); else if(gp->in_data_type == DT_BF16) 
check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)dout_ptr[0], conv_desc.N, nBOfm, ofh, ofw, VLEN, oph, opw); #endif #endif } void ConvXSMM::weightUpdate(TensorBuf *inp, TensorBuf *deloutp, TensorBuf* delweightp, TensorBuf* delbiasp, int tid) { int ifm = gp->nInput; int ofm = gp->nOutput; int ofh = gp->oHeight; int ofw = gp->oWidth; int oph = gp->opad_h; int opw = gp->opad_w; int ofhp = ofh + 2*oph; int ofwp = ofw + 2*opw; int kh = gp->kh; int kw = gp->kw; void *dwt_ptr[NUM_NUMA_NODES]; void **ptrptr; if(gp->in_data_type == DT_BF16) { #ifdef BF16_MLSL ptrptr = delweightp->getBufferPtr(); #else ptrptr = delweightp->getLPBufferPtr(); #endif } else ptrptr = delweightp->getBufferPtr(); int offset = delweightp->getOffset(); if(gp->in_data_type == DT_FLOAT) offset = offset*sizeof(float); else if(gp->in_data_type == DT_BF16) offset = offset*sizeof(libxsmm_bfloat16); for(int n=0; nnum_numa_nodes; n++) dwt_ptr[n] = ptrptr[n] + offset; int imoff = conv_desc.N * conv_desc.K * ofhp * ofwp; if(gp->out_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); dout_ptr[0] = deloutp->getBuffer(); for(int n=1; nnum_numa_nodes; n++) dout_ptr[n] = dout_ptr[n-1] + imoff; void **sptrptr = scratchp->getBufferPtr(); if(!updated_scratch_upd) { for(int n=0; nnum_numa_nodes; n++) CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_ALL, sptrptr[n] ) ); updated_scratch_upd = true; } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_delfilter[n] == NULL) { libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle[n], LIBXSMM_DNN_GRADIENT_FILTER, &status ); CHKERR_LIBXSMM_DNN(status ); libxsmm_delfilter[n] = libxsmm_dnn_link_tensor( libxsmm_layout, dwt_ptr[n], &status ); CHKERR_LIBXSMM_DNN(status); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_bind_tensor( libxsmm_handle[n], libxsmm_delfilter[n], LIBXSMM_DNN_GRADIENT_FILTER ) 
); } } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_deloutput[n] == NULL) { libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle[n], LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput[n] = libxsmm_dnn_link_tensor(libxsmm_layout, dout_ptr[n], &status ); CHKERR_LIBXSMM_DNN(status); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_bind_tensor( libxsmm_handle[n], libxsmm_deloutput[n], LIBXSMM_DNN_GRADIENT_OUTPUT ) ); } } #ifdef USE_XSMM_TIMING struct timeval tvsc, tvec; gettimeofday(&tvsc, NULL); #endif #ifdef _OPENMP #pragma omp parallel #endif { #ifdef _OPENMP const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_UPD, n*ntps, tid ) ); #ifdef USE_MLSL #pragma omp barrier if(gp->num_numa_nodes > 1) { if(gp->in_data_type == DT_FLOAT) { int jobs = ofm * ifm * kh * kw; int jn = jobs/gp->num_numa_nodes; int jnv = jn/VLEN; int jpt = (jnv % ntps == 0) ? (jnv/ntps)*VLEN : ((jnv/ntps)+1)*VLEN; int ltid = tid - n*ntps; int tb = (ltid * jpt < jn) ? ltid*jpt : jn; int te = ((ltid+1)*jpt < jn) ? (ltid+1)*jpt : jn; float *wgp = (float*)dwt_ptr[n]+n*jn; for(int nn=0; nnnum_numa_nodes; nn++) { if(n == nn) continue; float *rgp = (float*)dwt_ptr[nn]+n*jn; #pragma omp simd for(int i=tb; inum_numa_nodes; nn++) { if(n == nn) continue; float *wgp = (float*)dwt_ptr[n]+nn*jn; float *rgp = (float*)dwt_ptr[nn]+nn*jn; #pragma vector nontemporal #pragma omp simd for(int i=tb; iin_data_type == DT_BF16) { if(n == 0) { int jobs = ofm * ifm * kh * kw; assert(jobs % VLEN == 0); int jv = jobs/VLEN; int rem = jv % ntps; int jpt = (rem == 0) ? (jv/ntps)*VLEN : ((jv-rem)/ntps)*VLEN; int tb = (tid * jpt < jobs) ? tid*jpt : jobs; int te = ((tid+1)*jpt < jobs) ? 
(tid+1)*jpt : jobs; libxsmm_bfloat16 *my_ptr = (libxsmm_bfloat16*)dwt_ptr[n]; for(int nn=1; nnnum_numa_nodes; nn++) { libxsmm_bfloat16 *rem_ptr = (libxsmm_bfloat16*)dwt_ptr[nn]; for(int i=tb; i 0) { for(int i=ntps*jpt; ibatch_size * (double)gp->nInput * (double)gp->nOutput * (double)gp->oHeight * (double)gp->oWidth * (double)gp->kh * (double)gp->kw * 2; if(gp->stride_h == 1 && gp->pad_h == 0) printf("%s XSMM-CONV-WU mb%dic%dih%doc%doh%dkh%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput,gp->iHeight,gp->nOutput,gp->oHeight,gp->kh,wu_time*1000.0, gf/wu_time/1e9); else if(gp->stride_h == 2) printf("%s XSMM-CONV-WU mb%dic%dih%doc%doh%dkh%dsh%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput,gp->iHeight,gp->nOutput,gp->oHeight,gp->kh,gp->stride_h,wu_time*1000.0, gf/wu_time/1e9); else if(gp->pad_h == 1) printf("%s XSMM-CONV-WU mb%dic%dih%doc%doh%dkh%dph%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput,gp->iHeight,gp->nOutput,gp->oHeight,gp->kh,gp->pad_h,wu_time*1000.0, gf/wu_time/1e9); } #endif } void ConvXSMM::dumpBuffer(TensorBuf* tBuf, void* wtemp) { int buftype = tBuf->getBufferType(); if(buftype == DATA) { CHKERR_LIBXSMM_DNN(libxsmm_dnn_copyout_tensor(libxsmm_checkpoint_filter, wtemp, LIBXSMM_DNN_TENSOR_FORMAT_KCRS)); } else if(buftype == HISTORY) CHKERR_LIBXSMM_DNN(libxsmm_dnn_copyout_tensor(libxsmm_checkpoint_history_filter, wtemp, LIBXSMM_DNN_TENSOR_FORMAT_KCRS)); } libxsmm-1.17/samples/deeplearning/gxm/src/Dropout.cpp000066400000000000000000000136271415223013700227470ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #include #include "Dropout.hpp" #include "fillers.hpp" #define PRIME_SEED 131 using namespace std; using namespace gxm; DropoutNode::DropoutNode(DropoutParams* p, MLEngine* e): NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); bottom_ = p->get_bottom_names(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = false; assert((bottom_.size() == 1) && (top_.size() == 1)); tenTop_ = new Tensor(top_[0]); assert(tenTop_ != NULL); tenTop_->setOwner(this); tenTop_->setType(ACT); tenTopData_ = tenTop_->getBuf(DATA); tenTopData_->setBufferType(DATA); #ifdef DEBUG printf("bottom name %s\n",bottom_[0].c_str()); #endif tenBot_ = e->get_tensor(bottom_[0], ACT); assert(tenBot_ != NULL); setPrevNode((NNNode*)tenBot_->getOwner()); tenBotData_ = tenBot_->getBuf(DATA); //Output tensor data type = input tensor data type int dtype = p->get_data_type(); tenTopData_->setDataType(dtype); // Get input tensor shape (bottom) Shape* bs = tenBot_->getShape(); assert(bs->ndims <= MAX_DIMS); Shape ts; shape_setzero(&ts); ts.ndims = bs->ndims; for(int i=0; i < bs->ndims; i++) ts.dims[i] = bs->dims[i]; tenTop_->setShape(&ts); long long int tsize = 1; for(int i=0; isetBufferSize(tsize); if(!e->is_inference_only()) { if(bp_flag_) { tenBotDiff_ = tenBot_->addBuf(); // DIFF type and index tenBotDiff_->setDataType(dtype); tenBotDiff_->setBufferType(DIFF); long long int bsize = 1; for(int i=0; indims; i++) bsize = bsize*bs->dims[i]; if(dtype == DT_FLOAT) bsize = bsize*sizeof(float); else if(dtype == DT_INT) bsize = bsize*sizeof(int); // Set the size of the input-gradient buffer tenBotDiff_->setBufferSize(bsize); } } else 
tenBotDiff_ = NULL; // Compute scale via dropout_ratio threshold_ = p->get_dropout_ratio(); if(threshold_ != 0.5) { printf("Support for threshold %f not implemented! Resetting to 0.5\n",threshold_); threshold_ = 0.5; } scale_ = 1./(1 - threshold_); // Register output tensor in tensor map bool inserted = e->register_tensor(top_[0], ACT, tenTop_); if(!inserted) printf("Warning: Tensor %s already registered\n",NNNode::top_[0].c_str()); gparams_.batch_size = bs->dims[0]; gparams_.nInput = bs->dims[1]; gparams_.nOutput = gparams_.nInput; gparams_.iHeight = bs->dims[2]; gparams_.iWidth = bs->dims[3]; gparams_.oHeight = ts.dims[2]; gparams_.oWidth = ts.dims[3]; gparams_.data_type = dtype; gparams_.num_threads = e->get_num_threads(); seeds = new unsigned int[gparams_.num_threads]; for(int i=0; igetBuffer()); float* top = (float*)(tenTopData_->getBuffer()); int *mask = (int *)tenMask_; // unsigned int *seeds = tenSeeds_; #ifdef DEBUG printf("Executing FP %s: input %p, output %p\n",NNNode::nname_.c_str(), bot, top); printf("Inputs: %d\n",gparams_.nInput); printf("Outputs: %d\n",gparams_.nOutput); #endif int M = gparams_.batch_size; int N = gparams_.nOutput; int H = gparams_.oHeight; int W = gparams_.oWidth; if(eptr_->get_execution_mode() == TRAIN) { #ifdef _OPENMP #pragma omp parallel for #endif for (int i = 0; i < M*N*H*W; i++) { int r = rand_r(&seeds[omp_get_thread_num()]); if(r%2 == 0) top[i] = 0; else top[i] = bot[i] * scale_; } } else { #ifdef _OPENMP #pragma omp parallel for #endif for (int i = 0; i < M*N*H*W; i++) top[i] = bot[i]; } #ifdef DEBUG MeanOfLayer((char*)bottom_[0].c_str(), bot, M*N*H*W); MeanOfLayer((char*)top_[0].c_str(), top, M*N*H*W); #endif } void DropoutNode::backPropagate() { #ifdef REUTRNALL return; #endif int M = gparams_.batch_size; int N = gparams_.nOutput; int H = gparams_.oHeight; int W = gparams_.oWidth; TensorBuf *tenTopDiff = tenTop_->getBuf(DIFF); float *gtop = (float*)(tenTopDiff->getBuffer()); assert(gtop != NULL); float* gbot = 
(float*)(tenBotDiff_->getBuffer()); int *mask = (int *)tenMask_; #ifdef DEBUG printf("Executing BP %s: grad_output %p, grad_input %p\n",NNNode::nname_.c_str(), gtop, gbot); printf("Grad Outputs: %d\n", N*H*W); printf("Grad Inputs: %d\n", N*H*W); #endif assert(eptr_->get_execution_mode() == TRAIN); #ifdef _OPENMP #pragma omp parallel for #endif for (int i = 0; i < M*N*H*W; i++) gbot[i] = gtop[i] * mask[i] * scale_; #ifdef DEBUG MeanOfLayer((char*)bottom_[0].c_str(), gtop, M*N*H*W); MeanOfLayer((char*)top_[0].c_str(), gbot, M*N*H*W); #endif } libxsmm-1.17/samples/deeplearning/gxm/src/DummyData.cpp000066400000000000000000000176061415223013700232010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #include "DummyData.hpp" DummyDataNode::DummyDataNode(DummyDataParams* p, MLEngine* e) : NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); top_ = p->get_top_names(); has_weights_ = false; bot_compute_engine_ = p->get_compute_engine(); int dtype; tenTop_.resize(top_.size()); tenTopData_.resize(top_.size()); for(int i=0; isetOwner(this); tenTopData_[i] = tenTop_[i]->getBuf(DATA); if(top_[i].compare("data") == 0) { tenTop_[i]->setType(INPUT); tenTopData_[i]->setBufferType(DATA); // FIXME: the data type should be set elsewhere... 
dtype = p->get_data_type(); tenTopData_[i]->setDataType(dtype); ts_ = p->get_shape(); pad_h_ = p->get_pad_h(); pad_w_ = p->get_pad_w(); tenTop_[i]->setShape(ts_); long long int size = ts_->dims[0] * ts_->dims[1] * (ts_->dims[2] + 2*pad_h_) * (ts_->dims[3] + 2*pad_w_); if(dtype == DT_FLOAT) size = size*sizeof(float); else if(dtype == DT_BF16) size = size*sizeof(float); // Set the logical size of the tensor buffer for bufId=0 (forward data buffer). // Note: we have no knowledge of the machine parameters here, so effectively this is single-machine config tenTop_[i]->setDataBufferSize(DATA, size); num_machines_ = e->get_num_machines(); global_batch_size_ = num_machines_ * ts_->dims[0]; #ifdef USE_MLSL MLSL::Session *s = e->get_session(); s->SetGlobalMinibatchSize(global_batch_size_); #endif if(p->get_num_train_files() != 0) e->set_num_train_batches(p->get_num_train_files()/ts_->dims[0]); if(p->get_num_test_files() != 0) { e->set_num_test_batches(p->get_num_test_files()/ts_->dims[0]); e->set_num_test_views(1); } e->set_batch_size(ts_->dims[0]); bool inserted = e->register_tensor(top_[i], INPUT, tenTop_[i]); if(!inserted) printf("Warning: Tensor %s already registered\n",top_[i].c_str()); filler_type_ = p->get_filler_type(); filler_val_ = p->get_filler_val(); } else if(top_[i].compare("label") == 0) { tenTop_[i]->setType(LABEL); Shape *ts = p->get_shape(); ts->ndims = 1; ts->dims[1] = 0; ts->dims[2] = 0; ts->dims[3] = 0; // FIXME: the data type should be set elsewhere... 
dtype = DT_INT; tenTopData_[i]->setDataType(dtype); tenTopData_[i]->setBufferType(DATA); tenTop_[i]->setDataBufferSize(DATA, ts->dims[0]*sizeof(int)); tenTop_[i]->setShape(ts); bool inserted = e->register_tensor(top_[i], LABEL, tenTop_[i]); if(!inserted) printf("Warning: Tensor %s already registered\n",NNNode::top_[i].c_str()); } } //No input tensor to this node this->tenBot_ = NULL; } void convert_f32_bf16(float* in, libxsmm_bfloat16* out, int len) { int i; #ifdef _OPENMP #pragma omp parallel for private(i) #endif for ( i = 0; i < len; i+=16 ) { __m512 vfp32 = gxm_fp32_to_bfp16_rne_adjustment_avx512f( _mm512_loadu_ps( in+i ) ); __m256i vbfp16 = gxm_fp32_to_bfp16_truncate_avx512f( vfp32 ); _mm256_storeu_si256( (__m256i*)(out+i), vbfp16 ); } } void DummyDataNode::fillData(float* ptr, long long int size) { if(first_fp) { #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetShape(); int ifhp = ts->dims[2]+2*pad_h_; int ifwp = ts->dims[3]+2*pad_w_; int nFM = ts->dims[1]; float (* __restrict input)[ifhp][ifwp][nFM] = (float (*)[*][*][*])ptr; if(filler_type_ == "rand") { #ifdef _OPENMP #pragma omp parallel for #endif for(int img=0; imgdims[0]; img++) { for(int h=pad_h_; hdims[2]+pad_h_; h++) { for(int w=pad_w_; wdims[3]+pad_w_; w++) { for(int fm=0; fmdims[1]; fm++) { input[img][h][w][fm] = (float)(rand()/RAND_MAX); } } } } } else if(filler_type_ == "constant") { #ifdef _OPENMP #pragma omp parallel for #endif for(int img=0; imgdims[0]; img++) { for(int h=pad_h_; hdims[2]+pad_h_; h++) { for(int w=pad_w_; wdims[3]+pad_w_; w++) { for(int fm=0; fmdims[1]; fm++) { input[img][h][w][fm] = filler_val_; } } } } } } void DummyDataNode::fillData(int* ptr, long long int size) { if(filler_type_.compare("rand") == 0) { #ifdef _OPENMP #pragma omp parallel for #endif for(long long int i=0; igetShape(); int ifhp = ts->dims[2]+2*pad_h_; int ifwp = ts->dims[3]+2*pad_w_; int nFM = ts->dims[1]; if(first_fp) { #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; idims[0]; 
img++) { for(int h=pad_h_; hdims[2]+pad_h_; h++) { for(int w=pad_w_; wdims[3]+pad_w_; w++) { for(int fm=0; fmdims[1]; fm++) { input[img][h][w][fm] = (float)rand()/(float)RAND_MAX; } } } } } else if(filler_type_.compare("constant") == 0) { #ifdef _OPENMP #pragma omp parallel for #endif for(int img=0; imgdims[0]; img++) { for(int h=pad_h_; hdims[2]+pad_h_; h++) { for(int w=pad_w_; wdims[3]+pad_w_; w++) { for(int fm=0; fmdims[1]; fm++) { input[img][h][w][fm] = (float)filler_val_; } } } } } convert_f32_bf16(inptr, outptr, size); } void DummyDataNode::forwardPropagate() { #ifdef RETURNALL return; #endif for(int i=0; igetDataType(); long long int bytes = tenTopData_[i]->getBufferSize(); if(dtype == DT_FLOAT) { float* top = (float*)(tenTopData_[i]->getBuffer()); fillData(top, bytes/sizeof(float)); #ifdef DEBUG printf("Executing FP %s: Data %p\n",node_name_.c_str(), top); #endif } else if(dtype == DT_BF16) { libxsmm_bfloat16* top = (libxsmm_bfloat16*)(tenTopData_[i]->getLPBuffer()); if(top == NULL) top = (libxsmm_bfloat16*)_mm_malloc(bytes/sizeof(libxsmm_bfloat16), 64); tenTopData_[i]->setLPBuffer(top); float *bot = (float*)tenTopData_[i]->getBuffer(); fillData(bot, top, bytes/sizeof(float)); #ifdef DEBUG printf("Executing FP %s: Data %p\n",node_name_.c_str(), top); #endif } else if(dtype == DT_INT) { int* top = (int*)(tenTopData_[i]->getBuffer()); for(long long int i=0; i #include "Eltwise.hpp" using namespace std; using namespace gxm; EltwiseNode::EltwiseNode(EltwiseParams* p, MLEngine* e) : NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); bottom_ = p->get_bottom_names(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = false; bot_compute_engine_ = p->get_compute_engine(); assert(top_.size() == 1); tenTop_ = new Tensor(top_[0]); assert(tenTop_ != NULL); tenTop_->setOwner(this); tenTop_->setType(ACT); tenTopData_ = tenTop_->getBuf(DATA); tenTopData_->setBufferType(DATA); #ifdef DEBUG printf("bottom 
name %s\n",bottom_[0].c_str()); #endif Shape ts; shape_setzero(&ts); tenBot_.resize(bottom_.size()); bot_cengine_.resize(bottom_.size()); tenBotData_.resize(bottom_.size()); for(int i=0; iget_tensor(bottom_[i], ACT); assert(tenBot_[i] != NULL); NNNode *pnn = (NNNode*)tenBot_[i]->getOwner(); setPrevNode(pnn); pnn->set_top_compute_engine(p->get_compute_engine()); bot_cengine_[i] = pnn->get_bot_compute_engine(); tenBotData_[i] = tenBot_[i]->getBuf(DATA); } // number of inputs gparams_.nInput.resize(bottom_.size()); tenBotDiff_.resize(bottom_.size()); int dtype = p->get_data_type(); for(int i=0; igetShape(); assert(bs->ndims <= MAX_DIMS); gparams_.nInput[i] = bs->dims[1]; if(!e->is_inference_only()) { if(NNNode::bp_flag_) { tenBotDiff_[i] = tenBot_[i]->addBuf(); // DIFF type and index tenBotDiff_[i]->setDataType(dtype); tenBotDiff_[i]->setBufferType(DIFF); long long int bsize = 1; for(int s=0; sndims; s++) bsize = bsize*bs->dims[s]; if(dtype == DT_FLOAT) bsize = bsize*sizeof(float); else if(dtype == DT_INT16) bsize = bsize*sizeof(short int); else if(dtype == DT_INT) bsize = bsize*sizeof(int); // Set the size of the input-gradient buffer tenBotDiff_[i]->setBufferSize(bsize); } } else tenBotDiff_[i] = NULL; } //Output tensor data type = input tensor data type tenTopData_->setDataType(dtype); Shape *bs = tenBot_[0]->getShape(); ts.ndims = bs->ndims; ts.dims[0] = bs->dims[0]; ts.dims[1] = bs->dims[1]; ts.dims[2] = bs->dims[2]; ts.dims[3] = bs->dims[3]; tenTop_->setShape(&ts); long long int tsize = 1; for(int s=0; ssetBufferSize(tsize); // Register output tensor in tensor map bool inserted = e->register_tensor(NNNode::top_[0], ACT, tenTop_); if(!inserted) printf("Warning: Tensor %s already registered\n",NNNode::top_[0].c_str()); gparams_.bdims = bs->ndims; gparams_.tdims = ts.ndims; gparams_.batch_size = ts.dims[0]; gparams_.nOutput = ts.dims[1]; gparams_.iHeight = bs->dims[2]; gparams_.iWidth = bs->dims[3]; gparams_.oHeight = ts.dims[2]; gparams_.oWidth = ts.dims[3]; 
gparams_.data_type = dtype; gparams_.op = p->get_op(); gparams_.algType = p->get_algo_type(); gparams_.num_threads = e->get_num_threads(); #ifdef GETSTATS count_ = 0; #endif configure(p->get_compute_engine()); } void EltwiseNode::configure(int engine) { switch(engine) { case XSMM: impl = new EltwiseXSMM(&gparams_, engine); break; } } void EltwiseNode::forwardPropagate() { #ifdef DEBUG float* bot; float* top = (float*)(tenTopData_->getBuffer()); for(int i=0; igetBuffer()); printf("Executing FP %s: input %p, output %p\n",NNNode::nname_.c_str(), bot, top); } #endif for(int i=0; iset_bot_compute_engine(bot_cengine_[i]); impl->set_top_compute_engine(top_compute_engine_); impl->set_next_node_type(next_ntype_); impl->set_node_name(nname_); impl->forwardPropagate(tenBotData_, tenTopData_); #ifdef GETSTATS #ifdef USE_MLSL size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else size_t node_id = 0; #endif if(node_id==0 && count_ % STATFREQ == 0) { float* p, *pp, *ptr; int size; for(int i=0; igetBuffer(); pp = (float*)tenBotData_[i]->getPrivBuffer(); ptr = (pp == NULL) ? p : pp; Shape *bs = tenBot_[i]->getShape(); size = bs->dims[0] * bs->dims[1] * bs->dims[2] * bs->dims[3]; string s = nname_ + "_inp_" + to_string(i); MeanOfLayer((char*)s.c_str(), ptr, size); } p = (float*)tenTopData_->getBuffer(); pp = (float*)tenTopData_->getPrivBuffer(); ptr = (pp == NULL) ? 
p : pp; Shape *ts = tenTop_->getShape(); size = ts->dims[0] * ts->dims[1] * ts->dims[2] * ts->dims[3]; string s = nname_ + "_outp"; MeanOfLayer((char*)s.c_str(), ptr, size); } #endif } void EltwiseNode::backPropagate() { tenTopDiff_ = tenTop_->getBuf(DIFF); #ifdef DEBUG for(int i=0; igetBuffer(), tenBotDiff_[i]->getBuffer()); #endif impl->backPropagate(tenTopDiff_, tenBotDiff_); #ifdef GETSTATS #ifdef USE_MLSL size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else size_t node_id = 0; #endif if(node_id==0 && count_ % STATFREQ == 0) { float* p, *pp, *ptr; p = (float*)tenTopDiff_->getBuffer(); pp = (float*)tenTopDiff_->getPrivBuffer(); ptr = (pp == NULL) ? p : pp; Shape *ts = tenTop_->getShape(); int size = ts->dims[0] * ts->dims[1] * ts->dims[2] * ts->dims[3]; string s = nname_ + "_deloutp"; MeanOfLayer((char*)s.c_str(), ptr, size); for(int i=0; igetBuffer(); pp = (float*)tenBotDiff_[i]->getPrivBuffer(); ptr = (pp == NULL) ? p : pp; Shape *bs = tenBot_[i]->getShape(); size = bs->dims[0] * bs->dims[1] * bs->dims[2] * bs->dims[3]; string s = nname_ + "_delinp_" + to_string(i); MeanOfLayer((char*)s.c_str(), ptr, size); } count_++; } #endif } libxsmm-1.17/samples/deeplearning/gxm/src/EltwiseXSMM.cpp000066400000000000000000000142601415223013700234260ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #include #include #include "EltwiseXSMM.hpp" #define VLEN 16 void EltwiseXSMM::convert_NCHW_to_NCHWV(float *inp, int n, int c, int h, int w, float *outp) { __assume_aligned(inp,64); __assume_aligned(outp,64); int index=0; int cv = c/VLEN; for(int img=0; img < n; img++) for(int fm=0; fm < cv; fm++) for(int fh=0; fh < h; fh++) for(int fw=0; fw < w; fw++) for(int v=0; v& inpb, TensorBuf *outpb, int tid) { float *outp = (float*)outpb->getBuffer(); float *outpp = (float*)outpb->getPrivBuffer(); float *inp_r = (float*)inpb[0]->getBuffer(); float *inp_nr = (float*)inpb[1]->getBuffer(); int nImg = gp->batch_size; int nOfm = gp->nOutput; int rem = 0; int ifh = gp->iHeight; int ifw = gp->iWidth; int ofh = gp->oHeight; int ofw = gp->oWidth; bool needs_conversion = false; int threads = gp->num_threads; int op = gp->op; __assume_aligned(outp, 64); __assume_aligned(outpp, 64); if(top_compute_engine != engine) needs_conversion = true; if(needs_conversion) { if(outpp == NULL) outpp = (float*)libxsmm_aligned_malloc(nImg*nOfm*ofh*ofw*sizeof(float), 64); assert(outpp != NULL); outpb->setPrivBuffer(outpp); } float* out = needs_conversion ? 
outpp : outp; for(int b=0; bsetLayoutType(NCHW); } else outpb->setLayoutType(LIBXSMM_CUSTOM_LAYOUT); } #if 0 void EltwiseXSMM::backPropagate(TensorBuf *deloutpb, vector& delinpb, int tid) { #if !defined(USE_OPTBP) float *deloutp = (float*)deloutpb->getBuffer(); float *deloutpp = (float*)deloutpb->getPrivBuffer(); float *delinp_r = (float*)delinpb[0]->getBuffer(); float *delinp_nr = (float*)delinpb[1]->getBuffer(); #else float *deloutp = (float*)deloutpb->getGradBuffer(); float *deloutpp = (float*)deloutpb->getGradPrivBuffer(); float *delinp_r = (float*)delinpb[0]->getGradBuffer(); float *delinp_nr = (float*)delinpb[1]->getGradBuffer(); #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) printf("ELT delinp_r %p, delinp_nr %p\n",delinp_r, delinp_nr); #endif #endif int nImg = gp->batch_size; int nOfm = gp->nOutput; int nIfm = gp->nOutput; int ifh = gp->iHeight; int ifw = gp->iWidth; int ofh = gp->oHeight; int ofw = gp->oWidth; int op = gp->op; int rem = 0; int threads = gp->num_threads; __assume_aligned(deloutp, 64); __assume_aligned(deloutpp, 64); if(top_compute_engine != engine) { #if 0 #ifdef USE_MLSL if(MLSL::GetNodeId() == 0) #endif printf("%s converting output buffer in forward prop\n",nname.c_str()); #endif if(deloutpp == NULL) deloutpp = (float*)libxsmm_aligned_malloc(nImg*nOfm*ofh*ofw*sizeof(float), 64); assert(deloutpp != NULL); deloutpb->setPrivBuffer(deloutpp); convert_NCHW_to_NCHWV(deloutpp, nImg, nOfm, ofh, ofw, deloutp); } #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) { if((deloutp == delinp_r) || (deloutp == delinp_nr)) { printf("node delout %p and delin %p are equal!!\n",deloutp, delinp_r, delinp_nr); fflush(stdout); exit(1); } } #endif float *delout = deloutpp != NULL ? 
deloutpp : deloutp; for(int b=0; bsetLayoutType(LIBXSMM_CUSTOM_LAYOUT); } #endif #if 1 void EltwiseXSMM::backPropagate(TensorBuf *deloutpb, vector& delinpb, int tid) { float *deloutp = (float*)deloutpb->getBuffer(); int op = gp->op; switch(op) { case ELSUM: { for(int i=0; isetBuffer(deloutp); } break; case ELPROD: break; case ELMAX: break; } for(int b=0; bsetLayoutType(LIBXSMM_CUSTOM_LAYOUT); } #endif libxsmm-1.17/samples/deeplearning/gxm/src/Engine.cpp000066400000000000000000001721501415223013700225150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #include #include "assert.h" #include "proto/gxm.pb.h" #include "Node.hpp" #include "Engine.hpp" #include "Conv.hpp" #include "FullyConnected.hpp" #include "FusedBNorm.hpp" #include "FusedConvBN.hpp" #include "DummyData.hpp" #include "TypeList.hpp" #include "unistd.h" #include "limits.h" #define VLEN 16 using namespace std; using namespace gxm; int iter=0; bool compare_task_bins(Task* first, Task* second) { return (first->getMaxBin() < second->getMinBin()); } void MLEngine::create_schedule(int mode) { for(auto it=etg_[mode].begin(); it != etg_[mode].end(); it++) { Task* t = *it; vector tp = t->getBackDepTasks(); for(int i=0; i(tp[i]->getNode())->getNodeName(); if(tp[i]->getBasicTaskId() == BASIC_TASK_FORW) { int maxbin = tp[i]->getMaxBin(); if((maxbin == 0) || (maxbin > t->getMinBin()-1)) { tp[i]->setMinBin(t->getMaxBin() - 1); tp[i]->setMaxBin(t->getMaxBin() - 1); etg_[mode].push_back(tp[i]); #ifdef DEBUG printf("FP task %p (node %s), with bin %d pushed to etg_\n",tp[i], s.c_str(), tp[i]->getMaxBin()); #endif } } } } if(mode == TRAIN) { for(auto it=etg_[mode].begin(); it != etg_[mode].end(); it++) { Task* t = *it; vector tp = t->getForwDepTasks(); for(int i=0; i(tp[i]->getNode())->getNodeName(); if(tp[i]->getBasicTaskId() != BASIC_TASK_FORW) { int maxbin = tp[i]->getMaxBin(); if((maxbin == 0) || (maxbin < t->getMinBin()+1)) { tp[i]->setMinBin(t->getMaxBin() + 1); tp[i]->setMaxBin(t->getMaxBin() + 1); etg_[mode].push_back(tp[i]); #ifdef DEBUG if(tp[i]->getBasicTaskId() == BASIC_TASK_BACK) printf("BP task %p (node %s), with bin %d pushed to etg_\n",tp[i], s.c_str(), tp[i]->getMaxBin()); else if(tp[i]->getBasicTaskId() == BASIC_TASK_WGRAD) printf("WU task %p (node %s), with bin %d pushed to etg_\n",tp[i], s.c_str(), tp[i]->getMaxBin()); else if(tp[i]->getBasicTaskId() == BASIC_TASK_SOLVE) printf("SOLVE task %p (node %s), with bin %d pushed to etg_\n",tp[i], s.c_str(), tp[i]->getMaxBin()); 
#endif } } } } } } int MLEngine::find_in_nodeTypeList(string name) { for(int i=0; it; TensorBuf *tBuf; bool found = false; for(int index=0; indexgetNumDataBuffers(); index++) { tBuf = t->getBuf(index); if(tBuf->getBufferType() == buftype) { found = true; break; } } if(!found) continue; long long int bytes = tBuf->getBufferSize(); int dtype = tBuf->getDataType(); float *fp = (float*)(tBuf->getBuffer()); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; it; TensorBuf *tBuf; bool found=false; for(int index=0; indexgetNumDataBuffers(); index++) { tBuf = t->getBuf(index); if(tBuf->getBufferType() == buftype) { found = true; break; } } if(!found) continue; int tenType = t->getType(); string tn = t->getTensorName(); string n = checkpoint_dir_ + "/" + tn; if(buftype == HISTORY) n = n + "_history"; else if(buftype == DIFF) n = n + "_grad"; string nntype = dynamic_cast(t->getOwner())->getNodeType(); if(current_epoch_ == 30 || current_epoch_ == 60 || current_epoch_ == 80) { if(tenType == ACT) { n = checkpoint_dir_ + to_string(current_epoch_) + "/" + tn; if(tn.find("bn") != tn.npos) { if(nntype == "FusedBatchNorm") { FusedBNormNode* bn = dynamic_cast(t->getOwner()); bn->Checkpoint(tBuf, n, checkpoint_format_); } else if(nntype == "FusedConvBN") { FusedConvBNNode* fcbn = dynamic_cast(t->getOwner()); fcbn->Checkpoint(tBuf, n, checkpoint_format_); } } } } if((tenType == CONVWEIGHT) || (tenType == CONVBIAS)) { if(nntype == "Convolution") { ConvNode* cn = dynamic_cast(t->getOwner()); cn->Checkpoint(tBuf, n, checkpoint_format_); if(current_epoch_ == 30 || current_epoch_ == 60 || current_epoch_ == 80) { n = checkpoint_dir_ + to_string(current_epoch_) + "/" + tn; if(buftype == HISTORY) n = n + "_history"; else if(buftype == DIFF) n = n + "_diff"; cn->Checkpoint(tBuf, n, checkpoint_format_); } } else if(nntype == "FusedConvBN") { FusedConvBNNode* fcbn = dynamic_cast(t->getOwner()); fcbn->Checkpoint(tBuf, n, checkpoint_format_); if(current_epoch_ == 30 || current_epoch_ == 60 
|| current_epoch_ == 80) { n = checkpoint_dir_ + to_string(current_epoch_) + "/" + tn; if(buftype == HISTORY) n = n + "_history"; else if(buftype == DIFF) n = n + "_grad"; fcbn->Checkpoint(tBuf, n, checkpoint_format_); } } } else if((tenType == FCWEIGHT) || (tenType == FCBIAS)) { FCNode* fn = dynamic_cast(t->getOwner()); fn->Checkpoint(tBuf, n, checkpoint_format_); if(current_epoch_ == 30 || current_epoch_ == 60 || current_epoch_ == 80) { n = checkpoint_dir_ + to_string(current_epoch_) + "/" + tn; if(buftype == HISTORY) n = n + "_history"; else if(buftype == DIFF) n = n + "_grad"; fn->Checkpoint(tBuf, n, checkpoint_format_); } } else if((tenType == BNORMSCALE) || (tenType == BNORMSHIFT) || (tenType == BNORMMEAN) || (tenType == BNORMVAR)) { if(nntype == "FusedBatchNorm") { FusedBNormNode* bn = dynamic_cast(t->getOwner()); bn->Checkpoint(tBuf, n, checkpoint_format_); if(current_epoch_ == 30 || current_epoch_ == 60 || current_epoch_ == 80) { n = checkpoint_dir_ + to_string(current_epoch_) + "/" + tn; if(buftype == HISTORY) n = n + "_history"; else if(buftype == DIFF) n = n + "_grad"; bn->Checkpoint(tBuf, n, checkpoint_format_); } } else if(nntype == "FusedConvBN") { FusedConvBNNode* fcbn = dynamic_cast(t->getOwner()); fcbn->Checkpoint(tBuf, n, checkpoint_format_); if(current_epoch_ == 30 || current_epoch_ == 60 || current_epoch_ == 80) { n = checkpoint_dir_ + to_string(current_epoch_) + "/" + tn; if(buftype == HISTORY) n = n + "_history"; else if(buftype == DIFF) n = n + "_grad"; fcbn->Checkpoint(tBuf, n, checkpoint_format_); } } } } } void MLEngine::read_checkpoint_file(TensorBuf* tBuf, string filename, string format) { long long int bytes = tBuf->getBufferSize(); int dtype = tBuf->getDataType(); void* ptr; ptr = tBuf->getBuffer(); FILE* f; if(format == "binary") { f = fopen(filename.c_str(), "rb"); assert(f != NULL); size_t b = fread(ptr, 1, bytes, f); assert((long long int)b == bytes); } else { printf("Reading from %s\n",filename.c_str()); f = 
fopen(filename.c_str(), "r"); assert(f != NULL); if(dtype == DT_FLOAT) { float* p = (float*)ptr; for(int i=0; i < bytes/sizeof(float); i++) fscanf(f, "%f", &p[i]); } } fclose(f); if(data_type_ == BF16 && (filename.find("wt") != filename.npos)) if(filename.find("history") == filename.npos) convert_f32_bf16((float*)ptr, (libxsmm_bfloat16*)tBuf->getLPBuffer(), bytes/sizeof(float), 0); } void MLEngine::load_checkpoint(TensorList L, int buftype, string format) { TensorBuf* tBuf; for(Iter it=L.begin(); it != L.end(); it++) { Tensor* t = it->t; int tenType = t->getType(); if((tenType != CONVWEIGHT) && (tenType != CONVBIAS) && (tenType != FCWEIGHT) && (tenType != FCBIAS)) if((tenType != BNORMSCALE) && (tenType != BNORMSHIFT) && (tenType != BNORMMEAN) && (tenType != BNORMVAR)) continue; bool found = false; for(int index=0; indexgetNumDataBuffers(); index++) { tBuf = t->getBuf(index); if(tBuf->getBufferType() == buftype) { found = true; break; } } if(!found) continue; string n = checkpoint_dir_ + "/" + t->getTensorName(); if(buftype == HISTORY) n = n + "_history"; size_t pos; while((pos = n.find("/", 10)) != n.npos) n.replace(pos, 1, 1, '_'); read_checkpoint_file(tBuf, n, format); } } void MLEngine::canary_check(void* ptr, vector& cp, int nc) { if(ptr == NULL) { printf("FATAL: NULL pointer to buffer\n"); //exit(1); } int *p = (int*)ptr; for(int i=0; iGetParameterSet(0)->WaitGradientComm(); } } else if(tenType=="BIAS") { if(!bias_grad_comms_vec.empty()) { for(int i=0; iGetParameterSet(0)->WaitGradientComm(); bias_grad_comms_vec[i]->GetParameterSet(1)->WaitGradientComm(); bias_grad_comms_vec[i]->GetParameterSet(2)->WaitGradientComm(); bias_grad_comms_vec[i]->GetParameterSet(3)->WaitGradientComm(); } } } else if(tenType=="COMBO") { if(!combo_grad_comms_vec.empty()) { for(int i=0; iGetParameterSet(0)->WaitGradientComm(); combo_grad_comms_vec[i]->GetParameterSet(1)->WaitGradientComm(); combo_grad_comms_vec[i]->GetParameterSet(2)->WaitGradientComm(); 
combo_grad_comms_vec[i]->GetParameterSet(3)->WaitGradientComm(); combo_grad_comms_vec[i]->GetParameterSet(4)->WaitGradientComm(); } } } #endif } void MLEngine::run(int mode) { if(mode == TRAIN) { if(load_from_checkpoint_) { FILE *f = fopen("checkpoint", "r"); if(f != NULL) { fscanf(f, "%d %f %f\n",¤t_epoch_, &lr_, &scf_); fclose(f); } else printf("No checkpoint state file to read\n"); if(current_epoch_ != num_epochs_ - 1) current_epoch_++; load_checkpoint(wTList_, DATA, checkpoint_format_); load_checkpoint(wTList_, HISTORY, checkpoint_format_); load_checkpoint(biasTList_, DATA, checkpoint_format_); load_checkpoint(biasTList_, HISTORY, checkpoint_format_); load_checkpoint(statsTList_, DATA, checkpoint_format_); #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int ntps = num_threads_/NUM_NUMA_NODES; int n = tid/ntps; int w = total_weights_; int b = total_biases_; if(n != 0 && tid % ntps == 0) { float *wptr = (float*)weight_buf_[n]; #if 1 float *bptr = (float*)bias_buf_[n]; float *sptr = (float*)stats_buf_[n]; #endif #pragma omp simd for(int i=0; iBarrier(MLSL::GT_DATA); #endif // current_epoch_ is set in create() function or by checkpoint code above for(; current_epoch_ < num_epochs_; current_epoch_++) { // Tell data node that it should use training data exec_mode_ = TRAIN; if(global_node_id_ == 0) { printf("===========================================\n"); printf("TRAIN mode, epoch %d, training batches %d\n", current_epoch_, num_train_batches_); printf("===========================================\n"); } // Run training network for an epoch struct timeval tvs, tve, tvts, tvte, tvis, tvie; double fbtime, runtime = 0; for(; current_batch_invoke(); #ifdef TIMING gettimeofday(&tvte, NULL); double tasktime = (tvte.tv_sec*1e6 + tvte.tv_usec) - (tvts.tv_sec*1e6 + tvts.tv_usec); NNNode *nn = dynamic_cast((*it)->getNode()); if(global_node_id_ == 0) printf("Node %s (task %d) time = %f ms\n",nn->getNodeName().c_str(), (*it)->getBasicTaskId(), 
tasktime/1000); #endif } if(solver_->getGlobalFlag()) { #ifdef TIMING gettimeofday(&tvis, NULL); #endif #ifdef DUMP_WT if(global_node_id_ == 0) if(current_epoch_ == 30 || current_epoch_ == 60 || current_epoch_ == 80) if(current_batch_ == num_train_batches_-1) checkpoint(wTList_, DIFF); #endif #ifdef USE_MLSL waitForComms("WEIGHT"); waitForComms("BIAS"); waitForComms("COMBO"); #endif #ifdef MLSL data_parallelism->Barrier(MLSL::GT_DATA); #endif #if 0 solver_->applyUpdate((float**)weight_buf_, (float**)winc_buf_, wdiff_buf_, total_weights_, (float**)wt_lr_mult_, (float**)wt_decay_mult_, "WEIGHT"); #else solver_->applyUpdate((float**)weight_buf_, (float**)winc_buf_, wdiff_buf_, total_weights_, 1.0, 1.0, "WEIGHT"); #endif if(data_type_ == BF16) convert_f32_bf16((float**)weight_buf_, (libxsmm_bfloat16**)lpweight_buf_, total_weights_); #if 0 solver_->applyUpdate((float**)bias_buf_, (float**)biinc_buf_, bidiff_buf_, total_biases_, (float**)bias_lr_mult_, (float**)bias_decay_mult_, "BIAS"); #else #if 1 solver_->applyUpdate((float**)bias_buf_, (float**)biinc_buf_, bidiff_buf_, total_biases_, 1.0, 0.0, "BIAS"); #else solver_->applyUpdate((float*)bias_buf_, (float*)biinc_buf_, bidiff_buf_, total_biases_, 1.0, 0.0, "BIAS"); #endif #endif #ifdef TIMING gettimeofday(&tvie, NULL); double sgdtime = (tvie.tv_sec + tvie.tv_usec*1e-6) - (tvis.tv_sec + tvis.tv_usec*1e-6); printf("global sgd time: %f ms\n",sgdtime*1000); #endif } gettimeofday(&tve, NULL); fbtime = (tve.tv_sec + tve.tv_usec*1e-6) - (tvs.tv_sec + tvs.tv_usec*1e-6); if(global_node_id_ == 0 && current_batch_ % 100 == 0) printf("Fwd-Bwd time: %f ms\n",fbtime*1000); if ( current_batch_ > 1 ) runtime += fbtime; #ifdef CANARY_CHECK canary_check(input_buf_, input_can_ptr, ic); canary_check(fact_buf_, fact_can_ptr, fac); canary_check(bact_buf_, bact_can_ptr, bac); #endif } current_batch_ = 0; if ( num_train_batches_ > 1 ) { char hostname[HOST_NAME_MAX + 1]; gethostname(hostname, HOST_NAME_MAX + 1); printf("%s; Average Training 
time = %f seconds", hostname, runtime/((double)(num_train_batches_-2))); if(runtime > 0) { printf("; Average Training throughput = %f images/s\n", ((double)(batch_size_*(num_train_batches_-2)))/runtime); } else { printf("\n"); } } // Checkpoint weights and biases if(global_node_id_ == 0) { checkpoint(wTList_, DATA); checkpoint(wTList_, HISTORY); checkpoint(biasTList_, DATA); checkpoint(biasTList_, HISTORY); checkpoint(statsTList_, DATA); #ifdef DUMP_ACT_DATA if(current_epoch_ == 30 || current_epoch_ == 60 || current_epoch_ == 80) { checkpoint(outTList_, DATA); checkpoint(outTList_, DIFF); } #endif FILE* f = fopen("checkpoint", "w"); if(f != NULL) { fprintf(f, "%d %10g %10g\n",current_epoch_, lr_, scf_); fclose(f); } } #ifdef USE_MLSL data_parallelism->Barrier(MLSL::GT_DATA); #endif // Tell data node that it should use test data exec_mode_ = VAL; if(global_node_id_ == 0) { printf("===========================================\n"); printf("VAL mode, testing batches %d\n", num_test_batches_); printf("===========================================\n"); } // Run validation network at end of each epoch for(; current_batch_invoke(); } current_batch_ = 0; #ifdef CANARY_CHECK canary_check(input_buf_, input_can_ptr, ic); canary_check(fact_buf_, fact_can_ptr, fac); canary_check(weight_buf_, wt_can_ptr, wtc); canary_check(bias_buf_, bias_can_ptr, bic); #endif } #ifdef USE_MLSL MLSL::Environment::GetEnv().Free(input_buf_); MLSL::Environment::GetEnv().Free(fact_buf_); MLSL::Environment::GetEnv().Free(bact_buf_); #else libxsmm_free(input_buf_); libxsmm_free(fact_buf_); libxsmm_free(bact_buf_); #endif for(int n=0; ninvoke(); } } } void MLEngine::convert_f32_bf16(float* in, libxsmm_bfloat16* out, int len, int numa_node) { #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int ntps = num_threads_/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - numa_node*ntps; if(n == numa_node) { int jobs = (len % ntps == 0) ? 
len/ntps : len/ntps + 1; int tb = (ltid*jobs < len) ? ltid*jobs : len; int te = ((ltid+1)*jobs < len) ? (ltid+1)*jobs : len; for (int i = tb; i < te; i+=16 ) { __m512 vfp32 = gxm_fp32_to_bfp16_rne_adjustment_avx512f( _mm512_loadu_ps( in+i ) ); __m256i vbfp16 = gxm_fp32_to_bfp16_truncate_avx512f( vfp32 ); _mm256_storeu_si256( (__m256i*)(out+i), vbfp16 ); } } } } void MLEngine::convert_f32_bf16(float** in, libxsmm_bfloat16** out, int len) { #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int ntps = num_threads_/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - n*ntps; float *inp = in[n]; libxsmm_bfloat16 *outp = out[n]; int jobs = (len % ntps == 0) ? len/ntps : len/ntps + 1; int tb = (ltid*jobs < len) ? ltid*jobs : len; int te = ((ltid+1)*jobs < len) ? (ltid+1)*jobs : len; for (int i = tb; i < te; i+=16 ) { __m512 vfp32 = gxm_fp32_to_bfp16_rne_adjustment_avx512f(_mm512_loadu_ps(inp + i)); __m256i vbfp16 = gxm_fp32_to_bfp16_truncate_avx512f(vfp32); _mm256_storeu_si256( (__m256i*)(outp+i), vbfp16 ); } } } void MLEngine::convert_bf16_f32(libxsmm_bfloat16* in, float* out, int len) { int i; #ifdef _OPENMP #pragma omp parallel for private(i) #endif for ( i = 0; i < len; i+=16 ) { __m256i vbfp16 = _mm256_loadu_si256( (const __m256i*)(in+i) ); __m512 vfp32 = gxm_bfp16_to_fp32_avx512f( vbfp16 ); _mm512_storeu_ps( out+i, vfp32 ); } } void MLEngine::allocate_memory(string tenType, TensorList L, int buftype, vector& can_ptr, int* nc, long long int* bufsize) { bool ttp = false; //(tenType != "WEIGHT") & (tenType != "BIAS"); long long int s = ttp ? 
START_GUARD_BAND : 0; TensorBuf* tBuf; int num_canaries = 0; float* lrptr, *decptr; // Get total buffer size required for tensors of type buftype for(Iter it=L.begin(); it != L.end(); it++) { Tensor* t = it->t; bool found = false; for(int i=0; igetNumDataBuffers(); i++) { tBuf = t->getBuf(i); if(tBuf->getBufferType() == buftype) { found = true; break; } } if(!found) continue; long long int size = tBuf->getBufferSize(); if(size > 0) { if(global_node_id_ == 0) { printf("Tensor %s needs %lld bytes for buffer %d\n", t->getTensorName().c_str(), size, buftype); fflush(stdout); } s += size; if(ttp) s += END_GUARD_BAND; if(ttp) num_canaries++; } } if(tenType == "WEIGHT") total_weights_ = s/sizeof(float); else if(tenType == "BIAS" || tenType == "STATS") total_biases_ = s/sizeof(float); if(solver_->getGlobalFlag()) { if(tenType == "WEIGHT") { #ifdef BF16_MLSL if(buftype == DIFF) { if(data_type_ == FLOAT) total_weights_ = s/sizeof(float); else if(data_type_ == BF16) total_weights_ = s/sizeof(libxsmm_bfloat16); } else #endif total_weights_ = s/sizeof(float); int factor = num_threads_ * VLEN; int nwt = (total_weights_ + factor - 1)/factor; total_weights_ = nwt * factor; #ifdef BF16_MLSL if(buftype == DIFF) { if(data_type_ == FLOAT) s = total_weights_ * sizeof(float); else if(data_type_ == BF16) s = total_weights_ * sizeof(libxsmm_bfloat16); } else #endif s = total_weights_ * sizeof(float); } else if(tenType == "BIAS" || tenType == "STATS") { total_biases_ = s / sizeof(float); int factor = num_threads_ * VLEN; int nwt = (total_biases_ + factor - 1)/factor; total_biases_ = nwt * factor; s = total_biases_ * sizeof(float); } } // Number of guard bands in tensor; used for canary checking *nc = num_canaries; // Allocate memory #ifdef BF16_MLSL bool lp = (data_type_ == BF16) && (tenType=="WEIGHT") && (buftype == DATA); #else bool lp = (data_type_ == BF16) && (tenType=="WEIGHT"); #endif void *buf_; void **ptrptr, **lptrptr=NULL; #if 0 //def USE_MLSL s = ALIGN_SIZE(s, 2097152); #endif 
if(tenType=="INPUT") { #ifdef USE_MLSL buf_ = (void*)MLSL::Environment::GetEnv().Alloc(s, 2097152); #else buf_ = (void*)libxsmm_aligned_malloc(s, 2097152); #endif input_buf_ = buf_; } else if(tenType == "FACT") { #ifdef USE_MLSL buf_ = (void*)MLSL::Environment::GetEnv().Alloc(s, 2097152); #else buf_ = (void*)libxsmm_aligned_malloc(s, 2097152); #endif fact_buf_ = buf_; } else if(tenType == "WEIGHT") { if(buftype == DATA) { for(int n=0; ngetGlobalFlag()) { if(tenType == "WEIGHT" && buftype == DIFF) { for(int n=0; nt; bool found = false; for(int i=0; igetNumDataBuffers(); i++) { tBuf = t->getBuf(i); if(tBuf->getBufferType() == buftype) { found = true; break; } } if(!found) continue; // Don't process Split nodes further for forward activations string nntype = dynamic_cast(t->getOwner())->getNodeType(); if(nntype.find("Split") != nntype.npos && buftype == DATA) continue; // Scrub or initialize buffers appropriately bytes = tBuf->getBufferSize(); assert(ptr+bytes <= buf_+s); lpbytes = lp ? bytes/sizeof(libxsmm_bfloat16) : 0; #ifndef USE_NUMA if(t->getType() == INPUT || t->getType() == ACT) { if(bytes > 0) memset(ptr, 0, bytes); } #endif int dtype = tBuf->getDataType(); // Set each node's tensor buffer pointers to the appropritate location in the global buffer if(tenType == "WEIGHT" || tenType == "BIAS" || tenType == "STATS") { if(buftype == DATA || buftype == DIFF) { tBuf->setBufferPtr(ptrptr); tBuf->setOffset(offset); } tBuf->setBuffer(ptr); if(lp) { if(buftype == DATA) tBuf->setLPBuffer(lptr); else if(buftype == DIFF) tBuf->setLPBuffer(lgptr); tBuf->setLPBufferPtr(lptrptr); } } else tBuf->setBuffer(ptr); // If weight or bias tensor, call corresponding intialization function (for training only) if(!is_inference_only()) { int tType = t->getType(); if(tType == CONVWEIGHT) { if(nntype == "FusedConvBN") { FusedConvBNNode *fcbn = dynamic_cast(t->getOwner()); assert(bytes > 0); if(!load_from_checkpoint_) { fcbn->fillWeightBuffers(tBuf, buftype, bytes); #if 0 if(lp) 
convert_f32_bf16((float*)ptr, (libxsmm_bfloat16*)lptr, lpbytes/sizeof(libxsmm_bfloat16), 0); #endif } #if 0 if(solver_->getGlobalFlag()) if(buftype == DIFF) if(data_type_ == FLOAT) fcbn->fillWeightMultipliers(lrptr, decptr, bytes/sizeof(float)); else if(data_type_ == BF16) fcbn->fillWeightMultipliers(lrptr, decptr, bytes/sizeof(libxsmm_bfloat16)); #endif } else if(nntype == "Convolution") { ConvNode* cn = dynamic_cast(t->getOwner()); assert(bytes > 0); if(!load_from_checkpoint_) { cn->fillWeightBuffers(tBuf, buftype, bytes); #if 0 if(lp) convert_f32_bf16((float*)ptr, (libxsmm_bfloat16*)lptr, lpbytes/sizeof(libxsmm_bfloat16), 0); #endif } #if 0 if(solver_->getGlobalFlag()) if(buftype == DIFF) if(data_type_ == FLOAT) cn->fillWeightMultipliers(lrptr, decptr, bytes/sizeof(float)); else if(data_type_ == BF16) cn->fillWeightMultipliers(lrptr, decptr, bytes/sizeof(libxsmm_bfloat16)); #endif } } else if(tType == CONVBIAS) { ConvNode* cn = dynamic_cast(t->getOwner()); assert(bytes > 0); if(!load_from_checkpoint_) cn->fillBiasBuffers(tBuf, buftype, bytes); #if 0 if(solver_->getGlobalFlag()) if(buftype == DIFF) cn->fillBiasMultipliers(lrptr, decptr, bytes/sizeof(float)); #endif } else if(tType == FCWEIGHT) { FCNode* fn = dynamic_cast(t->getOwner()); assert(bytes > 0); if(!load_from_checkpoint_) { fn->fillWeightBuffers(tBuf, buftype, bytes); #if 0 if(lp) convert_f32_bf16((float*)ptr, (libxsmm_bfloat16*)lptr, lpbytes/sizeof(libxsmm_bfloat16), 0); #endif } #if 0 if(solver_->getGlobalFlag()) if(buftype == DIFF) if(data_type_ == FLOAT) fn->fillWeightMultipliers(lrptr, decptr, bytes/sizeof(float)); else if(data_type_ == BF16) fn->fillWeightMultipliers(lrptr, decptr, bytes/sizeof(libxsmm_bfloat16)); #endif } else if(tType == FCBIAS) { FCNode* fn = dynamic_cast(t->getOwner()); assert(bytes > 0); if(!load_from_checkpoint_) fn->fillBiasBuffers(tBuf, buftype, bytes); #if 0 if(solver_->getGlobalFlag()) if(buftype == DIFF) fn->fillBiasMultipliers(lrptr, decptr, bytes/sizeof(float)); 
#endif } else if((tType == BNORMSCALE) || (tType == BNORMSHIFT)) { if(nntype == "FusedConvBN") { FusedConvBNNode *fcbn = dynamic_cast(t->getOwner()); assert(bytes > 0); if(!load_from_checkpoint_) fcbn->fillBuffer(tBuf, buftype, bytes); #if 0 if(solver_->getGlobalFlag()) if(buftype == DIFF) fcbn->fillBiasMultipliers(lrptr, decptr, bytes/sizeof(float)); #endif } else if(nntype == "FusedBatchNorm") { FusedBNormNode* bn = dynamic_cast(t->getOwner()); assert(bytes > 0); if(!load_from_checkpoint_) bn->fillBuffer(tBuf, buftype, bytes); #if 0 if(solver_->getGlobalFlag()) if(buftype == DIFF) bn->fillBiasMultipliers(lrptr, decptr, bytes/sizeof(float)); #endif } } else if((tType == BNORMMEAN) || (tType == BNORMVAR)) { if(nntype == "FusedConvBN") { FusedConvBNNode *fcbn = dynamic_cast(t->getOwner()); assert(bytes > 0); if(!load_from_checkpoint_) fcbn->fillBuffer(tBuf, buftype, bytes); } else if(nntype == "FusedBatchNorm") { FusedBNormNode* bn = dynamic_cast(t->getOwner()); assert(bytes > 0); if(!load_from_checkpoint_) bn->fillBuffer(tBuf, buftype, bytes); } } } if(bytes > 0) { ptr += bytes; if(lp) if(buftype == DATA) lptr += lpbytes; #ifndef BF16_MLSL else if(buftype == DIFF) lgptr += lpbytes; #endif #ifdef BF16_MLSL if(tenType == "WEIGHT" && buftype == DATA) offset += bytes/sizeof(float); else if(tenType == "WEIGHT" && buftype == DIFF) { if(data_type_ == FLOAT) offset += bytes/sizeof(float); else if(data_type_ == BF16) offset += bytes/sizeof(libxsmm_bfloat16); } #else if(tenType == "WEIGHT") offset += bytes/sizeof(float); #endif else if((tenType == "BIAS" && (buftype == DATA || buftype == DIFF)) || tenType == "STATS") offset += bytes/sizeof(float); #if 0 if(solver_->getGlobalFlag()) { if(tenType == "WEIGHT" && buftype == DIFF) { if(data_type_ == FLOAT) { lrptr += bytes/sizeof(float); decptr += bytes/sizeof(float); } else if(data_type_ == BF16) { lrptr += bytes/sizeof(libxsmm_bfloat16); decptr += bytes/sizeof(libxsmm_bfloat16); } } else if(tenType == "BIAS" && buftype == DIFF) 
{ lrptr += bytes/sizeof(float); decptr += bytes/sizeof(float); } } #endif assert(ptr <= buf_ + s); // For canary checking if(ttp) { memset(ptr, CANARY, END_GUARD_BAND); can_ptr.push_back(bytes); assert(can_ptr.size() <= num_canaries); } if(ttp) ptr += END_GUARD_BAND; } assert(ptr <= buf_ + s); #if 0 printf("ptr @ %p\n",ptr); #endif } if(tenType=="WEIGHT" && buftype==DATA) { #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int ntps = num_threads_/NUM_NUMA_NODES; int n = tid/ntps; int w = total_weights_; if(n != 0 && tid % ntps == 0) { float *wtptr = (float*)weight_buf_[n]; #pragma omp simd for(int i=0; iCopyFrom(p); ps->clear_node(); vector< pair > top_names; for(int i=0; i top_as_bot; for(int i=0; i < top_names.size(); i++) { pair tn = top_names[i]; for(int j=0; j < p.node_size(); j++) { const NodeParameter& np = p.node(j); string nn = p.node(j).name(); if(nn.compare(tn.second) == 0) continue; for(int k=0; k < np.bottom_size(); k++) { std::string t = tn.first; if(t.compare(p.node(j).bottom(k)) == 0) top_as_bot.insert(make_pair(t, p.node(j))); } } } std::multimap old_bottom; std::multimap new_bottom; for(int i=0; iadd_node(); np->CopyFrom(p.node(i)); string onn = np->name(); for(int j=0; jtop_size(); j++) { string t = np->top(j); int split_count = top_as_bot.count(t); if(split_count > 1) { NodeParameter *snp = ps->add_node(); snp->Clear(); snp->add_bottom(t); string snn = t + "_" + onn + "_" + std::to_string(j) + "_split"; snp->set_name(snn); snp->set_type("Split"); if(t.compare("label") == 0) snp->set_propagate_down(false); std::multimap::iterator it; int k = 0; for(it=top_as_bot.equal_range(t).first; it != top_as_bot.equal_range(t).second; it++) { NodeParameter onp = (*it).second; string nn = onp.name(); string stn = t + "_" + nn + "_" + std::to_string(j) + "_split_" + std::to_string(k); snp->add_top(stn); k++; for(int l=0; l::iterator it1; std::multimap::iterator it2; for(int i=0; inode_size(); i++) { NodeParameter* mn = 
ps->mutable_node(i); if(mn->type().compare("Split") == 0) continue; for(int j=0; jbottom_size(); j++) { string t = mn->bottom(j); it1 = old_bottom.find(t); if(it1 == old_bottom.end()) continue; for(it1=old_bottom.equal_range(t).first; it1 != old_bottom.equal_range(t).second; it1++) if(mn->name() == (*it1).second) break; assert(it1 != old_bottom.end()); string s = (*it1).second; for(it2=new_bottom.equal_range(s).first; it2 != new_bottom.equal_range(s).second; it2++) { string v = (*it2).second; if(v.find(mn->bottom(j)) != v.npos) mn->set_bottom(j, v); } } } } void MLEngine::create(int mode, string ntgConfig, string solverConfig) { bool parsed = parseMLConfig(ntgConfig, &ntgparam_); if(!parsed) exit(-1); if(!solverConfig.empty()) { parsed = parseSolverConfig(solverConfig, &sparam_); if(!parsed) exit(-1); num_epochs_ = sparam_.max_epochs(); current_epoch_ = 0; current_batch_ = 0; load_from_checkpoint_ = sparam_.load_checkpoint(); checkpoint_dir_ = sparam_.checkpoint_dir(); checkpoint_format_ = sparam_.checkpoint_format(); data_type_ = sparam_.data_type(); } #ifdef _OPENMP num_threads_ = omp_get_max_threads(); #else num_threads_ = 1; #endif printf("Using %d threads\n",num_threads_); #ifdef USE_MLSL global_node_id_ = MLSL::Environment::GetEnv().GetProcessIdx(); num_machines_ = MLSL::Environment::GetEnv().GetProcessCount(); data_parallelism = NULL; if(mode == TRAIN || mode == VAL) session_ = MLSL::Environment::GetEnv().CreateSession(MLSL::PT_TRAIN); else session_ = MLSL::Environment::GetEnv().CreateSession(MLSL::PT_TEST); #else global_node_id_ = 0; num_machines_ = 1; #endif // if no training mode in config, then set inferenceOnly_ to true inferenceOnly_ = (mode == TEST); // Initialize solver node int ni = find_in_nodeTypeList("Solver"); solverParams_ = parseSolverParams(&sparam_); solver_ = new SolverNode(solverParams_, this); /*************************************************************************************/ /*** Create a global tensor to hold scratch memory needed 
by Conv layers (LIBXSMM) ***/ /*************************************************************************************/ tenScratch_ = new Tensor("scratch"); tenScratchBuf_ = tenScratch_->getBuf(DATA); tenScratchBuf_->setBufferPtr(scratch); NTGParameter split_ntgparam; insertSplitNodes(ntgparam_, &split_ntgparam); if(global_node_id_ == 0) split_ntgparam.PrintDebugString(); int numNodes = split_ntgparam.node_size(); for(int i=0; i(ntg_[0]); assert(dnode != NULL); string first = dnode->getNodeType(); #ifdef DEBUG printf("first node type %s\n",first.c_str()); #endif assert(first.find("Data") != first.npos); // Create the neural network graph for training or testing mode dnode->createNNGraph(mode); // Forward Pass Binning. // Look for tasks attached to nodes with no successors. Add them to the Execution Task Graph (etg) first. for(int i=numNodes-1; i>0; i--) { NNNode *nn = dynamic_cast(ntg_[i]); Task* t = nn->getBasicTask(BASIC_TASK_FORW); if(nn->getNumNextNodes() == 0) { etg_[mode].push_back(t); #ifndef NDEBUG printf("FP task %p (node %s), bin %d pushed to etg_\n",t, nn->getNodeName().c_str(), t->getMaxBin()); #endif } } // Assign bins to tasks based on their dependencies. Tasks with lower bin number must // execute before those with higher bin number. 
Tasks with same bin number can execute in parallel // Ensure no duplicate tasks in etg create_schedule(mode); optimize_schedule(mode); if(mode == TRAIN) { for(auto it = etg_[mode].begin(); it != etg_[mode].end(); it++) { Task *t = *it; if(t->getBasicTaskId() == BASIC_TASK_FORW) etg_[VAL].push_back(t); else break; } } #ifdef DEBUG for(auto it=etg_[mode].begin(); it != etg_[mode].end(); it++) { Task* t = (*it); string s = dynamic_cast(t->getNode())->getNodeName(); if(t->getBasicTaskId() == BASIC_TASK_FORW) printf("FP Task %p in node %s at bin %d\n",t, s.c_str(), t->getMaxBin()); else if(t->getBasicTaskId() == BASIC_TASK_BACK) printf("BP Task %p in node %s at bin %d\n",t, s.c_str(), t->getMaxBin()); else if(t->getBasicTaskId() == BASIC_TASK_WGRAD) printf("WG Task %p in node %s at bin %d\n",t, s.c_str(), t->getMaxBin()); else printf("SOLVER Task %p in node %s at bin %d\n",t, s.c_str(), t->getMaxBin()); } #endif if(mode == TRAIN) printf("Training schedule has %u tasks\n",(unsigned int)etg_[mode].size()); else printf("Testing schedule has %u tasks\n",(unsigned int)etg_[mode].size()); /*** Allocate memory and set pointers for INPUT and LABEL buffers ***/ /**********************************************************************/ long long int total_input_size; long long int max_fwd_buffer_size=0; allocate_memory("INPUT", inTList_, DATA, input_can_ptr, &ic, &total_input_size); if(global_node_id_ == 0) printf("Total input memory allocated %lld bytes\n", total_input_size); /**********************************************************************/ /*** Allocate memory and set pointers for FORWARD ACTIVATION buffer ***/ /**********************************************************************/ long long int total_fact_size; allocate_memory("FACT", outTList_, DATA, fact_can_ptr, &fac, &total_fact_size); if(global_node_id_ == 0) printf("Total forward activation memory allocated %lld bytes\n", total_fact_size); /***********************************************************/ /*** Allocate 
memory and set pointers for WEIGHTS buffer ***/ /***********************************************************/ long long int total_weight_size; allocate_memory("WEIGHT", wTList_, DATA, wt_can_ptr, &wtc, &total_weight_size); if(global_node_id_ == 0) printf("Total weights memory allocated %lld bytes\n", total_weight_size); /***********************************************************/ /*** Allocate memory and set pointers for BIASES buffer ***/ /***********************************************************/ long long int total_bias_size; allocate_memory("BIAS", biasTList_, DATA, bias_can_ptr, &bic, &total_bias_size); if(global_node_id_ == 0) printf("Total bias memory allocated %lld bytes\n", total_bias_size); /***********************************************************/ /*** Allocate memory and set pointers for STATS buffer ***/ /***********************************************************/ long long int total_stats_size; allocate_memory("STATS", statsTList_, DATA, stats_can_ptr, &sic, &total_stats_size); if(global_node_id_ == 0) printf("Total stats memory allocated %lld bytes\n", total_stats_size); // Required only for training long long int total_bp_size; if(!inferenceOnly_) { /***********************************************************************/ /*** Allocate memory and set pointers for BACKWARD ACTIVATION buffer ***/ /***********************************************************************/ #if !defined(USE_OPTBP_ALLOC) long long int total_bact_size; allocate_memory("BACT", outTList_, DIFF, bact_can_ptr, &bac, &total_bact_size); if(global_node_id_ == 0) printf("Total backward activation memory allocated %lld bytes\n", total_bact_size); #else long long int total_bact_size = NDIFFS * max_fwd_buffer_size; allocate_gradient_tensor(outTList_, DIFF, NDIFFS, max_fwd_buffer_size); if(global_node_id_ == 0) printf("Total backward activation memory allocated %lld bytes\n", total_bact_size); #endif /********************************************************************/ /*** 
Allocate memory and set pointers for WEIGHT GRADIENTS buffer ***/ /********************************************************************/ long long int total_wdiff_size; allocate_memory("WEIGHT", wTList_, DIFF, wdiff_can_ptr, &wdc, &total_wdiff_size); if(global_node_id_ == 0) printf("Total weight gradient memory allocated %lld bytes\n", total_wdiff_size); /*********************************************************************/ /*** Allocate memory and set pointers for WEIGHT INCREMENTS buffer ***/ /*********************************************************************/ long long int total_winc_size; allocate_memory("WEIGHT", wTList_, HISTORY, winc_can_ptr, &wic, &total_winc_size); if(global_node_id_ == 0) printf("Total weight increment memory allocated %lld bytes\n", total_winc_size); /********************************************************************/ /*** Allocate memory and set pointers for BIAS GRADIENTS buffer ***/ /********************************************************************/ long long int total_bidiff_size; allocate_memory("BIAS", biasTList_, DIFF, bidiff_can_ptr, &bidc, &total_bidiff_size); if(global_node_id_ == 0) printf("Total bias gradient memory allocated %lld bytes\n", total_bidiff_size); /*********************************************************************/ /*** Allocate memory and set pointers for BIAS INCREMENTS buffer ***/ /*********************************************************************/ long long int total_biinc_size; allocate_memory("BIAS", biasTList_, HISTORY, biinc_can_ptr, &biic, &total_biinc_size); if(global_node_id_ == 0) printf("Total bias increment memory allocated %lld bytes\n", total_biinc_size); total_bp_size = total_bact_size + total_wdiff_size + total_winc_size + total_bidiff_size + total_biinc_size; } long long int total_memory = total_input_size + total_fact_size + total_weight_size + total_bias_size + total_bp_size; if(global_node_id_ == 0) printf("Total tensor memory = %lld\n",total_memory); } 
libxsmm-1.17/samples/deeplearning/gxm/src/FCXSMM.cpp000066400000000000000000000355121415223013700223050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #include "FCXSMM.hpp" extern int iter; FCXSMM::FCXSMM(FCImplParams *gp, int engine) : FCImpl(gp, engine) { /* setup LIBXSMM handle */ fullyconnected_desc.N = gp->batch_size/gp->num_numa_nodes; fullyconnected_desc.C = gp->nInput; fullyconnected_desc.K = gp->nOutput; fullyconnected_desc.threads = gp->num_threads/gp->num_numa_nodes; if(gp->in_data_type == DT_FLOAT) fullyconnected_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; else if(gp->in_data_type == DT_BF16) fullyconnected_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; if(gp->out_data_type == DT_FLOAT) fullyconnected_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; else if(gp->out_data_type == DT_BF16) fullyconnected_desc.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; fullyconnected_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fullyconnected_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE; for(int i=0; inum_numa_nodes; i++) { libxsmm_handle[i] = libxsmm_dnn_create_fullyconnected( fullyconnected_desc, &status ); CHKERR_LIBXSMM_DNN( status ); } } void FCXSMM::forwardPropagate(TensorBuf *inpb, TensorBuf* weightpb, TensorBuf* hweightpb, TensorBuf* biaspb, TensorBuf *outpb, int tid) { #ifdef RETURNALL return; #endif int nIFM = gp->nInput; int kh = gp->kh; int kw = 
gp->kw; assert(top_compute_engine != -1); assert(bot_compute_engine != -1); void *input[NUM_NUMA_NODES]; input[0] = inpb->getBuffer(); int imoff = fullyconnected_desc.N * fullyconnected_desc.C; if(gp->in_data_type == DT_FLOAT) imoff = imoff*sizeof(float); else if(gp->in_data_type == DT_BF16) imoff = imoff*sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) input[n] = input[n-1] + imoff; void *weight[NUM_NUMA_NODES], *f32_weight[NUM_NUMA_NODES]; void *wt_prv_ptr; void **lptrptr = weightpb->getLPBufferPtr(); void **ptrptr = weightpb->getBufferPtr(); int offset = weightpb->getOffset(); if(lptrptr != NULL) { for(int n=0; nnum_numa_nodes; n++) weight[n] = lptrptr[n] + offset*sizeof(libxsmm_bfloat16); } else for(int n=0; nnum_numa_nodes; n++) weight[n] = ptrptr[n] + offset*sizeof(float); void *hwt_ptr; if(hweightpb != NULL) hwt_ptr = hweightpb->getBuffer(); else hwt_ptr = NULL; void *bias; if(gp->bias_term) bias = biaspb->getBuffer(); void *output[NUM_NUMA_NODES]; output[0] = outpb->getBuffer(); imoff = fullyconnected_desc.N * fullyconnected_desc.K; if(gp->out_data_type == DT_FLOAT) imoff = imoff*sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff*sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) output[n] = output[n-1] + imoff; void **sptrptr = scratchp->getBufferPtr(); /* setup LIBXSMM buffers */ for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_input[n] == NULL && libxsmm_output[n] == NULL && libxsmm_filter[n] == NULL) { libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout(libxsmm_handle[n], LIBXSMM_DNN_REGULAR_INPUT, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_input[n] = libxsmm_dnn_link_tensor( libxsmm_layout, input[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle[n], libxsmm_input[n], LIBXSMM_DNN_REGULAR_INPUT)); libxsmm_layout = 
libxsmm_dnn_fullyconnected_create_tensor_datalayout(libxsmm_handle[n], LIBXSMM_DNN_REGULAR_FILTER, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_filter[n] = libxsmm_dnn_link_tensor( libxsmm_layout, weight[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle[n], libxsmm_filter[n], LIBXSMM_DNN_REGULAR_FILTER ) ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout(libxsmm_handle[n], LIBXSMM_DNN_REGULAR_OUTPUT, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_output[n] = libxsmm_dnn_link_tensor( libxsmm_layout, output[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle[n], libxsmm_output[n], LIBXSMM_DNN_REGULAR_OUTPUT ) ); } } if(sptrptr == NULL) { sptrptr = (void**)libxsmm_aligned_malloc(gp->num_numa_nodes*sizeof(void*), 2097152); scratchp->setBufferPtr(sptrptr); } int max_size = 0; for(int n=0; nnum_numa_nodes; n++) { if(sptrptr[n] == NULL) { int mysize = libxsmm_dnn_fullyconnected_get_scratch_size( libxsmm_handle[n], &status ); CHKERR_LIBXSMM_DNN( status ); sptrptr[n] = libxsmm_aligned_scratch( mysize, 2097152 ); max_size = mysize; #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %d bytes for scratch @ %p\n",nname.c_str(), mysize, sptrptr[n]); } else { int ssize = scratchp->getBufferSize(); int mysize = libxsmm_dnn_fullyconnected_get_scratch_size( libxsmm_handle[n], &status ); CHKERR_LIBXSMM_DNN( status ); if(ssize < mysize) { libxsmm_free(sptrptr[n]); sptrptr[n] = (void*)libxsmm_aligned_malloc(mysize, 2097152); max_size = mysize; #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %d bytes for scratch @ %p, prev size was %d bytes\n",nname.c_str(), mysize, sptrptr[n], ssize); } max_size = ssize; } } 
scratchp->setBufferSize(max_size); if(prev_scratch_size == 0) prev_scratch_size = scratchp->getBufferSize(); if(!updated_scratch_fwd || prev_scratch_size != scratchp->getBufferSize()) { for(int n=0; nnum_numa_nodes; n++) CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_scratch( libxsmm_handle[n], sptrptr[n] ) ); updated_scratch_fwd = true; prev_scratch_size = scratchp->getBufferSize(); } #if defined(_OPENMP) #pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_FWD, n*ntps, tid ) ); } } void FCXSMM::backPropagate(TensorBuf *deloutpb, TensorBuf *weightpb, TensorBuf *delinpb, int tid) { assert(top_compute_engine != -1); assert(bot_compute_engine != -1); void *deloutput[NUM_NUMA_NODES]; void *delinput[NUM_NUMA_NODES]; deloutput[0] = deloutpb->getBuffer(); delinput[0] = delinpb->getBuffer(); int imoff = fullyconnected_desc.N * fullyconnected_desc.K; if(gp->out_data_type == DT_FLOAT) imoff = imoff*sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff*sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) deloutput[n] = deloutput[n-1] + imoff; imoff = fullyconnected_desc.N * fullyconnected_desc.C; if(gp->in_data_type == DT_FLOAT) imoff = imoff*sizeof(float); else if(gp->in_data_type == DT_BF16) imoff = imoff*sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) delinput[n] = delinput[n-1] + imoff; void **sptrptr = scratchp->getBufferPtr(); if(!updated_scratch_bwd) { for(int n=0; nnum_numa_nodes; n++) CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_scratch( libxsmm_handle[n], sptrptr[n] ) ); updated_scratch_bwd = true; } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_deloutput[n] == NULL && libxsmm_delinput[n] == NULL) { libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle[n], 
LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput[n] = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle[n], libxsmm_deloutput[n], LIBXSMM_DNN_GRADIENT_OUTPUT ) ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle[n], LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delinput[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle[n], libxsmm_delinput[n], LIBXSMM_DNN_GRADIENT_INPUT ) ); } } #if defined(_OPENMP) #pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_BWD, n*ntps, tid ) ); } } void FCXSMM::weightUpdate(TensorBuf *deloutpb, TensorBuf *inpb, TensorBuf *delweightpb, TensorBuf *delbiaspb, int tid) { int ofm = fullyconnected_desc.K; int ifm = fullyconnected_desc.C; int kh = 1; int kw = 1; assert(top_compute_engine != -1); assert(bot_compute_engine != -1); void *dwt_ptr[NUM_NUMA_NODES]; void **ptrptr; if(gp->in_data_type == DT_BF16) { #ifdef BF16_MLSL ptrptr = delweightpb->getBufferPtr(); #else ptrptr = delweightpb->getLPBufferPtr(); #endif } else ptrptr = delweightpb->getBufferPtr(); int offset = delweightpb->getOffset(); if(gp->in_data_type == DT_FLOAT) offset = offset*sizeof(float); else if(gp->in_data_type == DT_BF16) offset = offset*sizeof(libxsmm_bfloat16); for(int n=0; nnum_numa_nodes; n++) dwt_ptr[n] = ptrptr[n] + offset; void **sptrptr = scratchp->getBufferPtr(); 
if(!updated_scratch_upd) { for(int n=0; nnum_numa_nodes; n++) CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_scratch( libxsmm_handle[n], sptrptr[n] ) ); updated_scratch_upd = true; } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_delfilter[n] == NULL) { libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle[n], LIBXSMM_DNN_GRADIENT_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delfilter[n] = libxsmm_dnn_link_tensor( libxsmm_layout, dwt_ptr[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle[n], libxsmm_delfilter[n], LIBXSMM_DNN_GRADIENT_FILTER ) ); } } #if defined(_OPENMP) #pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_UPD, n*ntps, tid ) ); #ifdef USE_MLSL #pragma omp barrier if(gp->num_numa_nodes > 1) { if(gp->in_data_type == DT_FLOAT) { int jobs = ofm * ifm * kh * kw; int jn = jobs/gp->num_numa_nodes; int jnv = jn/VLEN; int jpt = (jnv % ntps == 0) ? (jnv/ntps)*VLEN : ((jnv/ntps)+1)*VLEN; int ltid = tid - n*ntps; int tb = (ltid * jpt < jn) ? ltid*jpt : jn; int te = ((ltid+1)*jpt < jn) ? (ltid+1)*jpt : jn; float *wgp = (float*)dwt_ptr[n]+n*jn; for(int nn=0; nnnum_numa_nodes; nn++) { if(n == nn) continue; float *rgp = (float*)dwt_ptr[nn]+n*jn; #pragma omp simd for(int i=tb; inum_numa_nodes; nn++) { float *wgp = (float*)dwt_ptr[n]+nn*jn; float *rgp = (float*)dwt_ptr[nn]+nn*jn; #pragma vector nontemporal #pragma omp simd for(int i=tb; iin_data_type == DT_BF16) { if(n == 0) { int jobs = ofm * ifm * kh * kw; assert(jobs % VLEN == 0); int jv = jobs/VLEN; int rem = jv % ntps; int jpt = (rem == 0) ? (jv/ntps)*VLEN : ((jv-rem)/ntps)*VLEN; int tb = (tid * jpt < jobs) ? 
tid*jpt : jobs; int te = ((tid+1)*jpt < jobs) ? (tid+1)*jpt : jobs; libxsmm_bfloat16 *my_ptr = (libxsmm_bfloat16*)dwt_ptr[n]; for(int nn=1; nnnum_numa_nodes; nn++) { libxsmm_bfloat16 *rem_ptr = (libxsmm_bfloat16*)dwt_ptr[nn]; for(int i=tb; i 0) { for(int i=ntps*jpt; i #include "FullyConnected.hpp" #include "fillers.hpp" #ifdef USE_MLSL #include "mpi.h" #endif FCNode::FCNode(FCParams *p, MLEngine* e) : NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); bottom_ = p->get_bottom_names(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = true; tenTop_ = new Tensor(top_[0]); assert(tenTop_ != NULL); tenTop_->setOwner(this); tenTop_->setType(ACT); tenTopData_ = tenTop_->getBuf(DATA); tenTopData_->setBufferType(DATA); #ifdef DEBUG printf("bottom name %s\n",bottom_[0].c_str()); #endif if((bottom_[0]).compare("data") == 0) tenBot_ = e->get_tensor(bottom_[0], INPUT); else tenBot_ = e->get_tensor(bottom_[0], ACT); assert(tenBot_ != NULL); NNNode *pnn = (NNNode*)tenBot_->getOwner(); setPrevNode(pnn); mode_ = pnn->getMode(); int cengine = p->get_compute_engine(); pnn->set_top_compute_engine(cengine); bot_cengine_ = pnn->get_bot_compute_engine(); tenBotData_ = tenBot_->getBuf(DATA); in_dtype = tenBotData_->getDataType(); //Output tensor data type = input tensor data type out_dtype = p->get_data_type(); tenTopData_->setDataType(out_dtype); tenTopData_->setBufferType(DATA); // Get input tensor shape (bottom) Shape* bs = tenBot_->getShape(); assert(bs->ndims <= MAX_DIMS); shape_setzero(&ts_); ts_.ndims = 4; // Number of dimensions ts_.dims[0] = bs->dims[0]; // Minibatch size ts_.dims[1] = p->get_output(); // Num output feature maps ts_.dims[2] = 1; ts_.dims[3] = 1; tenTop_->setShape(&ts_); long long int tsize = 1; for(int i=0; isetBufferSize(tsize); gparams_.num_numa_nodes = NUM_NUMA_NODES; // Create weight tensor weight_ = top_[0] + "_fp_wt"; tenWeight_ = new Tensor(weight_); assert(tenWeight_ != NULL); 
tenWeight_->setOwner(this); tenWeight_->setType(FCWEIGHT); shape_setzero(&ws_); ws_.ndims = ts_.ndims; // Number of dimensions if(p->get_transpose_flag()) { ws_.dims[0] = bs->dims[1] * bs->dims[2] * bs->dims[3]; // Num input feature maps (from bottom tensor) ws_.dims[1] = ts_.dims[1]; // Num output feature maps (from top tensor) ws_.dims[2] = 1; ws_.dims[3] = 1; } else { ws_.dims[1] = bs->dims[1] * bs->dims[2] * bs->dims[3]; // Num input feature maps (from bottom tensor) ws_.dims[0] = ts_.dims[1]; // Num output feature maps (from top tensor) ws_.dims[2] = 1; ws_.dims[3] = 1; } tenWeight_->setShape(&ws_); tenWeight_->setBufDataType(DATA, DT_FLOAT); tenWeightData_ = tenWeight_->getBuf(DATA); tenWeightData_->setBufferType(DATA); long long int wsize = 1; for(int i=0; isetBufferSize(wsize); wfiller_type_ = p->get_weight_filler_type(); std_ = p->get_std(); lr_mult_ = p->get_lr_mult(); decay_mult_ = p->get_decay_mult(); // Create bias tensor Shape bis; if(p->get_bias_term()) { bias_ = top_[0] + "_fp_bias"; tenBias_ = new Tensor(bias_); assert(tenBias_ != NULL); tenBias_->setOwner(this); tenBias_->setType(FCBIAS); shape_setzero(&bis); bis.ndims = 1; bis.dims[0] = ts_.dims[1]; tenBias_->setShape(&bis); tenBias_->setBufDataType(DATA, DT_FLOAT); tenBiasData_ = tenBias_->getBuf(DATA); tenBiasData_->setBufferType(DATA); long long int bisize = bis.dims[0]; bisize = bisize*sizeof(float); tenBiasData_->setBufferSize(bisize); bfiller_type_ = p->get_bias_filler_type(); value_ = p->get_value(); } if(!e->is_inference_only()) { if(bp_flag_) { tenBotDiff_ = tenBot_->addBuf(); tenBotDiff_->setDataType(in_dtype); tenBotDiff_->setBufferType(DIFF); long long int bsize = 1; for(int i=0; indims; i++) bsize = bsize*bs->dims[i]; if(in_dtype == DT_FLOAT) bsize = bsize*sizeof(float); else if(in_dtype == DT_BF16) bsize = bsize*sizeof(libxsmm_bfloat16); // Set the size of the input-gradient buffer tenBotDiff_->setBufferSize(bsize); } if(has_weights_) { tenWeightDiff_ = tenWeight_->addBuf(); 
if(in_dtype == DT_BF16 || out_dtype == DT_BF16) { tenWeightDiff_->setDataType(DT_BF16); int welem = ws_.dims[0]*ws_.dims[1]; #ifdef BF16_MLSL tenWeightDiff_->setBufferSize(welem*sizeof(libxsmm_bfloat16)); #else tenWeightDiff_->setBufferSize(welem*sizeof(float)); #endif } else { tenWeightDiff_->setDataType(DT_FLOAT); tenWeightDiff_->setBufferSize(wsize); } tenWeightDiff_->setBufferType(DIFF); tenWeightInc_ = tenWeight_->addBuf(); tenWeightInc_->setDataType(DT_FLOAT); tenWeightInc_->setBufferType(HISTORY); // Set the size of weight-increment buffer tenWeightInc_->setBufferSize(wsize); if(p->get_bias_term()) { tenBiasDiff_ = tenBias_->addBuf(); // DIFF type and index tenBiasDiff_->setDataType(DT_FLOAT); tenBiasDiff_->setBufferType(DIFF); tenBiasInc_ = tenBias_->addBuf(); // SHARED type and index tenBiasInc_->setDataType(DT_FLOAT); tenBiasInc_->setBufferType(HISTORY); // Set the size of the weight-gradient buffer and the weight-increment buffer long long int bisize = bis.dims[0]; bisize = bisize*sizeof(float); tenBiasDiff_->setBufferSize(bisize); tenBiasInc_->setBufferSize(bisize); } } } else { tenBotDiff_ = NULL; tenWeightDiff_ = NULL; tenWeightInc_ = NULL; } // Register output tensor in tensor Map bool inserted = e->register_tensor(top_[0], ACT, tenTop_); if(!inserted) printf("Warning: Tensor %s already registered\n",top_[0].c_str()); // Register weight tensor in tensor Map inserted = e->register_tensor(weight_, FCWEIGHT, tenWeight_); if(!inserted) printf("Warning: Tensor %s already registered\n",weight_.c_str()); // Register bias tensor in tensor Map if(p->get_bias_term()) { inserted = e->register_tensor(bias_, FCBIAS, tenBias_); if(!inserted) printf("Warning: Tensor %s already registered\n",bias_.c_str()); } // Setup parameter structure for computation in library gparams_.node_name = nname_; gparams_.batch_size = bs->dims[0]; gparams_.nInput = bs->dims[1]; gparams_.nOutput = ts_.dims[1]; gparams_.iHeight = bs->dims[2]; gparams_.iWidth = bs->dims[3]; 
gparams_.oHeight = ts_.dims[2]; gparams_.oWidth = ts_.dims[3]; gparams_.kh = 1; gparams_.kw = 1; gparams_.bias_term = p->get_bias_term(); gparams_.in_data_type = in_dtype; gparams_.out_data_type = out_dtype; gparams_.algType = p->get_algo_type(); gparams_.num_threads = e->get_num_threads(); // get solver solver_ = e->getSolver(); //get global scratch tensor buffer tenScratchData_ = e->getScratchBuffer(); //get engine eptr_ = e; #ifdef USE_MLSL MLSL::DataType dt = MLSL::DT_FLOAT; MLSL::OperationRegInfo *myRegInfo; MLSL::Session *s = eptr_->get_session(); myRegInfo = s->CreateOperationRegInfo(MLSL::OT_CC); myRegInfo->SetName(nname_.c_str()); myRegInfo->AddParameterSet(gparams_.nInput*gparams_.nOutput, gparams_.kh*gparams_.kw, dt, false); if (gparams_.bias_term) { myRegInfo->AddParameterSet(gparams_.nOutput, 1, dt, false); } myRegInfo->Validate(); size_t opIdx = s->AddOperation(myRegInfo, e->get_distribution()); this->op_ = s->GetOperation(opIdx); s->DeleteOperationRegInfo(myRegInfo); e->get_wtgrad_comms_vec().push_back(op_); #endif configure(p->get_compute_engine()); } void FCNode::fillWeightBuffers(TensorBuf* tBuf, int buftype, long long int size) { int dtype = tBuf->getBufferType(); void *ptr = tBuf->getBuffer(); #ifdef USE_MLSL unsigned int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else unsigned int node_id = 0; #endif int ic = gparams_.nInput; int oc = gparams_.nOutput; int welem = ic * oc; int fanin = ic; int fanout = oc; if(buftype == DATA) { if(node_id == 0) initBuffer(ptr, variance_norm_, fanin, fanout, welem*sizeof(float), wfiller_type_, std_); #ifdef USE_MLSL MPI_Bcast(ptr, welem, MPI_FLOAT, 0, MPI_COMM_WORLD); #endif } else if(buftype == HISTORY || buftype == DIFF) memset(ptr, 0, size); } void FCNode::fillWeightMultipliers(float* lr, float* decay, long long int size) { for(int i=0; i < size; i++) { lr[i] = lr_mult_[0]; decay[i] = decay_mult_[0]; } } void FCNode::fillBiasBuffers(TensorBuf* tBuf, int buftype, long long int size) { int dtype = 
tBuf->getBufferType(); void *ptr = tBuf->getBuffer(); if(buftype == DATA) { assert(bfiller_type_.compare("constant") == 0); initConstantBuffer(ptr, size, "CONSTANT", value_); } else memset(ptr, 0, size); } void FCNode::fillBiasMultipliers(float* lr, float* decay, long long int size) { if(gparams_.bias_term) { for(int i=0; i < size; i++) { lr[i] = lr_mult_[1]; decay[i] = decay_mult_[1]; } } } void FCNode::Checkpoint(TensorBuf *tBuf, string name, string format) { long long int bytes = tBuf->getBufferSize(); int dtype = tBuf->getBufferType(); void* ptr = tBuf->getBuffer(); FILE* f; size_t pos; if((name.find("30") == name.npos) && (name.find("60") == name.npos) && (name.find("80") == name.npos)) while((pos = name.find("/", 10)) != name.npos) name.replace(pos, 1, 1, '_'); float* p = (float*)ptr; bool no_checkpt = false; for(int i=0; i<16; i++) { if(isnan(p[i]) || isinf(p[i])) { no_checkpt = true; printf("Warning! %s Did not checkpoint! Weights are NaNs or Inf\n", nname_.c_str()); break; } } if(!no_checkpt) { if(format.compare("binary") == 0) { f = fopen(name.c_str(), "wb"); if(f != NULL) { size_t b = fwrite(ptr, 1, bytes, f); assert((long long int)b == bytes); } else printf("Warning: could not checkpoint to file %s\n",name.c_str()); } else { f = fopen(name.c_str(), "w"); if(f != NULL) { if(dtype == DT_FLOAT) { for(int i=0; i 0) { for(int i=ntps*jobs; igetBuffer()); void* wt = (void*)(tenWeightData_->getBuffer()); void* bias; if(gparams_.bias_term) bias = (void*)(tenBiasData_->getBuffer()); void* top = (void*)(tenTopData_->getBuffer()); printf("Executing FP %s: input %p, weights %p, output %p\n",NNNode::nname_.c_str(), bot, wt, top); fflush(NULL); printf("Inputs: %d x %d\n",gparams_.batch_size, gparams_.nInput*gparams_.iHeight*gparams_.iWidth); printf("Outputs: %d x %d\n",gparams_.batch_size, gparams_.nOutput*gparams_.oHeight*gparams_.oWidth); printf("Weights: %d x %d x %d x %d\n", gparams_.nInput, gparams_.nOutput, gparams_.kw, gparams_.kw); #endif 
impl->set_bot_compute_engine(bot_cengine_); impl->set_top_compute_engine(top_compute_engine_); impl->set_node_name(nname_); impl->set_scratch_buffer(tenScratchData_); impl->forwardPropagate(tenBotData_, tenWeightData_, tenWeightInc_, tenBiasData_, tenTopData_); #ifdef CHECK_BLOWUP_FP32 if(out_dtype == DT_FLOAT) { for(int i=0; i<16; i++) { float v = ((float*)tenTopData_->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(out_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)tenTopData_->getBuffer(), cbptr, 16); for(int i=0; i<16; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } #endif #ifdef GETSTATS #ifdef USE_MLSL unsigned int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); if(node_id == 0 && eptr_->get_current_batch() % STATFREQ == 0) #else if(eptr_->get_current_batch() % STATFREQ == 0) #endif { if(in_dtype == DT_FLOAT) { string s = nname_ + "_Inp"; float *ptr = (float*)tenBotData_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, gparams_.batch_size*gparams_.nInput); } else if(in_dtype == DT_BF16) { if(stptr == NULL) { int os = nImg*ofm; int is = nImg*ifm; int ws = ifm*ofm; int m = os < is ? is : os; int msize = m < ws ? 
ws : m; stptr = (float*)libxsmm_aligned_malloc(msize*sizeof(float), 2097152); } string s = nname_ + "_Inp"; libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenBotData_->getBuffer(); convert_bf16_f32(ptr, stptr, gparams_.batch_size*gparams_.nInput); MeanOfLayer((char*)s.c_str(), stptr, gparams_.batch_size*gparams_.nInput); } string s = nname_ + "_Wt"; float *ptr = (float*)tenWeightData_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, gparams_.nInput*gparams_.nOutput); if(out_dtype == DT_FLOAT) { string s = nname_ + "_Outp"; float *ptr = (float*)tenTopData_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, gparams_.batch_size*gparams_.nOutput); } else if(out_dtype == DT_BF16) { string s = nname_ + "_Outp"; libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenTopData_->getBuffer(); convert_bf16_f32(ptr, stptr, gparams_.batch_size*gparams_.nOutput); MeanOfLayer((char*)s.c_str(), stptr, gparams_.batch_size*gparams_.nOutput); } if(gparams_.bias_term) { string s = nname_ + "_Bias"; float *ptr = (float*)tenBiasData_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, gparams_.nOutput); } } #endif } void FCNode::backPropagate() { tenTopDiff_ = tenTop_->getBuf(DIFF); #ifdef DEBUG void* top = (void*)(tenTopData_->getBuffer()); void *gtop = (void*)(tenTopDiff_->getBuffer()); assert(gtop != NULL); void* wt = (void*)(tenWeightData_->getBuffer()); void* gbot = (void*)(tenBotDiff_->getBuffer()); printf("Executing BP %s: grad_output %p, weights %p, grad_input %p\n",NNNode::nname_.c_str(), gtop, wt, gbot); printf("Grad Outputs: %d x %d\n", gparams_.batch_size, gparams_.nOutput); printf("Grad Inputs: %d x %d\n", gparams_.batch_size, gparams_.nInput); printf("Weights: %d x %d x %d x %d\n", gparams_.nOutput, gparams_.nInput, gparams_.kh, gparams_.kw); #endif impl->backPropagate(tenTopDiff_, tenWeightData_, tenBotDiff_); #ifdef CHECK_BLOWUP_FP32 if(out_dtype == DT_FLOAT) { for(int i=0; i<16; i++) { float v = ((float*)tenBotDiff_->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! 
%s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(out_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)tenBotDiff_->getBuffer(), cbptr, 16); for(int i=0; i<16; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } #endif #ifdef GETSTATS #ifdef USE_MLSL unsigned int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); if(node_id == 0 && eptr_->get_current_batch() % STATFREQ == 0) #else if(eptr_->get_current_batch() % STATFREQ == 0) #endif { if(out_dtype == DT_FLOAT) { string s = nname_ + "_delOutp"; float *ptr = (float*)tenTopDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, gparams_.batch_size*gparams_.nOutput); } else if(out_dtype == DT_BF16) { string s = nname_ + "_delOutp"; libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenTopDiff_->getBuffer(); convert_bf16_f32(ptr, stptr, gparams_.batch_size*gparams_.nOutput); MeanOfLayer((char*)s.c_str(), stptr, gparams_.batch_size*gparams_.nOutput); } string s = nname_ + "_Wt"; float *ptr = (float*)tenWeightData_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, gparams_.nInput*gparams_.nOutput); if(in_dtype == DT_FLOAT) { string s = nname_ + "_delInp"; float *ptr = (float*)tenBotDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, gparams_.batch_size*gparams_.nInput); } else if(in_dtype == DT_BF16) { string s = nname_ + "_delInp"; libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenBotDiff_->getBuffer(); convert_bf16_f32(ptr, stptr, gparams_.batch_size*gparams_.nInput); MeanOfLayer((char*)s.c_str(), stptr, gparams_.batch_size*gparams_.nInput); } } #endif } void FCNode::weightUpdate() { tenTopDiff_ = tenTop_->getBuf(DIFF); #ifdef DEBUG void *gtop = (void*)(tenTopDiff_->getBuffer()); assert(gtop != NULL); void* bot = (void*)(tenBotData_->getBuffer()); void* gwt = (void*)(tenWeightDiff_->getBuffer()); void* gbias; if(gparams_.bias_term) gbias = (void*)(tenBiasDiff_->getBuffer()); printf("Executing WU 
%s: grad_output %p, grad_weights %p, grad_biases %p, input %p\n",NNNode::nname_.c_str(), gtop, gwt, gbias, bot); printf("Grad Outputs: %d x %d\n", gparams_.batch_size, gparams_.nOutput); printf("Inputs: %d x %d\n", gparams_.batch_size, gparams_.nInput); printf("Grad Weights: %d x %d x %d x %d\n", gparams_.nOutput, gparams_.nInput, gparams_.kh, gparams_.kw); printf("Grad Biases: %d\n", gparams_.nOutput); #endif impl->weightUpdate(tenTopDiff_, tenBotData_, tenWeightDiff_, tenBiasDiff_); #ifdef CHECK_BLOWUP_FP32 if(out_dtype == DT_FLOAT) { for(int i=0; i<16; i++) { float v = ((float*)tenWeightDiff_->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(out_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)tenWeightDiff_->getBuffer(), cbptr, 16); for(int i=0; i<16; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } #endif #ifdef USE_MLSL void *mptr = tenWeightDiff_->getBuffer(); #ifndef BF16_MLSL void *lmptr = tenWeightDiff_->getLPBuffer(); if(in_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)lmptr, (float*)mptr, gparams_.nInput*gparams_.nOutput); op_->GetParameterSet(0)->StartGradientComm(mptr); } else if(in_dtype == DT_FLOAT) op_->GetParameterSet(0)->StartGradientComm(mptr); #else op_->GetParameterSet(0)->StartGradientComm(mptr); #endif if(gparams_.bias_term) op_->GetParameterSet(1)->StartGradientComm(tenBiasDiff_->getBuffer()); #endif } void FCNode::solverStep() { #ifdef RETURNALL return; #endif void *gwt = tenWeightDiff_->getBuffer(); void *gbias; if(gparams_.bias_term) gbias = (void*)(tenBiasDiff_->getBuffer()); int wsize = gparams_.nInput*gparams_.nOutput; #ifdef USE_MLSL void *mptr = op_->GetParameterSet(0)->WaitGradientComm(); if(in_dtype == DT_FLOAT) { if(mptr != NULL && mptr != gwt) memcpy((void*)gwt, mptr, wsize*sizeof(float)); } else if(in_dtype == DT_BF16) { 
if(mptr != NULL && mptr != dwptr) memcpy((void*)dwptr, mptr, wsize*sizeof(float)); convert_f32_bf16(dwptr, (libxsmm_bfloat16*)gwt, wsize); } if(gparams_.bias_term) { mptr = op_->GetParameterSet(1)->WaitGradientComm(); if(mptr != NULL && mptr != gbias) memcpy((void*)gbias, mptr, gparams_.nOutput*sizeof(float)); } #endif } libxsmm-1.17/samples/deeplearning/gxm/src/FusedBNorm.cpp000066400000000000000000000644451415223013700233230ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #include #include "FusedBNorm.hpp" using namespace std; using namespace gxm; FusedBNormNode::FusedBNormNode(FusedBNormParams* p, MLEngine* e): NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); bottom_ = p->get_bottom_names(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = true; bot_compute_engine_ = p->get_compute_engine(); tenTop_ = new Tensor(top_[0]); assert(tenTop_ != NULL); tenTop_->setOwner(this); tenTop_->setType(ACT); tenTopData_ = tenTop_->getBuf(DATA); tenTopData_->setBufferType(DATA); #ifdef DEBUG for(int i=0; iget_tensor(bottom_[i], ACT); assert(tenBot_[i] != NULL); NNNode *pnn = (NNNode*)tenBot_[i]->getOwner(); setPrevNode(pnn); pnn->set_top_compute_engine(p->get_compute_engine()); bot_cengine_[i] = pnn->get_bot_compute_engine(); tenBotData_[i] = tenBot_[i]->getBuf(DATA); } in_dtype = tenBotData_[0]->getDataType(); out_dtype = p->get_data_type(); tenTopData_->setDataType(out_dtype); 
vector vp = p->get_pads(); vector ivp = p->get_ipads(); vector vs = p->get_strides(); // Get input tensor shape (bottom) // Even if there are two inputs for eltwise operation, their shapes are the same. So, pick the first one Shape* bs = tenBot_[0]->getShape(); assert(bs->ndims <= MAX_DIMS); shape_setzero(&ts_); ts_.ndims = bs->ndims; ts_.dims[0] = bs->dims[0]; ts_.dims[1] = bs->dims[1]; ts_.dims[2] = bs->dims[2]/vs[0]; ts_.dims[3] = bs->dims[3]/vs[1]; tenTop_->setShape(&ts_); int telem = ts_.dims[0] * ts_.dims[1] * (ts_.dims[2] + 2*vp[0]) * (ts_.dims[3] + 2*vp[1]); long long int tsize; if(in_dtype == DT_FLOAT && out_dtype == DT_FLOAT) tsize = telem*sizeof(float); else if(in_dtype == DT_FLOAT && out_dtype == DT_BF16) tsize = telem*sizeof(float) + telem*sizeof(libxsmm_bfloat16); else if(in_dtype == DT_BF16 && out_dtype == DT_BF16) tsize = telem*sizeof(libxsmm_bfloat16); tenTopData_->setBufferSize(tsize); Shape sss; shape_setzero(&sss); sss.ndims = 1; sss.dims[0] = bs->dims[1]; scale_ = top_[0] + "_scale"; tenScale_ = new Tensor(scale_); assert(tenScale_ != NULL); tenScale_->setOwner(this); tenScale_->setType(BNORMSCALE); tenScale_->setShape(&sss); tenScaleData_ = tenScale_->getBuf(DATA); tenScaleData_->setDataType(DT_FLOAT); tenScaleData_->setBufferType(DATA); telem = sss.dims[0]; tsize = telem*sizeof(float); tenScaleData_->setBufferSize(tsize); shift_ = top_[0] + "_shift"; tenShift_ = new Tensor(shift_); assert(tenShift_ != NULL); tenShift_->setOwner(this); tenShift_->setType(BNORMSHIFT); tenShift_->setShape(&sss); tenShiftData_ = tenShift_->getBuf(DATA); tenShiftData_->setDataType(DT_FLOAT); tenShiftData_->setBufferType(DATA); tenShiftData_->setBufferSize(tsize); // number of inputs gparams_.nInput.resize(bottom_.size()); tenBotDiff_.resize(bottom_.size()); mean_ = top_[0] + "_mean"; tenMean_ = new Tensor(mean_); assert(tenMean_ != NULL); tenMean_->setOwner(this); tenMean_->setType(BNORMMEAN); tenMean_->setShape(&sss); tenMeanData_ = tenMean_->getBuf(DATA); 
tenMeanData_->setDataType(DT_FLOAT); tenMeanData_->setBufferType(DATA); tenMeanData_->setBufferSize(tsize); var_ = top_[0] + "_var"; tenVar_ = new Tensor(var_); assert(tenVar_ != NULL); tenVar_->setOwner(this); tenVar_->setType(BNORMVAR); tenVar_->setShape(&sss); tenVarData_ = tenVar_->getBuf(DATA); tenVarData_->setDataType(DT_FLOAT); tenVarData_->setBufferType(DATA); tenVarData_->setBufferSize(tsize); for(int i=0; igetShape(); assert(bs->ndims <= MAX_DIMS); gparams_.nInput[i] = bs->dims[1]; if(!e->is_inference_only()) { if(bp_flag_) { tenBotDiff_[i] = tenBot_[i]->addBuf(); // DIFF type and index tenBotDiff_[i]->setDataType(in_dtype); tenBotDiff_[i]->setBufferType(DIFF); int belem = bs->dims[0] * bs->dims[1] * (bs->dims[2] + 2*ivp[0]) * (bs->dims[3] + 2*ivp[1]); long long int bsize; if(in_dtype == DT_FLOAT && out_dtype == DT_FLOAT) bsize = belem*sizeof(float); else if(in_dtype == DT_FLOAT && out_dtype == DT_BF16) bsize = belem*sizeof(float) + belem*sizeof(libxsmm_bfloat16); else if(in_dtype == DT_BF16 && out_dtype == DT_BF16) bsize = belem*sizeof(libxsmm_bfloat16); // Set the size of the input-gradient buffer tenBotDiff_[i]->setBufferSize(bsize); } } else tenBotDiff_[i] = NULL; } if(!e->is_inference_only()) { if(has_weights_) { tenScaleDiff_ = tenScale_->addBuf(); tenScaleDiff_->setDataType(DT_FLOAT); tenScaleDiff_->setBufferType(DIFF); tenScaleDiff_->setBufferSize(tsize); tenScaleInc_ = tenScale_->addBuf(); tenScaleInc_->setDataType(DT_FLOAT); tenScaleInc_->setBufferType(HISTORY); tenScaleInc_->setBufferSize(tsize); tenShiftDiff_ = tenShift_->addBuf(); tenShiftDiff_->setDataType(DT_FLOAT); tenShiftDiff_->setBufferType(DIFF); tenShiftDiff_->setBufferSize(tsize); tenShiftInc_ = tenShift_->addBuf(); tenShiftInc_->setDataType(DT_FLOAT); tenShiftInc_->setBufferType(HISTORY); tenShiftInc_->setBufferSize(tsize); } } else { tenScaleDiff_ = NULL; tenShiftDiff_ = NULL; tenScaleInc_ = NULL; tenShiftInc_ = NULL; } // Register output tensor in tensor map bool inserted = 
e->register_tensor(top_[0], ACT, tenTop_); if(!inserted) printf("Warning: Tensor %s already registered\n",top_[0].c_str()); inserted = e->register_tensor(scale_, BNORMSCALE, tenScale_); if(!inserted) printf("Warning: Tensor %s already registered\n",scale_.c_str()); inserted = e->register_tensor(shift_, BNORMSHIFT, tenShift_); if(!inserted) printf("Warning: Tensor %s already registered\n",shift_.c_str()); inserted = e->register_tensor(mean_, BNORMMEAN, tenMean_); if(!inserted) printf("Warning: Tensor %s already registered\n",mean_.c_str()); inserted = e->register_tensor(var_, BNORMVAR, tenVar_); if(!inserted) printf("Warning: Tensor %s already registered\n",var_.c_str()); gparams_.bdims = gparams_.tdims = bs->ndims; gparams_.batch_size = bs->dims[0]; gparams_.node_name = nname_; gparams_.nOutput = bs->dims[1]; gparams_.iHeight = bs->dims[2]; gparams_.iWidth = bs->dims[3]; gparams_.oHeight = ts_.dims[2]; gparams_.oWidth = ts_.dims[3]; gparams_.pad_h = vp[0]; gparams_.pad_w = vp[1]; gparams_.ipad_h = ivp[0]; gparams_.ipad_w = ivp[1]; gparams_.stride_h = vs[0]; gparams_.stride_w = vs[1]; gparams_.mmf = p->get_mmf(); gparams_.eps = p->get_eps(); gparams_.relu = p->get_relu(); gparams_.bwd_relu = p->get_bwd_relu(); gparams_.eltwise = p->get_eltwise(); gparams_.in_data_type = in_dtype; gparams_.out_data_type = out_dtype; gparams_.algType = p->get_algo_type(); gparams_.num_threads = e->get_num_threads(); gparams_.num_numa_nodes = NUM_NUMA_NODES; gparams_.use_global_stats = false; lr_mult_ = p->get_lr_mult(); decay_mult_ = p->get_decay_mult(); configure(p->get_compute_engine()); solver_ = e->getSolver(); eptr_ = e; //get global scratch tensor buffer tenScratchData_ = e->getScratchBuffer(); #ifdef USE_MLSL MLSL::DataType dt = MLSL::DT_FLOAT; MLSL::OperationRegInfo *myRegInfo; MLSL::Session *s = eptr_->get_session(); myRegInfo = s->CreateOperationRegInfo(MLSL::OT_BIAS); myRegInfo->AddParameterSet(gparams_.nOutput, 1, dt, false); myRegInfo->AddParameterSet(gparams_.nOutput, 1, 
dt, false); myRegInfo->AddParameterSet(gparams_.nOutput, 1, dt, false); myRegInfo->AddParameterSet(gparams_.nOutput, 1, dt, false); myRegInfo->Validate(); size_t opIdx = s->AddOperation(myRegInfo, e->get_distribution()); this->op_ = s->GetOperation(opIdx); s->DeleteOperationRegInfo(myRegInfo); e->get_bias_grad_comms_vec().push_back(op_); #endif }; void FusedBNormNode::configure(int engine) { switch(engine) { case XSMM: impl = new FusedBNormXSMM(&gparams_, engine); break; } } void FusedBNormNode::convert_bf16_f32(libxsmm_bfloat16* in, float* out, int len) { int i; #ifdef _OPENMP #pragma omp parallel for private(i) #endif for ( i = 0; i < len; i+=16 ) { __m256i vbfp16 = _mm256_loadu_si256( (const __m256i*)(in+i) ); __m512 vfp32 = gxm_bfp16_to_fp32_avx512f( vbfp16 ); _mm512_storeu_ps( out+i, vfp32 ); } } void FusedBNormNode::fillBuffer(TensorBuf *tBuf, int buftype, long long int bytes) { int ttype = tBuf->getTensor()->getType(); int dtype = tBuf->getDataType(); void *ptr = tBuf->getBuffer(); if(ttype==BNORMSCALE && buftype == DATA) { if(nname_.find("bn3") == nname_.npos) initConstantBuffer(ptr, bytes, "CONSTANT", 1.0f); else initConstantBuffer(ptr, bytes, "CONSTANT", 0.0f); } else initConstantBuffer(ptr, bytes, "CONSTANT", 0.0f); } void FusedBNormNode::fillBiasMultipliers(float* lr, float* decay, long long int size) { for(int i=0; i < size; i++) { lr[i] = lr_mult_; decay[i] = decay_mult_; } } void FusedBNormNode::Checkpoint(TensorBuf *tBuf, string name, string format) { long long int bytes = tBuf->getBufferSize(); int dtype = tBuf->getDataType(); FILE* f; void* ptr; size_t pos; if((name.find("30") == name.npos) && (name.find("60") == name.npos) && (name.find("80") == name.npos)) while((pos = name.find("/", 10)) != name.npos) name.replace(pos, 1, 1, '_'); float* p = (float*)tBuf->getBuffer(); bool no_checkpt = false; for(int i=0; i<16; i++) { if(isnan(p[i]) || isinf(p[i])) { no_checkpt = true; printf("Warning! %s Did not checkpoint! 
Weights are NaNs or Inf\n", nname_.c_str()); break; } } if(!no_checkpt) { if(format == "binary") { f = fopen(name.c_str(), "wb"); if(f != NULL) { if(name.find("mean") != name.npos || name.find("var") != name.npos) ptr = tBuf->getPrivBuffer(); else ptr = tBuf->getBuffer(); size_t b = fwrite(ptr, 1, bytes, f); assert((long long int)b == bytes); } else printf("Warning: could not checkpoint to file %s\n",name.c_str()); } else { f = fopen(name.c_str(), "w"); if(f != NULL) { ptr = tBuf->getBuffer(); if(dtype == DT_FLOAT) { for(int i=0; iget_execution_mode() == TRAIN) // || eptr_->get_execution_mode() == VAL) { impl->set_global_stats(false); gparams_.exec_mode = "TRAIN"; } else if(eptr_->get_execution_mode() == TEST || eptr_->get_execution_mode() == VAL) { impl->set_global_stats(true); gparams_.exec_mode = "TEST"; } if(first_fp) { impl->set_bot_compute_engine(bot_cengine_[0]); impl->set_top_compute_engine(top_compute_engine_); impl->set_node_name(nname_); impl->set_scratch_buffer(tenScratchData_); #if 0 if(eptr_->get_execution_mode() == TRAIN) // || eptr_->get_execution_mode() == VAL) { impl->set_global_stats(false); gparams_.exec_mode = "TRAIN"; } else if(eptr_->get_execution_mode() == TEST || eptr_->get_execution_mode() == VAL) { impl->set_global_stats(true); gparams_.exec_mode = "TEST"; } #endif int size = nImg * ofm * (ofh + 2*oph) * (ofw + 2*opw); if((gparams_.in_data_type == DT_FLOAT && gparams_.out_data_type == DT_FLOAT) || (gparams_.in_data_type == DT_FLOAT && gparams_.out_data_type == DT_BF16)) { float* ptr = (float*)tenTopData_->getBuffer(); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer(); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; iget_scaling_factor(); impl->set_scaling_factor(scf_); void** meanp = tenMeanData_->getBufferPtr(); void** varp = tenVarData_->getBufferPtr(); for(int n=0; nforwardPropagate(tenBotData_, tenScaleData_, tenShiftData_, tenMeanData_, tenVarData_, tenTopData_, 0); if(eptr_->get_execution_mode() != 
TEST && eptr_->get_execution_mode() != VAL) { scf_ *= gparams_.mmf; scf_ += 1.; eptr_->set_scaling_factor(scf_); } #ifdef CHECK_BLOWUP_FP32 if(out_dtype == DT_FLOAT) { for(int i=0; i<16; i++) { float v = ((float*)tenTopData_->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(out_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)tenTopData_->getBuffer(), cbptr, 10240); for(int i=0; i<10240; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } #endif #ifdef GETSTATS #ifdef USE_MLSL unsigned int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else unsigned int node_id = 0; #endif if(node_id == 0 && eptr_->get_current_batch() % STATFREQ == 0) { if(in_dtype == DT_FLOAT) { float *ptr = (float*)tenBotData_[0]->getBuffer(); string s = nname_ + "_r_Inp"; MeanOfLayer((char*)s.c_str(), ptr, nImg*ifm0*ifhp*ifwp); if(gparams_.nInput.size() > 1) { ptr = (float*)tenBotData_[1]->getBuffer(); s = nname_ + "_l_Inp"; MeanOfLayer((char*)s.c_str(), ptr, nImg*ifm1*ifh*ifw); } } else if(in_dtype == DT_BF16) { if(stptr == NULL) { int s = nImg*ofm*ofhp*ofwp; int is = nImg*ofm*ifhp*ifwp; if(s > is) stptr = (float*)libxsmm_aligned_malloc(s*sizeof(float), 2097152); else stptr = (float*)libxsmm_aligned_malloc(is*sizeof(float), 2097152); } libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenBotData_[0]->getBuffer(); string s = nname_ + "_r_Inp"; convert_bf16_f32(ptr, stptr, nImg*ifm0*ifhp*ifwp); MeanOfLayer((char*)s.c_str(), stptr, nImg*ifm0*ifhp*ifwp); if(gparams_.nInput.size() > 1) { ptr = (libxsmm_bfloat16*)tenBotData_[1]->getBuffer(); convert_bf16_f32(ptr, stptr, nImg*ifm1*ifhp*ifwp); s = nname_ + "_l_Inp"; MeanOfLayer((char*)s.c_str(), stptr, nImg*ifm1*ifh*ifw); } } string s = nname_ + "_gammap0"; float* gamma = (float*)tenScaleData_->getBuffer(); MeanOfLayer((char*)s.c_str(), gamma, gparams_.nOutput); 
#if 0 void **g = tenScaleData_->getBufferPtr(); float *g1 = (float*)g[1] + tenScaleData_->getOffset(); s = nname_ + "_gammap1"; MeanOfLayer((char*)s.c_str(), g1, gparams_.nOutput); #endif s = nname_ + "_betap0"; float* beta = (float*)tenShiftData_->getBuffer(); MeanOfLayer((char*)s.c_str(), beta, gparams_.nOutput); #if 0 void **b = tenShiftData_->getBufferPtr(); float *b1 = (float*)b[1] + tenShiftData_->getOffset(); s = nname_ + "_betap1"; MeanOfLayer((char*)s.c_str(), b1, gparams_.nOutput); #endif if(gparams_.exec_mode == "TEST") { float meanp[2048], varp[2048], stdevp[2048]; s = nname_ + "_meanp"; float *mean = (float*)tenMeanData_->getBuffer(); for(int i=0; igetBuffer(); for(int i=0; igetBuffer(); string s = nname_ + "_Outp"; int size = nImg*ofm*(ofh + 2*gparams_.pad_h)*(ofw + 2*gparams_.pad_w); MeanOfLayer((char*)s.c_str(), (float*)ptr, size); } else if(out_dtype == DT_BF16) { libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenTopData_->getBuffer(); s = nname_ + "_Outp"; int size = nImg*ofm*(ofh + 2*gparams_.pad_h)*(ofw + 2*gparams_.pad_w); convert_bf16_f32(ptr, stptr, size); MeanOfLayer((char*)s.c_str(), stptr, size); } } #endif } void FusedBNormNode::backPropagate() { int nImg = gparams_.batch_size; int ifm0 = gparams_.nInput[0]; int ifm1 = gparams_.nInput[1]; int ofm = gparams_.nOutput; int ifh = gparams_.iHeight; int ifhp = ifh + 2*gparams_.ipad_h; int ifw = gparams_.iWidth; int ifwp = ifw + 2*gparams_.ipad_w; int ofh = gparams_.oHeight; int ofw = gparams_.oWidth; int oph = gparams_.pad_h; int opw = gparams_.pad_w; int ofhp = ofh + 2*oph; int ofwp = ofw + 2*opw; int sh = gparams_.stride_h; int sw = gparams_.stride_w; tenTopDiff_ = tenTop_->getBuf(DIFF); #ifndef NDEBUG int offset = gparams_.batch_size * gparams_.nInput[0] * gparams_.iHeight * gparams_.iWidth; float *gtop = (float*)(tenTopDiff_->getBuffer()); assert(gtop != NULL); float* gbot = (float*)(tenBotDiff_[0]->getBuffer()); float* bot = (float*)(tenBotData_[0]->getBuffer()); float* mean = bot + offset; 
printf("Executing BP %s: grad_output %p, mean %p, grad_input %p\n",NNNode::nname_.c_str(), gtop, mean, gbot); printf("Inputs: %d x %d x %d\n",gparams_.nInput[0], gparams_.iHeight, gparams_.iWidth); printf("Grad Inputs: %d x %d x %d\n",gparams_.nInput[0], gparams_.iHeight, gparams_.iWidth); printf("Grad Outputs: %d x %d x %d\n",gparams_.nOutput, gparams_.oHeight, gparams_.oWidth); #endif if(first_bp) { if(in_dtype == DT_FLOAT) { int size = nImg*ifm0*ifhp*ifwp; float* gbot0 = (float*)(tenBotDiff_[0]->getBuffer()); float* gbot1 = gparams_.eltwise ? (float*)(tenBotDiff_[1]->getBuffer()) : NULL; #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer()); libxsmm_bfloat16* gbot1 = gparams_.eltwise ? (libxsmm_bfloat16*)(tenBotDiff_[1]->getBuffer()) : NULL; int size = nImg*ifm0*ifhp*ifwp; #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; ibackPropagate(tenTopDiff_, tenScaleDiff_, tenShiftDiff_, tenBotDiff_, 0); #ifdef CHECK_BLOWUP_FP32 if(out_dtype == DT_FLOAT) { for(int i=0; i<10240; i++) { float v = ((float*)tenBotDiff_[0]->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! %s layer BP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(out_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)tenBotDiff_[0]->getBuffer(), cbptr, 10240); #ifdef USE_MLSL int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else int node_id = 0; #endif if(node_id == 0) { for(int i=0; i<10240; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! 
%s layer BP activations are NaN or Inf\n", nname_.c_str()); MeanOfLayer((char*)nname_.c_str(), (libxsmm_bfloat16*)tenBotDiff_[0]->getBuffer(), nImg*ifm0*ifhp*ifwp); if(gparams_.eltwise) MeanOfLayer((char*)nname_.c_str(), (libxsmm_bfloat16*)tenBotDiff_[1]->getBuffer(), nImg*ifm1*ifhp*ifwp); #ifdef USE_MLSL MPI_Finalize(); #endif exit(-1); } } } } #endif #ifdef GETSTATS #ifdef USE_MLSL unsigned int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else unsigned int node_id = 0; #endif if(node_id == 0 && eptr_->get_current_batch() % STATFREQ == 0) { if(out_dtype == DT_FLOAT) { float *ptr = (float*)tenTopDiff_->getBuffer(); int size = nImg*ofm*ofhp*ofwp; string s = nname_ + "_delOutp"; MeanOfLayer((char*)s.c_str(), ptr, size); } else if(out_dtype == DT_BF16) { libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenTopDiff_->getBuffer(); int size = nImg*ofm*ofhp*ofwp; convert_bf16_f32(ptr, stptr, size); string s = nname_ + "_delOutp"; MeanOfLayer((char*)s.c_str(), stptr, size); } string s = nname_ + "_delgammap0"; float* delgamma = (float*)tenScaleDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), delgamma, gparams_.nOutput); #if 0 void **g = tenScaleDiff_->getBufferPtr(); float *g1 = (float*)g[1] + tenScaleDiff_->getOffset(); s = nname_ + "_delgammap1"; MeanOfLayer((char*)s.c_str(), g1, gparams_.nOutput); #endif s = nname_ + "_delbetap0"; float* delbeta = (float*)tenShiftDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), delbeta, gparams_.nOutput); #if 0 void **b = tenShiftDiff_->getBufferPtr(); float *b1 = (float*)b[1] + tenShiftDiff_->getOffset(); s = nname_ + "_delbetap1"; MeanOfLayer((char*)s.c_str(), b1, gparams_.nOutput); #endif if(in_dtype == DT_FLOAT) { float *ptr = (float*)tenBotDiff_[0]->getBuffer(); string s = nname_ + "_delInp"; int size = nImg*ifm0*ifhp*ifwp; MeanOfLayer((char*)s.c_str(), ptr, size); } else if(in_dtype == DT_BF16) { libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenBotDiff_[0]->getBuffer(); s = nname_ + "_delInp"; int size = nImg*ifm0*ifhp*ifwp; 
convert_bf16_f32(ptr, stptr, size); MeanOfLayer((char*)s.c_str(), stptr, size); } } #endif } void FusedBNormNode::weightUpdate() { #ifdef USE_MLSL void* gexp_test = tenMeanData_->getPrivBuffer(); void* gvar_test = tenVarData_->getPrivBuffer(); float *gmean = (float*)tenMeanData_->getBuffer(); float *gvar = (float*)tenVarData_->getBuffer(); int num_nodes = eptr_->get_num_machines(); for(int i=0; iGetParameterSet(0)->StartGradientComm(tenScaleDiff_->getBuffer()); op_->GetParameterSet(1)->StartGradientComm(tenShiftDiff_->getBuffer()); op_->GetParameterSet(2)->StartGradientComm(gexp_test); op_->GetParameterSet(3)->StartGradientComm(gvar_test); #endif } void FusedBNormNode::solverStep() { #if defined(USE_MLSL) || defined(CHECK_BLOWUP_FP32) float *delgamma = (float*)tenScaleDiff_->getBuffer(); float *delbeta = (float*)tenShiftDiff_->getBuffer(); void* gexp_test = tenMeanData_->getPrivBuffer(); void* gvar_test = tenVarData_->getPrivBuffer(); #endif #ifdef USE_MLSL void *mptr = op_->GetParameterSet(0)->WaitGradientComm(); if(mptr != NULL && mptr != delgamma) memcpy((void*)delgamma, mptr, gparams_.nOutput*sizeof(float)); mptr = op_->GetParameterSet(1)->WaitGradientComm(); if(mptr != NULL && mptr != delbeta) memcpy((void*)delbeta, mptr, gparams_.nOutput*sizeof(float)); mptr = op_->GetParameterSet(2)->WaitGradientComm(); if(mptr != NULL && mptr != gexp_test) memcpy((void*)gexp_test, mptr, gparams_.nOutput*sizeof(float)); mptr = op_->GetParameterSet(3)->WaitGradientComm(); if(mptr != NULL && mptr != gvar_test) memcpy((void*)gvar_test, mptr, gparams_.nOutput*sizeof(float)); #endif #ifdef CHECK_BLOWUP_FP32 for(int i=0; i<16; i++) { if(isnan(delgamma[i]) || isinf(delgamma[i])) { printf("Warning! %s layer Solver gamma gradients are NaN or Inf\n", nname_.c_str()); exit(-1); } } for(int i=0; i<16; i++) { if(isnan(delbeta[i]) || isinf(delbeta[i])) { printf("Warning! 
%s layer Solver beta gradients are NaN or Inf\n", nname_.c_str()); exit(-1); } } #endif } libxsmm-1.17/samples/deeplearning/gxm/src/FusedBNormXSMM.cpp000066400000000000000000001124441415223013700240210ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar, Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include #include #include "FusedBNormXSMM.hpp" #define VLEN 16 FusedBNormXSMM::FusedBNormXSMM(FusedBNormImplParams* gp, int engine) : FusedBNormImpl(gp, engine) { fusedbn_desc_train[0].partN = gp->batch_size/gp->num_numa_nodes; fusedbn_desc_train[0].fullN = gp->batch_size; fusedbn_desc_train[0].C = gp->nInput[0]; fusedbn_desc_train[0].H = gp->iHeight; fusedbn_desc_train[0].W = gp->iWidth; fusedbn_desc_train[0].u = gp->stride_h; fusedbn_desc_train[0].v = gp->stride_w; fusedbn_desc_train[0].pad_h_in = gp->ipad_h; fusedbn_desc_train[0].pad_w_in = gp->ipad_w; fusedbn_desc_train[0].pad_h_out = gp->pad_h; fusedbn_desc_train[0].pad_w_out = gp->pad_w; fusedbn_desc_train[0].threads = gp->num_threads/gp->num_numa_nodes; if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { fusedbn_desc_train[0].datatype_in = LIBXSMM_DNN_DATATYPE_F32; fusedbn_desc_train[0].datatype_out = LIBXSMM_DNN_DATATYPE_F32; } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { fusedbn_desc_train[0].datatype_in = LIBXSMM_DNN_DATATYPE_BF16; fusedbn_desc_train[0].datatype_out = LIBXSMM_DNN_DATATYPE_BF16; } fusedbn_desc_train[0].datatype_stats = 
LIBXSMM_DNN_DATATYPE_F32; fusedbn_desc_train[0].buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fusedbn_desc_train[0].fuse_order = LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU; fusedbn_desc_train[0].fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED; if(gp->relu) fusedbn_desc_train[0].fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_RELU_WITH_MASK; if(gp->eltwise) fusedbn_desc_train[0].fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_ELTWISE; if(gp->relu && gp->eltwise) fusedbn_desc_train[0].fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_ELTWISE_RELU_WITH_MASK; for(int n=0; nnum_numa_nodes; n++) { libxsmm_handle_train[0][n] = libxsmm_dnn_create_fusedbatchnorm( fusedbn_desc_train[0], &status ); CHKERR_LIBXSMM_DNN( status ); } fusedbn_desc_train[1].partN = gp->batch_size/gp->num_numa_nodes; fusedbn_desc_train[1].fullN = gp->batch_size; fusedbn_desc_train[1].C = gp->nInput[0]; fusedbn_desc_train[1].H = gp->iHeight; fusedbn_desc_train[1].W = gp->iWidth; fusedbn_desc_train[1].u = gp->stride_h; fusedbn_desc_train[1].v = gp->stride_w; fusedbn_desc_train[1].pad_h_in = gp->ipad_h; fusedbn_desc_train[1].pad_w_in = gp->ipad_w; fusedbn_desc_train[1].pad_h_out = gp->pad_h; fusedbn_desc_train[1].pad_w_out = gp->pad_w; fusedbn_desc_train[1].threads = gp->num_threads/gp->num_numa_nodes; if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { fusedbn_desc_train[1].datatype_in = LIBXSMM_DNN_DATATYPE_F32; fusedbn_desc_train[1].datatype_out = LIBXSMM_DNN_DATATYPE_F32; } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { fusedbn_desc_train[1].datatype_in = LIBXSMM_DNN_DATATYPE_BF16; fusedbn_desc_train[1].datatype_out = LIBXSMM_DNN_DATATYPE_BF16; } fusedbn_desc_train[1].datatype_stats = LIBXSMM_DNN_DATATYPE_F32; fusedbn_desc_train[1].buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fusedbn_desc_train[1].fuse_order = LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU; fusedbn_desc_train[1].fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE; if(gp->relu) 
fusedbn_desc_train[1].fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_RELU_WITH_MASK; if(gp->eltwise) fusedbn_desc_train[1].fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE; if(gp->relu && gp->eltwise) fusedbn_desc_train[1].fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE_RELU_WITH_MASK; for(int n=0; nnum_numa_nodes; n++) { libxsmm_handle_train[1][n] = libxsmm_dnn_create_fusedbatchnorm( fusedbn_desc_train[1], &status ); CHKERR_LIBXSMM_DNN( status ); } fusedbn_desc_test.partN = gp->batch_size/gp->num_numa_nodes; fusedbn_desc_test.fullN = gp->batch_size; fusedbn_desc_test.C = gp->nInput[0]; fusedbn_desc_test.H = gp->iHeight; fusedbn_desc_test.W = gp->iWidth; fusedbn_desc_test.u = gp->stride_h; fusedbn_desc_test.v = gp->stride_w; fusedbn_desc_test.pad_h_in = gp->ipad_h; fusedbn_desc_test.pad_w_in = gp->ipad_w; fusedbn_desc_test.pad_h_out = gp->pad_h; fusedbn_desc_test.pad_w_out = gp->pad_w; fusedbn_desc_test.threads = gp->num_threads/gp->num_numa_nodes; if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { fusedbn_desc_test.datatype_in = LIBXSMM_DNN_DATATYPE_F32; fusedbn_desc_test.datatype_out = LIBXSMM_DNN_DATATYPE_F32; } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { fusedbn_desc_test.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; fusedbn_desc_test.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; } fusedbn_desc_test.datatype_stats = LIBXSMM_DNN_DATATYPE_F32; fusedbn_desc_test.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fusedbn_desc_test.fuse_order = LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU; fusedbn_desc_test.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE; if(gp->relu) fusedbn_desc_test.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_RELU_WITH_MASK; if(gp->eltwise) fusedbn_desc_test.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE; if(gp->relu && gp->eltwise) fusedbn_desc_test.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE_RELU_WITH_MASK; for(int n=0; nnum_numa_nodes; n++) { libxsmm_handle_test[n] = libxsmm_dnn_create_fusedbatchnorm( 
fusedbn_desc_test, &status ); CHKERR_LIBXSMM_DNN( status ); } } void FusedBNormXSMM::forwardPropagate(vector inpb, TensorBuf *gammapb, TensorBuf *betapb, TensorBuf *meanpb, TensorBuf *varpb, TensorBuf *outpb, int tid) { int nImg = gp->batch_size/gp->num_numa_nodes; int nFM = gp->nInput[0]; int nBfm = nFM/VLEN; int ifh = gp->iHeight; int ifw = gp->iWidth; int iph = gp->ipad_h; int ipw = gp->ipad_w; int ifhp = ifh +2*iph; int ifwp = ifw + 2*ipw; int ofh = gp->oHeight; int ofw = gp->oWidth; int oph = gp->pad_h; int opw = gp->pad_w; int ofhp = ofh + 2*oph; int ofwp = ofw + 2*opw; void *inp_r[NUM_NUMA_NODES]; void *inp_l[NUM_NUMA_NODES]; void *output[NUM_NUMA_NODES]; float *gamma[NUM_NUMA_NODES]; float *beta[NUM_NUMA_NODES]; float *gexpect[NUM_NUMA_NODES]; float *gvar[NUM_NUMA_NODES]; float *gexp_test = (float*)meanpb->getPrivBuffer(); float *gvar_test = (float*)varpb->getPrivBuffer(); inp_r[0] = inpb[0]->getBuffer(); int imoff = nImg*nFM*ifhp*ifwp; if(gp->in_data_type == DT_FLOAT) imoff = imoff*sizeof(float); else if(gp->in_data_type == DT_BF16) imoff = imoff*sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) inp_r[n] = inp_r[n-1] + imoff; inp_l[0] = gp->eltwise ? 
inpb[1]->getBuffer() : NULL; if(inp_l[0]) { imoff = nImg*gp->nInput[1]*ifhp*ifwp; if(gp->in_data_type == DT_FLOAT) imoff = imoff*sizeof(float); else if(gp->in_data_type == DT_BF16) imoff = imoff*sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) inp_l[n] = inp_l[n-1] + imoff; } output[0] = outpb->getBuffer(); imoff = nImg*gp->nOutput*ofhp*ofwp; if(gp->out_data_type == DT_FLOAT) imoff = imoff*sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff*sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) output[n] = output[n-1] + imoff; void **gptrptr = gammapb->getBufferPtr(); int offset = gammapb->getOffset(); for(int n=0; nnum_numa_nodes; n++) gamma[n] = (float*)gptrptr[n] + offset; void **bptrptr = betapb->getBufferPtr(); offset = betapb->getOffset(); for(int n=0; nnum_numa_nodes; n++) beta[n] = (float*)bptrptr[n] + offset; void **mptrptr = meanpb->getBufferPtr(); offset = meanpb->getOffset(); for(int n=0; nnum_numa_nodes; n++) gexpect[n] = (float*)mptrptr[n] + offset; void **vptrptr = varpb->getBufferPtr(); offset = varpb->getOffset(); for(int n=0; nnum_numa_nodes; n++) gvar[n] = (float*)vptrptr[n] + offset; for(int n=0; nnum_numa_nodes; n++) { if(bexpect[n] == NULL) { bexpect[n] = (float*)libxsmm_aligned_malloc(nFM*sizeof(float), 2097152); #ifndef NDEBUG printf("%s allocated %lu bytes for mean\n",nname.c_str(), nFM*sizeof(float)); #endif } if(bstddev[n] == NULL) { bstddev[n] = (float*)libxsmm_aligned_malloc(nFM*sizeof(float), 2097152); #ifndef NDEBUG printf("%s allocated %lu bytes for stdev\n",nname.c_str(), nFM*sizeof(float)); #endif } if(bvariance[n] == NULL) { bvariance[n] = (float*)libxsmm_aligned_malloc(nFM*sizeof(float), 2097152); #ifndef NDEBUG printf("%s allocated %lu bytes for variance\n",nname.c_str(), nFM*sizeof(float)); #endif } if(relu_mask[n] == NULL) relu_mask[n] = (void*)libxsmm_aligned_malloc(nImg*nFM*ofhp*ofwp*sizeof(unsigned char), 2097152); } if(gexp_test == NULL) { gexp_test = 
(float*)libxsmm_aligned_malloc(nFM*sizeof(float), 2097152); meanpb->setPrivBuffer((void*)gexp_test); #ifndef NDEBUG printf("%s allocated %lu bytes for mean test\n",nname.c_str(), nFM*sizeof(float)); #endif } if(gvar_test == NULL) { gvar_test = (float*)libxsmm_aligned_malloc(nFM*sizeof(float), 2097152); varpb->setPrivBuffer((void*)gvar_test); #ifndef NDEBUG printf("%s allocated %lu bytes for mean test\n",nname.c_str(), nFM*sizeof(float)); #endif } void **sptrptr = scratchp->getBufferPtr(); for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_input_train[n] == NULL && libxsmm_input_add_train[n] == NULL && libxsmm_expectval_train[n] == NULL && libxsmm_stddev_train[n] == NULL && libxsmm_variance_train[n] == NULL && libxsmm_gamma_train[n] == NULL && libxsmm_beta_train[n] == NULL && libxsmm_output_train[n] == NULL) { for(int t=0; t < 2; t++) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_train[t][n], LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input_train[n] = libxsmm_dnn_link_tensor( libxsmm_layout, inp_r[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_train[t][n], libxsmm_input_train[n], LIBXSMM_DNN_REGULAR_INPUT ) ); if(gp->eltwise) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_train[t][n], LIBXSMM_DNN_REGULAR_INPUT_ADD, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input_add_train[n] = libxsmm_dnn_link_tensor(libxsmm_layout, inp_l[n], &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_train[t][n], libxsmm_input_add_train[n], LIBXSMM_DNN_REGULAR_INPUT_ADD ) ) } libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_train[t][n], LIBXSMM_DNN_CHANNEL_EXPECTVAL, &status); 
CHKERR_LIBXSMM_DNN( status ); libxsmm_expectval_train[n] = libxsmm_dnn_link_tensor( libxsmm_layout, (void*)bexpect[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_train[t][n], libxsmm_expectval_train[n], LIBXSMM_DNN_CHANNEL_EXPECTVAL ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_train[t][n], LIBXSMM_DNN_CHANNEL_RCPSTDDEV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_stddev_train[n] = libxsmm_dnn_link_tensor( libxsmm_layout, (void*)bstddev[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_train[t][n], libxsmm_stddev_train[n], LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_train[t][n], LIBXSMM_DNN_CHANNEL_VARIANCE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_variance_train[n] = libxsmm_dnn_link_tensor( libxsmm_layout, (void*)bvariance[n], &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_train[t][n], libxsmm_variance_train[n], LIBXSMM_DNN_CHANNEL_VARIANCE ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_train[t][n], LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_gamma_train[n] = libxsmm_dnn_link_tensor( libxsmm_layout, (void*)gamma[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_train[t][n], libxsmm_gamma_train[n], LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_train[t][n], 
LIBXSMM_DNN_REGULAR_CHANNEL_BETA, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_beta_train[n] = libxsmm_dnn_link_tensor( libxsmm_layout, (void*)beta[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_train[t][n], libxsmm_beta_train[n], LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_train[t][n], LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output_train[n] = libxsmm_dnn_link_tensor( libxsmm_layout, output[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_train[t][n], libxsmm_output_train[n], LIBXSMM_DNN_REGULAR_OUTPUT ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_train[t][n], LIBXSMM_DNN_RELU_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_relumask_train[n] = libxsmm_dnn_link_tensor( libxsmm_layout, relu_mask[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_train[t][n], libxsmm_relumask_train[n], LIBXSMM_DNN_RELU_MASK) ); } } } /* let's allocate (if required) and bind scratch */ if(sptrptr == NULL) { sptrptr = (void**)libxsmm_aligned_malloc(gp->num_numa_nodes*sizeof(void*), 2097152); scratchp->setBufferPtr(sptrptr); } int max_size = 0; for(int n=0; nnum_numa_nodes; n++) { if(sptrptr[n] == NULL) { long long int mysize = libxsmm_dnn_fusedbatchnorm_get_scratch_size( libxsmm_handle_train[0][n], &status ); CHKERR_LIBXSMM_DNN( status ); sptrptr[n] = (void*)libxsmm_aligned_malloc(mysize , 2097152); max_size = mysize; #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %lld bytes for scratch @ 
%p\n",nname.c_str(), mysize, sptrptr[n]); } else { long long int ssize = scratchp->getBufferSize(); long long int mysize = libxsmm_dnn_fusedbatchnorm_get_scratch_size( libxsmm_handle_train[0][n], &status ); CHKERR_LIBXSMM_DNN( status ); if(ssize < mysize) { libxsmm_free(sptrptr[n]); sptrptr[n] = (void*)libxsmm_aligned_malloc(mysize, 2097152); scratchp->setBufferSize(mysize); #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %lld bytes for scratch @ %p, prev size was %lld bytes\n",nname.c_str(), mysize, sptrptr[n], ssize); } else max_size = ssize; } } scratchp->setBufferSize(max_size); for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_input_test[n] == NULL && libxsmm_input_add_test[n] == NULL && libxsmm_expectval_test[n] == NULL && libxsmm_stddev_test[n] == NULL && libxsmm_variance_test[n] == NULL && libxsmm_gamma_test[n] == NULL && libxsmm_beta_test[n] == NULL && libxsmm_output_test[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_test[n], LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, inp_r[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_test[n], libxsmm_input_test[n], LIBXSMM_DNN_REGULAR_INPUT ) ); if(gp->eltwise) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_test[n], LIBXSMM_DNN_REGULAR_INPUT_ADD, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input_add_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, inp_l[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_test[n], libxsmm_input_add_test[n], LIBXSMM_DNN_REGULAR_INPUT_ADD ) ) } libxsmm_layout = 
libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_test[n], LIBXSMM_DNN_CHANNEL_EXPECTVAL, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_expectval_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, (void*)bexpect[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_test[n], libxsmm_expectval_test[n], LIBXSMM_DNN_CHANNEL_EXPECTVAL ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_test[n], LIBXSMM_DNN_CHANNEL_RCPSTDDEV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_stddev_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, (void*)bstddev[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_test[n], libxsmm_stddev_test[n], LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_test[n], LIBXSMM_DNN_CHANNEL_VARIANCE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_variance_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, (void*)bvariance[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_test[n], libxsmm_variance_test[n], LIBXSMM_DNN_CHANNEL_VARIANCE ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_test[n], LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_gamma_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, (void*)gamma[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_test[n], libxsmm_gamma_test[n], LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) ); libxsmm_layout = 
libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_test[n], LIBXSMM_DNN_REGULAR_CHANNEL_BETA, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_beta_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, (void*)beta[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_test[n], libxsmm_beta_test[n], LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_test[n], LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, output[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_test[n], libxsmm_output_test[n], LIBXSMM_DNN_REGULAR_OUTPUT ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_test[n], LIBXSMM_DNN_RELU_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_relumask_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, relu_mask[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_test[n], libxsmm_relumask_test[n], LIBXSMM_DNN_RELU_MASK) ); } } if(!updated_scratch_fwd) { for(int n=0; nnum_numa_nodes; n++) { for(int t=0; t < 2; t++) CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_scratch( libxsmm_handle_train[t][n], sptrptr[n] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_scratch( libxsmm_handle_test[n], sptrptr[n] ) ); } updated_scratch_fwd = true; } #if 0 #ifndef NDEBUG if ( (oph > 0 || opw > 0) && (iph > 0 || ipw > 0) ) { printf("node %s: batchnorm forward input and output is padded which cannot be :-(\n", nname.c_str()); } /* check rims */ if(gp->in_data_type == DT_FLOAT && 
gp->out_data_type == DT_FLOAT) { check_physical_pad( nname.c_str(), (float*)inp_r[0], nImg, nBfm, ifh, ifw, VLEN, iph, ipw ); check_physical_pad( nname.c_str(), (float*)output[0], nImg, nBfm, ofh, ofw, VLEN, oph, opw ); } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)inp_r[0], nImg, nBfm, ifh, ifw, VLEN, iph, ipw ); check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)output[0], nImg, nBfm, ofh, ofw, VLEN, oph, opw ); } #endif #endif if(!use_global_stats) { #if defined(_OPENMP) #pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle_train[0][n], LIBXSMM_DNN_COMPUTE_KIND_FWD, n*ntps, tid ) ); #pragma omp barrier if(n == 0) CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_reduce_stats_st(&libxsmm_handle_train[0][0], gp->num_numa_nodes, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid) ); #pragma omp barrier CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle_train[1][n], LIBXSMM_DNN_COMPUTE_KIND_FWD, n*ntps, tid ) ); } #if 0 #ifndef NDEBUG if ( (oph > 0 || opw > 0) && (iph > 0 || ipw > 0) ) { printf("node %s: batchnorm forward input and output is padded which cannot be :-(\n", nname.c_str()); } /* check rims */ if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { check_physical_pad( nname.c_str(), (float*)inp_r[0], nImg, nBfm, ifh, ifw, VLEN, iph, ipw ); check_physical_pad( nname.c_str(), (float*)output[0], nImg, nBfm, ofh, ofw, VLEN, oph, opw ); } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)inp_r[0], nImg, nBfm, ifh, ifw, VLEN, iph, ipw ); check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)output[0], nImg, nBfm, ofh, ofw, VLEN, oph, opw ); } #endif #endif if(gp->exec_mode == 
"TRAIN") { for(int n=0; nnum_numa_nodes; n++) { float *gexp = (float*)gexpect[n]; float *gv = (float*)gvar[n]; float (* __restrict bmean)[VLEN] = (float (*)[VLEN])bexpect[n]; float (* __restrict bvar)[VLEN] = (float (*)[VLEN])bvariance[n]; float nhw_ratio = float(nImg*ifh*ifw)/float(nImg*ifh*ifw - 1); #ifdef __AVX512F__ __m512 vmmf = _mm512_set1_ps(gp->mmf); __m512 vnhw_ratio = _mm512_set1_ps(nhw_ratio); for (int b = 0; b < nBfm; ++b) { __m512 vbm = _mm512_loadu_ps(&bmean[b][0]); __m512 vbvar = _mm512_loadu_ps(&bvar[b][0]); _mm512_storeu_ps( &(gexp[b*VLEN]), _mm512_add_ps(_mm512_mul_ps(_mm512_loadu_ps( &(gexp[b*VLEN]) ), vmmf), vbm)); _mm512_storeu_ps( &(gv[b*VLEN]), _mm512_add_ps( _mm512_mul_ps( _mm512_loadu_ps( &(gv[b*VLEN]) ), vmmf), _mm512_mul_ps(vnhw_ratio, vbvar))); } #else for (int b = 0; b < nBfm; ++b) { #pragma omp simd for (int v = 0; v < 16; ++v) { gexp[(b*16)+v] = gexp[(b*16)+v] * gp->mmf + bmean[b][v]; gv[(b*16)+v] = gv[(b*16)+v] * gp->mmf + nhw_ratio*bvar[b][v]; } } #endif } scaling_factor_ *= gp->mmf; scaling_factor_ += 1.; } } else { for(int n=0; nnum_numa_nodes; n++) { float *gexp = (float*)gexpect[n]; float *gv = (float*)gvar[n]; #pragma omp simd for(int i=0; i < nFM; i++) { ((float*)bexpect[n])[i] = gexp[i]/scaling_factor_; float tmp = gv[i]/scaling_factor_; ((float*)bstddev[n])[i] = 1./sqrt(tmp + gp->eps); } } #if defined(_OPENMP) #pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle_test[n], LIBXSMM_DNN_COMPUTE_KIND_FWD, n*ntps, tid ) ); } } } void FusedBNormXSMM::backPropagate(TensorBuf *deloutpb, TensorBuf *delgammapb, TensorBuf *delbetapb, vector delinpb, int tid) { int nImg = gp->batch_size/gp->num_numa_nodes; int nFM = gp->nOutput; int nBfm = nFM/VLEN; int ofh = gp->oHeight; int ofw = gp->oWidth; int oph = gp->pad_h; int opw = gp->pad_w; 
int ofhp = ofh + 2*oph; int ofwp = ofw + 2*opw; int ifh = gp->iHeight; int ifw = gp->iWidth; int iph = gp->ipad_h; int ipw = gp->ipad_w; int ifhp = ifh + 2*iph; int ifwp = ifw + 2*ipw; int sh = gp->stride_h; int sw = gp->stride_w; int imoff; void *deloutput[NUM_NUMA_NODES]; void *delinp_r[NUM_NUMA_NODES]; void *delinp_l[NUM_NUMA_NODES]; void *delgamma[NUM_NUMA_NODES]; void *delbeta[NUM_NUMA_NODES]; deloutput[0] = deloutpb->getBuffer(); imoff = nImg * nFM * ofhp * ofwp; if(gp->out_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) deloutput[n] = deloutput[n-1] + imoff; delinp_r[0] = delinpb[0]->getBuffer(); imoff = nImg * gp->nInput[0] * ifhp * ifwp; if(gp->in_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->in_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) delinp_r[n] = delinp_r[n-1] + imoff; delinp_l[0] = gp->eltwise ? delinpb[1]->getBuffer() : NULL; if(delinp_l[0]) { imoff = nImg * gp->nInput[1] * ifhp * ifwp; if(gp->in_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->in_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) delinp_l[n] = delinp_l[n-1] + imoff; } void **gptrptr = delgammapb->getBufferPtr(); int offset = delgammapb->getOffset() * sizeof(float); for(int n=0; nnum_numa_nodes; n++) delgamma[n] = gptrptr[n] + offset; void **bptrptr = delbetapb->getBufferPtr(); offset = delbetapb->getOffset() * sizeof(float); for(int n=0; nnum_numa_nodes; n++) delbeta[n] = bptrptr[n] + offset; #if 0 for(int n=0; nnum_numa_nodes; n++) { if(gp->in_data_type == DT_FLOAT) { float (* __restrict del_input_r)[nBfm][ifhp][ifwp][64] = (float (*)[*][*][*][64])delinp_r[n]; /* zero the rims in case of physical padding */ /* @TODO, we need to do the same thing with del_input_l?! 
*/ if (iph > 0 || ipw > 0) { #pragma omp parallel for for (int img = 0; img < nImg; img++) { for (int fm = 0; fm < nBfm; fm++) { for (int w = 0; w < ifwp; w++) { for (int ph = 0; ph < iph; ph++) { #ifdef __AVX512F__ for(int i=0; i<64; i+=16) { _mm512_stream_ps( &(del_input_r[img][fm][ph ][w][i]), _mm512_setzero_ps() ); _mm512_stream_ps( &(del_input_r[img][fm][ifhp-1-ph][w][i]), _mm512_setzero_ps() ); } #else #pragma omp simd #pragma vector aligned #ifdef USE_NTS_BN #pragma vector nontemporal #endif for(int v=0; v < 64; v++) { del_input_r[img][fm][ph][w][v] = 0.0f; del_input_r[img][fm][ifhp-1-ph][w][v] = 0.0f; } #endif } for (int h = iph; h < ifh+iph; h++) { for (int pw = 0; pw < ipw; pw++) { #ifdef __AVX512F__ for(int i=0; i<64; i+=16) { _mm512_stream_ps( &(del_input_r[img][fm][h][pw ][i]), _mm512_setzero_ps() ); _mm512_stream_ps( &(del_input_r[img][fm][h][ifwp-1-pw][i]), _mm512_setzero_ps() ); } #else #pragma omp simd #pragma vector aligned #ifdef USE_NTS_BN #pragma vector nontemporal #endif for(int v=0; v < 64; v++) { del_input_r[img][fm][h][pw][v] = 0.0f; del_input_r[img][fm][h][ifwp-1-pw][v] = 0.0f; } #endif } } } } } } } else if(gp->in_data_type == DT_BF16) { libxsmm_bfloat16 (* __restrict del_input_r)[nBlocksFm][ifhp][ifwp][64] = (libxsmm_bfloat16 (*)[*][*][*][64])delinp_r[n]; /* zero the rims in case of physical padding */ /* @TODO, we need to do the same thing with del_input_l?! 
*/ if (iph > 0 || ipw > 0) { #pragma omp parallel for for (int img = 0; img < nImg; img++) { for (int fm = 0; fm < nBlocksFm; fm++) { for (int w = 0; w < ifwp; w++) { for (int ph = 0; ph < iph; ph++) { #pragma omp simd #pragma vector aligned #ifdef USE_NTS_BN #pragma vector nontemporal #endif for(int v=0; v < 64; v++) { del_input_r[img][fm][ph][w][v] = 0; del_input_r[img][fm][ifhp-1-ph][w][v] = 0; } } } for (int h = iph; h < ifh+iph; h++) { for (int pw = 0; pw < ipw; pw++) { #pragma omp simd #pragma vector aligned #ifdef USE_NTS_BN #pragma vector nontemporal #endif for(int v=0; v < 64; v++) { del_input_r[img][fm][h][pw][v] = 0; del_input_r[img][fm][h][ifwp-1-pw][v] = 0; } } } } } } } } #endif /* Perform physical padding tests */ #if 0 #ifndef NDEBUG if ( (oph > 0 || opw > 0) && (iph > 0 || ipw > 0) ) { printf("node %s: batchnorm backward input and output is padded which cannot be :-(\n", nname.c_str()); } /* check rims */ if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { check_physical_pad( nname.c_str(), (float*)delinp_r[0], nImg, nBfm, ifh, ifw, VLEN, iph, ipw ); check_physical_pad( nname.c_str(), (float*)deloutput[0], nImg, nBfm, ofh, ofw, VLEN, oph, opw ); } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)delinp_r[0], nImg, nBfm, ifh, ifw, VLEN, iph, ipw ); check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)deloutput[0], nImg, nBfm, ofh, ofw, VLEN, oph, opw ); } #endif #endif void **sptrptr = scratchp->getBufferPtr(); if(!updated_scratch_bwd) { for(int n=0; nnum_numa_nodes; n++) for(int t=0; t<2; t++) CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_scratch( libxsmm_handle_train[t][n], sptrptr[n] ) ); updated_scratch_bwd = true; } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_deloutput[n] == NULL && libxsmm_delinput[n] == NULL && libxsmm_delinput_add[n] == NULL && libxsmm_delgamma[n] == NULL && libxsmm_delbeta[n] == NULL) { for(int t=0; t < 2; t++) { 
libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_train[t][n], LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput[n] = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_train[t][n], libxsmm_deloutput[n], LIBXSMM_DNN_GRADIENT_OUTPUT ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_train[t][n], LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delinp_r[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_train[t][n], libxsmm_delinput[n], LIBXSMM_DNN_GRADIENT_INPUT ) ); if(gp->eltwise) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_train[t][n], LIBXSMM_DNN_GRADIENT_INPUT_ADD, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput_add[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delinp_l[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_train[t][n], libxsmm_delinput_add[n], LIBXSMM_DNN_GRADIENT_INPUT_ADD ) ); } libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_train[t][n], LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_delgamma[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delgamma[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_train[t][n], libxsmm_delgamma[n], LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) ); libxsmm_layout = 
libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_train[t][n], LIBXSMM_DNN_GRADIENT_CHANNEL_BETA, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_delbeta[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delbeta[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_train[t][n], libxsmm_delbeta[n], LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) ); } } } #if defined(_OPENMP) #pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle_train[0][n], LIBXSMM_DNN_COMPUTE_KIND_BWD, n*ntps, tid ) ); #pragma omp barrier if(n == 0) CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_reduce_stats_st(&libxsmm_handle_train[0][0], gp->num_numa_nodes, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid)); #pragma omp barrier CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle_train[1][n], LIBXSMM_DNN_COMPUTE_KIND_BWD, n*ntps, tid ) ); } /* Perform physical padding tests */ #if 0 #ifndef NDEBUG if ( (oph > 0 || opw > 0) && (iph > 0 || ipw > 0) ) { printf("node %s: batchnorm backward input and output is padded which cannot be :-(\n", nname.c_str()); } /* check rims */ if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { check_physical_pad( nname.c_str(), (float*)delinp_r[0], nImg, nBfm, ifh, ifw, VLEN, iph, ipw ); check_physical_pad( nname.c_str(), (float*)deloutput[0], nImg, nBfm, ofh, ofw, VLEN, oph, opw ); } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)delinp_r[0], nImg, nBfm, ifh, ifw, VLEN, iph, ipw ); check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)deloutput[0], nImg, nBfm, ofh, ofw, VLEN, oph, opw ); } #endif #endif } 
libxsmm-1.17/samples/deeplearning/gxm/src/FusedConvBN.cpp000066400000000000000000001147541415223013700234320ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar, Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include "FusedConvBN.hpp" #include "fillers.hpp" #ifdef USE_MLSL #include "mpi.h" #endif using namespace std; using namespace gxm; FusedConvBNNode::FusedConvBNNode(FusedConvBNParams* p, MLEngine* e): NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); bottom_ = p->get_bottom_names(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = true; bot_compute_engine_ = p->get_compute_engine(); tenTop_ = new Tensor(top_[0]); assert(tenTop_ != NULL); tenTop_->setOwner(this); tenTop_->setType(ACT); tenTopData_ = tenTop_->getBuf(DATA); tenTopData_->setBufferType(DATA); tenMid_ = new Tensor("mid_"+top_[0]); assert(tenMid_ != NULL); tenMid_->setOwner(this); tenMid_->setType(ACT); tenMidData_ = tenMid_->getBuf(DATA); tenMidData_->setBufferType(DATA); tenBot_.resize(bottom_.size()); tenBotData_.resize(bottom_.size()); for(int i=0; i < bottom_.size(); i++) { #ifndef NDEBUG printf("bottom%d name %s\n",i,bottom_[i].c_str()); #endif if(bottom_[i] == "data") tenBot_[i] = e->get_tensor(bottom_[i], INPUT); else tenBot_[i] = e->get_tensor(bottom_[i], ACT); assert(tenBot_[i] != NULL); NNNode *pnn = (NNNode*)tenBot_[i]->getOwner(); setPrevNode(pnn); mode_ = pnn->getMode(); pnn->set_top_compute_engine(p->get_compute_engine()); 
bot_cengine_ = pnn->get_bot_compute_engine(); tenBotData_[i] = tenBot_[i]->getBuf(DATA); } in_dtype = tenBotData_[0]->getDataType(); out_dtype = p->get_data_type(); tenTopData_->setDataType(out_dtype); // Get input tensor shape (bottom) Shape* bs = tenBot_[0]->getShape(); assert(bs->ndims <= MAX_DIMS); // Create shape of output tensor (top) vector vd = p->get_kernel_dims(); vector mvp = p->get_mid_pads(); vector ovp = p->get_top_pads(); vector ivp = p->get_bot_pads(); vector vcs = p->get_c_strides(); vector vbns = p->get_bn_strides(); shape_setzero(&ms_); ms_.ndims = bs->ndims; // Number of dimensions ms_.dims[0] = bs->dims[0]; // Minibatch size ms_.dims[1] = p->get_output(); // Num output feature maps ms_.dims[2] = (bs->dims[2] - vd[0] + 2*ivp[0])/vcs[0] + 1; // Height ms_.dims[3] = (bs->dims[3] - vd[1] + 2*ivp[1])/vcs[1] + 1; // Width tenMid_->setShape(&ms_); shape_setzero(&ts_); ts_.ndims = bs->ndims; // Number of dimensions ts_.dims[0] = bs->dims[0]; // Minibatch size ts_.dims[1] = p->get_output(); // Num output feature maps ts_.dims[2] = ms_.dims[2]/vbns[0]; // Height ts_.dims[3] = ms_.dims[3]/vbns[1]; // Width tenTop_->setShape(&ts_); long long int tsize; int convelem = ms_.dims[0] * ms_.dims[1] * (ms_.dims[2] + 2*mvp[0]) * (ms_.dims[3] + 2*mvp[1]); int bnelem = ts_.dims[0] * ts_.dims[1] * (ts_.dims[2] + 2*ovp[0]) * (ts_.dims[3] + 2*ovp[1]); int telem = convelem + bnelem; if(out_dtype == DT_FLOAT) tsize = telem*sizeof(float); else if(out_dtype = DT_BF16) tsize = telem*sizeof(libxsmm_bfloat16); tenTopData_->setBufferSize(tsize); // Create FP weight tensor weight_ = top_[0] + "_wt"; tenWeight_ = new Tensor(weight_); assert(tenWeight_ != NULL); tenWeight_->setOwner(this); tenWeight_->setType(CONVWEIGHT); shape_setzero(&ws_); ws_.ndims = ts_.ndims; // Number of dimesions ws_.dims[0] = ms_.dims[1]; // Num output feature maps (from mid tensor) ws_.dims[1] = bs->dims[1]; // Num input feature maps (from bottom tensor) ws_.dims[2] = vd[0]; // Kernel height ws_.dims[3] 
= vd[1]; // Kernel width tenWeight_->setShape(&ws_); tenWeight_->setBufDataType(DATA, DT_FLOAT); tenWeightData_ = tenWeight_->getBuf(DATA); tenWeightData_->setBufferType(DATA); int welem = 1; long long int wsize; for(int i=0; isetBufferSize(wsize); wfiller_type_ = p->get_weight_filler_type(); variance_norm_ = p->get_variance_norm(); std_ = p->get_std(); lr_mult_ = p->get_lr_mult(); decay_mult_ = p->get_decay_mult(); Shape sss; shape_setzero(&sss); sss.ndims = 1; sss.dims[0] = ts_.dims[1]; scale_ = top_[0] + "_scale"; tenScale_ = new Tensor(scale_); assert(tenScale_ != NULL); tenScale_->setOwner(this); tenScale_->setType(BNORMSCALE); tenScale_->setShape(&sss); tenScaleData_ = tenScale_->getBuf(DATA); tenScaleData_->setDataType(DT_FLOAT); tenScaleData_->setBufferType(DATA); telem = sss.dims[0]; tsize = telem*sizeof(float); tenScaleData_->setBufferSize(tsize); shift_ = top_[0] + "_shift"; tenShift_ = new Tensor(shift_); assert(tenShift_ != NULL); tenShift_->setOwner(this); tenShift_->setType(BNORMSHIFT); tenShift_->setShape(&sss); tenShiftData_ = tenShift_->getBuf(DATA); tenShiftData_->setDataType(DT_FLOAT); tenShiftData_->setBufferType(DATA); tenShiftData_->setBufferSize(tsize); mean_ = top_[0] + "_mean"; tenMean_ = new Tensor(mean_); assert(tenMean_ != NULL); tenMean_->setOwner(this); tenMean_->setType(BNORMMEAN); tenMean_->setShape(&sss); tenMeanData_ = tenMean_->getBuf(DATA); tenMeanData_->setDataType(DT_FLOAT); tenMeanData_->setBufferType(DATA); tenMeanData_->setBufferSize(tsize); var_ = top_[0] + "_var"; tenVar_ = new Tensor(var_); assert(tenVar_ != NULL); tenVar_->setOwner(this); tenVar_->setType(BNORMVAR); tenVar_->setShape(&sss); tenVarData_ = tenVar_->getBuf(DATA); tenVarData_->setDataType(DT_FLOAT); tenVarData_->setBufferType(DATA); tenVarData_->setBufferSize(tsize); if(!e->is_inference_only()) { if(bp_flag_) { tenBotDiff_.resize(bottom_.size()); for(int i=0; iaddBuf(); // DIFF type and index tenBotDiff_[i]->setDataType(in_dtype); 
tenBotDiff_[i]->setBufferType(DIFF); // Set the size of the input-gradient buffer Shape *bs = tenBot_[i]->getShape(); int botelem = bs->dims[0] * bs->dims[1] * (bs->dims[2] + 2*ivp[0]) * (bs->dims[3] + 2*ivp[1]); if(in_dtype == DT_FLOAT) tenBotDiff_[i]->setBufferSize((botelem + convelem)*sizeof(float)); else if(in_dtype == DT_BF16) tenBotDiff_[i]->setBufferSize((botelem + convelem)*sizeof(libxsmm_bfloat16)); } tenMidDiff_ = tenMid_->addBuf(); // DIFF type and index tenMidDiff_->setDataType(in_dtype); tenMidDiff_->setBufferType(DIFF); } if(has_weights_) { if(tenMidDiff_ == NULL) { tenMidDiff_ = tenMid_->addBuf(); // DIFF type and index tenMidDiff_->setDataType(in_dtype); tenMidDiff_->setBufferType(DIFF); if(in_dtype == DT_FLOAT) tenMidDiff_->setBufferSize(convelem*sizeof(float)); else if(in_dtype == DT_BF16) tenMidDiff_->setBufferSize(convelem*sizeof(libxsmm_bfloat16)); } tenWeightDiff_ = tenWeight_->addBuf(); // DIFF type and index tenWeightDiff_->setBufferType(DIFF); tenWeightInc_ = tenWeight_->addBuf(); // SHARED type and index tenWeightInc_->setDataType(DT_FLOAT); tenWeightInc_->setBufferType(HISTORY); tenWeightInc_->setBufferSize(welem*sizeof(float)); // Set the size of the weight-gradient buffer and the weight-increment buffer if(in_dtype == DT_FLOAT) { tenWeightDiff_->setDataType(DT_FLOAT); tenWeightDiff_->setBufferSize(welem*sizeof(float)); } else if(in_dtype == DT_BF16) { tenWeightDiff_->setDataType(DT_BF16); tenWeightDiff_->setBufferSize(welem*sizeof(libxsmm_bfloat16)); } tenScaleDiff_ = tenScale_->addBuf(); tenScaleDiff_->setDataType(DT_FLOAT); tenScaleDiff_->setBufferType(DIFF); tenScaleDiff_->setBufferSize(tsize); tenScaleInc_ = tenScale_->addBuf(); tenScaleInc_->setDataType(DT_FLOAT); tenScaleInc_->setBufferType(HISTORY); tenScaleInc_->setBufferSize(tsize); tenShiftDiff_ = tenShift_->addBuf(); tenShiftDiff_->setDataType(DT_FLOAT); tenShiftDiff_->setBufferType(DIFF); tenShiftDiff_->setBufferSize(tsize); tenShiftInc_ = tenShift_->addBuf(); 
tenShiftInc_->setDataType(DT_FLOAT); tenShiftInc_->setBufferType(HISTORY); tenShiftInc_->setBufferSize(tsize); } } else { tenMidDiff_ = NULL; tenWeightDiff_ = NULL; tenWeightInc_ = NULL; tenScaleDiff_ = NULL; tenShiftDiff_ = NULL; tenScaleInc_ = NULL; tenShiftInc_ = NULL; } // Register output tensor in tensor map bool inserted = e->register_tensor(top_[0], ACT, tenTop_); if(!inserted) printf("Warning: Tensor %s already registered\n",top_[0].c_str()); string m = "mid_"+top_[0]; inserted = e->register_tensor(m, ACT, tenMid_); if(!inserted) printf("Warning: Tensor %s already registered\n",m.c_str()); // Register weight tensor in weight tensor map inserted = e->register_tensor(weight_, CONVWEIGHT, tenWeight_); if(!inserted) printf("Warning: Tensor %s already registered\n",weight_.c_str()); inserted = e->register_tensor(scale_, BNORMSCALE, tenScale_); if(!inserted) printf("Warning: Tensor %s already registered\n",scale_.c_str()); inserted = e->register_tensor(shift_, BNORMSHIFT, tenShift_); if(!inserted) printf("Warning: Tensor %s already registered\n",shift_.c_str()); inserted = e->register_tensor(mean_, BNORMMEAN, tenMean_); if(!inserted) printf("Warning: Tensor %s already registered\n",mean_.c_str()); inserted = e->register_tensor(var_, BNORMVAR, tenVar_); if(!inserted) printf("Warning: Tensor %s already registered\n",var_.c_str()); // Setup parameter structure for convolution computation in library gparams_.bdims = bs->ndims; gparams_.tdims = ts_.ndims; gparams_.mdims = ms_.ndims; gparams_.wdims = ws_.ndims; gparams_.node_name = nname_; gparams_.node_type = ntype_; gparams_.nInput.resize(bottom_.size()); if(bottom_.size() > 1) gparams_.nInput.resize(bottom_.size()); gparams_.nInput[0] = bs->dims[1]; if(bottom_.size() > 1) gparams_.nInput[1] = tenBot_[1]->getShape()->dims[1]; gparams_.nOutput = ts_.dims[1]; gparams_.batch_size = bs->dims[0]; gparams_.iHeight = bs->dims[2]; gparams_.iWidth = bs->dims[3]; gparams_.mHeight = ms_.dims[2]; gparams_.mWidth = ms_.dims[3]; 
gparams_.oHeight = ts_.dims[2]; gparams_.oWidth = ts_.dims[3]; gparams_.ipad_h = ivp[0]; gparams_.ipad_w = ivp[1]; gparams_.mpad_h = mvp[0]; gparams_.mpad_w = mvp[1]; gparams_.opad_h = ovp[0]; gparams_.opad_w = ovp[1]; gparams_.physical_padding = p->get_physical_padding(); gparams_.group = p->get_group(); gparams_.c_stride_h = vcs[0]; gparams_.c_stride_w = vcs[1]; gparams_.bn_stride_h = vbns[0]; gparams_.bn_stride_w = vbns[1]; gparams_.kh = ws_.dims[2]; gparams_.kw = ws_.dims[3]; gparams_.relu_fwd = p->get_relu_fwd(); gparams_.relu_bwd = p->get_relu_bwd(); gparams_.mmf = p->get_mmf(); gparams_.eps = p->get_eps(); gparams_.use_global_stats = p->get_global_stats_flag(); gparams_.eltwise = p->get_eltwise(); gparams_.bprop = bp_flag_; gparams_.in_data_type = in_dtype; gparams_.out_data_type = out_dtype; gparams_.algType = p->get_algo_type(); gparams_.num_threads = e->get_num_threads(); // get solver solver_ = e->getSolver(); //get global scratch tensor buffer tenScratchData_ = e->getScratchBuffer(); // get engine eptr_ = e; #ifdef USE_MLSL MLSL::DataType dt = MLSL::DT_FLOAT; MLSL::OperationRegInfo *myRegInfo; MLSL::Session *s = eptr_->get_session(); myRegInfo = s->CreateOperationRegInfo(MLSL::OT_CC); myRegInfo->SetName(nname_.c_str()); myRegInfo->AddParameterSet(gparams_.nInput[0]*gparams_.nOutput/gparams_.group, gparams_.kw*gparams_.kh, dt, false); myRegInfo->AddParameterSet(gparams_.nOutput, 1, dt, false); myRegInfo->AddParameterSet(gparams_.nOutput, 1, dt, false); myRegInfo->AddParameterSet(gparams_.nOutput, 1, dt, false); myRegInfo->AddParameterSet(gparams_.nOutput, 1, dt, false); myRegInfo->Validate(); size_t opIdx = s->AddOperation(myRegInfo, e->get_distribution()); this->op_ = s->GetOperation(opIdx); s->DeleteOperationRegInfo(myRegInfo); e->get_combo_grad_comms_vec().push_back(op_); #endif configure(p->get_compute_engine()); } void FusedConvBNNode::configure(int engine) { switch(engine) { case XSMM: impl = new FusedConvBNXSMM(&gparams_, engine); break; } } void 
FusedConvBNNode::fillWeightBuffers(TensorBuf* tBuf, int buftype, long long int size) { int dtype = DT_FLOAT; void *ptr = tBuf->getBuffer(); #ifdef USE_MLSL unsigned int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else unsigned int node_id = 0; #endif int ic = gparams_.nInput[0]; int oc = gparams_.nOutput; int kh = gparams_.kh; int kw = gparams_.kw; int g = gparams_.group; int fanin = (ic * kh * kw)/g; int fanout = (oc * kh * kw)/g; int welem = ic * oc * kh * kw; if(buftype == DATA) { if(node_id == 0) initBuffer(ptr, variance_norm_, fanin, fanout, welem*sizeof(float), wfiller_type_, std_); #ifdef USE_MLSL if(dtype == DT_FLOAT) MPI_Bcast(ptr, welem, MPI_FLOAT, 0, MPI_COMM_WORLD); #endif } else if(buftype == HISTORY || buftype == DIFF) memset(ptr, 0, size); } void FusedConvBNNode::fillWeightMultipliers(float* lr, float* decay, long long int size) { for(int i=0; i < size; i++) { lr[i] = lr_mult_[0]; decay[i] = decay_mult_[0]; } } void FusedConvBNNode::fillBiasMultipliers(float* lr, float* decay, long long int size) { for(int i=0; i < size; i++) { lr[i] = lr_mult_[1]; decay[i] = decay_mult_[1]; } } void FusedConvBNNode::fillBuffer(TensorBuf* tBuf, int buftype, long long int size) { int ttype = tBuf->getTensor()->getType(); int dtype = DT_FLOAT; void *ptr = tBuf->getBuffer(); if(ttype==BNORMSCALE && buftype == DATA) { if(nname_.find("bn3") == nname_.npos) initConstantBuffer(ptr, size, "CONSTANT", 1.0f); else initConstantBuffer(ptr, size, "CONSTANT", 0.0f); } else initConstantBuffer(ptr, size, "CONSTANT", 0.0f); } void FusedConvBNNode::Checkpoint(TensorBuf *tBuf, string name, string format) { long long int bytes = tBuf->getBufferSize(); int dtype = tBuf->getDataType(); FILE* f; void* ptr; size_t pos; if((name.find("30") == name.npos) && (name.find("60") == name.npos) && (name.find("80") == name.npos)) while((pos = name.find("/", 10)) != name.npos) name.replace(pos, 1, 1, '_'); float* p = (float*)tBuf->getBuffer(); bool no_checkpt = false; for(int i=0; i<16; 
i++) { if(isnan(p[i]) || isinf(p[i])) { no_checkpt = true; printf("Warning! %s Did not checkpoint! Weights are NaNs or Inf\n", nname_.c_str()); break; } } if(!no_checkpt) { if(format.compare("binary") == 0) { f = fopen(name.c_str(), "wb"); if(f != NULL) { if(name.find("wt") != name.npos) { ptr = _mm_malloc(bytes, 64); assert(ptr != NULL); impl->dumpBuffer(tBuf, ptr); } else if(name.find("mean") != name.npos || name.find("var") != name.npos) ptr = tBuf->getPrivBuffer(); else ptr = tBuf->getBuffer(); size_t b = fwrite(ptr, 1, bytes, f); assert((long long int)b == bytes); if(name.find("wt") != name.npos) _mm_free(ptr); } else printf("Warning: could not checkpoint to file %s\n",name.c_str()); } else { f = fopen(name.c_str(), "w"); if(f != NULL) { if(name.find("wt") != name.npos) { ptr = _mm_malloc(bytes, 64); assert(ptr != NULL); impl->dumpBuffer(tBuf, ptr); } else ptr = tBuf->getBuffer(); for(int i=0; iset_top_compute_engine(top_compute_engine_); impl->set_bot_compute_engine(bot_cengine_); impl->set_node_name(nname_); impl->set_scratch_buffer(tenScratchData_); if(eptr_->get_execution_mode() == TRAIN || eptr_->get_execution_mode() == VAL) { impl->set_global_stats(false); gparams_.exec_mode = "TRAIN"; } else if(eptr_->get_execution_mode() == TEST) impl->set_global_stats(true); tenMidData_->setBuffer(tenTopData_->getBuffer()); if(out_dtype == DT_FLOAT) { float* ptr = (float*)tenMidData_->getBuffer(); int size = nImg * ofm * mfhp * mfwp; tenMidData_->setBufferSize(size*sizeof(float)); tenTopData_->setBuffer(tenTopData_->getBuffer() + size*sizeof(float)); tenTopData_->setBufferSize(tenTopData_->getBufferSize() - size*sizeof(float)); // NUMA initialize Conv output #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer(); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer(); int size = nImg * ofm * mfhp * mfwp; tenMidData_->setBufferSize(size*sizeof(libxsmm_bfloat16)); tenTopData_->setBuffer(tenTopData_->getBuffer() + 
size*sizeof(libxsmm_bfloat16)); tenTopData_->setBufferSize(tenTopData_->getBufferSize() - size*sizeof(libxsmm_bfloat16)); // NUMA initialize Conv output #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer(); size = nImg * ofm * (ofh/bnsh + 2*oph) * (ofw/bnsw + 2*opw); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; iget_scaling_factor(); impl->set_scaling_factor(scf_); first_fp = false; } impl->forwardPropagate(tenBotData_, tenWeightData_, tenWeightInc_, tenMidData_, tenScaleData_, tenShiftData_, tenMeanData_, tenVarData_, tenTopData_, 0); if(eptr_->get_execution_mode() != TEST && eptr_->get_execution_mode() != VAL) { scf_ *= gparams_.mmf; scf_ += 1.; eptr_->set_scaling_factor(scf_); } #ifdef CHECK_BLOWUP_FP32 if(out_dtype == DT_FLOAT) { for(int i=0; i<10240; i++) { float v = ((float*)tenTopData_->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(out_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)tenMidData_->getBuffer(), cbptr, 10240); for(int i=0; i<10240; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! %s layer FP mid activations are NaN or Inf\n", nname_.c_str()); libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenMidData_->getBuffer(); printf("cbptr[%d] = %d, cbptr[%d] = %f\n",i,ptr[i],i,cbptr[i]); exit(-1); } } convert_bf16_f32((libxsmm_bfloat16*)tenTopData_->getBuffer(), cbptr, 10240); for(int i=0; i<10240; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! 
%s layer FP activations are NaN or Inf\n", nname_.c_str()); libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenTopData_->getBuffer(); printf("cbptr[%d] = %d, cbptr[%d] = %f\n",i,ptr[i],i,cbptr[i]); exit(-1); } } } #endif #ifdef GETSTATS #ifdef USE_MLSL unsigned int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else unsigned int node_id = 0; #endif if(node_id == 0) { if(in_dtype == DT_FLOAT) { float *ptr = (float*)tenBotData_[0]->getBuffer(); string s = nname_ + "_r_Inp"; MeanOfLayer((char*)s.c_str(), ptr, nImg*ifm0*ifhp*ifwp); if(gparams_.nInput.size() > 1) { ptr = (float*)tenBotData_[1]->getBuffer(); s = nname_ + "_l_Inp"; MeanOfLayer((char*)s.c_str(), ptr, nImg*ifm1*ifhp*ifwp); } ptr = (float*)tenMidData_->getBuffer(); s = nname_ + "_mid"; MeanOfLayer((char*)s.c_str(), ptr, nImg*ofm*mfhp*mfwp); } else if(in_dtype == DT_BF16) { if(stptr == NULL) { int s = nImg*ofm*ofhp*ofwp; int ms = nImg*ofm*mfhp*mfwp; int is = nImg*ifm0*ifhp*ifwp; int is1=0; if(gparams_.nInput.size() > 1) is1 = nImg*ifm1*ifhp*ifwp; int size = s > ms ? s : ms; size = size > is ? size : is; size = size > is1 ? 
size : is1; stptr = (float*)libxsmm_aligned_malloc(size*sizeof(float), 2097152); } libxsmm_bfloat16 *ptr; if(tenBotData_[0]->getLPBuffer() != NULL) ptr = (libxsmm_bfloat16*)tenBotData_[0]->getLPBuffer(); else ptr = (libxsmm_bfloat16*)tenBotData_[0]->getBuffer(); string s = nname_ + "_r_Inp"; convert_bf16_f32(ptr, stptr, nImg*ifm0*ifhp*ifwp); MeanOfLayer((char*)s.c_str(), stptr, nImg*ifm0*ifhp*ifwp); if(gparams_.nInput.size() > 1) { if(tenBotData_[1]->getLPBuffer() != NULL) ptr = (libxsmm_bfloat16*)tenBotData_[1]->getLPBuffer(); else ptr = (libxsmm_bfloat16*)tenBotData_[1]->getBuffer(); convert_bf16_f32(ptr, stptr, nImg*ifm1*ifhp*ifwp); s = nname_ + "_l_Inp"; MeanOfLayer((char*)s.c_str(), stptr, nImg*ifm1*ifhp*ifwp); } ptr = (libxsmm_bfloat16*)tenMidData_->getBuffer(); convert_bf16_f32(ptr, stptr, nImg*ofm*mfhp*mfwp); s = nname_ + "_mid"; MeanOfLayer((char*)s.c_str(), stptr, nImg*ofm*mfhp*mfwp); } string s = nname_ + "_wt"; float* wt = (float*)tenWeightData_->getBuffer(); MeanOfLayer((char*)s.c_str(), wt, ifm0*ofm*kh*kw); s = nname_ + "_gammap"; float* gamma = (float*)tenScaleData_->getBuffer(); MeanOfLayer((char*)s.c_str(), gamma, gparams_.nOutput); s = nname_ + "_betap"; float* beta = (float*)tenShiftData_->getBuffer(); MeanOfLayer((char*)s.c_str(), beta, gparams_.nOutput); if(out_dtype == DT_FLOAT) { float *ptr = (float*)tenTopData_->getBuffer(); string s = nname_ + "_Outp"; int size = nImg*ofm*(ofh/bnsh + 2*oph)*(ofw/bnsw + 2*opw); MeanOfLayer((char*)s.c_str(), ptr, size); } else if(out_dtype == DT_BF16) { libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenTopData_->getBuffer(); s = nname_ + "_Outp"; int size = nImg*ofm*(ofh/bnsh + 2*oph)*(ofw/bnsw + 2*opw); convert_bf16_f32(ptr, stptr, size); MeanOfLayer((char*)s.c_str(), stptr, size); } } #endif } void FusedConvBNNode::backPropagate() { int nImg = gparams_.batch_size; int ifm0 = gparams_.nInput[0]; int ifm1 = gparams_.eltwise ? 
gparams_.nInput[1] : 0; int ofm = gparams_.nOutput; int ifh = gparams_.iHeight; int ifhp = ifh + 2*gparams_.ipad_h; int ifw = gparams_.iWidth; int ifwp = ifw + 2*gparams_.ipad_w; int mfh = gparams_.mHeight; int mfw = gparams_.mWidth; int mfhp = mfh + 2*gparams_.mpad_h; int mfwp = mfw + 2*gparams_.mpad_w; int ofh = gparams_.oHeight; int ofw = gparams_.oWidth; int ofhp = ofh + 2*gparams_.opad_h; int ofwp = ofw + 2*gparams_.opad_w; int kh = gparams_.kh; int kw = gparams_.kw; #ifdef DEBUG printf("Executing BP %s\n",NNNode::nname_.c_str()); printf("Grad Outputs: %d x %d x %d\n", ofm, ofh, ofw); printf("Grad Inputs: %d x %d x %d\n", ifm, ifh, ifw); printf("Weights: %d x %d x %d x %d\n", ofm, ifm, kh, kw); #endif tenTopDiff_ = tenTop_->getBuf(DIFF); if(first_bp) { int bsize0 = nImg*ifm0*ifhp*ifwp; int bsize1 = nImg*ifm1*ifhp*ifwp; int msize = nImg*ofm*mfhp*mfwp; if(in_dtype == DT_FLOAT) { float* ptr = (float*)tenBotDiff_[0]->getBuffer(); tenMidDiff_->setBuffer(tenBotDiff_[0]->getBuffer() + bsize0*sizeof(float)); tenMidDiff_->setBufferSize(msize*sizeof(float)); tenBotDiff_[0]->setBufferSize(bsize0*sizeof(float)); if(gparams_.eltwise) tenBotDiff_[1]->setBufferSize(bsize1*sizeof(float)); // NUMA initialize Conv delinp #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer(); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer() : NULL; if(ptr) { #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer(); tenMidDiff_->setBuffer(tenBotDiff_[0]->getBuffer() + bsize0*sizeof(libxsmm_bfloat16)); tenMidDiff_->setBufferSize(msize*sizeof(libxsmm_bfloat16)); tenBotDiff_[0]->setBufferSize(bsize0*sizeof(libxsmm_bfloat16)); if(gparams_.eltwise) tenBotDiff_[1]->setBufferSize(bsize1*sizeof(libxsmm_bfloat16)); // NUMA initialize Conv delinp #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer(); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer() : NULL; if(ptr) { #ifdef _OPENMP #pragma omp parallel for 
#endif for(int i=0; ibackPropagate(tenTopDiff_, tenWeightData_, tenScaleDiff_, tenShiftDiff_, tenMidDiff_, tenBotDiff_, 0); #ifdef CHECK_BLOWUP_FP32 float* cbptr = (float*)tenTopDiff_->getBuffer(); for(int i=0; i<10240; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! %s layer BP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } #endif #ifdef GETSTATS float *ptr, *pptr, *p, *bias; #ifdef USE_MLSL unsigned int node_id_ = MLSL::Environment::GetEnv().GetProcessIdx(); #else unsigned int node_id_ = 0; #endif if(node_id_ == 0) { int sh = gparams_.bn_stride_h; int sw = gparams_.bn_stride_w; int ph = gparams_.opad_h; int pw = gparams_.opad_w; if(out_dtype == DT_FLOAT) { float *ptr = (float*)tenTopDiff_->getBuffer(); int size = nImg*ofm*ofhp*ofwp; string s = nname_ + "_delOutp"; MeanOfLayer((char*)s.c_str(), ptr, size); } else if(out_dtype == DT_BF16) { libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenTopDiff_->getBuffer(); int size = nImg*ofm*ofhp*ofwp; convert_bf16_f32(ptr, stptr, size); string s = nname_ + "_delOutp"; MeanOfLayer((char*)s.c_str(), stptr, size); } string s = nname_ + "_delgammap"; float* delgamma = (float*)tenScaleDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), delgamma, gparams_.nOutput); s = nname_ + "_delbetap"; float* delbeta = (float*)tenShiftDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), delbeta, gparams_.nOutput); if(in_dtype == DT_FLOAT) { float *ptr = (float*)tenBotDiff_[0]->getBuffer(); string s = nname_ + "_delInp"; int size = nImg*ifm0*ifhp*ifwp; MeanOfLayer((char*)s.c_str(), ptr, size); } else if(in_dtype == DT_BF16) { libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenBotDiff_[0]->getBuffer(); s = nname_ + "_delInp"; int size = nImg*ifm0*ifhp*ifwp; convert_bf16_f32(ptr, stptr, size); MeanOfLayer((char*)s.c_str(), stptr, size); } } #endif } void FusedConvBNNode::weightUpdate() { int nImg = gparams_.batch_size; int ifm0 = gparams_.nInput[0]; int ofm = gparams_.nOutput; int ifh = gparams_.iHeight; int ifw = 
gparams_.iWidth; int mfh = gparams_.mHeight; int mfw = gparams_.mWidth; int mfhp = mfh + 2*gparams_.mpad_h; int mfwp = mfw + 2*gparams_.mpad_w; int ifhp = ifh + 2*gparams_.ipad_h; int ifwp = ifw + 2*gparams_.ipad_w; int kh = gparams_.kh; int kw = gparams_.kw; #ifdef DEBUG // printf("Executing WU %s: grad_output %p, grad_weights %p, input %p\n",NNNode::nname_.c_str(), gtop, gwt, bot); printf("Executing WU %s\n",NNNode::nname_.c_str()); printf("Grad Outputs: %d x %d x %d\n",ofm, ofh,ofw); printf("Inputs: %d x %d x %d\n",ifm0, ifh, ifw); printf("del-Weights: %d x %d x %d x %d\n", ofm, ifm0, kh, kw); printf("del-Biases: %d\n", ofm); #endif #ifdef GETSTATS #ifdef USE_MLSL int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else int node_id = 0; #endif if(node_id == 0) { if(in_dtype == DT_FLOAT) { string s = nname_ + "_delWt_Bef"; float *ptr = (float*)tenWeightDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, ifm0*ofm*kh*kw); } else if(in_dtype == DT_BF16) { string s = nname_ + "_delWt_Bef"; libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenWeightDiff_->getBuffer(); memset(stptr, 0, ifm0*ofm*kh*kw); convert_bf16_f32(ptr, stptr, ifm0*ofm*kh*kw); MeanOfLayer((char*)s.c_str(), stptr, ifm0*ofm*kh*kw); } } #endif if(!bp_flag_ && first_upd) { int msize = nImg*ofm*mfhp*mfwp; if(in_dtype == DT_FLOAT) { float *ptr = (float*)tenMidDiff_->getBuffer(); // NUMA initialize Conv delmidp #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer(); // NUMA initialize = Conv delmidp ptr = (libxsmm_bfloat16*)tenMidDiff_->getBuffer(); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuf(DIFF); impl->weightUpdate(tenBotData_[0], tenTopDiff_, tenMidDiff_, tenWeightDiff_, tenScaleDiff_, tenShiftDiff_, 0); #ifdef CHECK_BLOWUP_FP32 if(out_dtype == DT_FLOAT) { for(int i=0; i<16; i++) { float v = ((float*)tenWeightDiff_->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! 
%s layer BP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(out_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)tenWeightDiff_->getBuffer(), cbptr, 16); for(int i=0; i<16; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! %s layer BP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } #endif void* gexp[NUM_NUMA_NODES]; void* gvar[NUM_NUMA_NODES]; void* gexp_test = tenMeanData_->getPrivBuffer(); void* gvar_test = tenVarData_->getPrivBuffer(); void **mptrptr = tenMeanData_->getBufferPtr(); void **vptrptr = tenVarData_->getBufferPtr(); int offset = tenMeanData_->getOffset(); for(int n=0; ngetOffset(); for(int n=0; ngetBuffer(); if(in_dtype == DT_BF16) { if(dwptr == NULL) { int wsize = ifm0*ofm*kh*kw*sizeof(float); dwptr = (float*)MLSL::Environment::GetEnv().Alloc(wsize, 2097152); } convert_bf16_f32((libxsmm_bfloat16*)mptr, dwptr, ifm0*ofm*kh*kw); op_->GetParameterSet(0)->StartGradientComm(dwptr); } else if(in_dtype == DT_FLOAT) op_->GetParameterSet(0)->StartGradientComm(mptr); op_->GetParameterSet(1)->StartGradientComm(tenScaleDiff_->getBuffer()); op_->GetParameterSet(2)->StartGradientComm(tenShiftDiff_->getBuffer()); int num_nodes = eptr_->get_num_machines(); for(int i=0; iop_->GetParameterSet(3)->StartGradientComm(gexp_test); this->op_->GetParameterSet(4)->StartGradientComm(gvar_test); #endif #ifdef GETSTATS #ifdef USE_MLSL node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else node_id = 0; #endif if(node_id == 0) { if(in_dtype == DT_FLOAT) { string s = nname_ + "_Inp"; float *ptr = (float*)tenBotData_[0]->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, nImg*ifm0*ifhp*ifwp); s = nname_ + "_delMidp"; ptr = (float*)tenMidDiff_->getBuffer(); MeanOfLayer((char*)s.c_str(), ptr, nImg*ofm*mfhp*mfwp); s = nname_ + "_delWt_Aft"; ptr = (float*)tenWeightDiff_->getBuffer(); float *pptr = (float*)tenWeightDiff_->getPrivBuffer(); float *p = (pptr == NULL) ? 
ptr : pptr; MeanOfLayer((char*)s.c_str(), p, ifm0*ofm*kh*kw); } else if(in_dtype == DT_BF16) { string s = nname_ + "_Inp"; libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenBotData_[0]->getBuffer(); memset(stptr, 0, nImg*ifm0*ifhp*ifwp); convert_bf16_f32(ptr, stptr, nImg*ifm0*ifhp*ifwp); MeanOfLayer((char*)s.c_str(), stptr, nImg*ifm0*ifhp*ifwp); s = nname_ + "_delMidp"; ptr = (libxsmm_bfloat16*)tenMidDiff_->getBuffer(); memset(stptr, 0, nImg*ofm*mfhp*mfwp); convert_bf16_f32(ptr, stptr, nImg*ofm*mfhp*mfwp); MeanOfLayer((char*)s.c_str(), stptr, nImg*ofm*mfhp*mfwp); s = nname_ + "_delWt_Aft"; #ifdef USE_MLSL MeanOfLayer((char*)s.c_str(), dwptr, ifm0*ofm*kh*kw); #else ptr = (libxsmm_bfloat16*)tenWeightDiff_->getBuffer(); memset(stptr, 0, ifm0*ofm*kh*kw); convert_bf16_f32(ptr, stptr, ifm0*ofm*kh*kw); MeanOfLayer((char*)s.c_str(), stptr, ifm0*ofm*kh*kw); #endif } } #endif } void FusedConvBNNode::solverStep() { #ifdef USE_MLSL int ifm = gparams_.nInput[0]; int ofm = gparams_.nOutput; int kh = gparams_.kh; int kw = gparams_.kw; float *gwt = (float*)(tenWeightDiff_->getBuffer()); float *delgamma = (float*)tenScaleDiff_->getBuffer(); float *delbeta = (float*)tenShiftDiff_->getBuffer(); void* gexp_test = tenMeanData_->getPrivBuffer(); void* gvar_test = tenVarData_->getPrivBuffer(); int wsize = ifm*ofm*kh*kw; void *mptr = op_->GetParameterSet(0)->WaitGradientComm(); if(in_dtype == DT_FLOAT) { if(mptr != NULL && mptr != gwt) memcpy((void*)gwt, mptr, wsize*sizeof(float)); } else if(in_dtype == DT_BF16) { if(mptr != NULL && mptr != dwptr) memcpy((void*)dwptr, mptr, wsize*sizeof(float)); convert_f32_bf16(dwptr, (libxsmm_bfloat16*)gwt, wsize); } mptr = op_->GetParameterSet(1)->WaitGradientComm(); if(mptr != NULL && mptr != delgamma) memcpy((void*)delgamma, mptr, ofm*sizeof(float)); mptr = op_->GetParameterSet(2)->WaitGradientComm(); if(mptr != NULL && mptr != delbeta) memcpy((void*)delbeta, mptr, ofm*sizeof(float)); mptr = op_->GetParameterSet(3)->WaitGradientComm(); if(mptr != NULL && 
mptr != gexp_test) memcpy((void*)gexp_test, mptr, ofm*sizeof(float)); mptr = op_->GetParameterSet(4)->WaitGradientComm(); if(mptr != NULL && mptr != gvar_test) memcpy((void*)gvar_test, mptr, ofm*sizeof(float)); #ifdef CHECK_BLOWUP_FP32 float* ptr = (float*)tenWeightDiff_->getBuffer(); for(int i=0; i<16; i++) { if(isnan(ptr[i]) || isinf(ptr[i])) { printf("Warning! %s layer Solver gradients are NaN or Inf\n", nname_.c_str()); exit(-1); } } for(int i=0; i<16; i++) { if(isnan(delgamma[i]) || isinf(delgamma[i])) { printf("Warning! %s layer Solver gamma gradients are NaN or Inf\n", nname_.c_str()); exit(-1); } } for(int i=0; i<16; i++) { if(isnan(delbeta[i]) || isinf(delbeta[i])) { printf("Warning! %s layer Solver beta gradients are NaN or Inf\n", nname_.c_str()); exit(-1); } } #endif #endif } libxsmm-1.17/samples/deeplearning/gxm/src/FusedConvBNXSMM.cpp000066400000000000000000002047761415223013700241430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "FusedConvBNXSMM.hpp" using namespace std; FusedConvBNXSMM::FusedConvBNXSMM(FusedConvBNImplParams* gp, int engine) : FusedConvBNImpl(gp, engine) { conv_desc.N = gp->batch_size/gp->num_numa_nodes; conv_desc.C = gp->nInput[0]; conv_desc.H = gp->iHeight; conv_desc.W = gp->iWidth; conv_desc.K = gp->nOutput; conv_desc.R = gp->kh; conv_desc.S = gp->kw; conv_desc.u = gp->c_stride_h; conv_desc.v = gp->c_stride_w; if(gp->physical_padding) { conv_desc.pad_h_in = gp->ipad_h; conv_desc.pad_w_in = gp->ipad_w; } else { conv_desc.pad_h_in = 0; conv_desc.pad_w_in = 0; } conv_desc.pad_w = gp->ipad_w; conv_desc.pad_h = gp->ipad_h; if(gp->physical_padding) { conv_desc.pad_h_out = gp->mpad_h; conv_desc.pad_w_out = gp->mpad_w; } else { conv_desc.pad_h_out = 0; conv_desc.pad_w_out = 0; } conv_desc.threads = gp->num_threads/gp->num_numa_nodes; conv_desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; conv_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; conv_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; conv_desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; if(gp->out_data_type == DT_FLOAT) conv_desc.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE; else if(gp->out_data_type == DT_BF16) conv_desc.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE; if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_FLOAT) { conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; } else if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; } for(int i=0; inum_numa_nodes; i++) { libxsmm_handle_conv[i] = libxsmm_dnn_create_conv_layer( conv_desc, &status ); CHKERR_LIBXSMM_DNN( status ); } 
fusedbn_desc_train.partN = gp->batch_size/gp->num_numa_nodes; fusedbn_desc_train.fullN = gp->batch_size/gp->num_numa_nodes; fusedbn_desc_train.C = gp->nOutput; fusedbn_desc_train.H = gp->mHeight; fusedbn_desc_train.W = gp->mWidth; fusedbn_desc_train.u = gp->bn_stride_h; fusedbn_desc_train.v = gp->bn_stride_w; fusedbn_desc_train.pad_h_in = gp->mpad_h; fusedbn_desc_train.pad_w_in = gp->mpad_w; fusedbn_desc_train.pad_h_out = gp->opad_h; fusedbn_desc_train.pad_w_out = gp->opad_w; fusedbn_desc_train.threads = gp->num_threads/gp->num_numa_nodes; if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { fusedbn_desc_train.datatype_in = LIBXSMM_DNN_DATATYPE_F32; fusedbn_desc_train.datatype_out = LIBXSMM_DNN_DATATYPE_F32; } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { fusedbn_desc_train.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; fusedbn_desc_train.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; } fusedbn_desc_train.datatype_stats = LIBXSMM_DNN_DATATYPE_F32; fusedbn_desc_train.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fusedbn_desc_train.fuse_order = LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU; fusedbn_desc_train.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN; if(gp->relu_fwd) #if 0 fusedbn_desc_train.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_RELU; #else fusedbn_desc_train.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_RELU_WITH_MASK; #endif if(gp->eltwise) fusedbn_desc_train.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE; if(gp->relu_fwd && gp->eltwise) #if 0 fusedbn_desc_train.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE_RELU; #else fusedbn_desc_train.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE_RELU_WITH_MASK; #endif for(int i=0; inum_numa_nodes; i++) { libxsmm_handle_bn_train[i] = libxsmm_dnn_create_fusedbatchnorm( fusedbn_desc_train, &status ); CHKERR_LIBXSMM_DNN( status ); } fusedbn_desc_test.partN = gp->batch_size/gp->num_numa_nodes; fusedbn_desc_test.fullN = gp->batch_size/gp->num_numa_nodes; fusedbn_desc_test.C = gp->nOutput; fusedbn_desc_test.H = 
gp->mHeight; fusedbn_desc_test.W = gp->mWidth; fusedbn_desc_test.u = gp->bn_stride_h; fusedbn_desc_test.v = gp->bn_stride_w; fusedbn_desc_test.pad_h_in = gp->mpad_h; fusedbn_desc_test.pad_w_in = gp->mpad_w; fusedbn_desc_test.pad_h_out = gp->opad_h; fusedbn_desc_test.pad_w_out = gp->opad_w; fusedbn_desc_test.threads = gp->num_threads/gp->num_numa_nodes; if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { fusedbn_desc_test.datatype_in = LIBXSMM_DNN_DATATYPE_F32; fusedbn_desc_test.datatype_out = LIBXSMM_DNN_DATATYPE_F32; } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { fusedbn_desc_test.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; fusedbn_desc_test.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; } fusedbn_desc_test.datatype_stats = LIBXSMM_DNN_DATATYPE_F32; fusedbn_desc_test.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; fusedbn_desc_test.fuse_order = LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU; fusedbn_desc_test.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE; if(gp->relu_fwd) #if 0 fusedbn_desc_test.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_RELU; #else fusedbn_desc_test.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_RELU_WITH_MASK; #endif if(gp->eltwise) fusedbn_desc_test.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE; if(gp->relu_fwd && gp->eltwise) #if 0 fusedbn_desc_test.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE_RELU; #else fusedbn_desc_test.fuse_ops = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE_RELU_WITH_MASK; #endif for(int i=0; inum_numa_nodes; i++) { libxsmm_handle_bn_test[i] = libxsmm_dnn_create_fusedbatchnorm( fusedbn_desc_test, &status ); CHKERR_LIBXSMM_DNN( status ); } } void FusedConvBNXSMM::forwardPropagate(vector& inp, TensorBuf *weightp, TensorBuf *hweightp, TensorBuf *midp, TensorBuf *gammap, TensorBuf *betap, TensorBuf *meanp, TensorBuf *varp, TensorBuf *outp, int tid) { int nImg = gp->batch_size/gp->num_numa_nodes; int nIFM = gp->nInput[0]; int nOFM = gp->nOutput; int nBIfm = nIFM/VLEN; int nBOfm = nOFM/VLEN; int 
ifh = gp->iHeight; int ifw = gp->iWidth; int mfh = gp->mHeight; int mfw = gp->mWidth; int ofh = gp->oHeight; int ofw = gp->oWidth; int bsh = gp->bn_stride_h; int bsw = gp->bn_stride_w; int csh = gp->c_stride_h; int csw = gp->c_stride_w; int iph = gp->ipad_h; int ipw = gp->ipad_w; int mph = gp->mpad_h; int mpw = gp->mpad_w; int oph = gp->opad_h; int opw = gp->opad_w; int fhm = mfh + 2*mph; int fwm = mfw + 2*mpw; int ifhp = ifh + 2*iph; int ifwp = ifw + 2*ipw; int ofhp = ofh + 2*oph; int ofwp = ofw + 2*opw; assert(bot_compute_engine[0] != -1); assert(top_compute_engine[0] != -1); // Conv input. LPBuffer is non-NULL if data layer output is BF16 void *inp_r[NUM_NUMA_NODES], *inp_l[NUM_NUMA_NODES], *hwt_ptr, *middle[NUM_NUMA_NODES], *output[NUM_NUMA_NODES]; void *wt_ptr[NUM_NUMA_NODES]; int imoff = conv_desc.N * conv_desc.C * ifhp * ifwp; if(gp->in_data_type == DT_BF16) { if(inp[0]->getLPBuffer() != NULL) inp_r[0] = inp[0]->getLPBuffer(); else inp_r[0] = inp[0]->getBuffer(); imoff = imoff * sizeof(libxsmm_bfloat16); } else if(gp->in_data_type == DT_FLOAT) { inp_r[0] = inp[0]->getBuffer(); imoff = imoff * sizeof(float); } for(int n=1; nnum_numa_nodes; n++) inp_r[n] = inp_r[n-1] + imoff; if(gp->eltwise) { imoff = fusedbn_desc_train.partN * gp->nInput[1] * ifhp * ifwp; if(gp->out_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); if(inp[1]->getLPBuffer() != NULL) inp_l[0] = inp[1]->getLPBuffer(); else inp_l[0] = inp[1]->getBuffer(); for(int n=1; nnum_numa_nodes; n++) inp_l[n] = inp_l[n-1] + imoff; } // Conv Weight void **lptrptr = weightp->getLPBufferPtr(); void **ptrptr = weightp->getBufferPtr(); int offset = weightp->getOffset(); if(lptrptr != NULL) for(int n=0; nnum_numa_nodes; n++) wt_ptr[n] = lptrptr[n] + offset*sizeof(libxsmm_bfloat16); else for(int n=0; nnum_numa_nodes; n++) wt_ptr[n] = ptrptr[n] + offset*sizeof(float); void *wt_prv_ptr = NULL; // Conv weight history if(hweightp != 
NULL) hwt_ptr = hweightp->getBuffer(); else hwt_ptr=NULL; // Conv output middle[0] = midp->getBuffer(); imoff = conv_desc.N * conv_desc.K * fhm * fwm; if(gp->out_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) middle[n] = middle[n-1] + imoff; output[0] = outp->getBuffer(); imoff = fusedbn_desc_train.partN * fusedbn_desc_train.C * ofhp * ofwp; if(gp->out_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) output[n] = output[n-1] + imoff; void *gamma[NUM_NUMA_NODES]; void *beta[NUM_NUMA_NODES]; float *gexpect[NUM_NUMA_NODES]; float *gvar[NUM_NUMA_NODES]; float *gexp_test = (float*)meanp->getPrivBuffer(); float *gvar_test = (float*)varp->getPrivBuffer(); void **gptrptr = gammap->getBufferPtr(); offset = gammap->getOffset() * sizeof(float); for(int n=0; nnum_numa_nodes; n++) gamma[n] = gptrptr[n] + offset; void **bptrptr = betap->getBufferPtr(); offset = betap->getOffset() * sizeof(float); for(int n=0; nnum_numa_nodes; n++) beta[n] = bptrptr[n] + offset; void **mptrptr = meanp->getBufferPtr(); offset = meanp->getOffset(); for(int n=0; nnum_numa_nodes; n++) gexpect[n] = (float*)mptrptr[n] + offset; void **vptrptr = varp->getBufferPtr(); offset = varp->getOffset(); for(int n=0; nnum_numa_nodes; n++) gvar[n] = (float*)vptrptr[n] + offset; void **sptrptr = scratchp->getBufferPtr(); for(int n=0; nnum_numa_nodes; n++) { if(bexpect[n] == NULL) { bexpect[n] = (void*)_mm_malloc(nOFM*sizeof(float), 64); #ifndef NDEBUG printf("%s allocated %lu bytes for mean\n",nname.c_str(), nOFM*sizeof(float)); #endif } if(bstddev[n] == NULL) { bstddev[n] = (void*)_mm_malloc(nOFM*sizeof(float), 64); #ifndef NDEBUG printf("%s allocated %lu bytes for stdev\n",nname.c_str(), nOFM*sizeof(float)); #endif } if(bvariance[n] == NULL) { bvariance[n] = 
(void*)_mm_malloc(nOFM*sizeof(float), 64); #ifndef NDEBUG printf("%s allocated %lu bytes for variance\n",nname.c_str(), nOFM*sizeof(float)); #endif } if(relu_mask[n] == NULL) relu_mask[n] = (void*)libxsmm_aligned_malloc(nImg*nOFM*ofhp*ofwp*sizeof(unsigned char), 2097152); } if(gexp_test == NULL) { gexp_test = (float*)_mm_malloc(nOFM*sizeof(float), 64); meanp->setPrivBuffer((void*)gexp_test); #ifndef NDEBUG printf("%s allocated %lu bytes for mean test\n",nname.c_str(), nOFM*sizeof(float)); #endif } if(gvar_test == NULL) { gvar_test = (float*)_mm_malloc(nOFM*sizeof(float), 64); varp->setPrivBuffer((void*)gvar_test); #ifndef NDEBUG printf("%s allocated %lu bytes for mean test\n",nname.c_str(), nOFM*sizeof(float)); #endif } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_input[n] == NULL) { libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle_conv[n], LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input[n] = libxsmm_dnn_link_tensor( libxsmm_layout, inp_r[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_bind_tensor( libxsmm_handle_conv[n], libxsmm_input[n], LIBXSMM_DNN_REGULAR_INPUT ) ); } } int welem = gp->nInput[0] * gp->nOutput * gp->kw * gp->kh; for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_filter[n] == NULL) { libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle_conv[n], LIBXSMM_DNN_REGULAR_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); if(gp->in_data_type == DT_FLOAT) { int wsize = welem*sizeof(float); wt_prv_ptr = (void*)libxsmm_aligned_malloc(wsize, 2097152); // Transform weight layout libxsmm_filter[n] = libxsmm_dnn_link_tensor( libxsmm_layout, wt_prv_ptr, &status ); CHKERR_LIBXSMM_DNN( status ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor(libxsmm_filter[n], wt_ptr[n], LIBXSMM_DNN_TENSOR_FORMAT_KCRS) ); memcpy(wt_ptr[n], wt_prv_ptr, wsize); if(n==0) { libxsmm_checkpoint_filter = 
libxsmm_dnn_link_tensor(libxsmm_layout, wt_ptr[n], &status); CHKERR_LIBXSMM_DNN( status ); } libxsmm_filter[n] = libxsmm_dnn_link_tensor( libxsmm_layout, wt_ptr[n], &status ); CHKERR_LIBXSMM_DNN( status ); // Transform weight history layout if(n == 0) { if(hwt_ptr != NULL) { libxsmm_temp = libxsmm_dnn_link_tensor( libxsmm_layout, wt_prv_ptr, &status ); CHKERR_LIBXSMM_DNN( status ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_temp, (void*)hwt_ptr, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); memcpy(hwt_ptr, wt_prv_ptr, welem*sizeof(float)); libxsmm_checkpoint_history_filter = libxsmm_dnn_link_tensor(libxsmm_layout, hwt_ptr, &status); CHKERR_LIBXSMM_DNN( status ); } } libxsmm_free(wt_prv_ptr); wt_prv_ptr = NULL; weightp->setPrivBuffer(NULL); } else if(gp->in_data_type == DT_BF16) { int wsize = welem*sizeof(libxsmm_bfloat16); wt_prv_ptr = (void*)libxsmm_aligned_malloc(wsize, 2097152); // Transform BF16 weight layout libxsmm_filter[n] = libxsmm_dnn_link_tensor( libxsmm_layout, wt_prv_ptr, &status ); CHKERR_LIBXSMM_DNN( status ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor(libxsmm_filter[n], wt_ptr[n], LIBXSMM_DNN_TENSOR_FORMAT_KCRS) ); memcpy(wt_ptr[n], wt_prv_ptr, wsize); libxsmm_filter[n] = libxsmm_dnn_link_tensor( libxsmm_layout, wt_ptr[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_free(wt_prv_ptr); // Transform FP32 weight layout if(n == 0) { libxsmm_layout->datatype = LIBXSMM_DNN_DATATYPE_F32; wt_prv_ptr = (void*)libxsmm_aligned_malloc(welem*sizeof(float), 2097152); libxsmm_checkpoint_filter = libxsmm_dnn_link_tensor( libxsmm_layout, wt_prv_ptr, &status ); CHKERR_LIBXSMM_DNN( status ); void *fwt_ptr = weightp->getBuffer(); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_checkpoint_filter, (void*)fwt_ptr, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); memcpy(fwt_ptr, wt_prv_ptr, welem*sizeof(float)); libxsmm_checkpoint_filter = libxsmm_dnn_link_tensor( libxsmm_layout, fwt_ptr, &status ); CHKERR_LIBXSMM_DNN( status ); // Transform FP32 weight history layout 
if(hwt_ptr != NULL) { libxsmm_checkpoint_history_filter = libxsmm_dnn_link_tensor( libxsmm_layout, wt_prv_ptr, &status ); CHKERR_LIBXSMM_DNN( status ); void *hfwt_ptr = hweightp->getBuffer(); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_checkpoint_history_filter, (void*)hfwt_ptr, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) ); memcpy(hfwt_ptr, wt_prv_ptr, welem*sizeof(float)); libxsmm_checkpoint_history_filter = libxsmm_dnn_link_tensor(libxsmm_layout, hfwt_ptr, &status); CHKERR_LIBXSMM_DNN( status ); } libxsmm_free(wt_prv_ptr); wt_prv_ptr = NULL; weightp->setPrivBuffer(NULL); } } libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_bind_tensor( libxsmm_handle_conv[n], libxsmm_filter[n], LIBXSMM_DNN_REGULAR_FILTER ) ); } } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_middle[n] == NULL) { // Conv Output libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle_conv[n], LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_middle[n] = libxsmm_dnn_link_tensor( libxsmm_layout, middle[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_bind_tensor(libxsmm_handle_conv[n], libxsmm_middle[n], LIBXSMM_DNN_REGULAR_OUTPUT)); } } /* let's allocate (if required) and bind scratch */ int max_size = 0; for(int n=0; nnum_numa_nodes; n++) { if(sptrptr[n] == NULL) { int mysize = libxsmm_dnn_get_scratch_size( libxsmm_handle_conv[n], LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); sptrptr[n] = (void*)libxsmm_aligned_malloc(mysize , 2097152); max_size = mysize; #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %d bytes for scratch @ %p\n",nname.c_str(), mysize, sptrptr[n]); } else { int ssize = scratchp->getBufferSize(); int mysize = libxsmm_dnn_get_scratch_size( libxsmm_handle_conv[n], LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); if(ssize < 
mysize) { libxsmm_free(sptrptr[n]); sptrptr[n] = (void*)libxsmm_aligned_malloc(mysize, 2097152); max_size = mysize; #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %d bytes for scratch @ %p, prev size was %d bytes\n",nname.c_str(), mysize, sptrptr[n], ssize); } else max_size = ssize; } } scratchp->setBufferSize(max_size); for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_input_bntrain[n]==NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_train[n], LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input_bntrain[n] = libxsmm_dnn_link_tensor( libxsmm_layout, middle[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_bn_train[n], libxsmm_input_bntrain[n], LIBXSMM_DNN_REGULAR_INPUT ) ); } } if(gp->eltwise) { for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_input_add_bntrain[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_bn_train[n], LIBXSMM_DNN_REGULAR_INPUT_ADD, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input_add_bntrain[n] = libxsmm_dnn_link_tensor( libxsmm_layout, inp_l[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_bn_train[n], libxsmm_input_add_bntrain[n], LIBXSMM_DNN_REGULAR_INPUT_ADD ) ) } } } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_expectval_train[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_train[n], LIBXSMM_DNN_CHANNEL_EXPECTVAL, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_expectval_train[n] = libxsmm_dnn_link_tensor( libxsmm_layout, bexpect[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_train[n], libxsmm_expectval_train[n], LIBXSMM_DNN_CHANNEL_EXPECTVAL ) ); } if(libxsmm_stddev_train[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_bn_train[n], LIBXSMM_DNN_CHANNEL_RCPSTDDEV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_stddev_train[n] = libxsmm_dnn_link_tensor( libxsmm_layout, bstddev[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_train[n], libxsmm_stddev_train[n], LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) ); } if(libxsmm_variance_train[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_train[n], LIBXSMM_DNN_CHANNEL_VARIANCE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_variance_train[n] = libxsmm_dnn_link_tensor( libxsmm_layout, bvariance[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_train[n], libxsmm_variance_train[n], LIBXSMM_DNN_CHANNEL_VARIANCE ) ); } if(libxsmm_gamma_train[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_bn_train[n], LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_gamma_train[n] = libxsmm_dnn_link_tensor( libxsmm_layout, gamma[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_train[n], libxsmm_gamma_train[n], LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) ); } if(libxsmm_beta_train[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_train[n], LIBXSMM_DNN_REGULAR_CHANNEL_BETA, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_beta_train[n] = libxsmm_dnn_link_tensor( 
libxsmm_layout, beta[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_train[n], libxsmm_beta_train[n], LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) ); } if(libxsmm_output_bntrain[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_train[n], LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output_bntrain[n] = libxsmm_dnn_link_tensor( libxsmm_layout, output[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_train[n], libxsmm_output_bntrain[n], LIBXSMM_DNN_REGULAR_OUTPUT ) ); } if(libxsmm_relumask_bntrain[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_train[n], LIBXSMM_DNN_RELU_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_relumask_bntrain[n] = libxsmm_dnn_link_tensor( libxsmm_layout, relu_mask[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_train[n], libxsmm_relumask_bntrain[n], LIBXSMM_DNN_RELU_MASK ) ); } } /* let's allocate (if required) and bind scratch */ for(int n=0; nnum_numa_nodes; n++) { if(sptrptr[n] == NULL) { int mysize = libxsmm_dnn_fusedbatchnorm_get_scratch_size( libxsmm_handle_bn_train[n], &status ); CHKERR_LIBXSMM_DNN( status ); sptrptr[n] = (void*)libxsmm_aligned_malloc(mysize , 2097152); max_size = mysize; #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %d bytes for scratch @ %p\n",nname.c_str(), mysize, sptrptr[n]); } else { int ssize = scratchp->getBufferSize(); int mysize = libxsmm_dnn_fusedbatchnorm_get_scratch_size( libxsmm_handle_bn_train[n], &status ); CHKERR_LIBXSMM_DNN( status ); 
if(ssize < mysize) { libxsmm_free(sptrptr[n]); sptrptr[n] = (void*)libxsmm_aligned_malloc(mysize, 2097152); max_size = mysize; #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %d bytes for scratch @ %p, prev size was %d bytes\n",nname.c_str(), mysize, sptrptr[n], ssize); } else max_size = ssize; } } scratchp->setBufferSize(max_size); for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_input_bntest[n]==NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_test[n], LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input_bntest[n] = libxsmm_dnn_link_tensor( libxsmm_layout, middle[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_bn_test[n], libxsmm_input_bntest[n], LIBXSMM_DNN_REGULAR_INPUT ) ); } } if(gp->eltwise) { for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_input_add_bntest[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_bn_test[n], LIBXSMM_DNN_REGULAR_INPUT_ADD, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input_add_bntest[n] = libxsmm_dnn_link_tensor( libxsmm_layout, inp_l[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_bn_test[n], libxsmm_input_add_bntest[n], LIBXSMM_DNN_REGULAR_INPUT_ADD ) ) } } } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_expectval_test[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_test[n], LIBXSMM_DNN_CHANNEL_EXPECTVAL, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_expectval_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, bexpect[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_test[n], libxsmm_expectval_test[n], LIBXSMM_DNN_CHANNEL_EXPECTVAL ) ); } if(libxsmm_stddev_test[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_bn_test[n], LIBXSMM_DNN_CHANNEL_RCPSTDDEV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_stddev_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, bstddev[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_test[n], libxsmm_stddev_test[n], LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) ); } if(libxsmm_variance_test[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_test[n], LIBXSMM_DNN_CHANNEL_VARIANCE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_variance_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, bvariance[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_test[n], libxsmm_variance_test[n], LIBXSMM_DNN_CHANNEL_VARIANCE ) ); } if(libxsmm_gamma_test[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_bn_test[n], LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_gamma_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, gamma[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_test[n], libxsmm_gamma_test[n], LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) ); } if(libxsmm_beta_test[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_test[n], LIBXSMM_DNN_REGULAR_CHANNEL_BETA, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_beta_test[n] = libxsmm_dnn_link_tensor( libxsmm_layout, 
beta[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_test[n], libxsmm_beta_test[n], LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) ); } if(libxsmm_output_bntest[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_test[n], LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output_bntest[n] = libxsmm_dnn_link_tensor( libxsmm_layout, output[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_test[n], libxsmm_output_bntest[n], LIBXSMM_DNN_REGULAR_OUTPUT ) ); } if(libxsmm_relumask_bntest[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_test[n], LIBXSMM_DNN_RELU_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_relumask_bntest[n] = libxsmm_dnn_link_tensor( libxsmm_layout, relu_mask[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_handle_bn_test[n], libxsmm_relumask_bntest[n], LIBXSMM_DNN_RELU_MASK ) ); } } /* let's allocate (if required) and bind scratch */ for(int n=0; nnum_numa_nodes; n++) { if(sptrptr[n] == NULL) { int mysize = libxsmm_dnn_fusedbatchnorm_get_scratch_size( libxsmm_handle_bn_test[n], &status ); CHKERR_LIBXSMM_DNN( status ); sptrptr[n] = (void*)libxsmm_aligned_malloc(mysize , 2097152); max_size = mysize; #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %d bytes for scratch @ %p\n",nname.c_str(), mysize, sptrptr[n]); } else { int ssize = scratchp->getBufferSize(); int mysize = libxsmm_dnn_fusedbatchnorm_get_scratch_size( libxsmm_handle_bn_test[n], &status ); CHKERR_LIBXSMM_DNN( status ); if(ssize < mysize) { 
libxsmm_free(sptrptr[n]); sptrptr[n] = (void*)libxsmm_aligned_malloc(mysize, 2097152); max_size = mysize; #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %d bytes for scratch @ %p, prev size was %d bytes\n",nname.c_str(), mysize, sptrptr[n], ssize); } else max_size = ssize; } } scratchp->setBufferSize(max_size); if(prev_scratch_size == 0) prev_scratch_size = scratchp->getBufferSize(); if(!updated_scratch_fwd || prev_scratch_size != scratchp->getBufferSize()) { for(int n=0; nnum_numa_nodes; n++) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle_conv[n], LIBXSMM_DNN_COMPUTE_KIND_ALL, sptrptr[n] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_scratch( libxsmm_handle_bn_train[n], sptrptr[n] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_scratch( libxsmm_handle_bn_test[n], sptrptr[n] ) ); } updated_scratch_fwd = true; prev_scratch_size = scratchp->getBufferSize(); } #ifndef NDEBUG /* check physical padding */ if ( (iph > 0 || ipw > 0) && (mph > 0 || mpw > 0) ) { } else if ( (iph == 0 || ipw == 0) && (mph == 0 || mpw == 0) ) { } else { printf("node %s: conv xsmm forward is partially padded which cannot be :-(\n", nname.c_str()); } if ( (oph > 0 || opw > 0) && (mph > 0 || mpw > 0) ) { printf("node %s: batchnorm forward input and output is padded which cannot be :-(\n", nname.c_str()); } /* check rims */ if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { if(nIFM > 3) check_physical_pad( nname.c_str(), (float*)inp_r[0], nImg, nBIfm, ifh, ifw, VLEN, iph, ipw ); else check_physical_pad( nname.c_str(), (float*)inp_r[0], nImg, 1, ifh, ifw, 3, iph, ipw ); check_physical_pad( nname.c_str(), (float*)middle[0], nImg, nBOfm, mfh, mfw, VLEN, mph, mpw ); check_physical_pad( nname.c_str(), (float*)output[0], nImg, nBOfm, ofh, ofw, VLEN, oph, opw ); } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { if(nIFM > 3) check_physical_pad( nname.c_str(), 
(libxsmm_bfloat16*)inp_r[0], nImg, nBIfm, ifh, ifw, VLEN, iph, ipw ); else check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)inp_r[0], nImg, 1, ifh, ifw, 3, iph, ipw ); check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)middle[0], nImg, nBOfm, mfh, mfw, VLEN, mph, mpw ); check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)output[0], nImg, nBOfm, ofh, ofw, VLEN, oph, opw ); } #endif if(!use_global_stats) { #ifdef USE_XSMM_TIMING struct timeval tvsc, tvec; gettimeofday(&tvsc, NULL); #endif #ifdef _OPENMP #pragma omp parallel #endif { #ifdef _OPENMP const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN(libxsmm_dnn_execute_st( libxsmm_handle_conv[n], LIBXSMM_DNN_COMPUTE_KIND_FWD, n*ntps, tid) ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_fusedbatchnorm_execute_st(libxsmm_handle_bn_train[n], LIBXSMM_DNN_COMPUTE_KIND_FWD, n*ntps, tid ) ); } #ifdef USE_XSMM_TIMING gettimeofday(&tvec, NULL); double fp_time = (tvec.tv_sec + tvec.tv_usec*1e-6) - (tvsc.tv_sec + tvsc.tv_usec*1e-6); #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif { double gf = (double)gp->batch_size * (double)gp->nInput[0] * (double)gp->nOutput * (double)gp->mHeight * (double)gp->mWidth * (double)gp->kh * (double)gp->kw * 2; if(gp->c_stride_h == 1 && gp->mpad_h == 0) printf("%s XSMM-CONV-FP mb%dic%dih%doc%doh%dkh%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput[0],gp->iHeight,gp->nOutput,gp->mHeight,gp->kh,fp_time*1000.0, gf/fp_time/1e9); else if(gp->c_stride_h == 2) printf("%s XSMM-CONV-FP mb%dic%dih%doc%doh%dkh%dsh%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput[0],gp->iHeight,gp->nOutput,gp->mHeight,gp->kh,gp->c_stride_h,fp_time*1000.0, gf/fp_time/1e9); else if(gp->mpad_h == 1) printf("%s XSMM-CONV-FP mb%dic%dih%doc%doh%dkh%dph%dn time = %g ms, GFLOPS = 
%.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput[0],gp->iHeight,gp->nOutput,gp->mHeight,gp->kh,gp->mpad_h,fp_time*1000.0, gf/fp_time/1e9); } #endif #ifndef NDEBUG /* check physical padding */ if ( (iph > 0 || ipw > 0) && (mph > 0 || mpw > 0) ) { } else if ( (iph == 0 || ipw == 0) && (mph == 0 || mpw == 0) ) { } else { printf("node %s: conv xsmm forward is partially padded which cannot be :-(\n", nname.c_str()); } if ( (oph > 0 || opw > 0) && (mph > 0 || mpw > 0) ) { printf("node %s: batchnorm forward input and output is padded which cannot be :-(\n", nname.c_str()); } /* check rims */ if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { if(nIFM > 3) check_physical_pad( nname.c_str(), (float*)inp_r[0], nImg, nBIfm, ifh, ifw, VLEN, iph, ipw ); else check_physical_pad( nname.c_str(), (float*)inp_r[0], nImg, 1, ifh, ifw, 3, iph, ipw ); check_physical_pad( nname.c_str(), (float*)middle[0], nImg, nBOfm, mfh, mfw, VLEN, mph, mpw ); check_physical_pad( nname.c_str(), (float*)output[0], nImg, nBOfm, ofh, ofw, VLEN, oph, opw ); } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { if(nIFM > 3) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)inp_r[0], nImg, nBIfm, ifh, ifw, VLEN, iph, ipw ); else check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)inp_r[0], nImg, 1, ifh, ifw, 3, iph, ipw ); check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)middle[0], nImg, nBOfm, mfh, mfw, VLEN, mph, mpw ); check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)output[0], nImg, nBOfm, ofh, ofw, VLEN, oph, opw ); } #endif if(gp->exec_mode == "TRAIN") { for(int n=0; nnum_numa_nodes; n++) { float *gexp = gexpect[n]; float *gv = gvar[n]; float (* __restrict bmean)[VLEN] = (float (*)[VLEN])bexpect[n]; float (* __restrict bvar)[VLEN] = (float (*)[VLEN])bvariance[n]; float nhw_ratio = float(fusedbn_desc_train.fullN*mfh*mfw)/float(fusedbn_desc_train.fullN*mfh*mfw - 1); #ifdef __AVX512F__ __m512 vmmf = _mm512_set1_ps(gp->mmf); __m512 vnhw_ratio = 
_mm512_set1_ps(nhw_ratio); #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int ntps = gp->num_threads/gp->num_numa_nodes; int s = tid/ntps; if(s==n && tid % ntps == 0) { for (int b = 0; b < nBOfm; ++b) { __m512 vbm = _mm512_loadu_ps(&bmean[b][0]); __m512 vbvar = _mm512_loadu_ps(&bvar[b][0]); _mm512_storeu_ps( &(gexp[b*VLEN]), _mm512_add_ps(_mm512_mul_ps(_mm512_loadu_ps( &(gexp[b*VLEN]) ), vmmf), vbm)); _mm512_storeu_ps( &(gv[b*VLEN]), _mm512_add_ps( _mm512_mul_ps( _mm512_loadu_ps( &(gv[b*VLEN]) ), vmmf), _mm512_mul_ps(vnhw_ratio, vbvar))); } } } #else #ifdef _OPENMP #pragma omp parallel for #endif for (int b = 0; b < nBOfm; ++b) { #pragma omp simd for (int v = 0; v < 16; ++v) { gexp[(b*16)+v] = gexp[(b*16)+v] * gp->mmf + bmean[b][v]; gv[(b*16)+v] = gv[(b*16)+v] * gp->mmf + nhw_ratio*bvar[b][v]; } } #endif } scaling_factor_ *= gp->mmf; scaling_factor_ += 1.; } } else { #if defined(_OPENMP) #pragma omp parallel #endif { int tid = omp_get_thread_num(); int ntps = gp->num_threads/gp->num_numa_nodes; int s = tid/ntps; int ltid = tid - s*ntps; int jobs = (nOFM % ntps == 0) ? nOFM/ntps : nOFM/ntps + 1; int tb = (ltid*jobs < nOFM) ? ltid*jobs : nOFM; int te = ((ltid+1)*jobs < nOFM) ? 
(ltid+1)*jobs : nOFM; for(int i=tb; i < te; i++) { ((float*)bexpect[s])[i] = ((float*)gexpect[s])[i]/scaling_factor_; float tmp = ((float*)gvar[s])[i]/scaling_factor_; ((float*)bstddev[s])[i] = 1./sqrt(tmp + gp->eps); } } #ifdef _OPENMP #pragma omp parallel #endif { #ifdef _OPENMP const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN(libxsmm_dnn_execute_st(libxsmm_handle_conv[n], LIBXSMM_DNN_COMPUTE_KIND_FWD, n*ntps, tid)); CHKERR_LIBXSMM_DNN(libxsmm_dnn_fusedbatchnorm_execute_st(libxsmm_handle_bn_test[n], LIBXSMM_DNN_COMPUTE_KIND_FWD, n*ntps, tid)); } } } void FusedConvBNXSMM::backPropagate(TensorBuf *deloutp, TensorBuf* weightp, TensorBuf *delgammap, TensorBuf *delbetap, TensorBuf *delmidp, vector& delinp, int tid) { void *deloutput[NUM_NUMA_NODES]; void *delmiddle[NUM_NUMA_NODES]; void *delinp_r[NUM_NUMA_NODES]; void *delinp_l[NUM_NUMA_NODES]; void *delgamma[NUM_NUMA_NODES]; void *delbeta[NUM_NUMA_NODES]; int nImg = fusedbn_desc_train.partN; int nIFM = gp->nInput[0]; int nOFM = gp->nOutput; int nBIfm = nIFM/VLEN; int nBOfm = nOFM/VLEN; int ofh = gp->oHeight; int ofw = gp->oWidth; int mfh = gp->mHeight; int mfw = gp->mWidth; int ifh = gp->iHeight; int ifw = gp->iWidth; int iph = gp->ipad_h; int ipw = gp->ipad_w; int oph = gp->opad_h; int opw = gp->opad_w; int mph = gp->mpad_h; int mpw = gp->mpad_w; int bsh = gp->bn_stride_h; int bsw = gp->bn_stride_w; int csh = gp->c_stride_h; int csw = gp->c_stride_w; int fhm = mfh + 2*mph; int fwm = mfw + 2*mpw; int fhi = ifh + 2*iph; int fwi = ifw + 2*ipw; int ofhp = ofh + 2*oph; int ofwp = ofw + 2*opw; deloutput[0] = deloutp->getBuffer(); delmiddle[0] = delmidp->getBuffer(); delinp_r[0] = delinp[0]->getBuffer(); delinp_l[0] = gp->eltwise ? 
delinp[1]->getBuffer() : NULL; int imoff = fusedbn_desc_train.partN * fusedbn_desc_train.C * ofhp * ofwp; if(gp->out_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) deloutput[n] = deloutput[n-1] + imoff; imoff = conv_desc.N * conv_desc.K * fhm * fwm; if(gp->out_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) delmiddle[n] = delmiddle[n-1] + imoff; imoff = conv_desc.N * conv_desc.C * fhi * fwi; if(gp->in_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->in_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) delinp_r[n] = delinp_r[n-1] + imoff; if(gp->eltwise) { imoff = fusedbn_desc_train.partN * gp->nInput[1] * fhi * fwi; if(gp->in_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->in_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) delinp_l[n] = delinp_l[n-1] + imoff; } void **gptrptr = delgammap->getBufferPtr(); void **bptrptr = delbetap->getBufferPtr(); int goffset = delgammap->getOffset() * sizeof(float); int boffset = delbetap->getOffset() * sizeof(float); for(int n=0; nnum_numa_nodes; n++) { delgamma[n] = gptrptr[n] + goffset; delbeta[n] = bptrptr[n] + boffset; } void **sptrptr = scratchp->getBufferPtr(); for(int n=0; nnum_numa_nodes; n++) { if(gp->in_data_type == DT_FLOAT) { float (* __restrict del_middle)[nBOfm][fhm][fwm][VLEN] = (float (*)[*][*][*][VLEN])delmiddle[n]; /* zero the rims in case of physical padding */ if (mph > 0 || mpw > 0) { #pragma omp parallel for for (int img = 0; img < conv_desc.N; img++) { for (int fm = 0; fm < nBOfm; fm++) { for (int w = 0; w < fwm; w++) { for (int ph = 0; ph < mph; ph++) { #ifdef __AVX512F__ _mm512_stream_ps( &(del_middle[img][fm][ph ][w][0]), 
_mm512_setzero_ps() ); _mm512_stream_ps( &(del_middle[img][fm][fhm-1-ph][w][0]), _mm512_setzero_ps() ); #else #pragma omp simd #pragma vector aligned #ifdef USE_NTS_BN #pragma vector nontemporal #endif for(int v=0; v < VLEN; v++) { del_middle[img][fm][ph][w][v] = 0.0f; del_middle[img][fm][fhm-1-ph][w][v] = 0.0f; } #endif } } for (int h = mph; h < mfh+mph; h++) { for (int pw = 0; pw < mpw; pw++) { #ifdef __AVX512F__ _mm512_stream_ps( &(del_middle[img][fm][h][pw ][0]), _mm512_setzero_ps() ); _mm512_stream_ps( &(del_middle[img][fm][h][fwm-1-pw][0]), _mm512_setzero_ps() ); #else #pragma omp simd #pragma vector aligned #ifdef USE_NTS_BN #pragma vector nontemporal #endif for(int v=0; v < VLEN; v++) { del_middle[img][fm][h][pw][v] = 0.0f; del_middle[img][fm][h][fwm-1-pw][v] = 0.0f; } #endif } } } } } } else if(gp->in_data_type == DT_BF16) { libxsmm_bfloat16 (* __restrict del_middle)[nBOfm][fhm][fwm][VLEN] = (libxsmm_bfloat16 (*)[*][*][*][VLEN])delmiddle[n]; /* zero the rims in case of physical padding */ /* @TODO, we need to do the same thing with del_input_l?! 
*/ if (iph > 0 || iph > 0) { #pragma omp parallel for for (int img = 0; img < conv_desc.N; img++) { for (int fm = 0; fm < nBOfm; fm++) { for (int w = 0; w < fwm; w++) { for (int ph = 0; ph < mph; ph++) { #pragma omp simd #pragma vector aligned #ifdef USE_NTS_BN #pragma vector nontemporal #endif for(int v=0; v < VLEN; v++) { del_middle[img][fm][ph][w][v] = 0; del_middle[img][fm][fhm-1-ph][w][v] = 0; } } } for (int h = mph; h < mfh+mph; h++) { for (int pw = 0; pw < mpw; pw++) { #pragma omp simd #pragma vector aligned #ifdef USE_NTS_BN #pragma vector nontemporal #endif for(int v=0; v < VLEN; v++) { del_middle[img][fm][h][pw][v] = 0; del_middle[img][fm][h][fwm-1-pw][v] = 0; } } } } } } } } /* Perform physical padding tests */ #ifndef NDEBUG if ( (oph > 0 || opw > 0) && (mph > 0 || mpw > 0) ) { printf("node %s: batchnorm backward input and output is padded which cannot be :-(\n", nname.c_str()); } /* check rims */ if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { check_physical_pad( nname.c_str(), (float*)delmiddle[0], conv_desc.N, nBOfm, mfh, mfw, VLEN, mph, mpw ); check_physical_pad( nname.c_str(), (float*)deloutput[0], conv_desc.N, nBOfm, ofh, ofw, VLEN, oph, opw ); } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)delmiddle[0], conv_desc.N, nBOfm, mfh, mfw, VLEN, mph, mpw ); check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)deloutput[0], conv_desc.N, nBOfm, ofh, ofw, VLEN, oph, opw ); } #endif if(!updated_scratch_bwd) { for(int n=0; nnum_numa_nodes; n++) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_scratch( libxsmm_handle_bn_train[n], sptrptr[n] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle_conv[n], LIBXSMM_DNN_COMPUTE_KIND_ALL, sptrptr[n] ) ); } updated_scratch_bwd = true; } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_deloutput[n] == NULL && libxsmm_delmiddle_bn[n] == NULL && libxsmm_delinput_add[n] == NULL) { libxsmm_layout = 
libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_bn_train[n], LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput[n] = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_bn_train[n], libxsmm_deloutput[n], LIBXSMM_DNN_GRADIENT_OUTPUT ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_bn_train[n], LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delmiddle_bn[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delmiddle[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_bn_train[n], libxsmm_delmiddle_bn[n], LIBXSMM_DNN_GRADIENT_INPUT ) ); if(gp->eltwise) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_train[n], LIBXSMM_DNN_GRADIENT_INPUT_ADD, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput_add[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delinp_l[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_bn_train[n], libxsmm_delinput_add[n], LIBXSMM_DNN_GRADIENT_INPUT_ADD ) ); } } } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_delgamma[n] == NULL && libxsmm_delbeta[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_train[n], LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_delgamma[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delgamma[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( 
libxsmm_handle_bn_train[n], libxsmm_delgamma[n], LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_train[n], LIBXSMM_DNN_GRADIENT_CHANNEL_BETA, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_delbeta[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delbeta[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_bn_train[n], libxsmm_delbeta[n], LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) ); } } /* Perform physical padding tests */ #ifndef NDEBUG if ( (oph > 0 || opw > 0) && (mph > 0 || mpw > 0) ) { printf("node %s: batchnorm backward input and output is padded which cannot be :-(\n", nname.c_str()); } /* check rims */ if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { check_physical_pad( nname.c_str(), (float*)delmiddle[0], conv_desc.N, nBOfm, mfh, mfw, VLEN, mph, mpw ); check_physical_pad( nname.c_str(), (float*)deloutput[0], conv_desc.N, nBOfm, ofh, ofw, VLEN, oph, opw ); } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)delmiddle[0], conv_desc.N, nBOfm, mfh, mfw, VLEN, mph, mpw ); check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)deloutput[0], conv_desc.N, nBOfm, ofh, ofw, VLEN, oph, opw ); } #endif for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_delinput[n] == NULL && libxsmm_delmiddle_conv[n] == NULL) { libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle_conv[n], LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput[n] = libxsmm_dnn_link_tensor(libxsmm_layout, delinp_r[n], &status ); CHKERR_LIBXSMM_DNN(status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor(libxsmm_handle_conv[n], libxsmm_delinput[n], LIBXSMM_DNN_GRADIENT_INPUT)); libxsmm_layout = 
libxsmm_dnn_create_tensor_datalayout( libxsmm_handle_conv[n], LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN(status ); libxsmm_delmiddle_conv[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delmiddle[n], &status ); CHKERR_LIBXSMM_DNN(status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_bind_tensor( libxsmm_handle_conv[n], libxsmm_delmiddle_conv[n], LIBXSMM_DNN_GRADIENT_OUTPUT ) ); } } #ifndef NDEBUG /* check physical padding */ if ( (gp->ipad_h > 0 || gp->ipad_w > 0) && (gp->mpad_h > 0 || gp->mpad_w > 0) ) { } else if ( (gp->ipad_h == 0 || gp->ipad_w == 0) && (gp->mpad_h == 0 || gp->mpad_w == 0) ) { } else { printf("node %s: conv xsmm backward is partially padded which cannot be :-(\n", nname.c_str()); } if(gp->out_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)delinp_r[0], conv_desc.N, nBIfm, ifh, ifw, 16, iph, ipw ); else if(gp->out_data_type == DT_BF16) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)delinp_r[0], conv_desc.N, nBIfm, ifh, ifw, 16, iph, ipw ); if(gp->in_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)delmiddle[0], conv_desc.N, nBOfm, mfh, mfw, 16, mph, mpw ); else if(gp->in_data_type == DT_BF16) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)delmiddle[0], conv_desc.N, nBOfm, mfh, mfw, 16, mph, mpw ); #endif #ifdef USE_XSMM_TIMING struct timeval tvsc, tvec; gettimeofday(&tvsc, NULL); #endif #ifdef _OPENMP #pragma omp parallel #endif { #ifdef _OPENMP const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle_bn_train[n], LIBXSMM_DNN_COMPUTE_KIND_BWD, n*ntps, tid ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle_conv[n], LIBXSMM_DNN_COMPUTE_KIND_BWD, n*ntps, tid ) ); #ifdef USE_MLSL #pragma omp barrier if(tid == 0) { float *dgp = (float*)delgamma[0]; float *dbp = (float*)delbeta[0]; 
for(int nn=1; nnnum_numa_nodes; nn++) { float *rdgp = (float*)delgamma[nn]; float *rdbp = (float*)delbeta[nn]; #pragma omp simd for(int i=0; inum_numa_nodes; nn++) { float *rdgp = (float*)delgamma[nn]; float *rdbp = (float*)delbeta[nn]; #pragma vector nontemporal #pragma omp simd for(int i=0; ibatch_size * (double)gp->nInput[0] * (double)gp->nOutput * (double)gp->mHeight * (double)gp->mWidth * (double)gp->kh * (double)gp->kw * 2; if(gp->c_stride_h == 1 && gp->mpad_h == 0) printf("%s XSMM-CONV-BP mb%dic%dih%doc%doh%dkh%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size, gp->nInput[0], gp->iHeight,gp->nOutput,gp->mHeight,gp->kh,bp_time*1000.0, gf/bp_time/1e9); else if(gp->c_stride_h == 2) printf("%s XSMM-CONV-BP mb%dic%dih%doc%doh%dkh%dsh%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput[0],gp->iHeight,gp->nOutput,gp->mHeight,gp->kh,gp->c_stride_h,bp_time*1000.0, gf/bp_time/1e9); else if(gp->mpad_h == 1) printf("%s XSMM-CONV-BP mb%dic%dih%doc%doh%dkh%dph%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput[0],gp->iHeight,gp->nOutput,gp->mHeight,gp->kh,gp->mpad_h,bp_time*1000.0, gf/bp_time/1e9); } #endif #ifndef NDEBUG /* check physical padding */ if ( (gp->ipad_h > 0 || gp->ipad_w > 0) && (gp->mpad_h > 0 || gp->mpad_w > 0) ) { } else if ( (gp->ipad_h == 0 || gp->ipad_w == 0) && (gp->mpad_h == 0 || gp->mpad_w == 0) ) { } else { printf("node %s: conv xsmm backward is partially padded which cannot be :-(\n", nname.c_str()); } if(gp->out_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)delinp_r[0], conv_desc.N, nBIfm, ifh, ifw, 16, iph, ipw ); else if(gp->out_data_type == DT_BF16) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)delinp_r[0], conv_desc.N, nBIfm, ifh, ifw, 16, iph, ipw ); if(gp->in_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)delmiddle[0], conv_desc.N, nBOfm, mfh, mfw, 16, mph, mpw ); else if(gp->in_data_type == DT_BF16) 
check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)delmiddle[0], conv_desc.N, nBOfm, mfh, mfw, 16, mph, mpw ); #endif } void FusedConvBNXSMM::weightUpdate(TensorBuf *inp, TensorBuf *deloutp, TensorBuf *delmidp, TensorBuf* delweightp, TensorBuf *delgammap, TensorBuf* delbetap, int tid) { int nOFM = gp->nOutput; int ofm = gp->nOutput; int ifm = gp->nInput[0]; int kh = gp->kh; int kw = gp->kw; int nBOfm = nOFM/VLEN; int ofh = gp->oHeight; int ofw = gp->oWidth; int oph = gp->opad_h; int opw = gp->opad_w; int ofhp = ofh + 2*oph; int ofwp = ofw + 2*opw; int mfh = gp->mHeight; int mfw = gp->mWidth; int mph = gp->mpad_h; int mpw = gp->mpad_w; int fhm = mfh + 2*mph; int fwm = mfw + 2*mpw; void *deloutput[NUM_NUMA_NODES]; void *delgamma[NUM_NUMA_NODES]; void *delbeta[NUM_NUMA_NODES]; void *dwt_ptr[NUM_NUMA_NODES]; void *delmiddle[NUM_NUMA_NODES]; if(!gp->bprop) { deloutput[0] = deloutp->getBuffer(); int imoff = fusedbn_desc_train.partN * fusedbn_desc_train.C * ofhp * ofwp; if(gp->out_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->out_data_type == DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) deloutput[n] = deloutput[n-1] + imoff; } void **ptrptr = delweightp->getBufferPtr(); int offset = delweightp->getOffset(); if(gp->in_data_type == DT_FLOAT) offset = offset*sizeof(float); else if(gp->in_data_type == DT_BF16) offset = offset*sizeof(libxsmm_bfloat16); for(int n=0; nnum_numa_nodes; n++) dwt_ptr[n] = ptrptr[n] + offset; if(!gp->bprop) { void **gptrptr = delgammap->getBufferPtr(); void **bptrptr = delbetap->getBufferPtr(); int goffset = delgammap->getOffset() * sizeof(float); int boffset = delbetap->getOffset() * sizeof(float); for(int n=0; nnum_numa_nodes; n++) { delgamma[n] = gptrptr[n] + goffset; delbeta[n] = bptrptr[n] + boffset; } } delmiddle[0] = delmidp->getBuffer(); int imoff = conv_desc.N * conv_desc.K * fhm * fwm; if(gp->out_data_type == DT_FLOAT) imoff = imoff * sizeof(float); else if(gp->out_data_type == 
DT_BF16) imoff = imoff * sizeof(libxsmm_bfloat16); for(int n=1; nnum_numa_nodes; n++) delmiddle[n] = delmiddle[n-1] + imoff; void **sptrptr = scratchp->getBufferPtr(); if(!updated_scratch_upd) { for(int n=0; nnum_numa_nodes; n++) { if(!gp->bprop) CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_scratch( libxsmm_handle_bn_train[n], sptrptr[n] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle_conv[n], LIBXSMM_DNN_COMPUTE_KIND_ALL, sptrptr[n] ) ); } updated_scratch_upd = true; } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_delfilter[n] == NULL) { libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle_conv[n], LIBXSMM_DNN_GRADIENT_FILTER, &status ); CHKERR_LIBXSMM_DNN(status ); libxsmm_delfilter[n] = libxsmm_dnn_link_tensor( libxsmm_layout, dwt_ptr[n], &status ); CHKERR_LIBXSMM_DNN(status); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_bind_tensor( libxsmm_handle_conv[n], libxsmm_delfilter[n], LIBXSMM_DNN_GRADIENT_FILTER ) ); } } for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_delmiddle_conv[n] == NULL) { libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle_conv[n], LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delmiddle_conv[n] = libxsmm_dnn_link_tensor(libxsmm_layout, delmiddle[n], &status ); CHKERR_LIBXSMM_DNN(status); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN(libxsmm_dnn_bind_tensor( libxsmm_handle_conv[n], libxsmm_delmiddle_conv[n], LIBXSMM_DNN_GRADIENT_OUTPUT ) ); } } if(!gp->bprop) { for(int n=0; nnum_numa_nodes; n++) { if(libxsmm_deloutput[n] == NULL && libxsmm_delmiddle_bn[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_bn_train[n], LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput[n] = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput[n], &status ); CHKERR_LIBXSMM_DNN( status ); 
libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_bn_train[n], libxsmm_deloutput[n], LIBXSMM_DNN_GRADIENT_OUTPUT ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout( libxsmm_handle_bn_train[n], LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delmiddle_bn[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delmiddle[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_bn_train[n], libxsmm_delmiddle_bn[n], LIBXSMM_DNN_GRADIENT_INPUT ) ); } if(libxsmm_delgamma[n] == NULL && libxsmm_delbeta[n] == NULL) { libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_train[n], LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_delgamma[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delgamma[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_bn_train[n], libxsmm_delgamma[n], LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) ); libxsmm_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(libxsmm_handle_bn_train[n], LIBXSMM_DNN_GRADIENT_CHANNEL_BETA, &status); CHKERR_LIBXSMM_DNN( status ); libxsmm_delbeta[n] = libxsmm_dnn_link_tensor( libxsmm_layout, delbeta[n], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_bind_tensor( libxsmm_handle_bn_train[n], libxsmm_delbeta[n], LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) ); } } } #ifndef NDEBUG /* check physical padding */ if ( (gp->ipad_h > 0 || gp->ipad_w > 0) && (gp->mpad_h > 0 || gp->mpad_w > 0) ) { } else if ( (gp->ipad_h == 0 || gp->ipad_w == 0) && (gp->mpad_h == 0 || gp->mpad_w == 0) ) { } else { printf("node %s: conv 
xsmm backward is partially padded which cannot be :-(\n", nname.c_str()); } if(gp->in_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)delmiddle[0], conv_desc.N, nBOfm, mfh, mfw, 16, mph, mpw); else if(gp->in_data_type == DT_BF16) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)delmiddle[0], conv_desc.N, nBOfm, mfh, mfw, 16, mph, mpw); #endif #ifdef USE_XSMM_TIMING__ struct timeval tvsc, tvec; gettimeofday(&tvsc, NULL); #endif if(!gp->bprop) { #ifdef _OPENMP #pragma omp parallel #endif { #ifdef _OPENMP const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_fusedbatchnorm_execute_st( libxsmm_handle_bn_train[n], LIBXSMM_DNN_COMPUTE_KIND_BWD, n*ntps, tid ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle_conv[n], LIBXSMM_DNN_COMPUTE_KIND_UPD, n*ntps, tid ) ); #ifdef USE_MLSL #pragma omp barrier if(gp->in_data_type == DT_FLOAT) { #include "reduce_weight_grads.c" } else if(gp->in_data_type == DT_BF16) { #include "reduce_weight_grads_bf16.c" } #pragma omp barrier if(tid == 0) { float *dgp = (float*)delgamma[0]; float *dbp = (float*)delbeta[0]; for(int nn=1; nnnum_numa_nodes; nn++) { float *rdgp = (float*)delgamma[nn]; float *rdbp = (float*)delbeta[nn]; #pragma omp simd for(int i=0; inum_threads/gp->num_numa_nodes; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle_conv[n], LIBXSMM_DNN_COMPUTE_KIND_UPD, n*ntps, tid ) ); #ifdef USE_MLSL #pragma omp barrier if(gp->in_data_type == DT_FLOAT) { #include "reduce_weight_grads.c" } else if(gp->in_data_type == DT_BF16) { #include "reduce_weight_grads_bf16.c" } #endif } } #ifdef USE_XSMM_TIMING__ gettimeofday(&tvec, NULL); double wu_time = (tvec.tv_sec + tvec.tv_usec*1e-6) - (tvsc.tv_sec + tvsc.tv_usec*1e-6); #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif { double gf = (double)gp->batch_size * (double)gp->nInput[0] * 
(double)gp->nOutput * (double)gp->mHeight * (double)gp->mWidth * (double)gp->kh * (double)gp->kw * 2; if(gp->c_stride_h == 1 && gp->mpad_h == 0) printf("%s XSMM-CONV-WU mb%dic%dih%doc%doh%dkh%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput[0],gp->iHeight,gp->nOutput,gp->mHeight,gp->kh,wu_time*1000.0, gf/wu_time/1e9); else if(gp->c_stride_h == 2) printf("%s XSMM-CONV-WU mb%dic%dih%doc%doh%dkh%dsh%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput[0],gp->iHeight,gp->nOutput,gp->mHeight,gp->kh,gp->c_stride_h,wu_time*1000.0, gf/wu_time/1e9); else if(gp->mpad_h == 1) printf("%s XSMM-CONV-WU mb%dic%dih%doc%doh%dkh%dph%dn time = %g ms, GFLOPS = %.1f\n",gp->node_name.c_str(),gp->batch_size,gp->nInput[0],gp->iHeight,gp->nOutput,gp->mHeight,gp->kh,gp->mpad_h,wu_time*1000.0, gf/wu_time/1e9); } #endif #ifndef NDEBUG /* check physical padding */ if(gp->in_data_type == DT_FLOAT) check_physical_pad( nname.c_str(), (float*)delmiddle[0], conv_desc.N, nBOfm, mfh, mfw, 16, mph, mpw); else if(gp->in_data_type == DT_BF16) check_physical_pad( nname.c_str(), (libxsmm_bfloat16*)delmiddle[0], conv_desc.N, nBOfm, mfh, mfw, 16, mph, mpw); #endif } void FusedConvBNXSMM::dumpBuffer(TensorBuf* tBuf, void* wtemp) { int buftype = tBuf->getBufferType(); if(buftype == DATA) { CHKERR_LIBXSMM_DNN(libxsmm_dnn_copyout_tensor(libxsmm_checkpoint_filter, wtemp, LIBXSMM_DNN_TENSOR_FORMAT_KCRS)); } else if(buftype == HISTORY) CHKERR_LIBXSMM_DNN(libxsmm_dnn_copyout_tensor(libxsmm_checkpoint_history_filter, wtemp, LIBXSMM_DNN_TENSOR_FORMAT_KCRS)); } libxsmm-1.17/samples/deeplearning/gxm/src/ImageData.cpp000066400000000000000000000356421415223013700231300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #include #include "string.h" #include #include #include #include "ImageData.hpp" #include "check.hpp" using namespace std; using namespace gxm; ImageDataNode::ImageDataNode(ImageDataParams* p, MLEngine* e) : NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = false; bot_compute_engine_ = LOOP; //Create output tensor for(int i=0; isetOwner(this); tenTopData_.push_back(tenTop_[i]->getBuf(DATA)); if(NNNode::top_[i].compare("data") == 0) { tenTop_[i]->setType(INPUT); int dtype = p->get_data_type(); tenTopData_[i]->setDataType(dtype); tenTopData_[i]->setBufferType(DATA); Shape tts; shape_setzero(&tts); tts.ndims = 4; tts.dims[0] = p->get_batch_size(); tts.dims[1] = p->get_channels(); vector v = p->get_crop_sizes(); tts.dims[2] = v[0]; tts.dims[3] = v[1]; tenTop_[i]->setShape(&tts); long long int size = 1; for(int j=0; jsetBufferSize(size); // Register output tensor in tensorMap bool inserted = e->register_tensor(NNNode::top_[i], INPUT, tenTop_[i]); if(!inserted) printf("Warning: Tensor %s already registered\n",NNNode::top_[i].c_str()); } else if(top_[i].compare("label") == 0) { tenTop_[i]->setType(LABEL); int dtype = p->get_label_data_type(); tenTopData_[i]->setDataType(dtype); tenTopData_[i]->setBufferType(DATA); Shape tts; shape_setzero(&tts); tts.ndims = 1; tts.dims[0] = p->get_batch_size(); tenTop_[i]->setShape(&tts); long long int size = 1; for(int j=0; jsetBufferSize(size); // Register output tensor in tensorMap bool inserted = e->register_tensor(NNNode::top_[i], LABEL, tenTop_[i]); if(!inserted) printf("Warning: Tensor %s already 
registered\n",NNNode::top_[i].c_str()); } } // If training mode, setup training and validation data files, else only latter int mode = p->get_mode(); ap.mirror = p->get_mirror(); ap.vignette = p->get_vignette(); ap.color_bump = p->get_color_bump(); train_source_path_ = p->get_train_source_path(); test_source_path_ = p->get_test_source_path(); num_machines_ = e->get_num_machines(); num_epochs_ = e->get_num_epochs(); batch_size_ = p->get_batch_size(); global_batch_size_ = batch_size_ * num_machines_; e->set_batch_size(batch_size_); gparams_.channels = p->get_channels(); gparams_.crop_sizes = p->get_crop_sizes(); gparams_.orig_sizes = p->get_orig_sizes(); gparams_.batch_size = batch_size_; gparams_.threads = e->get_num_threads(); gparams_.lookahead = p->get_lookahead(); gparams_.mean_values = p->get_mean_values(); gparams_.scale_values = p->get_scale_values(); gparams_.test_views = p->get_num_test_views(); jitters_ = p->get_jitters(); current_epoch_ = 0; ctrain_pf_mb_ = 0; ctest_pf_mb_ = 0; ctrain_proc_mb_ = 0; ctest_proc_mb_ = 0; curr_test_view_ = 0; full_train_prefetch_ = true; full_test_prefetch_ = true; eptr = e; global_node_id_ = e->get_global_node_id(); if(mode == TRAIN) { num_train_files_ = p->get_num_train_files(); createImageList(train_list_, p->get_train_img_info(), num_train_files_); #ifdef DUMP_DATA train_batches_ = 1; #else train_batches_ = num_train_files_ % global_batch_size_ > 0 ? (((int)(num_train_files_/global_batch_size_)) + 1) : num_train_files_/global_batch_size_; #endif e->set_num_train_batches(train_batches_); setupTrainIndices(); num_test_files_ = p->get_num_test_files(); createImageList(test_list_, p->get_test_img_info(), num_test_files_); test_batches_ = num_test_files_ % global_batch_size_ > 0 ? 
(((int)(num_test_files_/global_batch_size_)) + 1) : num_test_files_/global_batch_size_; e->set_num_test_batches(test_batches_); e->set_num_test_views(gparams_.test_views); setupTestIndices(); } else if(mode == TEST) { num_test_files_ = p->get_num_test_files(); createImageList(test_list_, p->get_test_img_info(), num_test_files_); test_batches_ = num_test_files_/global_batch_size_; e->set_num_test_batches(test_batches_); e->set_num_test_views(gparams_.test_views); setupTestIndices(); } labels_.resize(gparams_.lookahead * gparams_.batch_size); // Allocate temporary buffer to hold 1 image with maximal original size int len = gparams_.batch_size * gparams_.channels * gparams_.orig_sizes[0] * gparams_.orig_sizes[1]; // Size of input buffer = batch_size * channels * height * width * lookahead * sizeof(char) len = len * gparams_.lookahead; tempbuf_ = (unsigned char*)_mm_malloc(len, 64); memset((unsigned char*)tempbuf_, 0, len); #ifdef USE_MLSL MLSL::Session *s = e->get_session(); s->SetGlobalMinibatchSize(global_batch_size_); #endif configure(RGB_FLATFILE); } void ImageDataNode::configure(int dataType) { switch(dataType) { case RGB_FLATFILE: if(num_machines_ == 1) srand(727); else srand(global_node_id_); impl = new ImageDataRGBFlat(&gparams_, &ap); break; } } void ImageDataNode::setupTrainIndices() { int ntrain = num_train_files_; if(ntrain <= batch_size_) { ntrain = batch_size_; gparams_.lookahead = 1; // Override default lookahead, if any } t_files_ = ntrain % global_batch_size_ > 0 ? 
(((int)(ntrain/global_batch_size_)) + 1)*global_batch_size_ : ntrain; tfiles_per_mc_ = t_files_/num_machines_; train_imginfo_index_.resize(tfiles_per_mc_); vector tfile_index(t_files_); for(int i=0; i= ntrain) tfile_index[i] = tfile_index[i-ntrain]; else tfile_index[i] = i; } random_shuffle(tfile_index.begin(), tfile_index.end()); int k=0; for(int n1=0; n1 < train_batches_; n1++) for(int n2=0; n2 < batch_size_; n2++) train_imginfo_index_[k++] = tfile_index[n1*global_batch_size_ + n2*num_machines_ + global_node_id_]; } void ImageDataNode::setupTestIndices() { int ntest = num_test_files_; if(ntest <= batch_size_) { ntest = batch_size_; gparams_.lookahead = 1; // Override default lookahead, if any } v_files_ = ntest % global_batch_size_ > 0 ? (((int)(ntest/global_batch_size_)) + 1)*global_batch_size_ : ntest; vfiles_per_mc_ = v_files_/num_machines_; vector temp_index(v_files_); test_imginfo_index_.resize(vfiles_per_mc_); for(int i=0; i= ntest) temp_index[i] = temp_index[i-ntest]; else temp_index[i] = i; } for(int n=0; n& list, string infofile, int nfiles) { FILE *f = fopen(infofile.c_str(), "r"); ImageInfo ii; for(int i=0; igetBuffer()); int* toplabel = (int*)(tenTopData_[1]->getBuffer()); #ifdef DEBUG printf("Executing FP %s: Data %p, Label %p\n", NNNode::nname_.c_str(),topdata, toplabel); #endif int em = eptr->get_execution_mode(); gparams_.exec_mode = em; current_epoch_ = eptr->get_current_epoch(); if(em == TRAIN) { if(full_train_prefetch_) { for(int i=0; iforwardPropagate((unsigned char*)&tempbuf_[mbslot*orig_img_size], topdata); #if 0 printf("mblsot %d, orig_img_size %d\n",mbslot,orig_img_size); #endif #ifdef GETSTATS #ifdef USE_MLSL size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); if(node_id == 0 && eptr->get_current_batch() % STATFREQ == 0) #endif { int crop_img_size = gparams_.crop_sizes[0]*gparams_.crop_sizes[1]*gparams_.channels; MeanOfLayer("Data", topdata, gparams_.batch_size*crop_img_size); } #endif #ifdef DEBUG int crop_img_size = 
gparams_.crop_sizes[0]*gparams_.crop_sizes[1]*gparams_.channels; double sum=0.; for(int i=0; i max) max = toplabel[i]; if((long long int)toplabel[i] < min) min = toplabel[i]; } mean = (double)isum/(double)gparams_.batch_size; for(int i=0; iforwardPropagate((unsigned char*)&tempbuf_[mbslot*orig_img_size], curr_test_view_, topdata); curr_test_view_++; #ifdef DEBUG printf("tv = %d\n",curr_test_view_); #endif if(curr_test_view_ == gparams_.test_views) { curr_test_view_ = 0; ctest_proc_mb_++; if(ctest_proc_mb_ == test_batches_) { ctest_pf_mb_ = 0; ctest_proc_mb_ = 0; full_test_prefetch_ = true; } } } #ifdef DUMP_DATA int crop_size = gparams_.batch_size * gparams_.crop_sizes[0] * gparams_.crop_sizes[1] * gparams_.channels; string fname = NNNode::nname_ + "_fp_out"; FILE *f = fopen(fname.c_str(), "w"); for(int i=0; i #include void ImageDataRGBFlat::processTrainMinibatch(unsigned char *inp, float *outp) { int nImg = gp->batch_size; int nOfm = gp->channels; int ofh = gp->crop_sizes[0]; int ofw = gp->crop_sizes[1]; int ifh = gp->orig_sizes[0]; int ifw = gp->orig_sizes[1]; unsigned char (* __restrict input)[ifh][ifw][nOfm] = (unsigned char (*)[*][*][*])inp; float (* __restrict output)[nOfm][ofh][ofw] = (float (*)[*][*][*])outp; #ifdef _OPENMP #pragma omp parallel for collapse(3) #endif for(int img = 0; img < nImg; img++) { for(int ofm = 0; ofm < nOfm; ofm++) { for(int h = 0; h < ofh; h++) { for(int w = 0; w < ofw; w++) { int r_off = r_offset[img]; int c_off = c_offset[img]; if((augmentation[img] < 6) && (ap->mirror == true)) output[img][ofm][h][ofw-w-1] = ((float)input[img][h+r_off][w+c_off][ofm] - gp->mean_values[ofm])*gp->scale_values[0]; else output[img][ofm][h][w] = ((float)input[img][h+r_off][w+c_off][ofm] - gp->mean_values[ofm])*gp->scale_values[0]; } } } } } void ImageDataRGBFlat::processTestMinibatch(unsigned char *inp, int tv, float *outp) { int nImg = gp->batch_size; int nOfm = gp->channels; int ofh = gp->crop_sizes[0]; int ofw = gp->crop_sizes[1]; int ifh = 
gp->orig_sizes[0]; int ifw = gp->orig_sizes[1]; unsigned char (* __restrict input)[ifh][ifw][nOfm] = (unsigned char (*)[*][*][*])inp; float (* __restrict output)[nOfm][ofh][ofw] = (float (*)[*][*][*])outp; int tv2 = tv/2; #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; imean_values[ofm])*gp->scale_values[0]; } } } } } void ImageDataRGBFlat::forwardPropagate(unsigned char *inp, float *outp) { int em = gp->exec_mode; assert(em == TRAIN); for(int i=0; ibatch_size; i++) { r_offset[i] = lrand48() % (gp->orig_sizes[0] - gp->crop_sizes[0] + 1); c_offset[i] = lrand48() % (gp->orig_sizes[1] - gp->crop_sizes[1] + 1); augmentation[i] = lrand48() % 12; } processTrainMinibatch(inp, outp); } void ImageDataRGBFlat::forwardPropagate(unsigned char *inp, int tv, float *outp) { int em = gp->exec_mode; assert(em == TEST); for(int i=0; ibatch_size; i++) { r_offset[i] = lrand48() % (gp->orig_sizes[0] - gp->crop_sizes[0] + 1); c_offset[i] = lrand48() % (gp->orig_sizes[1] - gp->crop_sizes[1] + 1); } processTestMinibatch(inp, tv, outp); } libxsmm-1.17/samples/deeplearning/gxm/src/JitterData.cpp000066400000000000000000000547001415223013700233430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #include "JitterData.hpp" using namespace std; using namespace gxm; JitterDataNode::JitterDataNode(JitterDataParams* p, MLEngine* e) : NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = false; bot_compute_engine_ = p->get_compute_engine(); //Create output tensor tenTop_.resize(top_.size()); tenTopData_.resize(top_.size()); vector vc = p->get_crop_sizes(); vector vo = p->get_orig_sizes(); assert(vo.size() > 0); for(int i=0; isetOwner(this); tenTopData_[i] = tenTop_[i]->getBuf(DATA); if(top_[i].compare("data") == 0) { tenTop_[i]->setType(INPUT); int dtype = p->get_data_type(); tenTopData_[i]->setDataType(dtype); tenTopData_[i]->setBufferType(DATA); Shape tts; shape_setzero(&tts); tts.ndims = 4; tts.dims[0] = p->get_batch_size(); tts.dims[1] = p->get_channels(); if(vc.size() > 0) { tts.dims[2] = vc[0]; tts.dims[3] = vc[1]; } else { tts.dims[2] = vo[0]; tts.dims[3] = vo[1]; } tenTop_[i]->setShape(&tts); long long int size = tts.dims[0] * tts.dims[1]; bool phys_padding = p->get_physical_padding(); if(phys_padding) { gparams_.pad_h = p->get_pad_h(); gparams_.pad_w = p->get_pad_w(); size = size * (tts.dims[2] + 2*gparams_.pad_h) * (tts.dims[3] + 2*gparams_.pad_w); } else { gparams_.pad_h = 0; gparams_.pad_w = 0; size = size * tts.dims[2] * tts.dims[3]; } if(dtype == DT_FLOAT) size = size*sizeof(float); else if(dtype == DT_BF16) size = size*sizeof(float); tenTopData_[i]->setBufferSize(size); // Register output tensor in tensorMap bool inserted = e->register_tensor(top_[i], INPUT, tenTop_[i]); if(!inserted) printf("Warning: Tensor %s already registered\n",NNNode::top_[i].c_str()); } else if(top_[i].compare("label") == 0) { tenTop_[i]->setType(LABEL); int dtype = p->get_label_data_type(); tenTopData_[i]->setDataType(dtype); tenTopData_[i]->setBufferType(DATA); Shape tts; 
shape_setzero(&tts); tts.ndims = 1; tts.dims[0] = p->get_batch_size(); tenTop_[i]->setShape(&tts); long long int size = 1; for(int j=0; jsetBufferSize(size); // Register output tensor in tensorMap bool inserted = e->register_tensor(top_[i], LABEL, tenTop_[i]); if(!inserted) printf("Warning: Tensor %s already registered\n",NNNode::top_[i].c_str()); } } // If training mode, setup training and validation data files, else only latter int mode = p->get_mode(); ap.mirror = p->get_mirror(); ap.vignette = p->get_vignette(); ap.color_bump = p->get_color_bump(); train_source_path_ = p->get_train_source_path(); test_source_path_ = p->get_test_source_path(); train_list_path_ = p->get_train_list_path(); test_list_path_ = p->get_test_list_path(); numsplits_ = p->get_numsplits(); num_machines_ = e->get_num_machines(); num_epochs_ = e->get_num_epochs(); duplicates_ = num_machines_ == 1 ? 1 : num_machines_/numsplits_; batch_size_ = p->get_batch_size(); global_batch_size_ = batch_size_ * num_machines_; e->set_batch_size(batch_size_); gparams_.channels = p->get_channels(); gparams_.orig_sizes = vo; gparams_.crop_sizes = vc.size() > 0 ? 
vc : vo; gparams_.batch_size = batch_size_; gparams_.threads = e->get_num_threads(); gparams_.lookahead = p->get_lookahead(); if(p->get_mean_values().size() > 0) gparams_.mean_values = p->get_mean_values(); else if(p->get_mean_file().size() > 0) gparams_.mean_file = p->get_mean_file(); gparams_.scale_values = p->get_scale_values(); gparams_.test_views = p->get_num_test_views(); gparams_.scalejittering_min = p->get_jitter_min(); gparams_.scalejittering_max = p->get_jitter_max(); gparams_.min_percent_area = p->get_percent_min_area(); gparams_.max_percent_area = p->get_percent_max_area(); gparams_.min_aspect_ratio = p->get_min_aspect_ratio(); gparams_.max_aspect_ratio = p->get_max_aspect_ratio(); gparams_.test_smaller_side = p->get_test_smaller_side(); gparams_.shuffle = p->get_shuffle_flag(); current_epoch_ = 0; ctrain_pf_mb_ = 0; ctest_pf_mb_ = 0; ctrain_proc_mb_ = 0; ctest_proc_mb_ = 0; curr_test_view_ = 0; full_train_prefetch_ = true; full_test_prefetch_ = true; eptr_ = e; global_node_id_ = e->get_global_node_id(); tempbuf_.resize(gparams_.lookahead); cropbuf_.resize(gparams_.lookahead); for(int i=0; i < gparams_.lookahead; i++) { tempbuf_[i].resize(gparams_.batch_size); cropbuf_[i].resize(gparams_.batch_size); } if(mode == TRAIN) { num_train_files_ = p->get_num_train_files(); int factor = batch_size_ * duplicates_; train_batches_ = (num_train_files_ + factor - 1)/factor; train_files_ = train_batches_ * factor; train_files_per_mc_ = train_files_/duplicates_; train_list_per_mc_.resize(train_files_per_mc_); e->set_num_train_batches(train_batches_); num_test_files_ = p->get_num_test_files(); test_batches_ = (num_test_files_ + factor - 1)/factor; test_files_ = test_batches_ * factor; test_files_per_mc_ = test_files_/duplicates_; test_list_per_mc_.resize(test_files_per_mc_); e->set_num_test_batches(test_batches_); e->set_num_test_views(gparams_.test_views); } else if(mode == TEST) { int factor = batch_size_ * duplicates_; num_test_files_ = p->get_num_test_files(); 
test_batches_ = (num_test_files_ + factor - 1)/factor; test_files_ = test_batches_ * factor; test_files_per_mc_ = test_files_/duplicates_; test_list_per_mc_.resize(test_files_per_mc_); e->set_num_test_batches(test_batches_); e->set_num_test_views(gparams_.test_views); } #ifdef USE_MLSL MLSL::Session *s = e->get_session(); s->SetGlobalMinibatchSize(global_batch_size_); #endif r_offset = new long long int[gparams_.batch_size*60]; c_offset = new long long int[gparams_.batch_size*60]; augmentation = new int[gparams_.batch_size]; drand1 = new double[gparams_.batch_size*60]; drand2 = new double[gparams_.batch_size*60]; drand3 = new double[gparams_.batch_size*60]; if(mode == TRAIN) setupTrainIndices(); setupTestIndices(); labels_.resize(gparams_.lookahead); for(int i=0; i < gparams_.lookahead; i++) labels_[i].resize(gparams_.batch_size); } void JitterDataNode::setupTrainIndices() { std::ifstream infile(train_list_path_.c_str()); string line; train_list_.resize(num_train_files_); int idx=0; while (std::getline(infile, line) && idx < num_train_files_) { size_t pos = line.find_last_of(' '); int label = atoi(line.substr(pos + 1).c_str()); train_list_[idx] = std::make_pair(line.substr(0, pos), label); idx++; } #ifdef USE_MLSL size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); if(node_id == 0) #endif printf("Read %d training filenames from list\n",idx); train_file_index_.resize(train_files_); for(int i=0; i= num_train_files_) train_file_index_[i] = train_file_index_[i-num_train_files_]; else train_file_index_[i] = i; } srand(727); } void JitterDataNode::setupTestIndices() { std::ifstream infile(test_list_path_.c_str()); string line; int idx=0; test_list_.resize(num_test_files_); while (std::getline(infile, line) && idx < num_test_files_) { size_t pos = line.find_last_of(' '); int label = atoi(line.substr(pos + 1).c_str()); test_list_[idx] = std::make_pair(line.substr(0, pos), label); idx++; } test_file_index_.resize(test_files_); for(int i=0; i= num_test_files_) 
test_file_index_[i] = test_file_index_[i-num_test_files_]; else test_file_index_[i] = i; } idx = global_node_id_%duplicates_; for(int n=0; n= gparams_.crop_sizes[0]); assert(scalejittering_max >= gparams_.crop_sizes[0]); if(eptr_->get_execution_mode() == TRAIN) { curr_scalejittering = r_offset[ridx] % (scalejittering_max-scalejittering_min+1) + scalejittering_min; ridx++; } else curr_scalejittering = (scalejittering_max+scalejittering_min)/2; if(img_height < img_width) { jittered_img_height = curr_scalejittering; jittered_img_width = (int)((float)jittered_img_height*(float)img_width/(float)img_height); } else { jittered_img_width = curr_scalejittering; jittered_img_height = (int)((float)jittered_img_width*(float)img_height/(float)img_width); } cv::resize( cv_img, jittered_cv_img, cv::Size(jittered_img_width, jittered_img_height), 0 , 0, cv::INTER_CUBIC ); img_height = jittered_cv_img.rows; img_width = jittered_cv_img.cols; // /* We only do random crop when we do training.*/ if (eptr_->get_execution_mode() == TRAIN) { *h_off = r_offset[ridx] % (img_height - gparams_.crop_sizes[0] + 1); *w_off = c_offset[ridx] % (img_width - gparams_.crop_sizes[1] + 1); ridx++; } else { *h_off = (img_height - gparams_.crop_sizes[0]) / 2; *w_off = (img_width - gparams_.crop_sizes[1]) / 2; } cv::Rect roi(*w_off, *h_off, gparams_.crop_sizes[1], gparams_.crop_sizes[0]); cv_cropped_img = jittered_cv_img(roi); } void JitterDataNode::cropTorch(const cv::Mat& cv_img, cv::Mat& cv_cropped_img, int *h_off, int *w_off) { int img_channels = cv_img.channels(); int img_height = cv_img.rows; int img_width = cv_img.cols; float min_percent_area = gparams_.min_percent_area; float max_percent_area = gparams_.max_percent_area; float min_aspect_ratio = gparams_.min_aspect_ratio; float max_aspect_ratio = gparams_.max_aspect_ratio; #ifdef _OPENMP int ridx=omp_get_thread_num()*60; int didx=omp_get_thread_num()*60; #else int ridx=0; int didx=0; #endif cv_cropped_img = cv_img; if(gparams_.crop_sizes[0] == 
img_height && gparams_.crop_sizes[1] == img_width) return; if(eptr_->get_execution_mode() == TRAIN) { float area = img_height*img_width; for(int attempt = 0; attempt < 60; attempt++) { float target_area = ((max_percent_area-min_percent_area)*((float)drand1[didx]) + min_percent_area)*area; float aspect_ratio = ((max_aspect_ratio-min_aspect_ratio)*((float)drand2[didx]) + min_aspect_ratio); int tmp_w = 0, tmp_h = 0; tmp_w = round(sqrt(target_area * aspect_ratio)); tmp_h = round(sqrt(target_area /aspect_ratio)); if((float)drand3[didx] < 0.5) { tmp_w += tmp_h; tmp_h = tmp_w - tmp_h; tmp_w -= tmp_h; } didx++; if( tmp_w < img_width && tmp_h < img_height) { int rw = img_width - tmp_w + 1; int rh = img_height - tmp_h + 1; *w_off = c_offset[ridx] % rw; *h_off = r_offset[ridx] % rh; ridx++; cv::Rect roi(*w_off, *h_off, tmp_w, tmp_h); cv_cropped_img = cv_img(roi); cv::resize(cv_cropped_img, cv_cropped_img, cv::Size(gparams_.crop_sizes[0], gparams_.crop_sizes[1]), 0 , 0, cv::INTER_CUBIC ); return; } } // Fall back printf("falling back to VGG jittering method\n"); cropVGG(cv_img, cv_cropped_img, h_off, w_off); } else { int jittered_img_width = 0, jittered_img_height = 0; int curr_scalejittering = gparams_.test_smaller_side; cv::Mat jittered_cv_img; assert(curr_scalejittering >= gparams_.crop_sizes[0]); if(img_height < img_width) { jittered_img_height = curr_scalejittering; jittered_img_width = (int)((float)jittered_img_height*(float)img_width/(float)img_height); } else { jittered_img_width = curr_scalejittering; jittered_img_height = (int)((float)jittered_img_width*(float)img_height/(float)img_width); } cv::resize( cv_img, jittered_cv_img, cv::Size(jittered_img_width, jittered_img_height), 0 , 0, cv::INTER_CUBIC ); *h_off = (jittered_img_height - gparams_.crop_sizes[0]) / 2; *w_off = (jittered_img_width - gparams_.crop_sizes[1]) / 2; cv::Rect roi(*w_off, *h_off, gparams_.crop_sizes[1], gparams_.crop_sizes[0]); cv_cropped_img = jittered_cv_img(roi); } } void 
JitterDataNode::imageTransform(vector& vcrop, float* outp) { int nImg = gparams_.batch_size; int nOfm = gparams_.channels; int ofh = gparams_.crop_sizes[0]; int ofw = gparams_.crop_sizes[1]; int padh = gparams_.pad_h; int padw = gparams_.pad_w; int ofhp = ofh + 2*padh; int ofwp = ofw + 2*padw; vector& mean = gparams_.mean_values; vector rscale; rscale.resize(gparams_.scale_values.size()); for(int i=0; i < nOfm; i++) rscale[i] = 1./gparams_.scale_values[i]; #ifdef _OPENMP #pragma omp parallel for #endif for(int img = 0; img < nImg; img++) { for(int h=0; h < ofh; h++) { const unsigned char* ptr = vcrop[img].ptr(h); int img_index = 0; for(int w = 0; w < ofw; w++) { for(int ofm = 0; ofm < nOfm; ofm++) { //assert(vcrop[img].channels() == nOfm); int out_idx; int oh = h+padh; int ow = w+padw; float inp; int fm; if(ofm < 3) { inp = static_cast(ptr[img_index++]); fm = (gparams_.scale_values.size() == 1) ? 0 : ofm; } if(gparams_.exec_mode == TRAIN) { if((augmentation[img] < 6) && (ap.mirror == true)) out_idx = img * ofhp * ofwp * nOfm + oh * ofwp * nOfm + (ofwp-ow-1) * nOfm + ofm; else out_idx = img * ofhp * ofwp * nOfm + oh * ofwp * nOfm + ow * nOfm + ofm; } else out_idx = img * ofhp * ofwp * nOfm + oh * ofwp * nOfm + ow * nOfm + ofm; if(ofm == 3) outp[out_idx] = 0.0; else outp[out_idx] = (inp - mean[ofm]) * rscale[fm]; } } } } } int myrandom (int i) { return std::rand()%i;} /* it's fine to alias in and out */ void JitterDataNode::convert_f32_bf16(float* in, libxsmm_bfloat16* out, unsigned int len) { unsigned int i = 0; #pragma omp parallel for private(i) for ( i = 0; i < len; i+=16 ) { __m512 vfp32 = gxm_fp32_to_bfp16_rne_adjustment_avx512f(_mm512_loadu_ps(in + i)); __m256i vbfp16 = gxm_fp32_to_bfp16_truncate_avx512f(vfp32); _mm256_storeu_si256( (__m256i*)(out+i), vbfp16 ); } } void JitterDataNode::forwardPropagate() { int nImg = gparams_.batch_size; int nOfm = gparams_.channels; int ofh = gparams_.crop_sizes[0]; int ofw = gparams_.crop_sizes[1]; int padh = gparams_.pad_h; 
int padw = gparams_.pad_w; int ofhp = ofh + 2*padh; int ofwp = ofw + 2*padw; int out_dtype = tenTopData_[0]->getDataType(); float *topdata = (float*)(tenTopData_[0]->getBuffer()); int* toplabel = (int*)(tenTopData_[1]->getBuffer()); if(first_fp) { int size = nImg * nOfm * ofhp *ofwp; #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; iget_execution_mode(); gparams_.exec_mode = em; current_epoch_ = eptr_->get_current_epoch(); if(em == TRAIN) { if(full_train_prefetch_) { if(gparams_.shuffle) random_shuffle(train_file_index_.begin(), train_file_index_.end(), myrandom); int idx = global_node_id_ % duplicates_; for(int n=0; nsetLPBuffer(bf16_img); } convert_f32_bf16(topdata, (libxsmm_bfloat16*)bf16_img, crop_img_size); } #ifdef GETSTATS if(global_node_id_ == 0) { MeanOfLayer("Data", topdata, crop_img_size); MeanOfLayer("Labels", toplabel, gparams_.batch_size); } #endif ctrain_proc_mb_++; if(ctrain_proc_mb_ == train_batches_) { ctrain_pf_mb_ = 0; ctrain_proc_mb_ = 0; full_train_prefetch_ = true; } } else if(em == TEST || em == VAL) { if(full_test_prefetch_) { for(int i=0; isetLPBuffer(bf16_img); } convert_f32_bf16(topdata, (libxsmm_bfloat16*)bf16_img, crop_img_size); } ctest_proc_mb_++; if(ctest_proc_mb_ == test_batches_) { ctest_pf_mb_ = 0; ctest_proc_mb_ = 0; full_test_prefetch_ = true; } } } libxsmm-1.17/samples/deeplearning/gxm/src/LMDBData.cpp000066400000000000000000000370161415223013700226210ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #include "LMDBData.hpp" using namespace std; using namespace gxm; LMDBDataNode::LMDBDataNode(LMDBDataParams* p, MLEngine* e) : NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = false; bot_compute_engine_ = p->get_compute_engine(); //Create output tensor tenTop_.resize(top_.size()); tenTopData_.resize(top_.size()); vector vc = p->get_crop_sizes(); vector vo = p->get_orig_sizes(); assert(vo.size() > 0); for(int i=0; isetOwner(this); tenTopData_[i] = tenTop_[i]->getBuf(DATA); if(top_[i].compare("data") == 0) { tenTop_[i]->setType(INPUT); int dtype = p->get_data_type(); tenTopData_[i]->setDataType(dtype); tenTopData_[i]->setBufferType(DATA); Shape tts; shape_setzero(&tts); tts.ndims = 4; tts.dims[0] = p->get_batch_size(); tts.dims[1] = p->get_channels(); if(vc.size() > 0) { tts.dims[2] = vc[0]; tts.dims[3] = vc[1]; } else { tts.dims[2] = vo[0]; tts.dims[3] = vo[1]; } tenTop_[i]->setShape(&tts); long long int size = 1; for(int j=0; jsetBufferSize(size); // Register output tensor in tensorMap bool inserted = e->register_tensor(top_[i], INPUT, tenTop_[i]); if(!inserted) printf("Warning: Tensor %s already registered\n",NNNode::top_[i].c_str()); } else if(top_[i].compare("label") == 0) { tenTop_[i]->setType(LABEL); int dtype = p->get_label_data_type(); tenTopData_[i]->setDataType(dtype); tenTopData_[i]->setBufferType(DATA); Shape tts; shape_setzero(&tts); tts.ndims = 1; tts.dims[0] = p->get_batch_size(); tenTop_[i]->setShape(&tts); long long int size = 1; for(int j=0; jsetBufferSize(size); // Register output tensor in tensorMap bool inserted = e->register_tensor(top_[i], LABEL, tenTop_[i]); if(!inserted) printf("Warning: Tensor %s already registered\n",NNNode::top_[i].c_str()); } } // If training mode, setup training and validation data files, else only latter int mode = 
p->get_mode(); ap.mirror = p->get_mirror(); ap.vignette = p->get_vignette(); ap.color_bump = p->get_color_bump(); train_source_path_ = p->get_train_source_path(); test_source_path_ = p->get_test_source_path(); split_db_ = p->get_split_db_flag(); num_machines_ = e->get_num_machines(); num_epochs_ = e->get_num_epochs(); batch_size_ = p->get_batch_size(); global_batch_size_ = batch_size_ * num_machines_; e->set_batch_size(batch_size_); gparams_.channels = p->get_channels(); gparams_.orig_sizes = vo; gparams_.crop_sizes = vc.size() > 0 ? vc : vo; gparams_.batch_size = batch_size_; gparams_.threads = e->get_num_threads(); gparams_.lookahead = p->get_lookahead(); if(p->get_mean_values().size() > 0) gparams_.mean_values = p->get_mean_values(); else if(p->get_mean_file().size() > 0) gparams_.mean_file = p->get_mean_file(); gparams_.scale_values = p->get_scale_values(); gparams_.test_views = p->get_num_test_views(); jitters_ = p->get_jitters(); current_epoch_ = 0; ctrain_pf_mb_ = 0; ctest_pf_mb_ = 0; ctrain_proc_mb_ = 0; ctest_proc_mb_ = 0; curr_test_view_ = 0; full_train_prefetch_ = true; full_test_prefetch_ = true; eptr = e; global_node_id_ = e->get_global_node_id(); tempbuf_.resize(gparams_.lookahead); for(int i=0; i < gparams_.lookahead; i++) tempbuf_[i].resize(gparams_.batch_size); if(mode == TRAIN) { num_train_files_ = p->get_num_train_files(); train_batches_ = num_train_files_ % global_batch_size_ > 0 ? (((int)(num_train_files_/global_batch_size_)) + 1) : num_train_files_/global_batch_size_; e->set_num_train_batches(train_batches_); num_test_files_ = p->get_num_test_files(); test_batches_ = num_test_files_ % global_batch_size_ > 0 ? (((int)(num_test_files_/global_batch_size_)) + 1) : num_test_files_/global_batch_size_; e->set_num_test_batches(test_batches_); e->set_num_test_views(gparams_.test_views); } else if(mode == TEST) { num_test_files_ = p->get_num_test_files(); test_batches_ = num_test_files_ % global_batch_size_ > 0 ? 
(((int)(num_test_files_/global_batch_size_)) + 1) : num_test_files_/global_batch_size_; e->set_num_test_batches(test_batches_); e->set_num_test_views(gparams_.test_views); } #ifdef USE_MLSL MLSL::Session *s = e->get_session(); s->SetGlobalMinibatchSize(global_batch_size_); #endif tenSeeds_ = new unsigned int[gparams_.threads*16]; initSeeds(tenSeeds_, gparams_.threads); r_offset = new int[gparams_.batch_size](); c_offset = new int[gparams_.batch_size](); augmentation = new int[gparams_.batch_size](); configure(); #ifdef USE_MLSL node_id_ = MLSL::Environment::GetEnv().GetProcessIdx(); num_nodes_ = MLSL::Environment::GetEnv().GetProcessCount(); #else node_id_ = 0; num_nodes_ = 1; #endif } void LMDBDataNode::configure() { srand48(global_node_id_); train_lmdb_ = new LMDB(); train_lmdb_->Open(train_source_path_); train_cursor_ = train_lmdb_->NewCursor(); #ifdef USE_MLSL if(node_id_ > 0 && !split_db_) train_cursor_->Next(global_node_id_ - 1); // Each node computes num images to skip. #endif test_lmdb_ = new LMDB(); test_lmdb_->Open(test_source_path_); test_cursor_ = test_lmdb_->NewCursor(); #ifdef USE_MLSL if(node_id_ > 0 && !split_db_) test_cursor_->Next(global_node_id_ - 1); // Each node computes num images to skip. 
#endif } void LMDBDataNode::trainImageTransform(vector& v, float* outp) { int nImg = gparams_.batch_size; int nOfm = gparams_.channels; int ofh = gparams_.crop_sizes[0]; int ofw = gparams_.crop_sizes[1]; // vector& mean = gparams_.mean_values; // vector& scale = gparams_.scale_values; float (* __restrict output)[nOfm][ofh][ofw] = (float (*)[*][*][*])outp; #ifdef _OPENMP #pragma omp parallel for #endif for(int img = 0; img < nImg; img++) { for(int ofm = 0; ofm < nOfm; ofm++) { for(int h = 0; h < ofh; h++) { for(int w = 0; w < ofw; w++) { int ifh = v[img].height(); int ifw = v[img].width(); assert(v[img].channels() == nOfm); const unsigned char (* __restrict input)[ifh][ifw] = (unsigned char (*)[*][*])v[img].data().c_str(); int r_off = r_offset[img]; int c_off = c_offset[img]; float inp = (float)input[ofm][h+r_off][w+c_off]; int fm = (gparams_.scale_values.size() == 1) ? 0 : ofm; if((augmentation[img] < 6) && (ap.mirror == true)) output[img][ofm][h][ofw-w-1] = (inp - gparams_.mean_values[ofm]) * gparams_.scale_values[fm]; else output[img][ofm][h][w] = (inp - gparams_.mean_values[ofm]) * gparams_.scale_values[fm]; } } } } } void LMDBDataNode::testImageTransform(vector& v, int tv, float* outp) { int nImg = gparams_.batch_size; int nOfm = gparams_.channels; int ofh = gparams_.crop_sizes[0]; int ofw = gparams_.crop_sizes[1]; // vector& mean = gparams_.mean_values; // vector& scale = gparams_.scale_values; float (* __restrict output)[nOfm][ofh][ofw] = (float (*)[*][*][*])outp; int tv2 = tv/2; #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; igetBuffer()); int* toplabel = (int*)(tenTopData_[1]->getBuffer()); #if 0 //def DEBUG printf("Executing FP %s: Data %p, Label %p\n", NNNode::nname_.c_str(),topdata, toplabel); #endif int em = eptr->get_execution_mode(); gparams_.exec_mode = em; current_epoch_ = eptr->get_current_epoch(); if(em == TRAIN) { if(full_train_prefetch_) { for(int i=0; ivalue()); if(tempbuf_[i][img].channels() == 0) 
DecodeDatumNative(&(tempbuf_[i][img])); #if 0 //def DEBUG printf("filename: %s label: %d\n",train_cursor_->key().c_str(), tempbuf_[i][img].label()); #endif #ifdef USE_MLSL if(!split_db_) train_cursor_->Next(num_nodes_-1); else train_cursor_->Next(); #else train_cursor_->Next(); #endif } } ctrain_pf_mb_ += gparams_.lookahead; full_train_prefetch_ = false; } else { if(ctrain_pf_mb_ < train_batches_) { for(int img=0; imgvalue()); if(tempbuf_[i][img].channels() == 0) DecodeDatumNative(&(tempbuf_[i][img])); #if 0 //def DEBUG printf("filename: %s label: %d\n",train_cursor_->key().c_str(), tempbuf_[i][img].label()); #endif #ifdef USE_MLSL if(!split_db_) train_cursor_->Next(num_nodes_-1); else train_cursor_->Next(); #else train_cursor_->Next(); #endif } ctrain_pf_mb_++; } } #ifdef RETURNALL return; #endif int mbslot = ctrain_proc_mb_ % gparams_.lookahead; #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; ivalue()); if(tempbuf_[i][img].channels() == 0) DecodeDatumNative(&(tempbuf_[i][img])); #if 0 //def DEBUG printf("filename: %s label: %d\n",test_cursor_->key().c_str(), tempbuf_[i][img].label()); #endif #ifdef USE_MLSL if(!split_db_) test_cursor_->Next(num_nodes_-1); else test_cursor_->Next(); #else test_cursor_->Next(); #endif } } ctest_pf_mb_ += gparams_.lookahead; full_test_prefetch_ = false; } else { { if(ctest_pf_mb_ < test_batches_) { for(int img=0; imgvalue()); if(tempbuf_[i][img].channels() == 0) DecodeDatumNative(&(tempbuf_[i][img])); #if 0 //def DEBUG printf("filename: %s label: %d\n",test_cursor_->key().c_str(), tempbuf_[i][img].label()); #endif #ifdef USE_MLSL if(!split_db_) test_cursor_->Next(num_nodes_-1); else test_cursor_->Next(); #else test_cursor_->Next(); #endif } ctest_pf_mb_++; } } } int mbslot = ctest_proc_mb_ % gparams_.lookahead; #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; i #include #include "Node.hpp" using namespace std; using namespace gxm; bool addedFD, addedBD; void NNNode::createNNGraph(int mode) { if(mode == TRAIN) 
{ Task *fTask = this->getBasicTask(BASIC_TASK_FORW); Task *bTask = this->getBasicTask(BASIC_TASK_BACK); Task *wTask = this->getBasicTask(BASIC_TASK_WGRAD); #if 0 Task *sTask = this->getBasicTask(BASIC_TASK_SOLVE); #endif string s = dynamic_cast(fTask->getNode())->nname_; #if 0 if(wTask != NULL) { addedFD = wTask->addForwDep(sTask); // addedBD = fTask->addBackDep(sTask); #ifdef DEBUG if(addedFD) printf("solver task (node %s) %p depends on weight task (node %s) %p\n", s.c_str(), sTask, s.c_str(), wTask); if(addedBD) printf("forward task (node %s) %p depends on solver task (node %s) %p\n",s.c_str(), fTask, s.c_str(), sTask); #endif } #endif for(auto it=nextNodes_.begin(); it != nextNodes_.end(); it++) { NNNode *nNode = *it; Task *fnTask = nNode->getBasicTask(BASIC_TASK_FORW); Task *bnTask = nNode->getBasicTask(BASIC_TASK_BACK); if(fnTask != NULL) addedFD = fTask->addForwDep(fnTask); if(bTask != NULL && bnTask != NULL) addedBD = bTask->addBackDep(bnTask); if(wTask != NULL && bnTask != NULL) addedBD = wTask->addBackDep(bnTask); #ifdef DEBUG if(addedFD) printf("forward task (node %s) %p depends on forward task (node %s) %p\n",nNode->nname_.c_str(), fnTask, s.c_str(), fTask); if(bTask != NULL && bnTask != NULL) if(addedBD) printf("backward task (node %s) %p depends on backward task (node %s) %p\n",s.c_str(), bTask, nNode->nname_.c_str(), bnTask); if(wTask != NULL && bnTask != NULL) if(addedBD) printf("weight task (node %s) %p depends on backward task (node %s) %p\n", s.c_str(), wTask, nNode->nname_.c_str(), bnTask); #endif nNode->createNNGraph(mode); } // Handle last node if(nextNodes_.size() == 0) { if(bTask != NULL) { addedFD = fTask->addForwDep(bTask); #ifdef DEBUG if(addedFD) printf("backward task (node %s) %p depends on forward task (node %s) %p\n",s.c_str(), bTask, s.c_str(), fTask); #endif } } } else if(mode == TEST) { Task *fTask = this->getBasicTask(BASIC_TASK_FORW); for(auto it=nextNodes_.begin(); it != nextNodes_.end(); it++) { NNNode *nNode = *it; Task *fnTask 
= nNode->getBasicTask(BASIC_TASK_FORW); fTask->addForwDep(fnTask); #ifdef DEBUG printf("forward task %p depends on forward task %p\n",fnTask,fTask); #endif nNode->createNNGraph(mode); } } } libxsmm-1.17/samples/deeplearning/gxm/src/Pooling.cpp000066400000000000000000000261441415223013700227200ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #include #include "Pooling.hpp" PoolingNode::PoolingNode(PoolingParams* p, MLEngine* e): NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); bottom_ = p->get_bottom_names(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = false; bot_compute_engine_ = p->get_compute_engine(); tenTop_ = new Tensor(top_[0]); assert(tenTop_ != NULL); tenTop_->setOwner(this); tenTop_->setType(ACT); tenTopData_ = tenTop_->getBuf(DATA); tenTopData_->setBufferType(DATA); #ifdef DEBUG printf("bottom name %s\n",bottom_[0].c_str()); #endif tenBot_ = e->get_tensor(bottom_[0], ACT); assert(tenBot_ != NULL); NNNode *pnn = (NNNode*)tenBot_->getOwner(); setPrevNode(pnn); NNNode::mode_ = pnn->getMode(); pnn->set_top_compute_engine(p->get_compute_engine()); bot_cengine_ = pnn->get_bot_compute_engine(); tenBotData_ = tenBot_->getBuf(DATA); out_dtype = p->get_data_type(); tenTopData_->setDataType(out_dtype); in_dtype = tenBotData_->getDataType(); // Get input tensor shape (bottom) Shape* bs = tenBot_->getShape(); assert(bs->ndims <= MAX_DIMS); // Create shape of output 
tensor (top) vector vd = p->get_kernel_dims(); vector vp = p->get_pads(); vector vs = p->get_strides(); assert((vd.size() == vp.size()) && (vd.size() == vs.size())); shape_setzero(&ts_); ts_.ndims = bs->ndims; // Number of dimensions ts_.dims[0] = bs->dims[0]; // Minibatch size ts_.dims[1] = bs->dims[1]; // Num output feature maps ts_.dims[2] = (bs->dims[2] - vd[0] + 2*vp[0])/vs[0] + 1; // Height if(ts_.ndims == 4) ts_.dims[3] = (bs->dims[3] - vd[1] + 2*vp[1])/vs[1] + 1; // Width else if(ts_.ndims == 5) { ts_.dims[3] = (bs->dims[3] - vd[1] + 2*vp[1])/vs[1] + 1; // Width ts_.dims[4] = (bs->dims[4] - vd[2] + 2*vp[2])/vs[2] + 1; // Depth (for 3D) } if(vp[0]) if((ts_.dims[2] - 1) * vs[0] >= bs->dims[2] + vp[0]) ts_.dims[2]--; if(vp[1]) if((ts_.dims[3] - 1) * vs[1] >= bs->dims[3] + vp[1]) ts_.dims[3]--; if(ts_.ndims == 5) { if(vp[2]) if((ts_.dims[4] - 1) * vs[2] >= bs->dims[4] + vp[2]) ts_.dims[4]--; } // Set output tensor shape tenTop_->setShape(&ts_); long long int tsize = 1; for(int i=0; isetBufferSize(tsize); // Tensor representing mask of selected neurons. 
long long int size = 1; for(int i=0; iis_inference_only()) { if(NNNode::bp_flag_) { tenBotDiff_ = tenBot_->addBuf(); // DIFF type and index tenBotDiff_->setDataType(in_dtype); tenBotDiff_->setBufferType(DIFF); long long int bsize = 1; for(int i=0; indims; i++) bsize = bsize*bs->dims[i]; if(in_dtype == DT_FLOAT) bsize = bsize*sizeof(float); else if(in_dtype == DT_BF16) bsize = bsize*sizeof(libxsmm_bfloat16); // Set the size of the input-gradient buffer tenBotDiff_->setBufferSize(bsize); } } else tenBotDiff_ = NULL; // Register output tensor in tensorMap bool inserted = e->register_tensor(top_[0], ACT, this->tenTop_); if(!inserted) printf("Warning: Tensor %s already registered\n",NNNode::top_[0].c_str()); // Setup parameter structure for convolution computation in library gparams_.bdims = bs->ndims; gparams_.tdims = ts_.ndims; gparams_.node_name = nname_; gparams_.nInput = bs->dims[1]; gparams_.nOutput = ts_.dims[1]; gparams_.batch_size = bs->dims[0]; gparams_.iHeight = bs->dims[2]; gparams_.iWidth = bs->dims[3]; gparams_.iDepth = bs->dims[4]; gparams_.oHeight = ts_.dims[2]; gparams_.oWidth = ts_.dims[3]; gparams_.oDepth = ts_.dims[4]; gparams_.pad_h = vp[0]; gparams_.pad_w = vp[1]; gparams_.pad_d = vp[2]; gparams_.stride_h = vs[0]; gparams_.stride_w = vs[1]; gparams_.stride_d = vs[2]; gparams_.kh = vd[0]; gparams_.kw = vd[1]; gparams_.kd = vd[2]; gparams_.ipad_w = 0; gparams_.ipad_h = 0; gparams_.opad_w = 0; gparams_.opad_h = 0; gparams_.pool_mode = p->get_pool_mode(); gparams_.in_data_type = in_dtype; gparams_.out_data_type = out_dtype; gparams_.algType = p->get_algo_type(); gparams_.num_threads = e->get_num_threads(); //get global scratch tensor buffer tenScratchData_ = e->getScratchBuffer(); eptr_ = e; configure(p->get_compute_engine()); } void PoolingNode::configure(int engine) { switch(engine) { case XSMM: impl = new PoolXSMM(&gparams_, engine); break; } } void PoolingNode::convert_bf16_f32(libxsmm_bfloat16* in, float* out, int len) { int i; #ifdef _OPENMP 
#pragma omp parallel for private(i) #endif for ( i = 0; i < len; i+=16 ) { __m256i vbfp16 = _mm256_loadu_si256( (const __m256i*)(in+i) ); __m512 vfp32 = gxm_bfp16_to_fp32_avx512f( vbfp16 ); _mm512_storeu_ps( out+i, vfp32 ); } } void PoolingNode::forwardPropagate() { #ifdef DEBUG void *bot = tenBotData_->getBuffer(); void *top = tenTopData_->getBuffer(); printf("Executing FP %s: input %p, output %p mask %p\n",NNNode::nname_.c_str(), bot, top, tenMask_); printf("Inputs: %d x %d x %d\n",gparams_.nInput, gparams_.iHeight, gparams_.iWidth); printf("Outputs: %d x %d x %d\n",gparams_.nOutput, gparams_.oHeight, gparams_.oWidth); #endif int nImg = gparams_.batch_size; int ofm = gparams_.nOutput; int ifh = gparams_.iHeight; int ifw = gparams_.iWidth; int ofh = gparams_.oHeight; int ofw = gparams_.oWidth; impl->set_bot_compute_engine(bot_cengine_); impl->set_top_compute_engine(top_compute_engine_); impl->set_next_node_type(next_ntype_); impl->set_node_name(nname_); impl->set_scratch_buffer(tenScratchData_); impl->forwardPropagate(tenBotData_, tenTopData_, tenMask_); #ifdef CHECK_BLOWUP_FP32 if(out_dtype == DT_FLOAT) { for(int i=0; i<16; i++) { float v = ((float*)tenTopData_->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(out_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)tenTopData_->getBuffer(), cbptr, 16); for(int i=0; i<16; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! 
%s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } #endif #ifdef GETSTATS #ifdef USE_MLSL size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else size_t node_id = 0; #endif if(node_id==0 && eptr_->get_current_batch() % STATFREQ == 0) { if(gparams_.in_data_type == DT_FLOAT && gparams_.out_data_type == DT_FLOAT) { float *ptr = (float*)tenBotData_->getBuffer(); string s = nname_ + "_Inp"; MeanOfLayer((char*)s.c_str(), ptr, nImg*ofm*ifh*ifw); ptr = (float*)tenTopData_->getBuffer(); s = nname_ + "_Outp"; MeanOfLayer((char*)s.c_str(), ptr, nImg*ofm*ofh*ofw); } else if(gparams_.in_data_type == DT_BF16 && gparams_.out_data_type == DT_BF16) { if(stptr == NULL) { int s = nImg*ofm*ofh*ofw; int is = nImg*ofm*ifh*ifw; if(s > is) stptr = (float*)libxsmm_aligned_malloc(s*sizeof(float), 2097152); else stptr = (float*)libxsmm_aligned_malloc(is*sizeof(float), 2097152); } libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenBotData_->getBuffer(); convert_bf16_f32(ptr, stptr, nImg*ofm*ifh*ifw); string s = nname_ + "_Inp"; MeanOfLayer((char*)s.c_str(), stptr, nImg*ofm*ifh*ifw); ptr = (libxsmm_bfloat16*)tenTopData_->getBuffer(); convert_bf16_f32(ptr, stptr, nImg*ofm*ofh*ofw); s = nname_ + "_Outp"; MeanOfLayer((char*)s.c_str(), stptr, nImg*ofm*ofh*ofw); } } #endif } void PoolingNode::backPropagate() { int nImg = gparams_.batch_size; int ofm = gparams_.nOutput; int ifh = gparams_.iHeight; int ifw = gparams_.iWidth; int ofh = gparams_.oHeight; int ofw = gparams_.oWidth; tenTopDiff_ = tenTop_->getBuf(DIFF); #ifdef DEBUG void *gtop = tenTopDiff_->getBuffer(); assert(gtop != NULL); void* gbot = tenBotDiff_->getBuffer(); printf("Executing BP %s: grad_output %p, grad_input %p\n",NNNode::nname_.c_str(), gtop, gbot); printf("Grad Outputs: %d x %d x %d\n", gparams_.nOutput, gparams_.oHeight, gparams_.oWidth); printf("Grad Inputs: %d x %d x %d\n", gparams_.nInput, gparams_.iHeight, gparams_.iWidth); #endif impl->backPropagate(tenTopDiff_, tenMask_, tenBotDiff_); #ifdef 
CHECK_BLOWUP_FP32 if(out_dtype == DT_FLOAT) { for(int i=0; i<16; i++) { float v = ((float*)tenBotDiff_->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(out_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)tenBotDiff_->getBuffer(), cbptr, 16); for(int i=0; i<16; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! %s layer FP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } #endif #ifdef GETSTATS #ifdef USE_MLSL size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else size_t node_id = 0; #endif if(node_id==0 && eptr_->get_current_batch() % STATFREQ == 0) { if(gparams_.in_data_type == DT_FLOAT && gparams_.out_data_type == DT_FLOAT) { float *ptr = (float*)tenTopDiff_->getBuffer(); string s = nname_ + "_delOutp"; MeanOfLayer((char*)s.c_str(), ptr, nImg*ofm*ofh*ofw); ptr = (float*)tenBotDiff_->getBuffer(); s = nname_ + "_delInp"; MeanOfLayer((char*)s.c_str(), ptr, nImg*ofm*ifh*ifw); } else if(gparams_.in_data_type == DT_BF16 && gparams_.out_data_type == DT_BF16) { libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenTopDiff_->getBuffer(); convert_bf16_f32(ptr, stptr, nImg*ofm*ofh*ofw); string s = nname_ + "_delOutp"; MeanOfLayer((char*)s.c_str(), stptr, nImg*ofm*ofh*ofw); ptr = (libxsmm_bfloat16*)tenBotDiff_->getBuffer(); convert_bf16_f32(ptr, stptr, nImg*ofm*ifh*ifw); s = nname_ + "_delInp"; MeanOfLayer((char*)s.c_str(), stptr, nImg*ofm*ifh*ifw); } } #endif } libxsmm-1.17/samples/deeplearning/gxm/src/PoolingXSMM.cpp000066400000000000000000000242071415223013700234230ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #include #include #include #include "PoolingXSMM.hpp" #define VLEN 16 PoolXSMM::PoolXSMM(PoolImplParams *gp, int engine) : PoolImpl(gp, engine) { pooling_desc.N = gp->batch_size/NUM_NUMA_NODES; pooling_desc.C = gp->nInput; pooling_desc.H = gp->iHeight; pooling_desc.W = gp->iWidth; pooling_desc.u = gp->stride_h; pooling_desc.v = gp->stride_w; pooling_desc.R = gp->kh; pooling_desc.S = gp->kw; pooling_desc.pad_h = gp->pad_h; pooling_desc.pad_w = gp->pad_w; pooling_desc.pad_h_in = gp->ipad_h; pooling_desc.pad_w_in = gp->ipad_w; pooling_desc.pad_h_out = gp->opad_h; pooling_desc.pad_w_out = gp->opad_w; pooling_desc.threads = gp->num_threads/NUM_NUMA_NODES; if(gp->in_data_type == DT_FLOAT && gp->out_data_type == DT_FLOAT) { pooling_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; pooling_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; } else if(gp->in_data_type == DT_BF16 && gp->out_data_type == DT_BF16) { pooling_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; pooling_desc.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; } pooling_desc.datatype_mask = LIBXSMM_DNN_DATATYPE_I32; pooling_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; if(gp->pool_mode == MAX) pooling_desc.pooling_type = LIBXSMM_DNN_POOLING_MAX; else if(gp->pool_mode == AVE) pooling_desc.pooling_type = LIBXSMM_DNN_POOLING_AVG; for(int n=0; niHeight; int ifw = gp->iWidth; int iph = gp->ipad_h; int ipw = gp->ipad_w; int ifhp = ifh + 2*iph; int ifwp = ifw + 2*ipw; int ofh = gp->oHeight; int ofw = gp->oWidth; int oph = gp->opad_h; int opw = gp->opad_w; int ofhp = ofh + 2*oph; int ofwp = ofw + 2*opw; void *input[NUM_NUMA_NODES]; void *output[NUM_NUMA_NODES]; int *pool_mask[NUM_NUMA_NODES]; int imoff = 
pooling_desc.N * pooling_desc.C * ifhp * ifwp; if(gp->in_data_type == DT_FLOAT) imoff *= sizeof(float); else if(gp->in_data_type == DT_BF16) imoff *= sizeof(libxsmm_bfloat16); input[0] = inpb->getBuffer(); for(int n=1; nin_data_type == DT_FLOAT) imoff *= sizeof(float); else if(gp->in_data_type == DT_BF16) imoff *= sizeof(libxsmm_bfloat16); output[0] = outpb->getBuffer(); for(int n=1; ngetBufferPtr(); for(int n=0; nsetBufferPtr(sptrptr); } if(prev_scratch_size == 0) prev_scratch_size = scratchp->getBufferSize(); if(!updated_scratch_fwd || prev_scratch_size != scratchp->getBufferSize()) { int max_size=0; for(int n=0; ngetBufferSize(); long long int mysize = libxsmm_dnn_pooling_get_scratch_size( libxsmm_handle[n], &status ); CHKERR_LIBXSMM_DNN( status ); if(ssize < mysize) { libxsmm_free(sptrptr[n]); sptrptr[n] = (void*)libxsmm_aligned_malloc(mysize, 2097152); max_size = mysize; #ifdef USE_MLSL if(MLSL::Environment::GetEnv().GetProcessIdx() == 0) #endif printf("%s allocated %lld bytes for scratch @ %p, prev size was %lld bytes\n",nname.c_str(), mysize, sptrptr[n], ssize); } else max_size = ssize; } } scratchp->setBufferSize(max_size); for(int n=0; ngetBufferSize(); } #if defined(_OPENMP) #pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int ntps = gp->num_threads/NUM_NUMA_NODES; int n = tid/ntps; CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_execute_st( libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_FWD, n*ntps, tid ) ); } } void PoolXSMM::backPropagate(TensorBuf *deloutpb, int *mask, TensorBuf *delinpb, int tid) { int ifh = gp->iHeight; int ifw = gp->iWidth; int iph = gp->ipad_h; int ipw = gp->ipad_w; int ifhp = ifh + 2*iph; int ifwp = ifw + 2*ipw; int ofh = gp->oHeight; int ofw = gp->oWidth; int oph = gp->opad_h; int opw = gp->opad_w; int ofhp = ofh + 2*oph; int ofwp = ofw + 2*opw; void *deloutput[NUM_NUMA_NODES]; void *delinput[NUM_NUMA_NODES]; int* pool_mask[NUM_NUMA_NODES]; int imoff = pooling_desc.N * 
pooling_desc.C * ifhp * ifwp; if(gp->in_data_type == DT_FLOAT) imoff *= sizeof(float); else if(gp->in_data_type == DT_BF16) imoff *= sizeof(libxsmm_bfloat16); delinput[0] = delinpb->getBuffer(); for(int n=1; nin_data_type == DT_FLOAT) imoff *= sizeof(float); else if(gp->in_data_type == DT_BF16) imoff *= sizeof(libxsmm_bfloat16); deloutput[0] = deloutpb->getBuffer(); for(int n=1; ngetBufferPtr(); if(!updated_scratch_bwd) { for(int n=0; nnum_threads/NUM_NUMA_NODES; int n = tid/ntps; CHKERR_LIBXSMM_DNN(libxsmm_dnn_pooling_execute_st(libxsmm_handle[n], LIBXSMM_DNN_COMPUTE_KIND_BWD, n*ntps, tid ) ); } delinpb->setLayoutType(LIBXSMM_CUSTOM_LAYOUT); } libxsmm-1.17/samples/deeplearning/gxm/src/ReLU.cpp000066400000000000000000000171601415223013700221160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #include #include "ReLU.hpp" using namespace std; using namespace gxm; ReLUNode::ReLUNode(ReLUParams* p, MLEngine* e): NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); bottom_ = p->get_bottom_names(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = false; bot_compute_engine_ = p->get_compute_engine(); assert((bottom_.size() == 1) && (top_.size() == 1)); tenTop_ = new Tensor(top_[0]); assert(tenTop_ != NULL); tenTop_->setOwner(this); tenTop_->setType(ACT); tenTopData_ = tenTop_->getBuf(DATA); tenTopData_->setBufferType(DATA); #ifdef DEBUG printf("bottom name %s\n",bottom_[0].c_str()); #endif tenBot_ = e->get_tensor(bottom_[0], ACT); assert(tenBot_ != NULL); NNNode *pnn = (NNNode*)tenBot_->getOwner(); setPrevNode(pnn); pnn->set_top_compute_engine(p->get_compute_engine()); bot_cengine_ = pnn->get_bot_compute_engine(); tenBotData_ = tenBot_->getBuf(DATA); //Output tensor data type = input tensor data type int dtype = p->get_data_type(); tenTopData_->setDataType(dtype); // Get input tensor shape (bottom) Shape* bs = tenBot_->getShape(); assert(bs->ndims <= MAX_DIMS); tenTop_->setShape(bs); long long int tsize = 1; for(int i=0; indims; i++) tsize = tsize*bs->dims[i]; if(dtype == DT_FLOAT) tsize = tsize*sizeof(float); else if(dtype == DT_INT) tsize = tsize*sizeof(int); // Set the logical size of the tensor buffer for bufId=0 (forward data buffer). 
// Note: we have no knowledge of the machine parameters here, so effectively this is single-machine config tenTopData_->setBufferSize(tsize); if(!e->is_inference_only()) { if(bp_flag_) { tenBotDiff_ = tenBot_->addBuf(); // DIFF type and index tenBotDiff_->setDataType(dtype); tenBotDiff_->setBufferType(DIFF); long long int bsize = 1; for(int i=0; indims; i++) bsize = bsize*bs->dims[i]; if(dtype == DT_FLOAT) bsize = bsize*sizeof(float); else if(dtype == DT_INT) bsize = bsize*sizeof(int); // Set the size of the input-gradient buffer tenBotDiff_->setBufferSize(bsize); } } else tenBotDiff_ = NULL; // Register output tensor in tensor map bool inserted = e->register_tensor(top_[0], ACT, tenTop_); if(!inserted) printf("Warning: Tensor %s already registered\n",top_[0].c_str()); gparams_.bdims = gparams_.tdims = bs->ndims; gparams_.batch_size = bs->dims[0]; gparams_.node_name = nname_; gparams_.nInput = bs->dims[1]; gparams_.nOutput = gparams_.nInput; if(bs->ndims == 5) { gparams_.iDepth = gparams_.iHeight = gparams_.iWidth = bs->dims[2]; gparams_.oDepth = gparams_.oHeight = gparams_.oWidth = bs->dims[3]; } else if(bs->ndims == 4) { gparams_.iDepth = gparams_.oDepth = 0; gparams_.iHeight = gparams_.oHeight = bs->dims[2]; gparams_.iWidth = gparams_.oWidth = bs->dims[3]; } gparams_.negative_slope = p->get_negative_slope(); gparams_.data_type = dtype; gparams_.algType = p->get_algo_type(); gparams_.num_threads = e->get_num_threads(); configure(p->get_compute_engine()); eptr_ = e; }; void ReLUNode::configure(int engine) { switch(engine) { case XSMM: impl = new ReLUXSMM(&gparams_, engine); break; } } void ReLUNode::forwardPropagate() { #ifdef DEBUG float* bot = (float*)(tenBotData_->getBuffer()); float* top = (float*)(tenTopData_->getBuffer()); printf("Executing FP %s: input %p, output %p\n",NNNode::nname_.c_str(), bot, top); if(gparams_.bdims > 4) printf("Inputs: %d x %d x %d x %d\n",gparams_.nInput, gparams_.iDepth, gparams_.iHeight, gparams_.iWidth); else if(gparams_.bdims > 
3) printf("Inputs: %d x %d x %d\n",gparams_.nInput, gparams_.iHeight, gparams_.iWidth); if(gparams_.tdims > 4) printf("Outputs: %d x %d x %d x %d\n",gparams_.nOutput, gparams_.oDepth, gparams_.oHeight, gparams_.oWidth); else if(gparams_.tdims > 3) printf("Outputs: %d x %d x %d\n",gparams_.nOutput, gparams_.oHeight, gparams_.oWidth); #endif impl->set_bot_compute_engine(bot_cengine_); impl->set_top_compute_engine(top_compute_engine_); impl->forwardPropagate(tenBotData_, tenTopData_); #ifdef GETSTATS #ifdef USE_MLSL size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else size_t node_id = 0; #endif if(node_id == 0 && eptr_->get_current_batch() % STATFREQ == 0) { float *ptr = (float*)tenBotData_->getBuffer(); float *pptr = (float*)tenBotData_->getPrivBuffer(); float *p = (pptr == NULL) ? ptr : pptr; string s = nname_ + "_Inp"; MeanOfLayer((char*)s.c_str(), p, gparams_.batch_size*gparams_.nInput* gparams_.iHeight*gparams_.iWidth); ptr = (float*)tenTopData_->getBuffer(); pptr = (float*)tenTopData_->getPrivBuffer(); p = (pptr == NULL) ? 
ptr : pptr; s = nname_ + "_Outp"; MeanOfLayer((char*)s.c_str(), p, gparams_.batch_size*gparams_.nOutput* gparams_.oHeight*gparams_.oWidth); } #endif } void ReLUNode::backPropagate() { tenTopDiff_ = tenTop_->getBuf(DIFF); #ifdef DEBUG float *gtop = (float*)(tenTopDiff_->getBuffer()); assert(gtop != NULL); float* gbot = (float*)(tenBotDiff_->getBuffer()); float* bot = (float*)(tenBotData_->getBuffer()); printf("Executing BP %s: grad_output %p, grad_input %p\n",NNNode::nname_.c_str(), gtop, gbot); if(gparams_.bdims > 4) { printf("Inputs: %d x %d x %d x %d\n",gparams_.nInput, gparams_.iDepth, gparams_.iHeight, gparams_.iWidth); printf("Grad Inputs: %d x %d x %d x %d\n",gparams_.nInput, gparams_.iDepth, gparams_.iHeight, gparams_.iWidth); } else if(gparams_.bdims > 3) { printf("Inputs: %d x %d x %d\n",gparams_.nInput, gparams_.iHeight, gparams_.iWidth); printf("Grad Inputs: %d x %d x %d\n",gparams_.nInput, gparams_.iHeight, gparams_.iWidth); } if(gparams_.tdims > 4) printf("Grad Outputs: %d x %d x %d x %d\n",gparams_.nOutput, gparams_.oDepth, gparams_.oHeight, gparams_.oWidth); else if(gparams_.tdims > 3) printf("Grad Outputs: %d x %d x %d\n",gparams_.nOutput, gparams_.oHeight, gparams_.oWidth); #endif impl->backPropagate(tenBotData_, tenTopDiff_, tenBotDiff_); #ifdef GETSTATS #ifdef USE_MLSL size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else size_t node_id = 0; #endif if(node_id == 0 && eptr_->get_current_batch() % STATFREQ == 0) { float *ptr = (float*)tenTopDiff_->getBuffer(); float *pptr = (float*)tenTopDiff_->getPrivBuffer(); float *p = (pptr == NULL) ? ptr : pptr; string s = nname_ + "_delOutp"; MeanOfLayer((char*)s.c_str(), p, gparams_.batch_size*gparams_.nOutput* gparams_.oHeight*gparams_.oWidth); ptr = (float*)tenBotDiff_->getBuffer(); pptr = (float*)tenBotDiff_->getPrivBuffer(); p = (pptr == NULL) ? 
ptr : pptr; s = nname_ + "_delInp"; MeanOfLayer((char*)s.c_str(), p, gparams_.batch_size*gparams_.nInput* gparams_.iHeight*gparams_.iWidth); } #endif } libxsmm-1.17/samples/deeplearning/gxm/src/ReLUXSMM.cpp000066400000000000000000000044421415223013700226220ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #include #include #include #include #include "ReLUXSMM.hpp" void ReLUXSMM::forwardPropagate(TensorBuf *inpb, TensorBuf *outpb, int tid) { float *inp = (float*)inpb->getBuffer(); float *outp = (float*)outpb->getBuffer(); int nImg = gp->batch_size; int nOfm = gp->nOutput; int ofh = gp->oHeight; int ofw = gp->oWidth; __assume_aligned(inp,64); __assume_aligned(outp,64); int size = nImg * nOfm * ofh * ofw; #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; isetLayoutType(inpb->getLayoutType()); outpb->setLayout(inpb->getLayout()); } void ReLUXSMM::backPropagate(TensorBuf *inpb, TensorBuf *deloutpb, TensorBuf *delinpb, int tid) { float *inp = (float*)inpb->getBuffer(); float *deloutp = (float*)deloutpb->getBuffer(); float *delinp = (float*)delinpb->getBuffer(); int nImg = gp->batch_size; int nOfm = gp->nOutput; int ofh = gp->oHeight; int ofw = gp->oWidth; int nIfm = gp->nInput; int ifh = gp->iHeight; int ifw = gp->iWidth; int threads = gp->num_threads; __assume_aligned(inp,64); __assume_aligned(delinp,64); __assume_aligned(deloutp,64); int size = nImg * nOfm * ofh * ofw; #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; i 
0.0) delinp[i] = deloutp[i]; else delinp[i] = 0.0; } delinpb->setLayoutType(deloutpb->getLayoutType()); delinpb->setLayout(deloutpb->getLayout()); } libxsmm-1.17/samples/deeplearning/gxm/src/SoftmaxLoss.cpp000066400000000000000000000137341415223013700235740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #include #include "SoftmaxLoss.hpp" #ifdef USE_MLSL #include "mpi.h" #endif #define SMAXLOSS_TYPE_DIRECT 0 #define LOSSFREQ 100 SoftmaxLossNode::SoftmaxLossNode(SoftmaxLossParams* p, MLEngine* e) : NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); top_ = p->get_top_names(); bottom_ = p->get_bottom_names(); has_weights_ = false; bp_flag_ = true; //Create output tensor tenTop_ = new Tensor(top_[0]); assert(tenTop_ != NULL); tenTop_->setOwner(this); tenTop_->setType(ACT); tenTopData_ = tenTop_->getBuf(DATA); tenTopData_->setBufferType(DATA); int dtype = p->get_data_type(); Shape *bs; tenBot_.resize(bottom_.size()); tenBotData_.resize(bottom_.size()); for(int i=0; iget_tensor(bottom_[i], LABEL); else tenBot_[i] = e->get_tensor(bottom_[i], ACT); assert(this->tenBot_[i] != NULL); tenBotData_[i] = tenBot_[i]->getBuf(DATA); if((bottom_[i]).find("label") == bottom_[i].npos) { setPrevNode((NNNode*)tenBot_[i]->getOwner()); // Get input tensor shape (bottom) bs = tenBot_[i]->getShape(); } } //Output tensor data type = input tensor data type tenTopData_->setDataType(dtype); assert(bs->ndims <= MAX_DIMS); 
shape_setzero(&ts_); ts_.ndims = 2; ts_.dims[0] = bs->dims[0]; // minibatch ts_.dims[1] = bs->dims[1]; // num output = num_input tenTop_->setShape(&ts_); long long int size = 1; for(int i=0; isetBufferSize(size); loss_weight_ = p->get_loss_weight(); if(!e->is_inference_only()) { if(bp_flag_) { for(int i=0; iaddBuf(); tenBotDiff_->setDataType(dtype); tenBotDiff_->setBufferType(DIFF); size = 1; for(int i=0; indims; i++) size = size*bs->dims[i]; if(dtype == DT_FLOAT) size = size*sizeof(float); else if(dtype == DT_INT) size = size*sizeof(int); // Set the size of the input-gradient buffer tenBotDiff_->setBufferSize(size); break; } } } } // Register output tensor in tensorMap bool inserted = e->register_tensor(top_[0], ACT, this->tenTop_); if(!inserted) printf("Warning: Tensor %s already registered\n",top_[0].c_str()); gparams_.node_name = nname_; gparams_.batch_size = bs->dims[0]; gparams_.nInput = bs->dims[1]; gparams_.nOutput = ts_.dims[1]; gparams_.loss_weight = loss_weight_[0]; gparams_.num_threads = e->get_num_threads(); eptr_ = e; #ifdef USE_MLSL node_id_ = MLSL::Environment::GetEnv().GetProcessIdx(); num_nodes_ = MLSL::Environment::GetEnv().GetProcessCount(); #else node_id_ = 0; num_nodes_ = 1; #endif test_loss_ = 0; impl = new SMaxLossLoop(&gparams_); } void SoftmaxLossNode::forwardPropagate() { #ifdef RETURNALL return; #endif struct timeval tvss, tvse, tvcs, tvce; float* bot = (float*)(tenBotData_[0]->getBuffer()); int* label = (int*)(tenBotData_[1]->getBuffer()); float* top = (float*)(tenTopData_->getBuffer()); #ifdef TIMING gettimeofday(&tvss, NULL); #endif impl->forwardPropagate(tenBotData_[0], tenBotData_[1], tenTopData_); #ifdef TIMING gettimeofday(&tvse, NULL); double smaxtime = (tvse.tv_sec + tvse.tv_usec*1e-6) - (tvss.tv_sec + tvss.tv_usec*1e-6); if(node_id_ == 0) printf("Softmax FP time: %f ms\n",smaxtime*1000); #endif #ifdef GETSTATS if(node_id_ == 0) { MeanOfLayer("SMFPIn", bot, gparams_.batch_size*gparams_.nInput); MeanOfLayer("SMFPOut", top, 
gparams_.batch_size*gparams_.nOutput); MeanOfLayer("SMFPLabel", label, gparams_.batch_size); } #endif #ifdef TIMING gettimeofday(&tvcs, NULL); #endif #ifdef USE_MLSL MPI_Allreduce(MPI_IN_PLACE, &gparams_.loss, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); #endif #ifdef TIMING gettimeofday(&tvce, NULL); double allrtime = (tvce.tv_sec + tvce.tv_usec*1e-6) - (tvcs.tv_sec + tvcs.tv_usec*1e-6); if(node_id_ == 0) printf("Softmax all-reduce time: %f ms\n",allrtime*1000); #endif if(node_id_ == 0 && eptr_->get_current_batch() % LOSSFREQ == 0) { gparams_.loss = gparams_.loss/num_nodes_; printf("loss = %.15f (weighted loss = %.15f)\n", gparams_.loss, gparams_.loss*gparams_.loss_weight); } } void SoftmaxLossNode::backPropagate() { #ifdef RETURNALL return; #endif float* gbot = (float*)(tenBotDiff_->getBuffer()); int* label = (int*)(tenBotData_[1]->getBuffer()); float* top = (float*)(tenTopData_->getBuffer()); #ifdef GETSTATS printf("Executing BP %s: Grad output %p, label %p Grad input %p\n",NNNode::nname_.c_str(), top, label, gbot); if(node_id_ == 0) MeanOfLayer("BPIn", top, gparams_.batch_size*gparams_.nOutput); #endif impl->set_num_nodes(num_nodes_); impl->backPropagate(tenTopData_, tenBotData_[1], tenBotDiff_); #ifdef GETSTATS if(node_id_ == 0) MeanOfLayer("BPOut", gbot, gparams_.batch_size*gparams_.nInput); #endif } libxsmm-1.17/samples/deeplearning/gxm/src/SoftmaxLossLoop.cpp000066400000000000000000000060101415223013700244130ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #include #include "SoftmaxLossLoop.hpp" #include "common.hpp" #include #include void SMaxLossLoop::forwardPropagate(TensorBuf* inpb, TensorBuf* labelb, TensorBuf* outpb) { int nImg = gp->batch_size; int nFM = gp->nInput; float* inp, *outp; int *label; label = (int*)labelb->getBuffer(); inp = (float*)inpb->getBuffer(); outp = (float*)outpb->getBuffer(); float (* __restrict input)[nFM] = (float (*)[*])inp; float (* __restrict output)[nFM] = (float (*)[*])outp; #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; i max) max = input[i][fm]; } float sum_of_exp = 0.0; #pragma omp simd reduction(+: sum_of_exp) for(int fm = 0; fm < nFM; fm++) { output[i][fm] = output[i][fm] - max; output[i][fm] = exp(output[i][fm]); sum_of_exp += output[i][fm]; } float recp_soe = 1.0/sum_of_exp; //Normalize each value by sum_of_exp #pragma omp simd for(int fm = 0; fm < nFM; fm++) output[i][fm] = output[i][fm]*recp_soe; } float loss = 0.0; #pragma omp parallel for reduction(+: loss) for(int img = 0; img < nImg; img++) { float val = output[img][label[img]] > FLT_MIN ? 
output[img][label[img]] : FLT_MIN; loss += log(val); } gp->loss = -loss/nImg; } void SMaxLossLoop::backPropagate(TensorBuf *outpb, TensorBuf* labelb, TensorBuf *delinpb) { int nImg = gp->batch_size; int nFM = gp->nInput; float *outp, *delinp; int* label; label = (int*)labelb->getBuffer(); delinp = (float*)delinpb->getBuffer(); outp = (float*)outpb->getBuffer(); #ifdef USE_MLSL float recp_mb = 1.0/(nImg * num_nodes); #else float recp_mb = 1.0/nImg; #endif float (* __restrict output )[nFM] = (float (*)[*])outp; float (* __restrict del_input )[nFM] = (float (*)[*])delinp; #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; iloss_weight; else del_input[i][fm] = output[i][fm] * recp_mb * gp->loss_weight; } } } libxsmm-1.17/samples/deeplearning/gxm/src/Solver.cpp000066400000000000000000000563531415223013700225700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #include #include "Solver.hpp" #define VLEN 16 using namespace std; using namespace gxm; SolverNode::SolverNode(SolverParams* p, MLEngine* e): MLNode(p, e) { lr_policy_ = p->getLRPolicy(); base_lr_ = p->getLearningRate(); warmup_lr_ = p->getWarmupLR(); mval_ = p->getMomentum(); decayval_ = p->getWeightDecay(); power_ = p->getPower(); gamma_ = p->getGamma(); step_size_ = p->getStepSize(); max_iter_ = p->getMaxIter(); stepvalues_ = p->getStepValues(); warmup_max_epoch_ = p->getWarmupEpochs(); stepidx_ = 0; epochs_ = p->getEpochs(); test_epoch_ = p->getTestEpoch(); solver_type_ = p->getSolverType(); global_ = p->getGlobalFlag(); data_type_ = p->getDataType(); eptr_ = e; } void SolverNode::convert_bf16_f32(libxsmm_bfloat16 **in, float** out, int len) { #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int ntps = eptr_->get_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - n*ntps; libxsmm_bfloat16 *inp = in[n]; float *outp = out[n]; int jobs = (len % ntps == 0) ? len/ntps : len/ntps + 1; int tb = (ltid*jobs < len) ? ltid*jobs : len; int te = ((ltid+1)*jobs < len) ? 
(ltid+1)*jobs : len; for (int i = tb; i < te; i+=16 ) { __m256i vbfp16 = _mm256_loadu_si256( (const __m256i*)(inp+i) ); __m512 vfp32 = gxm_bfp16_to_fp32_avx512f( vbfp16 ); _mm512_storeu_ps( outp+i, vfp32 ); } } } void SolverNode::convert_bf16_f32(libxsmm_bfloat16 *in, float* out, int len) { int i; #ifdef _OPENMP #pragma omp parallel for private(i) #endif for (i = 0; i < len; i+=16 ) { __m256i vbfp16 = _mm256_loadu_si256( (const __m256i*)(in+i) ); __m512 vfp32 = gxm_bfp16_to_fp32_avx512f( vbfp16 ); _mm512_storeu_ps( out+i, vfp32 ); } } void SolverNode::applyUpdate(float **blob, float **inc, void **grad, int s, float** lr_mult, float** decay_mult, string tensorType) { int iter = eptr_->get_current_batch() + eptr_->get_num_train_batches() * eptr_->get_current_epoch(); int warmup_max_iter = eptr_->get_num_train_batches() * warmup_max_epoch_; // Warm-up if(eptr_->get_current_epoch() < warmup_max_epoch_) lrval_ = (iter*base_lr_ + (warmup_max_iter - iter) * warmup_lr_)/warmup_max_iter; else if(lr_policy_.compare("fixed") == 0) lrval_ = base_lr_; else if(lr_policy_.compare("step") == 0) lrval_ = base_lr_ * pow(gamma_, floor((double)iter/(double)step_size_)); else if(lr_policy_.compare("poly") == 0) lrval_ = base_lr_ * pow(((float)1. - ((float)iter/(float)max_iter_)), power_); else if(lr_policy_.compare("inv") == 0) lrval_ = base_lr_ * pow((1 + gamma_ * iter), (-power_)); else if(lr_policy_.compare("multistep") == 0) { if(stepidx_ < stepvalues_.size() && iter > stepvalues_[stepidx_]) stepidx_++; lrval_ = base_lr_ * pow(gamma_, (float)stepidx_); } eptr_->set_learning_rate(lrval_); if(tensorType=="WEIGHT" && data_type_ == BF16) { for(int n=0; nget_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - n*ntps; int jobs = (sn % ntps == 0) ? (sn/ntps) : (sn/ntps) + 1; int tb = (ltid * jobs < sn) ? (ltid * jobs) : sn; int te = (ltid + 1)*jobs < sn ? 
(ltid + 1)*jobs : sn; float *wgp = (wgrad_ptr[n]+n*sn); for(int nn=0; nnget_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; if(n != 0) { int ltid = tid - n*ntps; int jobs = (sn % ntps == 0) ? (sn/ntps) : (sn/ntps) + 1; int tb = (ltid * jobs < sn) ? (ltid * jobs) : sn; int te = (ltid + 1)*jobs < sn ? (ltid + 1)*jobs : sn; float *wgp = wgrad_ptr[0]+n*sn; float *rgp = wgrad_ptr[n]+n*sn; #pragma omp simd for(int i=tb; iget_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - n*ntps; float *blobp = blob[n] + n*sn; float *incp = inc[n] + n*sn; float *lrp = lr_mult[n] + n*sn; float *dcp = decay_mult[n] + n*sn; int jobs = (sn % ntps == 0) ? (sn / ntps) : (sn / ntps) + 1; int tb = (ltid * jobs < sn) ? (ltid * jobs) : sn; int te = (ltid + 1)*jobs < sn ? (ltid + 1)*jobs : sn; #pragma omp barrier #pragma omp simd for(int i=tb; iget_current_epoch() < warmup_max_epoch_) { if(prev_lrval_ != -1) { mc_ = lrval_/prev_lrval_; prev_lrval_ = lrval_; } else prev_lrval_ = lrval_; } #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int ntps = eptr_->get_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - n*ntps; float *blobp = blob[n] + n*sn; float *incp = inc[n] + n*sn; float *lrp = lr_mult[n] + n*sn; float *dcp = decay_mult[n] + n*sn; int jobs = (sn % ntps == 0) ? (sn / ntps) : (sn / ntps) + 1; int tb = (ltid * jobs < sn) ? (ltid * jobs) : sn; int te = (ltid + 1)*jobs < sn ? (ltid + 1)*jobs : sn; #pragma omp barrier #pragma omp simd for(int i=tb; iget_current_epoch() < warmup_max_epoch_) { if(prev_lrval_ != -1) { mc1_ = lrval_/prev_lrval_; if(prev_lrval_1_ != -1) mc2_ = prev_lrval_/prev_lrval_1_; } prev_lrval_1_ = prev_lrval_; prev_lrval_ = lrval_; } #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int ntps = eptr_->get_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - n*ntps; int jobs = (sn % ntps == 0) ? (sn / ntps) : (sn / ntps) + 1; int tb = (ltid * jobs < sn) ? 
(ltid * jobs) : sn; int te = (ltid + 1)*jobs < sn ? (ltid + 1)*jobs : sn; float *bp = blob[n] + n*sn; float *incp = inc[n] + n*sn; float *lrp = lr_mult[n] + n*sn; float *dcp = decay_mult[n] + n*sn; float *wgp = wgrad_ptr[n] + n*sn; #pragma omp simd for(int i=tb; iget_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - n*ntps; int jobs = (sn % ntps == 0) ? (sn / ntps) : (sn / ntps) + 1; int tb = (ltid * jobs < sn) ? (ltid * jobs) : sn; int te = (ltid + 1)*jobs < sn ? (ltid + 1)*jobs : sn; for(int nn=0; nnget_current_batch() + eptr_->get_num_train_batches() * eptr_->get_current_epoch(); int warmup_max_iter = eptr_->get_num_train_batches() * warmup_max_epoch_; // Warm-up if(eptr_->get_current_epoch() < warmup_max_epoch_) lrval_ = (iter*base_lr_ + (warmup_max_iter - iter) * warmup_lr_)/warmup_max_iter; else if(lr_policy_.compare("fixed") == 0) lrval_ = base_lr_; else if(lr_policy_.compare("step") == 0) lrval_ = base_lr_ * pow(gamma_, floor((double)iter/(double)step_size_)); else if(lr_policy_.compare("poly") == 0) lrval_ = base_lr_ * pow(((float)1. - ((float)iter/(float)max_iter_)), power_); else if(lr_policy_.compare("inv") == 0) lrval_ = base_lr_ * pow((1 + gamma_ * iter), (-power_)); else if(lr_policy_.compare("multistep") == 0) { if(stepidx_ < stepvalues_.size() && iter > stepvalues_[stepidx_]) stepidx_++; lrval_ = base_lr_ * pow(gamma_, (float)stepidx_); } eptr_->set_learning_rate(lrval_); #ifdef BF16_MLSL if(tensorType=="WEIGHT" && data_type_ == BF16) { for(int n=0; nget_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - n*ntps; int jobs = (sn % ntps == 0) ? (sn/ntps) : (sn/ntps) + 1; int tb = (ltid * jobs < sn) ? (ltid * jobs) : sn; int te = (ltid + 1)*jobs < sn ? (ltid + 1)*jobs : sn; float *wgp = (wgrad_ptr[n]+n*sn); for(int nn=0; nnget_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; if(n != 0) { int ltid = tid - n*ntps; int jobs = (sn % ntps == 0) ? (sn/ntps) : (sn/ntps) + 1; int tb = (ltid * jobs < sn) ? 
(ltid * jobs) : sn; int te = (ltid + 1)*jobs < sn ? (ltid + 1)*jobs : sn; float *wgp = wgrad_ptr[0]+n*sn; float *rgp = wgrad_ptr[n]+n*sn; #pragma vector nontemporal #pragma omp simd for(int i=tb; iget_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - n*ntps; float *blobp = blob[n] + n*sn; float *incp = inc[n] + n*sn; int jobs = (sn % ntps == 0) ? (sn / ntps) : (sn / ntps) + 1; int tb = (ltid * jobs < sn) ? (ltid * jobs) : sn; int te = (ltid + 1)*jobs < sn ? (ltid + 1)*jobs : sn; #pragma omp barrier #pragma omp simd for(int i=tb; iget_current_epoch() < warmup_max_epoch_) { if(prev_lrval_ != -1) { mc_ = lrval_/prev_lrval_; prev_lrval_ = lrval_; } else prev_lrval_ = lrval_; } #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int ntps = eptr_->get_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - n*ntps; float *blobp = blob[n] + n*sn; float *incp = inc[n] + n*sn; int jobs = (sn % ntps == 0) ? (sn / ntps) : (sn / ntps) + 1; int tb = (ltid * jobs < sn) ? (ltid * jobs) : sn; int te = (ltid + 1)*jobs < sn ? (ltid + 1)*jobs : sn; #pragma omp barrier #pragma omp simd for(int i=tb; iget_current_epoch() < warmup_max_epoch_) { if(prev_lrval_ != -1) { mc1_ = lrval_/prev_lrval_; if(prev_lrval_1_ != -1) mc2_ = prev_lrval_/prev_lrval_1_; } prev_lrval_1_ = prev_lrval_; prev_lrval_ = lrval_; } #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int ntps = eptr_->get_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - n*ntps; int jobs = (sn % ntps == 0) ? (sn / ntps) : (sn / ntps) + 1; int tb = (ltid * jobs < sn) ? (ltid * jobs) : sn; int te = (ltid + 1)*jobs < sn ? (ltid + 1)*jobs : sn; float *incp = (inc[n]+n*sn); float *wgp = (wgrad_ptr[n]+n*sn); float *bp = (blob[n]+n*sn); #pragma omp simd for(int i=tb; iget_num_threads()/NUM_NUMA_NODES; int n = tid/ntps; int ltid = tid - n*ntps; int jobs = (sn % ntps == 0) ? (sn / ntps) : (sn / ntps) + 1; int tb = (ltid * jobs < sn) ? 
(ltid * jobs) : sn; int te = (ltid + 1)*jobs < sn ? (ltid + 1)*jobs : sn; for(int nn=0; nnget_current_batch() + eptr_->get_num_train_batches() * eptr_->get_current_epoch(); int warmup_max_iter = eptr_->get_num_train_batches() * warmup_max_epoch_; // Warm-up if(eptr_->get_current_epoch() < warmup_max_epoch_) lrval_ = (iter*base_lr_ + (warmup_max_iter - iter) * warmup_lr_)/warmup_max_iter; else if(lr_policy_.compare("fixed") == 0) lrval_ = base_lr_; else if(lr_policy_.compare("step") == 0) lrval_ = base_lr_ * pow(gamma_, floor((double)iter/(double)step_size_)); else if(lr_policy_.compare("poly") == 0) lrval_ = base_lr_ * pow(((float)1. - ((float)iter/(float)max_iter_)), power_); else if(lr_policy_.compare("inv") == 0) lrval_ = base_lr_ * pow((1 + gamma_ * iter), (-power_)); else if(lr_policy_.compare("multistep") == 0) { if(stepidx_ < stepvalues_.size() && iter > stepvalues_[stepidx_]) stepidx_++; lrval_ = base_lr_ * pow(gamma_, (float)stepidx_); } eptr_->set_learning_rate(lrval_); float *wgrad_ptr; if(tensorType=="WEIGHT" && data_type_ == BF16) { if(tmp_grad[0] == NULL) tmp_grad[0] = (float*)libxsmm_aligned_malloc(s*sizeof(float), 2097152); convert_bf16_f32((libxsmm_bfloat16*)grad, tmp_grad[0], s); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; iget_current_epoch() < warmup_max_epoch_) { if(prev_lrval_ != -1) { mc_ = lrval_/prev_lrval_; prev_lrval_ = lrval_; } else prev_lrval_ = lrval_; } #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int threads = omp_get_num_threads(); int jobs = (s % threads == 0) ? (s / threads) : (s / threads) + 1; int tb = (tid * jobs < s) ? (tid * jobs) : s; int te = (tid + 1)*jobs < s ? 
(tid + 1)*jobs : s; #pragma omp simd for(int i=tb; iget_current_epoch() < warmup_max_epoch_) { if(prev_lrval_ != -1) { mc1_ = lrval_/prev_lrval_; if(prev_lrval_1_ != -1) mc2_ = prev_lrval_/prev_lrval_1_; } prev_lrval_1_ = prev_lrval_; prev_lrval_ = lrval_; } #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int threads = omp_get_num_threads(); int jobs = (s % threads == 0) ? (s / threads) : (s / threads) + 1; int tb = (tid * jobs < s) ? (tid * jobs) : s; int te = (tid + 1)*jobs < s ? (tid + 1)*jobs : s; #pragma omp simd for(int i=tb; iget_current_batch() + eptr_->get_num_train_batches() * eptr_->get_current_epoch(); int warmup_max_iter = eptr_->get_num_train_batches() * warmup_max_epoch_; // Warm-up if(eptr_->get_current_epoch() < warmup_max_epoch_) lrval_ = (iter*base_lr_ + (warmup_max_iter - iter) * warmup_lr_)/warmup_max_iter; else if(lr_policy_.compare("fixed") == 0) lrval_ = base_lr_; else if(lr_policy_.compare("step") == 0) lrval_ = base_lr_ * pow(gamma_, floor((double)iter/(double)step_size_)); else if(lr_policy_.compare("poly") == 0) lrval_ = base_lr_ * pow(((float)1. 
- ((float)iter/(float)max_iter_)), power_); else if(lr_policy_.compare("inv") == 0) lrval_ = base_lr_ * pow((1 + gamma_ * iter), (-power_)); else if(lr_policy_.compare("multistep") == 0) { if(stepidx_ < stepvalues_.size() && iter > stepvalues_[stepidx_]) stepidx_++; lrval_ = base_lr_ * pow(gamma_, (float)stepidx_); } eptr_->set_learning_rate(lrval_); float *wgrad_ptr; if(tensorType=="WEIGHT" && data_type_ == BF16) { if(tmp_grad[0] == NULL) tmp_grad[0] = (float*)libxsmm_aligned_malloc(s*sizeof(float), 2097152); convert_bf16_f32((libxsmm_bfloat16*)grad, tmp_grad[0], s); #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; iget_current_epoch() < warmup_max_epoch_) { if(prev_lrval_ != -1) { mc_ = lrval_/prev_lrval_; prev_lrval_ = lrval_; } else prev_lrval_ = lrval_; } #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int threads = omp_get_num_threads(); int jobs = (s % threads == 0) ? (s / threads) : (s / threads) + 1; int tb = (tid * jobs < s) ? (tid * jobs) : s; int te = (tid + 1)*jobs < s ? (tid + 1)*jobs : s; #pragma omp simd for(int i=tb; iget_current_epoch() < warmup_max_epoch_) { if(prev_lrval_ != -1) { mc1_ = lrval_/prev_lrval_; if(prev_lrval_1_ != -1) mc2_ = prev_lrval_/prev_lrval_1_; } prev_lrval_1_ = prev_lrval_; prev_lrval_ = lrval_; } #ifdef _OPENMP #pragma omp parallel #endif { int tid = omp_get_thread_num(); int threads = omp_get_num_threads(); int jobs = (s % threads == 0) ? (s / threads) : (s / threads) + 1; int tb = (tid * jobs < s) ? (tid * jobs) : s; int te = (tid + 1)*jobs < s ? 
(tid + 1)*jobs : s; #pragma omp simd for(int i=tb; i #include "Split.hpp" using namespace std; using namespace gxm; SplitNode::SplitNode(SplitParams *p, MLEngine *e) : NNNode(p, e) { nname_ = p->get_node_name(); ntype_ = p->get_node_type(); mode_ = p->get_mode(); bottom_ = p->get_bottom_names(); top_ = p->get_top_names(); bp_flag_ = p->get_bprop_flag(); has_weights_ = false; bot_compute_engine_ = p->get_compute_engine(); if(nname_.find("label") != nname_.npos) tenBot_ = e->get_tensor(bottom_[0], LABEL); else tenBot_ = e->get_tensor(bottom_[0], ACT); assert(tenBot_ != NULL); NNNode *pnn = (NNNode*)tenBot_->getOwner(); setPrevNode(pnn); bot_cengine_ = pnn->get_bot_compute_engine(); pnn->set_top_compute_engine(p->get_compute_engine()); pnn->set_next_node_type(ntype_); tenBotData_ = tenBot_->getBuf(DATA); in_dtype = tenBotData_->getDataType(); out_dtype = in_dtype; Shape* bs = tenBot_->getShape(); assert(bs->ndims <= MAX_DIMS); // number of splits gparams_.nOutput.resize(top_.size()); tenTop_.resize(top_.size()); tenTopData_.resize(top_.size()); for(int i=0; isetOwner(this); if(nname_.find("label") != nname_.npos) tenTop_[i]->setType(LABEL); else tenTop_[i]->setType(ACT); Shape ts; shape_setzero(&ts); ts.ndims = bs->ndims; for(int j=0; jndims; j++) ts.dims[j] = bs->dims[j]; tenTop_[i]->setShape(&ts); tenTopData_[i] = tenTop_[i]->getBuf(DATA); tenTopData_[i]->setBufferType(DATA); if(nname_.find("label") == nname_.npos) tenTopData_[i]->setDataType(in_dtype); else tenTopData_[i]->setDataType(DT_INT); bool inserted; if(nname_.find("label") != nname_.npos) inserted = e->register_tensor(NNNode::top_[i], LABEL, tenTop_[i]); else inserted = e->register_tensor(NNNode::top_[i], ACT, tenTop_[i]); if(!inserted) printf("Warning: Tensor %s already registered\n",NNNode::top_[i].c_str()); } if(!e->is_inference_only()) { if(bp_flag_) { tenBotDiff_ = tenBot_->addBuf(); tenBotDiff_->setDataType(in_dtype); tenBotDiff_->setBufferType(DIFF); int elem = 
bs->dims[0]*bs->dims[1]*bs->dims[2]*bs->dims[3]; //printf("%s: elem = %d\n",nname_.c_str(),elem); if(in_dtype == DT_FLOAT) elem = elem*sizeof(float); else if(in_dtype == DT_BF16) elem = elem*sizeof(libxsmm_bfloat16); tenBotDiff_->setBufferSize(elem); } } gparams_.bdims = bs->ndims; gparams_.tdims = bs->ndims; gparams_.batch_size = bs->dims[0]; gparams_.nInput = bs->dims[1]; for(int i=0; idims[1]; gparams_.iHeight = bs->dims[2]; gparams_.iWidth = bs->dims[3]; gparams_.oHeight = gparams_.iHeight; gparams_.oWidth = gparams_.iWidth; gparams_.in_data_type = in_dtype; gparams_.out_data_type = out_dtype; gparams_.num_threads = e->get_num_threads(); eptr_ = e; configure(XSMM); } void SplitNode::configure(int engine) { switch(engine) { case XSMM: impl = new SplitLoop(&gparams_, engine); break; } } void SplitNode::convert_bf16_f32(libxsmm_bfloat16* in, float *out, int len) { #ifdef _OPENMP #pragma omp parallel for #endif for(int i=0; iset_bot_compute_engine(bot_cengine_); for(int i=0; iset_top_compute_engine(top_compute_engine_); impl->forwardPropagate(tenBotData_, tenTopData_); #ifdef GETSTATS #ifdef USE_MLSL size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else size_t node_id = 0; #endif if(node_id == 0 && eptr_->get_current_batch() % STATFREQ == 0) { if(gparams_.in_data_type == DT_FLOAT) { float* bot = (float*) tenBotData_->getBuffer(); Shape *bs = tenBot_->getShape(); int size = bs->dims[0]*bs->dims[1]*bs->dims[2]*bs->dims[3]; string s = nname_ + "_Inp"; MeanOfLayer((char*)s.c_str(), bot, size); } else if(gparams_.in_data_type == DT_BF16) { int size = nImg*ifm*fh*fw; if(stptr == NULL) stptr = (float*)libxsmm_aligned_malloc(size*sizeof(float), 2097152); libxsmm_bfloat16* bot = (libxsmm_bfloat16*) tenBotData_->getBuffer(); Shape *bs = tenBot_->getShape(); libxsmm_convert_bf16_f32(bot, stptr, size); string s = nname_ + "_Inp"; MeanOfLayer((char*)s.c_str(), stptr, size); } for(int i=0; igetShape(); int size = ts->dims[0]*ts->dims[1]*ts->dims[2]*ts->dims[3]; 
float* top = (float*)tenTopData_[i]->getBuffer(); string s = nname_ + "_Outp_" + to_string(i); MeanOfLayer((char*)s.c_str(), top, size); } else if(gparams_.out_data_type == DT_BF16) { Shape *ts = tenTop_[i]->getShape(); int size = ts->dims[0]*ts->dims[1]*ts->dims[2]*ts->dims[3]; libxsmm_bfloat16* top = (libxsmm_bfloat16*)tenTopData_[i]->getBuffer(); libxsmm_convert_bf16_f32(top, stptr, size); string s = nname_ + "_Outp_" + to_string(i); MeanOfLayer((char*)s.c_str(), stptr, size); } } } #endif } void SplitNode::backPropagate() { int num_gtops=0; int nni; int nImg = gparams_.batch_size; int ifm = gparams_.nInput; int ofm = gparams_.nOutput[0]; int ifh = gparams_.iHeight; int ifw = gparams_.iWidth; int ofh = gparams_.oHeight; int ofw = gparams_.oWidth; for(int i=0; igetBuf(DIFF) != NULL) { nni = i; num_gtops++; } } tenTopDiff_.resize(num_gtops); if(num_gtops == 1) tenTopDiff_[0] = tenTop_[nni]->getBuf(DIFF); else { for(int i=0; igetBuf(DIFF); } impl->backPropagate(tenTopDiff_, tenBotDiff_); #ifdef CHECK_BLOWUP_FP32 if(in_dtype == DT_FLOAT) { for(int i=0; i<16; i++) { float v = ((float*)tenBotDiff_->getBuffer())[i]; if(isnan(v) || isinf(v)) { printf("Warning! %s layer BP activations are NaN or Inf\n", nname_.c_str()); exit(-1); } } } else if(in_dtype == DT_BF16) { convert_bf16_f32((libxsmm_bfloat16*)tenBotDiff_->getBuffer(), cbptr, 16); #ifdef USE_MLSL int node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else int node_id = 0; #endif if(node_id == 0) { for(int i=0; i<16; i++) { if(isnan(cbptr[i]) || isinf(cbptr[i])) { printf("Warning! 
%s layer BP activations are NaN or Inf\n", nname_.c_str()); MeanOfLayer((char*)((nname_+"_delin").c_str()), (libxsmm_bfloat16*)tenBotDiff_->getBuffer(), nImg*ifm*ifh*ifw); MeanOfLayer((char*)((nname_+"_delout0").c_str()), (libxsmm_bfloat16*)tenTopDiff_[0]->getBuffer(), nImg*ofm*ofh*ofw); MeanOfLayer((char*)((nname_+"_delout1").c_str()), (libxsmm_bfloat16*)tenTopDiff_[1]->getBuffer(), nImg*ofm*ofh*ofw); #ifdef USE_MLSL MPI_Finalize(); #endif exit(-1); } } } } #endif #ifdef GETSTATS #ifdef USE_MLSL size_t node_id = MLSL::Environment::GetEnv().GetProcessIdx(); #else size_t node_id = 0; #endif if(node_id == 0 && eptr_->get_current_batch() % STATFREQ == 0) { int size; for(int i=0; igetBuffer(); Shape *ts = tenTop_[i]->getShape(); int size = ts->dims[0]*ts->dims[1]*ts->dims[2]*ts->dims[3]; string s = nname_ + "_delOutp_" + to_string(i); MeanOfLayer((char*)s.c_str(), ptr, size); } else if(gparams_.out_data_type == DT_BF16) { libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenTopDiff_[i]->getBuffer(); Shape *ts = tenTop_[i]->getShape(); int size = ts->dims[0]*ts->dims[1]*ts->dims[2]*ts->dims[3]; libxsmm_convert_bf16_f32(ptr, stptr, size); string s = nname_ + "_delOutp_" + to_string(i); MeanOfLayer((char*)s.c_str(), stptr, size); } } } if(gparams_.in_data_type == DT_FLOAT) { float *ptr = (float*)tenBotDiff_->getBuffer(); Shape *bs = tenBot_->getShape(); size = bs->dims[0]*bs->dims[1]*bs->dims[2]*bs->dims[3]; string s = nname_ + "_delInp"; MeanOfLayer((char*)s.c_str(), ptr, size); } else if(gparams_.in_data_type == DT_BF16) { libxsmm_bfloat16 *ptr = (libxsmm_bfloat16*)tenBotDiff_->getBuffer(); Shape *bs = tenBot_->getShape(); size = bs->dims[0]*bs->dims[1]*bs->dims[2]*bs->dims[3]; libxsmm_convert_bf16_f32(ptr, stptr, size); string s = nname_ + "_delInp"; MeanOfLayer((char*)s.c_str(), stptr, size); } } #endif } 
libxsmm-1.17/samples/deeplearning/gxm/src/SplitLoop.cpp000066400000000000000000000176631415223013700232440ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #include #include #include #include "SplitLoop.hpp" # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 __m512i vnaninf = _mm512_set1_epi32( 0x7f800000 ); __m512i vrneadd = _mm512_set1_epi32( 0x00007fff ); __m512i vfixup = _mm512_set1_epi32( 0x00000001 ); __m512i vfixupmask = _mm512_set1_epi32( 0x00010000 ); # define _mm512_roundbf16rne(A) _mm512_mask_add_epi32( _mm512_castps_si512( A ), _mm512_cmp_epi32_mask( _mm512_and_epi32( _mm512_castps_si512( A ), vnaninf ), vnaninf, _MM_CMPINT_NE ), _mm512_castps_si512( A ), _mm512_mask_add_epi32( vrneadd , _mm512_cmp_epi32_mask( _mm512_and_epi32( _mm512_castps_si512( A ), vfixupmask ), vfixupmask, _MM_CMPINT_EQ ), vrneadd, vfixup ) ) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)A,_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)A,_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)A,_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) 
_mm256_storeu_si256((__m256i*)A,_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #define VLEN 16 void SplitLoop::forwardPropagate(TensorBuf *inpb, vector& outpb, int tid) { for(int i=0; isetBuffer(inpb->getBuffer()); outpb[i]->setBufferSize(inpb->getBufferSize()); outpb[i]->setLayoutType(inpb->getLayoutType()); } } void SplitLoop::backPropagate(vector& deloutpb, TensorBuf *delinpb, int tid) { assert(gp->bdims == gp->tdims); int nImg = gp->batch_size; int nIfm = gp->nInput; int ifh = gp->iHeight; int ifw = gp->iWidth; int in_dtype = delinpb->getDataType(); int out_dtype = deloutpb[0]->getDataType(); void* delinp = delinpb->getBuffer(); void *deloutp[deloutpb.size()]; int num_outp = 1; int size = nImg*nIfm*ifh*ifw; deloutp[0] = deloutpb[0]->getBuffer(); for(int i=1; igetBuffer(); num_outp++; } if(in_dtype == DT_FLOAT && out_dtype == DT_FLOAT) { #ifdef __AVX512F__ if (size % 16 == 0) { if ( num_outp == 2 ) { float* out1 = (float*)deloutp[0]; float* out2 = (float*)deloutp[1]; #ifdef _OPENMP #pragma omp parallel for #endif for(int j=0; jsetLayoutType(deloutpb[0]->getLayoutType()); } libxsmm-1.17/samples/deeplearning/gxm/src/db.cpp000066400000000000000000000020051415223013700216640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #include "db.hpp" #include "db_lmdb.hpp" #include DB* GetDB(const string& backend) { #ifdef USE_LMDB if (backend == "lmdb") { return new LMDB(); } #endif // USE_LMDB printf("Unknown database backend\n"); exit(1); return NULL; } libxsmm-1.17/samples/deeplearning/gxm/src/db_lmdb.cpp000066400000000000000000000031141415223013700226640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #ifdef USE_LMDB #include "db_lmdb.hpp" #include #include void LMDB::Open(const string& source) { MDB_CHECK(mdb_env_create(&mdb_env_)); int flags = MDB_RDONLY | MDB_NOTLS; flags |= MDB_NOLOCK; int rc = mdb_env_open(mdb_env_, source.c_str(), flags, 0664); MDB_CHECK(rc); printf("Opened lmdb %s\n", source.c_str()); } LMDBCursor* LMDB::NewCursor() { MDB_txn* mdb_txn; MDB_stat stat; MDB_cursor* mdb_cursor; MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn)); MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_)); MDB_CHECK(mdb_cursor_open(mdb_txn, mdb_dbi_, &mdb_cursor)); MDB_CHECK(mdb_stat(mdb_txn, mdb_dbi_, &stat)); int count = stat.ms_entries; printf("lmdb Database has %d files\n", count); return new LMDBCursor(mdb_txn, mdb_cursor, count); } #endif // USE_LMDB libxsmm-1.17/samples/deeplearning/gxm/src/main.cpp000066400000000000000000000064061415223013700222340ustar00rootroot00000000000000/****************************************************************************** * Copyright 
(c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) ******************************************************************************/ #include #include "string.h" #include "Engine.hpp" #include "Node.hpp" #include "Accuracy.hpp" #include "Concat.hpp" #include "DummyData.hpp" #include "ImageData.hpp" #include "JitterData.hpp" #include "LMDBData.hpp" #include "Conv.hpp" #include "FullyConnected.hpp" #include "ReLU.hpp" #include "Dropout.hpp" #include "Pooling.hpp" #include "SoftmaxLoss.hpp" #include "Split.hpp" #include "FusedBNorm.hpp" #include "FusedConvBN.hpp" #include "Eltwise.hpp" #include "TypeList.hpp" using namespace std; using namespace gxm; TypeList nodeTypes[] = { {"Accuracy", parseAccuracyParams, CreateMLNode}, {"FusedBatchNorm", parseFusedBNormParams, CreateMLNode}, {"FusedConvBN", parseFusedConvBNParams, CreateMLNode}, {"Eltwise", parseEltwiseParams, CreateMLNode}, {"Split", parseSplitParams, CreateMLNode}, {"Concat", parseConcatParams, CreateMLNode}, {"DummyData", parseDummyDataParams, CreateMLNode}, {"ImageData", parseImageDataParams, CreateMLNode}, {"JitterData", parseJitterDataParams, CreateMLNode}, {"LMDBData", parseLMDBDataParams, CreateMLNode}, {"Convolution", parseConvParams, CreateMLNode}, {"FullyConnected", parseFCParams, CreateMLNode}, {"Pooling", parsePoolingParams, CreateMLNode}, {"ReLU", parseReLUParams, CreateMLNode}, {"Dropout", parseDropoutParams, CreateMLNode}, {"SoftmaxWithLoss", parseSoftmaxParams, CreateMLNode} }; const int numTypes = sizeof(nodeTypes)/sizeof(nodeTypes[0]); int main(int argc, char* argv[]) { //Command-line arguments for MLConfig, SolverConfig, MachineConfig #ifdef USE_MLSL 
MLSL::Environment::GetEnv().Init(&argc, &argv); #endif // Create MLEngine instance MLEngine *engine = new MLEngine(); if(strcmp(argv[1], "train") == 0) { string mlcfg(argv[2]); string solvercfg(argv[3]); engine->create(TRAIN, mlcfg, solvercfg); engine->run(TRAIN); } else if(strcmp(argv[1], "test") == 0) { string mlcfg(argv[2]); string solvercfg(argv[3]); engine->create(TEST, mlcfg, solvercfg); engine->run(TEST); } #ifdef USE_MLSL MLSL::Environment::GetEnv().Finalize(); #endif return 0; } libxsmm-1.17/samples/deeplearning/gxm/src/reduce_weight_grads.c000066400000000000000000000013721415223013700247430ustar00rootroot00000000000000int jobs = ofm * ifm * kh * kw; int jn = jobs/gp->num_numa_nodes; int jnv = jn/VLEN; int jpt = (jnv % ntps == 0) ? (jnv/ntps)*VLEN : ((jnv/ntps)+1)*VLEN; int ltid = tid - n*ntps; int tb = (ltid * jpt < jn) ? ltid*jpt : jn; int te = ((ltid+1)*jpt < jn) ? (ltid+1)*jpt : jn; float *wgp = (float*)dwt_ptr[n]+n*jn; for(int nn=0; nnnum_numa_nodes; nn++) { if(n == nn) continue; float *rgp = (float*)dwt_ptr[nn]+n*jn; #pragma omp simd for(int i=tb; inum_numa_nodes; nn++) { if(n == nn) continue; float *wgp = (float*)dwt_ptr[n]+nn*jn; float *rgp = (float*)dwt_ptr[nn]+nn*jn; #pragma vector nontemporal #pragma omp simd for(int i=tb; inum_numa_nodes; nn++) { libxsmm_bfloat16 *rem_ptr = (libxsmm_bfloat16*)dwt_ptr[nn]; for(int i=tb; i 0) { for(int i=ntps*jpt; i #include #include "proto/gxm.pb.h" #include "io.hpp" using namespace std; using namespace gxm; bool parseMachineConfig(const string fname, MachineParameter* param) { bool success = ReadProtoFromText(fname, param); if(!success) printf("Failed to parse Machine Parameter file %s\n",fname.c_str()); return success; } bool parseMLConfig(const string fname, NTGParameter* param) { bool success = ReadProtoFromText(fname, param); if(!success) printf("Failed to parse ML Parameter file %s\n",fname.c_str()); return success; } bool parseSolverConfig(const string fname, SolverParameter* param) { bool success = 
ReadProtoFromText(fname, param); if(!success) printf("Failed to parse ML Parameter file %s\n",fname.c_str()); return success; } libxsmm-1.17/samples/deeplearning/gxm/src/util/check.cpp000066400000000000000000000231451415223013700233410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar, Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "check.hpp" void check_physical_pad(const char *s, float *tensor, int nImg, int nBfm, int fh, int fw, int ifm, int iph, int ipw ) { int fhi = fh + 2*iph; int fwi = fw + 2*ipw; bool success = true; bool padded = false; float (* __restrict tensor_vla)[nBfm][fhi][fwi][ifm] = (float (*)[*][*][*][ifm])tensor; if (iph > 0 || iph > 0) { for (int img = 0; img < nImg; img++) { for (int fm = 0; fm < nBfm; fm++) { for (int w = 0; w < fwi; w++) { for (int ph = 0; ph < iph; ph++) { for (int v = 0; v < ifm; v++) { if ( tensor_vla[img][fm][ph][w][v] != 0.0f ) { success = false; } if ( tensor_vla[img][fm][fhi-1-ph][w][v] != 0.0f ) { float val = tensor_vla[img][fm][fhi-1-ph][w][v]; success = false; } } } } for (int h = iph; h < fh+iph; h++) { for (int pw = 0; pw < ipw; pw++) { for (int v = 0; v < ifm; v++) { if ( tensor_vla[img][fm][h][pw][v] != 0.0f ) { success = false; } if ( tensor_vla[img][fm][h][fwi-1-pw][v] != 0.0f ) { success = false; } } } } } } padded = true; } if ( padded == true ) { if ( success == true ) { printf("%s pacific_rim is clear\n", s); } else { printf("%s pacific_rim is under attack\n", s); } } } void check_physical_pad(const 
char *s, libxsmm_bfloat16 *tensor, int nImg, int nBfm, int fh, int fw, int ifm, int iph, int ipw ) { int fhi = fh + 2*iph; int fwi = fw + 2*ipw; bool success = true; bool padded = false; libxsmm_bfloat16 (* __restrict tensor_vla)[nBfm][fhi][fwi][ifm] = (libxsmm_bfloat16 (*)[*][*][*][ifm])tensor; if (iph > 0 || iph > 0) { for (int img = 0; img < nImg; img++) { for (int fm = 0; fm < nBfm; fm++) { for (int w = 0; w < fwi; w++) { for (int ph = 0; ph < iph; ph++) { for (int v = 0; v < ifm; v++) { if ( tensor_vla[img][fm][ph][w][v] != 0 ) { success = false; } if ( tensor_vla[img][fm][fhi-1-ph][w][v] != 0 ) { success = false; } } } } for (int h = iph; h < fh+iph; h++) { for (int pw = 0; pw < ipw; pw++) { for (int v = 0; v < ifm; v++) { if ( tensor_vla[img][fm][h][pw][v] != 0 ) { success = false; } if ( tensor_vla[img][fm][h][fwi-1-pw][v] != 0 ) { success = false; } } } } } } padded = true; } if ( padded == true ) { if ( success == true ) { printf("%s pacific_rim is clear\n", s); } else { printf("%s pacific_rim is under attack\n", s); } } } void MeanOfLayer(char *s, libxsmm_bfloat16 *array, int size) { union libxsmm_bfloat16_hp max, min, sum, absum; max.i[0] = 0; max.i[1] = array[0]; min.i[0] = 0; min.i[1] = array[0]; sum.i[0] = 0; sum.i[1] = array[0]; absum.i[0] = 0; absum.i[1] = (array[0] > 0) ? array[0] : -array[0]; for(int i=1; i max.f) max.f = val.f; if(val.f < min.f) min.f = val.f; sum.f += val.f; absum.f += fabs(val.f); } printf("%s %f %f %f %f\n", s, sum.f, absum.f, max.f, min.f); } void MeanOfLayer(char *s, float *array, int size) { int nnz, mmt; double max, min; double sum, absum; double stddev_sum, stddev_absum; max = array[0]; min = array[0]; sum = 0; double psum=0, nsum=0; int pc=0, nc=0; absum = 0; nnz = 0; mmt = 0; int which_max = 0; int which_min = 0; int first_nz = -1; int last_nz = -1; for(int i=0; i 1000 || array[i] < -1000) {mmt++; printf(">>%d (%f)\n", i, array[i]);} if(mmt > 10) { printf("In %s more than 10 values out-of-range. 
exiting statistics loop...\n",s); exit(0); } #endif if(array[i] != 0) nnz++; if(array[i] > max) { max = array[i]; which_max = i; } if(array[i] < min) { min = array[i]; which_min = i; } sum += array[i]; if(array[i] > 0) { psum += array[i]; pc++; } else if(array[i] < 0) { nsum += array[i]; nc++; } absum += fabs(array[i]); } double mean = sum/(double)size; double absmean = absum/(double)size; stddev_sum = 0; for(int i=0; i 1000 || array[i] < -1000) {mmt++; printf(">>%d (%f)\n", i, array[i]);} if(mmt > 10) { printf("In %s more than 10 values out-of-range. exiting statistics loop...\n",s); exit(0); } #endif if(array[i] != 0) nnz++; if(array[i] > max) { max = array[i]; which_max = i; } if(array[i] < min) { min = array[i]; which_min = i; } sum += array[i]; if(array[i] > 0) { psum += array[i]; pc++; } else if(array[i] < 0) { nsum += array[i]; nc++; } absum += fabs(array[i]); } double mean = sum/(double)size; double absmean = absum/(double)size; stddev_sum = 0; for(int i=0; i 1000 || array[i] < -1000) {mmt++; printf(">>%d (%d)\n", i, array[i]);} if(mmt > 100) { printf("more than 100 values out-of-range. 
exiting statistics loop...\n"); break; } if(array[i] != 0) nnz++; if(array[i] > max) {max = array[i]; which_max = i;} if(array[i] < min) {min = array[i]; which_min = i;} sum += array[i]; absum += fabs(array[i]); } //printf("layer:%s(%d) mean:%f stddev=%f max:%f(%d) min:%f(%d) \n", layer, size, mean, stddev, max, which_max, min, which_min); // printf("%s:[%d] mean:%.10f (abs mean:%.10f) stddev:%.10f (abs stdev %.10f) max:%.10f(%d) min:%.10f(%d) nnz-perc:%.10f(%d:f=%d l=%d) \n", // s, size, mean, absmean, stddev, abstddev, max, which_max, min, which_min, ((double)nnz)/((double)size), nnz, first_nz, last_nz); printf("%s %d %d %d %d\n", s, sum, absum, max, min); } libxsmm-1.17/samples/deeplearning/gxm/src/util/fillers.cpp000066400000000000000000000066051415223013700237260ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Sasikanth Avancha, Dhiraj Kalamkar (Intel Corp.) 
******************************************************************************/ #include "fillers.hpp" #define MEAN 0.0 #define NUM_SAMPLES 25 #define FAN_IN 0 #define FAN_OUT 1 #define AVERAGE 2 using namespace std; void Uniform(const float lower, const float upper, int n, float *dist) { /* for(int i=0; i distribution(lower,upper); for(int i=0; i distribution (mean, stddev); for(int i=0; i #include "io.hpp" using namespace std; using namespace gxm; const int kProtoReadBytesLimit = INT_MAX; bool ReadProtoFromText(string fname, Message* proto) { int fm = open(fname.c_str(), O_RDONLY); if (fm == -1) { printf("File %s not found\n",fname.c_str()); return false; } FileInputStream* input = new FileInputStream(fm); bool success = google::protobuf::TextFormat::Parse(input, proto); delete input; close(fm); return success; } bool ReadProtoFromBinary(string fname, Message* proto) { int fd = open(fname.c_str(), O_RDONLY); if (fd == -1) { printf("File %s not found\n",fname.c_str()); return false; } ZeroCopyInputStream* raw_input = new FileInputStream(fd); CodedInputStream* coded_input = new CodedInputStream(raw_input); coded_input->SetTotalBytesLimit(kProtoReadBytesLimit, 536870912); bool success = proto->ParseFromCodedStream(coded_input); if(!coded_input->ConsumedEntireMessage()) { printf("parsing failed\n"); exit(-1); } delete coded_input; delete raw_input; close(fd); return success; } void WriteProtoToText(const Message& proto, string filename) { int fd = open(filename.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); FileOutputStream* output = new FileOutputStream(fd); google::protobuf::TextFormat::Print(proto, output); delete output; close(fd); } void ReadNWriteMeanFile(string fname, Message* proto, string outname) { ReadProtoFromBinary(fname, proto); WriteProtoToText(*proto, outname); } #ifdef USE_OPENCV cv::Mat DecodeDatumToCVMatNative(const Datum& datum) { cv::Mat cv_img; const string& data = datum.data(); std::vector vec_data(data.c_str(), data.c_str() + data.size()); 
cv_img = cv::imdecode(vec_data, -1); if (!cv_img.data) { printf("Could not decode datum\n"); } return cv_img; } bool DecodeDatumNative(Datum* datum) { if (datum->encoded()) { cv::Mat cv_img = DecodeDatumToCVMatNative((*datum)); CVMatToDatum(cv_img, datum); return true; } else { return false; } } bool DecodeDatum(Datum* datum, bool is_color) { if (datum->encoded()) { cv::Mat cv_img = DecodeDatumToCVMat((*datum), is_color); CVMatToDatum(cv_img, datum); return true; } else { return false; } } cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color) { cv::Mat cv_img; const string& data = datum.data(); std::vector vec_data(data.c_str(), data.c_str() + data.size()); int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE); cv_img = cv::imdecode(vec_data, cv_read_flag); if (!cv_img.data) { printf("Could not decode datum\n"); } return cv_img; } void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) { datum->set_channels(cv_img.channels()); datum->set_height(cv_img.rows); datum->set_width(cv_img.cols); datum->clear_data(); datum->clear_float_data(); datum->set_encoded(false); int datum_channels = datum->channels(); int datum_height = datum->height(); int datum_width = datum->width(); int datum_size = datum_channels * datum_height * datum_width; std::string buffer(datum_size, ' '); for (int h = 0; h < datum_height; ++h) { const uchar* ptr = cv_img.ptr(h); int img_index = 0; for (int w = 0; w < datum_width; ++w) { for (int c = 0; c < datum_channels; ++c) { int datum_index = (c * datum_height + h) * datum_width + w; buffer[datum_index] = static_cast(ptr[img_index++]); } } } datum->set_data(buffer); } #endif void initSeeds(unsigned int *seeds, int nthreads) { for(int i=0; i #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif /* include c-based 
dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } #if defined(PROFILE) unsigned long long Gbl_blas_start, Gbl_blas_end, Gbl_eltwise_start, Gbl_eltwise_end, Gbl_conv_start, Gbl_conv_end; double Gbl_blas_total, Gbl_eltwise_total, Gbl_conv_total; #endif int main(int argc, char* argv[]) { float *wigold, *wfgold, *wogold, *wcgold, *rigold, *rfgold, *rogold, *rcgold, *bigold, *bfgold, *bogold, *bcgold; float *xgoldt, *cspgold,*hpgold, *csgoldt, *cogoldt, *hgoldt; float *dwgold, *drgold, *dbgold; float *dxgoldt, *dcspgold, *dhpgold, *dcsgold, *dhgoldt; float *icfogoldt, *wgold, *rgold; float *scratch_fwd, *scratch_bu; libxsmm_bfloat16 *xt, *csp, *hp, *w, *r, *b, *cst, *ht; libxsmm_bfloat16 *it, *ft, *ot, *cit, *cot; libxsmm_bfloat16 *dxt, *dcsp, *dhp, *dw, *dr, *db, *dcs, *dht; float forget_bias = 1.0f; float *h_test, *dxt_test, *dw_test, *dr_test, *db_test; void *scratch, *internalstate; size_t scratch_size = 0, internalstate_size = 0; int iters = 10; /* repetitions of benchmark */ int pass = 0; /* pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD */ int N = 168; /* size of mini-batch */ int C = 512; /* number of inputs */ int K = 512; /* number of outputs */ int t = 50; /* number of time steps (>= 1) */ int bn = 24; int bk = 64; int bc = 64; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 
1/*enable by default*/ : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double flops = 0.0, tempflops = 0.0; const double tflops = 12; /* transcendental flops */ int j; libxsmm_dnn_rnncell_desc lstmcell_desc; libxsmm_dnn_rnncell* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_cs_prev; libxsmm_dnn_tensor* libxsmm_hidden_state_prev; libxsmm_dnn_tensor* libxsmm_weight; libxsmm_dnn_tensor* libxsmm_recur_weight; libxsmm_dnn_tensor* libxsmm_bias; libxsmm_dnn_tensor* libxsmm_cs; libxsmm_dnn_tensor* libxsmm_hidden_state; libxsmm_dnn_tensor* libxsmm_i; libxsmm_dnn_tensor* libxsmm_f; libxsmm_dnn_tensor* libxsmm_o; libxsmm_dnn_tensor* libxsmm_ci; libxsmm_dnn_tensor* libxsmm_co; libxsmm_dnn_tensor* libxsmm_dinput; libxsmm_dnn_tensor* libxsmm_dcs_prev; libxsmm_dnn_tensor* libxsmm_dhidden_state_prev; libxsmm_dnn_tensor* libxsmm_dweight; libxsmm_dnn_tensor* libxsmm_drecur_weight; libxsmm_dnn_tensor* libxsmm_dbias; libxsmm_dnn_tensor* libxsmm_dcs; libxsmm_dnn_tensor* libxsmm_dhidden_state; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd_w, norms_upd_r, norms_upd_b, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd_w); libxsmm_matdiff_clear(&norms_upd_r); libxsmm_matdiff_clear(&norms_upd_b); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: ./lstmdriver [reps] [pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD] [N] [C] [K] [time_steps > 0]\n\n"); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ j = 1; if (argc > j) iters = atoi(argv[j++]); if (argc > j) pass = atoi(argv[j++]); if (argc > j) N = atoi(argv[j++]); if (argc > j) C = 
atoi(argv[j++]); if (argc > j) K = atoi(argv[j++]); if (argc > j) t = atoi(argv[j++]); if (argc > j) bn = atoi(argv[j++]); if (argc > j) bc = atoi(argv[j++]); if (argc > j) bk = atoi(argv[j++]); if (t <= 0) { printf("time_steps %d should be greater than or equal to 1\n\n", t); return 0; } if (!(pass == 0 || pass == 1 || pass == 2 || pass == 3)) { printf("Unknown pass: %d, valid arguments for pass = {0(FWD), 1(BWD), 2(UPD), 3(BWD+UPD)\n\n", pass); return 0; } #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d C:%d K:%d T:%d\n", N, C, K, t); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf("SIZE Weight (MB): %10.2f MiB\n", (double)(C*K*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (MB): %10.2f MiB\n", (double)(N*C*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Hidden State: %10.2f MiB\n", (double)(K*N*sizeof(float))/(1024.0*1024.0) ); /* allocate data */ xgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); cspgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); hpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wigold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wfgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wogold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wcgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); rigold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rfgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rogold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rcgold = 
(float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); bigold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bfgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bogold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bcgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); csgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); cogoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); hgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dxgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); dcspgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dhpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wgold = (float*)libxsmm_aligned_malloc((C+K)*K*4*sizeof(float), 2097152); rgold = NULL; dwgold = (float*)libxsmm_aligned_malloc((C+K)*K*4*sizeof(float), 2097152); drgold = NULL; scratch_fwd = (float*)libxsmm_aligned_malloc((C+K)*N*sizeof(float), 2097152); scratch_bu = (float*)libxsmm_aligned_malloc(((C+K)*N*2 + K*N*t*5)*sizeof(float), 2097152); dbgold = (float*)libxsmm_aligned_malloc(K*4*sizeof(float), 2097152); dcsgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dhgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); icfogoldt = (float*)libxsmm_aligned_malloc(K*N*t*4*sizeof(float), 2097152); xt = (libxsmm_bfloat16*)libxsmm_aligned_malloc(N*C*t*sizeof(libxsmm_bfloat16), 2097152); csp = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*sizeof(libxsmm_bfloat16), 2097152); hp = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*sizeof(libxsmm_bfloat16), 2097152); w = (libxsmm_bfloat16*)libxsmm_aligned_malloc(C*K*4*sizeof(libxsmm_bfloat16), 2097152); r = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*K*4*sizeof(libxsmm_bfloat16), 2097152); b = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*4*sizeof(libxsmm_bfloat16), 2097152); cst = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 
2097152); ht = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); it = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); ft = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); ot = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); cit = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); cot = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); dxt = (libxsmm_bfloat16*)libxsmm_aligned_malloc(N*C*t*sizeof(libxsmm_bfloat16), 2097152); dcsp = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*sizeof(libxsmm_bfloat16), 2097152); dhp = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*sizeof(libxsmm_bfloat16), 2097152); dw = (libxsmm_bfloat16*)libxsmm_aligned_malloc(C*K*4*sizeof(libxsmm_bfloat16), 2097152); dr = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*K*4*sizeof(libxsmm_bfloat16), 2097152); db = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*4*sizeof(libxsmm_bfloat16), 2097152); dcs = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*sizeof(libxsmm_bfloat16), 2097152); dht = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); h_test = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dxt_test = (float*)libxsmm_aligned_malloc(C*N*t*sizeof(float), 2097152); dw_test = (float*)libxsmm_aligned_malloc(K*C*4*sizeof(float), 2097152); dr_test = (float*)libxsmm_aligned_malloc(K*K*4*sizeof(float), 2097152); db_test = (float*)libxsmm_aligned_malloc(K*4*sizeof(float), 2097152); LIBXSMM_VLA_DECL(2, float, xgold, xgoldt, N * C); LIBXSMM_VLA_DECL(2, float, hgold, hgoldt, K * N); LIBXSMM_VLA_DECL(2, float, dhgold, dhgoldt, K * N); /*LIBXSMM_VLA_DECL(2, float, h, h_test, K * N);*/ /* initialize data */ /* FWD */ for (j = 0; j < t; ++j) { LIBXSMM_MATINIT_OMP(float, 24, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), N, C, N, 1.0); } LIBXSMM_MATINIT_OMP(float, 24, 
cspgold,N, K, N, 1.0); LIBXSMM_MATINIT_OMP(float, 24, hpgold, N, K, N, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wigold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wfgold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wogold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wcgold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rigold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rfgold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rogold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rcgold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bigold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bfgold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bogold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bcgold, 1, K, 1, 1.0); zero_buf(csgoldt, K*N*t); zero_buf(cogoldt, K*N*t); zero_buf(hgoldt, K*N*t); /* BWD/UPD */ for (j = 0; j < t; ++j) { LIBXSMM_MATINIT_OMP(float, 24, &LIBXSMM_VLA_ACCESS(2, dhgold, j, 0, K * N), N, K, N, 1.0); } LIBXSMM_MATINIT_OMP(float, 24, dcsgold, N, K, N, 1.0); zero_buf(dxgoldt, N*C*t); zero_buf(dcspgold, K*N); zero_buf(dhpgold, K*N); zero_buf(dwgold, (C+K)*K*4); zero_buf(dbgold, K*4); /* first touch LIBXSMM */ zero_buf_bf16(xt, N*C*t); zero_buf_bf16(csp, K*N); zero_buf_bf16(hp, K*N); zero_buf_bf16(w, C*K*4); zero_buf_bf16(r, K*K*4); zero_buf_bf16(b, K*4); zero_buf_bf16(cst, K*N*t); zero_buf_bf16(ht, K*N*t); zero_buf_bf16(it, K*N*t); zero_buf_bf16(ft, K*N*t); zero_buf_bf16(ot, K*N*t); zero_buf_bf16(cit, K*N*t); zero_buf_bf16(cot, K*N*t); zero_buf_bf16(dxt, N*C*t); zero_buf_bf16(dcsp, K*N); zero_buf_bf16(dhp, K*N); zero_buf_bf16(dw, C*K*4); zero_buf_bf16(dr, K*K*4); zero_buf_bf16(db, K*4); zero_buf_bf16(dcs, K*N); zero_buf_bf16(dht, K*N*t); /* Make things bf16 */ rne_mask_fp32_bf16( wigold, wigold, C*K ); rne_mask_fp32_bf16( wcgold, wcgold, C*K ); rne_mask_fp32_bf16( wfgold, wfgold, C*K ); rne_mask_fp32_bf16( wogold, wogold, C*K ); rne_mask_fp32_bf16( rigold, rigold, K*K ); rne_mask_fp32_bf16( rcgold, rcgold, K*K ); rne_mask_fp32_bf16( rfgold, rfgold, 
K*K ); rne_mask_fp32_bf16( rogold, rogold, K*K ); rne_mask_fp32_bf16( bigold, bigold, K ); rne_mask_fp32_bf16( bcgold, bcgold, K ); rne_mask_fp32_bf16( bfgold, bfgold, K ); rne_mask_fp32_bf16( bogold, bogold, K ); rne_mask_fp32_bf16( xgoldt, xgoldt, N*C*t ); rne_mask_fp32_bf16( cspgold, cspgold, N*K ); rne_mask_fp32_bf16( hpgold, hpgold, N*K ); rne_mask_fp32_bf16( csgoldt, csgoldt, N*K*t ); rne_mask_fp32_bf16( cogoldt, cogoldt, N*K*t ); rne_mask_fp32_bf16( hgoldt, hgoldt, N*K*t ); rne_mask_fp32_bf16( icfogoldt, icfogoldt, N*K*t*4 ); rne_mask_fp32_bf16( wgold, wgold, (C+K)*K*4 ); rne_mask_fp32_bf16( dcsgold, dcsgold, K*N ); rne_mask_fp32_bf16( dhgoldt, dhgoldt, K*N*t ); rne_mask_fp32_bf16( dwgold, dwgold, 4*(C+K)*K ); rne_mask_fp32_bf16( dbgold, dbgold, 4*K ); rne_mask_fp32_bf16( dxgoldt, dxgoldt, N*C*t ); rne_mask_fp32_bf16( dcspgold, dcspgold, N*K ); rne_mask_fp32_bf16( dhpgold, dhpgold, K*N ); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... 
#\n"); printf("##########################################\n"); lstm_ref_fwd( N, C, K, t, forget_bias, wigold, wcgold, wfgold, wogold, rigold, rcgold, rfgold, rogold, bigold, bcgold, bfgold, bogold, xgoldt, cspgold, hpgold, csgoldt, cogoldt, hgoldt, icfogoldt, wgold, rgold, scratch_fwd ); /* Make things BF16 for bwd/upd pass refernce computation */ rne_mask_fp32_bf16( icfogoldt, icfogoldt, 4*K*N*t ); rne_mask_fp32_bf16( hgoldt, hgoldt, K*N*t ); rne_mask_fp32_bf16( cogoldt, cogoldt, K*N*t ); rne_mask_fp32_bf16( csgoldt, csgoldt, K*N*t ); rne_mask_fp32_bf16( cspgold, cspgold, K*N ); rne_mask_fp32_bf16( hpgold, hpgold, K*N ); rne_mask_fp32_bf16( dhgoldt, dhgoldt, t*K*N ); rne_mask_fp32_bf16( dcsgold, dcsgold, K*N ); lstm_ref_bwd_upd( N, C, K, t, xgoldt, cspgold, hpgold, csgoldt, cogoldt, hgoldt, icfogoldt, wgold, rgold, dcsgold, dhgoldt, dwgold, drgold, dbgold, dxgoldt, dcspgold, dhpgold, scratch_bu ); rne_mask_fp32_bf16( dxgoldt, dxgoldt, C*N*t ); rne_mask_fp32_bf16( dwgold, dwgold, 4*(C+K)*K ); rne_mask_fp32_bf16( dbgold, dbgold, 4*K ); rne_mask_fp32_bf16( dcspgold, dcspgold, N*K ); rne_mask_fp32_bf16( dhpgold, dhpgold, K*N ); printf("##########################################\n"); printf("# Computing Reference ... 
done #\n"); printf("##########################################\n"); } if (1 /* format == 'A' || format == 'L' */) { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); if ( N % bn != 0 ) { bn = N; } if ( C % bc != 0 ) { bc = C; } if ( K % bk != 0 ) { bk = K; } /* setup LIBXSMM handle */ lstmcell_desc.threads = nThreads; lstmcell_desc.N = N; lstmcell_desc.C = C; lstmcell_desc.K = K; lstmcell_desc.max_T = t; lstmcell_desc.bn = bn; lstmcell_desc.bk = bk; lstmcell_desc.bc = bc; lstmcell_desc.cell_type = LIBXSMM_DNN_RNNCELL_LSTM; lstmcell_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; lstmcell_desc.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; lstmcell_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NC; lstmcell_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CK; libxsmm_handle = libxsmm_dnn_create_rnncell( lstmcell_desc, &status ); CHKERR_LIBXSMM_DNN( status ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_allocate_forget_bias(libxsmm_handle, forget_bias) ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, xt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_cs_prev = libxsmm_dnn_link_tensor( libxsmm_layout, csp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, hp, &status ); 
CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight = libxsmm_dnn_link_tensor( libxsmm_layout, w, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, r, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias = libxsmm_dnn_link_tensor( libxsmm_layout, b, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_cs = libxsmm_dnn_link_tensor( libxsmm_layout, cst, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, ht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_i = libxsmm_dnn_link_tensor( libxsmm_layout, it, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = 
libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_f = libxsmm_dnn_link_tensor( libxsmm_layout, ft, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_o = libxsmm_dnn_link_tensor( libxsmm_layout, ot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_ci = libxsmm_dnn_link_tensor( libxsmm_layout, cit, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CO, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_co = libxsmm_dnn_link_tensor( libxsmm_layout, cot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = libxsmm_dnn_link_tensor( libxsmm_layout, dxt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dcs_prev = libxsmm_dnn_link_tensor( libxsmm_layout, dcsp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); 
libxsmm_dhidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, dhp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dweight = libxsmm_dnn_link_tensor( libxsmm_layout, dw, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_drecur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, dr, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dbias = libxsmm_dnn_link_tensor( libxsmm_layout, db, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dcs = libxsmm_dnn_link_tensor( libxsmm_layout, dcs, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, dht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM bf16 format */ matrix_copy_f32_bf16(N*C*t, xgoldt, xt); matrix_copy_f32_bf16(K*N, cspgold, csp); matrix_copy_f32_bf16(K*N, hpgold, hp); convert_ck_f32_to_c4k_bf16(C, K, wigold, w); convert_ck_f32_to_c4k_bf16(C, K, 
wcgold, &(w[K])); convert_ck_f32_to_c4k_bf16(C, K, wfgold, &(w[2*K])); convert_ck_f32_to_c4k_bf16(C, K, wogold, &(w[3*K])); convert_ck_f32_to_c4k_bf16(K, K, rigold, r); convert_ck_f32_to_c4k_bf16(K, K, rcgold, &(r[K])); convert_ck_f32_to_c4k_bf16(K, K, rfgold, &(r[2*K])); convert_ck_f32_to_c4k_bf16(K, K, rogold, &(r[3*K])); matrix_copy_f32_bf16(K, bigold, &(b[0])); matrix_copy_f32_bf16(K, bcgold, &(b[K])); matrix_copy_f32_bf16(K, bfgold, &(b[2*K])); matrix_copy_f32_bf16(K, bogold, &(b[3*K])); matrix_copy_f32_bf16(K*N*t, dhgoldt, dht); matrix_copy_f32_bf16(K*N, dcsgold, dcs); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_cs_prev, LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state_prev, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_cs, LIBXSMM_DNN_RNN_REGULAR_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_i, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_f, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_o, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_ci, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_co, LIBXSMM_DNN_RNN_INTERNAL_CO ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dcs_prev, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state_prev, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dweight, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_drecur_weight, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dbias, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dcs, LIBXSMM_DNN_RNN_GRADIENT_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); /* let's allocate and bind scratch */ if (pass == 0) { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch ) ); } else { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); } zero_buf( (float*)scratch, scratch_size/4 ); /* let's allocate and bind 
internalstate */ if (pass == 0) { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = (0 != internalstate_size ? libxsmm_aligned_malloc( internalstate_size, 2097152 ) : NULL); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, internalstate ) ); } else { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = (0 != internalstate_size ? libxsmm_aligned_malloc( internalstate_size, 2097152 ) : NULL); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, internalstate ) ); } zero_buf( (float*)internalstate, internalstate_size/4 ); if ((pass == 0) && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(t*K*N, ht, h_test); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, t*K*N, 1, &LIBXSMM_VLA_ACCESS(2, hgold, 0, 0, K * N), h_test, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, 
&norms_fwd); } else { /* We need to always run FWD pass once to populate i, f, o, ci, co, cs, h */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } } if ( (pass == 1) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(N*C*t, dxt, dxt_test); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, dxgoldt, dxt_test, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ( (pass == 2) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* Upconvert libxsmm bf16 buffer to fp32 for 
correctness check */ matrix_copy_bf16_f32(4*C*K, dw, dw_test); /* compare */ libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*4, 1, dwgold, dw_test, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(4*K*K, dr, dr_test); libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*4, 1, &(dwgold[C*K*4]), dr_test, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_r.l1_ref); printf("L1 test : %.25g\n", norms_upd_r.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_r.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_r.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_r.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_r.linf_rel); printf("Check-norm : %.24f\n", norms_upd_r.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_r); /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(4*K, db, db_test); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K*4, 1, dbgold, db_test, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( (pass == 3) && LIBXSMM_NEQ(0, check) ) { 
printf("##########################################\n"); printf("# Correctness - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(N*C*t, dxt, dxt_test); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, dxgoldt, dxt_test, 0, 0); printf("Delta input\n"); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(4*C*K, dw, dw_test); libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*4, 1, dwgold, dw_test, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(4*K*K, dr, dr_test); libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*4, 1, &(dwgold[C*K*4]), dr_test, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference 
: %.25g\n", norms_upd_r.l1_ref); printf("L1 test : %.25g\n", norms_upd_r.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_r.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_r.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_r.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_r.linf_rel); printf("Check-norm : %.24f\n", norms_upd_r.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_r); /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(4*K, db, db_test); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K*4, 1, dbgold, db_test, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( pass == 0 ) { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (((2.0 * K * N * C) + (2.0 * K * N * K) + (2.0 * K * N) + (tflops * K * N)) * 4.0 + (4.0 * K * N) + (tflops * K * N)) * (double)t * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); 
printf("##########################################\n"); printf("# Performance - FWD (BLAS) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ #if defined(PROFILE) Gbl_blas_total = 0.0; Gbl_eltwise_total = 0.0; Gbl_conv_total = 0.0; #endif l_start = libxsmm_timer_tick(); for (j = 0; j < iters; ++j) { lstm_ref_fwd( N, C, K, t, forget_bias, wigold, wcgold, wfgold, wogold, rigold, rcgold, rfgold, rogold, bigold, bcgold, bfgold, bogold, xgoldt, cspgold, hpgold, csgoldt, cogoldt, hgoldt, icfogoldt, wgold, rgold, scratch_fwd ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); #if defined(PROFILE) flops = (((2.0 * K * N * C) + (2.0 * K * N * K)) * 4.0) * (double)t * (double)iters; printf("BLAS GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("BLAS time = %.5g\n", ((double)(Gbl_blas_total/iters))); printf("BLAS GFLOPS = %.5g\n", (flops*1e-9)/Gbl_blas_total); flops = ((tflops * K * N) * 4.0 + (4.0 * K * N) + (tflops * K * N)) * (double)t * (double)iters; printf("ELTWISE GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("ELTWISE time = %.5g\n", ((double)(Gbl_eltwise_total/iters))); printf("ELTWISE GFLOPS = %.5g\n", (flops*1e-9)/Gbl_eltwise_total); flops = ((C * K + K * K) * 4.0) * (double)iters; printf("CONVERSION GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("CONVERSION time = %.5g\n", ((double)(Gbl_conv_total/iters))); printf("CONVERSION GFLOPS = %.5g\n", (flops*1e-9)/Gbl_conv_total); #endif printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 1 ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM 
for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ tempflops = (4.0 * K * C); /* W^T */ tempflops += (8.0 * K * N * C); /* W^T * dJd{c, i, f, o} */ tempflops += (3.0 * K * C); /* summation */ flops += tempflops; tempflops = (4.0 * K * K); /* R^T */ tempflops += (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 2 ) { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * 
K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ tempflops = (4.0 * K * K); /* R^T */ tempflops += (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ tempflops = C * N; /* x^T */ tempflops += (8.0 * K * N * C); /* delta{c, i, f, o} * x^T */ tempflops *= t; /* for t time steps */ tempflops += (4.0 * K * C * (t-1)); /* for summation of dJdW{c, i, f, o} */ flops += tempflops; tempflops = 4.0 * K * N; /* delta^T */ tempflops += (8.0 * K * N * K); /* delta{c, i, f, o} * delta^T */ tempflops *= (t - 1); /* for (t - 1) time steps */ tempflops += (4.0 * K * N * (t-2)); /* for summation of dJdR{c, i, f, o} */ flops += tempflops; flops += (4.0 * K * N * (t - 1)); /* delbias */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 3 ) { printf("##########################################\n"); printf("# Performance - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ 
tempflops = (4.0 * K * C); /* W^T */ tempflops += (8.0 * K * N * C); /* W^T * dJd{c, i, f, o} */ tempflops += (3.0 * K * C); /* summation */ flops += tempflops; tempflops = (4.0 * K * K); /* R^T */ tempflops += (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ tempflops = C * N; /* x^T */ tempflops += (8.0 * K * N * C); /* delta{c, i, f, o} * x^T */ tempflops *= t; /* for t time steps */ tempflops += (4.0 * K * C * (t-1)); /* for summation of dJdW{c, i, f, o} */ flops += tempflops; tempflops = 4.0 * K * N; /* delta^T */ tempflops += (8.0 * K * N * K); /* delta{c, i, f, o} * delta^T */ tempflops *= (t - 1); /* for (t - 1) time steps */ tempflops += (4.0 * K * N * (t-2)); /* for summation of dJdR{c, i, f, o} */ flops += tempflops; flops += (4.0 * K * N * (t - 1)); /* delbias */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("##########################################\n"); printf("# Performance - BWD+UPD (BLAS) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ #if defined(PROFILE) Gbl_blas_total = 0.0; Gbl_eltwise_total = 0.0; #endif l_start = libxsmm_timer_tick(); for (j = 0; j < iters; ++j) { lstm_ref_bwd_upd( N, C, K, t, xgoldt, cspgold, hpgold, csgoldt, cogoldt, hgoldt, icfogoldt, wgold, rgold, dcsgold, dhgoldt, dwgold, drgold, dbgold, dxgoldt, dcspgold, dhpgold, scratch_bu ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); #if defined(PROFILE) flops = (((4.0 * K * N * C) + (4.0 * K * N * K)) * 4.0) * (double)t * (double)iters; printf("BLAS GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("BLAS time = %.5g\n", 
((double)(Gbl_blas_total/iters))); printf("BLAS GFLOPS = %.5g\n", (flops*1e-9)/Gbl_blas_total); flops = (19.0 * K * N) * (double)t * (double)iters; printf("ELTWISE GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("ELTWISE time = %.5g\n", ((double)(Gbl_eltwise_total/iters))); printf("ELTWISE GFLOPS = %.5g\n", (flops*1e-9)/Gbl_eltwise_total); #endif printf("PERFDUMP,BP+WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } /* clean-up */ if (pass == 0) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); } else { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); } libxsmm_free(scratch); libxsmm_free(internalstate); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, 
LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CO ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_cs_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_recur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_cs ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_i ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_destroy_tensor( libxsmm_f ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_o ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_ci ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_co ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dcs_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dweight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_drecur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dbias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dcs ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_rnncell( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(xgoldt); libxsmm_free(cspgold); libxsmm_free(hpgold); libxsmm_free(wigold); libxsmm_free(wfgold); libxsmm_free(wogold); libxsmm_free(wcgold); libxsmm_free(rigold); libxsmm_free(rfgold); libxsmm_free(rogold); libxsmm_free(rcgold); libxsmm_free(bigold); libxsmm_free(bfgold); libxsmm_free(bogold); libxsmm_free(bcgold); libxsmm_free(csgoldt); libxsmm_free(cogoldt); libxsmm_free(hgoldt); libxsmm_free(wgold); libxsmm_free(icfogoldt); libxsmm_free(dxgoldt); libxsmm_free(dcspgold); libxsmm_free(dhpgold); libxsmm_free(dwgold); libxsmm_free(scratch_fwd); libxsmm_free(scratch_bu); libxsmm_free(dbgold); libxsmm_free(dcsgold); libxsmm_free(dhgoldt); libxsmm_free(xt); libxsmm_free(csp); libxsmm_free(hp); libxsmm_free(w); libxsmm_free(r); libxsmm_free(b); libxsmm_free(cst); libxsmm_free(ht); libxsmm_free(dxt); libxsmm_free(dcsp); libxsmm_free(dhp); libxsmm_free(dw); libxsmm_free(dr); libxsmm_free(db); libxsmm_free(dcs); libxsmm_free(dht); libxsmm_free(it); libxsmm_free(ft); libxsmm_free(ot); libxsmm_free(cit); libxsmm_free(cot); libxsmm_free(h_test); libxsmm_free(dxt_test); 
libxsmm_free(dw_test); libxsmm_free(dr_test); libxsmm_free(db_test); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/lstmdriver/lstmdriver_nc_ck_bf16.vcxproj000066400000000000000000000551671415223013700272100ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 lstmdriver_nc_ck_bf16 {5A6F2BEB-65C8-4D93-9A5A-E2AA05466719} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/lstmdriver/lstmdriver_nc_ck_f32.c000066400000000000000000001463061415223013700255670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.) ******************************************************************************/ #include #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } #if defined(PROFILE) unsigned long long Gbl_blas_start, Gbl_blas_end, Gbl_eltwise_start, Gbl_eltwise_end, Gbl_conv_start, Gbl_conv_end, Gbl_copy_bias_start, Gbl_copy_bias_end; double Gbl_blas_total, Gbl_eltwise_total, Gbl_conv_total, Gbl_copy_bias_total; #endif int main(int argc, char* argv[]) { float *wigold, *wfgold, *wogold, *wcgold, *rigold, *rfgold, *rogold, *rcgold, *bigold, *bfgold, *bogold, *bcgold; float *xgoldt, *cspgold,*hpgold, *csgoldt, *cogoldt, *hgoldt; float *dwgold, *drgold, *dbgold; float *dxgoldt, *dcspgold, *dhpgold, *dcsgold, *dhgoldt; float *icfogoldt, *wgold, *rgold; float *scratch_fwd, 
*scratch_bu; float *xt, *csp, *hp, *w, *r, *b, *cst, *ht; float *it, *ft, *ot, *cit, *cot; float *dxt, *dcsp, *dhp, *dw, *dr, *db, *dcs, *dht; float forget_bias = 1.0f; void *scratch, *internalstate; size_t scratch_size = 0, internalstate_size = 0; int iters = 10; /* repetitions of benchmark */ int pass = 0; /* pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD */ int N = 168; /* size of mini-batch */ int C = 512; /* number of inputs */ int K = 512; /* number of outputs */ int t = 50; /* number of time steps (>= 1) */ int bn = 24; int bc = 64; int bk = 64; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 1/*enbaled by default*/ : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double flops = 0.0, tempflops = 0.0; const double tflops = 12; /* transcendental flops */ int j; libxsmm_dnn_rnncell_desc lstmcell_desc; libxsmm_dnn_rnncell* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_cs_prev; libxsmm_dnn_tensor* libxsmm_hidden_state_prev; libxsmm_dnn_tensor* libxsmm_weight; libxsmm_dnn_tensor* libxsmm_recur_weight; libxsmm_dnn_tensor* libxsmm_bias; libxsmm_dnn_tensor* libxsmm_cs; libxsmm_dnn_tensor* libxsmm_hidden_state; libxsmm_dnn_tensor* libxsmm_i; libxsmm_dnn_tensor* libxsmm_f; libxsmm_dnn_tensor* libxsmm_o; libxsmm_dnn_tensor* libxsmm_ci; libxsmm_dnn_tensor* libxsmm_co; libxsmm_dnn_tensor* libxsmm_dinput; libxsmm_dnn_tensor* libxsmm_dcs_prev; libxsmm_dnn_tensor* libxsmm_dhidden_state_prev; libxsmm_dnn_tensor* libxsmm_dweight; libxsmm_dnn_tensor* libxsmm_drecur_weight; libxsmm_dnn_tensor* libxsmm_dbias; libxsmm_dnn_tensor* libxsmm_dcs; libxsmm_dnn_tensor* libxsmm_dhidden_state; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, 
norms_bwd, norms_upd_w, norms_upd_r, norms_upd_b, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd_w); libxsmm_matdiff_clear(&norms_upd_r); libxsmm_matdiff_clear(&norms_upd_b); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: ./lstmdriver [reps] [pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD] [N] [C] [K] [time_steps > 0]\n\n"); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ j = 1; if (argc > j) iters = atoi(argv[j++]); if (argc > j) pass = atoi(argv[j++]); if (argc > j) N = atoi(argv[j++]); if (argc > j) C = atoi(argv[j++]); if (argc > j) K = atoi(argv[j++]); if (argc > j) t = atoi(argv[j++]); if (argc > j) bn = atoi(argv[j++]); if (argc > j) bc = atoi(argv[j++]); if (argc > j) bk = atoi(argv[j++]); if (t <= 0) { printf("time_steps %d should be greater than or equal to 1\n\n", t); return 0; } if (!(pass == 0 || pass == 1 || pass == 2 || pass == 3)) { printf("Unknown pass: %d, valid arguments for pass = {0(FWD), 1(BWD), 2(UPD), 3(BWD+UPD)\n\n", pass); return 0; } #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d C:%d K:%d T:%d\n", N, C, K, t); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf("SIZE Weight (MB): %10.2f MiB\n", (double)(C*K*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (MB): %10.2f MiB\n", (double)(N*C*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Hidden State: %10.2f MiB\n", (double)(K*N*sizeof(float))/(1024.0*1024.0) ); /* allocate data */ xgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); cspgold = 
(float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); hpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wigold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wfgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wogold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wcgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); rigold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rfgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rogold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rcgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); bigold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bfgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bogold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bcgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); csgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); cogoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); hgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dxgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); dcspgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dhpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wgold = (float*)libxsmm_aligned_malloc((C+K)*K*4*sizeof(float), 2097152); rgold = NULL; dwgold = (float*)libxsmm_aligned_malloc((C+K)*K*4*sizeof(float), 2097152); drgold = NULL; scratch_fwd = (float*)libxsmm_aligned_malloc((C+K)*N*sizeof(float), 2097152); scratch_bu = (float*)libxsmm_aligned_malloc(((C+K)*N*2 + K*N*t*5)*sizeof(float), 2097152); dbgold = (float*)libxsmm_aligned_malloc(K*4*sizeof(float), 2097152); dcsgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dhgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); icfogoldt = 
(float*)libxsmm_aligned_malloc(K*N*t*4*sizeof(float), 2097152); xt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); csp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); hp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); w = (float*)libxsmm_aligned_malloc(C*K*4*sizeof(float), 2097152); r = (float*)libxsmm_aligned_malloc(K*K*4*sizeof(float), 2097152); b = (float*)libxsmm_aligned_malloc(K*4*sizeof(float), 2097152); cst = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); it = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ft = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ot = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); cit = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); cot = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dxt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); dcsp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dhp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dw = (float*)libxsmm_aligned_malloc(C*K*4*sizeof(float), 2097152); dr = (float*)libxsmm_aligned_malloc(K*K*4*sizeof(float), 2097152); db = (float*)libxsmm_aligned_malloc(K*4*sizeof(float), 2097152); dcs = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); LIBXSMM_VLA_DECL(2, float, xgold, xgoldt, N * C); LIBXSMM_VLA_DECL(2, float, hgold, hgoldt, K * N); LIBXSMM_VLA_DECL(2, float, dhgold, dhgoldt, K * N); LIBXSMM_VLA_DECL(2, float, h, ht, K * N); /* initialize data */ /* FWD */ for (j = 0; j < t; ++j) { LIBXSMM_MATINIT_OMP(float, 24, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), N, C, N, 1.0); } LIBXSMM_MATINIT_OMP(float, 24, cspgold,N, K, N, 1.0); LIBXSMM_MATINIT_OMP(float, 24, hpgold, N, K, N, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wigold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 
42, wfgold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wogold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wcgold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rigold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rfgold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rogold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rcgold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bigold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bfgold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bogold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bcgold, 1, K, 1, 1.0); zero_buf(csgoldt, K*N*t); zero_buf(cogoldt, K*N*t); zero_buf(hgoldt, K*N*t); /* BWD/UPD */ for (j = 0; j < t; ++j) { LIBXSMM_MATINIT_OMP(float, 24, &LIBXSMM_VLA_ACCESS(2, dhgold, j, 0, K * N), N, K, N, 1.0); } LIBXSMM_MATINIT_OMP(float, 24, dcsgold, N, K, N, 1.0); zero_buf(dxgoldt, N*C*t); zero_buf(dcspgold, K*N); zero_buf(dhpgold, K*N); zero_buf(dwgold, (C+K)*K*4); zero_buf(dbgold, K*4); /* first touch LIBXSMM */ zero_buf(xt, N*C*t); zero_buf(csp, K*N); zero_buf(hp, K*N); zero_buf(w, C*K*4); zero_buf(r, K*K*4); zero_buf(b, K*4); zero_buf(cst, K*N*t); zero_buf(ht, K*N*t); zero_buf(it, K*N*t); zero_buf(ft, K*N*t); zero_buf(ot, K*N*t); zero_buf(cit, K*N*t); zero_buf(cot, K*N*t); zero_buf(dxt, N*C*t); zero_buf(dcsp, K*N); zero_buf(dhp, K*N); zero_buf(dw, C*K*4); zero_buf(dr, K*K*4); zero_buf(db, K*4); zero_buf(dcs, K*N); zero_buf(dht, K*N*t); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... 
#\n"); printf("##########################################\n"); lstm_ref_fwd( N, C, K, t, forget_bias, wigold, wcgold, wfgold, wogold, rigold, rcgold, rfgold, rogold, bigold, bcgold, bfgold, bogold, xgoldt, cspgold, hpgold, csgoldt, cogoldt, hgoldt, icfogoldt, wgold, rgold, scratch_fwd ); lstm_ref_bwd_upd( N, C, K, t, xgoldt, cspgold, hpgold, csgoldt, cogoldt, hgoldt, icfogoldt, wgold, rgold, dcsgold, dhgoldt, dwgold, drgold, dbgold, dxgoldt, dcspgold, dhpgold, scratch_bu ); printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } if (1 /* format == 'A' || format == 'L' */) { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); if ( N % bn != 0 ) { bn = N; } if ( C % bc != 0 ) { bc = C; } if ( K % bk != 0 ) { bk = K; } /* setup LIBXSMM handle */ lstmcell_desc.threads = nThreads; lstmcell_desc.N = N; lstmcell_desc.C = C; lstmcell_desc.K = K; lstmcell_desc.max_T = t; lstmcell_desc.bn = bn; lstmcell_desc.bk = bk; lstmcell_desc.bc = bc; lstmcell_desc.cell_type = LIBXSMM_DNN_RNNCELL_LSTM; lstmcell_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; lstmcell_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; lstmcell_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NC; lstmcell_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CK; libxsmm_handle = libxsmm_dnn_create_rnncell( lstmcell_desc, &status ); CHKERR_LIBXSMM_DNN( status ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_allocate_forget_bias(libxsmm_handle, forget_bias) ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_cs_prev = libxsmm_dnn_link_tensor( libxsmm_layout, csp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); 
libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, xt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, hp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight = libxsmm_dnn_link_tensor( libxsmm_layout, w, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, r, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias = libxsmm_dnn_link_tensor( libxsmm_layout, b, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_cs = libxsmm_dnn_link_tensor( libxsmm_layout, cst, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, 
LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, ht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_i = libxsmm_dnn_link_tensor( libxsmm_layout, it, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_f = libxsmm_dnn_link_tensor( libxsmm_layout, ft, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_o = libxsmm_dnn_link_tensor( libxsmm_layout, ot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_ci = libxsmm_dnn_link_tensor( libxsmm_layout, cit, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CO, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_co = libxsmm_dnn_link_tensor( libxsmm_layout, cot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = libxsmm_dnn_link_tensor( libxsmm_layout, dxt, &status ); 
CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dcs_prev = libxsmm_dnn_link_tensor( libxsmm_layout, dcsp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, dhp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dweight = libxsmm_dnn_link_tensor( libxsmm_layout, dw, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_drecur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, dr, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dbias = libxsmm_dnn_link_tensor( libxsmm_layout, db, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dcs = libxsmm_dnn_link_tensor( libxsmm_layout, dcs, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); 
libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, dht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ matrix_copy(N*C*t, xgoldt, xt); matrix_copy(K*N, cspgold, csp); matrix_copy(K*N, hpgold, hp); convert_ck_c4k(C, K, wigold, w); convert_ck_c4k(C, K, wcgold, &(w[K])); convert_ck_c4k(C, K, wfgold, &(w[2*K])); convert_ck_c4k(C, K, wogold, &(w[3*K])); convert_ck_c4k(K, K, rigold, r); convert_ck_c4k(K, K, rcgold, &(r[K])); convert_ck_c4k(K, K, rfgold, &(r[2*K])); convert_ck_c4k(K, K, rogold, &(r[3*K])); matrix_copy(K, bigold, &(b[0])); matrix_copy(K, bcgold, &(b[K])); matrix_copy(K, bfgold, &(b[2*K])); matrix_copy(K, bogold, &(b[3*K])); matrix_copy(K*N*t, dhgoldt, dht); matrix_copy(K*N, dcsgold, dcs); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_cs_prev, LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state_prev, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_cs, LIBXSMM_DNN_RNN_REGULAR_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state, 
LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_i, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_f, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_o, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_ci, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_co, LIBXSMM_DNN_RNN_INTERNAL_CO ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dcs_prev, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state_prev, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dweight, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_drecur_weight, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dbias, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dcs, LIBXSMM_DNN_RNN_GRADIENT_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); /* let's allocate and bind scratch */ if (pass == 0) { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch ) ); } else { 
scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); } zero_buf( (float*)scratch, scratch_size/4 ); /* let's allocate and bind internalstate */ if (pass == 0) { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = (0 != internalstate_size ? libxsmm_aligned_malloc( internalstate_size, 2097152 ) : NULL); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, internalstate ) ); } else { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = (0 != internalstate_size ? 
libxsmm_aligned_malloc( internalstate_size, 2097152 ) : NULL); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, internalstate ) ); } zero_buf( (float*)internalstate, internalstate_size/4 ); if ((pass == 0) && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, K*N, 1, &LIBXSMM_VLA_ACCESS(2, hgold, t-1, 0, K * N), &LIBXSMM_VLA_ACCESS(2, h, t-1, 0, K * N), 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } else { /* We need to always run FWD pass once to populate i, f, o, ci, co, cs, h */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } } if ( (pass == 1) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); 
#else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, dxgoldt, dxt, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ( (pass == 2) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* compare */ libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*4, 1, dwgold, dw, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*4, 1, &(dwgold[C*K*4]), dr, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_r.l1_ref); printf("L1 test : %.25g\n", norms_upd_r.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_r.l2_abs); printf("L2 rel.error : 
%.24f\n", norms_upd_r.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_r.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_r.linf_rel); printf("Check-norm : %.24f\n", norms_upd_r.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_r); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K*4, 1, dbgold, db, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( (pass == 3) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, dxgoldt, dxt, 0, 0); printf("Delta input\n"); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*4, 1, dwgold, dw, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", 
norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*4, 1, &(dwgold[C*K*4]), dr, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_r.l1_ref); printf("L1 test : %.25g\n", norms_upd_r.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_r.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_r.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_r.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_r.linf_rel); printf("Check-norm : %.24f\n", norms_upd_r.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_r); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K*4, 1, dbgold, db, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( pass == 0 ) { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = 
libxsmm_timer_duration(l_start, l_end); flops = (((2.0 * K * N * C) + (2.0 * K * N * K) + (2.0 * K * N) + (tflops * K * N)) * 4.0 + (4.0 * K * N) + (tflops * K * N)) * (double)t * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("##########################################\n"); printf("# Performance - FWD (BLAS) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ #if defined(PROFILE) Gbl_blas_total = 0.0; Gbl_eltwise_total = 0.0; Gbl_conv_total = 0.0; Gbl_copy_bias_total = 0.0; #endif l_start = libxsmm_timer_tick(); for (j = 0; j < iters; ++j) { lstm_ref_fwd( N, C, K, t, forget_bias, wigold, wcgold, wfgold, wogold, rigold, rcgold, rfgold, rogold, bigold, bcgold, bfgold, bogold, xgoldt, cspgold, hpgold, csgoldt, cogoldt, hgoldt, icfogoldt, wgold, rgold, scratch_fwd ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); #if defined(PROFILE) printf("------------------------------------------\n"); flops = (((2.0 * K * N * C) + (2.0 * K * N * K)) * 4.0) * (double)t * (double)iters; printf("BLAS GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("BLAS time = %.5g\n", ((double)(Gbl_blas_total/iters))); printf("BLAS GFLOPS = %.5g\n", (flops*1e-9)/Gbl_blas_total); flops = ((tflops * K * N) * 4.0 + (4.0 * K * N) + (tflops * K * N)) * (double)t * (double)iters; printf("ELTWISE GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("ELTWISE time = %.5g\n", ((double)(Gbl_eltwise_total/iters))); printf("ELTWISE GFLOPS = %.5g\n", (flops*1e-9)/Gbl_eltwise_total); flops = ((C * K + K * K) * 4.0) * (double)iters; printf("WEIGHT CONVERSION GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("WEIGHT CONVERSION time = %.5g\n", 
((double)(Gbl_conv_total/iters))); printf("WEIGHT CONVERSION GFLOPS = %.5g\n", (flops*1e-9)/Gbl_conv_total); flops = (N * K * 4.0) * (double)iters; printf("COPY BIAS GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("COPY BIAS time = %.5g\n", ((double)(Gbl_copy_bias_total/iters))); printf("COPY BIAS GFLOPS = %.5g\n", (flops*1e-9)/Gbl_copy_bias_total); printf("------------------------------------------\n"); #endif printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 1 ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ tempflops = (4.0 * K * C); /* W^T */ tempflops += (8.0 * K * N * C); /* W^T * dJd{c, i, f, o} */ tempflops += (3.0 * K * C); /* summation */ flops += tempflops; tempflops = (4.0 * K * K); /* R^T */ tempflops += (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, 
((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 2 ) { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ tempflops = (4.0 * K * K); /* R^T */ tempflops += (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ tempflops = C * N; /* x^T */ tempflops += (8.0 * K * N * C); /* delta{c, i, f, o} * x^T */ tempflops *= t; /* for t time steps */ tempflops += (4.0 * K * C * (t-1)); /* for summation of dJdW{c, i, f, o} */ flops += tempflops; tempflops = 4.0 * K * N; /* delta^T */ tempflops += (8.0 * K * N * K); /* delta{c, i, f, o} * delta^T */ tempflops *= (t - 1); /* for (t - 1) time steps */ tempflops += (4.0 * K * N * (t-2)); /* for summation of dJdR{c, i, f, o} */ flops += tempflops; flops += (4.0 * K * N * (t - 1)); /* delbias */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 3 ) { printf("##########################################\n"); printf("# Performance 
- BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ tempflops = (4.0 * K * C); /* W^T */ tempflops += (8.0 * K * N * C); /* W^T * dJd{c, i, f, o} */ tempflops += (3.0 * K * C); /* summation */ flops += tempflops; tempflops = (4.0 * K * K); /* R^T */ tempflops += (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ tempflops = C * N; /* x^T */ tempflops += (8.0 * K * N * C); /* delta{c, i, f, o} * x^T */ tempflops *= t; /* for t time steps */ tempflops += (4.0 * K * C * (t-1)); /* for summation of dJdW{c, i, f, o} */ flops += tempflops; tempflops = 4.0 * K * N; /* delta^T */ tempflops += (8.0 * K * N * K); /* delta{c, i, f, o} * delta^T */ tempflops *= (t - 1); /* for (t - 1) time steps */ tempflops += (4.0 * K * N * (t-2)); /* for summation of dJdR{c, i, f, o} */ flops += tempflops; flops += (4.0 * K * N * (t - 1)); /* delbias */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("##########################################\n"); printf("# Performance - BWD+UPD (BLAS) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ #if defined(PROFILE) 
Gbl_blas_total = 0.0; Gbl_eltwise_total = 0.0; #endif l_start = libxsmm_timer_tick(); for (j = 0; j < iters; ++j) { lstm_ref_bwd_upd( N, C, K, t, xgoldt, cspgold, hpgold, csgoldt, cogoldt, hgoldt, icfogoldt, wgold, rgold, dcsgold, dhgoldt, dwgold, drgold, dbgold, dxgoldt, dcspgold, dhpgold, scratch_bu ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); #if defined(PROFILE) printf("------------------------------------------\n"); flops = (((4.0 * K * N * C) + (4.0 * K * N * K)) * 4.0) * (double)t * (double)iters; printf("BLAS GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("BLAS time = %.5g\n", ((double)(Gbl_blas_total/iters))); printf("BLAS GFLOPS = %.5g\n", (flops*1e-9)/Gbl_blas_total); flops = (19.0 * K * N) * (double)t * (double)iters; printf("ELTWISE GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("ELTWISE time = %.5g\n", ((double)(Gbl_eltwise_total/iters))); printf("ELTWISE GFLOPS = %.5g\n", (flops*1e-9)/Gbl_eltwise_total); printf("------------------------------------------\n"); #endif printf("PERFDUMP,BP+WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } /* clean-up */ if (pass == 0) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); } else { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); } libxsmm_free(scratch); libxsmm_free(internalstate); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CO ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_cs_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_recur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_cs ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_i ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_f ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_o ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_ci ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_co ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dcs_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dweight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_drecur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dbias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dcs ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_rnncell( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(xgoldt); libxsmm_free(cspgold); libxsmm_free(hpgold); libxsmm_free(wigold); libxsmm_free(wfgold); libxsmm_free(wogold); libxsmm_free(wcgold); libxsmm_free(rigold); libxsmm_free(rfgold); libxsmm_free(rogold); libxsmm_free(rcgold); 
libxsmm_free(bigold); libxsmm_free(bfgold); libxsmm_free(bogold); libxsmm_free(bcgold); libxsmm_free(csgoldt); libxsmm_free(cogoldt); libxsmm_free(hgoldt); libxsmm_free(wgold); libxsmm_free(icfogoldt); libxsmm_free(dxgoldt); libxsmm_free(dcspgold); libxsmm_free(dhpgold); libxsmm_free(dwgold); libxsmm_free(dbgold); libxsmm_free(dcsgold); libxsmm_free(dhgoldt); libxsmm_free(scratch_fwd); libxsmm_free(scratch_bu); libxsmm_free(xt); libxsmm_free(csp); libxsmm_free(hp); libxsmm_free(w); libxsmm_free(r); libxsmm_free(b); libxsmm_free(cst); libxsmm_free(ht); libxsmm_free(dxt); libxsmm_free(dcsp); libxsmm_free(dhp); libxsmm_free(dw); libxsmm_free(dr); libxsmm_free(db); libxsmm_free(dcs); libxsmm_free(dht); libxsmm_free(it); libxsmm_free(ft); libxsmm_free(ot); libxsmm_free(cit); libxsmm_free(cot); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/lstmdriver/lstmdriver_nc_ck_f32.vcxproj000066400000000000000000000551651415223013700270420ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 lstmdriver_nc_ck_f32 {368B8E7B-305A-4736-A196-B236B0172BA7} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) 
libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console 
$(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/lstmdriver/lstmdriver_nc_kcck_bf16.c000066400000000000000000001640711415223013700262500ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.) 
******************************************************************************/ #if 0 #define PROFILE #endif #include #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } #if defined(PROFILE) unsigned long long Gbl_blas_start, Gbl_blas_end, Gbl_eltwise_start, Gbl_eltwise_end, Gbl_conv_start, Gbl_conv_end; double Gbl_blas_total, Gbl_eltwise_total, Gbl_conv_total; #endif int main(int argc, char* argv[]) { float *wigold, *wfgold, *wogold, *wcgold, *rigold, *rfgold, *rogold, *rcgold, *bigold, *bfgold, *bogold, *bcgold; float *xgoldt, *cspgold,*hpgold, *csgoldt, *cogoldt, *hgoldt; float *dwgold, *drgold, *dbgold; float *dxgoldt, *dcspgold, *dhpgold, *dcsgold, *dhgoldt; float *icfogoldt, *wgold, *rgold; float *scratch_fwd, *scratch_bu; libxsmm_bfloat16 *xt, *csp, *hp, *w, *wt, *r, *rt, *b, *cst, *ht, *w_tmp, *r_tmp; libxsmm_bfloat16 *it, *ft, *ot, *cit, *cot; libxsmm_bfloat16 *dxt, *dcsp, *dhp, *dw, *dr, *db, *dcs, *dht; float forget_bias = 1.0f; float *h_test, *dxt_test, *dw_test, *dr_test, *db_test; void *scratch, *internalstate; size_t scratch_size = 0, internalstate_size = 0; int iters = 10; /* repetitions of benchmark */ int pass = 0; /* pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD */ int N = 168; /* size of mini-batch */ int C = 512; /* number of inputs */ int K = 512; /* number of outputs */ int t = 50; /* number of time steps (>= 1) */ int bn = 24; int bk = 64; int bc = 64; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == 
env_check ? 1/*enable by default*/ : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double flops = 0.0, tempflops = 0.0; const double tflops = 12; /* transcendental flops */ int j; libxsmm_dnn_rnncell_desc lstmcell_desc; libxsmm_dnn_rnncell* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_cs_prev; libxsmm_dnn_tensor* libxsmm_hidden_state_prev; libxsmm_dnn_tensor* libxsmm_weight; libxsmm_dnn_tensor* libxsmm_recur_weight; libxsmm_dnn_tensor* libxsmm_weight_t; libxsmm_dnn_tensor* libxsmm_recur_weight_t; libxsmm_dnn_tensor* libxsmm_bias; libxsmm_dnn_tensor* libxsmm_cs; libxsmm_dnn_tensor* libxsmm_hidden_state; libxsmm_dnn_tensor* libxsmm_i; libxsmm_dnn_tensor* libxsmm_f; libxsmm_dnn_tensor* libxsmm_o; libxsmm_dnn_tensor* libxsmm_ci; libxsmm_dnn_tensor* libxsmm_co; libxsmm_dnn_tensor* libxsmm_dinput; libxsmm_dnn_tensor* libxsmm_dcs_prev; libxsmm_dnn_tensor* libxsmm_dhidden_state_prev; libxsmm_dnn_tensor* libxsmm_dweight; libxsmm_dnn_tensor* libxsmm_drecur_weight; libxsmm_dnn_tensor* libxsmm_dbias; libxsmm_dnn_tensor* libxsmm_dcs; libxsmm_dnn_tensor* libxsmm_dhidden_state; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd_w, norms_upd_r, norms_upd_b, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd_w); libxsmm_matdiff_clear(&norms_upd_r); libxsmm_matdiff_clear(&norms_upd_b); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: ./lstmdriver [reps] [pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD] [N] [C] [K] [time_steps > 0]\n\n"); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ j = 1; if (argc > j) iters = atoi(argv[j++]); if (argc > 
j) pass = atoi(argv[j++]); if (argc > j) N = atoi(argv[j++]); if (argc > j) C = atoi(argv[j++]); if (argc > j) K = atoi(argv[j++]); if (argc > j) t = atoi(argv[j++]); if (argc > j) bn = atoi(argv[j++]); if (argc > j) bc = atoi(argv[j++]); if (argc > j) bk = atoi(argv[j++]); if (t <= 0) { printf("time_steps %d should be greater than or equal to 1\n\n", t); return 0; } if (!(pass == 0 || pass == 1 || pass == 2 || pass == 3)) { printf("Unknown pass: %d, valid arguments for pass = {0(FWD), 1(BWD), 2(UPD), 3(BWD+UPD)\n\n", pass); return 0; } #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d C:%d K:%d T:%d\n", N, C, K, t); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf("SIZE Weight (MB): %10.2f MiB\n", (double)(C*K*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (MB): %10.2f MiB\n", (double)(N*C*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Hidden State: %10.2f MiB\n", (double)(K*N*sizeof(float))/(1024.0*1024.0) ); /* allocate data */ xgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); cspgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); hpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wigold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wfgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wogold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wcgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); rigold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rfgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rogold = 
(float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rcgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); bigold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bfgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bogold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bcgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); csgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); cogoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); hgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dxgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); dcspgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dhpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wgold = (float*)libxsmm_aligned_malloc((C+K)*K*4*sizeof(float), 2097152); rgold = NULL; dwgold = (float*)libxsmm_aligned_malloc((C+K)*K*4*sizeof(float), 2097152); drgold = NULL; scratch_fwd = (float*)libxsmm_aligned_malloc((C+K)*N*sizeof(float), 2097152); scratch_bu = (float*)libxsmm_aligned_malloc(((C+K)*N*2 + K*N*t*5)*sizeof(float), 2097152); dbgold = (float*)libxsmm_aligned_malloc(K*4*sizeof(float), 2097152); dcsgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dhgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); icfogoldt = (float*)libxsmm_aligned_malloc(K*N*t*4*sizeof(float), 2097152); xt = (libxsmm_bfloat16*)libxsmm_aligned_malloc(N*C*t*sizeof(libxsmm_bfloat16), 2097152); csp = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*sizeof(libxsmm_bfloat16), 2097152); hp = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*sizeof(libxsmm_bfloat16), 2097152); w = (libxsmm_bfloat16*)libxsmm_aligned_malloc(C*K*4*sizeof(libxsmm_bfloat16), 2097152); r = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*K*4*sizeof(libxsmm_bfloat16), 2097152); wt = (libxsmm_bfloat16*)libxsmm_aligned_malloc(C*K*4*sizeof(libxsmm_bfloat16), 2097152); rt = 
(libxsmm_bfloat16*)libxsmm_aligned_malloc(K*K*4*sizeof(libxsmm_bfloat16), 2097152); w_tmp = (libxsmm_bfloat16*)libxsmm_aligned_malloc(C*K*4*sizeof(libxsmm_bfloat16), 2097152); r_tmp = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*K*4*sizeof(libxsmm_bfloat16), 2097152); b = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*4*sizeof(libxsmm_bfloat16), 2097152); cst = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); ht = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); it = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); ft = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); ot = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); cit = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); cot = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); dxt = (libxsmm_bfloat16*)libxsmm_aligned_malloc(N*C*t*sizeof(libxsmm_bfloat16), 2097152); dcsp = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*sizeof(libxsmm_bfloat16), 2097152); dhp = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*sizeof(libxsmm_bfloat16), 2097152); dw = (libxsmm_bfloat16*)libxsmm_aligned_malloc(C*K*4*sizeof(libxsmm_bfloat16), 2097152); dr = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*K*4*sizeof(libxsmm_bfloat16), 2097152); db = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*4*sizeof(libxsmm_bfloat16), 2097152); dcs = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*sizeof(libxsmm_bfloat16), 2097152); dht = (libxsmm_bfloat16*)libxsmm_aligned_malloc(K*N*t*sizeof(libxsmm_bfloat16), 2097152); h_test = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dxt_test = (float*)libxsmm_aligned_malloc(C*N*t*sizeof(float), 2097152); dw_test = (float*)libxsmm_aligned_malloc(K*C*4*sizeof(float), 2097152); dr_test = (float*)libxsmm_aligned_malloc(K*K*4*sizeof(float), 2097152); db_test = 
(float*)libxsmm_aligned_malloc(K*4*sizeof(float), 2097152); LIBXSMM_VLA_DECL(2, float, xgold, xgoldt, N * C); LIBXSMM_VLA_DECL(2, float, hgold, hgoldt, K * N); LIBXSMM_VLA_DECL(2, float, dhgold, dhgoldt, K * N); /*LIBXSMM_VLA_DECL(2, float, h, h_test, K * N);*/ /* initialize data */ /* FWD */ for (j = 0; j < t; ++j) { LIBXSMM_MATINIT_OMP(float, 24, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), N, C, N, 1.0); } LIBXSMM_MATINIT_OMP(float, 24, cspgold,N, K, N, 1.0); LIBXSMM_MATINIT_OMP(float, 24, hpgold, N, K, N, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wigold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wfgold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wogold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wcgold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rigold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rfgold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rogold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rcgold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bigold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bfgold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bogold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bcgold, 1, K, 1, 1.0); zero_buf(csgoldt, K*N*t); zero_buf(cogoldt, K*N*t); zero_buf(hgoldt, K*N*t); /* BWD/UPD */ for (j = 0; j < t; ++j) { LIBXSMM_MATINIT_OMP(float, 24, &LIBXSMM_VLA_ACCESS(2, dhgold, j, 0, K * N), N, K, N, 1.0); } LIBXSMM_MATINIT_OMP(float, 24, dcsgold, N, K, N, 1.0); zero_buf(dxgoldt, N*C*t); zero_buf(dcspgold, K*N); zero_buf(dhpgold, K*N); zero_buf(dwgold, (C+K)*K*4); zero_buf(dbgold, K*4); /* first touch LIBXSMM */ zero_buf_bf16(xt, N*C*t); zero_buf_bf16(csp, K*N); zero_buf_bf16(hp, K*N); zero_buf_bf16(w, C*K*4); zero_buf_bf16(r, K*K*4); zero_buf_bf16(b, K*4); zero_buf_bf16(cst, K*N*t); zero_buf_bf16(ht, K*N*t); zero_buf_bf16(it, K*N*t); zero_buf_bf16(ft, K*N*t); zero_buf_bf16(ot, K*N*t); zero_buf_bf16(cit, K*N*t); zero_buf_bf16(cot, K*N*t); zero_buf_bf16(dxt, N*C*t); zero_buf_bf16(dcsp, K*N); zero_buf_bf16(dhp, K*N); 
zero_buf_bf16(dw, C*K*4); zero_buf_bf16(dr, K*K*4); zero_buf_bf16(db, K*4); zero_buf_bf16(dcs, K*N); zero_buf_bf16(dht, K*N*t); /* Make things bf16 */ rne_mask_fp32_bf16( wigold, wigold, C*K ); rne_mask_fp32_bf16( wcgold, wcgold, C*K ); rne_mask_fp32_bf16( wfgold, wfgold, C*K ); rne_mask_fp32_bf16( wogold, wogold, C*K ); rne_mask_fp32_bf16( rigold, rigold, K*K ); rne_mask_fp32_bf16( rcgold, rcgold, K*K ); rne_mask_fp32_bf16( rfgold, rfgold, K*K ); rne_mask_fp32_bf16( rogold, rogold, K*K ); rne_mask_fp32_bf16( bigold, bigold, K ); rne_mask_fp32_bf16( bcgold, bcgold, K ); rne_mask_fp32_bf16( bfgold, bfgold, K ); rne_mask_fp32_bf16( bogold, bogold, K ); rne_mask_fp32_bf16( xgoldt, xgoldt, N*C*t ); rne_mask_fp32_bf16( cspgold, cspgold, N*K ); rne_mask_fp32_bf16( hpgold, hpgold, N*K ); rne_mask_fp32_bf16( csgoldt, csgoldt, N*K*t ); rne_mask_fp32_bf16( cogoldt, cogoldt, N*K*t ); rne_mask_fp32_bf16( hgoldt, hgoldt, N*K*t ); rne_mask_fp32_bf16( icfogoldt, icfogoldt, N*K*t*4 ); rne_mask_fp32_bf16( wgold, wgold, (C+K)*K*4 ); rne_mask_fp32_bf16( dcsgold, dcsgold, K*N ); rne_mask_fp32_bf16( dhgoldt, dhgoldt, K*N*t ); rne_mask_fp32_bf16( dwgold, dwgold, 4*(C+K)*K ); rne_mask_fp32_bf16( dbgold, dbgold, 4*K ); rne_mask_fp32_bf16( dxgoldt, dxgoldt, N*C*t ); rne_mask_fp32_bf16( dcspgold, dcspgold, N*K ); rne_mask_fp32_bf16( dhpgold, dhpgold, K*N ); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... 
#\n"); printf("##########################################\n"); lstm_ref_fwd( N, C, K, t, forget_bias, wigold, wcgold, wfgold, wogold, rigold, rcgold, rfgold, rogold, bigold, bcgold, bfgold, bogold, xgoldt, cspgold, hpgold, csgoldt, cogoldt, hgoldt, icfogoldt, wgold, rgold, scratch_fwd ); /* Make things BF16 for bwd/upd pass refernce computation */ rne_mask_fp32_bf16( icfogoldt, icfogoldt, 4*K*N*t ); rne_mask_fp32_bf16( hgoldt, hgoldt, K*N*t ); rne_mask_fp32_bf16( cogoldt, cogoldt, K*N*t ); rne_mask_fp32_bf16( csgoldt, csgoldt, K*N*t ); rne_mask_fp32_bf16( cspgold, cspgold, K*N ); rne_mask_fp32_bf16( hpgold, hpgold, K*N ); rne_mask_fp32_bf16( dhgoldt, dhgoldt, t*K*N ); rne_mask_fp32_bf16( dcsgold, dcsgold, K*N ); lstm_ref_bwd_upd( N, C, K, t, xgoldt, cspgold, hpgold, csgoldt, cogoldt, hgoldt, icfogoldt, wgold, rgold, dcsgold, dhgoldt, dwgold, drgold, dbgold, dxgoldt, dcspgold, dhpgold, scratch_bu ); rne_mask_fp32_bf16( dxgoldt, dxgoldt, C*N*t ); rne_mask_fp32_bf16( dwgold, dwgold, 4*(C+K)*K ); rne_mask_fp32_bf16( dbgold, dbgold, 4*K ); rne_mask_fp32_bf16( dcspgold, dcspgold, N*K ); rne_mask_fp32_bf16( dhpgold, dhpgold, K*N ); printf("##########################################\n"); printf("# Computing Reference ... 
done #\n"); printf("##########################################\n"); } if (1 /* format == 'A' || format == 'L' */) { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); if ( N % bn != 0 ) { bn = N; } if ( C % bc != 0 ) { bc = C; } if ( K % bk != 0 ) { bk = K; } /* setup LIBXSMM handle */ lstmcell_desc.threads = nThreads; lstmcell_desc.N = N; lstmcell_desc.C = C; lstmcell_desc.K = K; lstmcell_desc.max_T = t; lstmcell_desc.bn = bn; lstmcell_desc.bk = bk; lstmcell_desc.bc = bc; lstmcell_desc.cell_type = LIBXSMM_DNN_RNNCELL_LSTM; lstmcell_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; lstmcell_desc.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; lstmcell_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NC; lstmcell_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED; libxsmm_handle = libxsmm_dnn_create_rnncell( lstmcell_desc, &status ); CHKERR_LIBXSMM_DNN( status ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_allocate_forget_bias(libxsmm_handle, forget_bias) ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, xt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_cs_prev = libxsmm_dnn_link_tensor( libxsmm_layout, csp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, hp, &status ); 
CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight = libxsmm_dnn_link_tensor( libxsmm_layout, w, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight_t = libxsmm_dnn_link_tensor( libxsmm_layout, wt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, r, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight_t = libxsmm_dnn_link_tensor( libxsmm_layout, rt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias = libxsmm_dnn_link_tensor( libxsmm_layout, b, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_cs = libxsmm_dnn_link_tensor( libxsmm_layout, cst, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout 
= libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, ht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_i = libxsmm_dnn_link_tensor( libxsmm_layout, it, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_f = libxsmm_dnn_link_tensor( libxsmm_layout, ft, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_o = libxsmm_dnn_link_tensor( libxsmm_layout, ot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_ci = libxsmm_dnn_link_tensor( libxsmm_layout, cit, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CO, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_co = libxsmm_dnn_link_tensor( libxsmm_layout, cot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = 
libxsmm_dnn_link_tensor( libxsmm_layout, dxt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dcs_prev = libxsmm_dnn_link_tensor( libxsmm_layout, dcsp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, dhp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dweight = libxsmm_dnn_link_tensor( libxsmm_layout, dw, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_drecur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, dr, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dbias = libxsmm_dnn_link_tensor( libxsmm_layout, db, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dcs = libxsmm_dnn_link_tensor( libxsmm_layout, dcs, &status ); CHKERR_LIBXSMM_DNN( status ); 
libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, dht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM bf16 format */ matrix_copy_f32_bf16(N*C*t, xgoldt, xt); matrix_copy_f32_bf16(K*N, cspgold, csp); matrix_copy_f32_bf16(K*N, hpgold, hp); convert_ck_f32_to_c4k_bf16(C, K, wigold, w_tmp); convert_ck_f32_to_c4k_bf16(C, K, wcgold, &(w_tmp[K])); convert_ck_f32_to_c4k_bf16(C, K, wfgold, &(w_tmp[2*K])); convert_ck_f32_to_c4k_bf16(C, K, wogold, &(w_tmp[3*K])); convert_ck_f32_to_c4k_bf16(K, K, rigold, r_tmp); convert_ck_f32_to_c4k_bf16(K, K, rcgold, &(r_tmp[K])); convert_ck_f32_to_c4k_bf16(K, K, rfgold, &(r_tmp[2*K])); convert_ck_f32_to_c4k_bf16(K, K, rogold, &(r_tmp[3*K])); matrix_copy_CK_to_KCCK_bf16(w_tmp, w, C, 4*K, bc, bk); matrix_copy_CK_to_KCCK_bf16(r_tmp, r, K, 4*K, bk, bk); matrix_copy_KCCK_to_CKKC_bf16(&w[0], &wt[0], C, K, bc, bk); matrix_copy_KCCK_to_CKKC_bf16(&w[C*K], &wt[C*K], C, K, bc, bk); matrix_copy_KCCK_to_CKKC_bf16(&w[2*C*K], &wt[2*C*K], C, K, bc, bk); matrix_copy_KCCK_to_CKKC_bf16(&w[3*C*K], &wt[3*C*K], C, K, bc, bk); matrix_copy_KCCK_to_CKKC_bf16(&r[0], &rt[0], K, K, bk, bk); matrix_copy_KCCK_to_CKKC_bf16(&r[K*K], &rt[K*K], K, K, bk, bk); matrix_copy_KCCK_to_CKKC_bf16(&r[2*K*K], &rt[2*K*K], K, K, bk, bk); matrix_copy_KCCK_to_CKKC_bf16(&r[3*K*K], &rt[3*K*K], K, K, bk, bk); matrix_copy_f32_bf16(K, bigold, &(b[0])); matrix_copy_f32_bf16(K, bcgold, &(b[K])); matrix_copy_f32_bf16(K, bfgold, &(b[2*K])); matrix_copy_f32_bf16(K, bogold, &(b[3*K])); matrix_copy_f32_bf16(K*N*t, dhgoldt, dht); matrix_copy_f32_bf16(K*N, dcsgold, dcs); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_input, 
LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_cs_prev, LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state_prev, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight_t, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight_t, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_cs, LIBXSMM_DNN_RNN_REGULAR_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_i, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_f, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_o, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_ci, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_co, LIBXSMM_DNN_RNN_INTERNAL_CO ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dcs_prev, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) ); 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state_prev, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dweight, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_drecur_weight, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dbias, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dcs, LIBXSMM_DNN_RNN_GRADIENT_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); /* let's allocate and bind scratch */ if (pass == 0) { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch ) ); } else { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); } zero_buf( (float*)scratch, scratch_size/4 ); /* let's allocate and bind internalstate */ if (pass == 0) { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = (0 != internalstate_size ? 
libxsmm_aligned_malloc( internalstate_size, 2097152 ) : NULL); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, internalstate ) ); } else { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = (0 != internalstate_size ? libxsmm_aligned_malloc( internalstate_size, 2097152 ) : NULL); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, internalstate ) ); } zero_buf( (float*)internalstate, internalstate_size/4 ); if ((pass == 0) && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(t*K*N, ht, h_test); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, t*K*N, 1, &LIBXSMM_VLA_ACCESS(2, hgold, 0, 0, K * N), h_test, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } else { /* We need to always run FWD pass once to populate i, f, o, ci, co, cs, h */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; 
#endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } } if ( (pass == 1) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(N*C*t, dxt, dxt_test); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, dxgoldt, dxt_test, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ( (pass == 2) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* Copy out dw matrices from KCCK bf16 format to CK bf16 format */ matrix_copy_KCCK_to_CK_bf16(dw, w_tmp, C, 4*K, bc, bk); /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(4*C*K, w_tmp, dw_test); /* compare */ libxsmm_matdiff(&norms_upd_w, 
LIBXSMM_DATATYPE_F32, C*K*4, 1, dwgold, dw_test, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); /* Copy out dr matrices from KCCK bf16 format to CK bf16 format */ matrix_copy_KCCK_to_CK_bf16(dr, r_tmp, K, 4*K, bk, bk); /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(4*K*K, r_tmp, dr_test); libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*4, 1, &(dwgold[C*K*4]), dr_test, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_r.l1_ref); printf("L1 test : %.25g\n", norms_upd_r.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_r.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_r.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_r.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_r.linf_rel); printf("Check-norm : %.24f\n", norms_upd_r.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_r); /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(4*K, db, db_test); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K*4, 1, dbgold, db_test, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( (pass == 3) && LIBXSMM_NEQ(0, check) ) { 
printf("##########################################\n"); printf("# Correctness - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(N*C*t, dxt, dxt_test); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, dxgoldt, dxt_test, 0, 0); printf("Delta input\n"); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); /* Copy out dw matrices from KCCK bf16 format to CK bf16 format */ matrix_copy_KCCK_to_CK_bf16(dw, w_tmp, C, 4*K, bc, bk); /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(4*C*K, w_tmp, dw_test); libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*4, 1, dwgold, dw_test, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); /* Copy out dr matrices from KCCK bf16 format to CK bf16 format */ matrix_copy_KCCK_to_CK_bf16(dr, r_tmp, K, 4*K, bk, bk); /* Upconvert 
libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(4*K*K, r_tmp, dr_test); libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*4, 1, &(dwgold[C*K*4]), dr_test, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_r.l1_ref); printf("L1 test : %.25g\n", norms_upd_r.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_r.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_r.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_r.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_r.linf_rel); printf("Check-norm : %.24f\n", norms_upd_r.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_r); /* Upconvert libxsmm bf16 buffer to fp32 for correctness check */ matrix_copy_bf16_f32(4*K, db, db_test); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K*4, 1, dbgold, db_test, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( pass == 0 ) { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (((2.0 * K * N * C) + (2.0 * K * N * K) + (2.0 * K * N) + (tflops * K * N)) * 4.0 + 
(4.0 * K * N) + (tflops * K * N)) * (double)t * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("##########################################\n"); printf("# Performance - FWD (BLAS) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ #if defined(PROFILE) Gbl_blas_total = 0.0; Gbl_eltwise_total = 0.0; Gbl_conv_total = 0.0; #endif l_start = libxsmm_timer_tick(); for (j = 0; j < iters; ++j) { lstm_ref_fwd( N, C, K, t, forget_bias, wigold, wcgold, wfgold, wogold, rigold, rcgold, rfgold, rogold, bigold, bcgold, bfgold, bogold, xgoldt, cspgold, hpgold, csgoldt, cogoldt, hgoldt, icfogoldt, wgold, rgold, scratch_fwd ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); #if defined(PROFILE) flops = (((2.0 * K * N * C) + (2.0 * K * N * K)) * 4.0) * (double)t * (double)iters; printf("BLAS GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("BLAS time = %.5g\n", ((double)(Gbl_blas_total/iters))); printf("BLAS GFLOPS = %.5g\n", (flops*1e-9)/Gbl_blas_total); flops = ((tflops * K * N) * 4.0 + (4.0 * K * N) + (tflops * K * N)) * (double)t * (double)iters; printf("ELTWISE GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("ELTWISE time = %.5g\n", ((double)(Gbl_eltwise_total/iters))); printf("ELTWISE GFLOPS = %.5g\n", (flops*1e-9)/Gbl_eltwise_total); flops = ((C * K + K * K) * 4.0) * (double)iters; printf("CONVERSION GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("CONVERSION time = %.5g\n", ((double)(Gbl_conv_total/iters))); printf("CONVERSION GFLOPS = %.5g\n", (flops*1e-9)/Gbl_conv_total); #endif printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), 
(flops*1e-9)/l_total); } if ( pass == 1 ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ tempflops = (4.0 * K * C); /* W^T */ tempflops += (8.0 * K * N * C); /* W^T * dJd{c, i, f, o} */ tempflops += (3.0 * K * C); /* summation */ flops += tempflops; tempflops = (4.0 * K * K); /* R^T */ tempflops += (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 2 ) { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, 
LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ tempflops = (4.0 * K * K); /* R^T */ tempflops += (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ tempflops = C * N; /* x^T */ tempflops += (8.0 * K * N * C); /* delta{c, i, f, o} * x^T */ tempflops *= t; /* for t time steps */ tempflops += (4.0 * K * C * (t-1)); /* for summation of dJdW{c, i, f, o} */ flops += tempflops; tempflops = 4.0 * K * N; /* delta^T */ tempflops += (8.0 * K * N * K); /* delta{c, i, f, o} * delta^T */ tempflops *= (t - 1); /* for (t - 1) time steps */ tempflops += (4.0 * K * N * (t-2)); /* for summation of dJdR{c, i, f, o} */ flops += tempflops; flops += (4.0 * K * N * (t - 1)); /* delbias */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 3 ) { printf("##########################################\n"); printf("# Performance - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* 
delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ tempflops = (4.0 * K * C); /* W^T */ tempflops += (8.0 * K * N * C); /* W^T * dJd{c, i, f, o} */ tempflops += (3.0 * K * C); /* summation */ flops += tempflops; tempflops = (4.0 * K * K); /* R^T */ tempflops += (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ tempflops = C * N; /* x^T */ tempflops += (8.0 * K * N * C); /* delta{c, i, f, o} * x^T */ tempflops *= t; /* for t time steps */ tempflops += (4.0 * K * C * (t-1)); /* for summation of dJdW{c, i, f, o} */ flops += tempflops; tempflops = 4.0 * K * N; /* delta^T */ tempflops += (8.0 * K * N * K); /* delta{c, i, f, o} * delta^T */ tempflops *= (t - 1); /* for (t - 1) time steps */ tempflops += (4.0 * K * N * (t-2)); /* for summation of dJdR{c, i, f, o} */ flops += tempflops; flops += (4.0 * K * N * (t - 1)); /* delbias */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("##########################################\n"); printf("# Performance - BWD+UPD (BLAS) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ #if defined(PROFILE) Gbl_blas_total = 0.0; Gbl_eltwise_total = 0.0; #endif l_start = libxsmm_timer_tick(); for (j = 0; j < iters; ++j) { lstm_ref_bwd_upd( N, C, K, t, xgoldt, cspgold, hpgold, csgoldt, cogoldt, hgoldt, icfogoldt, wgold, rgold, dcsgold, dhgoldt, dwgold, drgold, dbgold, dxgoldt, dcspgold, dhpgold, scratch_bu ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", 
(flops*1e-9)/l_total); #if defined(PROFILE) flops = (((4.0 * K * N * C) + (4.0 * K * N * K)) * 4.0) * (double)t * (double)iters; printf("BLAS GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("BLAS time = %.5g\n", ((double)(Gbl_blas_total/iters))); printf("BLAS GFLOPS = %.5g\n", (flops*1e-9)/Gbl_blas_total); flops = (19.0 * K * N) * (double)t * (double)iters; printf("ELTWISE GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("ELTWISE time = %.5g\n", ((double)(Gbl_eltwise_total/iters))); printf("ELTWISE GFLOPS = %.5g\n", (flops*1e-9)/Gbl_eltwise_total); #endif printf("PERFDUMP,BP+WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } /* clean-up */ if (pass == 0) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); } else { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); } libxsmm_free(scratch); libxsmm_free(internalstate); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, 
LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CO ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_destroy_tensor( libxsmm_cs_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_recur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_weight_t ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_recur_weight_t ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_cs ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_i ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_f ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_o ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_ci ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_co ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dcs_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dweight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_drecur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dbias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dcs ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_rnncell( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(xgoldt); libxsmm_free(cspgold); libxsmm_free(hpgold); libxsmm_free(wigold); libxsmm_free(wfgold); libxsmm_free(wogold); libxsmm_free(wcgold); libxsmm_free(rigold); libxsmm_free(rfgold); libxsmm_free(rogold); libxsmm_free(rcgold); libxsmm_free(bigold); libxsmm_free(bfgold); libxsmm_free(bogold); libxsmm_free(bcgold); libxsmm_free(csgoldt); libxsmm_free(cogoldt); libxsmm_free(hgoldt); 
libxsmm_free(wgold); libxsmm_free(icfogoldt); libxsmm_free(dxgoldt); libxsmm_free(dcspgold); libxsmm_free(dhpgold); libxsmm_free(dwgold); libxsmm_free(scratch_fwd); libxsmm_free(scratch_bu); libxsmm_free(dbgold); libxsmm_free(dcsgold); libxsmm_free(dhgoldt); libxsmm_free(xt); libxsmm_free(csp); libxsmm_free(hp); libxsmm_free(w); libxsmm_free(r); libxsmm_free(wt); libxsmm_free(rt); libxsmm_free(w_tmp); libxsmm_free(r_tmp); libxsmm_free(b); libxsmm_free(cst); libxsmm_free(ht); libxsmm_free(dxt); libxsmm_free(dcsp); libxsmm_free(dhp); libxsmm_free(dw); libxsmm_free(dr); libxsmm_free(db); libxsmm_free(dcs); libxsmm_free(dht); libxsmm_free(it); libxsmm_free(ft); libxsmm_free(ot); libxsmm_free(cit); libxsmm_free(cot); libxsmm_free(h_test); libxsmm_free(dxt_test); libxsmm_free(dw_test); libxsmm_free(dr_test); libxsmm_free(db_test); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/lstmdriver/lstmdriver_nc_kcck_bf16.vcxproj000066400000000000000000000551731415223013700275230ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 lstmdriver_nc_kcck_bf16 10.0 {B0027BD6-DCDF-4DCD-8D1B-4FB8BCA1EE9F} Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console 
$(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true 
true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/lstmdriver/lstmdriver_nc_kcck_f32.c000066400000000000000000002205561415223013700261050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ #include #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *wigold, *wfgold, *wogold, *wcgold, *xgoldt, *rigold, *rfgold, *rogold, *rcgold, *hgoldt, *bigold, *bfgold, *bogold, *bcgold, *bfgold_fb; float *cspgold, *hpgold, *djdcspgold, *djdhpgold; float *igoldt, *fgoldt, *ogoldt, *cgoldt, *dgoldt, *bimgold, *bfmgold, *bomgold, *bcmgold, *doutgoldt; float *i1gold, *i2gold, *f1gold, *f2gold, *o1gold, *o2gold, *c1gold, *c2gold, *d1gold, *d2gold, *dhgold; float *xt, *csp, *hp, *w, *wt, *w_tmp, *r, *rt, *r_tmp, *b, *cst, *ht; float *it, *ft, *ot, *cit, *cot; float *dxt, *dcsp, *dhp, *dw, *dr, *db, *dcs, *dht; float *i3gold, *f3gold, *d3gold, *d4gold, *deltagoldt; float *djdhgoldt, *djdigoldt, *djdfgoldt, *djdcgoldt, *djdogoldt, *djdxgoldt; float *djdwigold, *djdwfgold, *djdwogold, *djdwcgold, *djdrigold, *djdrfgold, *djdrogold, *djdrcgold; float *djdbigold, *djdbfgold, *djdbogold, *djdbcgold, *djdcsgold, *wgoldTp, *rgoldTp, *xgoldTp, *hgoldTp; float *htest, *djdxtestt, *djdwtest, *djdrtest, *djdbtest, *djdwgold4, *djdrgold4, *djdbgold4; float forget_bias = 1.0f; const char transa = 'N', transb = 'N'; /* no transposes */ const float alpha = 1, beta = 1, beta0 = 0; void *scratch, *internalstate; size_t scratch_size = 0, internalstate_size = 0; int iters = 10; /* repetitions of benchmark */ int pass = 3; /* pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD */ 
int N = 128; /* size of mini-batch */ int C = 512; /* number of inputs */ int K = 64; /* number of outputs */ int t = 5; /* number of time steps (> 1) */ int bk = 64; int bn = 64; int bc = 64; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 1/*enable by default*/ : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double flops = 0.0, tempflops = 0.0; const double tflops = 12; /* transcendental flops */ int j, l; libxsmm_dnn_rnncell_desc lstmcell_desc; libxsmm_dnn_rnncell* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_cs_prev; libxsmm_dnn_tensor* libxsmm_hidden_state_prev; libxsmm_dnn_tensor* libxsmm_weight; libxsmm_dnn_tensor* libxsmm_recur_weight; libxsmm_dnn_tensor* libxsmm_weight_t; libxsmm_dnn_tensor* libxsmm_recur_weight_t; libxsmm_dnn_tensor* libxsmm_bias; libxsmm_dnn_tensor* libxsmm_cs; libxsmm_dnn_tensor* libxsmm_hidden_state; libxsmm_dnn_tensor* libxsmm_i; libxsmm_dnn_tensor* libxsmm_f; libxsmm_dnn_tensor* libxsmm_o; libxsmm_dnn_tensor* libxsmm_ci; libxsmm_dnn_tensor* libxsmm_co; libxsmm_dnn_tensor* libxsmm_dinput; libxsmm_dnn_tensor* libxsmm_dcs_prev; libxsmm_dnn_tensor* libxsmm_dhidden_state_prev; libxsmm_dnn_tensor* libxsmm_dweight; libxsmm_dnn_tensor* libxsmm_drecur_weight; libxsmm_dnn_tensor* libxsmm_dbias; libxsmm_dnn_tensor* libxsmm_dcs; libxsmm_dnn_tensor* libxsmm_dhidden_state; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd_w, norms_upd_r, norms_upd_b, diff; memset(&norms_fwd, 0, sizeof(norms_fwd)); memset(&norms_bwd, 0, sizeof(norms_bwd)); memset(&norms_upd_w, 0, sizeof(norms_upd_w)); memset(&norms_upd_r, 0, sizeof(norms_upd_r)); memset(&norms_upd_b, 0, sizeof(norms_upd_b)); 
memset(&diff, 0, sizeof(diff)); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: ./lstmdriver [reps] [pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD] [N] [C] [K] [time_steps > 0]\n\n"); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ j = 1; if (argc > j) iters = atoi(argv[j++]); if (argc > j) pass = atoi(argv[j++]); if (argc > j) N = atoi(argv[j++]); if (argc > j) C = atoi(argv[j++]); if (argc > j) K = atoi(argv[j++]); if (argc > j) t = atoi(argv[j++]); if (argc > j) bn = atoi(argv[j++]); if (argc > j) bk = atoi(argv[j++]); if (argc > j) bc = atoi(argv[j++]); if (t <= 0) { printf("time_steps %d should be greater than 1\n\n", t); return 0; } if (!(pass == 0 || pass == 1 || pass == 2 || pass == 3 || pass == 4)) { printf("Unknown pass: %d, valid arguments for pass = {0(FWD), 1(BWD), 2(UPD), 3(BWD+UPD)\n\n", pass); return 0; } #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d C:%d K:%d T:%d\n", N, C, K, t); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf("SIZE Weight (MB): %10.2f MiB\n", (double)(C*K*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (MB): %10.2f MiB\n", (double)(N*C*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Hidden State: %10.2f MiB\n", (double)(K*N*sizeof(float))/(1024.0*1024.0) ); /* allocate data */ xgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); cspgold= (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); hpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wigold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wfgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 
2097152); wogold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wcgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); rigold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rfgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rogold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); rcgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); bigold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bfgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bogold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bcgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); hgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); bfgold_fb = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); bimgold= (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); bfmgold= (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); bomgold= (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); bcmgold= (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); igoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); fgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ogoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); cgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); i1gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); i2gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); i3gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); f1gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); f2gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); f3gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); o1gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); o2gold = 
(float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); c1gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); c2gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); d1gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); d2gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); d3gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); d4gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dhgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); djdhgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); deltagoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); djdcspgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); djdigoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); djdfgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); djdcgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); djdogoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); djdxgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); djdwigold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdwfgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdwogold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdwcgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdrigold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); djdrfgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); djdrogold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); djdrcgold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); djdbigold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); djdbfgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); djdbogold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); djdbcgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); 
djdcsgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); djdhpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wgoldTp = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); rgoldTp = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); xgoldTp = (float*)libxsmm_aligned_malloc(N*C*sizeof(float), 2097152); hgoldTp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); doutgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); xt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); csp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); hp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); w = (float*)libxsmm_aligned_malloc(C*K*4*sizeof(float), 2097152); r = (float*)libxsmm_aligned_malloc(K*K*4*sizeof(float), 2097152); wt = (float*)libxsmm_aligned_malloc(C*K*4*sizeof(float), 2097152); rt = (float*)libxsmm_aligned_malloc(K*K*4*sizeof(float), 2097152); w_tmp = (float*)libxsmm_aligned_malloc(C*K*4*sizeof(float), 2097152); r_tmp = (float*)libxsmm_aligned_malloc(K*K*4*sizeof(float), 2097152); b = (float*)libxsmm_aligned_malloc(K*4*sizeof(float), 2097152); cst = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); it = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ft = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); ot = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); cit = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); cot = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); dxt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); dcsp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dhp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dw = (float*)libxsmm_aligned_malloc(C*K*4*sizeof(float), 2097152); dr = (float*)libxsmm_aligned_malloc(K*K*4*sizeof(float), 2097152); db = 
(float*)libxsmm_aligned_malloc(K*4*sizeof(float), 2097152); dcs = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); dht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); htest = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); djdxtestt = (float*)libxsmm_aligned_malloc(C*N*t*sizeof(float), 2097152); djdwtest = (float*)libxsmm_aligned_malloc(C*K*4*sizeof(float), 2097152); djdrtest = (float*)libxsmm_aligned_malloc(K*K*4*sizeof(float), 2097152); djdbtest = (float*)libxsmm_aligned_malloc(K*4*sizeof(float), 2097152); djdwgold4 = (float*)libxsmm_aligned_malloc(C*K*4*sizeof(float), 2097152); djdrgold4 = (float*)libxsmm_aligned_malloc(K*K*4*sizeof(float), 2097152); djdbgold4 = (float*)libxsmm_aligned_malloc(K*4*sizeof(float), 2097152); LIBXSMM_VLA_DECL(2, float, xgold, xgoldt, N * C); LIBXSMM_VLA_DECL(2, float, igold, igoldt, K * N); LIBXSMM_VLA_DECL(2, float, fgold, fgoldt, K * N); LIBXSMM_VLA_DECL(2, float, ogold, ogoldt, K * N); LIBXSMM_VLA_DECL(2, float, cgold, cgoldt, K * N); LIBXSMM_VLA_DECL(2, float, dgold, dgoldt, K * N); LIBXSMM_VLA_DECL(2, float, hgold, hgoldt, K * N); LIBXSMM_VLA_DECL(2, float, djdhgold, djdhgoldt, K * N); LIBXSMM_VLA_DECL(2, float, deltagold, deltagoldt, K * N); LIBXSMM_VLA_DECL(2, float, doutgold, doutgoldt, K * N); LIBXSMM_VLA_DECL(2, float, djdigold, djdigoldt, K * N); LIBXSMM_VLA_DECL(2, float, djdfgold, djdfgoldt, K * N); LIBXSMM_VLA_DECL(2, float, djdogold, djdogoldt, K * N); LIBXSMM_VLA_DECL(2, float, djdcgold, djdcgoldt, K * N); LIBXSMM_VLA_DECL(2, float, djdxgold, djdxgoldt, N * C); LIBXSMM_VLA_DECL(2, float, h, ht, K * N); /* initialize data */ /* FWD */ LIBXSMM_MATINIT_OMP(float, 24, cspgold,N, K, N, 1.0); LIBXSMM_MATINIT_OMP(float, 24, hpgold, N, K, N, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wigold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wfgold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wogold, C, K, C, 1.0); LIBXSMM_MATINIT_OMP(float, 42, wcgold, C, K, C, 1.0); for (j = 0; j < t; ++j) { 
LIBXSMM_MATINIT_OMP(float, 24, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), N, C, N, 1.0); } LIBXSMM_MATINIT_OMP(float, 42, rigold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rfgold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rogold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 42, rcgold, K, K, K, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bigold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bfgold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bogold, 1, K, 1, 1.0); LIBXSMM_MATINIT_OMP(float, 24, bcgold, 1, K, 1, 1.0); for (j = 0; j < K; j++) { bfgold_fb[j] = bfgold[j] + forget_bias; } for (j = 0; j < N; j++) { matrix_copy(K, bigold, &(bimgold[j*K])); matrix_copy(K, bfgold_fb, &(bfmgold[j*K])); matrix_copy(K, bogold, &(bomgold[j*K])); matrix_copy(K, bcgold, &(bcmgold[j*K])); } for (j = 0; j < t; ++j) { zero_buf(&LIBXSMM_VLA_ACCESS(2, hgold, j, 0, K * N), K*N); zero_buf(&LIBXSMM_VLA_ACCESS(2, igold, j, 0, K * N), K*N); zero_buf(&LIBXSMM_VLA_ACCESS(2, fgold, j, 0, K * N), K*N); zero_buf(&LIBXSMM_VLA_ACCESS(2, ogold, j, 0, K * N), K*N); zero_buf(&LIBXSMM_VLA_ACCESS(2, cgold, j, 0, K * N), K*N); zero_buf(&LIBXSMM_VLA_ACCESS(2, dgold, j, 0, K * N), K*N); } zero_buf(i1gold, K*N); zero_buf(i2gold, K*N); zero_buf(f1gold, K*N); zero_buf(f2gold, K*N); zero_buf(o1gold, K*N); zero_buf(o2gold, K*N); zero_buf(c1gold, K*N); zero_buf(c2gold, K*N); zero_buf(d1gold, K*N); zero_buf(d2gold, K*N); zero_buf(dhgold, K*N); /* BWD/UPD */ for (j = 0; j < t; ++j) { LIBXSMM_MATINIT_OMP(float, 24, &LIBXSMM_VLA_ACCESS(2, djdhgold, j, 0, K * N), N, K, N, 1.0); } LIBXSMM_MATINIT_OMP(float, 24, djdcsgold, N, K, N, 1.0); zero_buf(i3gold, K*N); zero_buf(f3gold, K*N); zero_buf(d3gold, K*N); zero_buf(d4gold, K*N); zero_buf(deltagoldt, K*N*t); zero_buf(djdcspgold, K*N); zero_buf(djdigoldt, K*N*t); zero_buf(djdfgoldt, K*N*t); zero_buf(djdogoldt, K*N*t); zero_buf(djdcgoldt, K*N*t); zero_buf(djdxgoldt, N*C*t); zero_buf(djdwigold, C*K); zero_buf(djdwfgold, C*K); zero_buf(djdwogold, C*K); zero_buf(djdwcgold, 
C*K); zero_buf(djdrigold, K*K); zero_buf(djdrfgold, K*K); zero_buf(djdrogold, K*K); zero_buf(djdrcgold, K*K); zero_buf(djdbigold, K); zero_buf(djdbfgold, K); zero_buf(djdbogold, K); zero_buf(djdbcgold, K); zero_buf(djdhpgold, K*N); zero_buf(wgoldTp, C*K); zero_buf(rgoldTp, K*K); zero_buf(xgoldTp, N*C); zero_buf(hgoldTp, K*N); zero_buf(doutgoldt, K*N*t); /* first touch LIBXSMM */ zero_buf(xt, N*C*t); zero_buf(csp, K*N); zero_buf(hp, K*N); zero_buf(w, C*K*4); zero_buf(r, K*K*4); zero_buf(wt, C*K*4); zero_buf(rt, K*K*4); zero_buf(b, K*4); zero_buf(cst, K*N*t); zero_buf(ht, K*N*t); zero_buf(it, K*N*t); zero_buf(ft, K*N*t); zero_buf(ot, K*N*t); zero_buf(cit, K*N*t); zero_buf(cot, K*N*t); zero_buf(dxt, N*C*t); zero_buf(dcsp, K*N); zero_buf(dhp, K*N); zero_buf(dw, C*K*4); zero_buf(dr, K*K*4); zero_buf(db, K*4); zero_buf(dcs, K*N); zero_buf(dht, K*N*t); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... #\n"); printf("##########################################\n"); /* FWD */ for (j = 0; j < t; ++j) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &C, &alpha, wigold, &K, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), &C, &beta0, i1gold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &C, &alpha, wfgold, &K, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), &C, &beta0, f1gold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &C, &alpha, wogold, &K, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), &C, &beta0, o1gold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &C, &alpha, wcgold, &K, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), &C, &beta0, c1gold, &K); if (j == 0) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rigold, &K, hpgold, &K, &beta0, i2gold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rfgold, &K, hpgold, &K, &beta0, f2gold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rogold, &K, hpgold, &K, &beta0, 
o2gold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rcgold, &K, hpgold, &K, &beta0, c2gold, &K); } else { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rigold, &K, &LIBXSMM_VLA_ACCESS(2, hgold, j-1, 0, K * N), &K, &beta0, i2gold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rfgold, &K, &LIBXSMM_VLA_ACCESS(2, hgold, j-1, 0, K * N), &K, &beta0, f2gold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rogold, &K, &LIBXSMM_VLA_ACCESS(2, hgold, j-1, 0, K * N), &K, &beta0, o2gold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rcgold, &K, &LIBXSMM_VLA_ACCESS(2, hgold, j-1, 0, K * N), &K, &beta0, c2gold, &K); } matrix_add(K*N, i1gold, i2gold, &LIBXSMM_VLA_ACCESS(2, igold, j, 0, K * N)); matrix_add(K*N, &LIBXSMM_VLA_ACCESS(2, igold, j, 0, K * N), bimgold, &LIBXSMM_VLA_ACCESS(2, igold, j, 0, K * N)); matrix_add(K*N, f1gold, f2gold, &LIBXSMM_VLA_ACCESS(2, fgold, j, 0, K * N)); matrix_add(K*N, &LIBXSMM_VLA_ACCESS(2, fgold, j, 0, K * N), bfmgold, &LIBXSMM_VLA_ACCESS(2, fgold, j, 0, K * N)); matrix_add(K*N, o1gold, o2gold, &LIBXSMM_VLA_ACCESS(2, ogold, j, 0, K * N)); matrix_add(K*N, &LIBXSMM_VLA_ACCESS(2, ogold, j, 0, K * N), bomgold, &LIBXSMM_VLA_ACCESS(2, ogold, j, 0, K * N)); matrix_add(K*N, c1gold, c2gold, &LIBXSMM_VLA_ACCESS(2, cgold, j, 0, K * N)); matrix_add(K*N, &LIBXSMM_VLA_ACCESS(2, cgold, j, 0, K * N), bcmgold, &LIBXSMM_VLA_ACCESS(2, cgold, j, 0, K * N)); matrix_sigmoid(K*N, &LIBXSMM_VLA_ACCESS(2, igold, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, igold, j, 0, K * N)); matrix_sigmoid(K*N, &LIBXSMM_VLA_ACCESS(2, fgold, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, fgold, j, 0, K * N)); matrix_sigmoid(K*N, &LIBXSMM_VLA_ACCESS(2, ogold, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, ogold, j, 0, K * N)); matrix_tanh(K*N, &LIBXSMM_VLA_ACCESS(2, cgold, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, cgold, j, 0, K * N)); if (j == 0) { matrix_eltwise_mult(K*N, &LIBXSMM_VLA_ACCESS(2, fgold, j, 
0, K * N), cspgold, d1gold); } else { matrix_eltwise_mult(K*N, &LIBXSMM_VLA_ACCESS(2, fgold, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, dgold, j-1, 0, K * N), d1gold); } matrix_eltwise_mult(K*N, &LIBXSMM_VLA_ACCESS(2, igold, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, cgold, j, 0, K * N), d2gold); matrix_add(K*N, d1gold, d2gold, &LIBXSMM_VLA_ACCESS(2, dgold, j, 0, K * N)); matrix_tanh(K*N, &LIBXSMM_VLA_ACCESS(2, dgold, j, 0, K * N), dhgold); matrix_eltwise_mult(K*N, &LIBXSMM_VLA_ACCESS(2, ogold, j, 0, K * N), dhgold, &LIBXSMM_VLA_ACCESS(2, hgold, j, 0, K * N)); } /* BWD/UPD */ for (j = t-1; j >= 0; --j) { /* compute deltagold */ if (j == t-1) { matrix_copy(K * N, &LIBXSMM_VLA_ACCESS(2, djdhgold, t-1, 0, K * N), &LIBXSMM_VLA_ACCESS(2, deltagold, t-1, 0, K * N)); } else { matrix_add(K * N, &LIBXSMM_VLA_ACCESS(2, doutgold, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, djdhgold, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, deltagold, j, 0, K * N)); } /* compute djdcspgold */ matrix_eltwise_mult(K * N, &LIBXSMM_VLA_ACCESS(2, deltagold, j, 0, K * N), &LIBXSMM_VLA_ACCESS(2, ogold, j, 0, K * N), d1gold); matrix_tanh_inverse(K * N, &LIBXSMM_VLA_ACCESS(2, dgold, j, 0, K * N), d2gold); matrix_eltwise_mult(K * N, d1gold, d2gold, d3gold); if (j == t-1) { matrix_add(K * N, d3gold, djdcsgold, djdcspgold); } else { matrix_add(K * N, d3gold, djdcspgold, djdcspgold); } /* compute djdcgold */ matrix_eltwise_mult(K * N, djdcspgold, &LIBXSMM_VLA_ACCESS(2, igold, j, 0, K * N), c1gold); matrix_complement_square(K * N, &LIBXSMM_VLA_ACCESS(2, cgold, j, 0, K * N), c2gold); matrix_eltwise_mult(K * N, c1gold, c2gold, &LIBXSMM_VLA_ACCESS(2, djdcgold, j, 0, K * N)); /* compute djdigold */ matrix_eltwise_mult(K * N, djdcspgold, &LIBXSMM_VLA_ACCESS(2, cgold, j, 0, K * N), i1gold); matrix_complement(K * N, &LIBXSMM_VLA_ACCESS(2, igold, j, 0, K * N), i2gold); matrix_eltwise_mult(K * N, &LIBXSMM_VLA_ACCESS(2, igold, j, 0, K * N), i2gold, i3gold); matrix_eltwise_mult(K * N, i1gold, i3gold, &LIBXSMM_VLA_ACCESS(2, djdigold, j, 0, K 
* N)); /* compute djdfgold */ if (j == 0) { matrix_eltwise_mult(K * N, djdcspgold, cspgold, f1gold); } else { matrix_eltwise_mult(K * N, djdcspgold, &LIBXSMM_VLA_ACCESS(2, dgold, j-1, 0, K * N), f1gold); } matrix_complement(K * N, &LIBXSMM_VLA_ACCESS(2, fgold, j, 0, K * N), f2gold); matrix_eltwise_mult(K * N, &LIBXSMM_VLA_ACCESS(2, fgold, j, 0, K * N), f2gold, f3gold); matrix_eltwise_mult(K * N, f1gold, f3gold, &LIBXSMM_VLA_ACCESS(2, djdfgold, j, 0, K * N)); /* compute djdogold */ matrix_tanh(K * N, &LIBXSMM_VLA_ACCESS(2, dgold, j, 0, K * N), o1gold); matrix_eltwise_mult(K * N, &LIBXSMM_VLA_ACCESS(2, deltagold, j, 0, K * N), o1gold, o1gold); matrix_complement(K * N, &LIBXSMM_VLA_ACCESS(2, ogold, j, 0, K * N), o2gold); matrix_eltwise_mult(K * N, &LIBXSMM_VLA_ACCESS(2, ogold, j, 0, K * N), o2gold, o2gold); matrix_eltwise_mult(K * N, o1gold, o2gold, &LIBXSMM_VLA_ACCESS(2, djdogold, j, 0, K * N)); /* update djdcspgold */ matrix_eltwise_mult(K * N, djdcspgold, &LIBXSMM_VLA_ACCESS(2, fgold, j, 0, K * N), djdcspgold); if (j > 0) { /* compute doutgold */ matrix_transpose(K, K, rigold, rgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rgoldTp, &K, &LIBXSMM_VLA_ACCESS(2, djdigold, j, 0, K * N), &K, &beta, &LIBXSMM_VLA_ACCESS(2, doutgold, j-1, 0, K * N), &K); matrix_transpose(K, K, rfgold, rgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rgoldTp, &K, &LIBXSMM_VLA_ACCESS(2, djdfgold, j, 0, K * N), &K, &beta, &LIBXSMM_VLA_ACCESS(2, doutgold, j-1, 0, K * N), &K); matrix_transpose(K, K, rogold, rgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rgoldTp, &K, &LIBXSMM_VLA_ACCESS(2, djdogold, j, 0, K * N), &K, &beta, &LIBXSMM_VLA_ACCESS(2, doutgold, j-1, 0, K * N), &K); matrix_transpose(K, K, rcgold, rgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rgoldTp, &K, &LIBXSMM_VLA_ACCESS(2, djdcgold, j, 0, K * N), &K, &beta, &LIBXSMM_VLA_ACCESS(2, doutgold, j-1, 0, K * N), &K); } else { 
/* compute djdhpgold */ matrix_transpose(K, K, rigold, rgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rgoldTp, &K, &LIBXSMM_VLA_ACCESS(2, djdigold, 0, 0, K * N), &K, &beta, djdhpgold, &K); matrix_transpose(K, K, rfgold, rgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rgoldTp, &K, &LIBXSMM_VLA_ACCESS(2, djdfgold, 0, 0, K * N), &K, &beta, djdhpgold, &K); matrix_transpose(K, K, rogold, rgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rgoldTp, &K, &LIBXSMM_VLA_ACCESS(2, djdogold, 0, 0, K * N), &K, &beta, djdhpgold, &K); matrix_transpose(K, K, rcgold, rgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, rgoldTp, &K, &LIBXSMM_VLA_ACCESS(2, djdcgold, 0, 0, K * N), &K, &beta, djdhpgold, &K); } if (pass == 1 || pass == 3) { /* compute djdxgold */ matrix_transpose(C, K, wigold, wgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &C, &N, &K, &alpha, wgoldTp, &C, &LIBXSMM_VLA_ACCESS(2, djdigold, j, 0, K * N), &K, &beta, &LIBXSMM_VLA_ACCESS(2, djdxgold, j, 0, N * C), &C); matrix_transpose(C, K, wfgold, wgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &C, &N, &K, &alpha, wgoldTp, &C, &LIBXSMM_VLA_ACCESS(2, djdfgold, j, 0, K * N), &K, &beta, &LIBXSMM_VLA_ACCESS(2, djdxgold, j, 0, N * C), &C); matrix_transpose(C, K, wogold, wgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &C, &N, &K, &alpha, wgoldTp, &C, &LIBXSMM_VLA_ACCESS(2, djdogold, j, 0, K * N), &K, &beta, &LIBXSMM_VLA_ACCESS(2, djdxgold, j, 0, N * C), &C); matrix_transpose(C, K, wcgold, wgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &C, &N, &K, &alpha, wgoldTp, &C, &LIBXSMM_VLA_ACCESS(2, djdcgold, j, 0, K * N), &K, &beta, &LIBXSMM_VLA_ACCESS(2, djdxgold, j, 0, N * C), &C); } if (pass == 2 || pass == 3) { /* compute djdwgold */ matrix_transpose(N, C, &LIBXSMM_VLA_ACCESS(2, xgold, j, 0, N * C), xgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &C, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, 
djdigold, j, 0, K * N), &K, xgoldTp, &N, &beta, djdwigold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &C, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, djdfgold, j, 0, K * N), &K, xgoldTp, &N, &beta, djdwfgold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &C, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, djdogold, j, 0, K * N), &K, xgoldTp, &N, &beta, djdwogold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &C, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, djdcgold, j, 0, K * N), &K, xgoldTp, &N, &beta, djdwcgold, &K); /* compute djdrgold */ if (j == 0) { matrix_transpose(N, K, hpgold, hgoldTp); } else { matrix_transpose(N, K, &LIBXSMM_VLA_ACCESS(2, hgold, j-1, 0, K * N), hgoldTp); } LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &K, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, djdigold, j, 0, K * N), &K, hgoldTp, &N, &beta, djdrigold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &K, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, djdfgold, j, 0, K * N), &K, hgoldTp, &N, &beta, djdrfgold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &K, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, djdogold, j, 0, K * N), &K, hgoldTp, &N, &beta, djdrogold, &K); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &K, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, djdcgold, j, 0, K * N), &K, hgoldTp, &N, &beta, djdrcgold, &K); /* compute djdbgold */ for (l = 0; l < K*N; l++) { djdbigold[l%K] += LIBXSMM_VLA_ACCESS(2, djdigold, j, l, K * N); djdbfgold[l%K] += LIBXSMM_VLA_ACCESS(2, djdfgold, j, l, K * N); djdbogold[l%K] += LIBXSMM_VLA_ACCESS(2, djdogold, j, l, K * N); djdbcgold[l%K] += LIBXSMM_VLA_ACCESS(2, djdcgold, j, l, K * N); } } } printf("##########################################\n"); printf("# Computing Reference ... 
done #\n"); printf("##########################################\n"); } if (1 /* format == 'A' || format == 'L' */) { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); if ( N % bn != 0 ) { bn = N; } if ( C % bc != 0 ) { bc = C; } if ( K % bk != 0 ) { bk = K; } /* setup LIBXSMM handle */ lstmcell_desc.threads = nThreads; lstmcell_desc.N = N; lstmcell_desc.C = C; lstmcell_desc.K = K; lstmcell_desc.max_T = t; lstmcell_desc.bn = bn; lstmcell_desc.bk = bk; lstmcell_desc.bc = bc; lstmcell_desc.cell_type = LIBXSMM_DNN_RNNCELL_LSTM; lstmcell_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; lstmcell_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; lstmcell_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NC; lstmcell_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED; libxsmm_handle = libxsmm_dnn_create_rnncell( lstmcell_desc, &status ); CHKERR_LIBXSMM_DNN( status ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_allocate_forget_bias(libxsmm_handle, forget_bias) ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, xt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_cs_prev = libxsmm_dnn_link_tensor( libxsmm_layout, csp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, hp, &status ); 
CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight = libxsmm_dnn_link_tensor( libxsmm_layout, w, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight_t = libxsmm_dnn_link_tensor( libxsmm_layout, wt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, r, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight_t = libxsmm_dnn_link_tensor( libxsmm_layout, rt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias = libxsmm_dnn_link_tensor( libxsmm_layout, b, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_cs = libxsmm_dnn_link_tensor( libxsmm_layout, cst, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout 
= libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, ht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_i = libxsmm_dnn_link_tensor( libxsmm_layout, it, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_f = libxsmm_dnn_link_tensor( libxsmm_layout, ft, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_o = libxsmm_dnn_link_tensor( libxsmm_layout, ot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_ci = libxsmm_dnn_link_tensor( libxsmm_layout, cit, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CO, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_co = libxsmm_dnn_link_tensor( libxsmm_layout, cot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = 
libxsmm_dnn_link_tensor( libxsmm_layout, dxt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dcs_prev = libxsmm_dnn_link_tensor( libxsmm_layout, dcsp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, dhp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dweight = libxsmm_dnn_link_tensor( libxsmm_layout, dw, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_drecur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, dr, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dbias = libxsmm_dnn_link_tensor( libxsmm_layout, db, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dcs = libxsmm_dnn_link_tensor( libxsmm_layout, dcs, &status ); CHKERR_LIBXSMM_DNN( status ); 
libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, dht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ matrix_copy(N*C*t, xgoldt, xt); matrix_copy(K*N, cspgold, csp); matrix_copy(K*N, hpgold, hp); convert_ck_c4k_offset(C, K, 0, wigold, w_tmp); convert_ck_c4k_offset(C, K, 1, wcgold, w_tmp); convert_ck_c4k_offset(C, K, 2, wfgold, w_tmp); convert_ck_c4k_offset(C, K, 3, wogold, w_tmp); convert_ck_c4k_offset(K, K, 0, rigold, r_tmp); convert_ck_c4k_offset(K, K, 1, rcgold, r_tmp); convert_ck_c4k_offset(K, K, 2, rfgold, r_tmp); convert_ck_c4k_offset(K, K, 3, rogold, r_tmp); matrix_copy_CK_to_KCCK(w_tmp, w, C, 4*K, bc, bk); matrix_copy_CK_to_KCCK(r_tmp, r, K, 4*K, bk, bk); matrix_copy_CK_to_CKKC(wigold, wt, C, K, bc, bk); matrix_copy_CK_to_CKKC(wcgold, wt+(C*K) , C, K, bc, bk); matrix_copy_CK_to_CKKC(wfgold, wt+(2*C*K), C, K, bc, bk); matrix_copy_CK_to_CKKC(wogold, wt+(3*C*K), C, K, bc, bk); matrix_copy_CK_to_CKKC(rigold, rt, K, K, bk, bk); matrix_copy_CK_to_CKKC(rcgold, rt+(K*K), K, K, bk, bk); matrix_copy_CK_to_CKKC(rfgold, rt+(2*K*K), K, K, bk, bk); matrix_copy_CK_to_CKKC(rogold, rt+(3*K*K), K, K, bk, bk); matrix_copy(K, bigold, &(b[0])); matrix_copy(K, bcgold, &(b[K])); matrix_copy(K, bfgold, &(b[2*K])); matrix_copy(K, bogold, &(b[3*K])); matrix_copy(K*N*t, djdhgoldt, dht); matrix_copy(K*N, djdcsgold, dcs); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_cs_prev, LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, 
libxsmm_hidden_state_prev, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight_t, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight_t, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_cs, LIBXSMM_DNN_RNN_REGULAR_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_i, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_f, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_o, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_ci, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_co, LIBXSMM_DNN_RNN_INTERNAL_CO ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dcs_prev, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state_prev, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, 
libxsmm_dweight, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_drecur_weight, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dbias, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dcs, LIBXSMM_DNN_RNN_GRADIENT_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); /* let's allocate and bind scratch */ if (pass == 0) { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch ) ); } else { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); } zero_buf( (float*)scratch, scratch_size/4 ); /* let's allocate and bind internalstate */ if (pass == 0) { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = (0 != internalstate_size ? libxsmm_aligned_malloc( internalstate_size, 2097152 ) : NULL); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, internalstate ) ); } else { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = (0 != internalstate_size ? 
libxsmm_aligned_malloc( internalstate_size, 2097152 ) : NULL); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, internalstate ) ); } zero_buf( (float*)internalstate, internalstate_size/4 ); if ((pass == 0) && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ matrix_copy(K*N, &LIBXSMM_VLA_ACCESS(2, h, t-1, 0, K * N), htest); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, K*N, 1, &LIBXSMM_VLA_ACCESS(2, hgold, t-1, 0, K * N), htest, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } else { /* We need to always run FWD pass once to populate i, f, o, ci, co, cs, h */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } } if ( (pass == 1) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if 
defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ matrix_copy(N*C*t, dxt, djdxtestt); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, djdxgoldt, djdxtestt, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ( (pass == 2) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* copy out data */ matrix_copy_KCCK_to_CK(dw, w_tmp, C, 4*K, bc, bk); matrix_copy_KCCK_to_CK(dr, r_tmp, K, 4*K, bk, bk); convert_c4k_4ck(C, K, w_tmp, djdwtest); convert_c4k_4ck(K, K, r_tmp, djdrtest); LIBXSMM_VLA_DECL(2, float, djdb4test, djdbtest, K); matrix_copy(K, &(db[0]), &LIBXSMM_VLA_ACCESS(2, djdb4test, 0, 0, K)); matrix_copy(K, &(db[K]), &LIBXSMM_VLA_ACCESS(2, djdb4test, 1, 0, K)); matrix_copy(K, &(db[2*K]), &LIBXSMM_VLA_ACCESS(2, djdb4test, 2, 0, K)); matrix_copy(K, &(db[3*K]), &LIBXSMM_VLA_ACCESS(2, djdb4test, 3, 0, K)); LIBXSMM_VLA_DECL(2, float, djdw4, djdwgold4, C*K); LIBXSMM_VLA_DECL(2, float, djdr4, djdrgold4, K*K); LIBXSMM_VLA_DECL(2, float, djdb4, djdbgold4, K); matrix_copy(C*K, djdwigold, &LIBXSMM_VLA_ACCESS(2, 
djdw4, 0, 0, C*K)); matrix_copy(C*K, djdwcgold, &LIBXSMM_VLA_ACCESS(2, djdw4, 1, 0, C*K)); matrix_copy(C*K, djdwfgold, &LIBXSMM_VLA_ACCESS(2, djdw4, 2, 0, C*K)); matrix_copy(C*K, djdwogold, &LIBXSMM_VLA_ACCESS(2, djdw4, 3, 0, C*K)); matrix_copy(K*K, djdrigold, &LIBXSMM_VLA_ACCESS(2, djdr4, 0, 0, K*K)); matrix_copy(K*K, djdrcgold, &LIBXSMM_VLA_ACCESS(2, djdr4, 1, 0, K*K)); matrix_copy(K*K, djdrfgold, &LIBXSMM_VLA_ACCESS(2, djdr4, 2, 0, K*K)); matrix_copy(K*K, djdrogold, &LIBXSMM_VLA_ACCESS(2, djdr4, 3, 0, K*K)); matrix_copy(K, djdbigold, &LIBXSMM_VLA_ACCESS(2, djdb4, 0, 0, K)); matrix_copy(K, djdbcgold, &LIBXSMM_VLA_ACCESS(2, djdb4, 1, 0, K)); matrix_copy(K, djdbfgold, &LIBXSMM_VLA_ACCESS(2, djdb4, 2, 0, K)); matrix_copy(K, djdbogold, &LIBXSMM_VLA_ACCESS(2, djdb4, 3, 0, K)); /* compare */ libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*4, 1, djdwgold4, djdwtest, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*4, 1, djdrgold4, djdrtest, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_r.l1_ref); printf("L1 test : %.25g\n", norms_upd_r.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_r.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_r.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_r.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_r.linf_rel); printf("Check-norm : %.24f\n", norms_upd_r.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_r); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K*4, 1, djdbgold4, djdbtest, 0, 0); printf("Delta 
bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( (pass == 3) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } /* copy out data */ matrix_copy(N*C*t, dxt, djdxtestt); matrix_copy_KCCK_to_CK(dw, w_tmp, C, 4*K, bc, bk); matrix_copy_KCCK_to_CK(dr, r_tmp, K, 4*K, bk, bk); convert_c4k_4ck(C, K, w_tmp, djdwtest); convert_c4k_4ck(K, K, r_tmp, djdrtest); /*LIBXSMM_VLA_DECL(2, float, djdw4test, djdwtest, C*K);*/ /*LIBXSMM_VLA_DECL(2, float, djdr4test, djdrtest, K*K);*/ LIBXSMM_VLA_DECL(2, float, djdb4test, djdbtest, K); matrix_copy(K, &(db[0]), &LIBXSMM_VLA_ACCESS(2, djdb4test, 0, 0, K)); matrix_copy(K, &(db[K]), &LIBXSMM_VLA_ACCESS(2, djdb4test, 1, 0, K)); matrix_copy(K, &(db[2*K]), &LIBXSMM_VLA_ACCESS(2, djdb4test, 2, 0, K)); matrix_copy(K, &(db[3*K]), &LIBXSMM_VLA_ACCESS(2, djdb4test, 3, 0, K)); LIBXSMM_VLA_DECL(2, float, djdw4, djdwgold4, C*K); LIBXSMM_VLA_DECL(2, float, djdr4, djdrgold4, K*K); LIBXSMM_VLA_DECL(2, float, djdb4, djdbgold4, K); matrix_copy(C*K, djdwigold, &LIBXSMM_VLA_ACCESS(2, djdw4, 0, 0, C*K)); matrix_copy(C*K, djdwcgold, &LIBXSMM_VLA_ACCESS(2, djdw4, 1, 0, C*K)); matrix_copy(C*K, djdwfgold, &LIBXSMM_VLA_ACCESS(2, djdw4, 2, 0, C*K)); 
matrix_copy(C*K, djdwogold, &LIBXSMM_VLA_ACCESS(2, djdw4, 3, 0, C*K)); matrix_copy(K*K, djdrigold, &LIBXSMM_VLA_ACCESS(2, djdr4, 0, 0, K*K)); matrix_copy(K*K, djdrcgold, &LIBXSMM_VLA_ACCESS(2, djdr4, 1, 0, K*K)); matrix_copy(K*K, djdrfgold, &LIBXSMM_VLA_ACCESS(2, djdr4, 2, 0, K*K)); matrix_copy(K*K, djdrogold, &LIBXSMM_VLA_ACCESS(2, djdr4, 3, 0, K*K)); matrix_copy(K, djdbigold, &LIBXSMM_VLA_ACCESS(2, djdb4, 0, 0, K)); matrix_copy(K, djdbcgold, &LIBXSMM_VLA_ACCESS(2, djdb4, 1, 0, K)); matrix_copy(K, djdbfgold, &LIBXSMM_VLA_ACCESS(2, djdb4, 2, 0, K)); matrix_copy(K, djdbogold, &LIBXSMM_VLA_ACCESS(2, djdb4, 3, 0, K)); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, djdxgoldt, djdxtestt, 0, 0); printf("Delta input\n"); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K*4, 1, djdwgold4, djdwtest, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); libxsmm_matdiff(&norms_upd_r, LIBXSMM_DATATYPE_F32, K*K*4, 1, djdrgold4, djdrtest, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_r.l1_ref); printf("L1 test : %.25g\n", norms_upd_r.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_r.l2_abs); printf("L2 
rel.error : %.24f\n", norms_upd_r.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_r.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_r.linf_rel); printf("Check-norm : %.24f\n", norms_upd_r.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_r); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K*4, 1, djdbgold4, djdbtest, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( pass == 0 ) { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (((2.0 * K * N * C) + (2.0 * K * N * K) + (2.0 * K * N) + (tflops * K * N)) * 4.0 + (4.0 * K * N) + (tflops * K * N)) * (double)t * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 1 ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); 
printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ tempflops = (8.0 * K * N * C); /* W^T * dJd{c, i, f, o} */ tempflops += (3.0 * K * C); /* summation */ flops += tempflops; tempflops = (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 2 ) { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * 
K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ tempflops = (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ tempflops = (8.0 * K * N * C); /* delta{c, i, f, o} * x^T */ tempflops *= t; /* for t time steps */ flops += tempflops; tempflops = (8.0 * K * N * K); /* delta{c, i, f, o} * delta^T */ tempflops *= t; /* for t time steps */ flops += tempflops; flops += (4.0 * K * N * t); /* delbias */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 3 ) { printf("##########################################\n"); printf("# Performance - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ tempflops = (8.0 * K * N * C); /* W^T * dJd{c, i, f, o} */ tempflops += (3.0 * K * C); /* summation */ flops += tempflops; tempflops = (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ tempflops = (8.0 * K * N * C); /* 
delta{c, i, f, o} * x^T */ tempflops *= t; /* for t time steps */ flops += tempflops; tempflops = (8.0 * K * N * K); /* delta{c, i, f, o} * delta^T */ tempflops *= t; /* for t time steps */ flops += tempflops; flops += (4.0 * K * N * t); /* delbias */ flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP+WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 4 ) { printf("###############################################\n"); printf("# Performance - FWD+BWD+UPD (nc-kcck Storage) #\n"); printf("###############################################\n"); /* run LIBXSMM LSTM for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K * N; /* delta + delta_out */ flops += (6.0 * K * N + tflops * K * N); /* dJdd */ flops += (4.0 * K * N); /* dJdc */ flops += (4.0 * K * N); /* dJdi */ flops += (4.0 * K * N); /* dJdf */ flops += (4.0 * K * N + tflops * K * N); /* dJdo */ tempflops = (8.0 * K * N * C); /* W^T * dJd{c, i, f, o} */ tempflops += (3.0 * K * C); /* summation */ flops += tempflops; tempflops = (8.0 * K * N * K); /* R^T * dJd{c, i, f, o} */ flops += tempflops; flops *= t; /* for t time steps */ tempflops = (8.0 * K * N * C); /* delta{c, i, f, o} * x^T */ tempflops *= t; /* for t time steps */ flops += tempflops; tempflops = (8.0 * K * N * K); /* delta{c, i, f, o} * delta^T */ tempflops *= t; /* for t time steps 
*/ flops += tempflops; flops += (4.0 * K * N * t); /* delbias */ flops += (((2.0 * K * N * C) + (2.0 * K * N * K) + (2.0 * K * N) + (tflops * K * N)) * 4.0 + (4.0 * K * N) + (tflops * K * N)) * (double)t; flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp+bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,FP+BP+WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } /* clean-up */ if (pass == 0) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); } else { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); } libxsmm_free(scratch); libxsmm_free(internalstate); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CO ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_cs_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_weight ) ); 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_recur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_weight_t ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_recur_weight_t ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_cs ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_i ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_f ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_o ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_ci ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_co ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dcs_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dweight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_drecur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dbias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dcs ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_rnncell( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(xgoldt); libxsmm_free(cspgold); libxsmm_free(hpgold); libxsmm_free(wigold); libxsmm_free(wfgold); libxsmm_free(wogold); libxsmm_free(wcgold); libxsmm_free(rigold); libxsmm_free(rfgold); libxsmm_free(rogold); libxsmm_free(rcgold); libxsmm_free(bigold); libxsmm_free(bfgold); libxsmm_free(bogold); libxsmm_free(bcgold); libxsmm_free(hgoldt); libxsmm_free(bimgold); libxsmm_free(bfmgold); libxsmm_free(bomgold); libxsmm_free(bcmgold); libxsmm_free(bfgold_fb); libxsmm_free(igoldt); libxsmm_free(fgoldt); libxsmm_free(ogoldt); libxsmm_free(cgoldt); libxsmm_free(dgoldt); 
libxsmm_free(i1gold); libxsmm_free(i2gold); libxsmm_free(f1gold); libxsmm_free(f2gold); libxsmm_free(o1gold); libxsmm_free(o2gold); libxsmm_free(c1gold); libxsmm_free(c2gold); libxsmm_free(d1gold); libxsmm_free(d2gold); libxsmm_free(dhgold); libxsmm_free(i3gold); libxsmm_free(f3gold); libxsmm_free(d3gold); libxsmm_free(d4gold); libxsmm_free(deltagoldt); libxsmm_free(djdhgoldt); libxsmm_free(djdcspgold); libxsmm_free(djdigoldt); libxsmm_free(djdfgoldt); libxsmm_free(djdogoldt); libxsmm_free(djdcgoldt); libxsmm_free(djdxgoldt); libxsmm_free(djdwigold); libxsmm_free(djdwfgold); libxsmm_free(djdwogold); libxsmm_free(djdwcgold); libxsmm_free(djdrigold); libxsmm_free(djdrfgold); libxsmm_free(djdrogold); libxsmm_free(djdrcgold); libxsmm_free(djdbigold); libxsmm_free(djdbfgold); libxsmm_free(djdbogold); libxsmm_free(djdbcgold); libxsmm_free(djdhpgold); libxsmm_free(wgoldTp); libxsmm_free(rgoldTp); libxsmm_free(xgoldTp); libxsmm_free(hgoldTp); libxsmm_free(doutgoldt); libxsmm_free(xt); libxsmm_free(csp); libxsmm_free(hp); libxsmm_free(w); libxsmm_free(r); libxsmm_free(wt); libxsmm_free(rt); libxsmm_free(w_tmp); libxsmm_free(r_tmp); libxsmm_free(b); libxsmm_free(cst); libxsmm_free(ht); libxsmm_free(dxt); libxsmm_free(dcsp); libxsmm_free(dhp); libxsmm_free(dw); libxsmm_free(dr); libxsmm_free(db); libxsmm_free(dcs); libxsmm_free(dht); libxsmm_free(it); libxsmm_free(ft); libxsmm_free(ot); libxsmm_free(cit); libxsmm_free(cot); libxsmm_free(htest); libxsmm_free(djdxtestt); libxsmm_free(djdwtest); libxsmm_free(djdrtest); libxsmm_free(djdbtest); libxsmm_free(djdwgold4); libxsmm_free(djdrgold4); libxsmm_free(djdbgold4); libxsmm_free(djdcsgold); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 
1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/lstmdriver/lstmdriver_nc_kcck_f32.vcxproj000066400000000000000000000551711415223013700273550ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 lstmdriver_nc_kcck_f32 {4B1FF769-DEE3-441D-9939-C60F5487F2A0} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/lstmdriver/run_lstmcell.sh000077500000000000000000000113001415223013700244440ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.), Kunal Banerjee (Intel Corp.) ############################################################################### set -eo pipefail UNAME=$(command -v uname) SORT=$(command -v sort) GREP=$(command -v grep) CUT=$(command -v cut) WC=$(command -v wc) TR=$(command -v tr) NUMA=-1 if [ "" = "${CHECK}" ] || [ "0" = "${CHECK}" ]; then if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=64; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1000; fi else # check if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=64; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1; fi fi if [ $# -ne 8 ] then echo "Usage: $(basename $0) format=(nc_ck, nc_kcck) bin=(f32, bf16) iters type=(0-fwd, 1-bwd, 2-upd, 3-bwdupd) MB bn bc bk" FORMAT=nc_ck BIN=f32 ITERS=${CHECK_DNN_ITERS} TYPE=0 MB=${CHECK_DNN_MB} BN=32 BC=32 BK=32 else FORMAT=$1 BIN=$2 ITERS=$3 TYPE=$4 MB=$5 BN=$6 BC=$7 BK=$8 fi if [ "${GREP}" ] && [ "${SORT}" ] && [ "${CUT}" ] && [ "${TR}" ] && [ "${WC}" ]; then if [ "$(command -v lscpu)" ]; then NS=$(lscpu | ${GREP} -m1 "Socket(s)" | ${TR} -d " " | ${CUT} -d: -f2) if [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(lscpu | ${GREP} -m1 "Core(s) per socket" | ${TR} -d " " | ${CUT} -d: -f2))) NT=$((NC*$(lscpu | ${GREP} -m1 "Thread(s) per core" | ${TR} -d " " | ${CUT} -d: 
-f2))) elif [ -e /proc/cpuinfo ]; then NS=$(${GREP} "physical id" /proc/cpuinfo | ${SORT} -u | ${WC} -l | ${TR} -d " ") if [ "" = "${NS}" ] || [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(${GREP} -m1 "cpu cores" /proc/cpuinfo | ${TR} -d " " | ${CUT} -d: -f2))) NT=$(${GREP} "core id" /proc/cpuinfo | ${WC} -l | ${TR} -d " ") elif [ "Darwin" = "$(uname)" ]; then NS=$(sysctl hw.packages | ${CUT} -d: -f2 | ${TR} -d " ") NC=$(sysctl hw.physicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") NT=$(sysctl hw.logicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") fi if [ "${NC}" ] && [ "${NT}" ]; then HT=$((NT/NC)) else NS=1 NC=1 NT=1 HT=1 fi if [ "$(command -v numactl)" ]; then NN=$(numactl -H | ${GREP} "available:" | ${CUT} -d' ' -f2) else NN=${NS} fi fi CPUFLAGS=$(if [ "${GREP}" ] && [ "${CUT}" ] && [ -e /proc/cpuinfo ]; then ${GREP} -m1 flags /proc/cpuinfo | ${CUT} -d: -f2- || true; fi) if [ "${GREP}" ] && [ "$(echo "${CPUFLAGS}" | ${GREP} -o avx512er)" ]; then if [ "0" != "$((0>NUMA))" ] && [ "0" != "$((NS #include #include #include #include #if defined(_OPENMP) # include #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { libxsmm_bfloat16 **act_libxsmm, **fil_libxsmm, **delact_libxsmm, **delfil_libxsmm; libxsmm_bfloat16 **bias_libxsmm, **delbias_libxsmm; float **fil_master; unsigned char **relumask_libxsmm; void* scratch = NULL; size_t scratch_size = 0; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int MB = 32; /* mini-batch size, "N" */ int fuse_type = 0; /* 0: nothing fused, 1: relu fused, 2: elementwise fused, 3: relu and elementwise fused */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP */ int bn = 64; int bk = 64; 
int bc = 64; int *C; /* number of input feature maps, "C" */ int num_layers = 0; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double gflop = 0.0; int i, j; double act_size = 0.0; double fil_size = 0.0; libxsmm_dnn_fullyconnected_desc fullyconnected_desc; libxsmm_dnn_fullyconnected** libxsmm_fc_layer; libxsmm_dnn_optimizer_desc optimizer_desc; libxsmm_dnn_optimizer** libxsmm_opt; libxsmm_dnn_softmaxloss_desc softmaxloss_desc; libxsmm_dnn_softmaxloss* libxsmm_softmax; libxsmm_dnn_tensor** libxsmm_act; libxsmm_dnn_tensor** libxsmm_delact; libxsmm_dnn_tensor** libxsmm_fil; libxsmm_dnn_tensor** libxsmm_delfil; libxsmm_dnn_tensor** libxsmm_bias; libxsmm_dnn_tensor** libxsmm_delbias; libxsmm_dnn_tensor** libxsmm_relumask; libxsmm_dnn_tensor** libxsmm_mafil; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters MB fuse_type type bn bk bc C1 C2 ... CN\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; num_layers = argc - 9; if (argc > i) iters = atoi(argv[i++]); if (argc > i) MB = atoi(argv[i++]); if (argc > i) fuse_type = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (argc > i) bn = atoi(argv[i++]); if (argc > i) bk = atoi(argv[i++]); if (argc > i) bc = atoi(argv[i++]); /* allocate the number of channles buffer */ if ( num_layers < 1 ) { printf("Usage: %s iters MB fuse_type type bn bk bc C1 C2 ... 
CN\n", argv[0]); return 0; } C = (int*)malloc((num_layers+2)*sizeof(int)); for (j = 0 ; i < argc; ++i, ++j ) { C[j] = atoi(argv[i]); } /* handle softmax config */ C[num_layers+1] = C[num_layers]; if (type != 'A' && type != 'F' && type != 'B') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only)\n"); return -1; } if ( (fuse_type < 0) || (fuse_type > 5) ) { printf("fuse type needs to be 0 (None), 1 (Bias), 2 (ReLU), 3 (Sigmoid), 4 (Bias+ReLU), 5 (Bias+Sigmoid)\n"); return -1; } #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d\n", MB); printf("PARAMS: Layers: %d\n", num_layers); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); for (i = 0; i < num_layers; ++i ) { if (i == 0) { act_size += (double)(MB*C[i]*sizeof(libxsmm_bfloat16))/(1024.0*1024.0); printf("SIZE Activations %i (%dx%d): %10.2f MiB\n", i, MB, C[i], (double)(MB*C[i]*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); } act_size += (double)(MB*C[i+1]*sizeof(libxsmm_bfloat16))/(1024.0*1024.0); fil_size += (double)(C[i]*C[i+1]*sizeof(libxsmm_bfloat16))/(1024.0*1024.0); printf("SIZE Filter %i (%dx%d): %10.2f MiB\n", i, C[i], C[i+1], (double)(C[i]*C[i+1]*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("SIZE Activations %i (%dx%d): %10.2f MiB\n", i+1, MB, C[i+1], (double)(MB*C[i+1]*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); } act_size += (double)(MB*C[num_layers+1]*sizeof(float))/(1024.0*1024.0); printf("SIZE Activations softmax (%dx%d): %10.2f MiB\n", MB, C[num_layers+1], (double)(MB*C[num_layers+1]*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) ); printf("\nTOTAL SIZE Activations: %10.2f MiB\n", act_size ); printf("TOTAL SIZE 
Filter (incl. master): %10.2f MiB\n", 3.0*fil_size ); printf("TOTAL SIZE delActivations: %10.2f MiB\n", act_size ); printf("TOTAL SIZE delFilter: %10.2f MiB\n", fil_size ); printf("TOTAL SIZE MLP: %10.2f MiB\n", (4.0*fil_size) + (2.0*act_size) ); /* allocate data */ act_libxsmm = (libxsmm_bfloat16**)malloc( (num_layers+2)*sizeof(libxsmm_bfloat16*) ); delact_libxsmm = (libxsmm_bfloat16**)malloc( (num_layers+1)*sizeof(libxsmm_bfloat16*) ); for ( i = 0 ; i < num_layers+2; ++i ) { act_libxsmm[i] = (libxsmm_bfloat16*)libxsmm_aligned_malloc( MB*C[i]*sizeof(libxsmm_bfloat16), 2097152); /* softmax has no incoming gradients */ if ( i < num_layers+1 ) { delact_libxsmm[i] = (libxsmm_bfloat16*)libxsmm_aligned_malloc( MB*C[i]*sizeof(libxsmm_bfloat16), 2097152); } } fil_master = (float**) malloc( num_layers*sizeof(float*) ); fil_libxsmm = (libxsmm_bfloat16**)malloc( num_layers*sizeof(libxsmm_bfloat16*) ); delfil_libxsmm = (libxsmm_bfloat16**)malloc( num_layers*sizeof(libxsmm_bfloat16*) ); for ( i = 0 ; i < num_layers; ++i ) { fil_master[i] = (float*) libxsmm_aligned_malloc( C[i]*C[i+1]*sizeof(float), 2097152); fil_libxsmm[i] = (libxsmm_bfloat16*)libxsmm_aligned_malloc( C[i]*C[i+1]*sizeof(libxsmm_bfloat16), 2097152); delfil_libxsmm[i] = (libxsmm_bfloat16*)libxsmm_aligned_malloc( C[i]*C[i+1]*sizeof(libxsmm_bfloat16), 2097152); } bias_libxsmm = (libxsmm_bfloat16**)malloc( num_layers*sizeof(libxsmm_bfloat16*) ); delbias_libxsmm = (libxsmm_bfloat16**)malloc( num_layers*sizeof(libxsmm_bfloat16*) ); for ( i = 0 ; i < num_layers; ++i ) { bias_libxsmm[i] = (libxsmm_bfloat16*)libxsmm_aligned_malloc( C[i+1]*sizeof(libxsmm_bfloat16), 2097152); delbias_libxsmm[i] = (libxsmm_bfloat16*)libxsmm_aligned_malloc( C[i+1]*sizeof(libxsmm_bfloat16), 2097152); } relumask_libxsmm = (unsigned char**)malloc( num_layers*sizeof(unsigned char*) ); for ( i = 0 ; i < num_layers; ++i ) { relumask_libxsmm[i] = (unsigned char*)libxsmm_aligned_malloc( MB*C[i+1]*sizeof(unsigned char), 2097152); } /* init data */ 
for ( i = 0 ; i < num_layers+2; ++i ) { init_buf_bf16( act_libxsmm[i], MB*C[i], 0, 0 ); } for ( i = 0 ; i < num_layers+1; ++i ) { init_buf_bf16( delact_libxsmm[i], MB*C[i], 0, 0 ); } for ( i = 0 ; i < num_layers; ++i ) { init_buf( fil_master[i], C[i]*C[i+1], 0, 0 ); libxsmm_rne_convert_fp32_bf16( fil_master[i], fil_libxsmm[i], C[i]*C[i+1] ); } for ( i = 0 ; i < num_layers; ++i ) { init_buf_bf16( delfil_libxsmm[i], C[i]*C[i+1], 0, 0 ); } for ( i = 0 ; i < num_layers; ++i ) { init_buf_bf16( bias_libxsmm[i], C[i+1], 0, 0 ); } for ( i = 0 ; i < num_layers; ++i ) { init_buf_bf16( delbias_libxsmm[i], C[i+1], 0, 0 ); } for ( i = 0 ; i < num_layers; ++i ) { zero_buf_uint8( relumask_libxsmm[i], MB*C[i+1] ); } printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); libxsmm_fc_layer = (libxsmm_dnn_fullyconnected**) malloc( num_layers*sizeof(libxsmm_dnn_fullyconnected*) ); libxsmm_opt = (libxsmm_dnn_optimizer**) malloc( num_layers*sizeof(libxsmm_dnn_optimizer*) ); libxsmm_act = (libxsmm_dnn_tensor**) malloc( (num_layers+2)*sizeof(libxsmm_dnn_tensor*) ); libxsmm_delact = (libxsmm_dnn_tensor**) malloc( (num_layers+1)*sizeof(libxsmm_dnn_tensor*) ); libxsmm_fil = (libxsmm_dnn_tensor**) malloc( num_layers*sizeof(libxsmm_dnn_tensor*) ); libxsmm_delfil = (libxsmm_dnn_tensor**) malloc( num_layers*sizeof(libxsmm_dnn_tensor*) ); libxsmm_bias = (libxsmm_dnn_tensor**) malloc( num_layers*sizeof(libxsmm_dnn_tensor*) ); libxsmm_delbias = (libxsmm_dnn_tensor**) malloc( num_layers*sizeof(libxsmm_dnn_tensor*) ); libxsmm_relumask = (libxsmm_dnn_tensor**) malloc( num_layers*sizeof(libxsmm_dnn_tensor*) ); libxsmm_mafil = (libxsmm_dnn_tensor**) malloc( num_layers*sizeof(libxsmm_dnn_tensor*) ); for ( i = 0; i < num_layers; ++i ) { fullyconnected_desc.N = MB; fullyconnected_desc.C = C[i]; fullyconnected_desc.K = C[i+1]; fullyconnected_desc.bn = (MB % bn == 0) ? 
bn : MB; fullyconnected_desc.bc = (C[i ] % bc == 0) ? bc : C[i ]; fullyconnected_desc.bk = (C[i+1] % bk == 0) ? bk : C[i+1]; fullyconnected_desc.threads = nThreads; fullyconnected_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; fullyconnected_desc.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; fullyconnected_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED; fullyconnected_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED; if ( fuse_type == 0 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE; } else if ( fuse_type == 1 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS; } else if ( fuse_type == 2 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU; } else if ( fuse_type == 3 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID; } else if ( fuse_type == 4 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU; } else if ( fuse_type == 5 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID; } else { /* cannot happen */ } libxsmm_fc_layer[i] = libxsmm_dnn_create_fullyconnected( fullyconnected_desc, &status ); CHKERR_LIBXSMM_DNN( status ); optimizer_desc.C = C[i]; optimizer_desc.K = C[i+1]; optimizer_desc.bc = (C[i ] % bc == 0) ? bc : C[i ]; optimizer_desc.bk = (C[i+1] % bk == 0) ? 
bk : C[i+1]; optimizer_desc.learning_rate = 0.1f; optimizer_desc.threads = nThreads; optimizer_desc.opt_type = LIBXSMM_DNN_OPTIMIZER_SGD; optimizer_desc.datatype = LIBXSMM_DNN_DATATYPE_BF16; optimizer_desc.datatype_master = LIBXSMM_DNN_DATATYPE_F32; optimizer_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED; libxsmm_opt[i] = libxsmm_dnn_create_optimizer( optimizer_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers */ if ( i == 0 ) { libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_act[i] = libxsmm_dnn_link_tensor( libxsmm_layout, act_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delact[i] = libxsmm_dnn_link_tensor( libxsmm_layout, delact_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); } libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_act[i+1] = libxsmm_dnn_link_tensor( libxsmm_layout, act_libxsmm[i+1], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delact[i+1] = libxsmm_dnn_link_tensor( libxsmm_layout, delact_libxsmm[i+1], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_fil[i] = 
libxsmm_dnn_link_tensor( libxsmm_layout, fil_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delfil[i] = libxsmm_dnn_link_tensor( libxsmm_layout, delfil_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_CHANNEL_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias[i] = libxsmm_dnn_link_tensor( libxsmm_layout, bias_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delbias[i] = libxsmm_dnn_link_tensor( libxsmm_layout, delbias_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_RELU_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_relumask[i] = libxsmm_dnn_link_tensor( libxsmm_layout, relumask_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_act[ i], LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_delact[i ], LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_act[i+1], LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_delact[i+1], LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_fil[i], LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_delfil[i], LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_bias[i], LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_delbias[i], LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_relumask[i], LIBXSMM_DNN_RELU_MASK ) ); libxsmm_layout = libxsmm_dnn_optimizer_create_tensor_datalayout( libxsmm_opt[i], LIBXSMM_DNN_MASTER_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_mafil[i] = libxsmm_dnn_link_tensor( libxsmm_layout, fil_master[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* bind filters to optimizer */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_bind_tensor( libxsmm_opt[i], libxsmm_fil[i], LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_bind_tensor( libxsmm_opt[i], libxsmm_mafil[i], LIBXSMM_DNN_MASTER_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_bind_tensor( libxsmm_opt[i], libxsmm_delfil[i], LIBXSMM_DNN_GRADIENT_FILTER ) ); /* let's allocate and bind scratch */ if ( libxsmm_dnn_fullyconnected_get_scratch_size( libxsmm_fc_layer[i], &status ) > scratch_size ) { scratch_size = libxsmm_dnn_fullyconnected_get_scratch_size( libxsmm_fc_layer[i], &status ); CHKERR_LIBXSMM_DNN( status ); if ( scratch != NULL ) { libxsmm_free( scratch ); } scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); init_buf( (float*)scratch, scratch_size/4, 0, 0 ); } if ( libxsmm_dnn_optimizer_get_scratch_size( libxsmm_opt[i], &status ) > 
scratch_size ) { scratch_size = libxsmm_dnn_optimizer_get_scratch_size( libxsmm_opt[i], &status ); CHKERR_LIBXSMM_DNN( status ); if ( scratch != NULL ) { libxsmm_free( scratch ); } scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); init_buf( (float*)scratch, scratch_size/4, 0, 0 ); } } /* create softmax layer */ softmaxloss_desc.N = MB; softmaxloss_desc.C = C[num_layers]; softmaxloss_desc.bn = (MB % bn == 0) ? bn : MB; softmaxloss_desc.bc = (C[num_layers] % bc == 0) ? bc : C[num_layers]; softmaxloss_desc.loss_weight = 1.0; softmaxloss_desc.threads = nThreads; softmaxloss_desc.datatype = LIBXSMM_DNN_DATATYPE_BF16; softmaxloss_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED; libxsmm_softmax = libxsmm_dnn_create_softmaxloss( softmaxloss_desc, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_layout = libxsmm_dnn_softmaxloss_create_tensor_datalayout( libxsmm_softmax, LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_act[num_layers+1] = libxsmm_dnn_link_tensor( libxsmm_layout, act_libxsmm[num_layers+1], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_bind_tensor( libxsmm_softmax, libxsmm_act[num_layers], LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_bind_tensor( libxsmm_softmax, libxsmm_delact[num_layers], LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_bind_tensor( libxsmm_softmax, libxsmm_act[num_layers+1], LIBXSMM_DNN_REGULAR_OUTPUT ) ); if ( libxsmm_dnn_softmaxloss_get_scratch_size( libxsmm_softmax, &status ) > scratch_size ) { scratch_size = libxsmm_dnn_softmaxloss_get_scratch_size( libxsmm_softmax, &status ); CHKERR_LIBXSMM_DNN( status ); if ( scratch != NULL ) { libxsmm_free( scratch ); } scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); init_buf( (float*)scratch, scratch_size/4, 0, 0 ); } /* bind scratch to all layers */ for ( i = 0; i < num_layers; ++i ) { 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_scratch( libxsmm_fc_layer[i], scratch ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_bind_scratch( libxsmm_opt[i], scratch ) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_bind_scratch( libxsmm_softmax, scratch ) ); if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i,j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { for ( i = 0; i < num_layers; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_fc_layer[i], LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } libxsmm_dnn_softmaxloss_execute_st( libxsmm_softmax, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = 0.0; for ( i = 0; i < num_layers; ++i) { gflop += (2.0*(double)MB*(double)C[i]*(double)C[i+1]*(double)iters) / (1000.0*1000.0*1000.0); } printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,FP,%s,%i,%i,", LIBXSMM_VERSION, nThreads, MB ); for ( i = 0; i < num_layers; ++i ) { printf("%i,", C[i] ); } printf("%f,%f\n", ((double)(l_total/iters)), gflop/l_total); } if (type == 'A' || type == 'B') { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i,j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_softmaxloss_execute_st( libxsmm_softmax, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); for ( i = 
num_layers-1; i > 0; --i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_fc_layer[i], LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); libxsmm_dnn_optimizer_execute_st( libxsmm_opt[i], 0, tid ); } libxsmm_dnn_fullyconnected_execute_st( libxsmm_fc_layer[0], LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); libxsmm_dnn_optimizer_execute_st( libxsmm_opt[i], 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = 0.0; for ( i = num_layers-1; i > 0; --i) { gflop += (4.0*(double)MB*(double)C[i]*(double)C[i+1]*(double)iters) / (1000.0*1000.0*1000.0); } gflop += (2.0*(double)MB*(double)C[0]*(double)C[1]*(double)iters) / (1000.0*1000.0*1000.0); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,BP,%s,%i,%i,", LIBXSMM_VERSION, nThreads, MB ); for ( i = 0; i < num_layers; ++i ) { printf("%i,", C[i] ); } printf("%f,%f\n", ((double)(l_total/iters)), gflop/l_total); } if (type == 'A') { printf("##########################################\n"); printf("# Performance - FWD-BWD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i,j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { for ( i = 0; i < num_layers; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_fc_layer[i], LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } libxsmm_dnn_softmaxloss_execute_st( libxsmm_softmax, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); libxsmm_dnn_softmaxloss_execute_st( libxsmm_softmax, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); for ( i = (num_layers-1); i > 0; --i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_fc_layer[i], LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); libxsmm_dnn_optimizer_execute_st( libxsmm_opt[i], 0, tid ); } libxsmm_dnn_fullyconnected_execute_st( 
libxsmm_fc_layer[0], LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); libxsmm_dnn_optimizer_execute_st( libxsmm_opt[i], 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = 0.0; for ( i = num_layers-1; i > 0; --i) { gflop += (6.0*(double)MB*(double)C[i]*(double)C[i+1]*(double)iters) / (1000.0*1000.0*1000.0); } gflop += (4.0*(double)MB*(double)C[0]*(double)C[1]*(double)iters) / (1000.0*1000.0*1000.0); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,BP,%s,%i,%i,", LIBXSMM_VERSION, nThreads, MB ); for ( i = 0; i < num_layers; ++i ) { printf("%i,", C[i] ); } printf("%f,%f\n", ((double)(l_total/iters)), gflop/l_total); } for ( i = 0; i < num_layers; ++i ) { /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_scratch( libxsmm_fc_layer[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_RELU_MASK ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_destroy_fullyconnected( libxsmm_fc_layer[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_release_scratch( libxsmm_opt[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_release_tensor( libxsmm_opt[i], LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_release_tensor( libxsmm_opt[i], LIBXSMM_DNN_MASTER_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_release_tensor( libxsmm_opt[i], LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_optimizer( libxsmm_opt[i] ) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_release_scratch( libxsmm_softmax ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_release_tensor( libxsmm_softmax, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_release_tensor( libxsmm_softmax, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_release_tensor( libxsmm_softmax, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_softmaxloss( libxsmm_softmax ) ); for ( i = 0; i < num_layers; ++i ) { if ( i == 0 ) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_act[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delact[i] ) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_act[i+1] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delact[i+1] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_fil[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delfil[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delbias[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_relumask[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_mafil[i] ) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_act[num_layers+1] ) ); /* deallocate data */ libxsmm_free(scratch); for ( i = 0; i < num_layers; ++i ) { if ( i == 0 ) { libxsmm_free(act_libxsmm[i]); 
libxsmm_free(delact_libxsmm[i]); } libxsmm_free(act_libxsmm[i+1]); libxsmm_free(delact_libxsmm[i+1]); libxsmm_free(fil_libxsmm[i]); libxsmm_free(delfil_libxsmm[i]); libxsmm_free(bias_libxsmm[i]); libxsmm_free(delbias_libxsmm[i]); libxsmm_free(relumask_libxsmm[i]); libxsmm_free(fil_master[i]); } libxsmm_free(act_libxsmm[num_layers+1]); free( libxsmm_act ); free( libxsmm_delact ); free( libxsmm_fil ); free( libxsmm_delfil ); free( libxsmm_bias ); free( libxsmm_delbias ); free( libxsmm_relumask ); free( libxsmm_mafil ); free( libxsmm_fc_layer ); free( libxsmm_opt ); free( act_libxsmm ); free( delact_libxsmm ); free( fil_master ); free( fil_libxsmm ); free( delfil_libxsmm ); free( bias_libxsmm ); free( delbias_libxsmm ); free( relumask_libxsmm ); free( C ); /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/mlpdriver/mlp_example_f32.c000066400000000000000000000675101415223013700243520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include #include #include #include #include #if defined(_OPENMP) # include #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float **act_libxsmm, **fil_libxsmm, **delact_libxsmm, **delfil_libxsmm; float **bias_libxsmm, **delbias_libxsmm; unsigned char **relumask_libxsmm; void* scratch = NULL; size_t scratch_size = 0; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int MB = 32; /* mini-batch size, "N" */ int fuse_type = 0; /* 0: nothing fused, 1: relu fused, 2: elementwise fused, 3: relu and elementwise fused */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP */ int bn = 64; int bk = 64; int bc = 64; int *C; /* number of input feature maps, "C" */ int num_layers = 0; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 
1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double gflop = 0.0; int i, j; double fil_size = 0.0; double act_size = 0.0; libxsmm_dnn_fullyconnected_desc fullyconnected_desc; libxsmm_dnn_fullyconnected** libxsmm_fc_layer; libxsmm_dnn_optimizer_desc optimizer_desc; libxsmm_dnn_optimizer** libxsmm_opt; libxsmm_dnn_softmaxloss_desc softmaxloss_desc; libxsmm_dnn_softmaxloss* libxsmm_softmax; libxsmm_dnn_tensor** libxsmm_act; libxsmm_dnn_tensor** libxsmm_delact; libxsmm_dnn_tensor** libxsmm_fil; libxsmm_dnn_tensor** libxsmm_delfil; libxsmm_dnn_tensor** libxsmm_bias; libxsmm_dnn_tensor** libxsmm_delbias; libxsmm_dnn_tensor** libxsmm_relumask; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters MB fuse_type type bn bk bc C1 C2 ... CN\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; num_layers = argc - 9; if (argc > i) iters = atoi(argv[i++]); if (argc > i) MB = atoi(argv[i++]); if (argc > i) fuse_type = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (argc > i) bn = atoi(argv[i++]); if (argc > i) bk = atoi(argv[i++]); if (argc > i) bc = atoi(argv[i++]); /* allocate the number of channles buffer */ if ( num_layers < 1 ) { printf("Usage: %s iters MB fuse_type type bn bk bc C1 C2 ... 
CN\n", argv[0]); return 0; } C = (int*)malloc((num_layers+2)*sizeof(int)); for (j = 0 ; i < argc; ++i, ++j ) { C[j] = atoi(argv[i]); } /* handle softmax config */ C[num_layers+1] = C[num_layers]; if (type != 'A' && type != 'F' && type != 'B') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only)\n"); return -1; } if ( (fuse_type < 0) || (fuse_type > 5) ) { printf("fuse type needs to be 0 (None), 1 (Bias), 2 (ReLU), 3 (Sigmoid), 4 (Bias+ReLU), 5 (Bias+Sigmoid)\n"); return -1; } #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d\n", MB); printf("PARAMS: Layers: %d\n", num_layers); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); for (i = 0; i < num_layers; ++i ) { if (i == 0) { act_size += (double)(MB*C[i]*sizeof(float))/(1024.0*1024.0); printf("SIZE Activations %i (%dx%d): %10.2f MiB\n", i, MB, C[i], (double)(MB*C[i]*sizeof(float))/(1024.0*1024.0) ); } act_size += (double)(MB*C[i+1]*sizeof(float))/(1024.0*1024.0); fil_size += (double)(C[i]*C[i+1]*sizeof(float))/(1024.0*1024.0); printf("SIZE Filter %i (%dx%d): %10.2f MiB\n", i, C[i], C[i+1], (double)(C[i]*C[i+1]*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Activations %i (%dx%d): %10.2f MiB\n", i+1, MB, C[i+1], (double)(MB*C[i+1]*sizeof(float))/(1024.0*1024.0) ); } act_size += (double)(MB*C[num_layers+1]*sizeof(float))/(1024.0*1024.0); printf("SIZE Activations softmax (%dx%d): %10.2f MiB\n", MB, C[num_layers+1], (double)(MB*C[num_layers+1]*sizeof(float))/(1024.0*1024.0) ); printf("\nTOTAL SIZE Activations: %10.2f MiB\n", act_size ); printf("TOTAL SIZE Filter: %10.2f MiB\n", fil_size ); printf("TOTAL SIZE delActivations: %10.2f 
MiB\n", act_size ); printf("TOTAL SIZE delFilter: %10.2f MiB\n", fil_size ); printf("TOTAL SIZE MLP: %10.2f MiB\n", (2.0*fil_size) + (2.0*act_size) ); /* allocate data */ /* +2 because of the softwax layer */ act_libxsmm = (float**)malloc( (num_layers+2)*sizeof(float*) ); delact_libxsmm = (float**)malloc( (num_layers+1)*sizeof(float*) ); for ( i = 0 ; i < num_layers+2; ++i ) { act_libxsmm[i] = (float*)libxsmm_aligned_malloc( MB*C[i]*sizeof(float), 2097152); /* softmax has no incoming gradients */ if ( i < num_layers+1 ) { delact_libxsmm[i] = (float*)libxsmm_aligned_malloc( MB*C[i]*sizeof(float), 2097152); } } fil_libxsmm = (float**)malloc( num_layers*sizeof(float*) ); delfil_libxsmm = (float**)malloc( num_layers*sizeof(float*) ); for ( i = 0 ; i < num_layers; ++i ) { fil_libxsmm[i] = (float*)libxsmm_aligned_malloc( C[i]*C[i+1]*sizeof(float), 2097152); delfil_libxsmm[i] = (float*)libxsmm_aligned_malloc( C[i]*C[i+1]*sizeof(float), 2097152); } bias_libxsmm = (float**)malloc( num_layers*sizeof(float*) ); delbias_libxsmm = (float**)malloc( num_layers*sizeof(float*) ); for ( i = 0 ; i < num_layers; ++i ) { bias_libxsmm[i] = (float*)libxsmm_aligned_malloc( C[i+1]*sizeof(float), 2097152); delbias_libxsmm[i] = (float*)libxsmm_aligned_malloc( C[i+1]*sizeof(float), 2097152); } relumask_libxsmm = (unsigned char**)malloc( num_layers*sizeof(unsigned char*) ); for ( i = 0 ; i < num_layers; ++i ) { relumask_libxsmm[i] = (unsigned char*)libxsmm_aligned_malloc( MB*C[i+1]*sizeof(unsigned char), 2097152); } /* init data */ for ( i = 0 ; i < num_layers+2; ++i ) { init_buf( act_libxsmm[i], MB*C[i], 0, 0 ); } for ( i = 0 ; i < num_layers+1; ++i ) { init_buf( delact_libxsmm[i], MB*C[i], 0, 0 ); } for ( i = 0 ; i < num_layers; ++i ) { init_buf( fil_libxsmm[i], C[i]*C[i+1], 0, 0 ); } for ( i = 0 ; i < num_layers; ++i ) { init_buf( delfil_libxsmm[i], C[i]*C[i+1], 0, 0 ); } for ( i = 0 ; i < num_layers; ++i ) { init_buf( bias_libxsmm[i], C[i+1], 0, 0 ); } for ( i = 0 ; i < num_layers; ++i ) { 
init_buf( delbias_libxsmm[i], C[i+1], 0, 0 ); } for ( i = 0 ; i < num_layers; ++i ) { zero_buf_uint8( relumask_libxsmm[i], MB*C[i+1] ); } printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); libxsmm_fc_layer = (libxsmm_dnn_fullyconnected**) malloc( num_layers*sizeof(libxsmm_dnn_fullyconnected*) ); libxsmm_opt = (libxsmm_dnn_optimizer**) malloc( num_layers*sizeof(libxsmm_dnn_optimizer*) ); libxsmm_act = (libxsmm_dnn_tensor**) malloc( (num_layers+2)*sizeof(libxsmm_dnn_tensor*) ); libxsmm_delact = (libxsmm_dnn_tensor**) malloc( (num_layers+1)*sizeof(libxsmm_dnn_tensor*) ); libxsmm_fil = (libxsmm_dnn_tensor**) malloc( num_layers*sizeof(libxsmm_dnn_tensor*) ); libxsmm_delfil = (libxsmm_dnn_tensor**) malloc( num_layers*sizeof(libxsmm_dnn_tensor*) ); libxsmm_bias = (libxsmm_dnn_tensor**) malloc( num_layers*sizeof(libxsmm_dnn_tensor*) ); libxsmm_delbias = (libxsmm_dnn_tensor**) malloc( num_layers*sizeof(libxsmm_dnn_tensor*) ); libxsmm_relumask = (libxsmm_dnn_tensor**) malloc( num_layers*sizeof(libxsmm_dnn_tensor*) ); for ( i = 0; i < num_layers; ++i ) { fullyconnected_desc.N = MB; fullyconnected_desc.C = C[i]; fullyconnected_desc.K = C[i+1]; fullyconnected_desc.bn = (MB % bn == 0) ? bn : MB; fullyconnected_desc.bc = (C[i ] % bc == 0) ? bc : C[i ]; fullyconnected_desc.bk = (C[i+1] % bk == 0) ? 
bk : C[i+1]; fullyconnected_desc.threads = nThreads; fullyconnected_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; fullyconnected_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; fullyconnected_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED; fullyconnected_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED; if ( fuse_type == 0 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE; } else if ( fuse_type == 1 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS; } else if ( fuse_type == 2 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU; } else if ( fuse_type == 3 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID; } else if ( fuse_type == 4 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU; } else if ( fuse_type == 5 ) { fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID; } else { /* cannot happen */ } libxsmm_fc_layer[i] = libxsmm_dnn_create_fullyconnected( fullyconnected_desc, &status ); CHKERR_LIBXSMM_DNN( status ); optimizer_desc.C = C[i]; optimizer_desc.K = C[i+1]; optimizer_desc.bc = (C[i ] % bc == 0) ? bc : C[i ]; optimizer_desc.bk = (C[i+1] % bk == 0) ? 
bk : C[i+1]; optimizer_desc.learning_rate = 0.1f; optimizer_desc.threads = nThreads; optimizer_desc.opt_type = LIBXSMM_DNN_OPTIMIZER_SGD; optimizer_desc.datatype = LIBXSMM_DNN_DATATYPE_F32; optimizer_desc.datatype_master = LIBXSMM_DNN_DATATYPE_F32; optimizer_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED; libxsmm_opt[i] = libxsmm_dnn_create_optimizer( optimizer_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers */ if ( i == 0 ) { libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_act[i] = libxsmm_dnn_link_tensor( libxsmm_layout, act_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delact[i] = libxsmm_dnn_link_tensor( libxsmm_layout, delact_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); } libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_act[i+1] = libxsmm_dnn_link_tensor( libxsmm_layout, act_libxsmm[i+1], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delact[i+1] = libxsmm_dnn_link_tensor( libxsmm_layout, delact_libxsmm[i+1], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_fil[i] = 
libxsmm_dnn_link_tensor( libxsmm_layout, fil_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_FILTER, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delfil[i] = libxsmm_dnn_link_tensor( libxsmm_layout, delfil_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_CHANNEL_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias[i] = libxsmm_dnn_link_tensor( libxsmm_layout, bias_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delbias[i] = libxsmm_dnn_link_tensor( libxsmm_layout, delbias_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_fc_layer[i], LIBXSMM_DNN_RELU_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_relumask[i] = libxsmm_dnn_link_tensor( libxsmm_layout, relumask_libxsmm[i], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* bind buffers and filter to fc layer */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_act[ i], LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_delact[i ], LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_act[i+1], LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_delact[i+1], LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_fil[i], LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_delfil[i], LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_bias[i], LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_delbias[i], LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_fc_layer[i], libxsmm_relumask[i], LIBXSMM_DNN_RELU_MASK ) ); /* bind filters to optimizer */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_bind_tensor( libxsmm_opt[i], libxsmm_fil[i], LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_bind_tensor( libxsmm_opt[i], libxsmm_delfil[i], LIBXSMM_DNN_GRADIENT_FILTER ) ); /* let's allocate and bind scratch */ if ( libxsmm_dnn_fullyconnected_get_scratch_size( libxsmm_fc_layer[i], &status ) > scratch_size ) { scratch_size = libxsmm_dnn_fullyconnected_get_scratch_size( libxsmm_fc_layer[i], &status ); CHKERR_LIBXSMM_DNN( status ); if ( scratch != NULL ) { libxsmm_free( scratch ); } scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); init_buf( (float*)scratch, scratch_size/4, 0, 0 ); } if ( libxsmm_dnn_optimizer_get_scratch_size( libxsmm_opt[i], &status ) > scratch_size ) { scratch_size = libxsmm_dnn_optimizer_get_scratch_size( libxsmm_opt[i], &status ); CHKERR_LIBXSMM_DNN( status ); if ( scratch != NULL ) { libxsmm_free( scratch ); } scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); init_buf( (float*)scratch, scratch_size/4, 0, 0 ); } } /* create softmax layer */ softmaxloss_desc.N = MB; softmaxloss_desc.C = C[num_layers]; softmaxloss_desc.bn = (MB % bn == 0) ? 
bn : MB; softmaxloss_desc.bc = (C[num_layers] % bc == 0) ? bc : C[num_layers]; softmaxloss_desc.loss_weight = 1.0; softmaxloss_desc.threads = nThreads; softmaxloss_desc.datatype = LIBXSMM_DNN_DATATYPE_F32; softmaxloss_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED; libxsmm_softmax = libxsmm_dnn_create_softmaxloss( softmaxloss_desc, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_layout = libxsmm_dnn_softmaxloss_create_tensor_datalayout( libxsmm_softmax, LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_act[num_layers+1] = libxsmm_dnn_link_tensor( libxsmm_layout, act_libxsmm[num_layers+1], &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_bind_tensor( libxsmm_softmax, libxsmm_act[num_layers], LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_bind_tensor( libxsmm_softmax, libxsmm_delact[num_layers], LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_bind_tensor( libxsmm_softmax, libxsmm_act[num_layers+1], LIBXSMM_DNN_REGULAR_OUTPUT ) ); if ( libxsmm_dnn_softmaxloss_get_scratch_size( libxsmm_softmax, &status ) > scratch_size ) { scratch_size = libxsmm_dnn_softmaxloss_get_scratch_size( libxsmm_softmax, &status ); CHKERR_LIBXSMM_DNN( status ); if ( scratch != NULL ) { libxsmm_free( scratch ); } scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); init_buf( (float*)scratch, scratch_size/4, 0, 0 ); } /* bind scratch to all layers */ for ( i = 0; i < num_layers; ++i ) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_scratch( libxsmm_fc_layer[i], scratch ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_bind_scratch( libxsmm_opt[i], scratch ) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_bind_scratch( libxsmm_softmax, scratch ) ); if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); 
printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i,j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { for ( i = 0; i < num_layers; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_fc_layer[i], LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } libxsmm_dnn_softmaxloss_execute_st( libxsmm_softmax, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = 0.0; for ( i = 0; i < num_layers; ++i) { gflop += (2.0*(double)MB*(double)C[i]*(double)C[i+1]*(double)iters) / (1000.0*1000.0*1000.0); } printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,FP,%s,%i,%i,", LIBXSMM_VERSION, nThreads, MB ); for ( i = 0; i < num_layers; ++i ) { printf("%i,", C[i] ); } printf("%f,%f\n", ((double)(l_total/iters)), gflop/l_total); } if (type == 'A' || type == 'B') { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i,j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { libxsmm_dnn_softmaxloss_execute_st( libxsmm_softmax, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); for ( i = num_layers-1; i > 0; --i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_fc_layer[i], LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); libxsmm_dnn_optimizer_execute_st( libxsmm_opt[i], 0, tid ); } libxsmm_dnn_fullyconnected_execute_st( libxsmm_fc_layer[0], LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); libxsmm_dnn_optimizer_execute_st( libxsmm_opt[i], 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = 
libxsmm_timer_duration(l_start, l_end); gflop = 0.0; for ( i = num_layers-1; i > 0; --i) { gflop += (4.0*(double)MB*(double)C[i]*(double)C[i+1]*(double)iters) / (1000.0*1000.0*1000.0); } gflop += (2.0*(double)MB*(double)C[0]*(double)C[1]*(double)iters) / (1000.0*1000.0*1000.0); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,BP,%s,%i,%i,", LIBXSMM_VERSION, nThreads, MB ); for ( i = 0; i < num_layers; ++i ) { printf("%i,", C[i] ); } printf("%f,%f\n", ((double)(l_total/iters)), gflop/l_total); } if (type == 'A') { printf("##########################################\n"); printf("# Performance - FWD-BWD (custom-Storage) #\n"); printf("##########################################\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i,j) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (j = 0; j < iters; ++j) { for ( i = 0; i < num_layers; ++i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_fc_layer[i], LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } libxsmm_dnn_softmaxloss_execute_st( libxsmm_softmax, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); libxsmm_dnn_softmaxloss_execute_st( libxsmm_softmax, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); for ( i = (num_layers-1); i > 0; --i) { libxsmm_dnn_fullyconnected_execute_st( libxsmm_fc_layer[i], LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); libxsmm_dnn_optimizer_execute_st( libxsmm_opt[i], 0, tid ); } libxsmm_dnn_fullyconnected_execute_st( libxsmm_fc_layer[0], LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); libxsmm_dnn_optimizer_execute_st( libxsmm_opt[i], 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gflop = 0.0; for ( i = num_layers-1; i > 0; --i) { gflop += (6.0*(double)MB*(double)C[i]*(double)C[i+1]*(double)iters) / (1000.0*1000.0*1000.0); } gflop += 
(4.0*(double)MB*(double)C[0]*(double)C[1]*(double)iters) / (1000.0*1000.0*1000.0); printf("GFLOP = %.5g\n", gflop/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", gflop/l_total); printf("PERFDUMP,BP,%s,%i,%i,", LIBXSMM_VERSION, nThreads, MB ); for ( i = 0; i < num_layers; ++i ) { printf("%i,", C[i] ); } printf("%f,%f\n", ((double)(l_total/iters)), gflop/l_total); } for ( i = 0; i < num_layers; ++i ) { /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_scratch( libxsmm_fc_layer[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_fc_layer[i], LIBXSMM_DNN_RELU_MASK ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_fullyconnected( libxsmm_fc_layer[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_release_scratch( libxsmm_opt[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_release_tensor( libxsmm_opt[i], LIBXSMM_DNN_REGULAR_FILTER ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_optimizer_release_tensor( libxsmm_opt[i], LIBXSMM_DNN_GRADIENT_FILTER ) ); 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_optimizer( libxsmm_opt[i] ) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_release_scratch( libxsmm_softmax ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_release_tensor( libxsmm_softmax, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_release_tensor( libxsmm_softmax, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_softmaxloss_release_tensor( libxsmm_softmax, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_softmaxloss( libxsmm_softmax ) ); for ( i = 0; i < num_layers; ++i ) { if ( i == 0 ) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_act[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delact[i] ) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_act[i+1] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delact[i+1] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_fil[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delfil[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delbias[i] ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_relumask[i] ) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_act[num_layers+1] ) ); /* deallocate data */ libxsmm_free(scratch); for ( i = 0; i < num_layers; ++i ) { if ( i == 0 ) { libxsmm_free(act_libxsmm[i]); libxsmm_free(delact_libxsmm[i]); } libxsmm_free(act_libxsmm[i+1]); libxsmm_free(delact_libxsmm[i+1]); libxsmm_free(fil_libxsmm[i]); libxsmm_free(delfil_libxsmm[i]); libxsmm_free(bias_libxsmm[i]); libxsmm_free(delbias_libxsmm[i]); libxsmm_free(relumask_libxsmm[i]); } libxsmm_free(act_libxsmm[num_layers+1]); free( libxsmm_act ); free( libxsmm_delact ); free( libxsmm_fil ); free( libxsmm_delfil ); free( libxsmm_bias ); free( libxsmm_delbias ); free( libxsmm_relumask ); free( libxsmm_fc_layer ); free( libxsmm_opt ); free( act_libxsmm ); free( 
delact_libxsmm ); free( fil_libxsmm ); free( delfil_libxsmm ); free( bias_libxsmm ); free( delbias_libxsmm ); free( relumask_libxsmm ); free( C ); /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/poolingdriver/000077500000000000000000000000001415223013700220775ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/poolingdriver/Makefile000066400000000000000000000066421415223013700235470ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 0 OMP = 1 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst 
%,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/layer_example_f32 $(OUTDIR)/layer_example_bf16 .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) $(OUTDIR)/layer_example_f32: $(OUTDIR)/.make $(BLDDIR)/layer_example_f32-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(BLDDIR)/layer_example_f32-c.o $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(OUTDIR)/layer_example_bf16: $(OUTDIR)/.make $(BLDDIR)/layer_example_bf16-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(BLDDIR)/layer_example_bf16-c.o $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc ./../common/dnn_common.h $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc ./../common/dnn_common.h $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif 
libxsmm-1.17/samples/deeplearning/poolingdriver/layer_example_bf16.c000066400000000000000000000622021415223013700257120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #if defined(_OPENMP) # include #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *naive_input, *naive_output, *naive_delinput, *naive_deloutput; float *naive_input_pad, *naive_output_pad, *naive_delinput_pad, *naive_deloutput_pad; libxsmm_bfloat16 *naive_input_pad_bf16, *naive_output_pad_bf16, *naive_delinput_pad_bf16, *naive_deloutput_pad_bf16; libxsmm_bfloat16 *naive_libxsmm_output, *naive_libxsmm_delinput; float *naive_libxsmm_output_f32, *naive_libxsmm_delinput_f32; int *naive_mask, *naive_libxsmm_mask; libxsmm_bfloat16 *input_libxsmm, *output_libxsmm, *delinput_libxsmm, *deloutput_libxsmm; int *mask_libxsmm; int ifhp, ifwp, ofhp, ofwp, ofh, ofw; int stride_h, stride_w; naive_pooling_t naive_param; void* scratch; size_t scratch_size = 0; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int ifw = 14; /* input width, "W" */ int ifh = 20; /* input height, "H" */ int nImg = 32; /* mini-batch size, "N" */ 
int nFm = 256; /* number of input feature maps, "C" */ int stride = 1; /* stride when accessing inputs */ int kh = 2; /* kernel size height */ int kw = 2; /* kernel size width */ int pad_h = 0; /* pad in h direction */ int pad_w = 0; /* pad in w direction */ int pad_h_in = 0; /* padding mode */ int pad_w_in = 0; /* padding mode */ int pad_h_out = 0; /* padding mode */ int pad_w_out = 0; /* padding mode */ int pool_type = 0; /* max pooling */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */ char format = 'L'; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double gb = 0.0; double gib = 0.0; int i; libxsmm_dnn_pooling_desc pooling_desc; libxsmm_dnn_pooling* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_delinput; libxsmm_dnn_tensor* libxsmm_output; libxsmm_dnn_tensor* libxsmm_deloutput; libxsmm_dnn_tensor* libxsmm_mask; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters inpWidth inpHeight nImg nFm pad_w_in pad_h_in pad_w_out pad_h_out stride type format\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) ifw = atoi(argv[i++]); if (argc > i) ifh = atoi(argv[i++]); if (argc > i) nImg = atoi(argv[i++]); if (argc > i) nFm = atoi(argv[i++]); if (argc > i) kw = atoi(argv[i++]); if (argc > i) kh = atoi(argv[i++]); if (argc > i) pad_w = atoi(argv[i++]); if (argc > i) pad_h = atoi(argv[i++]); 
if (argc > i) pad_w_in = atoi(argv[i++]); if (argc > i) pad_h_in = atoi(argv[i++]); if (argc > i) pad_w_out = atoi(argv[i++]); if (argc > i) pad_h_out = atoi(argv[i++]); if (argc > i) stride = atoi(argv[i++]); if (argc > i) pool_type = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (type != 'A' && type != 'F' && type != 'B') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only)\n"); return 0; } if (pool_type != 0 && pool_type != 1 ) { printf("pool_type needs to be '0' (max), '1' (avg)\n"); return 0; } stride_w = stride; stride_h = stride; /* deriving some values for naive code */ ofh = (ifh + 2 * pad_h - kh)/stride_h + 1; ofw = (ifw + 2 * pad_w - kw)/stride_w + 1; ifhp = ifh + 2 * pad_h_in; ifwp = ifw + 2 * pad_w_in; ofhp = ofh + 2 * pad_h_out; ofwp = ofw + 2 * pad_w_out; /* set struct for naive convolution */ naive_param.N = nImg; naive_param.C = nFm; naive_param.H = ifh; naive_param.W = ifw; naive_param.R = kh; naive_param.S = kw; naive_param.pad_h = pad_h; naive_param.pad_w = pad_w; naive_param.stride_h = stride_h; naive_param.stride_w = stride_w; naive_param.type = pool_type; #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: W:%d H:%d N:%d C:%d P:%d Q:%d STRIDE:%d\n", ifw, ifh, nImg, nFm, ofh, ofw, stride); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf(" InImg %dx%d Padded (%dx%d)\n", ifh, ifw, ifhp, ifwp); printf("OutImg %dx%d Padded (%dx%d)\n", ofh, ofw, ofhp, ofwp); printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nFm*ifhp*ifwp*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Output (MB): %10.2f MiB\n", 
(double)(nImg*nFm*ofhp*ofwp*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nFm*ifhp*ifwp* sizeof(float))/(1024.0*1024.0) ); printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nFm*ofhp*ofwp* sizeof(float))/(1024.0*1024.0) ); #if defined(USE_OVERWRITE) printf("Using Overwrite Option\n"); #endif /* allocate data */ naive_input = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_input_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_delinput = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_delinput_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_mask = (int* )libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); naive_output = (float*)libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); naive_output_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_deloutput = (float*)libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); naive_deloutput_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_input_pad_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); naive_delinput_pad_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); naive_output_pad_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); naive_deloutput_pad_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_output = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_delinput = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); naive_libxsmm_output_f32 = (float*)libxsmm_aligned_malloc( 
nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_libxsmm_delinput_f32 = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_libxsmm_mask = (int* )libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); input_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); delinput_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(libxsmm_bfloat16), 2097152); output_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); deloutput_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(libxsmm_bfloat16), 2097152); mask_libxsmm = (int* )libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); /* initialize data */ init_buf(naive_input, nImg*nFm*ifh*ifw, 0, 0); copy_internal_nchw( naive_input_pad , naive_input, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in); init_buf(naive_delinput, nImg*nFm*ifh*ifw, 0, 0); copy_internal_nchw( naive_delinput_pad , naive_delinput, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in); init_buf(naive_output, nImg*nFm*ofh*ofw, 0, 0); copy_internal_nchw( naive_output_pad , naive_output, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out); init_buf(naive_deloutput, nImg*nFm*ofh*ofw, 0, 0); copy_internal_nchw( naive_deloutput_pad , naive_deloutput, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out); set_zeropad_nchw(naive_input_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_delinput_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_output_pad, nImg, nFm, ofhp, ofwp, pad_h_out, pad_w_out); set_zeropad_nchw(naive_deloutput_pad, nImg, nFm, ofhp, ofwp, pad_h_out, pad_w_out); libxsmm_rne_convert_fp32_bf16( naive_input_pad, naive_input_pad_bf16, nImg*nFm*ifhp*ifwp ); libxsmm_rne_convert_fp32_bf16( naive_delinput_pad, naive_delinput_pad_bf16, nImg*nFm*ifhp*ifwp ); libxsmm_rne_convert_fp32_bf16( naive_output_pad, naive_output_pad_bf16, 
nImg*nFm*ofhp*ofwp ); libxsmm_rne_convert_fp32_bf16( naive_deloutput_pad, naive_deloutput_pad_bf16, nImg*nFm*ofhp*ofwp ); zero_buf_int32(naive_mask, nImg*nFm*ofh*ofw); zero_buf_int32(mask_libxsmm, nImg*nFm*ofh*ofw); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... #\n"); printf("##########################################\n"); if (type == 'A' || type == 'F') { naive_pooling_fp(&naive_param, naive_input, naive_output, naive_mask); } if (type == 'A' || type == 'B') { naive_pooling_bp(&naive_param, naive_delinput, naive_deloutput, naive_mask); } printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } if (format == 'A' || format == 'L') { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ pooling_desc.N = nImg; pooling_desc.C = nFm; pooling_desc.H = ifh; pooling_desc.W = ifw; pooling_desc.u = stride_h; pooling_desc.v = stride_w; pooling_desc.R = kh; pooling_desc.S = kw; pooling_desc.pad_h = pad_h; pooling_desc.pad_w = pad_w; pooling_desc.pad_h_in = pad_h_in; pooling_desc.pad_w_in = pad_w_in; pooling_desc.pad_h_out = pad_h_out; pooling_desc.pad_w_out = pad_w_out; pooling_desc.threads = nThreads; pooling_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16; pooling_desc.datatype_out = LIBXSMM_DNN_DATATYPE_BF16; pooling_desc.datatype_mask = LIBXSMM_DNN_DATATYPE_I32; pooling_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; if ( pool_type == 0 ) { pooling_desc.pooling_type = LIBXSMM_DNN_POOLING_MAX; } else if ( pool_type == 1 ) { pooling_desc.pooling_type = LIBXSMM_DNN_POOLING_AVG; } else { return 0; } libxsmm_handle = libxsmm_dnn_create_pooling( pooling_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers */ libxsmm_layout = 
libxsmm_dnn_pooling_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); printf("inner activation blocking: %i\n", libxsmm_layout->dim_size[0] ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_pooling_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_pooling_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_pooling_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_pooling_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_POOLING_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_mask = libxsmm_dnn_link_tensor( libxsmm_layout, mask_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ /* we can also use the layout functions and set the data on our own external to the library */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input_pad_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_output, 
(void*)naive_output_pad_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delinput, (void*)naive_delinput_pad_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_deloutput, (void*)naive_deloutput_pad_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_bind_tensor( libxsmm_handle, libxsmm_delinput, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_bind_tensor( libxsmm_handle, libxsmm_deloutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_bind_tensor( libxsmm_handle, libxsmm_mask , LIBXSMM_DNN_POOLING_MASK ) ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_pooling_get_scratch_size( libxsmm_handle, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_bind_scratch( libxsmm_handle, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ init_buf( (float*)scratch, scratch_size/4, 0, 0 ); if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, 
(void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); libxsmm_convert_bf16_f32( naive_libxsmm_output, naive_libxsmm_output_f32, nImg*nFm*ofhp*ofwp ); copy_internal_nchw( naive_output_pad, naive_output, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ofhp*ofwp, 1, naive_output_pad, naive_libxsmm_output_f32, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } if ( (type == 'A' || type == 'B') && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput, (void*)naive_libxsmm_delinput, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); libxsmm_convert_bf16_f32( naive_libxsmm_delinput, naive_libxsmm_delinput_f32, nImg*nFm*ifhp*ifwp ); copy_internal_nchw( naive_delinput_pad, naive_delinput, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ifhp*ifwp, 1, naive_delinput_pad, naive_libxsmm_delinput_f32, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", 
norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_pooling_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gb = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1000*1000*1000); gib = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1024*1024*1024); printf("GB = %.5g\n", gb/(double)iters); printf("GiB = %.5g\n", gib/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GB/s = %.5g\n", gb/l_total); printf("GiB/s = %.5g\n", gib/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nFm, ifw, ifh, stride, pad_w_in, pad_h_in, pad_w_out, pad_h_out, ((double)(l_total/iters)), gb/l_total, gib/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } if ( (type == 'A' || type == 'B') ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ 
l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_pooling_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gb = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1000*1000*1000); gib = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1024*1024*1024); printf("GB = %.5g\n", gb/(double)iters); printf("GiB = %.5g\n", gib/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GB/s = %.5g\n", gb/l_total); printf("GiB/s = %.5g\n", gib/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nFm, ifw, ifh, stride, pad_w_in, pad_h_in, pad_w_out, pad_h_out, ((double)(l_total/iters)), gb/l_total, gib/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst, norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_release_scratch( libxsmm_handle ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_release_tensor( libxsmm_handle, LIBXSMM_DNN_POOLING_MASK ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_destroy_tensor( libxsmm_delinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_deloutput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_mask ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_pooling( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(naive_input); libxsmm_free(naive_input_pad); libxsmm_free(naive_mask); libxsmm_free(naive_output); libxsmm_free(naive_output_pad); libxsmm_free(naive_delinput); libxsmm_free(naive_delinput_pad); libxsmm_free(naive_deloutput); libxsmm_free(naive_deloutput_pad); libxsmm_free(naive_libxsmm_output); libxsmm_free(naive_libxsmm_delinput); libxsmm_free(naive_libxsmm_mask); libxsmm_free(input_libxsmm); libxsmm_free(mask_libxsmm); libxsmm_free(output_libxsmm); libxsmm_free(delinput_libxsmm); libxsmm_free(deloutput_libxsmm); libxsmm_free(naive_libxsmm_output_f32); libxsmm_free(naive_libxsmm_delinput_f32); libxsmm_free(naive_input_pad_bf16); libxsmm_free(naive_output_pad_bf16); libxsmm_free(naive_delinput_pad_bf16); libxsmm_free(naive_deloutput_pad_bf16); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/poolingdriver/layer_example_f32.c000066400000000000000000000566351415223013700255630ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #if defined(_OPENMP) # include #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { float *naive_input, *naive_output, *naive_delinput, *naive_deloutput; float *naive_input_pad, *naive_output_pad, *naive_delinput_pad, *naive_deloutput_pad; float *naive_libxsmm_output, *naive_libxsmm_delinput; int *naive_mask, *naive_libxsmm_mask; float *input_libxsmm, *output_libxsmm, *delinput_libxsmm, *deloutput_libxsmm; int *mask_libxsmm; int ifhp, ifwp, ofhp, ofwp, ofh, ofw; int stride_h, stride_w; naive_pooling_t naive_param; void* scratch; size_t scratch_size = 0; /* some parameters we can overwrite via cli, default is some inner layer of overfeat */ int iters = 10; /* repetitions of benchmark */ int ifw = 14; /* input width, "W" */ int ifh = 20; /* input height, "H" */ int nImg = 32; /* mini-batch size, "N" */ int nFm = 256; /* number of input feature maps, "C" */ int stride = 1; /* stride when accessing inputs */ int kh = 2; /* kernel size height */ int kw = 2; /* kernel size width */ int pad_h = 0; /* pad in h direction */ int pad_w = 0; /* pad in w direction */ int pad_h_in = 0; /* padding mode */ int pad_w_in = 0; /* padding mode */ int pad_h_out = 0; /* padding mode */ int pad_w_out = 0; /* padding mode */ int pool_type = 0; /* max pooling */ char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */ char format = 'L'; const char *const env_check = getenv("CHECK"); const double check = 
LIBXSMM_ABS(0 == env_check ? 1 : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double gb = 0.0; double gib = 0.0; int i; libxsmm_dnn_pooling_desc pooling_desc; libxsmm_dnn_pooling* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_delinput; libxsmm_dnn_tensor* libxsmm_output; libxsmm_dnn_tensor* libxsmm_deloutput; libxsmm_dnn_tensor* libxsmm_mask; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("Usage: %s iters inpWidth inpHeight nImg nFm pad_w_in pad_h_in pad_w_out pad_h_out stride type format\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) ifw = atoi(argv[i++]); if (argc > i) ifh = atoi(argv[i++]); if (argc > i) nImg = atoi(argv[i++]); if (argc > i) nFm = atoi(argv[i++]); if (argc > i) kw = atoi(argv[i++]); if (argc > i) kh = atoi(argv[i++]); if (argc > i) pad_w = atoi(argv[i++]); if (argc > i) pad_h = atoi(argv[i++]); if (argc > i) pad_w_in = atoi(argv[i++]); if (argc > i) pad_h_in = atoi(argv[i++]); if (argc > i) pad_w_out = atoi(argv[i++]); if (argc > i) pad_h_out = atoi(argv[i++]); if (argc > i) stride = atoi(argv[i++]); if (argc > i) pool_type = atoi(argv[i++]); if (argc > i) type = *(argv[i++]); if (type != 'A' && type != 'F' && type != 'B') { printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only)\n"); return 0; } if (pool_type != 0 && pool_type != 1 ) { printf("pool_type needs to be '0' (max), '1' (avg)\n"); return 0; } stride_w = stride; stride_h = stride; /* deriving some values 
for naive code */ ofh = (ifh + 2 * pad_h - kh)/stride_h + 1; ofw = (ifw + 2 * pad_w - kw)/stride_w + 1; ifhp = ifh + 2 * pad_h_in; ifwp = ifw + 2 * pad_w_in; ofhp = ofh + 2 * pad_h_out; ofwp = ofw + 2 * pad_w_out; /* set struct for naive convolution */ naive_param.N = nImg; naive_param.C = nFm; naive_param.H = ifh; naive_param.W = ifw; naive_param.R = kh; naive_param.S = kw; naive_param.pad_h = pad_h; naive_param.pad_w = pad_w; naive_param.stride_h = stride_h; naive_param.stride_w = stride_w; naive_param.type = pool_type; #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: W:%d H:%d N:%d C:%d P:%d Q:%d STRIDE:%d\n", ifw, ifh, nImg, nFm, ofh, ofw, stride); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf(" InImg %dx%d Padded (%dx%d)\n", ifh, ifw, ifhp, ifwp); printf("OutImg %dx%d Padded (%dx%d)\n", ofh, ofw, ofhp, ofwp); printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nFm*ifhp*ifwp*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nFm*ofhp*ofwp*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nFm*ifhp*ifwp* sizeof(float))/(1024.0*1024.0) ); printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nFm*ofhp*ofwp* sizeof(float))/(1024.0*1024.0) ); #if defined(USE_OVERWRITE) printf("Using Overwrite Option\n"); #endif /* allocate data */ naive_input = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); naive_input_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_delinput = (float*)libxsmm_aligned_malloc( nImg*nFm*ifh *ifw *sizeof(float), 2097152); 
naive_delinput_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_mask = (int* )libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); naive_output = (float*)libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); naive_output_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_deloutput = (float*)libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); naive_deloutput_pad = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_libxsmm_output = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); naive_libxsmm_delinput = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); naive_libxsmm_mask = (int* )libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); input_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); delinput_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ifhp*ifwp*sizeof(float), 2097152); output_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); deloutput_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nFm*ofhp*ofwp*sizeof(float), 2097152); mask_libxsmm = (int* )libxsmm_aligned_malloc( nImg*nFm*ofh *ofw *sizeof(float), 2097152); /* initialize data */ init_buf(naive_input, nImg*nFm*ifh*ifw, 0, 0); copy_internal_nchw( naive_input_pad , naive_input, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in); init_buf(naive_delinput, nImg*nFm*ifh*ifw, 0, 0); copy_internal_nchw( naive_delinput_pad , naive_delinput, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in); init_buf(naive_output, nImg*nFm*ofh*ofw, 0, 0); copy_internal_nchw( naive_output_pad , naive_output, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out); init_buf(naive_deloutput, nImg*nFm*ofh*ofw, 0, 0); copy_internal_nchw( naive_deloutput_pad , naive_deloutput, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out); set_zeropad_nchw(naive_input_pad, nImg, nFm, ifhp, ifwp, 
pad_h_in, pad_w_in); set_zeropad_nchw(naive_delinput_pad, nImg, nFm, ifhp, ifwp, pad_h_in, pad_w_in); set_zeropad_nchw(naive_output_pad, nImg, nFm, ofhp, ofwp, pad_h_out, pad_w_out); set_zeropad_nchw(naive_deloutput_pad, nImg, nFm, ofhp, ofwp, pad_h_out, pad_w_out); zero_buf_int32(naive_mask, nImg*nFm*ofh*ofw); zero_buf_int32(mask_libxsmm, nImg*nFm*ofh*ofw); zero_buf(input_libxsmm, nImg*nFm*ifhp*ifwp); zero_buf(delinput_libxsmm, nImg*nFm*ifhp*ifwp); zero_buf(output_libxsmm, nImg*nFm*ofhp*ofwp); zero_buf(deloutput_libxsmm, nImg*nFm*ofhp*ofwp); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... #\n"); printf("##########################################\n"); if (type == 'A' || type == 'F') { naive_pooling_fp(&naive_param, naive_input, naive_output, naive_mask); } if (type == 'A' || type == 'B') { naive_pooling_bp(&naive_param, naive_delinput, naive_deloutput, naive_mask); } printf("##########################################\n"); printf("# Computing Reference ... 
done #\n"); printf("##########################################\n"); } if (format == 'A' || format == 'L') { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ pooling_desc.N = nImg; pooling_desc.C = nFm; pooling_desc.H = ifh; pooling_desc.W = ifw; pooling_desc.u = stride_h; pooling_desc.v = stride_w; pooling_desc.R = kh; pooling_desc.S = kw; pooling_desc.pad_h = pad_h; pooling_desc.pad_w = pad_w; pooling_desc.pad_h_in = pad_h_in; pooling_desc.pad_w_in = pad_w_in; pooling_desc.pad_h_out = pad_h_out; pooling_desc.pad_w_out = pad_w_out; pooling_desc.threads = nThreads; pooling_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; pooling_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; pooling_desc.datatype_mask = LIBXSMM_DNN_DATATYPE_I32; pooling_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; if ( pool_type == 0 ) { pooling_desc.pooling_type = LIBXSMM_DNN_POOLING_MAX; } else if ( pool_type == 1 ) { pooling_desc.pooling_type = LIBXSMM_DNN_POOLING_AVG; } else { return 0; } libxsmm_handle = libxsmm_dnn_create_pooling( pooling_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers */ libxsmm_layout = libxsmm_dnn_pooling_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); printf("inner activation blocking: %i\n", libxsmm_layout->dim_size[0] ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_pooling_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_delinput = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = 
libxsmm_dnn_pooling_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_pooling_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_deloutput = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_pooling_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_POOLING_MASK, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_mask = libxsmm_dnn_link_tensor( libxsmm_layout, mask_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ /* we can also use the layout functions and set the data on our own external to the library */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_output, (void*)naive_output_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delinput, (void*)naive_delinput_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_deloutput, (void*)naive_deloutput_pad, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_bind_tensor( libxsmm_handle, libxsmm_delinput, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_pooling_bind_tensor( libxsmm_handle, libxsmm_deloutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_bind_tensor( libxsmm_handle, libxsmm_mask , LIBXSMM_DNN_POOLING_MASK ) ); /* let's allocate and bind scratch */ scratch_size = libxsmm_dnn_pooling_get_scratch_size( libxsmm_handle, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_scratch( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_bind_scratch( libxsmm_handle, scratch ) ); /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */ init_buf( (float*)scratch, scratch_size/4, 0, 0 ); if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); copy_internal_nchw( naive_output_pad, naive_output, nImg, nFm, ofh, ofw, pad_h_out, pad_w_out); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ofhp*ofwp, 1, naive_output_pad, naive_libxsmm_output, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } if ( (type == 'A' || type == 'B') && LIBXSMM_NEQ(0, 
check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolutions */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput, (void*)naive_libxsmm_delinput, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) ); copy_internal_nchw( naive_delinput_pad, naive_delinput, nImg, nFm, ifh, ifw, pad_h_in, pad_w_in); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nFm*ifhp*ifwp, 1, naive_delinput_pad, naive_libxsmm_delinput, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if (type == 'A' || type == 'F') { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_pooling_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gb = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + 
(2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1000*1000*1000); gib = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1024*1024*1024); printf("GB = %.5g\n", gb/(double)iters); printf("GiB = %.5g\n", gib/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GB/s = %.5g\n", gb/l_total); printf("GiB/s = %.5g\n", gib/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nFm, ifw, ifh, stride, pad_w_in, pad_h_in, pad_w_out, pad_h_out, ((double)(l_total/iters)), gb/l_total, gib/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst, norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel); } if ( (type == 'A' || type == 'B') ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM convolution for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_pooling_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); gb = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1000*1000*1000); gib = ((double)nImg*(double)nFm*(((double)ifh*(double)ifw) + (2.0*(double)ofh*(double)ofw))*(double)sizeof(float)*(double)iters) / (1024*1024*1024); printf("GB = %.5g\n", gb/(double)iters); printf("GiB = %.5g\n", gib/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GB/s = %.5g\n", gb/l_total); printf("GiB/s = %.5g\n", gib/l_total); 
printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nFm, ifw, ifh, stride, pad_w_in, pad_h_in, pad_w_out, pad_h_out, ((double)(l_total/iters)), gb/l_total, gib/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst, norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel); } /* clean-up */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_release_scratch( libxsmm_handle ) ); libxsmm_free(scratch); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_pooling_release_tensor( libxsmm_handle, LIBXSMM_DNN_POOLING_MASK ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_deloutput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_mask ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_pooling( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(naive_input); libxsmm_free(naive_input_pad); libxsmm_free(naive_mask); libxsmm_free(naive_output); libxsmm_free(naive_output_pad); libxsmm_free(naive_delinput); libxsmm_free(naive_delinput_pad); libxsmm_free(naive_deloutput); libxsmm_free(naive_deloutput_pad); libxsmm_free(naive_libxsmm_output); libxsmm_free(naive_libxsmm_delinput); libxsmm_free(naive_libxsmm_mask); libxsmm_free(input_libxsmm); libxsmm_free(mask_libxsmm); libxsmm_free(output_libxsmm); libxsmm_free(delinput_libxsmm); libxsmm_free(deloutput_libxsmm); { const char 
*const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/poolingdriver/run_resnet50.sh000077500000000000000000000066271415223013700250020ustar00rootroot00000000000000#!/usr/bin/env bash UNAME=$(command -v uname) SORT=$(command -v sort) GREP=$(command -v grep) CUT=$(command -v cut) WC=$(command -v wc) TR=$(command -v tr) if [ "" = "${CHECK}" ] || [ "0" = "${CHECK}" ]; then if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=64; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1000; fi else # check if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=64; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1; fi fi if [ $# -ne 6 ] then echo "Usage: $(basename $0) mb iters numa (1-mcdram/0-DDR) prec (f32,bf16) TYPE (0-max, 1-avg) PASS ('A'-ALL/'F'-FP/'B'-BP) PAD (0-logical,1-physcial) ; using default values; using default values: 64 1000 1 f32 0 A" MB=${CHECK_DNN_MB} ITERS=${CHECK_DNN_ITERS} NUMA=-1 BIN=f32 TYPE=0 PASS="A" else MB=$1 ITERS=$2 NUMA=$3 BIN=$4 TYPE=$5 PASS=$6 fi if [ "${GREP}" ] && [ "${SORT}" ] && [ "${CUT}" ] && [ "${TR}" ] && [ "${WC}" ]; then if [ "$(command -v lscpu)" ]; then NS=$(lscpu | ${GREP} -m1 "Socket(s)" | ${TR} -d " " | ${CUT} -d: -f2) if [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(lscpu | ${GREP} -m1 "Core(s) per socket" | ${TR} -d " " | ${CUT} -d: -f2))) NT=$((NC*$(lscpu | ${GREP} -m1 "Thread(s) per core" | ${TR} -d " " | ${CUT} -d: -f2))) elif [ -e /proc/cpuinfo ]; then NS=$(${GREP} "physical id" /proc/cpuinfo | ${SORT} -u | ${WC} -l | ${TR} -d " ") if [ "" = "${NS}" ] || [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(${GREP} -m1 
"cpu cores" /proc/cpuinfo | ${TR} -d " " | ${CUT} -d: -f2))) NT=$(${GREP} "core id" /proc/cpuinfo | ${WC} -l | ${TR} -d " ") elif [ "Darwin" = "$(uname)" ]; then NS=$(sysctl hw.packages | ${CUT} -d: -f2 | ${TR} -d " ") NC=$(sysctl hw.physicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") NT=$(sysctl hw.logicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") fi if [ "${NC}" ] && [ "${NT}" ]; then HT=$((NT/NC)) else NS=1 NC=1 NT=1 HT=1 fi if [ "$(command -v numactl)" ]; then NN=$(numactl -H | ${GREP} "available:" | ${CUT} -d' ' -f2) else NN=${NS} fi fi CPUFLAGS=$(if [ "${GREP}" ] && [ "${CUT}" ] && [ -e /proc/cpuinfo ]; then ${GREP} -m1 flags /proc/cpuinfo | ${CUT} -d: -f2- || true; fi) if [ "${GREP}" ] && [ "$(echo "${CPUFLAGS}" | ${GREP} -o avx512er)" ]; then if [ "0" != "$((0>NUMA))" ] && [ "0" != "$((NS #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { /* Arrays related to FWD pass */ float *wgold, *xgoldt, *ugold, *hpgold, *hgoldt, *z1gold, *z2gold, *zgoldt, *bgold, *bmgold; float *w, *xt, *u, *hp, *ht, *htest, *b; /* Arrays related to BWD and UPD pass */ float *djdhgoldt, *deltagoldt, *djdugold, *djdwgold, *djdxgoldt, *djdbgold; float *zigold, *di1gold, *di2gold, *ugoldTp, *wgoldTp, *hgoldTp, *xgoldTp; float *djdht, *djdu, *djdw, *djdxt, *djdb, *djdxtestt, *djdwtest, *djdutest; const char transa = 'N', transb = 'N'; /* no transposes */ const float alpha = 1, beta = 1, beta0 = 0; void *scratch, *internalstate; size_t scratch_size = 0, 
internalstate_size = 0; int iters = 10; /* repetitions of benchmark */ int pass = 0; /* pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD */ int nonlin = 2; /* nonlin=1 denotes ReLU, 2 denotes sigmoid, 3 denotes tanh */ int N = 168; /* size of mini-batch */ int C = 512; /* number of inputs */ int K = 256; /* number of outputs */ int t = 4; /* number of time steps (>= 1) */ int bn = 24; int bc = 64; int bk = 64; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 1/*enable by default*/ : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double flops = 0.0, tempflops = 0.0; const double tflops = 12; /* transcendental flops */ int i, j, it; libxsmm_dnn_rnncell_desc rnncell_desc; libxsmm_dnn_rnncell* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_hidden_state_prev; libxsmm_dnn_tensor* libxsmm_weight; libxsmm_dnn_tensor* libxsmm_recur_weight; libxsmm_dnn_tensor* libxsmm_bias; libxsmm_dnn_tensor* libxsmm_hidden_state; libxsmm_dnn_tensor* libxsmm_dinput; libxsmm_dnn_tensor* libxsmm_dweight; libxsmm_dnn_tensor* libxsmm_drecur_weight; libxsmm_dnn_tensor* libxsmm_dbias; libxsmm_dnn_tensor* libxsmm_dhidden_state; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd_w, norms_upd_u, norms_upd_b, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd_w); libxsmm_matdiff_clear(&norms_upd_u); libxsmm_matdiff_clear(&norms_upd_b); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: ./rnndriver [reps] [pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD] [nonlin: 1--ReLU, 2--sigmoid, 3--tanh] [N] [C] [K] [time_steps > 0]\n\n"); return 0; } 
libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) pass = atoi(argv[i++]); if (argc > i) nonlin= atoi(argv[i++]); if (argc > i) N = atoi(argv[i++]); if (argc > i) C = atoi(argv[i++]); if (argc > i) K = atoi(argv[i++]); if (argc > i) t = atoi(argv[i++]); if (argc > i) bn = atoi(argv[i++]); if (argc > i) bc = atoi(argv[i++]); if (argc > i) bk = atoi(argv[i++]); if (t <= 0) { printf("time_steps %d should be greater than 0\n\n", t); return 0; } if (!(pass == 0 || pass == 1 || pass == 2 || pass == 3 || pass == 4)) { printf("Unknown pass: %d, valid arguments for pass = {0(FWD), 1(BWD), 2(UPD), 3(BWD+UPD)\n\n", pass); return 0; } if (nonlin != 1 && nonlin != 2 && nonlin != 3) { printf("Unsupported non-linear function used [1--ReLU, 2--sigmoid, 3--tanh]\n\n"); return 0; } #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d C:%d K:%d T:%d\n", N, C, K, t); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf("SIZE Weight (MB): %10.2f MiB\n", (double)(C*K*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (MB): %10.2f MiB\n", (double)(N*C*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Hidden State: %10.2f MiB\n", (double)(K*N*sizeof(float))/(1024.0*1024.0) ); /* allocate data */ xgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); hpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); ugold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); bgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); hgoldt = 
(float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); zgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); bmgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); z1gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); z2gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); djdxgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); djdwgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdugold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); djdbgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); djdhgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); deltagoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); zigold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); di1gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); di2gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); xgoldTp = (float*)libxsmm_aligned_malloc(N*C*sizeof(float), 2097152); wgoldTp = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); ugoldTp = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); hgoldTp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); xt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); hp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); w = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); u = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); ht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); b = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); djdxt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); djdw = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdu = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); djdb = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); djdht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); htest = 
(float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); djdxtestt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); djdwtest = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdutest = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); LIBXSMM_VLA_DECL(2, float, xgold, xgoldt, N*C); LIBXSMM_VLA_DECL(2, float, hgold, hgoldt, K*N); LIBXSMM_VLA_DECL(2, float, zgold, zgoldt, K*N); LIBXSMM_VLA_DECL(2, float, djdxgold, djdxgoldt, N*C); LIBXSMM_VLA_DECL(2, float, djdhgold, djdhgoldt, K*N); LIBXSMM_VLA_DECL(2, float, deltagold, deltagoldt, K*N); /* initialize data */ /* All data in gold is considered to be in column-major format */ for (it = 0; it < t; ++it) { init_buf(&LIBXSMM_VLA_ACCESS(2, xgold, it, 0, N*C), N*C, 0, 0); } init_buf(hpgold, N*K, 0, 0); init_buf(wgold, C*K, 0, 0); init_buf(ugold, K*K, 0, 0); init_buf(bgold, K, 0, 0); for (j = 0; j < N; j++) { matrix_copy(K, bgold, &(bmgold[j*K])); } zero_buf(hgoldt, K*N*t); zero_buf(zgoldt, K*N*t); zero_buf(z1gold, K*N); zero_buf(z2gold, K*N); for (it = 0; it < t; ++it) { init_buf(&LIBXSMM_VLA_ACCESS(2, djdhgold, it, 0, K*N), N*K, 0, 0); } zero_buf(djdxgoldt, N*C*t); zero_buf(djdwgold, C*K); zero_buf(djdugold, K*K); zero_buf(djdbgold, K); zero_buf(deltagoldt, K*N*t); zero_buf(zigold, K*N); zero_buf(di1gold, K*N); zero_buf(di2gold, K*N); zero_buf(xgoldTp, N*C); zero_buf(ugoldTp, K*K); zero_buf(wgoldTp, C*K); zero_buf(hgoldTp, K*N); /* first touch LIBXSMM */ zero_buf(xt, N*C*t); zero_buf(hp, K*N); zero_buf(w, C*K); zero_buf(u, K*K); zero_buf(b, K); zero_buf(ht, K*N*t); zero_buf(djdxt,N*C*t); zero_buf(djdw, C*K); zero_buf(djdu, K*K); zero_buf(djdb, K); zero_buf(djdht, K*N*t); LIBXSMM_VLA_DECL(2, float, h, ht, K*N); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... 
#\n"); printf("##########################################\n"); for (i = 0; i < t; ++i) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &C, &alpha, wgold, &K, &LIBXSMM_VLA_ACCESS(2, xgold, i, 0, N*C), &C, &beta0, z1gold, &K); if (0 == i) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, ugold, &K, hpgold, &K, &beta0, z2gold, &K); } else { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, ugold, &K, &LIBXSMM_VLA_ACCESS(2, hgold, i-1, 0, K*N), &K, &beta0, z2gold, &K); } matrix_add(K*N, z1gold, z2gold, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N)); matrix_add(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), bmgold, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N)); if (1 == nonlin) { matrix_relu(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), &LIBXSMM_VLA_ACCESS(2, hgold, i, 0, K*N)); } else if (2 == nonlin) { matrix_sigmoid(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), &LIBXSMM_VLA_ACCESS(2, hgold, i, 0, K*N)); } else { matrix_tanh(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), &LIBXSMM_VLA_ACCESS(2, hgold, i, 0, K*N)); } } /* Conceptually, delta iterates over 0 ... t-1, whereas, djdh and z iterates over 1 ... 
t */ /* Hence these have identical array indices */ if (1 == nonlin) { matrix_relu_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, t-1, 0, K*N), zigold); } else if (2 == nonlin) { matrix_sigmoid_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, t-1, 0, K*N), zigold); } else { matrix_tanh_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, t-1, 0, K*N), zigold); } matrix_eltwise_mult(K*N, zigold, &LIBXSMM_VLA_ACCESS(2, djdhgold, t-1, 0, K*N), &LIBXSMM_VLA_ACCESS(2, deltagold, t-1, 0, K*N)); matrix_transpose(K, K, ugold, ugoldTp); for (i = t-2; i >= 0; --i) { if (1 == nonlin) { matrix_relu_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), zigold); } else if (2 == nonlin) { matrix_sigmoid_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), zigold); } else { matrix_tanh_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), zigold); } LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, ugoldTp, &K, &LIBXSMM_VLA_ACCESS(2, deltagold, i+1, 0, K*N), &K, &beta0, di1gold, &K); matrix_add(K*N, &LIBXSMM_VLA_ACCESS(2, djdhgold, i, 0, K*N), di1gold, di2gold); matrix_eltwise_mult(K*N, zigold, di2gold, &LIBXSMM_VLA_ACCESS(2, deltagold, i, 0, K*N)); } if (pass == 1 || pass == 3) { matrix_transpose(C, K, wgold, wgoldTp); for (i = 0; i < t; ++i) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &C, &N, &K, &alpha, wgoldTp, &C, &LIBXSMM_VLA_ACCESS(2, deltagold, i, 0, K*N), &K, &beta0, &LIBXSMM_VLA_ACCESS(2, djdxgold, i, 0, N*C), &C); } } if (pass == 2 || pass == 3) { for (i = 0; i < t; ++i) { if (0 == i) { matrix_transpose(N, K, hpgold, hgoldTp); } else { matrix_transpose(N, K, &LIBXSMM_VLA_ACCESS(2, hgold, i-1, 0, K*N), hgoldTp); } LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &K, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, deltagold, i, 0, K*N), &K, hgoldTp, &N, &beta, djdugold, &K); matrix_transpose(N, C, &LIBXSMM_VLA_ACCESS(2, xgold, i, 0, N*C), xgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &C, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, deltagold, i, 0, K*N), &K, xgoldTp, &N, 
&beta, djdwgold, &K); for (j = 0; j < K*N; j++) { djdbgold[j%K] += LIBXSMM_VLA_ACCESS(2, deltagold, i, j, K*N); } } } printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } if (1 /* format == 'A' || format == 'L' */) { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); if ( N % bn != 0 ) { bn = N; } if ( C % bc != 0 ) { bc = C; } if ( K % bk != 0 ) { bk = K; } /* setup LIBXSMM handle */ rnncell_desc.threads = nThreads; rnncell_desc.N = N; rnncell_desc.C = C; rnncell_desc.K = K; rnncell_desc.bn = bn; rnncell_desc.bk = bk; rnncell_desc.bc = bc; rnncell_desc.max_T = t; if ( nonlin == 1 ) { rnncell_desc.cell_type = LIBXSMM_DNN_RNNCELL_RNN_RELU; } else if ( nonlin == 2 ) { rnncell_desc.cell_type = LIBXSMM_DNN_RNNCELL_RNN_SIGMOID; } else if ( nonlin == 3 ) { rnncell_desc.cell_type = LIBXSMM_DNN_RNNCELL_RNN_TANH; } else { /* should not happen */ } rnncell_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; rnncell_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; rnncell_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NC; rnncell_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CK; libxsmm_handle = libxsmm_dnn_create_rnncell( rnncell_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, xt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, hp, &status ); 
CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight = libxsmm_dnn_link_tensor( libxsmm_layout, w, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, u, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias = libxsmm_dnn_link_tensor( libxsmm_layout, b, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, ht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = libxsmm_dnn_link_tensor( libxsmm_layout, djdxt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dweight = libxsmm_dnn_link_tensor( libxsmm_layout, djdw, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = 
libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_drecur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, djdu, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dbias = libxsmm_dnn_link_tensor( libxsmm_layout, djdb, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, djdht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ matrix_copy( t*N*C, xgoldt, xt ); matrix_copy( K*N, hpgold, hp ); matrix_copy( C*K, wgold, w ); matrix_copy( K*K, ugold, u ); matrix_copy( K, bgold, b ); matrix_copy( t*K*N, djdhgoldt, djdht ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state_prev, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dweight, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_drecur_weight, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dbias, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); /* let's allocate and bind scratch */ if (pass == 0) { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch ) ); } else { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); } zero_buf( (float*)scratch, scratch_size/4 ); /* let's allocate and bind internalstate */ if (pass == 0) { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = libxsmm_aligned_malloc( internalstate_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, internalstate ) ); } else { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = libxsmm_aligned_malloc( internalstate_size, 
2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, internalstate ) ); } zero_buf( (float*)internalstate, internalstate_size/4 ); if ((pass == 0) && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } matrix_copy( N*K, &LIBXSMM_VLA_ACCESS(2, h, t-1, 0, K*N), htest ); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, K*N, 1, &LIBXSMM_VLA_ACCESS(2, hgold, t-1, 0, K*N), htest, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } else { /* We need to always run FWD pass once to populate zt, ht */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } } if ( (pass == 1) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ matrix_copy(N*C*t, djdxt, djdxtestt); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, djdxgoldt, djdxtestt, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ( (pass == 2) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* copy out data */ matrix_copy(C*K, djdw, djdwtest); matrix_copy(K*K, djdu, djdutest); /* compare */ libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K, 1, djdwgold, djdwtest, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); libxsmm_matdiff(&norms_upd_u, LIBXSMM_DATATYPE_F32, K*K, 1, djdugold, djdutest, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_u.l1_ref); printf("L1 
test : %.25g\n", norms_upd_u.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_u.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_u.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_u.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_u.linf_rel); printf("Check-norm : %.24f\n", norms_upd_u.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_u); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K, 1, djdbgold, djdb, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( (pass == 3) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } /* copy out data */ matrix_copy(N*C*t, djdxt, djdxtestt); matrix_copy(C*K, djdw, djdwtest); matrix_copy(K*K, djdu, djdutest); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, djdxgoldt, djdxtestt, 0, 0); printf("Delta input\n"); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", 
norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K, 1, djdwgold, djdwtest, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); libxsmm_matdiff(&norms_upd_u, LIBXSMM_DATATYPE_F32, K*K, 1, djdugold, djdutest, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_u.l1_ref); printf("L1 test : %.25g\n", norms_upd_u.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_u.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_u.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_u.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_u.linf_rel); printf("Check-norm : %.24f\n", norms_upd_u.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_u); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K, 1, djdbgold, djdb, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( pass == 0 ) { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { 
#if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = ((2.0 * K*N*C) + (2.0 * K*N*K) + (K*N) + (tflops * K*N)) * (double)t * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 1 ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K*K; /* U^T */ flops += (2.0 * K*N*K); /* U^T * delta */ flops += (K*N); /* dJdh + (U^T * delta) */ flops += (tflops * K*N); /* sigma'(Z) */ flops += (K*N); /* sigma'(Z) * (dJdh + (U^T * delta)) */ flops *= t; /* for t time steps */ tempflops = C*K; /* W^T */ tempflops += (2.0 * K*N*C); /* W^T * delta */ tempflops *= t; /* for t time steps of input */ flops += tempflops; flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( 
pass == 2 ) { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K*K; /* U^T */ flops += (2.0 * K*N*K); /* U^T * delta */ flops += (K*N); /* dJdh + (U^T * delta) */ flops += (tflops * K*N); /* sigma'(Z) */ flops += (K*N); /* sigma'(Z) * (dJdh + (U^T * delta)) */ flops *= t; /* for t time steps */ tempflops = K*N; /* h^T */ tempflops += (2.0 * K*N*K); /* delta * h^T */ tempflops *= t; /* for t time steps */ tempflops += (K*K * (t-1)); /* for summation of dJdU */ flops += tempflops; tempflops = N*C; /* x^T */ tempflops += (2.0 * K*N*C); /* delta * x^T */ tempflops *= t; /* for t time steps */ tempflops += (C*K * (t-1)); /* for summation of dJdW */ flops += tempflops; flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 3 ) { printf("##########################################\n"); printf("# Performance - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { 
libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K*K; /* U^T */ flops += (2.0 * K*N*K); /* U^T * delta */ flops += (K*N); /* dJdh + (U^T * delta) */ flops += (tflops * K*N); /* sigma'(Z) */ flops += (K*N); /* sigma'(Z) * (dJdh + (U^T * delta)) */ flops *= t; /* for t time steps */ tempflops = K*N; /* h^T */ tempflops += (2.0 * K*N*K); /* delta * h^T */ tempflops *= t; /* for t time steps */ tempflops += (K*K * (t-1)); /* for summation of dJdU */ flops += tempflops; tempflops = N*C; /* x^T */ tempflops += (2.0 * K*N*C); /* delta * x^T */ tempflops *= t; /* for t time steps */ tempflops += (C*K * (t-1)); /* for summation of dJdW */ flops += tempflops; tempflops = C*K; /* W^T */ tempflops += (2.0 * K*N*C); /* W^T * delta */ tempflops *= t; /* for t time steps of input */ flops += tempflops; flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP+WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 4 ) { printf("#############################################\n"); printf("# Performance - FWD+BWD+UPD (nc-ck Storage) #\n"); printf("#############################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (2.0 * K*N*K); /* U^T * 
delta */ flops += (K*N); /* dJdh + (U^T * delta) */ flops += (tflops * K*N); /* sigma'(Z) */ flops += (K*N); /* sigma'(Z) * (dJdh + (U^T * delta)) */ flops *= t; /* for t time steps */ tempflops = (2.0 * K*N*K); /* delta * h^T */ tempflops *= t; /* for t time steps */ tempflops += (K*K * (t-1)); /* for summation of dJdU */ flops += tempflops; tempflops = (2.0 * K*N*C); /* delta * x^T */ tempflops *= t; /* for t time steps */ tempflops += (C*K * (t-1)); /* for summation of dJdW */ flops += tempflops; tempflops = (2.0 * K*N*C); /* W^T * delta */ tempflops *= t; /* for t time steps of input */ flops += tempflops; flops *= iters; flops += ((2.0 * K*N*C) + (2.0 * K*N*K) + (K*N) + (tflops * K*N)) * (double)t * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp+bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,FP+BP+WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } /* clean-up */ if (pass == 0) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); } else { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); } libxsmm_free(scratch); libxsmm_free(internalstate); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, 
LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_recur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dweight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_drecur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dbias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_rnncell( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(xgoldt); libxsmm_free(hpgold); libxsmm_free(wgold); libxsmm_free(ugold); libxsmm_free(bgold); libxsmm_free(hgoldt); libxsmm_free(zgoldt); libxsmm_free(bmgold); libxsmm_free(z1gold); libxsmm_free(z2gold); libxsmm_free(djdxgoldt); libxsmm_free(djdwgold); libxsmm_free(djdugold); 
libxsmm_free(djdbgold); libxsmm_free(djdhgoldt); libxsmm_free(deltagoldt); libxsmm_free(zigold); libxsmm_free(di1gold); libxsmm_free(di2gold); libxsmm_free(xgoldTp); libxsmm_free(wgoldTp); libxsmm_free(ugoldTp); libxsmm_free(hgoldTp); libxsmm_free(xt); libxsmm_free(hp); libxsmm_free(w); libxsmm_free(u); libxsmm_free(b); libxsmm_free(ht); libxsmm_free(djdxt); libxsmm_free(djdw); libxsmm_free(djdu); libxsmm_free(djdb); libxsmm_free(djdht); libxsmm_free(htest); libxsmm_free(djdxtestt); libxsmm_free(djdwtest); libxsmm_free(djdutest); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/rnndriver/rnndriver_nc_ck_f32.vcxproj000066400000000000000000000551631415223013700264740ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 rnndriver_nc_ck_f32 {9776BA53-EEDA-4A39-89A8-9DAB45F4D82A} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/rnndriver/rnndriver_nc_kcck_f32.c000066400000000000000000001301601415223013700255300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ #include #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { /* Arrays related to FWD pass */ float *wgold, *xgoldt, *ugold, *hpgold, *hgoldt, *z1gold, *z2gold, *zgoldt, *bgold, *bmgold; float *w, *wt, *xt, *u, *ut, *hp, *ht, *htest, *b; /* Arrays related to BWD and UPD pass */ float *djdhgoldt, *deltagoldt, *djdugold, *djdwgold, *djdxgoldt, *djdbgold; float *zigold, *di1gold, *di2gold, *ugoldTp, *wgoldTp, *hgoldTp, *xgoldTp; float *djdht, *djdu, *djdw, *djdxt, *djdb, *djdxtestt, *djdwtest, *djdutest; const char transa = 'N', transb = 'N'; /* no transposes */ const float alpha = 1, beta = 1, beta0 = 0; void *scratch, *internalstate; size_t scratch_size = 0, internalstate_size = 0; int iters = 10; /* repetitions of benchmark */ int pass = 0; /* pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD */ int nonlin = 2; /* nonlin=1 denotes ReLU, 2 denotes sigmoid, 3 denotes tanh */ int N = 128; /* size of mini-batch */ int C = 512; /* number of inputs */ int K = 256; /* number of outputs */ int t = 4; /* number of time steps (> 1) */ int bk = 64; int bn = 64; int bc = 64; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 
1/*enable by default*/ : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double flops = 0.0, tempflops = 0.0; const double tflops = 12; /* transcendental flops */ int i, j, it; libxsmm_dnn_rnncell_desc rnncell_desc; libxsmm_dnn_rnncell* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_hidden_state_prev; libxsmm_dnn_tensor* libxsmm_weight; libxsmm_dnn_tensor* libxsmm_weight_t; libxsmm_dnn_tensor* libxsmm_recur_weight; libxsmm_dnn_tensor* libxsmm_recur_weight_t; libxsmm_dnn_tensor* libxsmm_bias; libxsmm_dnn_tensor* libxsmm_hidden_state; libxsmm_dnn_tensor* libxsmm_dinput; libxsmm_dnn_tensor* libxsmm_dweight; libxsmm_dnn_tensor* libxsmm_drecur_weight; libxsmm_dnn_tensor* libxsmm_dbias; libxsmm_dnn_tensor* libxsmm_dhidden_state; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd_w, norms_upd_u, norms_upd_b, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd_w); libxsmm_matdiff_clear(&norms_upd_u); libxsmm_matdiff_clear(&norms_upd_b); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: ./rnndriver [reps] [pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD] [nonlin: 1--ReLU, 2--sigmoid, 3--tanh] [N] [C] [K] [time_steps > 0]\n\n"); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) pass = atoi(argv[i++]); if (argc > i) nonlin= atoi(argv[i++]); if (argc > i) N = atoi(argv[i++]); if (argc > i) C = atoi(argv[i++]); if (argc > i) K = atoi(argv[i++]); if (argc > i) t = atoi(argv[i++]); if (argc > i) bn = atoi(argv[i++]); if (argc > i) bk = atoi(argv[i++]); if (argc > i) bc = atoi(argv[i++]); 
if (t <= 0) { printf("time_steps %d should be greater than 0\n\n", t); return 0; } if (!(pass == 0 || pass == 1 || pass == 2 || pass == 3 || pass == 4)) { printf("Unknown pass: %d, valid arguments for pass = {0(FWD), 1(BWD), 2(UPD), 3(BWD+UPD)\n\n", pass); return 0; } if (nonlin != 1 && nonlin != 2 && nonlin != 3) { printf("Unsupported non-linear function used [1--ReLU, 2--sigmoid, 3--tanh]\n\n"); return 0; } #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d C:%d K:%d T:%d\n", N, C, K, t); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf("SIZE Weight (MB): %10.2f MiB\n", (double)(C*K*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (MB): %10.2f MiB\n", (double)(N*C*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Hidden State: %10.2f MiB\n", (double)(K*N*sizeof(float))/(1024.0*1024.0) ); /* allocate data */ xgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); hpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); ugold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); bgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); hgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); zgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); bmgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); z1gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); z2gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); djdxgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); djdwgold = 
(float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdugold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); djdbgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); djdhgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); deltagoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); zigold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); di1gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); di2gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); xgoldTp = (float*)libxsmm_aligned_malloc(N*C*sizeof(float), 2097152); wgoldTp = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); ugoldTp = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); hgoldTp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); xt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); hp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); w = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wt = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); u = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); ut = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); ht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); b = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); djdxt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); djdw = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdu = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); djdb = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); djdht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); htest = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); djdxtestt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); djdwtest = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdutest = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); LIBXSMM_VLA_DECL(2, 
float, xgold, xgoldt, N*C); LIBXSMM_VLA_DECL(2, float, hgold, hgoldt, K*N); LIBXSMM_VLA_DECL(2, float, zgold, zgoldt, K*N); LIBXSMM_VLA_DECL(2, float, djdxgold, djdxgoldt, N*C); LIBXSMM_VLA_DECL(2, float, djdhgold, djdhgoldt, K*N); LIBXSMM_VLA_DECL(2, float, deltagold, deltagoldt, K*N); /* initialize data */ /* All data in gold is considered to be in column-major format */ for (it = 0; it < t; ++it) { init_buf(&LIBXSMM_VLA_ACCESS(2, xgold, it, 0, N*C), N*C, 0, 0); } init_buf(hpgold, N*K, 0, 0); init_buf(wgold, C*K, 0, 0); init_buf(ugold, K*K, 0, 0); init_buf(bgold, K, 0, 0); for (j = 0; j < N; j++) { matrix_copy(K, bgold, &(bmgold[j*K])); } zero_buf(hgoldt, K*N*t); zero_buf(zgoldt, K*N*t); zero_buf(z1gold, K*N); zero_buf(z2gold, K*N); for (it = 0; it < t; ++it) { init_buf(&LIBXSMM_VLA_ACCESS(2, djdhgold, it, 0, K*N), N*K, 0, 0); } zero_buf(djdxgoldt, N*C*t); zero_buf(djdwgold, C*K); zero_buf(djdugold, K*K); zero_buf(djdbgold, K); zero_buf(deltagoldt, K*N*t); zero_buf(zigold, K*N); zero_buf(di1gold, K*N); zero_buf(di2gold, K*N); zero_buf(xgoldTp, N*C); zero_buf(ugoldTp, K*K); zero_buf(wgoldTp, C*K); zero_buf(hgoldTp, K*N); /* first touch LIBXSMM */ zero_buf(xt, N*C*t); zero_buf(hp, K*N); zero_buf(w, C*K); zero_buf(u, K*K); zero_buf(wt, C*K); zero_buf(ut, K*K); zero_buf(b, K); zero_buf(ht, K*N*t); zero_buf(djdxt,N*C*t); zero_buf(djdw, C*K); zero_buf(djdu, K*K); zero_buf(djdb, K); zero_buf(djdht, K*N*t); LIBXSMM_VLA_DECL(2, float, h, ht, K*N); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... 
#\n"); printf("##########################################\n"); for (i = 0; i < t; ++i) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &C, &alpha, wgold, &K, &LIBXSMM_VLA_ACCESS(2, xgold, i, 0, N*C), &C, &beta0, z1gold, &K); if (0 == i) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, ugold, &K, hpgold, &K, &beta0, z2gold, &K); } else { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, ugold, &K, &LIBXSMM_VLA_ACCESS(2, hgold, i-1, 0, K*N), &K, &beta0, z2gold, &K); } matrix_add(K*N, z1gold, z2gold, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N)); matrix_add(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), bmgold, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N)); if (1 == nonlin) { matrix_relu(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), &LIBXSMM_VLA_ACCESS(2, hgold, i, 0, K*N)); } else if (2 == nonlin) { matrix_sigmoid(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), &LIBXSMM_VLA_ACCESS(2, hgold, i, 0, K*N)); } else { matrix_tanh(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), &LIBXSMM_VLA_ACCESS(2, hgold, i, 0, K*N)); } } /* Conceptually, delta iterates over 0 ... t-1, whereas, djdh and z iterates over 1 ... 
t */ /* Hence these have identical array indices */ if (1 == nonlin) { matrix_relu_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, t-1, 0, K*N), zigold); } else if (2 == nonlin) { matrix_sigmoid_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, t-1, 0, K*N), zigold); } else { matrix_tanh_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, t-1, 0, K*N), zigold); } matrix_eltwise_mult(K*N, zigold, &LIBXSMM_VLA_ACCESS(2, djdhgold, t-1, 0, K*N), &LIBXSMM_VLA_ACCESS(2, deltagold, t-1, 0, K*N)); matrix_transpose(K, K, ugold, ugoldTp); for (i = t-2; i >= 0; --i) { if (1 == nonlin) { matrix_relu_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), zigold); } else if (2 == nonlin) { matrix_sigmoid_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), zigold); } else { matrix_tanh_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), zigold); } LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, ugoldTp, &K, &LIBXSMM_VLA_ACCESS(2, deltagold, i+1, 0, K*N), &K, &beta0, di1gold, &K); matrix_add(K*N, &LIBXSMM_VLA_ACCESS(2, djdhgold, i, 0, K*N), di1gold, di2gold); matrix_eltwise_mult(K*N, zigold, di2gold, &LIBXSMM_VLA_ACCESS(2, deltagold, i, 0, K*N)); } if (pass == 1 || pass == 3) { matrix_transpose(C, K, wgold, wgoldTp); for (i = 0; i < t; ++i) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &C, &N, &K, &alpha, wgoldTp, &C, &LIBXSMM_VLA_ACCESS(2, deltagold, i, 0, K*N), &K, &beta0, &LIBXSMM_VLA_ACCESS(2, djdxgold, i, 0, N*C), &C); } } if (pass == 2 || pass == 3) { for (i = 0; i < t; ++i) { if (0 == i) { matrix_transpose(N, K, hpgold, hgoldTp); } else { matrix_transpose(N, K, &LIBXSMM_VLA_ACCESS(2, hgold, i-1, 0, K*N), hgoldTp); } LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &K, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, deltagold, i, 0, K*N), &K, hgoldTp, &N, &beta, djdugold, &K); matrix_transpose(N, C, &LIBXSMM_VLA_ACCESS(2, xgold, i, 0, N*C), xgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &C, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, deltagold, i, 0, K*N), &K, xgoldTp, &N, 
&beta, djdwgold, &K); for (j = 0; j < K*N; j++) { djdbgold[j%K] += LIBXSMM_VLA_ACCESS(2, deltagold, i, j, K*N); } } } printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } if (1 /* format == 'A' || format == 'L' */) { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ rnncell_desc.threads = nThreads; rnncell_desc.N = N; rnncell_desc.C = C; rnncell_desc.K = K; rnncell_desc.bn = bn; rnncell_desc.bk = bk; rnncell_desc.bc = bc; rnncell_desc.max_T = t; if ( nonlin == 1 ) { rnncell_desc.cell_type = LIBXSMM_DNN_RNNCELL_RNN_RELU; } else if ( nonlin == 2 ) { rnncell_desc.cell_type = LIBXSMM_DNN_RNNCELL_RNN_SIGMOID; } else if ( nonlin == 3 ) { rnncell_desc.cell_type = LIBXSMM_DNN_RNNCELL_RNN_TANH; } else { /* should not happen */ } rnncell_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; rnncell_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; rnncell_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NC; rnncell_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED; libxsmm_handle = libxsmm_dnn_create_rnncell( rnncell_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, xt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, hp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); 
libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight = libxsmm_dnn_link_tensor( libxsmm_layout, w, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight_t = libxsmm_dnn_link_tensor( libxsmm_layout, wt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, u, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight_t = libxsmm_dnn_link_tensor( libxsmm_layout, ut, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias = libxsmm_dnn_link_tensor( libxsmm_layout, b, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, ht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, 
LIBXSMM_DNN_RNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = libxsmm_dnn_link_tensor( libxsmm_layout, djdxt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dweight = libxsmm_dnn_link_tensor( libxsmm_layout, djdw, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_drecur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, djdu, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dbias = libxsmm_dnn_link_tensor( libxsmm_layout, djdb, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, djdht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ matrix_copy( t*N*C, xgoldt, xt ); matrix_copy( K*N, hpgold, hp ); matrix_copy( K, bgold, b ); matrix_copy_CK_to_KCCK(wgold, w, C, K, bc, bk); matrix_copy_CK_to_KCCK(ugold, u, K, K, bk, bk); matrix_copy_CK_to_CKKC(wgold, wt, C, K, bc, bk); matrix_copy_CK_to_CKKC(ugold, ut, K, K, bk, bk); matrix_copy( t*K*N, djdhgoldt, djdht ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, 
libxsmm_input, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state_prev, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight_t, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight_t, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dweight, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_drecur_weight, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dbias, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); /* let's allocate and bind scratch */ if (pass == 0) { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch ) ); } else { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); } zero_buf( (float*)scratch, scratch_size/4 ); /* let's allocate and bind internalstate */ if (pass == 0) { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = libxsmm_aligned_malloc( internalstate_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, internalstate ) ); } else { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = libxsmm_aligned_malloc( internalstate_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, internalstate ) ); } zero_buf( (float*)internalstate, internalstate_size/4 ); if ((pass == 0) && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } matrix_copy( N*K, &LIBXSMM_VLA_ACCESS(2, h, t-1, 0, K*N), htest ); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, K*N, 1, &LIBXSMM_VLA_ACCESS(2, hgold, t-1, 0, K*N), htest, 0, 0); printf("L1 reference : 
%.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } else { /* We need to always run FWD pass once to populate zt, ht */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } } if ( (pass == 1) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ matrix_copy(N*C*t, djdxt, djdxtestt); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, djdxgoldt, djdxtestt, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ( (pass == 2) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM 
RNN */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* copy out data */ matrix_copy_KCCK_to_CK(djdw, djdwtest, C, K, bc, bk); matrix_copy_KCCK_to_CK(djdu, djdutest, K, K, bk, bk); /* compare */ libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K, 1, djdwgold, djdwtest, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); libxsmm_matdiff(&norms_upd_u, LIBXSMM_DATATYPE_F32, K*K, 1, djdugold, djdutest, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_u.l1_ref); printf("L1 test : %.25g\n", norms_upd_u.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_u.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_u.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_u.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_u.linf_rel); printf("Check-norm : %.24f\n", norms_upd_u.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_u); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K, 1, djdbgold, djdb, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); 
libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( (pass == 3) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } /* copy out data */ matrix_copy(N*C*t, djdxt, djdxtestt); matrix_copy_KCCK_to_CK(djdw, djdwtest, C, K, bc, bk); matrix_copy_KCCK_to_CK(djdu, djdutest, K, K, bk, bk); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, djdxgoldt, djdxtestt, 0, 0); printf("Delta input\n"); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K, 1, djdwgold, djdwtest, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); libxsmm_matdiff(&norms_upd_u, LIBXSMM_DATATYPE_F32, K*K, 1, djdugold, djdutest, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_u.l1_ref); printf("L1 test : %.25g\n", 
norms_upd_u.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_u.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_u.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_u.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_u.linf_rel); printf("Check-norm : %.24f\n", norms_upd_u.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_u); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K, 1, djdbgold, djdb, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( pass == 0 ) { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = ((2.0 * K*N*C) + (2.0 * K*N*K) + (K*N) + (tflops * K*N)) * (double)t * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 1 ) { printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) 
#\n"); printf("##########################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (2.0 * K*N*K); /* U^T * delta */ flops += (K*N); /* dJdh + (U^T * delta) */ flops += (tflops * K*N); /* sigma'(Z) */ flops += (K*N); /* sigma'(Z) * (dJdh + (U^T * delta)) */ flops *= t; /* for t time steps */ tempflops = (2.0 * K*N*C); /* W^T * delta */ tempflops *= t; /* for t time steps of input */ flops += tempflops; flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 2 ) { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K*K; /* U^T */ flops += (2.0 * K*N*K); /* U^T * delta */ flops += (K*N); /* dJdh + (U^T * delta) */ flops += (tflops * K*N); /* sigma'(Z) */ flops += (K*N); /* sigma'(Z) * (dJdh + (U^T * delta)) */ flops *= t; /* for t time steps */ tempflops = 
K*N; /* h^T */ tempflops += (2.0 * K*N*K); /* delta * h^T */ tempflops *= t; /* for t time steps */ tempflops += (K*K * (t-1)); /* for summation of dJdU */ flops += tempflops; tempflops = N*C; /* x^T */ tempflops += (2.0 * K*N*C); /* delta * x^T */ tempflops *= t; /* for t time steps */ tempflops += (C*K * (t-1)); /* for summation of dJdW */ flops += tempflops; flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 3 ) { printf("##########################################\n"); printf("# Performance - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (2.0 * K*N*K); /* U^T * delta */ flops += (K*N); /* dJdh + (U^T * delta) */ flops += (tflops * K*N); /* sigma'(Z) */ flops += (K*N); /* sigma'(Z) * (dJdh + (U^T * delta)) */ flops *= t; /* for t time steps */ tempflops = (2.0 * K*N*K); /* delta * h^T */ tempflops *= t; /* for t time steps */ tempflops += (K*K * (t-1)); /* for summation of dJdU */ flops += tempflops; tempflops = (2.0 * K*N*C); /* delta * x^T */ tempflops *= t; /* for t time steps */ tempflops += (C*K * (t-1)); /* for summation of dJdW */ flops += tempflops; tempflops = (2.0 * K*N*C); /* W^T * delta */ tempflops *= t; /* for t time steps of input */ flops += tempflops; flops *= iters; printf("GFLOP = %.5g\n", 
flops*1e-9/(double)iters); printf("bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP+WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 4 ) { printf("##############################################\n"); printf("# Performance - FWD+BWD+UPD (custom-Storage) #\n"); printf("##############################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = (2.0 * K*N*K); /* U^T * delta */ flops += (K*N); /* dJdh + (U^T * delta) */ flops += (tflops * K*N); /* sigma'(Z) */ flops += (K*N); /* sigma'(Z) * (dJdh + (U^T * delta)) */ flops *= t; /* for t time steps */ tempflops = (2.0 * K*N*K); /* delta * h^T */ tempflops *= t; /* for t time steps */ tempflops += (K*K * (t-1)); /* for summation of dJdU */ flops += tempflops; tempflops = (2.0 * K*N*C); /* delta * x^T */ tempflops *= t; /* for t time steps */ tempflops += (C*K * (t-1)); /* for summation of dJdW */ flops += tempflops; tempflops = (2.0 * K*N*C); /* W^T * delta */ tempflops *= t; /* for t time steps of input */ flops += tempflops; flops *= iters; flops += ((2.0 * K*N*C) + (2.0 * K*N*K) + (K*N) + (tflops * K*N)) * (double)t * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp+bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,FP+BP+WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, 
nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } /* clean-up */ if (pass == 0) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); } else { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); } libxsmm_free(scratch); libxsmm_free(internalstate); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state_prev ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_destroy_tensor( libxsmm_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_recur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dweight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_drecur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dbias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_rnncell( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(xgoldt); libxsmm_free(hpgold); libxsmm_free(wgold); libxsmm_free(ugold); libxsmm_free(bgold); libxsmm_free(hgoldt); libxsmm_free(zgoldt); libxsmm_free(bmgold); libxsmm_free(z1gold); libxsmm_free(z2gold); libxsmm_free(djdxgoldt); libxsmm_free(djdwgold); libxsmm_free(djdugold); libxsmm_free(djdbgold); libxsmm_free(djdhgoldt); libxsmm_free(deltagoldt); libxsmm_free(zigold); libxsmm_free(di1gold); libxsmm_free(di2gold); libxsmm_free(xgoldTp); libxsmm_free(wgoldTp); libxsmm_free(ugoldTp); libxsmm_free(hgoldTp); libxsmm_free(xt); libxsmm_free(hp); libxsmm_free(w); libxsmm_free(u); libxsmm_free(b); libxsmm_free(ht); libxsmm_free(djdxt); libxsmm_free(djdw); libxsmm_free(djdu); libxsmm_free(djdb); libxsmm_free(djdht); libxsmm_free(htest); libxsmm_free(djdxtestt); libxsmm_free(djdwtest); libxsmm_free(djdutest); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 
1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/rnndriver/rnndriver_nc_kcck_f32.vcxproj000066400000000000000000000551671415223013700270160ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 rnndriver_nc_kcck_f32 {48FA5AA0-62D7-462F-B32A-69F8C2DCA062} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/rnndriver/rnndriver_ncnc_kcck_f32.c000066400000000000000000001247531415223013700260640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.) ******************************************************************************/ #include #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif /* include c-based dnn library */ #include "../common/dnn_common.h" #define CHKERR_LIBXSMM_DNN(A) { const int chkerr_libxsmm_dnn_ = A; if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \ fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); global_status = chkerr_libxsmm_dnn_; } \ } int main(int argc, char* argv[]) { /* Arrays related to FWD pass */ float *wgold, *xgoldt, *ugold, *hpgold, *hgoldt, *z1gold, *z2gold, *zgoldt, *bgold, *bmgold; float *w, *wt, *xt, *u, *ut, *hp, *ht, *htest, *h_nc_buf, *b; /* Arrays related to BWD and UPD pass */ float *djdhgoldt, *deltagoldt, *djdugold, *djdwgold, *djdxgoldt, *djdbgold; float *zigold, *di1gold, *di2gold, *ugoldTp, *wgoldTp, *hgoldTp, *xgoldTp; float *djdht, *djdu, *djdw, *djdxt, *djdb, *djdxtestt, *djdwtest, *djdutest; const char transa = 'N', transb = 'N'; /* no transposes */ const float alpha = 1, beta = 1, beta0 = 0; void *scratch, *internalstate; 
size_t scratch_size = 0, internalstate_size = 0; int iters = 10; /* repetitions of benchmark */ int pass = 0; /* pass: 0--FWD, 1--BWD, 2--UPD, 3--BWD+UPD */ int nonlin = 2; /* nonlin=1 denotes ReLU, 2 denotes sigmoid, 3 denotes tanh */ int N = 128; /* size of mini-batch */ int C = 512; /* number of inputs */ int K = 256; /* number of outputs */ int t = 4; /* number of time steps (> 1) */ int bk = 64; int bn = 64; int bc = 64; const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 1/*enable by default*/ : atof(env_check)); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else int nThreads = 1; /* number of threads */ #endif unsigned long long l_start, l_end; double l_total = 0.0; double flops = 0.0, tempflops = 0.0; const double tflops = 12; /* transcendental flops */ int i, j, it; libxsmm_dnn_rnncell_desc rnncell_desc; libxsmm_dnn_rnncell* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_hidden_state_prev; libxsmm_dnn_tensor* libxsmm_weight; libxsmm_dnn_tensor* libxsmm_recur_weight; libxsmm_dnn_tensor* libxsmm_weight_t; libxsmm_dnn_tensor* libxsmm_recur_weight_t; libxsmm_dnn_tensor* libxsmm_bias; libxsmm_dnn_tensor* libxsmm_hidden_state; libxsmm_dnn_tensor* libxsmm_dinput; libxsmm_dnn_tensor* libxsmm_dweight; libxsmm_dnn_tensor* libxsmm_drecur_weight; libxsmm_dnn_tensor* libxsmm_dbias; libxsmm_dnn_tensor* libxsmm_dhidden_state; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS; libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd_w, norms_upd_u, norms_upd_b, diff; libxsmm_matdiff_clear(&norms_fwd); libxsmm_matdiff_clear(&norms_bwd); libxsmm_matdiff_clear(&norms_upd_w); libxsmm_matdiff_clear(&norms_upd_u); libxsmm_matdiff_clear(&norms_upd_b); libxsmm_matdiff_clear(&diff); if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: ./rnndriver [reps] [pass: 0--FWD, 1--BWD, 2--UPD, 
3--BWD+UPD] [nonlin: 1--ReLU, 2--sigmoid, 3--tanh] [N] [C] [K] [time_steps > 0]\n\n"); return 0; } libxsmm_rng_set_seed(1); /* reading new values from cli */ i = 1; if (argc > i) iters = atoi(argv[i++]); if (argc > i) pass = atoi(argv[i++]); if (argc > i) nonlin= atoi(argv[i++]); if (argc > i) N = atoi(argv[i++]); if (argc > i) C = atoi(argv[i++]); if (argc > i) K = atoi(argv[i++]); if (argc > i) t = atoi(argv[i++]); if (argc > i) bn = atoi(argv[i++]); if (argc > i) bk = atoi(argv[i++]); if (argc > i) bc = atoi(argv[i++]); if (t <= 0) { printf("time_steps %d should be greater than 0\n\n", t); return 0; } if (!(pass == 0 || pass == 1 || pass == 2 || pass == 3)) { printf("Unknown pass: %d, valid arguments for pass = {0(FWD), 1(BWD), 2(UPD), 3(BWD+UPD)\n\n", pass); return 0; } if (nonlin != 1 && nonlin != 2 && nonlin != 3) { printf("Unsupported non-linear function used [1--ReLU, 2--sigmoid, 3--tanh]\n\n"); return 0; } #if defined(__SSE3__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); #endif /* print some summary */ printf("##########################################\n"); printf("# Setting Up (Common) #\n"); printf("##########################################\n"); printf("PARAMS: N:%d C:%d K:%d T:%d\n", N, C, K, t); printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n"); printf("SIZE Weight (MB): %10.2f MiB\n", (double)(C*K*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Input (MB): %10.2f MiB\n", (double)(N*C*sizeof(float))/(1024.0*1024.0) ); printf("SIZE Hidden State: %10.2f MiB\n", (double)(K*N*sizeof(float))/(1024.0*1024.0) ); /* allocate data */ xgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); hpgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); wgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); ugold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); 
bgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); hgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); h_nc_buf = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); zgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); bmgold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); z1gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); z2gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); djdxgoldt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); djdwgold = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdugold = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); djdbgold = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); djdhgoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); deltagoldt = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); zigold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); di1gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); di2gold = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); xgoldTp = (float*)libxsmm_aligned_malloc(N*C*sizeof(float), 2097152); wgoldTp = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); ugoldTp = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); hgoldTp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); xt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); hp = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); w = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); wt = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); u = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); ut = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); ht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); b = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); djdxt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); 
djdw = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdu = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); djdb = (float*)libxsmm_aligned_malloc(K*sizeof(float), 2097152); djdht = (float*)libxsmm_aligned_malloc(K*N*t*sizeof(float), 2097152); htest = (float*)libxsmm_aligned_malloc(K*N*sizeof(float), 2097152); djdxtestt = (float*)libxsmm_aligned_malloc(N*C*t*sizeof(float), 2097152); djdwtest = (float*)libxsmm_aligned_malloc(C*K*sizeof(float), 2097152); djdutest = (float*)libxsmm_aligned_malloc(K*K*sizeof(float), 2097152); LIBXSMM_VLA_DECL(2, float, xgold, xgoldt, N*C); LIBXSMM_VLA_DECL(2, float, hgold, hgoldt, K*N); LIBXSMM_VLA_DECL(2, float, h_nc, h_nc_buf, K*N); LIBXSMM_VLA_DECL(2, float, zgold, zgoldt, K*N); LIBXSMM_VLA_DECL(2, float, djdxgold, djdxgoldt, N*C); LIBXSMM_VLA_DECL(2, float, djdhgold, djdhgoldt, K*N); LIBXSMM_VLA_DECL(2, float, deltagold, deltagoldt, K*N); /* initialize data */ /* All data in gold is considered to be in column-major format */ for (it = 0; it < t; ++it) { init_buf(&LIBXSMM_VLA_ACCESS(2, xgold, it, 0, N*C), N*C, 0, 0); } init_buf(hpgold, N*K, 0, 0); init_buf(wgold, C*K, 0, 0); init_buf(ugold, K*K, 0, 0); init_buf(bgold, K, 0, 0); for (j = 0; j < N; j++) { matrix_copy(K, bgold, &(bmgold[j*K])); } zero_buf(hgoldt, K*N*t); zero_buf(zgoldt, K*N*t); zero_buf(z1gold, K*N); zero_buf(z2gold, K*N); for (it = 0; it < t; ++it) { init_buf(&LIBXSMM_VLA_ACCESS(2, djdhgold, it, 0, K*N), N*K, 0, 0); } zero_buf(djdxgoldt, N*C*t); zero_buf(djdwgold, C*K); zero_buf(djdugold, K*K); zero_buf(djdbgold, K); zero_buf(deltagoldt, K*N*t); zero_buf(zigold, K*N); zero_buf(di1gold, K*N); zero_buf(di2gold, K*N); zero_buf(xgoldTp, N*C); zero_buf(ugoldTp, K*K); zero_buf(wgoldTp, C*K); zero_buf(hgoldTp, K*N); /* first touch LIBXSMM */ zero_buf(xt, N*C*t); zero_buf(hp, K*N); zero_buf(w, C*K); zero_buf(u, K*K); zero_buf(wt, C*K); zero_buf(ut, K*K); zero_buf(b, K); zero_buf(ht, K*N*t); zero_buf(djdxt,N*C*t); zero_buf(djdw, C*K); zero_buf(djdu, 
K*K); zero_buf(djdb, K); zero_buf(djdht, K*N*t); if (LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Computing Reference ... #\n"); printf("##########################################\n"); for (i = 0; i < t; ++i) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &C, &alpha, wgold, &K, &LIBXSMM_VLA_ACCESS(2, xgold, i, 0, N*C), &C, &beta0, z1gold, &K); if (0 == i) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, ugold, &K, hpgold, &K, &beta0, z2gold, &K); } else { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, ugold, &K, &LIBXSMM_VLA_ACCESS(2, hgold, i-1, 0, K*N), &K, &beta0, z2gold, &K); } matrix_add(K*N, z1gold, z2gold, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N)); matrix_add(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), bmgold, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N)); if (1 == nonlin) { matrix_relu(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), &LIBXSMM_VLA_ACCESS(2, hgold, i, 0, K*N)); } else if (2 == nonlin) { matrix_sigmoid(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), &LIBXSMM_VLA_ACCESS(2, hgold, i, 0, K*N)); } else { matrix_tanh(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), &LIBXSMM_VLA_ACCESS(2, hgold, i, 0, K*N)); } } /* Conceptually, delta iterates over 0 ... t-1, whereas, djdh and z iterates over 1 ... 
t */ /* Hence these have identical array indices */ if (1 == nonlin) { matrix_relu_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, t-1, 0, K*N), zigold); } else if (2 == nonlin) { matrix_sigmoid_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, t-1, 0, K*N), zigold); } else { matrix_tanh_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, t-1, 0, K*N), zigold); } matrix_eltwise_mult(K*N, zigold, &LIBXSMM_VLA_ACCESS(2, djdhgold, t-1, 0, K*N), &LIBXSMM_VLA_ACCESS(2, deltagold, t-1, 0, K*N)); matrix_transpose(K, K, ugold, ugoldTp); for (i = t-2; i >= 0; --i) { if (1 == nonlin) { matrix_relu_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), zigold); } else if (2 == nonlin) { matrix_sigmoid_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), zigold); } else { matrix_tanh_inverse(K*N, &LIBXSMM_VLA_ACCESS(2, zgold, i, 0, K*N), zigold); } LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &N, &K, &alpha, ugoldTp, &K, &LIBXSMM_VLA_ACCESS(2, deltagold, i+1, 0, K*N), &K, &beta0, di1gold, &K); matrix_add(K*N, &LIBXSMM_VLA_ACCESS(2, djdhgold, i, 0, K*N), di1gold, di2gold); matrix_eltwise_mult(K*N, zigold, di2gold, &LIBXSMM_VLA_ACCESS(2, deltagold, i, 0, K*N)); } if (pass == 1 || pass == 3) { matrix_transpose(C, K, wgold, wgoldTp); for (i = 0; i < t; ++i) { LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &C, &N, &K, &alpha, wgoldTp, &C, &LIBXSMM_VLA_ACCESS(2, deltagold, i, 0, K*N), &K, &beta0, &LIBXSMM_VLA_ACCESS(2, djdxgold, i, 0, N*C), &C); } } if (pass == 2 || pass == 3) { for (i = 0; i < t; ++i) { if (0 == i) { matrix_transpose(N, K, hpgold, hgoldTp); } else { matrix_transpose(N, K, &LIBXSMM_VLA_ACCESS(2, hgold, i-1, 0, K*N), hgoldTp); } LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &K, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, deltagold, i, 0, K*N), &K, hgoldTp, &N, &beta, djdugold, &K); matrix_transpose(N, C, &LIBXSMM_VLA_ACCESS(2, xgold, i, 0, N*C), xgoldTp); LIBXSMM_XBLAS_SYMBOL(float)(&transa, &transb, &K, &C, &N, &alpha, &LIBXSMM_VLA_ACCESS(2, deltagold, i, 0, K*N), &K, xgoldTp, &N, 
&beta, djdwgold, &K); for (j = 0; j < K*N; j++) { djdbgold[j%K] += LIBXSMM_VLA_ACCESS(2, deltagold, i, j, K*N); } } } printf("##########################################\n"); printf("# Computing Reference ... done #\n"); printf("##########################################\n"); } if (1 /* format == 'A' || format == 'L' */) { printf("\n"); printf("##########################################\n"); printf("# Setting Up (custom-Storage) #\n"); printf("##########################################\n"); /* setup LIBXSMM handle */ rnncell_desc.threads = nThreads; rnncell_desc.N = N; rnncell_desc.C = C; rnncell_desc.K = K; rnncell_desc.bn = bn; rnncell_desc.bk = bk; rnncell_desc.bc = bc; rnncell_desc.max_T = t; if ( nonlin == 1 ) { rnncell_desc.cell_type = LIBXSMM_DNN_RNNCELL_RNN_RELU; } else if ( nonlin == 2 ) { rnncell_desc.cell_type = LIBXSMM_DNN_RNNCELL_RNN_SIGMOID; } else if ( nonlin == 3 ) { rnncell_desc.cell_type = LIBXSMM_DNN_RNNCELL_RNN_TANH; } else { /* should not happen */ } rnncell_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; rnncell_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; rnncell_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED; rnncell_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED; libxsmm_handle = libxsmm_dnn_create_rnncell( rnncell_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, xt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, hp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( 
libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight = libxsmm_dnn_link_tensor( libxsmm_layout, w, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_weight_t = libxsmm_dnn_link_tensor( libxsmm_layout, wt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, u, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_recur_weight_t = libxsmm_dnn_link_tensor( libxsmm_layout, ut, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_bias = libxsmm_dnn_link_tensor( libxsmm_layout, b, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_hidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, ht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( 
libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dinput = libxsmm_dnn_link_tensor( libxsmm_layout, djdxt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dweight = libxsmm_dnn_link_tensor( libxsmm_layout, djdw, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_drecur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, djdu, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dbias = libxsmm_dnn_link_tensor( libxsmm_layout, djdb, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dhidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, djdht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* copy in data to LIBXSMM format */ matrix_copy( K, bgold, b ); matrix_copy_NC_to_NCNC(xgoldt, xt, t, N, C, bn, bc); matrix_copy_NC_to_NCNC(hpgold, hp, 1, N, K, bn, bk); matrix_copy_CK_to_KCCK(wgold, w, C, K, bc, bk); matrix_copy_CK_to_KCCK(ugold, u, K, K, bk, bk); matrix_copy_CK_to_CKKC(wgold, wt, C, K, bc, bk); matrix_copy_CK_to_CKKC(ugold, ut, K, K, bk, bk); matrix_copy( t*K*N, djdhgoldt, djdht ); /* bind buffers and filter to handle */ 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state_prev, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight_t, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight_t, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dweight, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_drecur_weight, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dbias, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); /* let's allocate and bind scratch */ if (pass == 0) { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); 
scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch ) ); } else { scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) ); } zero_buf( (float*)scratch, scratch_size/4 ); /* let's allocate and bind internalstate */ if (pass == 0) { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = libxsmm_aligned_malloc( internalstate_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, internalstate ) ); } else { internalstate_size = libxsmm_dnn_rnncell_get_internalstate_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ); CHKERR_LIBXSMM_DNN( status ); internalstate = libxsmm_aligned_malloc( internalstate_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, internalstate ) ); } zero_buf( (float*)internalstate, internalstate_size/4 ); if ((pass == 0) && LIBXSMM_NEQ(0, check)) { printf("##########################################\n"); printf("# Correctness - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } /* Copy out LIBXSMM result to NC format for correctness checking */ matrix_copy_NCNC_to_NC(ht, h_nc_buf, t, N, K, bn, bk); matrix_copy( N*K, 
&LIBXSMM_VLA_ACCESS(2, h_nc, t-1, 0, K*N), htest ); /* compare */ libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, K*N, 1, &LIBXSMM_VLA_ACCESS(2, hgold, t-1, 0, K*N), htest, 0, 0); printf("L1 reference : %.25g\n", norms_fwd.l1_ref); printf("L1 test : %.25g\n", norms_fwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel); printf("Check-norm : %.24f\n", norms_fwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_fwd); } else { /* We need to always run FWD pass once to populate zt, ht */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } } if ( (pass == 1) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) ); } /* copy out data */ matrix_copy(N*C*t, djdxt, djdxtestt); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, djdxgoldt, djdxtestt, 0, 0); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); } if ( (pass == 2) && 
LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) ); } /* copy out data */ matrix_copy(C*K, djdw, djdwtest); matrix_copy(K*K, djdu, djdutest); /* compare */ libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K, 1, djdwgold, djdwtest, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); libxsmm_matdiff(&norms_upd_u, LIBXSMM_DATATYPE_F32, K*K, 1, djdugold, djdutest, 0, 0); printf("Delta recurrent weight\n"); printf("L1 reference : %.25g\n", norms_upd_u.l1_ref); printf("L1 test : %.25g\n", norms_upd_u.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_u.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_u.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_u.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_u.linf_rel); printf("Check-norm : %.24f\n", norms_upd_u.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_u); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K, 1, djdbgold, djdb, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: 
%.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( (pass == 3) && LIBXSMM_NEQ(0, check) ) { printf("##########################################\n"); printf("# Correctness - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN */ #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } /* copy out data */ matrix_copy(N*C*t, djdxt, djdxtestt); matrix_copy(C*K, djdw, djdwtest); matrix_copy(K*K, djdu, djdutest); /* compare */ libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, N*C*t, 1, djdxgoldt, djdxtestt, 0, 0); printf("Delta input\n"); printf("L1 reference : %.25g\n", norms_bwd.l1_ref); printf("L1 test : %.25g\n", norms_bwd.l1_tst); printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs); printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel); printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs); printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel); printf("Check-norm : %.24f\n", norms_bwd.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_bwd); libxsmm_matdiff(&norms_upd_w, LIBXSMM_DATATYPE_F32, C*K, 1, djdwgold, djdwtest, 0, 0); printf("Delta weight\n"); printf("L1 reference : %.25g\n", norms_upd_w.l1_ref); printf("L1 test : %.25g\n", norms_upd_w.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_w.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_w.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_w.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_w.linf_rel); printf("Check-norm : %.24f\n", norms_upd_w.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_w); libxsmm_matdiff(&norms_upd_u, LIBXSMM_DATATYPE_F32, K*K, 1, djdugold, djdutest, 0, 0); printf("Delta recurrent 
weight\n"); printf("L1 reference : %.25g\n", norms_upd_u.l1_ref); printf("L1 test : %.25g\n", norms_upd_u.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_u.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_u.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_u.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_u.linf_rel); printf("Check-norm : %.24f\n", norms_upd_u.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_u); libxsmm_matdiff(&norms_upd_b, LIBXSMM_DATATYPE_F32, K, 1, djdbgold, djdb, 0, 0); printf("Delta bias\n"); printf("L1 reference : %.25g\n", norms_upd_b.l1_ref); printf("L1 test : %.25g\n", norms_upd_b.l1_tst); printf("L2 abs.error : %.24f\n", norms_upd_b.l2_abs); printf("L2 rel.error : %.24f\n", norms_upd_b.l2_rel); printf("Linf abs.error: %.24f\n", norms_upd_b.linf_abs); printf("Linf rel.error: %.24f\n", norms_upd_b.linf_rel); printf("Check-norm : %.24f\n", norms_upd_b.normf_rel); libxsmm_matdiff_reduce(&diff, &norms_upd_b); } if ( pass == 0 ) { printf("##########################################\n"); printf("# Performance - FWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = ((2.0 * K*N*C) + (2.0 * K*N*K) + (K*N) + (tflops * K*N)) * (double)t * (double)iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("fp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 1 ) { 
printf("##########################################\n"); printf("# Performance - BWD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K*K; /* U^T */ flops += (2.0 * K*N*K); /* U^T * delta */ flops += (K*N); /* dJdh + (U^T * delta) */ flops += (tflops * K*N); /* sigma'(Z) */ flops += (K*N); /* sigma'(Z) * (dJdh + (U^T * delta)) */ flops *= t; /* for t time steps */ tempflops = C*K; /* W^T */ tempflops += (2.0 * K*N*C); /* W^T * delta */ tempflops *= t; /* for t time steps of input */ flops += tempflops; flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 2 ) { printf("##########################################\n"); printf("# Performance - UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K*K; /* U^T */ flops += (2.0 * K*N*K); /* U^T * delta */ flops += (K*N); /* dJdh + (U^T * 
delta) */ flops += (tflops * K*N); /* sigma'(Z) */ flops += (K*N); /* sigma'(Z) * (dJdh + (U^T * delta)) */ flops *= t; /* for t time steps */ tempflops = K*N; /* h^T */ tempflops += (2.0 * K*N*K); /* delta * h^T */ tempflops *= t; /* for t time steps */ tempflops += (K*K * (t-1)); /* for summation of dJdU */ flops += tempflops; tempflops = N*C; /* x^T */ tempflops += (2.0 * K*N*C); /* delta * x^T */ tempflops *= t; /* for t time steps */ tempflops += (C*K * (t-1)); /* for summation of dJdW */ flops += tempflops; flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } if ( pass == 3 ) { printf("##########################################\n"); printf("# Performance - BWD+UPD (custom-Storage) #\n"); printf("##########################################\n"); /* run LIBXSMM RNN for performance */ l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < iters; ++i) { libxsmm_dnn_rnncell_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ); } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); flops = K*K; /* U^T */ flops += (2.0 * K*N*K); /* U^T * delta */ flops += (K*N); /* dJdh + (U^T * delta) */ flops += (tflops * K*N); /* sigma'(Z) */ flops += (K*N); /* sigma'(Z) * (dJdh + (U^T * delta)) */ flops *= t; /* for t time steps */ tempflops = K*N; /* h^T */ tempflops += (2.0 * K*N*K); /* delta * h^T */ tempflops *= t; /* for t time steps */ tempflops += (K*K * (t-1)); /* for summation of dJdU */ flops += tempflops; tempflops = N*C; /* x^T */ tempflops += (2.0 * K*N*C); /* delta * x^T */ tempflops *= t; /* for t time steps */ 
tempflops += (C*K * (t-1)); /* for summation of dJdW */ flops += tempflops; tempflops = C*K; /* W^T */ tempflops += (2.0 * K*N*C); /* W^T * delta */ tempflops *= t; /* for t time steps of input */ flops += tempflops; flops *= iters; printf("GFLOP = %.5g\n", flops*1e-9/(double)iters); printf("bp+wu time = %.5g\n", ((double)(l_total/iters))); printf("GFLOPS = %.5g\n", (flops*1e-9)/l_total); printf("PERFDUMP,BP+WU,%s,%i,%i,%i,%i,%i,%.5g,%.5g\n", LIBXSMM_VERSION, nThreads, N, C, K, t, ((double)(l_total/iters)), (flops*1e-9)/l_total); } /* clean-up */ if (pass == 0) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); } else { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_internalstate( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) ); } libxsmm_free(scratch); libxsmm_free(internalstate); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state_prev ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_recur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_bias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_hidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dinput ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dweight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_drecur_weight ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dbias ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dhidden_state ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_rnncell( libxsmm_handle ) ); } /* deallocate data */ libxsmm_free(xgoldt); libxsmm_free(hpgold); libxsmm_free(wgold); libxsmm_free(ugold); libxsmm_free(bgold); libxsmm_free(hgoldt); libxsmm_free(zgoldt); libxsmm_free(bmgold); libxsmm_free(z1gold); libxsmm_free(z2gold); libxsmm_free(djdxgoldt); libxsmm_free(djdwgold); libxsmm_free(djdugold); libxsmm_free(djdbgold); libxsmm_free(djdhgoldt); libxsmm_free(deltagoldt); libxsmm_free(zigold); libxsmm_free(di1gold); libxsmm_free(di2gold); libxsmm_free(xgoldTp); libxsmm_free(wgoldTp); libxsmm_free(ugoldTp); libxsmm_free(hgoldTp); libxsmm_free(xt); libxsmm_free(hp); libxsmm_free(w); libxsmm_free(u); libxsmm_free(b); libxsmm_free(ht); libxsmm_free(djdxt); libxsmm_free(djdw); libxsmm_free(djdu); libxsmm_free(djdb); libxsmm_free(djdht); libxsmm_free(htest); libxsmm_free(djdxtestt); libxsmm_free(djdwtest); 
libxsmm_free(djdutest); { const char *const env_check_scale = getenv("CHECK_SCALE"); const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale)); if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); exit(EXIT_FAILURE); } } /* some empty lines at the end */ printf("\n\n\n"); return global_status; } libxsmm-1.17/samples/deeplearning/rnndriver/rnndriver_ncnc_kcck_f32.vcxproj000066400000000000000000000551731415223013700273340ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 rnndriver_ncnc_kcck_f32 {632FA101-8478-4D32-A1E3-B7A02E6C87DE} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/rnndriver/run_rnncell.sh000077500000000000000000000105311415223013700241050ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.), Kunal Banerjee (Intel Corp.) ############################################################################### set -eo pipefail UNAME=$(command -v uname) SORT=$(command -v sort) GREP=$(command -v grep) CUT=$(command -v cut) WC=$(command -v wc) TR=$(command -v tr) NUMA=-1 if [ "" = "${CHECK}" ] || [ "0" = "${CHECK}" ]; then if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=256; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1000; fi else # check if [ "" = "${CHECK_DNN_MB}" ]; then CHECK_DNN_MB=256; fi if [ "" = "${CHECK_DNN_ITERS}" ]; then CHECK_DNN_ITERS=1; fi fi if [ $# -ne 9 ] then echo "Usage: $(basename $0) format=(nc_ck, nc_kcck, ncnc_kcck) bin=(f32, bf16) iters type=(0-fwd, 1-bwd, 2-upd, 3-bwdupd) act={1-relu, 2-sigmoid, 3-tanh} MB bn bc bk" FORMAT=nc_ck BIN=f32 ITERS=${CHECK_DNN_ITERS} TYPE=0 ACT=1 MB=${CHECK_DNN_MB} BN=32 BC=32 BK=32 else FORMAT=$1 BIN=$2 ITERS=$3 TYPE=$4 ACT=$5 MB=$6 BN=$7 BC=$8 BK=$9 fi if [ "${GREP}" ] && [ "${SORT}" ] && [ "${CUT}" ] && [ "${TR}" ] && [ "${WC}" ]; then if [ "$(command -v lscpu)" ]; then NS=$(lscpu | ${GREP} -m1 "Socket(s)" | ${TR} -d " " | ${CUT} -d: -f2) if [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(lscpu | ${GREP} -m1 "Core(s) per socket" | ${TR} -d " " | ${CUT} -d: -f2))) NT=$((NC*$(lscpu | ${GREP} 
-m1 "Thread(s) per core" | ${TR} -d " " | ${CUT} -d: -f2))) elif [ -e /proc/cpuinfo ]; then NS=$(${GREP} "physical id" /proc/cpuinfo | ${SORT} -u | ${WC} -l | ${TR} -d " ") if [ "" = "${NS}" ] || [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(${GREP} -m1 "cpu cores" /proc/cpuinfo | ${TR} -d " " | ${CUT} -d: -f2))) NT=$(${GREP} "core id" /proc/cpuinfo | ${WC} -l | ${TR} -d " ") elif [ "Darwin" = "$(uname)" ]; then NS=$(sysctl hw.packages | ${CUT} -d: -f2 | ${TR} -d " ") NC=$(sysctl hw.physicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") NT=$(sysctl hw.logicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") fi if [ "${NC}" ] && [ "${NT}" ]; then HT=$((NT/NC)) else NS=1 NC=1 NT=1 HT=1 fi if [ "$(command -v numactl)" ]; then NN=$(numactl -H | ${GREP} "available:" | ${CUT} -d' ' -f2) else NN=${NS} fi fi CPUFLAGS=$(if [ "${GREP}" ] && [ "${CUT}" ] && [ -e /proc/cpuinfo ]; then ${GREP} -m1 flags /proc/cpuinfo | ${CUT} -d: -f2- || true; fi) if [ "${GREP}" ] && [ "$(echo "${CPUFLAGS}" | ${GREP} -o avx512er)" ]; then if [ "0" != "$((0>NUMA))" ] && [ "0" != "$((NS #include #include #include void BlockSpMatStep1(int K, int C, int KB, int CB, unsigned int *colptr, unsigned int *rowidx, unsigned int *b_colptr[], int *nnzb) { int num_blocks = K / KB * C / CB; int blk_idx, i, k; for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { nnzb[blk_idx] = 0; for (i = 0; i <= KB; ++i) { b_colptr[blk_idx][i] = 0; } } for (k = 0; k < K; ++k) { int k_blk_idx = k / KB; int k_blk_offset = k % KB; unsigned colstart = colptr[k]; unsigned colend = colptr[k + 1]; for (i = colstart; i < (int)colend; ++i) { int c = rowidx[i]; int c_blk_idx = c / CB; blk_idx = k_blk_idx * C / CB + c_blk_idx; nnzb[blk_idx]++; b_colptr[blk_idx][k_blk_offset + 1]++; } } for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { for (i = 0; i < KB; ++i) { b_colptr[blk_idx][i + 1] += b_colptr[blk_idx][i]; } } } void BlockSpMatStep2(int K, int C, int KB, int CB, unsigned int *colptr, unsigned int *rowidx, float *values, unsigned int *b_colptr[], unsigned 
int *b_rowidx[], float *b_values[]) { int num_blocks = K / KB * C / CB; int blk_idx, k, i; for (k = 0; k < K; ++k) { int k_blk_idx = k / KB; int k_blk_offset = k % KB; unsigned colstart = colptr[k]; unsigned colend = colptr[k + 1]; for (i = colstart; i < (int)colend; ++i) { int c = rowidx[i]; int c_blk_idx = c / CB; int c_blk_offset = c % CB; blk_idx = k_blk_idx * C / CB + c_blk_idx; b_rowidx[blk_idx][b_colptr[blk_idx][k_blk_offset]] = c_blk_offset; b_values[blk_idx][b_colptr[blk_idx][k_blk_offset]] = values[i]; b_colptr[blk_idx][k_blk_offset]++; } } for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { for (i = KB; i > 0; --i) { b_colptr[blk_idx][i] = b_colptr[blk_idx][i - 1]; } b_colptr[blk_idx][0] = 0; } } int main(int argc, char **argv) { int N = (argc > 1) ? atoi(argv[1]) : 2048; int C = (argc > 2) ? atoi(argv[2]) : 512; int K = (argc > 3) ? atoi(argv[3]) : 512; int NB = (argc > 4) ? atoi(argv[4]) : 32; int CB = (argc > 5) ? atoi(argv[5]) : 32; int KB = (argc > 6) ? atoi(argv[6]) : 32; int nb = (argc > 7) ? atoi(argv[7]) : 16; double sparse_frac = (argc > 8) ? atof(argv[8]) : 0.90; unsigned int REPS = (argc > 9) ? 
atoi(argv[9]) : 10; if (N < NB || K < KB || C < CB || NB < nb || C % CB != 0 || N % NB != 0 || nb % 16 != 0 || NB % nb != 0 || sparse_frac <= 0.0 || sparse_frac >= 1.0 || REPS <= 0) { return -1; } int l_n, l_c, l_nn, l_cc, l_nnn, l_k, l_kk, blk_idx; int i, k, n, c; libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); float *l_A = (float *)libxsmm_aligned_malloc(sizeof(float) * N * C, 64); float *l_B = (float *)libxsmm_aligned_malloc(sizeof(float) * C * K, 64); float *l_C = (float *)libxsmm_aligned_malloc(sizeof(float) * N * K, 64); float *l_C_gold = (float *)libxsmm_aligned_malloc(sizeof(float) * N * K, 64); LIBXSMM_VLA_DECL(5, float, l_p_A, l_A, C / CB, NB / nb, CB, nb); LIBXSMM_VLA_DECL(5, float, l_p_C, l_C, K / KB, NB / nb, KB, nb); LIBXSMM_VLA_DECL(5, float, l_p_C_gold, l_C_gold, K / KB, NB / nb, KB, 16); /* touch A */ for (l_n = 0; l_n < N / NB; ++l_n) { for (l_c = 0; l_c < C / CB; ++l_c) { for (l_nn = 0; l_nn < NB / nb; ++l_nn) { for (l_cc = 0; l_cc < CB; ++l_cc) { for (l_nnn = 0; l_nnn < nb; ++l_nnn) { LIBXSMM_VLA_ACCESS(5, l_p_A, l_n, l_c, l_nn, l_cc, l_nnn, C / CB, NB / nb, CB, nb) = (float)libxsmm_rng_f64(); } } } } } /* touch dense B and init sparse B*/ int nnz = 0; unsigned int *colptr = (unsigned int *)libxsmm_aligned_malloc( (K + 1) * sizeof(unsigned int), 64); colptr[0] = 0; for (l_k = 0; l_k < K; l_k++) { colptr[l_k + 1] = 0; for (l_c = 0; l_c < C; l_c++) { double tmp = libxsmm_rng_f64(); if (tmp < sparse_frac) { tmp = 0.0; } else { nnz++; colptr[l_k + 1]++; } l_B[l_k * C + l_c] = (float)tmp; } } for (l_k = 0; l_k < K; l_k++) { colptr[l_k + 1] += colptr[l_k]; } unsigned int *rowidx = (unsigned int *)libxsmm_aligned_malloc(nnz * sizeof(unsigned int), 64); float *values = (float *)libxsmm_aligned_malloc(nnz * sizeof(float), 64); for (l_k = 0; l_k < K; l_k++) { int offset = colptr[l_k]; for (l_c = 0; l_c < C; l_c++) { if (l_B[l_k * C + l_c] != 0) { rowidx[offset] = l_c; values[offset] = l_B[l_k * C 
+ l_c]; offset++; } } } unsigned num_k_blocks = K / KB; unsigned num_c_blocks = C / CB; int num_blocks = num_k_blocks * num_c_blocks; unsigned int **b_colptr = (unsigned int **)libxsmm_aligned_malloc( num_blocks * sizeof(unsigned int *), 64); unsigned int **b_rowidx = (unsigned int **)libxsmm_aligned_malloc( num_blocks * sizeof(unsigned int *), 64); float **b_values = (float **)libxsmm_aligned_malloc(num_blocks * sizeof(float *), 64); int *nnzb = (int *)libxsmm_aligned_malloc(num_blocks * sizeof(int), 64); for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { b_colptr[blk_idx] = (unsigned int *)libxsmm_aligned_malloc( (KB + 1) * sizeof(unsigned int), 64); } BlockSpMatStep1(K, C, KB, CB, colptr, rowidx, b_colptr, nnzb); for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { b_rowidx[blk_idx] = (unsigned int *)libxsmm_aligned_malloc( nnzb[blk_idx] * sizeof(unsigned int), 64); b_values[blk_idx] = (float *)libxsmm_aligned_malloc(nnzb[blk_idx] * sizeof(float), 64); } BlockSpMatStep2(K, C, KB, CB, colptr, rowidx, values, b_colptr, b_rowidx, b_values); /* touch C */ for (l_n = 0; l_n < N / NB; ++l_n) { for (l_k = 0; l_k < K / KB; ++l_k) { for (l_nn = 0; l_nn < NB / nb; ++l_nn) { for (l_kk = 0; l_kk < KB; ++l_kk) { for (l_nnn = 0; l_nnn < nb; ++l_nnn) { LIBXSMM_VLA_ACCESS(5, l_p_C_gold, l_n, l_k, l_nn, l_kk, l_nnn, K / KB, NB / nb, KB, nb) = 0.0f; LIBXSMM_VLA_ACCESS(5, l_p_C, l_n, l_k, l_nn, l_kk, l_nnn, K / KB, NB / nb, KB, nb) = 0.0f; } } } } } /* dense routine */ for (l_n = 0; l_n < N / NB; ++l_n) { for (l_k = 0; l_k < K / KB; ++l_k) { for (l_c = 0; l_c < C / CB; ++l_c) { for (l_nn = 0; l_nn < NB / nb; ++l_nn) { for (l_kk = 0; l_kk < KB; ++l_kk) { k = l_k * KB + l_kk; for (l_cc = 0; l_cc < CB; ++l_cc) { c = l_c * CB + l_cc; for (l_nnn = 0; l_nnn < nb; ++l_nnn) { LIBXSMM_VLA_ACCESS(5, l_p_C_gold, l_n, l_k, l_nn, l_kk, l_nnn, K / KB, NB / nb, KB, nb) += LIBXSMM_VLA_ACCESS(5, l_p_A, l_n, l_c, l_nn, l_cc, l_nnn, C / CB, NB / nb, CB, nb) * l_B[k * C + c]; } } } } } } } /* FWD 
*/ float alpha = 1.0; float beta = 1.0; libxsmm_descriptor_blob l_xgemm_blob; libxsmm_gemm_descriptor **l_xgemm_desc = (libxsmm_gemm_descriptor **)libxsmm_aligned_malloc( num_blocks * sizeof(libxsmm_gemm_descriptor *), 64); libxsmm_smmfunction *mykernel = (libxsmm_smmfunction *)libxsmm_aligned_malloc( num_blocks * sizeof(libxsmm_smmfunction), 64); for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { l_xgemm_desc[blk_idx] = libxsmm_gemm_descriptor_dinit( &l_xgemm_blob, LIBXSMM_GEMM_PRECISION(float), NB / nb, KB, CB, CB, 0, KB, alpha, beta, flags, prefetch); mykernel[blk_idx] = libxsmm_create_xcsc_soa(l_xgemm_desc[blk_idx], b_colptr[blk_idx], b_rowidx[blk_idx], (const void *)b_values[blk_idx], nb).smm; } #ifdef _OPENMP # pragma omp parallel for LIBXSMM_OPENMP_COLLAPSE(2) private(k,n,c) #endif for (k = 0; k < K / KB; ++k) { for (n = 0; n < N / NB; ++n) { for (c = 0; c < C / CB; ++c) { mykernel[k * C / CB + c](&(l_A[(n * C / CB + c) * CB * NB]), b_values[k * C / CB + c], &(l_C[(n * K / KB + k) * NB * KB])); } } } /* check error */ float l_max_error = 0.0f; for (i = 0; i < N * K; ++i) { if (fabs(l_C[i] - l_C_gold[i]) > l_max_error) { l_max_error = (float)fabs(l_C[i] - l_C_gold[i]); } } printf("max error = %f\n", l_max_error); /* check performace */ unsigned long long l_start = libxsmm_timer_tick(); for (i = 0; i < (int)REPS; ++i) { #ifdef _OPENMP # pragma omp parallel for LIBXSMM_OPENMP_COLLAPSE(2) private(k,n,c) #endif for (k = 0; k < K / KB; ++k) { for (n = 0; n < N / NB; ++n) { for (c = 0; c < C / CB; ++c) { mykernel[k * C / CB + c]( &(l_A[(n * C / CB + c) * CB * NB]), b_values[k * C / CB + c], &(l_C[(n * K / KB + k) * NB * KB])); } } } } unsigned long long l_end = libxsmm_timer_tick(); double l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for sparse (asm)\n", l_total); printf("%f GFLOPS for sparse (asm)\n", ((double)((double)REPS * (double)N * (double)C * (double)K) * 2.0) / (l_total * 1.0e9)); /* clean up */ libxsmm_free(l_A); libxsmm_free(l_B); 
libxsmm_free(l_C); libxsmm_free(l_C_gold); for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { libxsmm_free(b_values[blk_idx]); libxsmm_free(b_colptr[blk_idx]); libxsmm_free(b_rowidx[blk_idx]); } libxsmm_free(b_values); libxsmm_free(b_colptr); libxsmm_free(b_rowidx); return 0; } libxsmm-1.17/samples/deeplearning/sparse_weight_mult/parallel_sparse_weight_B_mult.vcxproj000066400000000000000000000540761415223013700325740ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 parallel_sparse_weight_B_mult 10.0 {2A2FC3A2-54BC-490D-917C-FE7E8A41443A} Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast 
NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 
$(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/sparse_weight_mult/parallel_sparse_weight_C_redmult.c000066400000000000000000000304121415223013700320030ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Xing Liu (Intel Corp.) ******************************************************************************/ #include #include #include #include void BlockSpMatStep1(int K, int C, int KB, int CB, unsigned int *colptr, unsigned int *rowidx, unsigned int *b_colptr[], int *nnzb) { int num_blocks = K / KB * C / CB; int blk_idx, i, k; for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { nnzb[blk_idx] = 0; for (i = 0; i <= KB; ++i) { b_colptr[blk_idx][i] = 0; } } for (k = 0; k < K; ++k) { int k_blk_idx = k / KB; int k_blk_offset = k % KB; unsigned colstart = colptr[k]; unsigned colend = colptr[k + 1]; for (i = colstart; i < colend; ++i) { int c = rowidx[i]; int c_blk_idx = c / CB; int blk_idx = k_blk_idx * C / CB + c_blk_idx; nnzb[blk_idx]++; b_colptr[blk_idx][k_blk_offset + 1]++; } } for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { for (i = 0; i < KB; ++i) { b_colptr[blk_idx][i + 1] += b_colptr[blk_idx][i]; } } } void BlockSpMatStep2(int K, int C, int KB, int CB, unsigned int *colptr, unsigned int *rowidx, float *values, unsigned int *b_colptr[], unsigned int *b_rowidx[], float *b_values[]) { int num_blocks = K / KB * C / CB; int blk_idx, k, i; for (k = 0; k < K; ++k) { int k_blk_idx = k / KB; int k_blk_offset = k % KB; unsigned colstart = colptr[k]; unsigned colend = colptr[k + 1]; for (i = colstart; i < colend; ++i) { int c = rowidx[i]; int c_blk_idx = c / CB; int c_blk_offset = c % CB; int blk_idx = k_blk_idx * C / CB + c_blk_idx; b_rowidx[blk_idx][b_colptr[blk_idx][k_blk_offset]] = c_blk_offset; b_values[blk_idx][b_colptr[blk_idx][k_blk_offset]] = values[i]; b_colptr[blk_idx][k_blk_offset]++; } } for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { for (i = KB; i > 0; --i) { b_colptr[blk_idx][i] = b_colptr[blk_idx][i - 1]; } b_colptr[blk_idx][0] = 0; } } int main(int argc, char **argv) { 
int N = (argc > 1) ? atoi(argv[1]) : 2048; int C = (argc > 2) ? atoi(argv[2]) : 512; int K = (argc > 3) ? atoi(argv[3]) : 512; int NB = (argc > 4) ? atoi(argv[4]) : 32; int CB = (argc > 5) ? atoi(argv[5]) : 32; int KB = (argc > 6) ? atoi(argv[6]) : 32; int nb = (argc > 7) ? atoi(argv[7]) : 16; double sparse_frac = (argc > 8) ? atof(argv[8]) : 0.90; unsigned int REPS = (argc > 9) ? atoi(argv[9]) : 10; if (N < NB || K < KB || C < CB || NB < nb || C % CB != 0 || N % NB != 0 || nb % 16 != 0 || NB % nb != 0 || sparse_frac <= 0.0 || sparse_frac >= 1.0 || REPS <= 0) { return -1; } int l_n, l_c, l_nn, l_cc, l_nnn, l_k, l_kk, blk_idx; int i, k, n, c; libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); float *l_A = (float *)libxsmm_aligned_malloc(sizeof(float) * N * C, 64); float *l_B = (float *)libxsmm_aligned_malloc(sizeof(float) * N * K, 64); float *l_C_gold = (float *)libxsmm_aligned_malloc(sizeof(float) * C * K, 64); LIBXSMM_VLA_DECL(5, float, l_p_A, l_A, C / CB, NB / nb, CB, nb); LIBXSMM_VLA_DECL(5, float, l_p_B, l_B, K / KB, NB / nb, KB, nb); /* touch A */ for (l_n = 0; l_n < N / NB; ++l_n) { for (l_c = 0; l_c < C / CB; ++l_c) { for (l_nn = 0; l_nn < NB / nb; ++l_nn) { for (l_cc = 0; l_cc < CB; ++l_cc) { for (l_nnn = 0; l_nnn < nb; ++l_nnn) { LIBXSMM_VLA_ACCESS(5, l_p_A, l_n, l_c, l_nn, l_cc, l_nnn, C / CB, NB / nb, CB, nb) = (float)libxsmm_rng_f64(); } } } } } /* touch B */ for (l_n = 0; l_n < N / NB; ++l_n) { for (l_k = 0; l_k < K / KB; ++l_k) { for (l_nn = 0; l_nn < NB / nb; ++l_nn) { for (l_kk = 0; l_kk < KB; ++l_kk) { for (l_nnn = 0; l_nnn < nb; ++l_nnn) { LIBXSMM_VLA_ACCESS(5, l_p_B, l_n, l_k, l_nn, l_kk, l_nnn, K / KB, NB / nb, KB, nb) = (float)libxsmm_rng_f64(); } } } } } /* touch C */ for (k = 0; k < K; ++k) { for (c = 0; c < C; ++c) { l_C_gold[k * C + c] = 0.0f; } } /* init sparse C */ int nnz = 0; unsigned int *colptr = (unsigned int *)libxsmm_aligned_malloc( (K + 1) * sizeof(unsigned int), 64); 
colptr[0] = 0; for (l_k = 0; l_k < K; l_k++) { colptr[l_k + 1] = 0; for (l_c = 0; l_c < C; l_c++) { double tmp = libxsmm_rng_f64(); if (tmp < sparse_frac) { tmp = 0.0; } else { nnz++; colptr[l_k + 1]++; l_C_gold[l_k * C + l_c] = tmp; } } } for (l_k = 0; l_k < K; l_k++) { colptr[l_k + 1] += colptr[l_k]; } unsigned int *rowidx = (unsigned int *)libxsmm_aligned_malloc(nnz * sizeof(unsigned int), 64); float *values = (float *)libxsmm_aligned_malloc(nnz * sizeof(float), 64); for (l_k = 0; l_k < K; l_k++) { int offset = colptr[l_k]; for (l_c = 0; l_c < C; l_c++) { if (l_C_gold[l_k * C + l_c] != 0) { rowidx[offset] = l_c; values[offset] = l_C_gold[l_k * C + l_c]; offset++; } } } unsigned num_k_blocks = K / KB; unsigned num_c_blocks = C / CB; int num_blocks = num_k_blocks * num_c_blocks; unsigned int **c_colptr = (unsigned int **)libxsmm_aligned_malloc( num_blocks * sizeof(unsigned int *), 64); unsigned int **c_rowidx = (unsigned int **)libxsmm_aligned_malloc( num_blocks * sizeof(unsigned int *), 64); float **c_values = (float **)libxsmm_aligned_malloc(num_blocks * sizeof(float *), 64); int *nnzb = (int *)libxsmm_aligned_malloc(num_blocks * sizeof(int), 64); for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { c_colptr[blk_idx] = (unsigned int *)libxsmm_aligned_malloc( (KB + 1) * sizeof(unsigned int), 64); } BlockSpMatStep1(K, C, KB, CB, colptr, rowidx, c_colptr, nnzb); for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { c_rowidx[blk_idx] = (unsigned int *)libxsmm_aligned_malloc( nnzb[blk_idx] * sizeof(unsigned int), 64); c_values[blk_idx] = (float *)libxsmm_aligned_malloc(nnzb[blk_idx] * sizeof(float), 64); } BlockSpMatStep2(K, C, KB, CB, colptr, rowidx, values, c_colptr, c_rowidx, c_values); /* dense routine */ for (l_n = 0; l_n < N / NB; ++l_n) { for (l_k = 0; l_k < K / KB; ++l_k) { for (l_c = 0; l_c < C / CB; ++l_c) { for (l_nn = 0; l_nn < NB / nb; ++l_nn) { for (l_kk = 0; l_kk < KB; ++l_kk) { int k = l_k * KB + l_kk; for (l_cc = 0; l_cc < CB; ++l_cc) { int c = l_c * 
CB + l_cc; for (l_nnn = 0; l_nnn < nb; ++l_nnn) { l_C_gold[k * C + c] += LIBXSMM_VLA_ACCESS(5, l_p_A, l_n, l_c, l_nn, l_cc, l_nnn, C / CB, NB / nb, CB, nb) * LIBXSMM_VLA_ACCESS(5, l_p_B, l_n, l_k, l_nn, l_kk, l_nnn, K / KB, NB / nb, KB, nb); } } } } } } } /* UPD */ float alpha = 1.0; float beta = 1.0; libxsmm_descriptor_blob l_xgemm_blob; libxsmm_gemm_descriptor **l_xgemm_desc = (libxsmm_gemm_descriptor **)libxsmm_aligned_malloc( num_blocks * sizeof(libxsmm_gemm_descriptor *), 64); libxsmm_smmfunction *mykernel = (libxsmm_smmfunction *)libxsmm_aligned_malloc( num_blocks * sizeof(libxsmm_smmfunction), 64); for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { l_xgemm_desc[blk_idx] = libxsmm_gemm_descriptor_dinit( &l_xgemm_blob, LIBXSMM_GEMM_PRECISION(float), CB, KB, NB / nb, CB, KB, 0, alpha, beta, flags, prefetch); mykernel[blk_idx] = libxsmm_create_xcsc_soa(l_xgemm_desc[blk_idx], c_colptr[blk_idx], c_rowidx[blk_idx], (const void *)c_values[blk_idx], nb).smm; } #ifdef _OPENMP # pragma omp parallel for LIBXSMM_OPENMP_COLLAPSE(2) private(k,n,c) #endif for (k = 0; k < K / KB; ++k) { for (c = 0; c < C / CB; ++c) { for (n = 0; n < N / NB; ++n) { if (c_values[k * C/CB + c] != NULL) { mykernel[k * C / CB + c](&(l_A[(n * C / CB + c) * CB * NB]), &(l_B[(n * K / KB + k) * KB * NB]), c_values[k * C / CB + c]); } } } } /* check error */ float l_max_error = 0.0f; for (l_k = 0; l_k < K/KB; ++l_k) { for (l_c = 0; l_c < C/CB; ++l_c) { int blk_idx = l_k * C/CB + l_c; for (l_kk = 0; l_kk < KB; ++l_kk) { int colstart = c_colptr[blk_idx][l_kk]; int colend = c_colptr[blk_idx][l_kk + 1]; k = l_k * KB + l_kk; for (i = colstart; i < colend; ++i) { l_cc = c_rowidx[blk_idx][i]; c = l_c * CB + l_cc; float v = c_values[blk_idx][i]; if ( fabs(v - l_C_gold[k * C + c]) > l_max_error ) { l_max_error = (float)fabs(v - l_C_gold[k * C + c]); } } } } } /* check performace */ unsigned long long l_start = libxsmm_timer_tick(); for (i = 0; i < REPS; ++i) { #ifdef _OPENMP # pragma omp parallel for 
LIBXSMM_OPENMP_COLLAPSE(2) private(k,n,c) #endif for (k = 0; k < K / KB; ++k) { for (c = 0; c < C / CB; ++c) { for (n = 0; n < N / NB; ++n) { if (c_values[k * C/CB + c] != NULL) { mykernel[k * C / CB + c](&(l_A[(n * C / CB + c) * CB * NB]), &(l_B[(n * K / KB + k) * KB * NB]), c_values[k * C / CB + c]); } } } } } unsigned long long l_end = libxsmm_timer_tick(); double l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for sparse (asm)\n", l_total); printf("%f GFLOPS for sparse (asm)\n", ((double)((double)REPS * (double)N * (double)C * (double)K) * 2.0) / (l_total * 1.0e9)); fflush(stdout); /* clean up */ libxsmm_free(l_A); libxsmm_free(l_B); libxsmm_free(l_C_gold); for (blk_idx = 0; blk_idx < num_blocks; ++blk_idx) { libxsmm_free(c_values[blk_idx]); libxsmm_free(c_colptr[blk_idx]); libxsmm_free(c_rowidx[blk_idx]); } libxsmm_free(c_values); libxsmm_free(c_colptr); libxsmm_free(c_rowidx); return 0; }libxsmm-1.17/samples/deeplearning/sparse_weight_mult/run.sh000077500000000000000000000033101415223013700242610ustar00rootroot00000000000000#!/usr/bin/env bash N=${1:-160} C=${2:-1024} K=${3:-1024} spar=${4:-0.9} rep=${5:-30} echo "N = " $N echo "C = " $C echo "K = " $K echo "Sparsity = " $spar echo "Repeats = " $rep fwd_max_perf=0 upd_max_perf=0 #for NB in 16 32 80 160 320 for NB in 160 320 do #for nb in 16 32 80 160 320 for nb in 16 32 do if [[ "$nb" -gt "$NB" ]]; then continue fi if [[ "$nb" == "32" && "$NB" == "80" ]]; then continue fi #for KB in 8 16 32 64 128 for KB in 128 do #for CB in 8 16 32 64 128 for CB in 64 128 do echo "NB =" $NB ", nb =" $nb ", CB =" $CB ", KB =" $KB KMP_AFFINITY=compact,granularity=fine,1,28 OMP_NUM_THREADS=28 srun ./parallel_sparse_weight_B_mult $N $C $K $NB $CB $KB $nb $spar $rep 2>&1 1>tmp if [ $? -eq 0 ] then fwd_perf=$(grep "GFLOPS " tmp | awk -F " " '{print $1}') fwd_perf=${fwd_perf%.*} echo " FWD_PERF =" $fwd_perf fwd_max_perf=$(( fwd_perf > fwd_max_perf ? 
fwd_perf : fwd_max_perf )) else echo " FWD Fail" fi rm -f tmp KMP_AFFINITY=compact,granularity=fine,1,28 OMP_NUM_THREADS=28 srun ./parallel_sparse_weight_C_redmult $N $C $K $NB $CB $KB $nb $spar $rep 2>&1 1>tmp if [ $? -eq 0 ] then upd_perf=$(grep "GFLOPS " tmp | awk -F " " '{print $1}') upd_perf=${upd_perf%.*} echo " UPD_PERF =" $upd_perf upd_max_perf=$(( upd_perf > upd_max_perf ? upd_perf : upd_max_perf )) else echo " UPD Fail" fi rm -f tmp done done done done echo "FWD_MAX_PERF =" $fwd_max_perf "GFLOPS" echo "UPD_MAX_PERF =" $upd_max_perf "GFLOPS"libxsmm-1.17/samples/deeplearning/sparse_weight_mult/simple.c000066400000000000000000000240421415223013700245600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include #include int main(int argc, char* argv[]) { int N = ( argc > 1 ) ? atoi(argv[1]) : 64; int C = ( argc > 2 ) ? atoi(argv[2]) : 512; int K = ( argc > 3 ) ? atoi(argv[3]) : 32; double sparse_frac = ( argc > 4 ) ? atof(argv[4]) : 0.90; unsigned int REPS = ( argc > 5 ) ? 
atoi(argv[5]) : 1; unsigned int* l_rowptr = NULL; unsigned int* l_colidx = NULL; float* l_a_de = (float*)libxsmm_aligned_malloc(sizeof(float) * C * K, 64); float* l_a_sp_csr = NULL; float* l_b = (float*)libxsmm_aligned_malloc(sizeof(float) * N * C, 64); float* l_c_gold = (float*)libxsmm_aligned_malloc(sizeof(float) * N * K, 64); float* l_c_asm_csr = (float*)libxsmm_aligned_malloc(sizeof(float) * N * K, 64); float l_max_error = 0.0; int l_i, l_j, l_k, l_jj; unsigned int l_n; LIBXSMM_VLA_DECL(2, float, l_p_a_de, l_a_de, K); LIBXSMM_VLA_DECL(3, float, l_p_b, l_b, N/16, 16); LIBXSMM_VLA_DECL(3, float, l_p_c_asm_csr, l_c_asm_csr, N/16, 16); LIBXSMM_VLA_DECL(3, float, l_p_c_gold, l_c_gold, N/16, 16); unsigned long long l_start, l_end; double l_total; int NB, nb; int nnz = 0; if (argc != 6 && argc != 1) { fprintf( stderr, "arguments failure\n" ); return -1; } if ( N % 64 != 0 ) { fprintf( stderr, "N needs to be disable by 64\n" ); return -1; } NB = N / 16; nb = 16; /* touch B */ for ( l_i = 0; l_i < C; l_i++) { for ( l_j = 0; l_j < NB; l_j++) { for ( l_k = 0; l_k < nb; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_b, l_i, l_j, l_k, NB, nb) = (float)libxsmm_rng_f64(); } } } /* touch dense A */ for ( l_i = 0; l_i < K; l_i++ ) { for ( l_j = 0; l_j < C; l_j++ ) { float tmp = (float)libxsmm_rng_f64(); if ( tmp < sparse_frac ) { tmp = 0; } else { nnz++; } LIBXSMM_VLA_ACCESS(2, l_p_a_de, l_i, l_j, C) = tmp; } } printf("we just generated a %i x %i matrix with %i NZ entries\n", K, C, nnz); /* touch C */ for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < NB; l_j++) { for ( l_k = 0; l_k < nb; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, NB, nb) = 0.f; LIBXSMM_VLA_ACCESS(3, l_p_c_asm_csr, l_i, l_j, l_k, NB, nb) = 0.f; } } } /* create A, csr */ l_rowptr = (unsigned int*) libxsmm_aligned_malloc( (K+1)*sizeof(unsigned int), 64 ); l_colidx = (unsigned int*) libxsmm_aligned_malloc( nnz*sizeof(unsigned int), 64 ); l_a_sp_csr = (float* ) libxsmm_aligned_malloc( nnz*sizeof(float), 64 
); l_k = 0; l_rowptr[K] = nnz; for ( l_i = 0; l_i < K; l_i++ ) { l_rowptr[l_i] = l_k; for ( l_j = 0; l_j < C; l_j++ ) { if ( LIBXSMM_VLA_ACCESS(2, l_p_a_de, l_i, l_j, C) != 0.0 ) { l_colidx[l_k] = l_j; l_a_sp_csr[l_k] = LIBXSMM_VLA_ACCESS(2, l_p_a_de, l_i, l_j, C); l_k++; } } } /* dense routine */ l_start = libxsmm_timer_tick(); #if 1 for ( l_n = 0; l_n < REPS; l_n++) { # pragma omp parallel for private(l_j, l_jj, l_i, l_k) for ( l_j = 0; l_j < K; l_j++) { for ( l_jj = 0; l_jj < C; l_jj++) { for ( l_i = 0; l_i < NB; l_i++) { LIBXSMM_PRAGMA_SIMD for (l_k = 0; l_k < nb; l_k++) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_j, l_i, l_k, NB, nb) += LIBXSMM_VLA_ACCESS(3, l_p_b, l_jj, l_i, l_k, NB, nb) * l_a_de[(l_j*C)+l_jj]; } } } } } #endif l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for dense\n", l_total); printf("%f GFLOPS for dense\n", ((double)((double)REPS * (double)N * (double)C * (double)K) * 2.0) / (l_total * 1.0e9)); /* sparse routine */ l_start = libxsmm_timer_tick(); for ( l_n = 0; l_n < REPS; l_n++) { for ( l_i = 0; l_i < N; l_i+= 64 ) { #if defined(_OPENMP) # pragma omp parallel for private(l_j,l_k) #endif for ( l_k = 0; l_k < K; l_k++) { #if defined(__AVX512F__) __m512 c0 = _mm512_loadu_ps( &l_c_asm_csr[(l_k*N)+l_i ] ); __m512 c1 = _mm512_loadu_ps( &l_c_asm_csr[(l_k*N)+l_i+16] ); __m512 c2 = _mm512_loadu_ps( &l_c_asm_csr[(l_k*N)+l_i+32] ); __m512 c3 = _mm512_loadu_ps( &l_c_asm_csr[(l_k*N)+l_i+48] ); #elif defined(__AVX2__) __m256 c0 = _mm256_loadu_ps( &l_c_asm_csr[(l_k*N)+l_i ] ); __m256 c1 = _mm256_loadu_ps( &l_c_asm_csr[(l_k*N)+l_i+ 8] ); __m256 c2 = _mm256_loadu_ps( &l_c_asm_csr[(l_k*N)+l_i+16] ); __m256 c3 = _mm256_loadu_ps( &l_c_asm_csr[(l_k*N)+l_i+24] ); __m256 c4 = _mm256_loadu_ps( &l_c_asm_csr[(l_k*N)+l_i+32] ); __m256 c5 = _mm256_loadu_ps( &l_c_asm_csr[(l_k*N)+l_i+40] ); __m256 c6 = _mm256_loadu_ps( &l_c_asm_csr[(l_k*N)+l_i+48] ); __m256 c7 = _mm256_loadu_ps( &l_c_asm_csr[(l_k*N)+l_i+56] ); #endif for ( l_j 
= 0; l_j < (int)(l_rowptr[l_k+1] - l_rowptr[l_k]); l_j++) { #if defined(__AVX512F__) c0 = _mm512_fmadd_ps( _mm512_set1_ps( l_a_sp_csr[l_rowptr[l_k] + l_j] ), _mm512_loadu_ps( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i ] ), c0 ); c1 = _mm512_fmadd_ps( _mm512_set1_ps( l_a_sp_csr[l_rowptr[l_k] + l_j] ), _mm512_loadu_ps( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+16] ), c1 ); c2 = _mm512_fmadd_ps( _mm512_set1_ps( l_a_sp_csr[l_rowptr[l_k] + l_j] ), _mm512_loadu_ps( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+32] ), c2 ); c3 = _mm512_fmadd_ps( _mm512_set1_ps( l_a_sp_csr[l_rowptr[l_k] + l_j] ), _mm512_loadu_ps( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+48] ), c3 ); #elif defined(__AVX2__) c0 = _mm256_fmadd_ps( _mm256_set1_ps( l_a_sp_csr[l_rowptr[l_k] + l_j] ), _mm256_loadu_ps( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i ] ), c0 ); c1 = _mm256_fmadd_ps( _mm256_set1_ps( l_a_sp_csr[l_rowptr[l_k] + l_j] ), _mm256_loadu_ps( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+ 8] ), c1 ); c2 = _mm256_fmadd_ps( _mm256_set1_ps( l_a_sp_csr[l_rowptr[l_k] + l_j] ), _mm256_loadu_ps( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+16] ), c2 ); c3 = _mm256_fmadd_ps( _mm256_set1_ps( l_a_sp_csr[l_rowptr[l_k] + l_j] ), _mm256_loadu_ps( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+24] ), c3 ); c4 = _mm256_fmadd_ps( _mm256_set1_ps( l_a_sp_csr[l_rowptr[l_k] + l_j] ), _mm256_loadu_ps( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+32] ), c4 ); c5 = _mm256_fmadd_ps( _mm256_set1_ps( l_a_sp_csr[l_rowptr[l_k] + l_j] ), _mm256_loadu_ps( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+40] ), c5 ); c6 = _mm256_fmadd_ps( _mm256_set1_ps( l_a_sp_csr[l_rowptr[l_k] + l_j] ), _mm256_loadu_ps( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+48] ), c6 ); c7 = _mm256_fmadd_ps( _mm256_set1_ps( l_a_sp_csr[l_rowptr[l_k] + l_j] ), _mm256_loadu_ps( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+56] ), c7 ); #else unsigned int l_ii; LIBXSMM_PRAGMA_SIMD for ( l_ii = 0; l_ii < 64; l_ii++ ) { l_c_asm_csr[(l_k*N)+l_i+l_ii] += 
l_a_sp_csr[l_rowptr[l_k]+l_j] * l_b[(l_colidx[l_rowptr[l_k]+l_j]*N)+l_i+l_ii]; } #endif #if 0 _mm_prefetch( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+ 64], _MM_HINT_T1 ); _mm_prefetch( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+ 80], _MM_HINT_T1 ); _mm_prefetch( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+ 96], _MM_HINT_T1 ); _mm_prefetch( &l_b[(l_colidx[l_rowptr[l_k] + l_j]*N) + l_i+112], _MM_HINT_T1 ); #endif } #if defined(__AVX512F__) _mm512_storeu_ps( &l_c_asm_csr[(l_k*N)+l_i] , c0 ); _mm512_storeu_ps( &l_c_asm_csr[(l_k*N)+l_i+16], c1 ); _mm512_storeu_ps( &l_c_asm_csr[(l_k*N)+l_i+32], c2 ); _mm512_storeu_ps( &l_c_asm_csr[(l_k*N)+l_i+48], c3 ); #elif defined(__AVX2__) _mm256_storeu_ps( &l_c_asm_csr[(l_k*N)+l_i] , c0 ); _mm256_storeu_ps( &l_c_asm_csr[(l_k*N)+l_i+ 8], c1 ); _mm256_storeu_ps( &l_c_asm_csr[(l_k*N)+l_i+16], c2 ); _mm256_storeu_ps( &l_c_asm_csr[(l_k*N)+l_i+24], c3 ); _mm256_storeu_ps( &l_c_asm_csr[(l_k*N)+l_i+32], c4 ); _mm256_storeu_ps( &l_c_asm_csr[(l_k*N)+l_i+40], c5 ); _mm256_storeu_ps( &l_c_asm_csr[(l_k*N)+l_i+48], c6 ); _mm256_storeu_ps( &l_c_asm_csr[(l_k*N)+l_i+56], c7 ); #endif } } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for sparse (asm, csr)\n", l_total); printf("%f GFLOPS for sparse (asm, csr)\n", ((double)((double)REPS * (double)N * (double)C * (double)K) * 2.0) / (l_total * 1.0e9)); /* check for errors */ l_max_error = 0.f; for ( l_i = 0; l_i < NB; l_i++) { for ( l_j = 0; l_j < K; l_j++) { for ( l_k = 0; l_k < nb; l_k++ ) { if (fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_j, l_i, l_k, NB, nb) - LIBXSMM_VLA_ACCESS(3, l_p_c_asm_csr, l_j, l_i, l_k, NB, nb) ) > l_max_error ) { l_max_error = (float)fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_j, l_i, l_k, NB, nb) -LIBXSMM_VLA_ACCESS(3, l_p_c_asm_csr, l_j, l_i, l_k, NB, nb) ); } } } } printf("max error (csr): %f\n", l_max_error); /* free */ libxsmm_free( l_a_de ); libxsmm_free( l_b ); libxsmm_free( l_c_gold ); libxsmm_free( l_c_asm_csr ); 
libxsmm_free( l_a_sp_csr ); libxsmm_free( l_rowptr ); libxsmm_free( l_colidx ); return 0; } libxsmm-1.17/samples/deeplearning/sparse_weight_mult/simple.vcxproj000066400000000000000000000540201415223013700260300ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 simple 10.0 {3F142A69-06DA-4B8B-86D7-49F796A5C458} Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) 
libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/sparse_weight_mult/sparse_weight_A_mult.c000066400000000000000000000143361415223013700274410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include #include #include #include int main(int argc, char* argv[]) { unsigned int N = ( argc > 1 ) ? atoi(argv[1]) : 64; unsigned int C = ( argc > 2 ) ? atoi(argv[2]) : 512; unsigned int K = ( argc > 3 ) ? atoi(argv[3]) : 32; unsigned int nb = ( argc > 4 ) ? atoi(argv[4]) : 16; double sparse_frac = ( argc > 5 ) ? atof(argv[5]) : 0.90; unsigned int REPS = ( argc > 6 ) ? atoi(argv[6]) : 1; const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); const float alpha = 1, beta = 1; unsigned int* l_rowptr = NULL; unsigned int* l_colidx = NULL; float* l_a_de = (float*)libxsmm_aligned_malloc(sizeof(float) * C * K, 64); float* l_a_sp_csr = NULL; float* l_b = (float*)libxsmm_aligned_malloc(sizeof(float) * N * C, 64); float* l_c_gold = (float*)libxsmm_aligned_malloc(sizeof(float) * N * K, 64); float* l_c_asm_csr = (float*)libxsmm_aligned_malloc(sizeof(float) * N * K, 64); float l_max_error = 0.0; unsigned int l_k, l_n; unsigned int l_i, l_j, l_jj; unsigned int NB = N / nb; LIBXSMM_VLA_DECL(2, float, l_p_a_de, l_a_de, C); LIBXSMM_VLA_DECL(3, float, l_p_b, l_b, NB, nb); LIBXSMM_VLA_DECL(3, float, l_p_c_asm_csr, l_c_asm_csr, NB, nb); LIBXSMM_VLA_DECL(3, float, l_p_c_gold, l_c_gold, NB, nb); libxsmm_descriptor_blob l_xgemm_blob; const libxsmm_gemm_descriptor* l_xgemm_desc = 0; LIBXSMM_MMFUNCTION_TYPE(float) mykernel_csr = NULL; unsigned long long l_start, l_end; double l_total; unsigned int nnz = 0; if (argc != 7 && argc != 1) { fprintf( stderr, "arguments failure\n" ); return -1; } if ( N % nb != 0 ) { fprintf( stderr, "N needs to be disable by nb\n" ); return -1; } /* touch B */ for ( l_i = 0; l_i < C; l_i++) { for ( l_j = 0; l_j < NB; l_j++) { for ( l_k = 0; l_k < nb; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_b, l_i, l_j, l_k, NB, nb) = (float)libxsmm_rng_f64(); } } } /* touch dense A */ for ( l_i = 0; l_i < K; l_i++ ) { for ( l_j = 0; l_j < 
C; l_j++ ) { float tmp = (float)libxsmm_rng_f64(); if ( tmp < sparse_frac ) { tmp = 0; } else { nnz++; } LIBXSMM_VLA_ACCESS(2, l_p_a_de, l_i, l_j, C) = tmp; } } printf("we just generated a %i x %i matrix with %i NZ entries\n", K, C, nnz); /* touch C */ for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < NB; l_j++) { for ( l_k = 0; l_k < nb; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, NB, nb) = 0.f; LIBXSMM_VLA_ACCESS(3, l_p_c_asm_csr, l_i, l_j, l_k, NB, nb) = 0.f; } } } /* create B, csr */ l_rowptr = (unsigned int*) libxsmm_aligned_malloc( (K+1)*sizeof(unsigned int), 64 ); l_colidx = (unsigned int*) libxsmm_aligned_malloc( nnz*sizeof(unsigned int), 64 ); l_a_sp_csr = (float* ) libxsmm_aligned_malloc( nnz*sizeof(float), 64 ); l_k = 0; l_rowptr[K] = nnz; for ( l_i = 0; l_i < K; l_i++ ) { l_rowptr[l_i] = l_k; for ( l_j = 0; l_j < C; l_j++ ) { if ( LIBXSMM_VLA_ACCESS(2, l_p_a_de, l_i, l_j, C) != 0.0 ) { l_colidx[l_k] = l_j; l_a_sp_csr[l_k] = LIBXSMM_VLA_ACCESS(2, l_p_a_de, l_i, l_j, C); l_k++; } } } /* dense routine */ l_start = libxsmm_timer_tick(); #if 1 for ( l_n = 0; l_n < REPS; l_n++) { for ( l_i = 0; l_i < NB; l_i++) { for ( l_j = 0; l_j < K; l_j++) { for ( l_jj = 0; l_jj < C; l_jj++) { LIBXSMM_PRAGMA_SIMD for (l_k = 0; l_k < nb; l_k++) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_j, l_i, l_k, NB, nb) += LIBXSMM_VLA_ACCESS(3, l_p_b, l_jj, l_i, l_k, NB, nb) * l_a_de[(l_j*C)+l_jj]; } } } } } #endif l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for dense\n", l_total); printf("%f GFLOPS for dense\n", ((double)((double)REPS * (double)N * (double)C * (double)K) * 2.0) / (l_total * 1.0e9)); l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(float), K, NB, C, 0, NB, NB, alpha, beta, flags, prefetch); /* sparse routine */ mykernel_csr = libxsmm_create_xcsr_soa(l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp_csr, nb).smm; l_start = libxsmm_timer_tick(); for ( l_n = 0; l_n < REPS; 
l_n++) { mykernel_csr( l_a_sp_csr, l_b, l_c_asm_csr ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for sparse (asm, csr)\n", l_total); printf("%f GFLOPS for sparse (asm, csr)\n", ((double)((double)REPS * (double)N * (double)C * (double)K) * 2.0) / (l_total * 1.0e9)); /* check for errors */ l_max_error = 0.f; for ( l_i = 0; l_i < NB; l_i++) { for ( l_j = 0; l_j < K; l_j++) { for ( l_k = 0; l_k < nb; l_k++ ) { if (fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_j, l_i, l_k, NB, nb) - LIBXSMM_VLA_ACCESS(3, l_p_c_asm_csr, l_j, l_i, l_k, NB, nb) ) > l_max_error ) { l_max_error = (float)fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_j, l_i, l_k, NB, nb) -LIBXSMM_VLA_ACCESS(3, l_p_c_asm_csr, l_j, l_i, l_k, NB, nb) ); } } } } printf("max error (csr): %f\n", l_max_error); /* free */ libxsmm_free( l_a_de ); libxsmm_free( l_b ); libxsmm_free( l_c_gold ); libxsmm_free( l_c_asm_csr ); libxsmm_free( l_a_sp_csr ); libxsmm_free( l_rowptr ); libxsmm_free( l_colidx ); return 0; } libxsmm-1.17/samples/deeplearning/sparse_weight_mult/sparse_weight_A_mult.vcxproj000066400000000000000000000540541415223013700307130ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 sparse_weight_A_mult 10.0 {3F44615A-AF78-4576-80F9-F0B162B9DE46} Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true 
MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console 
$(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/sparse_weight_mult/sparse_weight_B_mult.c000066400000000000000000000204101415223013700274300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include int main(int argc, char* argv[]) { unsigned int N = ( argc > 1 ) ? atoi(argv[1]) : 64; unsigned int C = ( argc > 2 ) ? atoi(argv[2]) : 512; unsigned int K = ( argc > 3 ) ? atoi(argv[3]) : 32; unsigned int nb = ( argc > 4 ) ? atoi(argv[4]) : 16; double sparse_frac = ( argc > 5 ) ? atof(argv[5]) : 0.90; unsigned int REPS = ( argc > 6 ) ? 
atoi(argv[6]) : 1; const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); const float alpha = 1, beta = 1; unsigned int* l_colptr = NULL; unsigned int* l_rowidx = NULL; unsigned int* l_rowptr = NULL; unsigned int* l_colidx = NULL; float* l_b_de = (float*)libxsmm_aligned_malloc(sizeof(float) * C * K, 64); float* l_b_sp_csc = NULL; float* l_b_sp_csr = NULL; float* l_a = (float*)libxsmm_aligned_malloc(sizeof(float) * N * C, 64); float* l_c_gold = (float*)libxsmm_aligned_malloc(sizeof(float) * N * K, 64); float* l_c_asm_csc = (float*)libxsmm_aligned_malloc(sizeof(float) * N * K, 64); float* l_c_asm_csr = (float*)libxsmm_aligned_malloc(sizeof(float) * N * K, 64); float l_max_error = 0.0; unsigned int l_k, l_n; unsigned int l_i, l_j, l_jj; LIBXSMM_VLA_DECL(2, float, l_p_b_de, l_b_de, C); LIBXSMM_VLA_DECL(3, float, l_p_a, l_a, C, nb); LIBXSMM_VLA_DECL(3, float, l_p_c_asm_csc, l_c_asm_csc, K, nb); LIBXSMM_VLA_DECL(3, float, l_p_c_asm_csr, l_c_asm_csr, K, nb); LIBXSMM_VLA_DECL(3, float, l_p_c_gold, l_c_gold, K, nb); libxsmm_descriptor_blob l_xgemm_blob; const libxsmm_gemm_descriptor* l_xgemm_desc = 0; LIBXSMM_MMFUNCTION_TYPE(float) mykernel_csc = NULL; LIBXSMM_MMFUNCTION_TYPE(float) mykernel_csr = NULL; unsigned long long l_start, l_end; double l_total; unsigned int NB; unsigned int nnz = 0; if (argc != 7 && argc != 1) { fprintf( stderr, "arguments failure\n" ); return -1; } NB = N / nb; /* touch A */ for ( l_i = 0; l_i < NB; l_i++) { for ( l_j = 0; l_j < C; l_j++) { for ( l_k = 0; l_k < nb; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_a, l_i, l_j, l_k, C, nb) = (float)libxsmm_rng_f64(); } } } /* touch dense B */ for ( l_i = 0; l_i < K; l_i++ ) { for ( l_j = 0; l_j < C; l_j++ ) { float tmp = (float)libxsmm_rng_f64(); if ( tmp < sparse_frac ) { tmp = 0; } else { nnz++; } LIBXSMM_VLA_ACCESS(2, l_p_b_de, l_i, l_j, C) = tmp; } } printf("we just generated a %i x %i matrix with %i NZ entries\n", K, C, nnz); /* touch C */ for 
( l_i = 0; l_i < NB; l_i++) { for ( l_j = 0; l_j < K; l_j++) { for ( l_k = 0; l_k < nb; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, K, nb) = 0.f; LIBXSMM_VLA_ACCESS(3, l_p_c_asm_csc, l_i, l_j, l_k, K, nb) = 0.f; LIBXSMM_VLA_ACCESS(3, l_p_c_asm_csr, l_i, l_j, l_k, K, nb) = 0.f; } } } /* create B, csc */ l_colptr = (unsigned int*) libxsmm_aligned_malloc( (K+1)*sizeof(unsigned int), 64 ); l_rowidx = (unsigned int*) libxsmm_aligned_malloc( nnz*sizeof(unsigned int), 64 ); l_b_sp_csc = (float* ) libxsmm_aligned_malloc( nnz*sizeof(float), 64 ); l_k = 0; l_colptr[K] = nnz; for ( l_i = 0; l_i < K; l_i++ ) { l_colptr[l_i] = l_k; for ( l_j = 0; l_j < C; l_j++ ) { if ( LIBXSMM_VLA_ACCESS(2, l_p_b_de, l_i, l_j, C) != 0.0 ) { l_rowidx[l_k] = l_j; l_b_sp_csc[l_k] = LIBXSMM_VLA_ACCESS(2, l_p_b_de, l_i, l_j, C); l_k++; } } } /* create B, csr */ l_rowptr = (unsigned int*) libxsmm_aligned_malloc( (C+1)*sizeof(unsigned int), 64 ); l_colidx = (unsigned int*) libxsmm_aligned_malloc( nnz*sizeof(unsigned int), 64 ); l_b_sp_csr = (float* ) libxsmm_aligned_malloc( nnz*sizeof(float), 64 ); l_k = 0; l_rowptr[C] = nnz; for ( l_j = 0; l_j < C; l_j++ ) { l_rowptr[l_j] = l_k; for ( l_i = 0; l_i < K; l_i++ ) { if ( LIBXSMM_VLA_ACCESS(2, l_p_b_de, l_i, l_j, C) != 0.0 ) { l_colidx[l_k] = l_i; l_b_sp_csr[l_k] = LIBXSMM_VLA_ACCESS(2, l_p_b_de, l_i, l_j, C); l_k++; } } } /* dense routine */ l_start = libxsmm_timer_tick(); #if 1 for ( l_n = 0; l_n < REPS; l_n++) { for ( l_i = 0; l_i < NB; l_i++) { for ( l_j = 0; l_j < K; l_j++) { for ( l_jj = 0; l_jj < C; l_jj++) { LIBXSMM_PRAGMA_SIMD for (l_k = 0; l_k < nb; l_k++) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, K, nb) += LIBXSMM_VLA_ACCESS(3, l_p_a, l_i, l_jj, l_k, C, nb) * l_b_de[(l_j*C)+l_jj]; } } } } } #endif l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for dense\n", l_total); printf("%f GFLOPS for dense\n", ((double)((double)REPS * (double)N * (double)C * (double)K) * 2.0) / (l_total 
* 1.0e9)); l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(float), NB, K, C, C, 0, K, alpha, beta, flags, prefetch); /* sparse routine */ mykernel_csc = libxsmm_create_xcsc_soa(l_xgemm_desc, l_colptr, l_rowidx, (const void*)l_b_sp_csc, nb).smm; mykernel_csr = libxsmm_create_xcsr_soa(l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_b_sp_csr, nb).smm; l_start = libxsmm_timer_tick(); for ( l_n = 0; l_n < REPS; l_n++) { mykernel_csc( l_a, l_b_sp_csc, l_c_asm_csc ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for sparse (asm, csc)\n", l_total); printf("%f GFLOPS for sparse (asm, csc)\n", ((double)((double)REPS * (double)N * (double)C * (double)K) * 2.0) / (l_total * 1.0e9)); l_start = libxsmm_timer_tick(); for ( l_n = 0; l_n < REPS; l_n++) { mykernel_csr( l_a, l_b_sp_csr, l_c_asm_csr ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for sparse (asm, csr)\n", l_total); printf("%f GFLOPS for sparse (asm, csr)\n", ((double)((double)REPS * (double)N * (double)C * (double)K) * 2.0) / (l_total * 1.0e9)); /* check for errors */ l_max_error = 0.f; for ( l_i = 0; l_i < NB; l_i++) { for ( l_j = 0; l_j < K; l_j++) { for ( l_k = 0; l_k < nb; l_k++ ) { if (fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, K, nb) - LIBXSMM_VLA_ACCESS(3, l_p_c_asm_csc, l_i, l_j, l_k, K, nb) ) > l_max_error ) { l_max_error = (float)fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, K, nb) -LIBXSMM_VLA_ACCESS(3, l_p_c_asm_csc, l_i, l_j, l_k, K, nb) ); } } } } printf("max error (csc): %f\n", l_max_error); l_max_error = 0.f; for ( l_i = 0; l_i < NB; l_i++) { for ( l_j = 0; l_j < K; l_j++) { for ( l_k = 0; l_k < nb; l_k++ ) { if (fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, K, nb) - LIBXSMM_VLA_ACCESS(3, l_p_c_asm_csr, l_i, l_j, l_k, K, nb) ) > l_max_error ) { l_max_error = (float)fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, K, nb) 
-LIBXSMM_VLA_ACCESS(3, l_p_c_asm_csr, l_i, l_j, l_k, K, nb) ); } } } } printf("max error (csr): %f\n", l_max_error); /* free */ libxsmm_free( l_b_de ); libxsmm_free( l_a ); libxsmm_free( l_c_gold ); libxsmm_free( l_c_asm_csc ); libxsmm_free( l_c_asm_csr ); libxsmm_free( l_b_sp_csc ); libxsmm_free( l_colptr ); libxsmm_free( l_rowidx ); libxsmm_free( l_b_sp_csr ); libxsmm_free( l_rowptr ); libxsmm_free( l_colidx ); return 0; } libxsmm-1.17/samples/deeplearning/sparse_weight_mult/sparse_weight_B_mult.vcxproj000066400000000000000000000540541415223013700307140ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 sparse_weight_B_mult 10.0 {153DD877-3C0E-4A5B-9E72-CD31F7080F0C} Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console 
libxsmm-1.17/samples/deeplearning/sparse_weight_mult/sparse_weight_C_redmult.c000066400000000000000000000146151415223013700301360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include int main(int argc, char* argv[]) { unsigned int N = ( argc > 1 ) ? atoi(argv[1]) : 64; unsigned int C = ( argc > 2 ) ? atoi(argv[2]) : 512; unsigned int K = ( argc > 3 ) ? atoi(argv[3]) : 32; unsigned int nb = ( argc > 4 ) ? atoi(argv[4]) : 32; double sparse_frac = ( argc > 5 ) ? atof(argv[5]) : 0.9; unsigned int REPS = ( argc > 6 ) ? 
atoi(argv[6]) : 1; const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); const float alpha = 1, beta = 1; unsigned int* l_colptr = NULL; unsigned int* l_rowidx = NULL; float* l_c_de = (float*)libxsmm_aligned_malloc(sizeof(float) * C * K, 64); float* l_c_sp_csc = NULL; float* l_a = (float*)libxsmm_aligned_malloc(sizeof(float) * N * C, 64); float* l_b = (float*)libxsmm_aligned_malloc(sizeof(float) * N * K, 64); float l_max_error = 0.0; unsigned int l_k, l_n; unsigned int l_i, l_j, l_jj; unsigned int NB = N / nb; LIBXSMM_VLA_DECL(3, float, l_p_a, l_a, C, nb); LIBXSMM_VLA_DECL(3, float, l_p_b, l_b, K, nb); LIBXSMM_VLA_DECL(2, float, l_p_c_de, l_c_de, C); libxsmm_descriptor_blob l_xgemm_blob; libxsmm_gemm_descriptor* l_xgemm_desc = 0; LIBXSMM_MMFUNCTION_TYPE(float) mykernel_csc = NULL; unsigned long long l_start, l_end; double l_total; unsigned int nnz = 0; if (argc != 7 && argc != 1) { fprintf( stderr, "arguments failure\n" ); return -1; } if ( (N % nb != 0) || (nb > N) ) { fprintf( stderr, "N needs to be disable by %i\n", nb ); return -1; } /* touch A */ for ( l_i = 0; l_i < NB; l_i++) { for ( l_j = 0; l_j < C; l_j++) { for ( l_k = 0; l_k < nb; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_a, l_i, l_j, l_k, C, nb) = (float)libxsmm_rng_f64(); } } } /* touch B */ for ( l_i = 0; l_i < NB; l_i++) { for ( l_j = 0; l_j < K; l_j++) { for ( l_k = 0; l_k < nb; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_b, l_i, l_j, l_k, K, nb) = (float)libxsmm_rng_f64(); } } } /* touch dense C */ for ( l_i = 0; l_i < K; l_i++ ) { for ( l_j = 0; l_j < C; l_j++ ) { double tmp = libxsmm_rng_f64(); if ( tmp < sparse_frac ) { tmp = (double)0; } else { nnz++; } LIBXSMM_VLA_ACCESS(2, l_p_c_de, l_i, l_j, C) = (float)tmp; } } printf("we just generated a %i x %i matrix with %i NZ entries\n", K, C, nnz); /* create C, csc */ l_colptr = (unsigned int*) libxsmm_aligned_malloc( (K+1)*sizeof(unsigned int), 64 ); l_rowidx = (unsigned int*) 
libxsmm_aligned_malloc( nnz*sizeof(unsigned int), 64 ); l_c_sp_csc = (float* ) libxsmm_aligned_malloc( nnz*sizeof(float), 64 ); l_k = 0; l_colptr[K] = nnz; for ( l_i = 0; l_i < K; l_i++ ) { l_colptr[l_i] = l_k; for ( l_j = 0; l_j < C; l_j++ ) { if ( LIBXSMM_VLA_ACCESS(2, l_p_c_de, l_i, l_j, C) != 0.0 ) { l_rowidx[l_k] = l_j; l_c_sp_csc[l_k] = LIBXSMM_VLA_ACCESS(2, l_p_c_de, l_i, l_j, C); l_k++; } } } #if 0 for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < l_colptr[l_i+1]-l_colptr[l_i]; l_j++ ) { printf("(%i, %i): %f %f\n", l_i, l_rowidx[l_colptr[l_i]+l_j], LIBXSMM_VLA_ACCESS(2, l_p_c_de, l_i, l_rowidx[l_colptr[l_i]+l_j], C), l_c_sp_csc[l_colptr[l_i]+l_j] ); } } #endif /* dense routine */ l_start = libxsmm_timer_tick(); #if 1 for ( l_n = 0; l_n < REPS; l_n++) { for ( l_i = 0; l_i < NB; l_i++) { for ( l_j = 0; l_j < K; l_j++) { for ( l_jj = 0; l_jj < C; l_jj++) { LIBXSMM_PRAGMA_SIMD for (l_k = 0; l_k < nb; l_k++) { LIBXSMM_VLA_ACCESS(2, l_p_c_de, l_j, l_jj, C) += LIBXSMM_VLA_ACCESS(3, l_p_a, l_i, l_jj, l_k, C, nb) * LIBXSMM_VLA_ACCESS(3, l_p_b, l_i, l_j, l_k, K, nb); } } } } } #endif l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for dense\n", l_total); printf("%f GFLOPS for dense\n", ((double)((double)REPS * (double)N * (double)C * (double)K) * 2.0) / (l_total * 1.0e9)); l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(float), C, K, NB, C, K, 0, alpha, beta, flags, prefetch); /* sparse routine */ mykernel_csc = libxsmm_create_xcsc_soa(l_xgemm_desc, l_colptr, l_rowidx, (const void*)l_c_sp_csc, nb).smm; l_start = libxsmm_timer_tick(); for ( l_n = 0; l_n < REPS; l_n++) { mykernel_csc( l_a, l_b, l_c_sp_csc ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for sparse (asm, csc)\n", l_total); printf("%f GFLOPS for sparse (asm, csc)\n", ((double)((double)REPS * (double)N * (double)C * (double)K) * 2.0) / (l_total * 1.0e9)); /* check for errors 
*/ l_max_error = 0.f; for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < l_colptr[l_i+1]-l_colptr[l_i]; l_j++ ) { #if 0 printf("(%i, %i): %f %f\n", l_i, l_rowidx[l_colptr[l_i]+l_j], LIBXSMM_VLA_ACCESS(2, l_p_c_de, l_i, l_rowidx[l_colptr[l_i]+l_j], C), l_c_sp_csc[l_colptr[l_i]+l_j] ); #endif if (fabs( LIBXSMM_VLA_ACCESS(2, l_p_c_de, l_i, l_rowidx[l_colptr[l_i]+l_j], C) - l_c_sp_csc[l_colptr[l_i]+l_j] ) > l_max_error ) { l_max_error = (float)fabs( LIBXSMM_VLA_ACCESS(2, l_p_c_de, l_i, l_rowidx[l_colptr[l_i]+l_j], C) - l_c_sp_csc[l_colptr[l_i]+l_j] ); } } } printf("max error (csc): %f\n", l_max_error); /* free */ libxsmm_free( l_c_de ); libxsmm_free( l_a ); libxsmm_free( l_b ); libxsmm_free( l_c_sp_csc ); libxsmm_free( l_colptr ); libxsmm_free( l_rowidx ); return 0; } libxsmm-1.17/samples/deeplearning/sparse_weight_mult/sparse_weight_C_redmult.vcxproj000066400000000000000000000540621415223013700314070ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 sparse_weight_C_redmult 10.0 {2BB7B2E8-17C5-481C-93B1-9EFA7E9DCA47} Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console 
$(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/deeplearning/tf_lstm_ops/000077500000000000000000000000001415223013700215455ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/tf_lstm_ops/Makefile000066400000000000000000000054161415223013700232130ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) #TF_CFLAGS=$(shell python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') #TF_LFLAGS=$(shell python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = xsmm_lstm CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) GCC_OMP_FLAGS = -fopenmp BLAS = 0 OMP = 1 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) override CXX=g++ OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(call qndir,$(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(call qndir,$(CSOURCS:.c=-c.o))) SOURCES := $(CCXSRCS) $(CSOURCS) OBJECTS := $(CCXOBJS) $(COBJCTS) XFILES := $(OUTDIR)/libxsmm_lstm.so TF_FLAGS := $(BLDDIR)/tf_flags -include $(TF_FLAGS) .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(TF_FLAGS): $(eval TF_CFLAGS=$(shell python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') ) $(eval TF_LFLAGS=$(shell python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') ) @echo 
"TF_CFLAGS = $(TF_CFLAGS)" @echo "TF_LFLAGS = $(TF_LFLAGS)" @mkdir -p $(BLDDIR) @echo "TF_CFLAGS=$(TF_CFLAGS)" > $@ @echo "TF_LFLAGS=$(TF_LFLAGS)" >> $@ $(OUTDIR)/libxsmm_lstm.so: $(OUTDIR)/.make $(CCXOBJS) $(COBJCTS) $(LIBDEP) $(EXTDEP) $(TF_FLAGS) $(LD) -o $@ -shared $(CCXOBJS) $(COBJCTS) $(MAINLIB) $(TF_LFLAGS) -lsvml -liomp5 -fPIC $(BLDDIR)/%-cc.o: $(SRCDIR)/%.cc .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(TF_FLAGS) g++ -std=c++11 $(GCC_OMP_FLAGS) $(TF_CFLAGS) -fPIC -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ .PHONY: install install: $(XFILES) setup.py $(OUTDIR)/__init__.py python setup.py bdist_wheel pip install -U dist/xsmm_lstm-*.whl .PHONY: uninstall uninstall: pip uninstall xsmm_lstm .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) endif @rm -f .make .state $(XFILES) $(OUTDIR)/*.pyc @rm -rf dist xsmm_lstm.egg-info libxsmm-1.17/samples/deeplearning/tf_lstm_ops/README.md000066400000000000000000000004051415223013700230230ustar00rootroot00000000000000# Xsmm LSTM This code may be integrated with Tensorflow to make use of LIBXSMM's LSTM. Support for creating a Python wheel and a pip package can be found in the [directory](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/tf_lstm_ops) as well. 
libxsmm-1.17/samples/deeplearning/tf_lstm_ops/lstm_bwd.c000066400000000000000000000672511415223013700235370ustar00rootroot00000000000000#include #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #if defined(_OPENMP) # include #endif #include "lstm_bwd.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #define CHKERR_LIBXSMM_DNN(A) if ( A != LIBXSMM_DNN_SUCCESS ) fprintf(stderr, "%s\n", libxsmm_dnn_get_error(A) ); #if 0 # define PRINT_LAYOUT2(DESC, LAYOUT) print_layout2(DESC, LAYOUT) #else # define PRINT_LAYOUT2(DESC, LAYOUT) #endif void print_layout2(char *desc, libxsmm_dnn_tensor_datalayout *layout) { char *dim_name[] = {"N", "H", "W", "C", "K", "R", "S", "X", "RLM", "RLK", "RLN"}; int i; printf("%s: F:%d TT: %d [", desc, layout->format, layout->tensor_type); for(i = layout->num_dims - 1; i >= 0; i--) { printf("%s:%d%s", dim_name[layout->dim_type[i]], layout->dim_size[i], i == 0 ? 
"" : ", "); } printf("]\n"); } void zero_buf(float* buf, size_t size) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { buf[i] = 0.0f; } } void* lstm_bwd_create( int N, /* minibatch size */ int C, /* input size */ int K, /* output size */ int t, /* timesteps = 1 */ int nThreads, /* number of threads */ const int w_in_kcck, const int w_in_trans, const float *xt, const float *csp, const float *hp, const float *ht, const float *w, const float *r, const float *cst, const float *it, const float *ft, const float *ot, const float *cit, const float *cot, const float *dcs, const float *dht, float *dxt, float *dcspt, float *dhpt, float *dw, float *dr, float *db ) { libxsmm_dnn_rnncell_desc lstmcell_desc; libxsmm_dnn_rnncell* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_cs_prev; libxsmm_dnn_tensor* libxsmm_hidden_state_prev; libxsmm_dnn_tensor* libxsmm_hidden_state = NULL; libxsmm_dnn_tensor* libxsmm_weight; libxsmm_dnn_tensor* libxsmm_recur_weight; libxsmm_dnn_tensor* libxsmm_cs; libxsmm_dnn_tensor* libxsmm_i; libxsmm_dnn_tensor* libxsmm_f; libxsmm_dnn_tensor* libxsmm_o; libxsmm_dnn_tensor* libxsmm_ci; libxsmm_dnn_tensor* libxsmm_co; libxsmm_dnn_tensor* libxsmm_dinput; libxsmm_dnn_tensor* libxsmm_dcs_prev; libxsmm_dnn_tensor* libxsmm_dhidden_state_prev; libxsmm_dnn_tensor* libxsmm_dweight; libxsmm_dnn_tensor* libxsmm_drecur_weight; libxsmm_dnn_tensor* libxsmm_dbias; libxsmm_dnn_tensor* libxsmm_dcs; libxsmm_dnn_tensor* libxsmm_dhidden_state; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; if (N <= 0) { printf("N: %d should be > 0\n", N); } if (C <= 0) { printf("C: %d should be > 0\n", C); } if (K <= 0) { printf("K: %d should be > 0\n", K); } if (xt == 0 || csp == 0 || hp == 0 || w == 0 || r == 0 || (t > 1 && ht == 0) || cst == 0 || it == 0 || ft == 0 || ot == 0 || cit == 0 || cot == 0 || dxt == 0 || dcspt== 0|| dhpt== 0|| dw == 0 || dr == 0 || db == 0 || dht == 
0 || dcs == 0) { printf("None of the pointers should be NULL::\n"); printf("x:%p\n", xt); printf("csp:%p\n", csp); printf("h_prev:%p\n", hp); printf("ht:%p\n", ht); printf("w:%p\n", w); printf("r:%p\n", r); printf("cs:%p\n", cst); printf("i:%p\n", it); printf("f:%p\n", ft); printf("o:%p\n", ot); printf("ci:%p\n", cit); printf("co:%p\n", cot); printf("dcs:%p\n", dcs); printf("dxt:%p\n", dxt); printf("dcspt:%p\n", dcspt); printf("dhpt:%p\n", dhpt); printf("dw:%p\n", dw); printf("dr:%p\n", dr); printf("db:%p\n", db); printf("dht:%p\n", dht); } /* setup LIBXSMM handle */ lstmcell_desc.threads = nThreads; lstmcell_desc.N = N; lstmcell_desc.C = C; lstmcell_desc.K = K; lstmcell_desc.max_T = t; lstmcell_desc.bn = 24; if(N % 24 == 0) lstmcell_desc.bn = 24; else if(N % 16 == 0) lstmcell_desc.bn = 16; else if(N % 12 == 0) lstmcell_desc.bn = 12; else if(N % 8 == 0) lstmcell_desc.bn = 8; else if(N % 6 == 0) lstmcell_desc.bn = 6; lstmcell_desc.bc = 64; lstmcell_desc.bk = 64; lstmcell_desc.cell_type = LIBXSMM_DNN_RNNCELL_LSTM; lstmcell_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; lstmcell_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; lstmcell_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NC; lstmcell_desc.filter_format = (w_in_kcck ? 
LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED : LIBXSMM_DNN_TENSOR_FORMAT_CK); libxsmm_handle = libxsmm_dnn_create_rnncell( lstmcell_desc, &status ); CHKERR_LIBXSMM_DNN( status ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("Xt", libxsmm_layout); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, xt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("CSP", libxsmm_layout); libxsmm_cs_prev = libxsmm_dnn_link_tensor( libxsmm_layout, csp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("HP", libxsmm_layout); libxsmm_hidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, hp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); if(t > 1) { libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("HT", libxsmm_layout); libxsmm_hidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, ht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); } libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, w_in_trans ? 
LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS : LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("W", libxsmm_layout); libxsmm_weight = libxsmm_dnn_link_tensor( libxsmm_layout, w, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, w_in_trans ? LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS : LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("R", libxsmm_layout); libxsmm_recur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, r, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("CSt", libxsmm_layout); libxsmm_cs = libxsmm_dnn_link_tensor( libxsmm_layout, cst, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("It", libxsmm_layout); libxsmm_i = libxsmm_dnn_link_tensor( libxsmm_layout, it, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("Ft", libxsmm_layout); libxsmm_f = libxsmm_dnn_link_tensor( libxsmm_layout, ft, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("Ot", libxsmm_layout); libxsmm_o = libxsmm_dnn_link_tensor( libxsmm_layout, ot, &status ); 
CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("CIt", libxsmm_layout); libxsmm_ci = libxsmm_dnn_link_tensor( libxsmm_layout, cit, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CO, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("COt", libxsmm_layout); libxsmm_co = libxsmm_dnn_link_tensor( libxsmm_layout, cot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("dXt", libxsmm_layout); libxsmm_dinput = libxsmm_dnn_link_tensor( libxsmm_layout, dxt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("dCSPt", libxsmm_layout); libxsmm_dcs_prev = libxsmm_dnn_link_tensor( libxsmm_layout, dcspt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("dHPt", libxsmm_layout); libxsmm_dhidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, dhpt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, &status ); 
CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("dW", libxsmm_layout); libxsmm_dweight = libxsmm_dnn_link_tensor( libxsmm_layout, dw, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("dR", libxsmm_layout); libxsmm_drecur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, dr, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("dB", libxsmm_layout); libxsmm_dbias = libxsmm_dnn_link_tensor( libxsmm_layout, db, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_CS, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("dCS", libxsmm_layout); libxsmm_dcs = libxsmm_dnn_link_tensor( libxsmm_layout, dcs, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT2("dHt", libxsmm_layout); libxsmm_dhidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, dht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_cs_prev, LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, 
libxsmm_hidden_state_prev, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); if(t > 1) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); } if(w_in_trans) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) ); } else { CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_cs, LIBXSMM_DNN_RNN_REGULAR_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_i, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_f, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_o, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_ci, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_co, LIBXSMM_DNN_RNN_INTERNAL_CO ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dinput, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dcs_prev, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state_prev, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dweight, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_drecur_weight, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dbias, LIBXSMM_DNN_RNN_GRADIENT_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dcs, LIBXSMM_DNN_RNN_GRADIENT_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_dhidden_state, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); size_t scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, &status ); CHKERR_LIBXSMM_DNN( status ); if (scratch_size > 0) { void* scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, scratch ) ); zero_buf( (float*)scratch, scratch_size/4 ); } return (void*)libxsmm_handle; } void lstm_bwd_set_ptr( void* libxsmm_handle_, int w_in_trans, const int t, const float *xt, const float *csp, const float *hp, const float *ht, const float *w, const float *r, const float *cst, const float *it, const float *ft, const float *ot, const float *cit, const float *cot, const float *dcs, const float *dht, float *dxt, float *dcspt, float *dhpt, float *dw, float *dr, float *db ) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; libxsmm_dnn_rnncell* handle = (libxsmm_dnn_rnncell*) libxsmm_handle_; if (xt == 0 || csp == 0 || hp == 0 || w == 0 || r == 0 || cst == 0 || it == 0 || ft == 0 || ot == 0 || cit == 0 || cot == 0 || dxt == 0 || dcspt== 0|| dhpt== 0|| dw == 0 || dr == 0 || db == 0 || dht == 0 || dcs == 0) { printf("None of the pointers should be NULL::\n"); printf("x:%p\n", xt); printf("cst:%p\n", csp); printf("h_prev:%p\n", hp); printf("ht:%p\n", ht); printf("w:%p\n", w); printf("r:%p\n", r); printf("cs:%p\n", cst); printf("i:%p\n", it); printf("f:%p\n", ft); printf("o:%p\n", ot); printf("ci:%p\n", cit); printf("co:%p\n", cot); printf("dcs:%p\n", dcs); 
printf("dxt:%p\n", dxt); printf("dcspt:%p\n", dcspt); printf("dhpt:%p\n", dhpt); printf("dw:%p\n", dw); printf("dr:%p\n", dr); printf("db:%p\n", db); printf("dht:%p\n", dht); } /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_set_sequence_length( handle, t) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status), xt) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV, &status), csp) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status), hp) ); if(ht != 0) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status), ht) ); } if(w_in_trans) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS, &status), w) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS, &status), r) ); } else { CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status), w) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status), r) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_CS, &status), cst) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_I, &status), it) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_F, &status), ft) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( 
libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_O, &status), ot) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_CI, &status), cit) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_CO, &status), cot) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT, &status), dxt) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV, &status), dcspt) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV, &status), dhpt) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, &status), dw) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, &status), dr) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS, &status), db) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_CS, &status), dcs) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, &status), dht) ); } void lstm_bwd_execute_omp( void* libxsmm_handle_ ) { #ifdef _OPENMP libxsmm_dnn_rnncell* handle = (libxsmm_dnn_rnncell*) libxsmm_handle_; /* run LIBXSMM LSTM BWD */ #pragma omp parallel { int tid = omp_get_thread_num(); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } #else printf("%s:%d Shouldn't come here... 
exiting\n", __FILE__, __LINE__); exit(1); #endif } void lstm_bwd_execute_st( void* libxsmm_handle_, int tid ) { libxsmm_dnn_rnncell* handle = (libxsmm_dnn_rnncell*) libxsmm_handle_; /* run LIBXSMM LSTM BWD */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, 0, tid ) ); } void lstm_bwd_destroy( void* libxsmm_handle_ ) { libxsmm_dnn_rnncell* handle = (libxsmm_dnn_rnncell*) libxsmm_handle_; libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status) ) ); if(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status)) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); } if(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status)) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); } if(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status)) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); } if(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS, &status)) { 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) ); } if(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS, &status)) { CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) ); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_CS, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_I, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_F, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_O, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_CI, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_CO, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( 
libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_CS, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_INTERNAL_CO ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_GRADIENT_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_GRADIENT_BIAS 
) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_GRADIENT_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) ); size_t scratch_size = libxsmm_dnn_rnncell_get_scratch_size( handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, &status ); if (scratch_size > 0) { void *scratch = libxsmm_dnn_rnncell_get_scratch_ptr( handle, &status ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( handle, LIBXSMM_DNN_COMPUTE_KIND_BWDUPD ) ); if(scratch) libxsmm_free(scratch); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_rnncell( handle ) ); } libxsmm-1.17/samples/deeplearning/tf_lstm_ops/lstm_bwd.h000066400000000000000000000043771415223013700235440ustar00rootroot00000000000000#ifndef _LSTM_BWD_H_ #define _LSTM_BWD_H_ #ifdef __cplusplus extern "C" { #endif void* lstm_bwd_create( int N, /* minibatch size */ int C, /* input size */ int K, /* output size */ int t, /* timesteps = 1 */ int nThreads, /* number of threads */ const int w_in_kcck, const int w_in_trans, const float *xt, const float *csp, const float *hp, const float *ht, const float *w, const float *r, const float *cst, const float *it, const float *ft, const float *ot, const float *cit, const float *cot, const float *dcs, const float *dht, float *dxt, float *dcspt, float *dhpt, float *dw, float *dr, float *db ); void lstm_bwd_set_ptr( void* libxsmm_handle_, int w_in_trans, const int t, const float *xt, const float *csp, const float *hp, const float *ht, const float *w, const float *r, const float *cst, const float *it, const float *ft, const float *ot, const float *cit, const float *cot, const float *dcs, const float *dht, float *dxt, float *dcspt, float *dhpt, float *dw, float *dr, float *db ); void lstm_bwd_execute_omp( void* libxsmm_handle_ ); void lstm_bwd_execute_st( void* libxsmm_handle_, int tid ); void lstm_bwd_destroy( void* libxsmm_handle_ ); #ifdef __cplusplus } #endif #endif /*_LSTM_BWD_H_*/ 
libxsmm-1.17/samples/deeplearning/tf_lstm_ops/lstm_fwd.c000066400000000000000000000451241415223013700235360ustar00rootroot00000000000000#include #if defined(__linux__) # include # define gettid() syscall(SYS_gettid) #else # define gettid() libxsmm_get_tid() #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #if defined(_OPENMP) # include #endif #include "lstm_fwd.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #define CHKERR_LIBXSMM_DNN(A) if ( A != LIBXSMM_DNN_SUCCESS ) fprintf(stderr, "%s\n", libxsmm_dnn_get_error(A) ); #if 0 # define PRINT_LAYOUT(DESC, LAYOUT) print_layout(DESC, LAYOUT) #else # define PRINT_LAYOUT(DESC, LAYOUT) #endif void print_layout(char *desc, libxsmm_dnn_tensor_datalayout *layout) { char *dim_name[] = {"N", "H", "W", "C", "K", "R", "S", "X", "RLM", "RLK", "RLN"}; int i; printf("%s: F:%d TT: %d [", desc, layout->format, layout->tensor_type); for(i = layout->num_dims - 1; i >= 0; i--) { printf("%s:%d%s", dim_name[layout->dim_type[i]], layout->dim_size[i], i == 0 ? 
"" : ", "); } printf("]\n"); } static void zero_buf(float* buf, size_t size) { int i; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < (int)size; ++i) { buf[i] = 0.0f; } } void* lstm_fwd_create( int N, /* minibatch size */ int C, /* input size */ int K, /* output size */ int t, /* timesteps = 1 */ int nThreads, /* number of threads */ const float forget_bias, const int w_in_kcck, const float *xt, const float *csp, const float *hp, const float *w, const float *r, const float *b, float *cst, float *ht, float *it, float *ft, float *ot, float *cit, float *cot ) { libxsmm_dnn_rnncell_desc lstmcell_desc; libxsmm_dnn_rnncell* libxsmm_handle; libxsmm_dnn_tensor* libxsmm_input; libxsmm_dnn_tensor* libxsmm_cs_prev; libxsmm_dnn_tensor* libxsmm_hidden_state_prev; libxsmm_dnn_tensor* libxsmm_weight; libxsmm_dnn_tensor* libxsmm_recur_weight; libxsmm_dnn_tensor* libxsmm_bias; libxsmm_dnn_tensor* libxsmm_cs; libxsmm_dnn_tensor* libxsmm_hidden_state; libxsmm_dnn_tensor* libxsmm_i; libxsmm_dnn_tensor* libxsmm_f; libxsmm_dnn_tensor* libxsmm_o; libxsmm_dnn_tensor* libxsmm_ci; libxsmm_dnn_tensor* libxsmm_co; libxsmm_dnn_tensor_datalayout* libxsmm_layout; libxsmm_dnn_err_t status; if (N <= 0) { printf("N: %d should be > 0\n", N); } if (C <= 0) { printf("C: %d should be > 0\n", C); } if (K <= 0) { printf("K: %d should be > 0\n", K); } if (xt == 0 || csp == 0 || hp == 0 || w == 0 || r == 0 || b == 0 || cst == 0 || ht == 0 || it == 0 || ft == 0 || ot == 0 || cit == 0 || cot == 0) { printf("None of the pointers should be NULL::\n"); printf("x:%p\n", xt); printf("cs_prev:%p\n", csp); printf("h_prev:%p\n", hp); printf("w:%p\n", w); printf("r:%p\n", r); printf("b:%p\n", b); printf("cs:%p\n", cst); printf("h:%p\n", ht); printf("i:%p\n", it); printf("f:%p\n", ft); printf("o:%p\n", ot); printf("ci:%p\n", cit); printf("co:%p\n", cot); } /* setup LIBXSMM handle */ lstmcell_desc.threads = nThreads; lstmcell_desc.N = N; lstmcell_desc.C = C; lstmcell_desc.K = K; 
lstmcell_desc.max_T = t; lstmcell_desc.bn = 24; if(N % 24 == 0) lstmcell_desc.bn = 24; else if(N % 16 == 0) lstmcell_desc.bn = 16; else if(N % 12 == 0) lstmcell_desc.bn = 12; else if(N % 8 == 0) lstmcell_desc.bn = 8; else if(N % 6 == 0) lstmcell_desc.bn = 6; lstmcell_desc.bc = 64; lstmcell_desc.bk = 64; lstmcell_desc.cell_type = LIBXSMM_DNN_RNNCELL_LSTM; lstmcell_desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; lstmcell_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; lstmcell_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NC; lstmcell_desc.filter_format = (w_in_kcck ? LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED : LIBXSMM_DNN_TENSOR_FORMAT_CK); libxsmm_handle = libxsmm_dnn_create_rnncell( lstmcell_desc, &status ); CHKERR_LIBXSMM_DNN( status ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_allocate_forget_bias(libxsmm_handle, forget_bias) ); /* setup LIBXSMM buffers and filter */ libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("Xt", libxsmm_layout); libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, xt, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("CSP", libxsmm_layout); libxsmm_cs_prev = libxsmm_dnn_link_tensor( libxsmm_layout, csp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("HP", libxsmm_layout); libxsmm_hidden_state_prev = libxsmm_dnn_link_tensor( libxsmm_layout, hp, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = 
libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("W", libxsmm_layout); libxsmm_weight = libxsmm_dnn_link_tensor( libxsmm_layout, w, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("R", libxsmm_layout); libxsmm_recur_weight = libxsmm_dnn_link_tensor( libxsmm_layout, r, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_BIAS, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("B", libxsmm_layout); libxsmm_bias = libxsmm_dnn_link_tensor( libxsmm_layout, b, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_CS, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("CSt", libxsmm_layout); libxsmm_cs = libxsmm_dnn_link_tensor( libxsmm_layout, cst, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("Ht", libxsmm_layout); libxsmm_hidden_state = libxsmm_dnn_link_tensor( libxsmm_layout, ht, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_I, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("It", libxsmm_layout); libxsmm_i = libxsmm_dnn_link_tensor( libxsmm_layout, it, &status ); 
CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_F, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("Ft", libxsmm_layout); libxsmm_f = libxsmm_dnn_link_tensor( libxsmm_layout, ft, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_O, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("Ot", libxsmm_layout); libxsmm_o = libxsmm_dnn_link_tensor( libxsmm_layout, ot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CI, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("CIt", libxsmm_layout); libxsmm_ci = libxsmm_dnn_link_tensor( libxsmm_layout, cit, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); libxsmm_layout = libxsmm_dnn_rnncell_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_RNN_INTERNAL_CO, &status ); CHKERR_LIBXSMM_DNN( status ); PRINT_LAYOUT("COt", libxsmm_layout); libxsmm_co = libxsmm_dnn_link_tensor( libxsmm_layout, cot, &status ); CHKERR_LIBXSMM_DNN( status ); libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout ); /* bind buffers and filter to handle */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_cs_prev, LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state_prev, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_weight, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); 
CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_recur_weight, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_bias, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_cs, LIBXSMM_DNN_RNN_REGULAR_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_hidden_state, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_i, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_f, LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_o, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_ci, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_tensor( libxsmm_handle, libxsmm_co, LIBXSMM_DNN_RNN_INTERNAL_CO ) ); size_t scratch_size = libxsmm_dnn_rnncell_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); CHKERR_LIBXSMM_DNN( status ); if(scratch_size > 0) { void *scratch = libxsmm_aligned_malloc( scratch_size, 2097152 ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch ) ); zero_buf( (float*)scratch, scratch_size/4 ); } return (void*)libxsmm_handle; } void lstm_fwd_set_ptr( void* libxsmm_handle_, const float forget_bias, const int t, const float *xt, const float *csp, const float *hp, const float *w, const float *r, const float *b, float *cst, float *ht, float *it, float *ft, float *ot, float *cit, float *cot ) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; libxsmm_dnn_rnncell* handle = (libxsmm_dnn_rnncell*) libxsmm_handle_; if (xt == 0 || csp == 0 || hp == 0 || w == 0 || r == 0 || b == 0 || cst == 0 || ht == 0 || it == 0 || ft == 0 || ot == 0 || cit 
== 0 || cot == 0) { printf("None of the pointers should be NULL::\n"); printf("x:%p\n", xt); printf("cs_prev:%p\n", csp); printf("h_prev:%p\n", hp); printf("w:%p\n", w); printf("r:%p\n", r); printf("b:%p\n", b); printf("cs:%p\n", cst); printf("h:%p\n", ht); printf("i:%p\n", it); printf("f:%p\n", ft); printf("o:%p\n", ot); printf("ci:%p\n", cit); printf("co:%p\n", cot); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_allocate_forget_bias(handle, forget_bias) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_set_sequence_length( handle, t) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status), xt) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV, &status), csp) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status), hp) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status), w) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status), r) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_BIAS, &status), b) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_CS, &status), cst) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status), ht) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_I, &status), it) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_F, &status), ft) ); CHKERR_LIBXSMM_DNN( 
libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_O, &status), ot) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_CI, &status), cit) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_CO, &status), cot) ); } void lstm_fwd_execute_omp( void* libxsmm_handle_) { #ifdef _OPENMP libxsmm_dnn_rnncell* handle = (libxsmm_dnn_rnncell*) libxsmm_handle_; /* run LIBXSMM LSTM FWD */ #pragma omp parallel { int tid = omp_get_thread_num(); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } #else printf("%s:%d Shouldn't come here... exiting\n", __FILE__, __LINE__); exit(1); #endif } void lstm_fwd_execute_st( void* libxsmm_handle_, int tid ) { libxsmm_dnn_rnncell* handle = (libxsmm_dnn_rnncell*) libxsmm_handle_; /* run LIBXSMM LSTM FWD */ CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_execute_st( handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) ); } void lstm_fwd_destroy( void* libxsmm_handle_ ) { libxsmm_dnn_rnncell* handle = (libxsmm_dnn_rnncell*) libxsmm_handle_; libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_INPUT, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( 
libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_BIAS, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_CS, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_I, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_F, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_O, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_CI, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_dnn_rnncell_get_tensor(handle, LIBXSMM_DNN_RNN_INTERNAL_CO, &status) ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_INPUT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_BIAS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_CS ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_INTERNAL_I ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, 
LIBXSMM_DNN_RNN_INTERNAL_F ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_INTERNAL_O ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_INTERNAL_CI ) ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_tensor( handle, LIBXSMM_DNN_RNN_INTERNAL_CO ) ); size_t scratch_size = libxsmm_dnn_rnncell_get_scratch_size( handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status ); if(scratch_size > 0) { void *scratch = libxsmm_dnn_rnncell_get_scratch_ptr( handle, /*LIBXSMM_DNN_COMPUTE_KIND_FWD,*/ &status ); CHKERR_LIBXSMM_DNN( libxsmm_dnn_rnncell_release_scratch( handle, LIBXSMM_DNN_COMPUTE_KIND_FWD ) ); if(scratch) libxsmm_free(scratch); } CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_rnncell( handle ) ); } libxsmm-1.17/samples/deeplearning/tf_lstm_ops/lstm_fwd.h000066400000000000000000000033001415223013700235310ustar00rootroot00000000000000#ifndef _LSTM_FWD_H_ #define _LSTM_FWD_H_ #ifdef __cplusplus extern "C" { #endif void *lstm_fwd_create( int N, /* minibatch size */ int C, /* input size */ int K, /* output size */ int t, /* timesteps = 1 */ int nThreads, /* number of threads */ const float forget_bias, const int w_in_kcck, const float *xt, const float *csp, const float *hp, const float *w, const float *r, const float *b, float *cst, float *ht, float *it, float *ft, float *ot, float *cit, float *cot ); void lstm_fwd_set_ptr( void* libxsmm_handle_, const float forget_bias, const int t, const float *xt, const float *csp, const float *hp, const float *w, const float *r, const float *b, float *cst, float *ht, float *it, float *ft, float *ot, float *cit, float *cot ); void lstm_fwd_execute_omp( void* libxsmm_handle_); void lstm_fwd_execute_st( void* libxsmm_handle_, int tid ); void lstm_fwd_destroy( void* libxsmm_handle_ ); #ifdef __cplusplus } #endif #endif /*_LSTM_FWD_H_*/ libxsmm-1.17/samples/deeplearning/tf_lstm_ops/setup.py000066400000000000000000000013401415223013700232550ustar00rootroot00000000000000import setuptools 
with open("README.md", "r") as fh: long_description = fh.read() setuptools.setup( name='xsmm_lstm', version='0.1', author="Dhiraj Kalamkar", author_email="dhiraj.d.kalamkar@intel.com", description="Tensorflow wrapper for libxsmm LSTM Cell", long_description=long_description, #long_description_content_type="text/markdown", url="https://github.com/ddkalamk/libxsmm", #packages=setuptools.find_packages(), packages=['xsmm_lstm'], #package_dir={'': '.'}, package_data={'xsmm_lstm': ['libxsmm_lstm.so']}, include_package_data=True, classifiers=[ "License :: OSI Approved :: MIT License", "Operating System :: Linux", ], ) libxsmm-1.17/samples/deeplearning/tf_lstm_ops/xsmm_lstm/000077500000000000000000000000001415223013700235705ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/tf_lstm_ops/xsmm_lstm/__init__.py000066400000000000000000000346261415223013700257140ustar00rootroot00000000000000from __future__ import absolute_import from __future__ import division from __future__ import print_function #import abc from tensorflow.contrib.rnn.ops import gen_lstm_ops from tensorflow.contrib.util import loader from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.layers import base as base_layer from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import rnn_cell_impl from tensorflow.python.platform import resource_loader from tensorflow.contrib.rnn import LSTMBlockWrapper from tensorflow.python.keras.utils import tf_utils import tensorflow as tf import os module_dir = os.path.dirname(__file__) lib_name = os.path.join(module_dir, 'libxsmm_lstm.so') xsmm_lstm = tf.load_op_library(lib_name) @tf.RegisterGradient("XsmmLSTMCell") def _LSTMBlockCellGrad(op, *grad): """Gradient for XsmmLSTMCell.""" w_in_kcck = False try: w_in_kcck = op.get_attr("w_in_kcck") except: pass if 
w_in_kcck: (x, cs_prev, h_prev, w, wT, wci, wcf, wco, b) = op.inputs else: (x, cs_prev, h_prev, w, wci, wcf, wco, b) = op.inputs wT = w (i, cs, f, o, ci, co, _) = op.outputs (_, cs_grad, _, _, _, _, h_grad) = grad (cs_prev_grad, h_prev_grad, x_grad, w_grad, b_grad, wci_grad, wcf_grad, wco_grad) = xsmm_lstm.xsmm_lstm_cell_grad( x=x, cs_prev=cs_prev, h_prev=h_prev, w=w, w_t=wT, wci=wci, wcf=wcf, wco=wco, b=b, i=i, cs=cs, f=f, o=o, ci=ci, co=co, cs_grad=cs_grad, h_grad=h_grad, use_peephole=op.get_attr("use_peephole"), w_in_kcck=w_in_kcck) if w_in_kcck: return (x_grad, cs_prev_grad, h_prev_grad, w_grad, None, wci_grad, wcf_grad, wco_grad, b_grad) else: return (x_grad, cs_prev_grad, h_prev_grad, w_grad, wci_grad, wcf_grad, wco_grad, b_grad) @ops.RegisterGradient("XsmmFusedLSTM") def _XsmmFusedLSTMGrad(op, *grad): """Gradient for XsmmFusedLSTM.""" seq_len_max, x, cs_prev, h_prev, w, wci, wcf, wco, b = op.inputs i, cs, f, o, ci, co, h = op.outputs cs_grad = grad[1] h_grad = grad[6] (x_grad, cs_prev_grad, h_prev_grad, w_grad, wci_grad, wcf_grad, wco_grad, b_grad) = xsmm_lstm.xsmm_fused_lstm_grad( seq_len_max, x, cs_prev, h_prev, w, wci, wcf, wco, b, i, cs, f, o, ci, co, h, cs_grad, h_grad, use_peephole=op.get_attr("use_peephole"), use_residue=op.get_attr("use_residue"), use_dropout=op.get_attr("use_dropout")) return [ None, x_grad, cs_prev_grad, h_prev_grad, w_grad, wci_grad, wcf_grad, wco_grad, b_grad ] class XsmmFusedLSTM(LSTMBlockWrapper): """XsmmFusedLSTM implementation of LSTM. This is an extremely efficient LSTM implementation, that uses a single TF op for the entire LSTM. It should be both faster and more memory-efficient than LSTMBlockCell defined above. The implementation is based on: http://arxiv.org/abs/1409.2329. We add forget_bias (default: 1) to the biases of the forget gate in order to reduce the scale of forgetting in the beginning of the training. The variable naming is consistent with `rnn_cell_impl.LSTMCell`. 
""" def __init__(self, num_units, forget_bias=1.0, cell_clip=None, use_peephole=False, dropout=0.0, residual_connection=False, reuse=None, dtype=None, name="lstm_fused_cell"): """Initialize the LSTM cell. Args: num_units: int, The number of units in the LSTM cell. forget_bias: float, The bias added to forget gates (see above). cell_clip: clip the cell to this value. Default is no cell clipping. use_peephole: Whether to use peephole connections or not. residual_connection: Whether to add residue connections or not. dropout: Whether to apply dropout or not. reuse: (optional) boolean describing whether to reuse variables in an existing scope. If not `True`, and the existing scope already has the given variables, an error is raised. dtype: the dtype of variables of this layer. name: String, the name of the layer. Layers with the same name will share weights, but to avoid mistakes we require reuse=True in such cases. By default this is "lstm_cell", for variable-name compatibility with `tf.nn.rnn_cell.LSTMCell`. """ super(XsmmFusedLSTM, self).__init__( _reuse=reuse, name=name, dtype=dtype) self._num_units = num_units self._forget_bias = forget_bias self._cell_clip = cell_clip if cell_clip is not None else -1 self._use_peephole = use_peephole self._residual_connection = residual_connection self._dropout = dropout # Inputs must be 3-dimensional. 
self.input_spec = base_layer.InputSpec(ndim=3) @property def num_units(self): """Number of units in this cell (output dimension).""" return self._num_units def build(self, input_shape): input_size = input_shape[2].value self._kernel = self.add_variable( "kernel", [input_size + self._num_units, self._num_units * 4]) self._bias = self.add_variable( "bias", [self._num_units * 4], initializer=init_ops.constant_initializer(0.0)) if self._use_peephole: self._w_i_diag = self.add_variable("w_i_diag", [self._num_units]) self._w_f_diag = self.add_variable("w_f_diag", [self._num_units]) self._w_o_diag = self.add_variable("w_o_diag", [self._num_units]) self.built = True def _call_cell(self, inputs, initial_cell_state=None, initial_output=None, dtype=None, sequence_length=None): """Run this LSTM on inputs, starting from the given state. Args: inputs: `3-D` tensor with shape `[time_len, batch_size, input_size]` initial_cell_state: initial value for cell state, shape `[batch_size, self._num_units]` initial_output: initial value of cell output, shape `[batch_size, self._num_units]` dtype: The data type for the initial state and expected output. sequence_length: Specifies the length of each sequence in inputs. An `int32` or `int64` vector (tensor) size `[batch_size]`, values in `[0, time_len)` or None. 
Returns: A pair containing: - Cell state (cs): A `3-D` tensor of shape `[time_len, batch_size, output_size]` - Output (h): A `3-D` tensor of shape `[time_len, batch_size, output_size]` """ inputs_shape = inputs.get_shape().with_rank(3) time_len = inputs_shape[0].value if time_len is None: time_len = array_ops.shape(inputs)[0] if self._use_peephole: wci = self._w_i_diag wco = self._w_o_diag wcf = self._w_f_diag else: wci = wcf = wco = array_ops.zeros([self._num_units], dtype=dtype) if sequence_length is None: max_seq_len = math_ops.to_int64(time_len) else: max_seq_len = math_ops.to_int64(math_ops.reduce_max(sequence_length)) print(" Xsmm LSTM Fused Cell: dropout = %.3f, Resudue = %s" % (self._dropout, self._residual_connection)) orig_inputs = inputs if self._dropout > 0.0: inputs = tf.nn.dropout(inputs, 1 - self._dropout) ''' _, cs, _, _, _, _, h = gen_lstm_ops.block_lstm( seq_len_max=max_seq_len, x=inputs, cs_prev=initial_cell_state, h_prev=initial_output, w=self._kernel, wci=wci, wcf=wcf, wco=wco, b=self._bias, forget_bias=self._forget_bias, cell_clip=self._cell_clip, use_peephole=self._use_peephole) ''' _, cs, _, _, _, _, h = xsmm_lstm.xsmm_fused_lstm( seq_len_max=max_seq_len, x=inputs, cs_prev=initial_cell_state, h_prev=initial_output, w=self._kernel, wci=wci, wcf=wcf, wco=wco, b=self._bias, forget_bias=self._forget_bias, cell_clip=self._cell_clip, use_peephole=self._use_peephole, use_residue=False, use_dropout=False) if self._residual_connection: with tf.name_scope("fused_residual_connection"): h = h + orig_inputs return cs, h class XsmmLSTMCell(rnn_cell_impl.RNNCell): """LIbxsmm LSTM Cell""" def __init__(self, num_units, forget_bias=1.0, state_is_tuple=True, activation=None, reuse=None, name=None, dtype=None, w_in_kcck=True, **kwargs): """Initialize the libxsmm LSTM cell. Args: num_units: int, The number of units in the LSTM cell. forget_bias: float, The bias added to forget gates (see above). 
Must set to `0.0` manually when restoring from CudnnLSTM-trained checkpoints. state_is_tuple: If True, accepted and returned states are 2-tuples of the `c_state` and `m_state`. If False, they are concatenated along the column axis. The latter behavior will soon be deprecated. activation: Activation function of the inner states. Default: `tanh`. It could also be string that is within Keras activation function names. reuse: (optional) Python boolean describing whether to reuse variables in an existing scope. If not `True`, and the existing scope already has the given variables, an error is raised. name: String, the name of the layer. Layers with the same name will share weights, but to avoid mistakes we require reuse=True in such cases. dtype: Default dtype of the layer (default of `None` means use the type of the first input). Required when `build` is called before `call`. **kwargs: Dict, keyword named properties for common layer attributes, like `trainable` etc when constructing the cell from configs of get_config(). When restoring from CudnnLSTM-trained checkpoints, must use `CudnnCompatibleLSTMCell` instead. """ super(XsmmLSTMCell, self).__init__( _reuse=reuse, name=name, dtype=dtype, **kwargs) if not state_is_tuple: logging.warn("%s: Using a concatenated state is slower and will soon be " "deprecated. Use state_is_tuple=True.", self) # Inputs must be 2-dimensional. 
self.input_spec = base_layer.InputSpec(ndim=2) self._num_units = num_units self._forget_bias = forget_bias self._state_is_tuple = state_is_tuple self._w_in_kcck = w_in_kcck if activation: self._activation = activations.get(activation) else: self._activation = math_ops.tanh @property def state_size(self): return (rnn_cell_impl.LSTMStateTuple(self._num_units, self._num_units) if self._state_is_tuple else 2 * self._num_units) @property def output_size(self): return self._num_units @tf_utils.shape_type_conversion def build(self, inputs_shape): if inputs_shape[-1] is None: raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" % str(inputs_shape)) input_depth = inputs_shape[-1] h_depth = self._num_units C = input_depth + h_depth K = 4 * self._num_units ctxt = tf.get_default_graph()._get_control_flow_context() if ctxt: ctxt = ctxt.GetWhileContext() self._kernel = self.add_variable( "kernel", shape=[input_depth + h_depth, 4 * self._num_units]) self._bias = self.add_variable( "bias", shape=[4 * self._num_units], initializer=init_ops.zeros_initializer(dtype=self.dtype)) if self._w_in_kcck: if ctxt: ctxt.Exit() def block_transpose(inp, C, BC, K, BK): inp_packed = tf.reshape(tf.transpose(tf.reshape(inp, [C//BC, BC, K//BK, BK]), perm=[2, 0, 1, 3]), [C, K]) inp_packed_trans = tf.reshape(tf.transpose(tf.reshape(inp, [C//BC, BC, 4, K//(BK*4), BK]), perm=[2, 0, 3, 4, 1]), [C, K]) return inp_packed, inp_packed_trans with tf.variable_scope("kernel_transpose") as vs: with tf.name_scope(""), tf.name_scope(vs.name): BC = 64 if input_depth % 64 == 0 else input_depth BK = 64 if h_depth % 64 == 0 else h_depth W, R = tf.split(self._kernel, [input_depth, h_depth], 0) W, WT = block_transpose(W, input_depth, BC, K, BK) R, RT = block_transpose(R, h_depth, BK, K, BK) self._kernel = tf.concat([W, R], 0) self._kernel_trans = tf.concat([WT, RT], 0) if ctxt: ctxt.Enter() else: self._kernel_trans = self._kernel self.built = True def call(self, inputs, state): """Long short-term 
memory cell (LSTM). Args: inputs: `2-D` tensor with shape `[batch_size, input_size]`. state: An `LSTMStateTuple` of state tensors, each shaped `[batch_size, num_units]`, if `state_is_tuple` has been set to `True`. Otherwise, a `Tensor` shaped `[batch_size, 2 * num_units]`. Returns: A pair containing the new hidden state, and the new state (either a `LSTMStateTuple` or a concatenated state, depending on `state_is_tuple`). """ if len(state) != 2: raise ValueError("Expecting state to be a tuple with length 2.") if False: #self._use_peephole: wci = self._w_i_diag wcf = self._w_f_diag wco = self._w_o_diag else: wci = wcf = wco = array_ops.zeros([self._num_units]) (cs_prev, h_prev) = state (_, cs, _, _, _, _, h) = xsmm_lstm.xsmm_lstm_cell( x=inputs, cs_prev=cs_prev, h_prev=h_prev, w=self._kernel, w_t=self._kernel_trans, wci=wci, wcf=wcf, wco=wco, b=self._bias, forget_bias=self._forget_bias, cell_clip=-1, use_peephole=False, w_in_kcck=self._w_in_kcck, name=self._name) new_state = rnn_cell_impl.LSTMStateTuple(cs, h) return h, new_state def get_config(self): config = { "num_units": self._num_units, "forget_bias": self._forget_bias, "state_is_tuple": self._state_is_tuple, "activation": activations.serialize(self._activation), "reuse": self._reuse, } base_config = super(XsmmLSTMCell, self).get_config() return dict(list(base_config.items()) + list(config.items())) libxsmm-1.17/samples/deeplearning/tf_lstm_ops/xsmm_lstm/test.py000066400000000000000000000057051415223013700251300ustar00rootroot00000000000000import tensorflow as tf from tensorflow.contrib import rnn import numpy as np import xsmm_lstm import sys from os import isatty GREEN='' RED='' BOLD='' ENDC='' if isatty(sys.stdout.fileno()): GREEN='\033[92m' RED ='\033[91m' BOLD='\033[1m' ENDC='\033[0m' def isclose(buf, ref, xmm): avg_ref = np.mean(ref) avg_abs_ref_orig = np.mean(np.absolute(ref)) avg_abs_ref = avg_abs_ref_orig if avg_abs_ref_orig != 0 else 0.1 avg_xmm = np.mean(xmm) avg_abs_xmm = np.mean(np.absolute(xmm)) if 
avg_abs_ref_orig == avg_abs_xmm == 0: return size = ref.size it = np.nditer([ref, xmm], flags=['multi_index']) count = 0 print_count = 0 max_print = 5 print_always = 1 for x, y in it: rdiff = abs(x - y) / avg_abs_ref diff = abs((x - y) / x) if x != 0 else rdiff if (diff > 1e-5 and rdiff > 1e-5) or print_count < print_always: if print_count < max_print: print(" %-10s %-10s: ref: %10s xmm: %10s diff: %9e" % (buf, it.multi_index, x, y, diff)) if diff > 1e-5: count += 1 print_count += 1 if count > 0: print("%s %sdoes NOT match%s, error count = %d (out of %d) AVG=%g ABSAVG=%g" % (buf, RED+BOLD, ENDC, count, size, avg_ref, avg_abs_ref_orig)) else: print("%s %sDOES match%s, size = %d AVG=%g ABSAVG=%g" % (buf, GREEN+BOLD, ENDC, size, avg_ref, avg_abs_ref_orig)) N=64 C=128 K=192 T=10 forget_bias=1.0 tf.set_random_seed(1) #x = tf.constant(-0.1, shape=[N,C], dtype = tf.float32) #x2 = tf.constant(0.1, shape=[N,C], dtype = tf.float32) x = tf.random_normal(shape=[N,C], dtype = tf.float32) #+ 0.5 x2 = tf.random_normal(shape=[N,C], dtype = tf.float32) #+ 0.5 lstm_cell_ref = rnn.LSTMBlockCell(K, forget_bias=forget_bias, name='test') #lstm_cell_ref = rnn.BasicLSTMCell(K, forget_bias=forget_bias, name='test') #lstm_cell = rnn.LSTMBlockCell(K, forget_bias=forget_bias, name='test', reuse=True) lstm_cell = xsmm_lstm.XsmmLSTMCell(K, forget_bias=forget_bias, name='test', reuse=True) init_state = lstm_cell_ref.zero_state(N, dtype=tf.float32) x_fused = tf.convert_to_tensor([x] + [x2 for _ in range(T-1)]) print("x_fused is: %s" % x_fused) outputs_ref, states_ref = tf.nn.dynamic_rnn(lstm_cell_ref, x_fused, dtype=tf.float32, initial_state=init_state, time_major=True) outputs, states = tf.nn.dynamic_rnn(lstm_cell, x_fused, dtype=tf.float32, initial_state=init_state, time_major=True) init = tf.global_variables_initializer() W = tf.global_variables()[0] B = tf.global_variables()[1] g_ref = tf.gradients(outputs_ref, [x_fused] + [W, B, init_state.c, init_state.h]) g = tf.gradients(outputs, 
[x_fused] + [W, B, init_state.c, init_state.h]) g_names = ["dx_fused"] + ["dW", "dB", "dcsp", "dhp"] #print(tf.get_default_graph().as_graph_def()) with tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=1)) as sess: sess.run(init) g_print, g_print_ref = sess.run([g,g_ref]) for t,t_ref, p, p_ref, name in zip(g, g_ref, g_print, g_print_ref, g_names): if t.name != t_ref.name: isclose("TEST: %-4s " % name + t.name, p_ref, p) libxsmm-1.17/samples/deeplearning/tf_lstm_ops/xsmm_lstm_kernels.cc000066400000000000000000001647231415223013700256370ustar00rootroot00000000000000#ifndef _OPENMP #define EIGEN_USE_THREADS #endif #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/core/blocking_counter.h" #include "tensorflow/core/lib/core/threadpool.h" #include "lstm_fwd.h" #include "lstm_bwd.h" #include #define gettid() syscall(SYS_gettid) typedef Eigen::ThreadPoolDevice CPUDevice; using namespace tensorflow; template class XsmmLSTMCellOp : public OpKernel { public: explicit XsmmLSTMCellOp(OpKernelConstruction* ctx) : OpKernel(ctx), cached_batch_size(-2), cached_input_size(-2), cached_cell_size(-2), xsmm_handle(nullptr), cached_num_threads(-1), w_in_kcck_(false) { OP_REQUIRES_OK(ctx, ctx->GetAttr("forget_bias", &forget_bias_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("cell_clip", &cell_clip_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("use_peephole", &use_peephole_)); if(XSMM_OP) OP_REQUIRES_OK(ctx, ctx->GetAttr("w_in_kcck", &w_in_kcck_)); OP_REQUIRES(ctx, use_peephole_ == false, errors::InvalidArgument("Peephole is not supported for XsmmLSTMCell")); printf("\nUsing XsmmLSTMCellFwd: forget_bias=%g\n", forget_bias_); } #if 0 bool UsesOmp() override { #ifdef _OPENMP return true; #else return false; #endif } #endif void Compute(OpKernelContext* ctx) override { const Tensor* x_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("x", &x_tensor)); const Tensor* cs_prev_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("cs_prev", &cs_prev_tensor)); const Tensor* 
h_prev_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("h_prev", &h_prev_tensor)); const Tensor* w_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor)); const Tensor* wci_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor)); const Tensor* wcf_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor)); const Tensor* wco_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor)); const Tensor* b_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor)); const int64 batch_size = x_tensor->dim_size(0); const int64 input_size = x_tensor->dim_size(1); const int64 cell_size = cs_prev_tensor->dim_size(1); // Sanity checks for our input shapes. OP_REQUIRES(ctx, cs_prev_tensor->dim_size(0) == batch_size, errors::InvalidArgument("cs_prev.dims(0) != batch_size: ", cs_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, cs_prev_tensor->dim_size(1) == cell_size, errors::InvalidArgument("cs_prev.dims(1) != cell_size: ", cs_prev_tensor->dim_size(1), " vs. ", cell_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(0) == batch_size, errors::InvalidArgument("h_prev.dims(0) != batch_size: ", h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, errors::InvalidArgument( "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), " vs. ", cell_size)); OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size, errors::InvalidArgument( "w.dim_size(0) != input_size + cell_size: ", w_tensor->dim_size(0), " vs. ", input_size + cell_size)); OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4, errors::InvalidArgument( "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1), " vs. ", cell_size * 4)); OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4, errors::InvalidArgument( "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0), " vs. ", cell_size * 4)); // Allocate our output tensors. 
Tensor* i_tensor = nullptr; //OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( // {"h_prev"}, "i", // TensorShape({batch_size, cell_size}), &i_tensor)); OP_REQUIRES_OK(ctx, ctx->allocate_output( "i", TensorShape({batch_size, cell_size}), &i_tensor)); Tensor* cs_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->allocate_output("cs", TensorShape({batch_size, cell_size}), &cs_tensor)); Tensor* f_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->allocate_output("f", TensorShape({batch_size, cell_size}), &f_tensor)); Tensor* o_tensor = nullptr; //OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( // {"cs_prev"}, "o", // TensorShape({batch_size, cell_size}), &o_tensor)); OP_REQUIRES_OK(ctx, ctx->allocate_output( "o", TensorShape({batch_size, cell_size}), &o_tensor)); Tensor* ci_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->allocate_output("ci", TensorShape({batch_size, cell_size}), &ci_tensor)); Tensor* co_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->allocate_output("co", TensorShape({batch_size, cell_size}), &co_tensor)); Tensor* h_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->allocate_output("h", TensorShape({batch_size, cell_size}), &h_tensor)); //auto h_prev_ptr = h_prev_tensor->flat(); //auto h_ptr = h_tensor->flat(); //h_ptr(0) = h_prev_ptr(0) + 0.1; //printf("\nXsmmLSTMCell: parameters N=%ld C=%ld K=%ld forget_bias=%g\n", batch_size, input_size, cell_size, forget_bias_); /* print_tensor_ptr("x", x_tensor); print_tensor_ptr("cs_prev", cs_prev_tensor); print_tensor_ptr("h_prev", h_prev_tensor); print_tensor_ptr("w", w_tensor); print_tensor_ptr("b", b_tensor); print_tensor_ptr("cs_prev_b4", cs_prev_tensor); //print_tensor_ptr("wci", wci_tensor); //print_tensor_ptr("wcf", wcf_tensor); //print_tensor_ptr("wco", wco_tensor); print_tensor_ptr("i", i_tensor); print_tensor_ptr("cs", cs_tensor); print_tensor_ptr("f", f_tensor); print_tensor_ptr("o", o_tensor); print_tensor_ptr("ci", ci_tensor); print_tensor_ptr("co", co_tensor); print_tensor_ptr("h", h_tensor); volatile 
int debug = 1; printf("Address of debug = %p tid = %ld \n", &debug, (long)gettid()); //while(debug == 1) { } */ int offset_r = input_size * cell_size * 4; const float *xt = get_tensor_ptr(x_tensor); const float *csp = get_tensor_ptr(cs_prev_tensor); const float *hp = get_tensor_ptr(h_prev_tensor); const float *w = get_tensor_ptr(w_tensor); const float *r = get_tensor_ptr(w_tensor)+offset_r; const float *b = get_tensor_ptr(b_tensor); float *cst = get_tensor_ptr(cs_tensor); float *ht = get_tensor_ptr(h_tensor); float *it = get_tensor_ptr(i_tensor); float *ft = get_tensor_ptr(f_tensor); float *ot = get_tensor_ptr(o_tensor); float *cit = get_tensor_ptr(ci_tensor); float *cot = get_tensor_ptr(co_tensor); #if 0 lstm_fwd(batch_size, input_size, cell_size, 1, forget_bias_, xt, csp, hp, w, r, b, cst, ht, it, ft, ot, cit, cot); #else #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else #ifndef DISABLE_EIGEN_THREADS const DeviceBase::CpuWorkerThreads* worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); int nThreads = worker_threads->num_threads; #else int nThreads = 1; /* number of threads */ #endif #endif if(xsmm_handle == nullptr || batch_size != cached_batch_size || input_size != cached_input_size || cell_size != cached_cell_size || cached_num_threads != nThreads) { if(xsmm_handle != nullptr) { //printf("Destroying existing libxsmm handle New NCK = (%d %d %d), old NCK = (%d %d %d)\n", batch_size, input_size, cell_size, cached_batch_size, cached_input_size, cached_cell_size); lstm_fwd_destroy( xsmm_handle ); xsmm_handle = nullptr; } //printf("Creating new libxsmm handle NCK = (%d %d %d) nThreads = %d\n", batch_size, input_size, cell_size, nThreads); xsmm_handle = lstm_fwd_create( batch_size, input_size, cell_size, 1, nThreads, forget_bias_, (w_in_kcck_ ? 
1 : 0), xt, csp, hp, w, r, b, cst, ht, it, ft, ot, cit, cot); cached_batch_size = batch_size; cached_input_size = input_size; cached_cell_size = cell_size; cached_num_threads = nThreads; OP_REQUIRES(ctx, xsmm_handle != nullptr, errors::InvalidArgument("lstm_fwd_create)_ returned null Xsmm handle")); } else { //printf("Reusing existing libxsmm handle\n"); } lstm_fwd_set_ptr( xsmm_handle, forget_bias_, 1, xt, csp, hp, w, r, b, cst, ht, it, ft, ot, cit, cot ); #if defined(_OPENMP) #pragma message "Using OPENMP Threading" #if 0 #pragma omp parallel { int tid = omp_get_thread_num(); //printf("TID %3d: executing lstm_fwd_execute_st OS tid = %6d\n", tid, gettid()); lstm_fwd_execute_st( xsmm_handle, tid ); } #else lstm_fwd_execute_omp( xsmm_handle ); #endif #else #pragma message "Using EIGEN Threading" #ifndef DISABLE_EIGEN_THREADS BlockingCounter count(cached_num_threads); for (int i = 0; i < cached_num_threads; ++i) { worker_threads->workers->Schedule([=, &count]() { lstm_fwd_execute_st( xsmm_handle, i ); count.DecrementCount(); }); } count.Wait(); #else #pragma message "NOT using threading" lstm_fwd_execute_st( xsmm_handle, 0 ); #endif #endif #endif /* print_tensor_ptr("x", x_tensor); print_tensor_ptr("cs_prev", cs_prev_tensor); print_tensor_ptr("h_prev", h_prev_tensor); print_tensor_ptr("w", w_tensor); print_tensor_ptr("b", b_tensor); print_tensor_ptr("i", i_tensor); print_tensor_ptr("cs", cs_tensor); print_tensor_ptr("f", f_tensor); print_tensor_ptr("o", o_tensor); print_tensor_ptr("ci", ci_tensor); print_tensor_ptr("co", co_tensor); print_tensor_ptr("h", h_tensor); */ } private: float forget_bias_; float cell_clip_; bool use_peephole_; bool w_in_kcck_; int cached_batch_size, cached_input_size, cached_cell_size, cached_num_threads; void *xsmm_handle; void print_tensor_ptr(const char *name, const Tensor* t) { auto ptr = t->flat(); const T* p = ptr.data(); int dims = t->dims(); if(dims > 0) printf(" XsmmLSTM: %-10s: [%d", name, t->dim_size(0)); else printf(" XsmmLSTM: 
%-10s: [", name); for(int i = 1; i < dims; i++) printf(", %d", t->dim_size(i)); printf("] @%p (%lld) %g\n", p, t->NumElements(), p[0]); //for(int i = 0; i < t->NumElements(); i++) // printf("DUMP: %-10s %6d %12g\n", name, i, p[i]); } T *get_tensor_ptr(Tensor* t) { return t->flat().data(); } const T *get_tensor_ptr(const Tensor* t) { return t->flat().data(); } }; #define REGISTER_KERNEL(T) \ REGISTER_KERNEL_BUILDER( \ Name("LSTMBlockCell").Device(DEVICE_CPU).TypeConstraint("T").Label("xsmm"), \ XsmmLSTMCellOp); REGISTER_KERNEL(float); //REGISTER_KERNEL(bfloat16); // REGISTER_KERNEL(double); #undef REGISTER_KERNEL #define REGISTER_KERNEL(T) \ REGISTER_KERNEL_BUILDER( \ Name("XsmmLSTMCell").Device(DEVICE_CPU).TypeConstraint("T"), \ XsmmLSTMCellOp); REGISTER_KERNEL(float); #undef REGISTER_KERNEL template class XsmmLSTMCellGradOp : public OpKernel { public: explicit XsmmLSTMCellGradOp(OpKernelConstruction* ctx) : OpKernel(ctx), cached_batch_size(-2), cached_input_size(-2), cached_cell_size(-2), xsmm_handle(nullptr), cached_num_threads(-1), w_in_kcck_(false) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_peephole", &use_peephole_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("w_in_kcck", &w_in_kcck_)); printf("\nUsing XsmmLSTMCellBwd\n"); } #if 0 explicit XsmmLSTMCellGradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_peephole", &use_peephole_)); printf("\nUsing XsmmLSTMCellBwd\n"); } #endif #if 0 bool UsesOmp() override { #ifdef _OPENMP return true; #else return false; #endif } #endif void Compute(OpKernelContext* ctx) override { const Tensor* x_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("x", &x_tensor)); const Tensor* cs_prev_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("cs_prev", &cs_prev_tensor)); const Tensor* h_prev_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("h_prev", &h_prev_tensor)); const Tensor* w_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor)); const Tensor* wT_tensor = nullptr; OP_REQUIRES_OK(ctx, 
ctx->input("w_t", &wT_tensor)); const Tensor* wci_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor)); const Tensor* wcf_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor)); const Tensor* wco_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor)); const Tensor* b_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor)); const Tensor* i_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("i", &i_tensor)); const Tensor* cs_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("cs", &cs_tensor)); const Tensor* f_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("f", &f_tensor)); const Tensor* o_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("o", &o_tensor)); const Tensor* ci_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("ci", &ci_tensor)); const Tensor* co_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("co", &co_tensor)); const Tensor* cs_grad_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("cs_grad", &cs_grad_tensor)); const Tensor* h_grad_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("h_grad", &h_grad_tensor)); const int64 batch_size = x_tensor->dim_size(0); const int64 input_size = x_tensor->dim_size(1); const int64 cell_size = cs_prev_tensor->dim_size(1); // Sanity checks for our input shapes. OP_REQUIRES(ctx, cs_prev_tensor->dim_size(0) == batch_size, errors::InvalidArgument("cs_prev.dims(0) != batch_size: ", cs_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, cs_prev_tensor->dim_size(1) == cell_size, errors::InvalidArgument("cs_prev.dims(1) != cell_size: ", cs_prev_tensor->dim_size(1), " vs. ", cell_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(0) == batch_size, errors::InvalidArgument("h_prev.dims(0) != batch_size: ", h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, errors::InvalidArgument( "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), " vs. 
", cell_size)); OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size, errors::InvalidArgument( "w.dim_size(0) != input_size + cell_size: ", w_tensor->dim_size(0), " vs. ", input_size + cell_size)); OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4, errors::InvalidArgument( "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1), " vs. ", cell_size * 4)); OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4, errors::InvalidArgument( "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0), " vs. ", cell_size * 4)); OP_REQUIRES(ctx, i_tensor->dim_size(0) == batch_size, errors::InvalidArgument( "i.dim_size(0) != batch_size: ", i_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, i_tensor->dim_size(1) == cell_size, errors::InvalidArgument( "i.dim_size(1) != cell_size: ", i_tensor->dim_size(1), " vs. ", cell_size)); OP_REQUIRES(ctx, cs_tensor->dim_size(0) == batch_size, errors::InvalidArgument( "cs.dim_size(0) != batch_size: ", cs_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, cs_tensor->dim_size(1) == cell_size, errors::InvalidArgument( "cs.dim_size(1) != cell_size: ", cs_tensor->dim_size(1), " vs. ", cell_size)); OP_REQUIRES(ctx, f_tensor->dim_size(0) == batch_size, errors::InvalidArgument( "f.dim_size(0) != batch_size: ", f_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, f_tensor->dim_size(1) == cell_size, errors::InvalidArgument( "i.dim_size(1) != cell_size: ", f_tensor->dim_size(1), " vs. ", cell_size)); OP_REQUIRES(ctx, o_tensor->dim_size(0) == batch_size, errors::InvalidArgument( "o.dim_size(0) != batch_size: ", o_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, o_tensor->dim_size(1) == cell_size, errors::InvalidArgument( "o.dim_size(1) != cell_size: ", o_tensor->dim_size(1), " vs. ", cell_size)); OP_REQUIRES(ctx, ci_tensor->dim_size(0) == batch_size, errors::InvalidArgument( "ci.dim_size(0) != batch_size: ", ci_tensor->dim_size(0), " vs. 
", batch_size)); OP_REQUIRES(ctx, ci_tensor->dim_size(1) == cell_size, errors::InvalidArgument( "ci.dim_size(1) != cell_size: ", ci_tensor->dim_size(1), " vs. ", cell_size)); OP_REQUIRES(ctx, co_tensor->dim_size(0) == batch_size, errors::InvalidArgument( "co.dim_size(0) != batch_size: ", co_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, co_tensor->dim_size(1) == cell_size, errors::InvalidArgument( "co.dim_size(1) != cell_size: ", co_tensor->dim_size(1), " vs. ", cell_size)); OP_REQUIRES(ctx, cs_grad_tensor->dim_size(0) == batch_size, errors::InvalidArgument( "cs_grad_tensor.dims(0) != batch_size: ", cs_grad_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, cs_grad_tensor->dim_size(1) == cell_size, errors::InvalidArgument("cs_grad_tensor.dims(1) != cell_size: ", cs_grad_tensor->dim_size(1), " vs. ", cell_size)); OP_REQUIRES(ctx, h_grad_tensor->dim_size(0) == batch_size, errors::InvalidArgument("h_grad_tensor.dims(0) != batch_size: ", h_grad_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_grad_tensor->dim_size(1) == cell_size, errors::InvalidArgument("h_grad_tensor.dims(1) != cell_size: ", h_grad_tensor->dim_size(1), " vs. ", cell_size)); // Allocate our output tensors. 
Tensor* cs_prev_grad_tensor = nullptr; /*OP_REQUIRES_OK( ctx, ctx->forward_input_or_allocate_output( {"cs_grad"}, "cs_prev_grad", TensorShape({batch_size, cell_size}), &cs_prev_grad_tensor)); */ OP_REQUIRES_OK( ctx, ctx->allocate_output( "cs_prev_grad", TensorShape({batch_size, cell_size}), &cs_prev_grad_tensor)); Tensor* h_prev_grad_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->allocate_output( "h_prev_grad", TensorShape({batch_size, cell_size}), &h_prev_grad_tensor)); Tensor* x_grad_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->allocate_output( "x_grad", TensorShape({batch_size, input_size}), &x_grad_tensor)); Tensor* w_grad_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->allocate_output( "w_grad", TensorShape({input_size+cell_size, 4*cell_size}), &w_grad_tensor)); Tensor* b_grad_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->allocate_output( "b_grad", TensorShape({4*cell_size}), &b_grad_tensor)); Tensor* wci_grad_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->forward_input_or_allocate_output( {"wci"}, "wci_grad", wci_tensor->shape(), &wci_grad_tensor)); Tensor* wcf_grad_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->forward_input_or_allocate_output( {"wcf"}, "wcf_grad", wcf_tensor->shape(), &wcf_grad_tensor)); Tensor* wco_grad_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->forward_input_or_allocate_output( {"wco"}, "wco_grad", wco_tensor->shape(), &wco_grad_tensor)); // Allocate our temp tensors. 
Tensor do_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), TensorShape({batch_size, cell_size}), &do_tensor)); Tensor dcs_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), TensorShape({batch_size, cell_size}), &dcs_tensor)); Tensor dci_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), TensorShape({batch_size, cell_size}), &dci_tensor)); Tensor df_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), TensorShape({batch_size, cell_size}), &df_tensor)); Tensor di_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), TensorShape({batch_size, cell_size}), &di_tensor)); const Device& device = ctx->eigen_device(); print_tensor_ptr("xt", x_tensor); print_tensor_ptr("csp", cs_prev_tensor); print_tensor_ptr("hp", h_prev_tensor); print_tensor_ptr("w", w_tensor); print_tensor_ptr("b", b_tensor); print_tensor_ptr("it", i_tensor); print_tensor_ptr("cst", cs_tensor); print_tensor_ptr("ft", f_tensor); print_tensor_ptr("ot", o_tensor); print_tensor_ptr("cit", ci_tensor); print_tensor_ptr("cot", co_tensor); print_tensor_ptr("dcst", cs_grad_tensor); print_tensor_ptr("dht", h_grad_tensor); print_tensor_ptr("dcsp", cs_prev_grad_tensor); print_tensor_ptr("dhp", h_prev_grad_tensor); print_tensor_ptr("dxt", x_grad_tensor); print_tensor_ptr("dw", w_grad_tensor); print_tensor_ptr("db", b_grad_tensor); //printf("w_tensor = %p, wT_tensor = %p\n", get_tensor_ptr(w_tensor), get_tensor_ptr(wT_tensor)); int w_in_trans = (get_tensor_ptr(w_tensor) != get_tensor_ptr(wT_tensor)); int offset_r = input_size * cell_size * 4; const float *xt = get_tensor_ptr(x_tensor); const float *csp = get_tensor_ptr(cs_prev_tensor); const float *hp = get_tensor_ptr(h_prev_tensor); const float *w = get_tensor_ptr(wT_tensor); const float *r = get_tensor_ptr(wT_tensor)+offset_r; const float *b = get_tensor_ptr(b_tensor); const float *it = get_tensor_ptr(i_tensor); const float *cst = get_tensor_ptr(cs_tensor); const float *ft = 
get_tensor_ptr(f_tensor); const float *ot = get_tensor_ptr(o_tensor); const float *cit = get_tensor_ptr(ci_tensor); const float *cot = get_tensor_ptr(co_tensor); const float *dcs = get_tensor_ptr(cs_grad_tensor); const float *dht = get_tensor_ptr(h_grad_tensor); float *dcspt = get_tensor_ptr(cs_prev_grad_tensor); float *dhpt = get_tensor_ptr(h_prev_grad_tensor); float *dxt = get_tensor_ptr(x_grad_tensor); float *dw = get_tensor_ptr(w_grad_tensor); float *dr = get_tensor_ptr(w_grad_tensor)+offset_r; float *db = get_tensor_ptr(b_grad_tensor); #if 0 lstm_bwd(batch_size, input_size, cell_size, 1, xt, csp, hp, 0, w, r, cst, it, ft, ot, cit, cot, dcs, dht, dxt, dcspt, dhpt, dw, dr, db ); #else #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else #ifndef DISABLE_EIGEN_THREADS const DeviceBase::CpuWorkerThreads* worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); int nThreads = worker_threads->num_threads; #else int nThreads = 1; /* number of threads */ #endif #endif if(xsmm_handle == nullptr || batch_size != cached_batch_size || input_size != cached_input_size || cell_size != cached_cell_size || cached_num_threads != nThreads) { if(xsmm_handle != nullptr) { //printf("Destroying existing libxsmm handle New NCK = (%d %d %d), old NCK = (%d %d %d)\n", batch_size, input_size, cell_size, cached_batch_size, cached_input_size, cached_cell_size); lstm_bwd_destroy( xsmm_handle ); xsmm_handle = nullptr; } //printf("Creating new libxsmm handle NCK = (%d %d %d) nThreads = %d\n", batch_size, input_size, cell_size, nThreads); xsmm_handle = lstm_bwd_create( batch_size, input_size, cell_size, 1, nThreads, (w_in_kcck_ ? 
1 : 0), w_in_trans, xt, csp, hp, 0, w, r, cst, it, ft, ot, cit, cot, dcs, dht, dxt, dcspt, dhpt, dw, dr, db ); cached_batch_size = batch_size; cached_input_size = input_size; cached_cell_size = cell_size; cached_num_threads = nThreads; OP_REQUIRES(ctx, xsmm_handle != nullptr, errors::InvalidArgument("lstm_bwd_create)_ returned null Xsmm handle")); } else { //printf("Reusing existing libxsmm handle\n"); } lstm_bwd_set_ptr( xsmm_handle, w_in_trans, 1, xt, csp, hp, 0, w, r, cst, it, ft, ot, cit, cot, dcs, dht, dxt, dcspt, dhpt, dw, dr, db ); #if defined(_OPENMP) #if 0 #pragma omp parallel { int tid = omp_get_thread_num(); lstm_bwd_execute_st( xsmm_handle, tid ); } #else lstm_bwd_execute_omp( xsmm_handle ); #endif #else #ifndef DISABLE_EIGEN_THREADS BlockingCounter count(cached_num_threads); for (int i = 0; i < cached_num_threads; ++i) { worker_threads->workers->Schedule([=, &count]() { lstm_bwd_execute_st( xsmm_handle, i ); count.DecrementCount(); }); } count.Wait(); #else lstm_bwd_execute_st( xsmm_handle, 0 ); #endif #endif #endif } protected: bool use_peephole_; bool w_in_kcck_; int cached_batch_size, cached_input_size, cached_cell_size, cached_num_threads; void *xsmm_handle; void print_tensor_ptr(const char *name, const Tensor* t) { return; auto ptr = t->flat(); const T* p = ptr.data(); int dims = t->dims(); if(dims > 0) printf(" XsmmLSTM: %-10s: [%d", name, t->dim_size(0)); else printf(" XsmmLSTM: %-10s: [", name); for(int i = 1; i < dims; i++) printf(", %d", t->dim_size(i)); printf("] @%p (%lld) %g\n", p, t->NumElements(), p[0]); //for(int i = 0; i < t->NumElements(); i++) // printf("DUMP: %-10s %6d %12g\n", name, i, p[i]); } T *get_tensor_ptr(Tensor* t) { return t->flat().data(); } const T *get_tensor_ptr(const Tensor* t) { return t->flat().data(); } }; #define REGISTER_KERNEL(T) \ REGISTER_KERNEL_BUILDER( \ Name("XsmmLSTMCellGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ XsmmLSTMCellGradOp); REGISTER_KERNEL(float); // REGISTER_KERNEL(double); #undef 
REGISTER_KERNEL template class XsmmFusedLSTMOp : public OpKernel { public: explicit XsmmFusedLSTMOp(OpKernelConstruction* ctx) : OpKernel(ctx), cached_batch_size(-2), cached_input_size(-2), cached_cell_size(-2), cached_timelen(-2), xsmm_handle(nullptr), cached_num_threads(-1) { OP_REQUIRES_OK(ctx, ctx->GetAttr("forget_bias", &forget_bias_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("cell_clip", &cell_clip_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("use_peephole", &use_peephole_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("use_residue", &use_residue_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("use_dropout", &use_dropout_)); OP_REQUIRES(ctx, use_peephole_ == false, errors::InvalidArgument("Peephole is not supported for XsmmLSTMCell")); printf("\nUsing XsmmFusedLSTMFwd: forget_bias=%g\n", forget_bias_); } #if 0 bool UsesOmp() override { #ifdef _OPENMP return true; #else return false; #endif } #endif void Compute(OpKernelContext* ctx) override { const Tensor* seq_len_max_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("seq_len_max", &seq_len_max_tensor)); const Tensor* x; OP_REQUIRES_OK(ctx, ctx->input("x", &x)); OP_REQUIRES(ctx, x->dims() == 3, errors::InvalidArgument("x must be 3D")); const int64 timelen = x->dim_size(0); const int64 batch_size = x->dim_size(1); const int64 input_size = x->dim_size(2); const Tensor* cs_prev_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("cs_prev", &cs_prev_tensor)); OP_REQUIRES(ctx, cs_prev_tensor->dims() == 2, errors::InvalidArgument("cs_prev must be 2D")); OP_REQUIRES(ctx, cs_prev_tensor->dim_size(0) == batch_size, errors::InvalidArgument("cs_prev.dims(0) != batch_size: ", cs_prev_tensor->dim_size(0), " vs. ", batch_size)); const int64 cell_size = cs_prev_tensor->dim_size(1); if (batch_size * input_size % 2 == 1) { LOG(WARNING) << "XsmmFusedLSTMOp is inefficient when both batch_size and " << "input_size are odd. 
You are using: batch_size=" << batch_size << ", input_size=" << input_size; } if (batch_size * cell_size % 2 == 1) { LOG(WARNING) << "XsmmFusedLSTMOp is inefficient when both batch_size and " << "cell_size are odd. You are using: batch_size=" << batch_size << ", cell_size=" << cell_size; } const Tensor* h_prev_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("h_prev", &h_prev_tensor)); OP_REQUIRES(ctx, h_prev_tensor->dims() == 2, errors::InvalidArgument("h_prev must be 2D")); OP_REQUIRES(ctx, h_prev_tensor->dim_size(0) == batch_size, errors::InvalidArgument("h_prev.dims(0) != batch_size: ", h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, errors::InvalidArgument( "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), " vs. ", cell_size)); const Tensor* w_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor)); OP_REQUIRES(ctx, w_tensor->dims() == 2, errors::InvalidArgument("w must be 2D")); OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size, errors::InvalidArgument( "w.dim_size(0) != input_size + cell_size: ", w_tensor->dim_size(0), " vs. ", input_size + cell_size)); OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4, errors::InvalidArgument( "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1), " vs. ", cell_size * 4)); const Tensor* wci_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor)); OP_REQUIRES(ctx, wci_tensor->dims() == 1, errors::InvalidArgument("wci must be 1D")); OP_REQUIRES(ctx, wci_tensor->dim_size(0) == cell_size, errors::InvalidArgument( "wci.dim_size(0) != cell_size: ", wci_tensor->dim_size(0), " vs. ", cell_size)); const Tensor* wcf_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor)); OP_REQUIRES(ctx, wcf_tensor->dims() == 1, errors::InvalidArgument("wcf must be 1D")); OP_REQUIRES(ctx, wcf_tensor->dim_size(0) == cell_size, errors::InvalidArgument( "wcf.dim_size(0) != cell_size: ", wcf_tensor->dim_size(0), " vs. 
", cell_size)); const Tensor* wco_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor)); OP_REQUIRES(ctx, wco_tensor->dims() == 1, errors::InvalidArgument("wco must be 1D")); OP_REQUIRES(ctx, wco_tensor->dim_size(0) == cell_size, errors::InvalidArgument( "wco.dim_size(0) != cell_size: ", wco_tensor->dim_size(0), " vs. ", cell_size)); const Tensor* b_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor)); OP_REQUIRES(ctx, b_tensor->dims() == 1, errors::InvalidArgument("b must be 1D")); OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4, errors::InvalidArgument( "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0), " vs. ", cell_size * 4)); TensorShape batch_cell_shape({timelen, batch_size, cell_size}); Tensor* i_out; OP_REQUIRES_OK(ctx, ctx->allocate_output("i", batch_cell_shape, &i_out)); Tensor* cs_out; OP_REQUIRES_OK(ctx, ctx->allocate_output("cs", batch_cell_shape, &cs_out)); Tensor* f_out; OP_REQUIRES_OK(ctx, ctx->allocate_output("f", batch_cell_shape, &f_out)); Tensor* o_out; OP_REQUIRES_OK(ctx, ctx->allocate_output("o", batch_cell_shape, &o_out)); Tensor* ci_out; OP_REQUIRES_OK(ctx, ctx->allocate_output("ci", batch_cell_shape, &ci_out)); Tensor* co_out; OP_REQUIRES_OK(ctx, ctx->allocate_output("co", batch_cell_shape, &co_out)); Tensor* h_out; OP_REQUIRES_OK(ctx, ctx->allocate_output("h", batch_cell_shape, &h_out)); //printf("Inside %s:%d %s()\n", __FILE__, __LINE__, __PRETTY_FUNCTION__); #if 0 //TMP tensors Tensor xh_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp( DataTypeToEnum::v(), TensorShape({batch_size, input_size + cell_size}), &xh_tensor)); Tensor icfo_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), TensorShape({batch_size, cell_size * 4}), &icfo_tensor)); #endif const Device& device = ctx->eigen_device(); const int64 seq_len_max = seq_len_max_tensor->scalar()(); int offset_r = input_size * cell_size * 4; const float *xt = get_tensor_ptr(x); const float *csp = get_tensor_ptr(cs_prev_tensor); 
const float *hp = get_tensor_ptr(h_prev_tensor); const float *w = get_tensor_ptr(w_tensor); const float *r = get_tensor_ptr(w_tensor)+offset_r; const float *b = get_tensor_ptr(b_tensor); float *cst = get_tensor_ptr(cs_out); float *ht = get_tensor_ptr(h_out); float *it = get_tensor_ptr(i_out); float *ft = get_tensor_ptr(f_out); float *ot = get_tensor_ptr(o_out); float *cit = get_tensor_ptr(ci_out); float *cot = get_tensor_ptr(co_out); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else #ifndef DISABLE_EIGEN_THREADS const DeviceBase::CpuWorkerThreads* worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); int nThreads = worker_threads->num_threads; #else int nThreads = 1; /* number of threads */ #endif #endif if(xsmm_handle == nullptr || batch_size != cached_batch_size || input_size != cached_input_size || cell_size != cached_cell_size || timelen > cached_timelen || cached_num_threads != nThreads) { if(xsmm_handle != nullptr) { //printf("Destroying existing libxsmm handle New NCKT = (%d %d %d %d), old NCKT = (%d %d %d %d) \n", batch_size, input_size, cell_size, timelen, cached_batch_size, cached_input_size, cached_cell_size, cached_timelen); lstm_fwd_destroy( xsmm_handle ); xsmm_handle = nullptr; } //printf("Creating new libxsmm handle NCKT = (%d %d %d %d) nThreads = %d\n", batch_size, input_size, cell_size, timelen, nThreads); xsmm_handle = lstm_fwd_create( batch_size, input_size, cell_size, timelen, nThreads, forget_bias_, 0, xt, csp, hp, w, r, b, cst, ht, it, ft, ot, cit, cot); cached_batch_size = batch_size; cached_input_size = input_size; cached_cell_size = cell_size; cached_timelen = timelen; cached_num_threads = nThreads; OP_REQUIRES(ctx, xsmm_handle != nullptr, errors::InvalidArgument("lstm_fwd_create)_ returned null Xsmm handle")); } else { //printf("Reusing existing libxsmm handle\n"); } lstm_fwd_set_ptr( xsmm_handle, forget_bias_, timelen, xt, csp, hp, w, r, b, cst, ht, it, ft, ot, cit, cot ); #if 
defined(_OPENMP) #if 0 #pragma omp parallel { int tid = omp_get_thread_num(); //printf("Thread %d calling lstm_fwd_execute_st OS TID = %d\n", tid, gettid()); lstm_fwd_execute_st( xsmm_handle, tid ); } #else lstm_fwd_execute_omp( xsmm_handle ); #endif #else #ifndef DISABLE_EIGEN_THREADS BlockingCounter count(cached_num_threads); for (int i = 0; i < cached_num_threads; ++i) { worker_threads->workers->Schedule([=, &count]() { lstm_fwd_execute_st( xsmm_handle, i ); count.DecrementCount(); }); } count.Wait(); #else lstm_fwd_execute_st( xsmm_handle, 0 ); #endif #endif #if 0 // Orig Code SliceHelper slicer(ctx); for (int64 t = 0; t < seq_len_max; ++t) { const Tensor x_tensor = slicer.InputSlice(*x, t, "x"); const Tensor& cs_prev_tensor2 = t == 0 ? *cs_prev_tensor : slicer.OutputSlice(cs_out, t - 1, "cs_prev"); const Tensor& h_prev_tensor2 = t == 0 ? *h_prev_tensor : slicer.OutputSlice(h_out, t - 1, "h_prev"); Tensor i_tensor = slicer.OutputSlice(i_out, t, "i_out"); Tensor cs_tensor = slicer.OutputSlice(cs_out, t, "cs_out"); Tensor f_tensor = slicer.OutputSlice(f_out, t, "f_out"); Tensor o_tensor = slicer.OutputSlice(o_out, t, "o_out"); Tensor ci_tensor = slicer.OutputSlice(ci_out, t, "ci_out"); Tensor co_tensor = slicer.OutputSlice(co_out, t, "co_out"); Tensor h_tensor = slicer.OutputSlice(h_out, t, "h_out"); functor::LSTMBlockCellFprop(batch_size, input_size, cell_size)( ctx, device, forget_bias_, cell_clip_, use_peephole_, x_tensor.matrix(), cs_prev_tensor2.matrix(), h_prev_tensor2.matrix(), w_tensor->matrix(), wci_tensor->vec(), wcf_tensor->vec(), wco_tensor->vec(), b_tensor->vec(), xh_tensor.matrix(), i_tensor.matrix(), cs_tensor.matrix(), f_tensor.matrix(), o_tensor.matrix(), ci_tensor.matrix(), co_tensor.matrix(), icfo_tensor.matrix(), h_tensor.matrix()); slicer.FinishTimeStep(); } if (seq_len_max < timelen) { Tensor cs_tensor = cs_out->Slice(seq_len_max, timelen); Tensor h_tensor = h_out->Slice(seq_len_max, timelen); functor::TensorUnalignedZero()(device, 
cs_tensor.unaligned_flat()); functor::TensorUnalignedZero()(device, h_tensor.unaligned_flat()); } #endif } private: float forget_bias_; float cell_clip_; bool use_peephole_; bool use_residue_; bool use_dropout_; int cached_batch_size, cached_input_size, cached_cell_size, cached_timelen, cached_num_threads; void *xsmm_handle; void print_tensor_ptr(const char *name, const Tensor* t) { auto ptr = t->flat(); const T* p = ptr.data(); int dims = t->dims(); if(dims > 0) printf(" XsmmLSTM: %-10s: [%d", name, t->dim_size(0)); else printf(" XsmmLSTM: %-10s: [", name); for(int i = 1; i < dims; i++) printf(", %d", t->dim_size(i)); printf("] @%p (%lld) %g\n", p, t->NumElements(), p[0]); //for(int i = 0; i < t->NumElements(); i++) // printf("DUMP: %-10s %6d %12g\n", name, i, p[i]); } T *get_tensor_ptr(Tensor* t) { return t->flat().data(); } const T *get_tensor_ptr(const Tensor* t) { return t->flat().data(); } }; #define REGISTER_KERNEL(T) \ REGISTER_KERNEL_BUILDER( \ Name("XsmmFusedLSTM").Device(DEVICE_CPU).TypeConstraint("T"), \ XsmmFusedLSTMOp); REGISTER_KERNEL(float); // REGISTER_KERNEL(double); #undef REGISTER_KERNEL template class XsmmFusedLSTMGradOp : public OpKernel { public: explicit XsmmFusedLSTMGradOp(OpKernelConstruction* ctx) : OpKernel(ctx), cached_batch_size(-2), cached_input_size(-2), cached_cell_size(-2), cached_timelen(-2), xsmm_handle(nullptr), cached_num_threads(-1) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_peephole", &use_peephole_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("use_residue", &use_residue_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("use_dropout", &use_dropout_)); OP_REQUIRES(ctx, use_peephole_ == false, errors::InvalidArgument("Peephole is not supported for XsmmLSTMCell")); printf("\nUsing XsmmFusedLSTMBwd:\n"); } #if 0 bool UsesOmp() override { #ifdef _OPENMP return true; #else return false; #endif } #endif void Compute(OpKernelContext* ctx) override { const Tensor* seq_len_max_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("seq_len_max", 
&seq_len_max_tensor)); const Tensor* x; OP_REQUIRES_OK(ctx, ctx->input("x", &x)); OP_REQUIRES(ctx, x->dims() == 3, errors::InvalidArgument("x must be 3D")); const int64 timelen = x->dim_size(0); const int64 batch_size = x->dim_size(1); const int64 input_size = x->dim_size(2); const Tensor* cs_prev_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("cs_prev", &cs_prev_tensor)); const Tensor* h_prev_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("h_prev", &h_prev_tensor)); const Tensor* w_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor)); const int64 cell_size = w_tensor->dim_size(1) / 4; OP_REQUIRES(ctx, input_size + cell_size == w_tensor->dim_size(0), errors::InvalidArgument( "w matrix rows don't match: ", input_size + cell_size, " vs. ", w_tensor->dim_size(0))); const Tensor* wci_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor)); const Tensor* wcf_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor)); const Tensor* wco_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor)); const Tensor* b_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor)); OP_REQUIRES( ctx, cell_size == b_tensor->dim_size(0) / 4, errors::InvalidArgument("w and b cell_size don't match: ", cell_size, " vs. 
", b_tensor->dim_size(0))); const Tensor* i_out = nullptr; OP_REQUIRES_OK(ctx, ctx->input("i", &i_out)); const Tensor* cs_out = nullptr; OP_REQUIRES_OK(ctx, ctx->input("cs", &cs_out)); const Tensor* f_out = nullptr; OP_REQUIRES_OK(ctx, ctx->input("f", &f_out)); const Tensor* o_out = nullptr; OP_REQUIRES_OK(ctx, ctx->input("o", &o_out)); const Tensor* ci_out = nullptr; OP_REQUIRES_OK(ctx, ctx->input("ci", &ci_out)); const Tensor* co_out = nullptr; OP_REQUIRES_OK(ctx, ctx->input("co", &co_out)); const Tensor* h_out = nullptr; OP_REQUIRES_OK(ctx, ctx->input("h", &h_out)); const Tensor* cs_grad = nullptr; OP_REQUIRES_OK(ctx, ctx->input("cs_grad", &cs_grad)); const Tensor* h_grad = nullptr; OP_REQUIRES_OK(ctx, ctx->input("h_grad", &h_grad)); TensorShape batch_input_shape({timelen, batch_size, input_size}); Tensor* x_grad; OP_REQUIRES_OK(ctx, ctx->allocate_output("x_grad", batch_input_shape, &x_grad)); Tensor* cs_prev_grad_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output("cs_prev_grad", cs_prev_tensor->shape(), &cs_prev_grad_tensor)); Tensor* h_prev_grad_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output("h_prev_grad", h_prev_tensor->shape(), &h_prev_grad_tensor)); Tensor* w_grad_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->allocate_output("w_grad", w_tensor->shape(), &w_grad_tensor)); Tensor* wci_grad_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output("wci_grad", wci_tensor->shape(), &wci_grad_tensor)); Tensor* wcf_grad_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output("wcf_grad", wcf_tensor->shape(), &wcf_grad_tensor)); Tensor* wco_grad_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output("wco_grad", wco_tensor->shape(), &wco_grad_tensor)); Tensor* b_grad_tensor = nullptr; OP_REQUIRES_OK( ctx, ctx->allocate_output("b_grad", b_tensor->shape(), &b_grad_tensor)); TensorShape batch_cell_shape({batch_size, cell_size}); //printf("Inside %s:%d %s()\n", __FILE__, __LINE__, __PRETTY_FUNCTION__); #if 0 // TMP tensors Tensor xh_tensor; 
OP_REQUIRES_OK(ctx, ctx->allocate_temp( DataTypeToEnum::v(), TensorShape({batch_size, input_size + cell_size}), &xh_tensor)); Tensor xh_grad_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), xh_tensor.shape(), &xh_grad_tensor)); Tensor do_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), batch_cell_shape, &do_tensor)); Tensor dcs_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), batch_cell_shape, &dcs_tensor)); Tensor dci_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), batch_cell_shape, &dci_tensor)); Tensor df_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), batch_cell_shape, &df_tensor)); Tensor di_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), batch_cell_shape, &di_tensor)); Tensor dicfo_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), TensorShape({batch_size, cell_size * 4}), &dicfo_tensor)); Tensor cs_grad_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), batch_cell_shape, &cs_grad_tensor)); Tensor h_grad_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), batch_cell_shape, &h_grad_tensor)); #endif const Device& device = ctx->eigen_device(); #if 0 // Orig Impl functor::TensorZero()(device, cs_grad_tensor.flat()); functor::TensorZero()(device, cs_prev_grad_tensor->flat()); functor::TensorZero()(device, h_grad_tensor.flat()); functor::TensorZero()(device, h_prev_grad_tensor->flat()); functor::TensorZero()(device, w_grad_tensor->flat()); functor::TensorZero()(device, wci_grad_tensor->flat()); functor::TensorZero()(device, wcf_grad_tensor->flat()); functor::TensorZero()(device, wco_grad_tensor->flat()); functor::TensorZero()(device, b_grad_tensor->flat()); #endif const int64 seq_len_max = seq_len_max_tensor->scalar()(); int offset_r = input_size * cell_size * 4; const float *xt = get_tensor_ptr(x); const float *csp = get_tensor_ptr(cs_prev_tensor); const float *hp = get_tensor_ptr(h_prev_tensor); 
const float *ht = get_tensor_ptr(h_out); const float *w = get_tensor_ptr(w_tensor); const float *r = get_tensor_ptr(w_tensor)+offset_r; const float *b = get_tensor_ptr(b_tensor); const float *it = get_tensor_ptr(i_out); const float *cst = get_tensor_ptr(cs_out); const float *ft = get_tensor_ptr(f_out); const float *ot = get_tensor_ptr(o_out); const float *cit = get_tensor_ptr(ci_out); const float *cot = get_tensor_ptr(co_out); const float *dcs = get_tensor_ptr(cs_grad); const float *dht = get_tensor_ptr(h_grad); float *dcspt = get_tensor_ptr(cs_prev_grad_tensor); float *dhpt = get_tensor_ptr(h_prev_grad_tensor); float *dxt = get_tensor_ptr(x_grad); float *dw = get_tensor_ptr(w_grad_tensor); float *dr = get_tensor_ptr(w_grad_tensor)+offset_r; float *db = get_tensor_ptr(b_grad_tensor); #if defined(_OPENMP) int nThreads = omp_get_max_threads(); /* number of threads */ #else #ifndef DISABLE_EIGEN_THREADS const DeviceBase::CpuWorkerThreads* worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); int nThreads = worker_threads->num_threads; #else int nThreads = 1; /* number of threads */ #endif #endif if(xsmm_handle == nullptr || batch_size != cached_batch_size || input_size != cached_input_size || cell_size != cached_cell_size || timelen > cached_timelen || cached_num_threads != nThreads) { if(xsmm_handle != nullptr) { //printf("Destroying existing libxsmm handle New NCKT = (%d %d %d %d), old NCKT = (%d %d %d %d) \n", batch_size, input_size, cell_size, timelen, cached_batch_size, cached_input_size, cached_cell_size, cached_timelen); lstm_bwd_destroy( xsmm_handle ); xsmm_handle = nullptr; } //printf("Creating new libxsmm handle NCKT = (%d %d %d %d) nThreads = %d\n", batch_size, input_size, cell_size, timelen, nThreads); xsmm_handle = lstm_bwd_create( batch_size, input_size, cell_size, timelen, nThreads, 0, 0, xt, csp, hp, ht, w, r, cst, it, ft, ot, cit, cot, dcs, dht, dxt, dcspt, dhpt, dw, dr, db ); cached_batch_size = batch_size; cached_input_size = input_size; 
cached_cell_size = cell_size; cached_timelen = timelen; cached_num_threads = nThreads; OP_REQUIRES(ctx, xsmm_handle != nullptr, errors::InvalidArgument("lstm_bwd_create)_ returned null Xsmm handle")); } else { //printf("Reusing existing libxsmm handle\n"); } lstm_bwd_set_ptr( xsmm_handle, 0, timelen, xt, csp, hp, ht, w, r, cst, it, ft, ot, cit, cot, dcs, dht, dxt, dcspt, dhpt, dw, dr, db ); #if defined(_OPENMP) #if 0 #pragma omp parallel { int tid = omp_get_thread_num(); //printf("Thread %d calling lstm_bwd_execute_st OS TID = %d\n", tid, gettid()); lstm_bwd_execute_st( xsmm_handle, tid ); } #else lstm_bwd_execute_omp( xsmm_handle ); #endif #else #ifndef DISABLE_EIGEN_THREADS BlockingCounter count(cached_num_threads); for (int i = 0; i < cached_num_threads; ++i) { worker_threads->workers->Schedule([=, &count]() { lstm_bwd_execute_st( xsmm_handle, i ); count.DecrementCount(); }); } count.Wait(); #else lstm_bwd_execute_st( xsmm_handle, 0 ); #endif #endif #if 0 // Orig Impl SliceHelper slicer(ctx); for (int64 t = seq_len_max - 1; t >= 0; --t) { const Tensor& x_tensor = slicer.InputSlice(*x, t, "x"); const Tensor& cs_prev_tensor2 = t == 0 ? *cs_prev_tensor : slicer.InputSlice(*cs_out, t - 1, "cs_prev"); const Tensor& h_prev_tensor2 = t == 0 ? *h_prev_tensor : slicer.InputSlice(*h_out, t - 1, "h_prev"); const Tensor& i_tensor = slicer.InputSlice(*i_out, t, "i_out"); const Tensor& cs_tensor = slicer.InputSlice(*cs_out, t, "cs_out"); const Tensor& f_tensor = slicer.InputSlice(*f_out, t, "f_out"); const Tensor& o_tensor = slicer.InputSlice(*o_out, t, "o_out"); const Tensor& ci_tensor = slicer.InputSlice(*ci_out, t, "ci_out"); const Tensor& co_tensor = slicer.InputSlice(*co_out, t, "co_out"); // Grab previous CS grad. 
const Tensor& const_cs_prev_grad_tensor = *cs_prev_grad_tensor; const Tensor const_cs_grad_slice = slicer.InputSlice(*cs_grad, t, "cs_grad"); functor::TensorAdd()( device, const_cs_prev_grad_tensor.flat(), const_cs_grad_slice.flat(), cs_grad_tensor.flat()); // Combine previous h grad and h grad coming on top. const Tensor& const_h_prev_grad_tensor = *h_prev_grad_tensor; const Tensor const_h_grad_slice = slicer.InputSlice(*h_grad, t, "h_grad"); functor::TensorAdd()( device, const_h_prev_grad_tensor.flat(), const_h_grad_slice.flat(), h_grad_tensor.flat()); const Tensor& const_cs_grad_tensor = cs_grad_tensor; const Tensor& const_h_grad_tensor = h_grad_tensor; Tensor x_grad_tensor = slicer.OutputSlice(x_grad, t, "x_grad"); functor::BlockLSTMBprop(batch_size, input_size, cell_size)( ctx, device, use_peephole_, x_tensor.matrix(), cs_prev_tensor2.matrix(), h_prev_tensor2.matrix(), w_tensor->matrix(), wci_tensor->vec(), wcf_tensor->vec(), wco_tensor->vec(), b_tensor->vec(), xh_tensor.matrix(), i_tensor.matrix(), cs_tensor.matrix(), f_tensor.matrix(), o_tensor.matrix(), ci_tensor.matrix(), co_tensor.matrix(), const_cs_grad_tensor.matrix(), const_h_grad_tensor.matrix(), do_tensor.matrix(), dcs_tensor.matrix(), dci_tensor.matrix(), df_tensor.matrix(), di_tensor.matrix(), dicfo_tensor.matrix(), cs_prev_grad_tensor->matrix(), h_prev_grad_tensor->matrix(), xh_grad_tensor.matrix(), x_grad_tensor.matrix(), w_grad_tensor->matrix(), wci_grad_tensor->vec(), wcf_grad_tensor->vec(), wco_grad_tensor->vec(), b_grad_tensor->vec()); slicer.FinishTimeStep(); } if (seq_len_max < timelen) { Tensor x_grad_tensor = x_grad->Slice(seq_len_max, timelen); functor::TensorUnalignedZero()( device, x_grad_tensor.unaligned_flat()); } #endif } private: bool use_peephole_; bool use_residue_; bool use_dropout_; int cached_batch_size, cached_input_size, cached_cell_size, cached_timelen, cached_num_threads; void *xsmm_handle; void print_tensor_ptr(const char *name, const Tensor* t) { auto ptr = t->flat(); 
const T* p = ptr.data(); int dims = t->dims(); if(dims > 0) printf(" XsmmLSTM: %-10s: [%d", name, t->dim_size(0)); else printf(" XsmmLSTM: %-10s: [", name); for(int i = 1; i < dims; i++) printf(", %d", t->dim_size(i)); printf("] @%p (%lld) %g\n", p, t->NumElements(), p[0]); //for(int i = 0; i < t->NumElements(); i++) // printf("DUMP: %-10s %6d %12g\n", name, i, p[i]); } T *get_tensor_ptr(Tensor* t) { return t->flat().data(); } const T *get_tensor_ptr(const Tensor* t) { return t->flat().data(); } }; #define REGISTER_KERNEL(T) \ REGISTER_KERNEL_BUILDER( \ Name("XsmmFusedLSTMGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ XsmmFusedLSTMGradOp); REGISTER_KERNEL(float); // REGISTER_KERNEL(double); #undef REGISTER_KERNEL libxsmm-1.17/samples/deeplearning/tf_lstm_ops/xsmm_lstm_ops.cc000066400000000000000000000263211415223013700247640ustar00rootroot00000000000000#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" using namespace tensorflow; using shape_inference::DimensionHandle; using shape_inference::InferenceContext; using shape_inference::ShapeHandle; REGISTER_OP("XsmmLSTMCell") .Input("x: T") .Input("cs_prev: T") .Input("h_prev: T") .Input("w: T") .Input("w_t: T") .Input("wci: T") .Input("wcf: T") .Input("wco: T") .Input("b: T") .Output("i: T") .Output("cs: T") .Output("f: T") .Output("o: T") .Output("ci: T") .Output("co: T") .Output("h: T") .Attr("forget_bias: float = 1.0") .Attr("cell_clip: float = 3.0") .Attr("use_peephole: bool = false") .Attr("w_in_kcck: bool = false") .Attr("T: {half, float}") .SetShapeFn([](InferenceContext* c) { ShapeHandle x, cs_prev; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &x)); TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &cs_prev)); DimensionHandle batch_size = c->Dim(x, 0); DimensionHandle cell_size = c->Dim(cs_prev, 1); ShapeHandle output = c->Matrix(batch_size, cell_size); for (int i = 0; i < 7; ++i) { c->set_output(i, output); } return tensorflow::Status::OK(); }) .Doc(R"doc( 
Computes the LSTM cell forward propagation for 1 time step. This implementation uses 1 weight matrix and 1 bias vector, and there's an optional peephole connection. This kernel op implements the following mathematical equations: ```python xh = [x, h_prev] [i, f, ci, o] = xh * w + b f = f + forget_bias if not use_peephole: wci = wcf = wco = 0 i = sigmoid(cs_prev * wci + i) f = sigmoid(cs_prev * wcf + f) ci = tanh(ci) cs = ci .* i + cs_prev .* f cs = clip(cs, cell_clip) o = sigmoid(cs * wco + o) co = tanh(cs) h = co .* o ``` cell_clip: Value to clip the 'cs' value to. use_peephole: Whether to use peephole weights. forget_bias: The forget gate bias. x: The input to the LSTM cell, shape (batch_size, num_inputs). cs_prev: Value of the cell state at previous time step. h_prev: Output of the previous cell at previous time step. w: The weight matrix. wci: The weight matrix for input gate peephole connection. wcf: The weight matrix for forget gate peephole connection. wco: The weight matrix for output gate peephole connection. b: The bias vector. i: The input gate. cs: The cell state before the tanh. f: The forget gate. o: The output gate. ci: The cell input. co: The cell after the tanh. h: The output h vector. 
)doc"); REGISTER_OP("XsmmLSTMCellGrad") .Input("x: T") .Input("cs_prev: T") .Input("h_prev: T") .Input("w: T") .Input("w_t: T") .Input("wci: T") .Input("wcf: T") .Input("wco: T") .Input("b: T") .Input("i: T") .Input("cs: T") .Input("f: T") .Input("o: T") .Input("ci: T") .Input("co: T") .Input("cs_grad: T") .Input("h_grad: T") .Output("cs_prev_grad: T") .Output("h_prev_grad: T") .Output("x_grad: T") .Output("w_grad: T") .Output("b_grad: T") .Output("wci_grad: T") .Output("wcf_grad: T") .Output("wco_grad: T") .Attr("use_peephole: bool") .Attr("w_in_kcck: bool = false") .Attr("T: {float}") .SetShapeFn([](InferenceContext* c) { ShapeHandle x, cs_prev; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &x)); TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &cs_prev)); DimensionHandle batch_size = c->Dim(x, 0); DimensionHandle input_size = c->Dim(x, 1); DimensionHandle cell_size = c->Dim(cs_prev, 1); DimensionHandle cell_size_times_4; DimensionHandle input_plus_cell_size; TF_RETURN_IF_ERROR(c->Multiply(cell_size, 4, &cell_size_times_4)); TF_RETURN_IF_ERROR(c->Add(input_size, cell_size, &input_plus_cell_size)); ShapeHandle cell_size_vec = c->Vector(cell_size); c->set_output(0, c->Matrix(batch_size, cell_size)); c->set_output(1, c->Matrix(batch_size, cell_size)); c->set_output(2, c->Matrix(batch_size, input_size)); c->set_output(3, c->Matrix(input_plus_cell_size, cell_size_times_4)); c->set_output(4, c->Vector(cell_size_times_4)); c->set_output(5, cell_size_vec); c->set_output(6, cell_size_vec); c->set_output(7, cell_size_vec); return tensorflow::Status::OK(); }) .Doc(R"doc( Computes the LSTM cell backward propagation for 1 timestep. This implementation is to be used in conjunction of LSTMBlockCell. use_peephole: Whether the cell uses peephole connections. x: The input to the LSTM cell, shape (batch_size, num_inputs). cs_prev: The previous cell state. h_prev: The previous h state. w: The weight matrix. wci: The weight matrix for input gate peephole connection. 
wcf: The weight matrix for forget gate peephole connection. wco: The weight matrix for output gate peephole connection. b: The bias vector. i: The input gate. cs: The cell state before the tanh. f: The forget gate. o: The output gate. ci: The cell input. co: The cell after the tanh. cs_grad: The current gradient of cs. h_grad: The gradient of h vector. cs_prev_grad: The gradient of cs to be back-propped. h_prev_grad: The gradient of h to be back-propped. x_grad: The gradient of x to be back-propped. w_grad: The gradient of w to be back-propped. b_grad: The gradient of b to be back-propped. wci_grad: The gradient for wci to be back-propped. wcf_grad: The gradient for wcf to be back-propped. wco_grad: The gradient for wco to be back-propped. )doc"); REGISTER_OP("XsmmFusedLSTM") .Input("seq_len_max: int64") .Input("x: T") .Input("cs_prev: T") .Input("h_prev: T") .Input("w: T") .Input("wci: T") .Input("wcf: T") .Input("wco: T") .Input("b: T") .Output("i: T") .Output("cs: T") .Output("f: T") .Output("o: T") .Output("ci: T") .Output("co: T") .Output("h: T") .Attr("forget_bias: float = 1.0") .Attr("cell_clip: float = 3.0") .Attr("use_peephole: bool = false") .Attr("use_residue: bool = false") .Attr("use_dropout: bool = false") .Attr("T: {half, float}") .SetShapeFn([](InferenceContext* c) { ShapeHandle x, b; TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &x)); TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 1, &b)); DimensionHandle timelen = c->Dim(x, 0); DimensionHandle batch_size = c->Dim(x, 1); DimensionHandle cell_size; TF_RETURN_IF_ERROR( c->Divide(c->Dim(b, 0), 4, true /* evenly_divisible */, &cell_size)); DCHECK_EQ(7, c->num_outputs()); ShapeHandle output = c->MakeShape({timelen, batch_size, cell_size}); for (int i = 0; i < 7; ++i) { c->set_output(i, output); } return Status::OK(); }) .Doc(R"doc( Computes the LSTM cell forward propagation for all the time steps. 
This is equivalent to applying LSTMBlockCell in a loop, like so: ```python for x1 in unpack(x): i1, cs1, f1, o1, ci1, co1, h1 = LSTMBlock( x1, cs_prev, h_prev, w, wci, wcf, wco, b) cs_prev = cs1 h_prev = h1 i.append(i1) cs.append(cs1) f.append(f1) o.append(o1) ci.append(ci1) co.append(co1) h.append(h1) return pack(i), pack(cs), pack(f), pack(o), pack(ci), pack(ch), pack(h) ``` cell_clip: Value to clip the 'cs' value to. use_peephole: Whether to use peephole weights. forget_bias: The forget gate bias. seq_len_max: Maximum time length actually used by this input. Outputs are padded with zeros beyond this length. x: The sequence input to the LSTM, shape (timelen, batch_size, num_inputs). cs_prev: Value of the initial cell state. h_prev: Initial output of cell (to be used for peephole). w: The weight matrix. wci: The weight matrix for input gate peephole connection. wcf: The weight matrix for forget gate peephole connection. wco: The weight matrix for output gate peephole connection. b: The bias vector. i: The input gate over the whole time sequence. cs: The cell state before the tanh over the whole time sequence. f: The forget gate over the whole time sequence. o: The output gate over the whole time sequence. ci: The cell input over the whole time sequence. co: The cell after the tanh over the whole time sequence. h: The output h vector over the whole time sequence. 
)doc"); REGISTER_OP("XsmmFusedLSTMGrad") .Input("seq_len_max: int64") .Input("x: T") .Input("cs_prev: T") .Input("h_prev: T") .Input("w: T") .Input("wci: T") .Input("wcf: T") .Input("wco: T") .Input("b: T") .Input("i: T") .Input("cs: T") .Input("f: T") .Input("o: T") .Input("ci: T") .Input("co: T") .Input("h: T") .Input("cs_grad: T") .Input("h_grad: T") .Output("x_grad: T") .Output("cs_prev_grad: T") .Output("h_prev_grad: T") .Output("w_grad: T") .Output("wci_grad: T") .Output("wcf_grad: T") .Output("wco_grad: T") .Output("b_grad: T") .Attr("use_peephole: bool") .Attr("use_residue: bool") .Attr("use_dropout: bool") .Attr("T: {half, float}") .SetShapeFn([](InferenceContext* c) { ShapeHandle x, cs_prev, h_prev, w, wci, wco, wcf, b; TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &x)); TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &cs_prev)); TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &h_prev)); TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &w)); TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 1, &wci)); TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 1, &wco)); TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 1, &wcf)); TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 1, &b)); c->set_output(0, x); c->set_output(1, cs_prev); c->set_output(2, h_prev); c->set_output(3, w); c->set_output(4, wci); c->set_output(5, wco); c->set_output(6, wcf); c->set_output(7, b); return Status::OK(); }) .Doc(R"doc( Computes the LSTM cell backward propagation for the entire time sequence. This implementation is to be used in conjunction of LSTMBlock. use_peephole: Whether to use peephole weights. seq_len_max: Maximum time length actually used by this input. Outputs are padded with zeros beyond this length. x: The sequence input to the LSTM, shape (timelen, batch_size, num_inputs). cs_prev: Value of the initial cell state. h_prev: Initial output of cell (to be used for peephole). w: The weight matrix. wci: The weight matrix for input gate peephole connection. 
wcf: The weight matrix for forget gate peephole connection. wco: The weight matrix for output gate peephole connection. b: The bias vector. i: The input gate over the whole time sequence. cs: The cell state before the tanh over the whole time sequence. f: The forget gate over the whole time sequence. o: The output gate over the whole time sequence. ci: The cell input over the whole time sequence. co: The cell after the tanh over the whole time sequence. h: The output h vector over the whole time sequence. cs_grad: The current gradient of cs. h_grad: The gradient of h vector. x_grad: The gradient of x to be back-propped. cs_prev_grad: The gradient of cs_prev to be back-propped. h_prev_grad: The gradient of h_prev to be back-propped. w_grad: The gradient for w to be back-propped. wci_grad: The gradient for wci to be back-propped. wcf_grad: The gradient for wcf to be back-propped. wco_grad: The gradient for wco to be back-propped. b_grad: The gradient for w to be back-propped. )doc"); libxsmm-1.17/samples/deeplearning/tvm_cnnlayer/000077500000000000000000000000001415223013700217155ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/tvm_cnnlayer/README000066400000000000000000000021421415223013700225740ustar00rootroot00000000000000 *Please install TVM by fetching the following commit id from the master branch * Commit ID: 5a27632e274fff57087ed0b6eb2856b6e5946cfb * Please follow the instuctions in ./libxsmm_wrapper/README to install the libxsmm implementation and wrapper * Run the script 'mb1_tuned_latest.py' as follows * $LD_PRELOAD=./libxsmm_wrapper/libxsmm_wrapper.so python -u mb1_tuned_latest.py -d * layer_name can be any layer from resnet2,resnet3, ..., resnet20 * These layers are from resnet-50 with minibatch size =1 * Eg. $LD_PRELOAD=./libxsmm_wrapper/libxsmm_wrapper.so python -u mb1_tuned_latest.py -d resnet3 * A sample slurm job script 'resnet3.slurm' is given to run on a cluster. 
Please run using: $sbatch resnet3.slurm * Tuning can be paused, by commenting out the tuning function in the script and just measuring the best parameters * contained in log file. The place to comment out is line no. 406 in mb1_tuned_latest.py * Result containing the best performance is recorded in a generated excel sheet. libxsmm-1.17/samples/deeplearning/tvm_cnnlayer/libxsmm_wrapper/000077500000000000000000000000001415223013700251305ustar00rootroot00000000000000libxsmm-1.17/samples/deeplearning/tvm_cnnlayer/libxsmm_wrapper/Makefile000066400000000000000000000007231415223013700265720ustar00rootroot00000000000000CC= icpc CFLAGS= -O3 -fPIC -std=c++11 -fopenmp LDFLAGS= -shared SOURCES = batch_reduce_plus_init.cc LIBXSMMDIR=./../../../../ INC=-I$(LIBXSMMDIR)/include LIBS = $(LIBXSMMDIR)/lib/libxsmm.a $(LIBXSMMDIR)/lib/libxsmmext.a \ $(LIBXSMMDIR)/lib/libxsmmnoblas.a $(LIBXSMMDIR)/lib/libxsmmgen.a \ $(LIBXSMMDIR)/lib/libxsmmf.a TARGET= libxsmm_wrapper.so all: $(CC) $(INC) $(CFLAGS) -fPIC $(SOURCES) $(LIBS) -o $(TARGET) $(LDFLAGS) clean: rm -f $(TARGET) libxsmm-1.17/samples/deeplearning/tvm_cnnlayer/libxsmm_wrapper/batch_reduce_plus_init.cc000066400000000000000000000071521415223013700321420ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Anand Venkat (Intel Corp.) 
******************************************************************************/ #include #include extern "C" int batch_reduce_kernel_update(const float *weight, const float *input, float *output, int blocks, int ofmblock, int ifmblock, int ofw, int stride_w, int r, int s, int ifh, int ifw){ int ld_b = stride_w*ifmblock; libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr(ofmblock,ofw, ifmblock,NULL,&ld_b,NULL,NULL,NULL, NULL, NULL); const unsigned long long cblocks = blocks; const float * A[cblocks]; const float * B[cblocks]; int weight_stride = ofmblock*ifmblock*r*s; int input_stride = ifw*ifh*ifmblock; if(r == 1 && s == 1){ for (int icb = 0; icb < cblocks; icb ++) { A[icb] = &weight[icb*weight_stride]; B[icb] = &input[icb*input_stride]; } }else{/*Eg.if( r == 3 && s == 3){*/ for( int k = 0 ; k < blocks/(r*s); k++){ for(int i=0; i < r; i++){ for(int j =0; j < s; j++){ A[k*r*s + i*s + j] = &weight[k*r*s*ofmblock*ifmblock + (i*s + j)*ofmblock*ifmblock]; B[k*r*s + i*s + j] = &input[k*ifw*ifh*ifmblock + i*ifw*ifmblock + j*ifmblock]; } } } } /* Reduce batch gemm call */ batchreduce_kernela(A, B, output, &cblocks); return 0; } extern "C" int batch_reduce_kernel_init_update(const float *weight, const float *input, float *output, int blocks, int ofmblock, int ifmblock,int r, int s, int ifh, int ifw,int ofw, int stride_w ){ float beta = 0.0; int lda = ofmblock; int ldx = ofmblock; int ld_b = stride_w*ifmblock; int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ); libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr(ofmblock,ofw, ifmblock,&lda,&ld_b,&ldx,NULL,&beta, &l_flags, NULL); const unsigned long long cblocks = blocks; const float * A[cblocks]; const float * B[cblocks]; int weight_stride = ofmblock*ifmblock*r*s; int input_stride = ifw*ifh*ifmblock; if(r == 1 && s == 1){ for (int icb = 0; icb < cblocks; icb ++) { A[icb] = &weight[icb*weight_stride]; B[icb] = &input[icb*input_stride]; } 
}else{ /*if( r == 3 && s == 3){*/ for( int k = 0 ; k < blocks/(r*s); k++) for(int i=0; i < r; i++) for(int j =0; j < s; j++){ A[k*r*s + i*s + j] = &weight[k*r*s*ofmblock*ifmblock + (i*s + j)*ofmblock*ifmblock]; B[k*r*s + i*s + j] = &input[k*ifw*ifh*ifmblock + i*ifw*ifmblock + j*ifmblock]; } } /* Reduce batch gemm call */ batchreduce_kernela(A, B, output, &cblocks); return 0; } extern "C" int batch_reduce_kernel_init(float *output, int ofmblock, int ofw){ int num_elements = ofw*ofmblock; LIBXSMM_PRAGMA_SIMD for(int i=0; i < num_elements; i++) output[i] = 0.0; return 0; } libxsmm-1.17/samples/deeplearning/tvm_cnnlayer/mb1_tuned_latest.py000077500000000000000000000474031415223013700255340ustar00rootroot00000000000000#!/usr/bin/env python3 ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Anand Venkat (Intel Corp.) 
############################################################################### import logging import sys import numpy as np import tvm import topi import time from topi.util import get_const_tuple import math import topi.testing import xlwt import argparse import os import ctypes from tvm import autotvm from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner parser = argparse.ArgumentParser() parser.add_argument("-d", nargs=1, type=str, default=["resnet3"]) args = parser.parse_args() layer = args.d[0] #Resnet-50 layers (excluding first layer) _resnet_layers ={ 'resnet2':[1,256,64,56,56,1,1,0], 'resnet3':[1,64,64,56,56,1,1,0], 'resnet4':[1,64,64,56,56,3,1,1], 'resnet5':[1,64,256,56,56,1,1,0], 'resnet6':[1,512,256,56,56,1,2,0], 'resnet7':[1,128,256,56,56,1,2,0], 'resnet8':[1,128,128,28,28,3,1,1], 'resnet9':[1,512,128,28,28,1,1,0], 'resnet10':[1,128,512,28,28,1,1,0], 'resnet11':[1,1024,512,28,28,1,2,0], 'resnet12':[1,256,512,28,28,1,2,0], 'resnet13':[1,256,256,14,14,3,1,1], 'resnet14':[1,1024,256,14,14,1,1,0], 'resnet15':[1,256,1024,14,14,1,1,0], 'resnet16':[1,2048,1024,14,14,1,2,0], 'resnet17':[1,512,1024,14,14,1,2,0], 'resnet18':[1,512,512,7,7,3,1,1], 'resnet19':[1,2048,512,7,7,1,1,0], 'resnet20':[1,512,2048,7,7,1,1,0] } ''' Convert input from NCHW format to NCHW16C format where the innermost data dimension is vectorized for AVX-512 ''' def convert_input(a_np, batch, in_channel,input_height,input_width,pad_height,pad_width,vlen,A): to_return = np.zeros((batch, math.ceil(in_channel/vlen),input_height + 2*pad_height, input_width+ 2*pad_width,vlen),dtype = A.dtype) for i in range(batch): for j in range(math.ceil(in_channel/vlen)): for k in range(input_height + 2*pad_height): for l in range(input_width + 2*pad_width): for m in range(vlen): if k < pad_height or k >= input_height + pad_height or l < pad_width or l >= input_width+ pad_width or j*vlen + m >= in_channel: to_return[i,j,k,l,m] = float(0) else: to_return[i,j,k,l,m] = a_np[i,j*vlen + 
m,k-pad_height,l-pad_width] return to_return ''' Convert output from NCHW format to NCHW16C format where the innermost data dimension is vectorized for AVX-512 ''' def convert_output(a_np, batch, out_channel,output_height,output_width,vlen): to_return = np.zeros((batch, out_channel,output_height, output_width), dtype = float) for i in range(batch): for j in range(math.ceil(out_channel/vlen)): for k in range(output_height): for l in range(output_width): for m in range(vlen): to_return[i,j*vlen + m,k,l] = a_np[i,j,k,l,m] return to_return ''' Convert weights from KCRS format to KCRS16C16K format where the innermost data dimension is vectorized for AVX-512 ''' def convert_weight(w_np, in_channel, out_channel, kernel_height, kernel_width, vlen,W): to_return = np.zeros((math.ceil(out_channel/vlen), math.ceil(in_channel/vlen),kernel_height, kernel_width,vlen,vlen), dtype = W.dtype) for i in range(math.ceil(out_channel/vlen)): for j in range(math.ceil(in_channel/vlen)): for k in range(kernel_height): for l in range(kernel_width): for m in range(vlen): for n in range(vlen): if i*vlen + n >= out_channel or j*vlen + m >= in_channel: to_return[i,j,k,l,m,n] =float(0) else: to_return[i,j,k,l,m,n] = w_np[i*vlen + n,j*vlen+ m,k,l] return to_return # Get the reference output tensor for correctness check def get_ref_data(batch,out_channel,in_channel,input_height,input_width,kernel_height,kernel_width,stride_height,padding): a_np = np.random.uniform(size=(batch,in_channel,input_height,input_width)).astype(float) w_np = np.random.uniform(size=(out_channel,in_channel,kernel_height,kernel_width)).astype(float) if batch == 1: b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride_height, padding) #b_np = topi.nn.conv2d_NCHWc(a_np, w_np,out_channel,kernel_height,stride_height, # padding, layout="NCHWc", out_layout="NCHWc", out_dtype='float32') if batch == 1: return a_np, w_np, b_np else: return a_np, w_np #special case for small height and width (e.g.. 
h = w = 7), where (h*w) becomes dimension of the brgemm (M) def intrin_libxsmm_hxw(ofmblock,ofw,ifmblock, stride_width,ifw,rco, ifh,r,s, ifh_stride, ifw_stride,\ ofh, stride_height, out_channel,output_height, output_width, in_channel): last_input_width_index = (ofw-1)*stride_width + s-1 last_input_height_index = (ofh-1)*stride_height + r-1 ry = tvm.reduce_axis((0, r), name='ry') rx = tvm.reduce_axis((0, s), name='rx') A = tvm.placeholder((rco,r,s,ifmblock, ofmblock), name='w') B = tvm.placeholder((rco,last_input_height_index + 1,last_input_width_index + 1,ifmblock), name='b') k = tvm.reduce_axis((0, ifmblock), name='k') k_outer = tvm.reduce_axis((0, rco), name='k_outer') C = tvm.compute( (ofh,ofw,ofmblock), lambda z,m,n: tvm.sum(A[k_outer,ry,rx,k,n] * B[k_outer,ry + z*stride_height,rx + m*stride_width,k], axis=[k_outer,ry,rx,k]), name='out') s1 = tvm.create_schedule(C.op) ifw1,ofw1,ofmblock1 = s1[C].op.axis rco_outer,ry,rx,rci = s1[C].op.reduce_axis s1[C].reorder(ifw1,rco_outer,ry,rx,ofw1,ofmblock1,rci) xx_ptr = tvm.decl_buffer(A.shape, A.dtype, name="W",offset_factor = 1, data_alignment=64) yy_ptr = tvm.decl_buffer(B.shape, B.dtype, name="X",offset_factor=1,\ strides=[tvm.var("s3"),tvm.var("s2"), ifmblock, 1],#offset_factor=16 data_alignment=64) zz_ptr = tvm.decl_buffer(C.shape, C.dtype, name="OUT",offset_factor=1,#offset_factor=1, strides=[output_width*ofmblock, ofmblock, 1], data_alignment=64) def intrin_func(ins, outs): # tvm call extern is used to interface to libxsmm bacth reduce kernel gemm implementation # rco*r*s is the number of batches init_and_compute = tvm.call_extern ("int32","batch_reduce_kernel_init_update", ins[0].access_ptr("r"),ins[1].access_ptr("r"),outs[0].access_ptr("w"),\ rco*r*s,ofmblock,ifmblock,r,s,ifh_stride,ifw_stride, ofw*ofh, stride_width) reset = tvm.call_extern ("int32","batch_reduce_kernel_init", outs[0].access_ptr("w"),ofmblock, ofw*ofh) body = tvm.call_extern ("int32","batch_reduce_kernel_update", 
ins[0].access_ptr("r"),ins[1].access_ptr("r"),outs[0].access_ptr("w"), rco*r*s,ofmblock,\ ifmblock,ofw*ofh, stride_width,r,s, ifh_stride,ifw_stride) if math.ceil(in_channel/ifmblock) == rco: return init_and_compute, None, init_and_compute else: return init_and_compute,reset,body with tvm.build_config(data_alignment=64): return tvm.decl_tensor_intrin(C.op, intrin_func, name="GEMM", binds= {A: xx_ptr, B: yy_ptr, C: zz_ptr}) # regular case of batch reduce gemm with ofw corresponding to batch reduce brgemm dimension(M) def intrin_libxsmm_tuned(ofmblock,ofw,ifmblock, stride_width,ifw,rco, ifh,r,s, ifh_stride, ifw_stride, in_channel): last_input_width_index = (ofw-1)*stride_width + s-1 A = tvm.placeholder((rco,r,s,ifmblock, ofmblock), name='w') B = tvm.placeholder((rco,r,last_input_width_index + 1,ifmblock), name='b') k = tvm.reduce_axis((0, ifmblock), name='k') k_outer = tvm.reduce_axis((0, rco), name='k_outer') ry = tvm.reduce_axis((0, r), name='ry') rx = tvm.reduce_axis((0, s), name='rx') C = tvm.compute( (ofw,ofmblock), lambda m,n: tvm.sum(A[k_outer,ry,rx,k,n] * B[k_outer,ry, rx + m*stride_width,k], axis=[k_outer,ry,rx,k]), name='out') s1 = tvm.create_schedule(C.op) w,ofm = s1[C].op.axis kco,ky,kx,kci = s1[C].op.reduce_axis s1[C].reorder(kco,ky,kx,w,ofm,kci) xx_ptr = tvm.decl_buffer(A.shape, A.dtype, name="W",offset_factor=1, data_alignment=64) yy_ptr = tvm.decl_buffer(B.shape, B.dtype, name="some", offset_factor=1,strides=[tvm.var("s3"), tvm.var("s2"), ifmblock, 1], data_alignment=64) zz_ptr = tvm.decl_buffer(C.shape, C.dtype, name="OUT",offset_factor=1, data_alignment=64) def intrin_func(ins, outs): # tvm call extern is used to interface to libxsmm batch reduce kernel gemm implementation # rco*r*s is the number of batches init_and_compute = tvm.call_extern ("int32","batch_reduce_kernel_init_update", ins[0].access_ptr("r"),ins[1].access_ptr("r"),outs[0].access_ptr("w"),\ rco*r*s,ofmblock,ifmblock,r,s,ifh_stride,ifw_stride, ofw, stride_width) reset = tvm.call_extern 
("int32","batch_reduce_kernel_init", outs[0].access_ptr("w"),ofmblock, ofw) body = tvm.call_extern ("int32","batch_reduce_kernel_update", ins[0].access_ptr("r"),ins[1].access_ptr("r"),outs[0].access_ptr("w"), rco*r*s,ofmblock,\ ifmblock,ofw, stride_width,r,s, ifh_stride,ifw_stride) if math.ceil(in_channel/ifmblock) == rco: return init_and_compute, None, init_and_compute else: return init_and_compute,reset,body with tvm.build_config(data_alignment=64): return tvm.decl_tensor_intrin(C.op, intrin_func, name="GEMM", binds={A: xx_ptr, B: yy_ptr, C: zz_ptr}) #AutoTVM template for libxmm brgemm based tensorize implementation @autotvm.template def conv_auto_tuned(ofmblock,ofw, ifmblock, stride_width,input_width,\ in_channel,input_height, filter_height, filter_width,ofh, stride_height, batch, out_channel): A1 = tvm.placeholder((batch,math.ceil(in_channel/ifmblock),input_height, input_width, ifmblock), name='input') W1 = tvm.placeholder((math.ceil(out_channel/ofmblock), math.ceil(in_channel/ifmblock), filter_height, filter_width, ifmblock,ofmblock), name='weight') rco1 = tvm.reduce_axis((0, math.ceil(in_channel/ifmblock)), name='rco1') ry1 = tvm.reduce_axis((0, filter_height), name='ry1') rx1 = tvm.reduce_axis((0, filter_width), name='rx1') rci1 = tvm.reduce_axis((0, ifmblock), name='rci1') cfg = autotvm.get_config() cfg.define_knob("pack", [0,1]) pack = False w_tile = [] factor_found = False for i in range(6, min(ofw+1,29)): if ofw % i == 0: w_tile.append((i, ofw//i) ) factor_found = True if factor_found == False: w_tile.append((ofw,1)) #tile factors for output width cfg.define_knob("tile_w", w_tile) # pack data when stride > 1 and pack flag set so that data for brgemm is continuous if filter_height == 1 and filter_width == 1 and stride_width > 1 and stride_height > 1 and cfg['pack'].val == 1 : A2 = tvm.compute((batch, math.ceil(in_channel/ifmblock),ofh,ofw,ifmblock), lambda n,c,h,w,vlen1: A1[n, c,h*stride_height,w*stride_width,vlen1]) B1 = tvm.compute((batch, 
math.ceil(out_channel/ofmblock),ofh, ofw,ofmblock), lambda nn,ff,yy, xx, vlen1: tvm.sum( W1[ff,rco1,ry1,rx1,rci1,vlen1] * A2[nn, rco1, ry1 + yy, rx1 + xx,rci1], axis=[rco1,ry1, rx1, rci1]),name='output') pack = True else: # Compute the convolution B1 = tvm.compute((batch, math.ceil(out_channel/ofmblock),ofh, ofw,ofmblock), lambda nn,ff,yy, xx, vlen1: tvm.sum( W1[ff,rco1,ry1,rx1,rci1,vlen1] * A1[nn, rco1, ry1 + stride_height*yy, rx1 + stride_width*xx,rci1], axis=[rco1,ry1, rx1, rci1]), name='output') s = tvm.create_schedule(B1.op) n,ko,h,w,ki = s[B1].op.axis rco,ry,rx, rci = s[B1].op.reduce_axis cfg.define_split("tile_h", h, num_outputs=3)#output height cfg.define_split("tile_c", rco, num_outputs=2) #input channel dimension cfg.define_split("tile_k",ko, num_outputs=2) #output channel dimension w_factor_inner, _ = cfg["tile_w"].val wo, wi = s[B1].split(w, w_factor_inner) #tiling rco_o,rco_i = cfg["tile_c"].apply(s, B1, rco) ko_o, ko_i = cfg["tile_k"].apply(s, B1, ko) ho,hm, hi = cfg["tile_h"].apply(s, B1, h) s[B1].reorder(n,ko_o,ho,ko_i,rco_o,hm,wo,hi,rco_i,ry,rx,wi,ki,rci) cfg.define_reorder("reorder_outer", [ko_i,rco_o,hm,wo], policy="all") cfg.add_flop(np.prod(get_const_tuple(B1.shape))*in_channel*filter_height*filter_width*2) cfg["reorder_outer"].apply(s, B1,[ko_i,rco_o,hm,wo]) if (filter_height == 1 and filter_width == 1 and stride_width == 1 and stride_height == 1) or pack: if cfg["tile_h"].size[1] > 1 and w_factor_inner == ofw:#cfg["tile_w"].size[2] == ofw: libxsmm_tensorize = intrin_libxsmm_hxw(ofmblock,w_factor_inner,ifmblock, 1, w_factor_inner, cfg["tile_c"].size[1],cfg["tile_h"].size[2],\ filter_height, filter_width,ofh,ofw,cfg["tile_h"].size[2],1, out_channel, ofh,ofw, in_channel) s[B1].tensorize(hi, libxsmm_tensorize) else: libxsmm_tensorize = intrin_libxsmm_tuned(ofmblock,w_factor_inner,ifmblock, 1, w_factor_inner, cfg["tile_c"].size[1], cfg["tile_h"].size[2],\ filter_height, filter_width,ofh, ofw, in_channel) s[B1].tensorize(rco_i, libxsmm_tensorize) 
else: libxsmm_tensorize = intrin_libxsmm_tuned(ofmblock,w_factor_inner,ifmblock, stride_width, w_factor_inner,\ cfg["tile_c"].size[1], cfg["tile_h"].size[2],\ filter_height, filter_width,input_height,input_width, in_channel) s[B1].tensorize(rco_i, libxsmm_tensorize) par = s[B1].fuse(n,ko_o,ho) s[B1].parallel(par) if pack: n1,c1,h1,w1,v1 = s[A2].op.axis par2 = s[A2].fuse(n1,c1,h1) s[A2].parallel(par) s[A2].vectorize(v1) s = s.normalize() return s, [W1, A1, B1] def driver(): book = xlwt.Workbook(encoding="utf-8") sheet1 = book.add_sheet("Sheet 1") row1=0 sheet1.write(0,0,"Layer") sheet1.write(0,1,"AutoTVM_FLOPS") row1 = row1 + 1 batch = _resnet_layers[layer][0] in_channel = _resnet_layers[layer][2] out_channel = _resnet_layers[layer][1] input_height = _resnet_layers[layer][3] input_width = _resnet_layers[layer][4] kernel_height = _resnet_layers[layer][5] kernel_width = _resnet_layers[layer][5] pad_height = _resnet_layers[layer][7] pad_width = _resnet_layers[layer][7] stride_height = _resnet_layers[layer][6] stride_width = _resnet_layers[layer][6] vlen = 64 assert(pad_height == pad_width) assert(stride_height == stride_width) assert(kernel_height == kernel_width) output_width = ((input_width + 2 * pad_width - kernel_width) // stride_width) + 1 output_height = ((input_height + 2 * pad_height - kernel_height) // stride_height) + 1 assert(output_height == output_width) assert(input_height == input_width) ctx = tvm.context('llvm', 0) sheet1.write(row1,0,layer) if not ctx.exist: print("Skip because %s is not enabled" % device) return task = autotvm.task.create(conv_auto_tuned, args=(vlen,output_width, vlen, stride_width,input_width + 2*pad_width, in_channel,\ input_height + 2*pad_height, kernel_height, kernel_width,output_height, stride_height, batch, out_channel),\ target='llvm -mtriple=x86_64 -mcpu=skylake-avx512 -mattr=+skx,+fma,+fma4,+avx512ifma,+avx512f,+avx512cd,+avx512bw,+avx512vl,+avx512dq') logging.getLogger('autotvm').setLevel(logging.DEBUG) 
logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout)) measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=autotvm.LocalRunner(number=1000, repeat=1,min_repeat_ms=1000)) tuner = autotvm.tuner.RandomTuner(task) #Please limit n_trial to reduce tuning time n_trial= len(task.config_space) log_file = layer + ".log" #comment out the following call to tuner to just run the best case from log file history tuner.tune(n_trial=n_trial, measure_option=measure_option, callbacks=[ autotvm.callback.progress_bar(n_trial, prefix=layer), autotvm.callback.log_to_file(log_file)]) with autotvm.apply_history_best( layer+'.log'): with tvm.target.create("llvm"): a_np, w_np, b_np = get_ref_data(batch,out_channel,in_channel,input_height,input_width,kernel_height, kernel_width,stride_height,pad_height) s, arg_bufs = conv_auto_tuned(vlen,output_width, vlen, stride_width,input_width + 2*pad_width, in_channel,\ input_height + 2*pad_height, kernel_height, kernel_width,output_height, stride_height, batch, out_channel) a_np2 = convert_input(a_np, batch, in_channel,input_height,input_width,pad_height,pad_width,vlen, arg_bufs[1]) w_np2 = convert_weight(w_np, in_channel, out_channel, kernel_height, kernel_width,vlen,arg_bufs[0]) ctx = tvm.context('llvm', 0) b = tvm.nd.array(np.zeros((batch, math.ceil(out_channel/vlen),output_height, output_width,vlen), dtype=arg_bufs[2].dtype), ctx) a = tvm.nd.array(a_np2, ctx) w = tvm.nd.array(w_np2, ctx) func = tvm.build(s, arg_bufs,target=\ 'llvm -mtriple=x86_64 -mcpu=skylake-avx512 -mattr=+skx,+fma,+fma4,+avx512ifma,+avx512f,+avx512cd,+avx512bw,+avx512vl,+avx512dq', name="conv2d") func(w,a,b) b_np_A = convert_output(b.asnumpy(), 1,out_channel, output_height, output_width,vlen) np.testing.assert_allclose(b_np_A, b_np, rtol=1e-5) evaluator1 = func.time_evaluator(func.entry_name, ctx, number=1000,repeat=1, min_repeat_ms=1) t1 = evaluator1(w,a, b).mean gflops_tvm1 = 
np.prod(get_const_tuple(arg_bufs[2].shape))*in_channel*kernel_height*kernel_width*2 gflops_tvm1 = gflops_tvm1/1e9/t1 print("Time for conv(tuned) is : {0:.6f}".format(t1)) print("GFLOPS : {0:.3f} ".format( gflops_tvm1)) sheet1.write(row1,1,gflops_tvm1) row1 = row1 + 1 book.save( "AutoTVM_tensorize_resnet" + layer +".xls") if __name__ == "__main__": driver() libxsmm-1.17/samples/deeplearning/tvm_cnnlayer/resnet3.slurm000077500000000000000000000007421415223013700243720ustar00rootroot00000000000000#!/usr/bin/env bash #SBATCH --job-name resnet3 #SBATCH --time 1-00:00:00 #SBATCH -N 1 #SBATCH -c 112 #SBATCH --output resnet3.out #SBATCH --partition clx #SBATCH --mail-type=END,FAIL # notifications for job done & fail #SBATCH --mail-user=anand.venkat@intel.com # send-to address export KMP_AFFINITY=granularity=fine,compact,1,28 export OMP_NUM_THREADS=28 export TVM_NUM_THREADS=28 LD_PRELOAD=./libxsmm_wrapper/libxsmm_wrapper.so srun python -u mb1_tuned_latest.py -d resnet3 libxsmm-1.17/samples/edge/000077500000000000000000000000001415223013700154635ustar00rootroot00000000000000libxsmm-1.17/samples/edge/Makefile000066400000000000000000000130571415223013700171310ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . 
CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) UNUSED = 1 BLAS = 1 OMP = 1 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/edge_vol_int \ $(OUTDIR)/asparse_srsoa_f64 $(OUTDIR)/bsparse_srsoa_f64 \ $(OUTDIR)/asparse_srsoa_f32 $(OUTDIR)/bsparse_srsoa_f32 \ $(OUTDIR)/bsparse_scsoa_f64 $(OUTDIR)/bsparse_scsoa_f32 \ $(OUTDIR)/dense_packedacrm_f64 
$(OUTDIR)/dense_packedacrm_f32 \ $(OUTDIR)/dense_packedbcrm_f64 $(OUTDIR)/dense_packedbcrm_f32 .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) $(OUTDIR)/edge_vol_int: $(OUTDIR)/.make $(BLDDIR)/edge_vol_int-c.o $(BLDDIR)/edge_proxy_common-c.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/edge_vol_int-c.o $(BLDDIR)/edge_proxy_common-c.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/asparse_srsoa_f64: $(OUTDIR)/.make $(BLDDIR)/asparse_srsoa-c.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/asparse_srsoa-c.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/bsparse_srsoa_f64: $(OUTDIR)/.make $(BLDDIR)/bsparse_srsoa-c.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/bsparse_srsoa-c.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/asparse_srsoa_f32: $(OUTDIR)/.make $(BLDDIR)/asparse_srsoa-c-f32.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/asparse_srsoa-c-f32.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/bsparse_srsoa_f32: $(OUTDIR)/.make $(BLDDIR)/bsparse_srsoa-c-f32.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/bsparse_srsoa-c-f32.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/bsparse_scsoa_f64: $(OUTDIR)/.make $(BLDDIR)/bsparse_scsoa-c.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/bsparse_scsoa-c.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/bsparse_scsoa_f32: $(OUTDIR)/.make $(BLDDIR)/bsparse_scsoa-c-f32.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/bsparse_scsoa-c-f32.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/dense_packedacrm_f64: $(OUTDIR)/.make $(BLDDIR)/dense_packedacrm-c.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/dense_packedacrm-c.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/dense_packedacrm_f32: $(OUTDIR)/.make $(BLDDIR)/dense_packedacrm-c-f32.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/dense_packedacrm-c-f32.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/dense_packedbcrm_f64: $(OUTDIR)/.make $(BLDDIR)/dense_packedbcrm-c.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/dense_packedbcrm-c.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) 
$(OUTDIR)/dense_packedbcrm_f32: $(OUTDIR)/.make $(BLDDIR)/dense_packedbcrm-c-f32.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/dense_packedbcrm-c-f32.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-cpp-f32.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) -D__EDGE_EXECUTE_F32__ $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c-f32.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) -D__EDGE_EXECUTE_F32__ $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/edge/analyze_test_matops_out.sh000077500000000000000000000025661415223013700230070ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### if [ $# -eq 1 ] then OUTFILE=$1 else echo "you have to provide an outfile (stdout of test_matops.sh) and csc/csr!" exit -1 fi TOTALFLOPS=0 WEIGHTAVGGFLOPS=0 for i in `cat ${OUTFILE} | grep PERFDUMP | awk -F"," '{print $3 "," $7 "," $8 "," $10 "," $11}'` do FLOPS=`echo $i | awk -F"," '{print $2}'` TOTALFLOPS=`echo $TOTALFLOPS+$FLOPS | bc` done for i in `cat ${OUTFILE} | grep PERFDUMP | awk -F"," '{print $3 "," $7 "," $8 "," $10 "," $11}'` do FLOPS=`echo $i | awk -F"," '{print $2}'` GFLOPS=`echo $i | awk -F"," '{print $5}'` WEIGHT=`echo $FLOPS/$TOTALFLOPS | bc -l` WEIGHTGFLOPS=`echo $GFLOPS*$WEIGHT | bc -l` WEIGHTAVGGFLOPS=`echo $WEIGHTAVGGFLOPS+$WEIGHTGFLOPS | bc -l` done echo $OUTFILE","$WEIGHTAVGGFLOPS libxsmm-1.17/samples/edge/asparse_srsoa.c000066400000000000000000000154211415223013700204770ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include "common_edge_proxy.h" int main(int argc, char* argv[]) { int M = ( argc == 7 ) ? atoi(argv[1]) : 9; int N = ( argc == 7 ) ? atoi(argv[2]) : 10; int K = ( argc == 7 ) ? atoi(argv[3]) : 9; unsigned int N_CRUNS = ( argc == 7 ) ? atoi(argv[4]) : 8; unsigned int REPS = ( argc == 7 ) ? atoi(argv[5]) : 1; char* l_csr_file = ( argc == 7 ) ? 
argv[6] : "file.csr"; const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); const REALTYPE alpha = 1, beta = 1; REALTYPE* l_a_de = (REALTYPE*)libxsmm_aligned_malloc(sizeof(REALTYPE) * K * K, 64); REALTYPE* l_a_sp = NULL; REALTYPE* l_b = (REALTYPE*)libxsmm_aligned_malloc(sizeof(REALTYPE) * K * N * N_CRUNS, 64); unsigned int* l_rowptr = NULL; unsigned int* l_colidx = NULL; unsigned int l_rowcount, l_colcount, l_elements; REALTYPE* l_c = (REALTYPE*)libxsmm_aligned_malloc(sizeof(REALTYPE) * K * N * N_CRUNS, 64); REALTYPE* l_c_gold = (REALTYPE*)libxsmm_aligned_malloc(sizeof(REALTYPE) * K * N * N_CRUNS, 64); REALTYPE* l_c_asm = (REALTYPE*)libxsmm_aligned_malloc(sizeof(REALTYPE) * K * N * N_CRUNS, 64); REALTYPE l_max_error = 0.0; unsigned int l_k, l_n; int l_i, l_j, l_jj; LIBXSMM_VLA_DECL(3, REALTYPE, l_p_b, l_b, N, N_CRUNS); LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_asm, l_c_asm, N, N_CRUNS); LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_gold, l_c_gold, N, N_CRUNS); libxsmm_descriptor_blob l_xgemm_blob; const libxsmm_gemm_descriptor* l_xgemm_desc = 0; LIBXSMM_MMFUNCTION_TYPE(REALTYPE) mykernel = NULL; unsigned long long l_start, l_end; double l_total; unsigned long long l_libxsmmflops; libxsmm_kernel_info l_kinfo; if (argc != 7) { fprintf( stderr, "arguments: M #iters CSR-file!\n" ); return -1; } /* touch B */ for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_b, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)libxsmm_rng_f64(); } } } /* touch C */ for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0; LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0; } } } /* read A, CSR */ libxsmm_sparse_csr_reader( l_csr_file, &l_rowptr, &l_colidx, &l_a_sp, &l_rowcount, &l_colcount, &l_elements ); /* copy b to 
dense */ printf("CSR matrix data structure we just read:\n"); printf("rows: %u, columns: %u, elements: %u\n", l_rowcount, l_colcount, l_elements); for ( l_n = 0; l_n < (((unsigned int)K) * K); l_n++) { l_a_de[l_n] = 0.0; } for ( l_n = 0; l_n < (unsigned int)K; l_n++) { const unsigned int l_rowelems = l_rowptr[l_n+1] - l_rowptr[l_n]; assert(l_rowptr[l_n+1] >= l_rowptr[l_n]); for ( l_k = 0; l_k < l_rowelems; l_k++) { l_a_de[(l_n * K) + l_colidx[l_rowptr[l_n] + l_k]] = l_a_sp[l_rowptr[l_n] + l_k]; } } /* dense routine */ l_start = libxsmm_timer_tick(); #if 1 for ( l_n = 0; l_n < REPS; l_n++) { for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_jj = 0; l_jj < K; l_jj++) { LIBXSMM_PRAGMA_SIMD for (l_k = 0; l_k < N_CRUNS; l_k++) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) += l_a_de[(l_i*K)+l_jj] * LIBXSMM_VLA_ACCESS(3, l_p_b, l_jj, l_j, l_k, N, N_CRUNS); } } } } } #endif l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for dense\n", l_total); printf("%f GFLOPS for dense\n", ((double)((double)REPS * (double)K * (double)K * (double)N * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9)); l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(REALTYPE), K, N, K, 0, N, N, alpha, beta, flags, prefetch); /* sparse routine */ #if defined(__EDGE_EXECUTE_F32__) mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp, N_CRUNS ).smm; #else mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp, N_CRUNS ).dmm; #endif l_start = libxsmm_timer_tick(); for ( l_n = 0; l_n < REPS; l_n++) { mykernel( l_a_sp, l_b, l_c_asm ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); libxsmm_get_kernel_info( LIBXSMM_CONST_VOID_PTR(mykernel), &l_kinfo); l_libxsmmflops = l_kinfo.nflops; printf("%fs for sparse (asm)\n", l_total); printf("%f GFLOPS for sparse (asm), calculated\n", 
((double)((double)REPS * (double)N * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9)); printf("%f GFLOPS for sparse (asm), libxsmm \n", ((double)((double)REPS * (double)l_libxsmmflops)) / (l_total * 1.0e9)); /* check for errors */ l_max_error = (REALTYPE)0.0; for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { if (fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) - LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ) > l_max_error ) { l_max_error = (REALTYPE)fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) -LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ); } } } } printf("max error: %f\n", l_max_error); printf("PERFDUMP,%s,%u,%i,%i,%i,%u,%u,%f,%f,%f\n", l_csr_file, REPS, M, N, K, l_elements, K * l_elements * N_CRUNS * 2, l_max_error, l_total, ((double)((double)REPS * (double)K * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9) ); /* free */ libxsmm_free( l_a_de ); libxsmm_free( l_b ); libxsmm_free( l_c ); libxsmm_free( l_c_gold ); libxsmm_free( l_c_asm ); free( l_a_sp ); free( l_rowptr ); free( l_colidx ); return 0; } libxsmm-1.17/samples/edge/asparse_srsoa.vcxproj000066400000000000000000000541561415223013700217600ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 asparse_srsoa {2377A071-A5AF-4629-91C4-0E5EBF795BD5} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/edge/bsparse_scsoa.c000066400000000000000000000332211415223013700204570ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include #include "common_edge_proxy.h" LIBXSMM_INLINE void qfma_fill_in( REALTYPE* rm_dense_data, unsigned int m, unsigned int n, unsigned int **colptr, unsigned int **rowidx, REALTYPE **values) { REALTYPE* cm_dense = NULL; REALTYPE* cm_dense_data = NULL; unsigned int i = 0; unsigned int j = 0; unsigned int l_max_reg_block = 28; unsigned int l_max_cols = 0; unsigned int l_n_chunks = 0; unsigned int l_n_chunksize = 0; unsigned int l_n_limit = 0; unsigned int l_n_processed = 0; unsigned int l_nnz = 0; unsigned int* l_colptr = NULL; unsigned int* l_rowidx = NULL; REALTYPE* l_values = NULL; unsigned int l_count = 0; unsigned int l_found_qmadd = 0; cm_dense = (REALTYPE*)malloc( m*n*sizeof(REALTYPE) ); cm_dense_data = (REALTYPE*)malloc( m*n*sizeof(REALTYPE) ); /* set all values in copy to 1 or 0 */ for ( j = 0; j < n; ++j ) { for ( i = 0; i < m; ++i ) { cm_dense[(j*m)+i] = (REALTYPE)(LIBXSMM_FEQ(rm_dense_data[(i*n)+j], 0) ? 0 : 1); cm_dense_data[(j*m)+i] = rm_dense_data[(i*n)+j]; } } #if 1 /* finding max. active columns */ l_max_cols = 0; for ( j = 0; j < n; ++j ) { for ( i = 0; i < m; ++i ) { if (cm_dense[(j*m) + i] > 0.0) { l_max_cols = j+1; } } } /* calculate n blocking as in the generator */ l_n_chunks = ( (l_max_cols % l_max_reg_block) == 0 ) ? (l_max_cols / l_max_reg_block) : (l_max_cols / l_max_reg_block) + 1; assert(0 != l_n_chunks); /* mute static analysis (division-by-zero); such invalid input must be caught upfront */ l_n_chunksize = ( (l_max_cols % l_n_chunks) == 0 ) ? 
(l_max_cols / l_n_chunks) : (l_max_cols / l_n_chunks) + 1; /* qmadd padding */ l_n_processed = 0; l_n_limit = l_n_chunksize; while ( l_n_processed < l_max_cols ) { /* first pass look for qmadds and potential qmadds in the same rows */ for ( i = 0; i < m; ++i ) { if ( i >= m-3 ) continue; l_found_qmadd = 0; for ( j = l_n_processed; j < l_n_limit - l_n_processed; ++j ) { if ( LIBXSMM_FEQ(cm_dense[(j*m)+(i+0)], 1) && LIBXSMM_FEQ(cm_dense[(j*m)+(i+1)], 1) && LIBXSMM_FEQ(cm_dense[(j*m)+(i+2)], 1) && LIBXSMM_FEQ(cm_dense[(j*m)+(i+3)], 1) ) { cm_dense[(j*m)+(i+0)] = (REALTYPE)10.0; cm_dense[(j*m)+(i+1)] = (REALTYPE)10.0; cm_dense[(j*m)+(i+2)] = (REALTYPE)10.0; cm_dense[(j*m)+(i+3)] = (REALTYPE)10.0; l_found_qmadd = 1; } } /* if we found qmadd in at least one column, let's check the other columns in the current block for 3 nnz */ /* -> let's pad them to 4 nnz */ if (l_found_qmadd == 1) { for ( j = l_n_processed; j < l_n_limit - l_n_processed; ++j ) { if ( LIBXSMM_FEQ( cm_dense[(j*m)+(i+0)] + cm_dense[(j*m)+(i+1)] + cm_dense[(j*m)+(i+2)] + cm_dense[(j*m)+(i+3)], 3) ) { cm_dense[(j*m)+(i+0)] = (REALTYPE)10.0; cm_dense[(j*m)+(i+1)] = (REALTYPE)10.0; cm_dense[(j*m)+(i+2)] = (REALTYPE)10.0; cm_dense[(j*m)+(i+3)] = (REALTYPE)10.0; } } i += 3; } } /* second pass look out for consecutive 4 rows which have 3 nnz in a specifc column */ for ( i = 0; i < m; ++i ) { if ( i >= m-3 ) continue; l_found_qmadd = 0; /* first check if already a qmadd in that row */ for ( j = l_n_processed; j < l_n_limit - l_n_processed; ++j ) { if ( LIBXSMM_FEQ(cm_dense[(j*m)+(i+0)], 10) ) { l_found_qmadd = 1; } } /* we are in a potential candidate row for padding 0 for qmadd */ if ( l_found_qmadd == 0 ) { for ( j = l_n_processed; j < l_n_limit - l_n_processed; ++j ) { if ( LIBXSMM_FEQ( cm_dense[(j*m)+(i+0)] + cm_dense[(j*m)+(i+1)] + cm_dense[(j*m)+(i+2)] + cm_dense[(j*m)+(i+3)], 3) ) { cm_dense[(j*m)+(i+0)] = (REALTYPE)10.0; cm_dense[(j*m)+(i+1)] = (REALTYPE)10.0; cm_dense[(j*m)+(i+2)] = (REALTYPE)10.0; 
cm_dense[(j*m)+(i+3)] = (REALTYPE)10.0; l_found_qmadd = 1; } } } if ( l_found_qmadd > 0 ) { i += 3; } } /* adjust n progression */ l_n_processed += l_n_chunksize; l_n_limit = LIBXSMM_MIN(l_n_processed + l_n_chunksize, l_max_cols); } #endif /* creating a new CSC matrix */ /* determining new number of NNZ */ l_nnz = 0; for ( j = 0; j < n; ++j ) { for ( i = 0; i < m; ++i ) { if (cm_dense[(j*m) + i] > 0.0) { l_nnz++; } } } (*colptr) = (unsigned int*) malloc( (n+1)*sizeof(unsigned int) ); (*rowidx) = (unsigned int*) malloc( l_nnz*sizeof(unsigned int) ); (*values) = (REALTYPE* ) malloc( l_nnz*sizeof(REALTYPE ) ); l_colptr = (*colptr); l_rowidx = (*rowidx); l_values = (*values); /* generating CSC from dense padded structure */ l_count = 0; for ( j = 0; j < n; ++j ) { l_colptr[j] = l_count; for ( i = 0; i < m; ++i ) { if (cm_dense[(j*m) + i] > (REALTYPE)0.0) { l_rowidx[l_count] = i; l_values[l_count] = (REALTYPE)cm_dense_data[(j*m) + i]; l_count++; } } } l_colptr[n] = l_nnz; free ( cm_dense ); free ( cm_dense_data ); } int main(int argc, char* argv[]) { int M = ( argc == 7 ) ? atoi(argv[1]) : 9; int N = ( argc == 7 ) ? atoi(argv[2]) : 10; int K = ( argc == 7 ) ? atoi(argv[3]) : 20; unsigned int N_CRUNS = ( argc == 7 ) ? atoi(argv[4]) : 8; unsigned int REPS = ( argc == 7 ) ? atoi(argv[5]) : 1; char* l_csc_file = ( argc == 7 ) ? 
argv[6] : "file.csc"; const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); const REALTYPE alpha = 1, beta = 1; edge_mat_desc mat_desc = libxsmm_sparse_csc_reader_desc( l_csc_file ); unsigned int l_rowcount = mat_desc.row_count; unsigned int l_colcount = mat_desc.col_count; unsigned int l_elements = mat_desc.num_elements; REALTYPE* l_a = (REALTYPE*)libxsmm_aligned_malloc(K * M * N_CRUNS * sizeof(REALTYPE), 64); REALTYPE* l_b_de = (REALTYPE*)libxsmm_aligned_malloc(K * N * sizeof(REALTYPE), 64); REALTYPE* l_b_sp = NULL; unsigned int* l_colptr = NULL; unsigned int* l_rowidx = NULL; REALTYPE* l_b_sp_padded = NULL; unsigned int* l_colptr_padded = NULL; unsigned int* l_rowidx_padded = NULL; REALTYPE* l_c_gold = (REALTYPE*)libxsmm_aligned_malloc(M * N * N_CRUNS * sizeof(REALTYPE), 64); REALTYPE* l_c_asm = (REALTYPE*)libxsmm_aligned_malloc(M * N * N_CRUNS * sizeof(REALTYPE), 64); REALTYPE l_max_error = 0.0; unsigned int l_k, l_n; int l_i, l_j, l_jj; LIBXSMM_VLA_DECL(3, REALTYPE, l_p_a, l_a, K, N_CRUNS); LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_asm, l_c_asm, N, N_CRUNS); LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_gold, l_c_gold, N, N_CRUNS); libxsmm_descriptor_blob l_xgemm_blob; const libxsmm_gemm_descriptor* l_xgemm_desc = 0; LIBXSMM_MMFUNCTION_TYPE(REALTYPE) mykernel = NULL; unsigned long long l_start, l_end; double l_total; unsigned long long l_libxsmmflops; libxsmm_kernel_info l_kinfo; if (argc != 7) { fprintf( stderr, "arguments: M CRUNS #iters csc-file!\n" ); exit(-1); } if ((unsigned int)K != l_rowcount) { fprintf( stderr, "arguments K needs to match number of rows of the sparse matrix!\n" ); exit(-1); } if ((unsigned int)N != l_colcount) { fprintf( stderr, "arguments N needs to match number of columns of the sparse matrix!\n" ); exit(-1); } if (M != 9) { fprintf( stderr, "arguments M needs to match 9!\n" ); exit(-1); } /* touch A */ for ( l_i = 0; l_i < M; l_i++) { for ( l_j = 0; l_j < K; l_j++) { for ( l_k = 0; 
l_k < N_CRUNS; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_a, l_i, l_j, l_k, K, N_CRUNS) = (REALTYPE)libxsmm_rng_f64(); } } } /* touch C */ for ( l_i = 0; l_i < M; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0; LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0; } } } /* read B, csc */ libxsmm_sparse_csc_reader( l_csc_file, &l_colptr, &l_rowidx, &l_b_sp, &l_rowcount, &l_colcount, &l_elements ); /* copy b to dense */ printf("csc matrix data structure we just read:\n"); printf("rows: %u, columns: %u, elements: %u\n", l_rowcount, l_colcount, l_elements); for ( l_n = 0; l_n < (((unsigned int)K) * N); l_n++) { l_b_de[l_n] = 0.0; } for ( l_n = 0; l_n < (unsigned int)N; l_n++) { const unsigned int l_colelems = l_colptr[l_n+1] - l_colptr[l_n]; assert(l_colptr[l_n+1] >= l_colptr[l_n]); for ( l_k = 0; l_k < l_colelems; l_k++) { l_b_de[(l_rowidx[l_colptr[l_n] + l_k] * N) + l_n] = l_b_sp[l_colptr[l_n] + l_k]; } } /* pad B to a better qmadd matrix */ if ( libxsmm_get_target_archid() == LIBXSMM_X86_AVX512_KNM ) { qfma_fill_in( l_b_de, K, N, &l_colptr_padded, &l_rowidx_padded, &l_b_sp_padded ); printf("qfma padded CSC matrix data structure we just read:\n"); printf("rows: %u, columns: %u, elements: %u\n", l_rowcount, l_colcount, l_colptr_padded[N]); } /* dense routine */ l_start = libxsmm_timer_tick(); #if 1 for ( l_n = 0; l_n < REPS; l_n++) { for ( l_i = 0; l_i < M; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_jj = 0; l_jj < K; l_jj++) { LIBXSMM_PRAGMA_SIMD for (l_k = 0; l_k < N_CRUNS; l_k++) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) += LIBXSMM_VLA_ACCESS(3, l_p_a, l_i, l_jj, l_k, K, N_CRUNS) * l_b_de[(l_jj*N)+l_j]; } } } } } #endif l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for dense\n", l_total); printf("%f GFLOPS for dense\n", ((double)((double)REPS * (double)M * (double)N * 
(double)K * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9)); l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(REALTYPE), M, N, K, K, 0, N, alpha, beta, flags, prefetch); /* sparse routine */ #if defined(__EDGE_EXECUTE_F32__) if ( libxsmm_get_target_archid() == LIBXSMM_X86_AVX512_KNM ) { mykernel = libxsmm_create_xcsc_soa(l_xgemm_desc, l_colptr_padded, l_rowidx_padded, (const void*)l_b_sp, N_CRUNS).smm; } else { mykernel = libxsmm_create_xcsc_soa(l_xgemm_desc, l_colptr, l_rowidx, (const void*)l_b_sp, N_CRUNS).smm; } #else mykernel = libxsmm_create_xcsc_soa(l_xgemm_desc, l_colptr, l_rowidx, (const void*)l_b_sp, N_CRUNS).dmm; #endif if ( libxsmm_get_target_archid() == LIBXSMM_X86_AVX512_KNM ) { l_start = libxsmm_timer_tick(); for ( l_n = 0; l_n < REPS; l_n++) { #if defined(__EDGE_EXECUTE_F32__) mykernel( l_a, l_b_sp_padded, l_c_asm ); #else mykernel( l_a, l_b_sp, l_c_asm ); #endif } l_end = libxsmm_timer_tick(); } else { l_start = libxsmm_timer_tick(); for ( l_n = 0; l_n < REPS; l_n++) { mykernel( l_a, l_b_sp, l_c_asm ); } l_end = libxsmm_timer_tick(); } l_total = libxsmm_timer_duration(l_start, l_end); libxsmm_get_kernel_info( LIBXSMM_CONST_VOID_PTR(mykernel), &l_kinfo); l_libxsmmflops = l_kinfo.nflops; printf("%fs for sparse (asm)\n", l_total); printf("%f GFLOPS for sparse (asm), calculated\n", ((double)((double)REPS * (double)M * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9)); printf("%f GFLOPS for sparse (asm), libxsmm \n", ((double)((double)REPS * (double)l_libxsmmflops)) / (l_total * 1.0e9)); /* check for errors */ l_max_error = (REALTYPE)0.0; for ( l_i = 0; l_i < M; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { if (fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) - LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ) > l_max_error ) { l_max_error = (REALTYPE)fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) -LIBXSMM_VLA_ACCESS(3, 
l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ); } } } } printf("max error: %f\n", l_max_error); printf("PERFDUMP,%s,%u,%i,%i,%i,%u,%u,%f,%f,%f\n", l_csc_file, REPS, M, N, K, l_elements, M * l_elements * N_CRUNS * 2, l_max_error, l_total, ((double)((double)REPS * (double)M * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9) ); /* free */ libxsmm_free( l_b_de ); libxsmm_free( l_a ); libxsmm_free( l_c_gold ); libxsmm_free( l_c_asm ); free( l_b_sp ); free( l_colptr ); free( l_rowidx ); if ( l_b_sp_padded != NULL ) free( l_b_sp_padded ); if ( l_colptr_padded != NULL ) free( l_colptr_padded ); if ( l_rowidx_padded != NULL ) free( l_rowidx_padded ); return 0; } libxsmm-1.17/samples/edge/bsparse_scsoa.vcxproj000066400000000000000000000541561415223013700217420ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 bsparse_scsoa {6D35CCC6-EB8B-4D2B-8AFC-5954B2990438} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) 
$(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console 
libxsmm-1.17/samples/edge/bsparse_srsoa.c000066400000000000000000000163741415223013700205100ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #define N_QUANTITIES 9 #include #include "common_edge_proxy.h" int main(int argc, char* argv[]) { int M = ( argc == 7 ) ? atoi(argv[1]) : 9; int N = ( argc == 7 ) ? atoi(argv[2]) : 10; int K = ( argc == 7 ) ? atoi(argv[3]) : 20; unsigned int N_CRUNS = ( argc == 7 ) ? atoi(argv[4]) : 8; unsigned int REPS = ( argc == 7 ) ? atoi(argv[5]) : 1; char* l_csr_file = ( argc == 7 ) ? 
argv[6] : "file.csr"; const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); const REALTYPE alpha = 1, beta = 1; edge_mat_desc mat_desc = libxsmm_sparse_csr_reader_desc( l_csr_file ); unsigned int l_rowcount = mat_desc.row_count; unsigned int l_colcount = mat_desc.col_count; unsigned int l_elements = mat_desc.num_elements; REALTYPE* l_a = (REALTYPE*)libxsmm_aligned_malloc(K * M * N_CRUNS * sizeof(REALTYPE), 64); REALTYPE* l_b_de = (REALTYPE*)libxsmm_aligned_malloc(K * N * sizeof(REALTYPE), 64); REALTYPE* l_b_sp = NULL; unsigned int* l_rowptr = NULL; unsigned int* l_colidx = NULL; REALTYPE* l_c_gold = (REALTYPE*)libxsmm_aligned_malloc(M * N * N_CRUNS * sizeof(REALTYPE), 64); REALTYPE* l_c_asm = (REALTYPE*)libxsmm_aligned_malloc(M * N * N_CRUNS * sizeof(REALTYPE), 64); REALTYPE l_max_error = 0.0; unsigned int l_k, l_n; int l_i, l_j, l_jj; LIBXSMM_VLA_DECL(3, REALTYPE, l_p_a, l_a, K, N_CRUNS); LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_asm, l_c_asm, N, N_CRUNS); LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_gold, l_c_gold, N, N_CRUNS); libxsmm_descriptor_blob l_xgemm_blob; const libxsmm_gemm_descriptor* l_xgemm_desc = 0; LIBXSMM_MMFUNCTION_TYPE(REALTYPE) mykernel = NULL; unsigned long long l_start, l_end; double l_total; unsigned long long l_libxsmmflops; libxsmm_kernel_info l_kinfo; if (argc != 7) { fprintf( stderr, "arguments: M CRUNS #iters CSR-file!\n" ); exit(-1); } if ((unsigned int)K != l_rowcount) { fprintf( stderr, "arguments K needs to match number of rows of the sparse matrix!\n" ); exit(-1); } if ((unsigned int)N != l_colcount) { fprintf( stderr, "arguments N needs to match number of columns of the sparse matrix!\n" ); exit(-1); } if (M != 9) { fprintf( stderr, "arguments M needs to match 9!\n" ); exit(-1); } /* touch A */ for ( l_i = 0; l_i < M; l_i++) { for ( l_j = 0; l_j < K; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_a, l_i, l_j, l_k, K, N_CRUNS) = 
(REALTYPE)libxsmm_rng_f64(); } } } /* touch C */ for ( l_i = 0; l_i < M; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0; LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0; } } } /* read B, CSR */ libxsmm_sparse_csr_reader( l_csr_file, &l_rowptr, &l_colidx, &l_b_sp, &l_rowcount, &l_colcount, &l_elements ); /* copy b to dense */ printf("CSR matrix data structure we just read:\n"); printf("rows: %u, columns: %u, elements: %u\n", l_rowcount, l_colcount, l_elements); for ( l_n = 0; l_n < (((unsigned int)K) * N); l_n++) { l_b_de[l_n] = 0.0; } for ( l_n = 0; l_n < (unsigned int)K; l_n++) { const unsigned int l_rowelems = l_rowptr[l_n+1] - l_rowptr[l_n]; assert(l_rowptr[l_n+1] >= l_rowptr[l_n]); for ( l_k = 0; l_k < l_rowelems; l_k++) { l_b_de[(l_n * N) + l_colidx[l_rowptr[l_n] + l_k]] = l_b_sp[l_rowptr[l_n] + l_k]; } } /* dense routine */ l_start = libxsmm_timer_tick(); #if 1 for ( l_n = 0; l_n < REPS; l_n++) { for ( l_i = 0; l_i < N_QUANTITIES; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_jj = 0; l_jj < K; l_jj++) { LIBXSMM_PRAGMA_SIMD for (l_k = 0; l_k < N_CRUNS; l_k++) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) += LIBXSMM_VLA_ACCESS(3, l_p_a, l_i, l_jj, l_k, K, N_CRUNS) * l_b_de[(l_jj*N)+l_j]; } } } } } #endif l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for dense\n", l_total); printf("%f GFLOPS for dense\n", ((double)((double)REPS * (double)M * (double)N * (double)K * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9)); l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(REALTYPE), M, N, K, K, 0, N, alpha, beta, flags, prefetch); /* sparse routine */ #if defined(__EDGE_EXECUTE_F32__) mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_b_sp, N_CRUNS ).smm; #else mykernel = libxsmm_create_xcsr_soa( 
l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_b_sp, N_CRUNS ).dmm; #endif l_start = libxsmm_timer_tick(); for ( l_n = 0; l_n < REPS; l_n++) { mykernel( l_a, l_b_sp, l_c_asm ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); libxsmm_get_kernel_info( LIBXSMM_CONST_VOID_PTR(mykernel), &l_kinfo); l_libxsmmflops = l_kinfo.nflops; printf("%fs for sparse (asm)\n", l_total); printf("%f GFLOPS for sparse (asm), caculated \n", ((double)((double)REPS * (double)M * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9)); printf("%f GFLOPS for sparse (asm), libxsmm \n", ((double)((double)REPS * (double)l_libxsmmflops)) / (l_total * 1.0e9)); /* check for errors */ l_max_error = (REALTYPE)0.0; for ( l_i = 0; l_i < M; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { if (fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) - LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ) > l_max_error ) { l_max_error = (REALTYPE)fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) -LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ); } } } } printf("max error: %f\n", l_max_error); printf("PERFDUMP,%s,%u,%i,%i,%i,%u,%u,%f,%f,%f\n", l_csr_file, REPS, M, N, K, l_elements, M * l_elements * N_CRUNS * 2, l_max_error, l_total, ((double)((double)REPS * (double)M * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9) ); /* free */ libxsmm_free( l_b_de ); libxsmm_free( l_a ); libxsmm_free( l_c_gold ); libxsmm_free( l_c_asm ); free( l_b_sp ); free( l_rowptr ); free( l_colidx ); return 0; } libxsmm-1.17/samples/edge/bsparse_srsoa.vcxproj000066400000000000000000000541561415223013700217610ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 bsparse_srsoa {CFD0F2A1-CC22-4321-A433-21BC6BA0F32D} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 
true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 
$(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) 
libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/edge/common_edge_proxy.h000066400000000000000000000311171415223013700213540ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include #include #include #include #include #if defined(__EDGE_EXECUTE_F32__) #define REALTYPE float #else #define REALTYPE double #endif typedef struct edge_mat_desc { unsigned int row_count; unsigned int col_count; unsigned int num_elements; } edge_mat_desc; static void libxsmm_sparse_csr_reader( const char* i_csr_file_in, unsigned int** o_row_idx, unsigned int** o_column_idx, REALTYPE** o_values, unsigned int* o_row_count, unsigned int* o_column_count, unsigned int* o_element_count ) { FILE *l_csr_file_handle; const unsigned int l_line_length = 512; char l_line[512/*l_line_length*/+1]; unsigned int l_header_read = 0; unsigned int* l_row_idx_id = NULL; unsigned int l_i = 0; l_csr_file_handle = fopen( i_csr_file_in, "r" ); if ( l_csr_file_handle == NULL ) { fprintf( stderr, "cannot open CSR file!\n" ); return; } while (fgets(l_line, l_line_length, l_csr_file_handle) != NULL) { if ( strlen(l_line) == l_line_length ) { fprintf( stderr, "could not read file length!\n" ); return; } /* check if we are still reading comments header */ if ( l_line[0] == '%' ) { continue; } else { /* if we are the first line after comment header, we allocate our data structures */ if ( l_header_read == 0 ) { if (3 == sscanf(l_line, "%u %u %u", o_row_count, o_column_count, o_element_count) && 0 != *o_row_count && 0 != *o_column_count && 0 != *o_element_count) { /* allocate CSC datastructure matching mtx file */ *o_column_idx = (unsigned int*) malloc(sizeof(unsigned int) * (*o_element_count)); *o_row_idx = (unsigned int*) malloc(sizeof(unsigned int) * (*o_row_count + (size_t)1)); *o_values = (REALTYPE*) malloc(sizeof(double) * (*o_element_count)); l_row_idx_id = (unsigned int*) malloc(sizeof(unsigned int) * (*o_row_count)); /* check if mallocs were successful */ if ( ( *o_row_idx == NULL ) || ( *o_column_idx == NULL ) || ( *o_values == NULL ) || ( l_row_idx_id == NULL ) ) { fprintf( stderr, "could not allocate 
sp data!\n" ); return; } /* set everything to zero for init */ memset(*o_row_idx, 0, sizeof(unsigned int)*(*o_row_count + (size_t)1)); memset(*o_column_idx, 0, sizeof(unsigned int)*(*o_element_count)); memset(*o_values, 0, sizeof(double)*(*o_element_count)); memset(l_row_idx_id, 0, sizeof(unsigned int)*(*o_row_count)); /* init column idx */ for ( l_i = 0; l_i < (*o_row_count + 1); l_i++) (*o_row_idx)[l_i] = (*o_element_count); /* init */ (*o_row_idx)[0] = 0; l_i = 0; l_header_read = 1; } else { fprintf( stderr, "could not csr description!\n" ); return; } /* now we read the actual content */ } else { unsigned int l_row, l_column; REALTYPE l_value; /* read a line of content */ #if defined(__EDGE_EXECUTE_F32__) if ( sscanf(l_line, "%u %u %f", &l_row, &l_column, &l_value) != 3 ) { fprintf( stderr, "could not read element!\n" ); return; } #else if ( sscanf(l_line, "%u %u %lf", &l_row, &l_column, &l_value) != 3 ) { fprintf( stderr, "could not read element!\n" ); return; } #endif /* adjust numbers to zero termination */ l_row--; l_column--; /* add these values to row and value structure */ (*o_column_idx)[l_i] = l_column; (*o_values)[l_i] = l_value; l_i++; /* handle columns, set id to own for this column, yeah we need to handle empty columns */ l_row_idx_id[l_row] = 1; (*o_row_idx)[l_row+1] = l_i; } } } /* close mtx file */ fclose( l_csr_file_handle ); /* check if we read a file which was consistent */ if ( l_i != (*o_element_count) ) { fprintf( stderr, "we were not able to read all elements!\n" ); return; } if ( NULL == l_row_idx_id ) { fprintf( stderr, "allocating memory for row indexes failed!\n" ); return; } /* let's handle empty rows */ for ( l_i = 0; l_i < (*o_row_count); l_i++) { if ( l_row_idx_id[l_i] == 0 ) { (*o_row_idx)[l_i+1] = (*o_row_idx)[l_i]; } } /* free helper data structure */ if ( l_row_idx_id != NULL ) { free( l_row_idx_id ); } } static void libxsmm_sparse_csc_reader( const char* i_csc_file_in, unsigned int** o_column_idx, unsigned int** o_row_idx, 
REALTYPE** o_values, unsigned int* o_row_count, unsigned int* o_column_count, unsigned int* o_element_count ) { FILE *l_csc_file_handle; const unsigned int l_line_length = 512; char l_line[512/*l_line_length*/+1]; unsigned int l_header_read = 0; unsigned int* l_column_idx_id = NULL; unsigned int l_i = 0; l_csc_file_handle = fopen( i_csc_file_in, "r" ); if ( l_csc_file_handle == NULL ) { fprintf( stderr, "cannot open CSC file!\n" ); return; } while (fgets(l_line, l_line_length, l_csc_file_handle) != NULL) { if ( strlen(l_line) == l_line_length ) { fprintf( stderr, "could not read file length!\n" ); return; } /* check if we are still reading comments header */ if ( l_line[0] == '%' ) { continue; } else { /* if we are the first line after comment header, we allocate our data structures */ if ( l_header_read == 0 ) { if ( sscanf(l_line, "%u %u %u", o_row_count, o_column_count, o_element_count) == 3 ) { /* allocate CSC datastructure matching mtx file */ *o_row_idx = (unsigned int*) malloc(sizeof(unsigned int) * (*o_element_count)); *o_column_idx = (unsigned int*) malloc(sizeof(unsigned int) * (*o_column_count + (size_t)1)); *o_values = (REALTYPE*) malloc(sizeof(double) * (*o_element_count)); l_column_idx_id = (unsigned int*) malloc(sizeof(unsigned int) * (*o_column_count)); /* check if mallocs were successful */ if ( ( *o_row_idx == NULL ) || ( *o_column_idx == NULL ) || ( *o_values == NULL ) || ( l_column_idx_id == NULL ) ) { fprintf( stderr, "could not allocate sp data!\n" ); return; } /* set everything to zero for init */ memset(*o_column_idx, 0, sizeof(unsigned int)*(*o_column_count + (size_t)1)); memset(*o_row_idx, 0, sizeof(unsigned int)*(*o_element_count)); memset(*o_values, 0, sizeof(double)*(*o_element_count)); memset(l_column_idx_id, 0, sizeof(unsigned int)*(*o_column_count)); /* init column idx */ for ( l_i = 0; l_i < (*o_column_count + 1); l_i++ ) { (*o_column_idx)[l_i] = (*o_element_count); } /* init */ (*o_column_idx)[0] = 0; l_i = 0; l_header_read = 1; } 
else { fprintf( stderr, "could not csr description!\n" ); return; } /* now we read the actual content */ } else { unsigned int l_row, l_column; REALTYPE l_value; /* read a line of content */ #if defined(__EDGE_EXECUTE_F32__) if ( sscanf(l_line, "%u %u %f", &l_row, &l_column, &l_value) != 3 ) { fprintf( stderr, "could not read element!\n" ); return; } #else if ( sscanf(l_line, "%u %u %lf", &l_row, &l_column, &l_value) != 3 ) { fprintf( stderr, "could not read element!\n" ); return; } #endif /* adjust numbers to zero termination */ l_row--; l_column--; /* add these values to row and value structure */ (*o_row_idx)[l_i] = l_row; (*o_values)[l_i] = l_value; l_i++; /* handle columns, set id to own for this column, yeah we need to handle empty columns */ l_column_idx_id[l_column] = 1; (*o_column_idx)[l_column+1] = l_i; } } } /* close mtx file */ fclose( l_csc_file_handle ); /* check if we read a file which was consistent */ if ( l_i != (*o_element_count) ) { fprintf( stderr, "we were not able to read all elements!\n" ); return; } if ( NULL == l_column_idx_id ) { fprintf( stderr, "allocating memory for column indexes failed!\n" ); return; } /* let's handle empty rows */ for ( l_i = 0; l_i < (*o_column_count); l_i++) { if ( l_column_idx_id[l_i] == 0 ) { (*o_column_idx)[l_i+1] = (*o_column_idx)[l_i]; } } /* free helper data structure */ if ( l_column_idx_id != NULL ) { free( l_column_idx_id ); } } static edge_mat_desc libxsmm_sparse_csr_reader_desc( const char* i_csr_file_in ) { FILE *l_csr_file_handle; const unsigned int l_line_length = 512; char l_line[512/*l_line_length*/+1]; unsigned int l_header_read = 0; unsigned int l_row_count = 0; unsigned int l_col_count = 0; unsigned int l_num_elements = 0; edge_mat_desc desc; desc.row_count = 0; desc.col_count = 0; desc.num_elements = 0; l_csr_file_handle = fopen( i_csr_file_in, "r" ); if ( l_csr_file_handle == NULL ) { fprintf( stderr, "cannot open CSR file!\n" ); return desc; } while (fgets(l_line, l_line_length, 
l_csr_file_handle) != NULL) { if ( strlen(l_line) == l_line_length ) { fprintf( stderr, "could not read file length!\n" ); return desc; } /* check if we are still reading comments header */ if ( l_line[0] == '%' ) { continue; } else { /* if we are the first line after comment header, we allocate our data structures */ if ( l_header_read == 0 ) { if ( sscanf(l_line, "%u %u %u", &l_row_count, &l_col_count, &l_num_elements) == 3 ) { l_header_read = 1; desc.row_count = l_row_count; desc.col_count = l_col_count; desc.num_elements = l_num_elements; } else { fprintf( stderr, "could not csr description!\n" ); return desc; } } else { } } } return desc; } static edge_mat_desc libxsmm_sparse_csc_reader_desc( const char* i_csc_file_in ) { FILE *l_csc_file_handle; const unsigned int l_line_length = 512; char l_line[512/*l_line_length*/+1]; unsigned int l_header_read = 0; unsigned int l_row_count = 0; unsigned int l_col_count = 0; unsigned int l_num_elements = 0; edge_mat_desc desc; desc.row_count = 0; desc.col_count = 0; desc.num_elements = 0; l_csc_file_handle = fopen( i_csc_file_in, "r" ); if ( l_csc_file_handle == NULL ) { fprintf( stderr, "cannot open CSC file!\n" ); return desc; } while (fgets(l_line, l_line_length, l_csc_file_handle) != NULL) { if ( strlen(l_line) == l_line_length ) { fprintf( stderr, "could not read file length!\n" ); return desc; } /* check if we are still reading comments header */ if ( l_line[0] == '%' ) { continue; } else { /* if we are the first line after comment header, we allocate our data structures */ if ( l_header_read == 0 ) { if ( sscanf(l_line, "%u %u %u", &l_row_count, &l_col_count, &l_num_elements) == 3 ) { l_header_read = 1; desc.row_count = l_row_count; desc.col_count = l_col_count; desc.num_elements = l_num_elements; } else { fprintf( stderr, "could not csc description!\n" ); return desc; } } else { } } } return desc; } 
libxsmm-1.17/samples/edge/dense_packedacrm.c000066400000000000000000000135751415223013700211120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include #if defined(__EDGE_EXECUTE_F32__) #define REALTYPE float #else #define REALTYPE double #endif static void matMulFusedAC( unsigned int i_r, unsigned int i_m, unsigned int i_n, unsigned int i_k, unsigned int i_ldA, unsigned int i_ldB, unsigned int i_ldC, REALTYPE i_beta, const REALTYPE *i_a, const REALTYPE *i_b, REALTYPE *o_c ) { unsigned int l_m = 0; unsigned int l_n = 0; unsigned int l_r = 0; unsigned int l_k = 0; /* init result matrix */ for ( l_m = 0; l_m < i_m; l_m++ ) { for ( l_n = 0; l_n < i_n; l_n++ ) { for ( l_r = 0; l_r < i_r; l_r++ ) { o_c[l_m*i_ldC*i_r + l_n*i_r + l_r] = (i_beta != (REALTYPE)0) ? o_c[l_m*i_ldC*i_r + l_n*i_r + l_r] * i_beta : 0; } } } /* perform matmul */ for ( l_k = 0; l_k < i_k; l_k++ ) { for ( l_m = 0; l_m < i_m; l_m++ ) { for ( l_n = 0; l_n < i_n; l_n++ ) { for ( l_r = 0; l_r < i_r; l_r++ ) { o_c[l_m*i_ldC*i_r + l_n*i_r + l_r] += i_a[l_m*i_ldA*i_r + l_k*i_r + l_r] * i_b[l_k*i_ldB + l_n]; } } } } } int main(int argc, char* argv[]) { #if defined(__EDGE_EXECUTE_F32__) unsigned int l_r = 16; #else unsigned int l_r = 8; #endif unsigned int l_m = 1 < argc ? atoi(argv[1]) : 0; unsigned int l_n = 2 < argc ? atoi(argv[2]) : 0; unsigned int l_k = 3 < argc ? atoi(argv[3]) : 0; REALTYPE l_beta = (REALTYPE)(4 < argc ? 
atof(argv[4]) : 0); REALTYPE l_alpha = 1.0; unsigned int l_reps = 5 < argc ? atoi(argv[5]) : 0; double flops = (double)l_m * (double)l_n * (double)l_k * (double)l_r * (double)l_reps; REALTYPE* a = (REALTYPE*) libxsmm_aligned_malloc( l_m*l_k*l_r*sizeof(REALTYPE), 64 ); REALTYPE* b = (REALTYPE*) libxsmm_aligned_malloc( l_k*l_n*sizeof(REALTYPE), 64 ); REALTYPE* c1 = (REALTYPE*) libxsmm_aligned_malloc( l_m*l_n*l_r*sizeof(REALTYPE), 64 ); REALTYPE* c2 = (REALTYPE*) libxsmm_aligned_malloc( l_m*l_n*l_r*sizeof(REALTYPE), 64 ); libxsmm_descriptor_blob l_xgemm_blob; const libxsmm_gemm_descriptor* l_xgemm_desc = 0; LIBXSMM_MMFUNCTION_TYPE(REALTYPE) mykernel = NULL; const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); libxsmm_timer_tickint l_start, l_end; double l_total_ref, l_total_opt; double max_error = 0.0; double gflops_ref = 0.0; double gflops_opt = 0.0; double gflops_opt2 = 0.0; unsigned int i = 0; unsigned long long l_libxsmmflops; libxsmm_kernel_info l_kinfo; for ( i = 0; i < l_m*l_n*l_r; ++i ) { c1[i] = (REALTYPE)libxsmm_rng_f64(); } for ( i = 0; i < l_m*l_n*l_r; ++i ) { c2[i] = c1[i]; } for ( i = 0; i < l_m*l_k*l_r; ++i ) { a[i] = (REALTYPE)libxsmm_rng_f64(); } for ( i = 0; i < l_k*l_n; ++i ) { b[i] = (REALTYPE)libxsmm_rng_f64(); } /* JIT code */ l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(REALTYPE), l_m, l_n, l_k, l_k, l_n, l_n, l_alpha, l_beta, flags, prefetch); #if defined(__EDGE_EXECUTE_F32__) mykernel = libxsmm_create_pgemm_ac_rm( l_xgemm_desc, l_r ).smm; #else mykernel = libxsmm_create_pgemm_ac_rm( l_xgemm_desc, l_r ).dmm; #endif /* run reference */ matMulFusedAC( l_r, l_m, l_n, l_k, l_k, l_n, l_n, l_beta, a, b, c1); /* run optimized */ mykernel( a, b, c2 ); /* check correctness */ for ( i = 0; i < l_m*l_n*l_r; ++i ) { if ( max_error < fabs( c1[i] - c2[i] ) ) { max_error = fabs( c1[i] - c2[i] ); } } printf("Max. 
Error: %f\n", max_error); /* lets run some performance test */ l_start = libxsmm_timer_tick(); for ( i = 0; i < l_reps; ++i ) { /* run reference */ matMulFusedAC( l_r, l_m, l_n, l_k, l_k, l_n, l_n, l_beta, a, b, c1); } l_end = libxsmm_timer_tick(); l_total_ref = libxsmm_timer_duration(l_start, l_end); l_start = libxsmm_timer_tick(); for ( i = 0; i < l_reps; ++i ) { /* run optimized */ mykernel( a, b, c2); } l_end = libxsmm_timer_tick(); l_total_opt = libxsmm_timer_duration(l_start, l_end); libxsmm_get_kernel_info( LIBXSMM_CONST_VOID_PTR(mykernel), &l_kinfo); l_libxsmmflops = l_kinfo.nflops; gflops_ref = (flops/l_total_ref)/1e9; gflops_opt = (flops/l_total_opt)/1e9; gflops_opt2 = (((double)l_libxsmmflops)/l_total_opt)/1e9; printf("GFLOPS ref: %f\n", gflops_ref); printf("GFLOPS opt, calculated: %f\n", gflops_opt); printf("GFLOPS opt, libxsmm: %f\n", gflops_opt2); libxsmm_free( a ); libxsmm_free( b ); libxsmm_free( c1 ); libxsmm_free( c2 ); return 0; } libxsmm-1.17/samples/edge/dense_packedacrm.vcxproj000066400000000000000000000540441415223013700223570ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 dense_packedacrm {FE7A7F77-2363-480D-A2C5-D9B72260A8AB} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 
0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/edge/dense_packedbcrm.c000066400000000000000000000135731415223013700211110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include #if defined(__EDGE_EXECUTE_F32__) #define REALTYPE float #else #define REALTYPE double #endif static void matMulFusedBC( unsigned int i_r, unsigned int i_m, unsigned int i_n, unsigned int i_k, unsigned int i_ldA, unsigned int i_ldB, unsigned int i_ldC, REALTYPE i_beta, const REALTYPE *i_a, const REALTYPE *i_b, REALTYPE *o_c ) { unsigned int l_m = 0; unsigned int l_n = 0; unsigned int l_r = 0; unsigned int l_k = 0; /* init result matrix */ for ( l_m = 0; l_m < i_m; l_m++ ) { for ( l_n = 0; l_n < i_n; l_n++ ) { for ( l_r = 0; l_r < i_r; l_r++ ) { o_c[l_m*i_ldC*i_r + l_n*i_r + l_r] = (i_beta != (REALTYPE)0) ? o_c[l_m*i_ldC*i_r + l_n*i_r + l_r] * i_beta : 0; } } } /* perform matmul */ for ( l_k = 0; l_k < i_k; l_k++ ) { for ( l_m = 0; l_m < i_m; l_m++ ) { for ( l_n = 0; l_n < i_n; l_n++ ) { for ( l_r = 0; l_r < i_r; l_r++ ) { o_c[l_m*i_ldC*i_r + l_n*i_r + l_r] += i_a[l_m*i_ldA + l_k] * i_b[l_k*i_ldB*i_r + l_n*i_r + l_r]; } } } } } int main(int argc, char* argv[]) { #if defined(__EDGE_EXECUTE_F32__) unsigned int l_r = 16; #else unsigned int l_r = 8; #endif unsigned int l_m = 1 < argc ? atoi(argv[1]) : 0; unsigned int l_n = 2 < argc ? 
atoi(argv[2]) : 0; unsigned int l_k = 3 < argc ? atoi(argv[3]) : 0; REALTYPE l_beta = (REALTYPE)(4 < argc ? atof(argv[4]) : 0); REALTYPE l_alpha = 1.0; unsigned int l_reps = 5 < argc ? atoi(argv[5]) : 0; double flops = (double)l_m * (double)l_n * (double)l_k * (double)l_r * (double)l_reps; REALTYPE* a = (REALTYPE*) libxsmm_aligned_malloc( l_m*l_k*sizeof(REALTYPE), 64 ); REALTYPE* b = (REALTYPE*) libxsmm_aligned_malloc( l_k*l_n*l_r*sizeof(REALTYPE), 64 ); REALTYPE* c1 = (REALTYPE*) libxsmm_aligned_malloc( l_m*l_n*l_r*sizeof(REALTYPE), 64 ); REALTYPE* c2 = (REALTYPE*) libxsmm_aligned_malloc( l_m*l_n*l_r*sizeof(REALTYPE), 64 ); libxsmm_descriptor_blob l_xgemm_blob; const libxsmm_gemm_descriptor* l_xgemm_desc = 0; LIBXSMM_MMFUNCTION_TYPE(REALTYPE) mykernel = NULL; const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); libxsmm_timer_tickint l_start, l_end; double l_total_ref, l_total_opt; double max_error = 0.0; double gflops_ref = 0.0; double gflops_opt = 0.0; double gflops_opt2 = 0.0; unsigned int i = 0; unsigned long long l_libxsmmflops; libxsmm_kernel_info l_kinfo; for ( i = 0; i < l_m*l_n*l_r; ++i ) { c1[i] = (REALTYPE)libxsmm_rng_f64(); } for ( i = 0; i < l_m*l_n*l_r; ++i ) { c2[i] = c1[i]; } for ( i = 0; i < l_m*l_k; ++i ) { a[i] = (REALTYPE)libxsmm_rng_f64(); } for ( i = 0; i < l_k*l_n*l_r; ++i ) { b[i] = (REALTYPE)libxsmm_rng_f64(); } /* JIT code */ l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(REALTYPE), l_m, l_n, l_k, l_k, l_n, l_n, l_alpha, l_beta, flags, prefetch); #if defined(__EDGE_EXECUTE_F32__) mykernel = libxsmm_create_pgemm_bc_rm( l_xgemm_desc, l_r ).smm; #else mykernel = libxsmm_create_pgemm_bc_rm( l_xgemm_desc, l_r ).dmm; #endif /* run reference */ matMulFusedBC( l_r, l_m, l_n, l_k, l_k, l_n, l_n, l_beta, a, b, c1); /* run optimized */ mykernel( a, b, c2 ); /* check correctness */ for ( i = 0; i < l_m*l_n*l_r; ++i ) { if ( max_error < fabs( c1[i] - 
c2[i] ) ) { max_error = fabs( c1[i] - c2[i] ); } } printf("Max. Error: %f\n", max_error); /* lets run some performance test */ l_start = libxsmm_timer_tick(); for ( i = 0; i < l_reps; ++i ) { /* run reference */ matMulFusedBC( l_r, l_m, l_n, l_k, l_k, l_n, l_n, l_beta, a, b, c1); } l_end = libxsmm_timer_tick(); l_total_ref = libxsmm_timer_duration(l_start, l_end); l_start = libxsmm_timer_tick(); for ( i = 0; i < l_reps; ++i ) { /* run optimized */ mykernel( a, b, c2); } l_end = libxsmm_timer_tick(); l_total_opt = libxsmm_timer_duration(l_start, l_end); libxsmm_get_kernel_info( LIBXSMM_CONST_VOID_PTR(mykernel), &l_kinfo); l_libxsmmflops = l_kinfo.nflops; gflops_ref = (flops/l_total_ref)/1e9; gflops_opt = (flops/l_total_opt)/1e9; gflops_opt2 = (((double)l_libxsmmflops)/l_total_opt)/1e9; printf("GFLOPS ref: %f\n", gflops_ref); printf("GFLOPS opt, calculated: %f\n", gflops_opt); printf("GFLOPS opt, libxsmm: %f\n", gflops_opt2); libxsmm_free( a ); libxsmm_free( b ); libxsmm_free( c1 ); libxsmm_free( c2 ); return 0; } libxsmm-1.17/samples/edge/dense_packedbcrm.vcxproj000066400000000000000000000540441415223013700223600ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 dense_packedbcrm {7E33EB2B-7C25-4308-A86E-6D9A90F8613D} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true 
None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console 
$(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/edge/edge_download_mats.sh000077500000000000000000000233411415223013700216440ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) MKDIR=$(command -v mkdir) WGET=$(command -v wget) # ls -1 | xargs NAMES="tet4_0_fluxL_0_csc.mtx tet4_0_fluxL_0_csr.mtx tet4_0_fluxL_1_csc.mtx tet4_0_fluxL_1_csr.mtx tet4_0_fluxL_2_csc.mtx tet4_0_fluxL_2_csr.mtx tet4_0_fluxL_3_csc.mtx tet4_0_fluxL_3_csr.mtx tet4_0_fluxN_0_csc.mtx tet4_0_fluxN_0_csr.mtx tet4_0_fluxN_1_csc.mtx tet4_0_fluxN_1_csr.mtx tet4_0_fluxN_10_csc.mtx tet4_0_fluxN_10_csr.mtx tet4_0_fluxN_11_csc.mtx tet4_0_fluxN_11_csr.mtx tet4_0_fluxN_2_csc.mtx tet4_0_fluxN_2_csr.mtx tet4_0_fluxN_3_csc.mtx tet4_0_fluxN_3_csr.mtx tet4_0_fluxN_4_csc.mtx tet4_0_fluxN_4_csr.mtx tet4_0_fluxN_5_csc.mtx tet4_0_fluxN_5_csr.mtx tet4_0_fluxN_6_csc.mtx tet4_0_fluxN_6_csr.mtx tet4_0_fluxN_7_csc.mtx tet4_0_fluxN_7_csr.mtx tet4_0_fluxN_8_csc.mtx tet4_0_fluxN_8_csr.mtx tet4_0_fluxN_9_csc.mtx tet4_0_fluxN_9_csr.mtx tet4_0_fluxT_0_csc.mtx tet4_0_fluxT_0_csr.mtx tet4_0_fluxT_1_csc.mtx tet4_0_fluxT_1_csr.mtx tet4_0_fluxT_2_csc.mtx tet4_0_fluxT_2_csr.mtx tet4_0_fluxT_3_csc.mtx tet4_0_fluxT_3_csr.mtx tet4_0_ma_0_csc.mtx tet4_0_ma_0_csr.mtx tet4_0_stiffT_0_csc.mtx tet4_0_stiffT_0_csr.mtx 
tet4_0_stiffT_1_csc.mtx tet4_0_stiffT_1_csr.mtx tet4_0_stiffT_2_csc.mtx tet4_0_stiffT_2_csr.mtx tet4_0_stiffV_0_csc.mtx tet4_0_stiffV_0_csr.mtx tet4_0_stiffV_1_csc.mtx tet4_0_stiffV_1_csr.mtx tet4_0_stiffV_2_csc.mtx tet4_0_stiffV_2_csr.mtx tet4_1_fluxL_0_csc.mtx tet4_1_fluxL_0_csr.mtx tet4_1_fluxL_1_csc.mtx tet4_1_fluxL_1_csr.mtx tet4_1_fluxL_2_csc.mtx tet4_1_fluxL_2_csr.mtx tet4_1_fluxL_3_csc.mtx tet4_1_fluxL_3_csr.mtx tet4_1_fluxN_0_csc.mtx tet4_1_fluxN_0_csr.mtx tet4_1_fluxN_1_csc.mtx tet4_1_fluxN_1_csr.mtx tet4_1_fluxN_10_csc.mtx tet4_1_fluxN_10_csr.mtx tet4_1_fluxN_11_csc.mtx tet4_1_fluxN_11_csr.mtx tet4_1_fluxN_2_csc.mtx tet4_1_fluxN_2_csr.mtx tet4_1_fluxN_3_csc.mtx tet4_1_fluxN_3_csr.mtx tet4_1_fluxN_4_csc.mtx tet4_1_fluxN_4_csr.mtx tet4_1_fluxN_5_csc.mtx tet4_1_fluxN_5_csr.mtx tet4_1_fluxN_6_csc.mtx tet4_1_fluxN_6_csr.mtx tet4_1_fluxN_7_csc.mtx tet4_1_fluxN_7_csr.mtx tet4_1_fluxN_8_csc.mtx tet4_1_fluxN_8_csr.mtx tet4_1_fluxN_9_csc.mtx tet4_1_fluxN_9_csr.mtx tet4_1_fluxT_0_csc.mtx tet4_1_fluxT_0_csr.mtx tet4_1_fluxT_1_csc.mtx tet4_1_fluxT_1_csr.mtx tet4_1_fluxT_2_csc.mtx tet4_1_fluxT_2_csr.mtx tet4_1_fluxT_3_csc.mtx tet4_1_fluxT_3_csr.mtx tet4_1_ma_0_csc.mtx tet4_1_ma_0_csr.mtx tet4_1_stiffT_0_csc.mtx tet4_1_stiffT_0_csr.mtx tet4_1_stiffT_1_csc.mtx tet4_1_stiffT_1_csr.mtx tet4_1_stiffT_2_csc.mtx tet4_1_stiffT_2_csr.mtx tet4_1_stiffV_0_csc.mtx tet4_1_stiffV_0_csr.mtx tet4_1_stiffV_1_csc.mtx tet4_1_stiffV_1_csr.mtx tet4_1_stiffV_2_csc.mtx tet4_1_stiffV_2_csr.mtx tet4_2_fluxL_0_csc.mtx tet4_2_fluxL_0_csr.mtx tet4_2_fluxL_1_csc.mtx tet4_2_fluxL_1_csr.mtx tet4_2_fluxL_2_csc.mtx tet4_2_fluxL_2_csr.mtx tet4_2_fluxL_3_csc.mtx tet4_2_fluxL_3_csr.mtx tet4_2_fluxN_0_csc.mtx tet4_2_fluxN_0_csr.mtx tet4_2_fluxN_1_csc.mtx tet4_2_fluxN_1_csr.mtx tet4_2_fluxN_10_csc.mtx tet4_2_fluxN_10_csr.mtx tet4_2_fluxN_11_csc.mtx tet4_2_fluxN_11_csr.mtx tet4_2_fluxN_2_csc.mtx tet4_2_fluxN_2_csr.mtx tet4_2_fluxN_3_csc.mtx tet4_2_fluxN_3_csr.mtx tet4_2_fluxN_4_csc.mtx 
tet4_2_fluxN_4_csr.mtx tet4_2_fluxN_5_csc.mtx tet4_2_fluxN_5_csr.mtx tet4_2_fluxN_6_csc.mtx tet4_2_fluxN_6_csr.mtx tet4_2_fluxN_7_csc.mtx tet4_2_fluxN_7_csr.mtx tet4_2_fluxN_8_csc.mtx tet4_2_fluxN_8_csr.mtx tet4_2_fluxN_9_csc.mtx tet4_2_fluxN_9_csr.mtx tet4_2_fluxT_0_csc.mtx tet4_2_fluxT_0_csr.mtx tet4_2_fluxT_1_csc.mtx tet4_2_fluxT_1_csr.mtx tet4_2_fluxT_2_csc.mtx tet4_2_fluxT_2_csr.mtx tet4_2_fluxT_3_csc.mtx tet4_2_fluxT_3_csr.mtx tet4_2_ma_0_csc.mtx tet4_2_ma_0_csr.mtx tet4_2_stiffT_0_csc.mtx tet4_2_stiffT_0_csr.mtx tet4_2_stiffT_1_csc.mtx tet4_2_stiffT_1_csr.mtx tet4_2_stiffT_2_csc.mtx tet4_2_stiffT_2_csr.mtx tet4_2_stiffV_0_csc.mtx tet4_2_stiffV_0_csr.mtx tet4_2_stiffV_1_csc.mtx tet4_2_stiffV_1_csr.mtx tet4_2_stiffV_2_csc.mtx tet4_2_stiffV_2_csr.mtx tet4_3_fluxL_0_csc.mtx tet4_3_fluxL_0_csr.mtx tet4_3_fluxL_1_csc.mtx tet4_3_fluxL_1_csr.mtx tet4_3_fluxL_2_csc.mtx tet4_3_fluxL_2_csr.mtx tet4_3_fluxL_3_csc.mtx tet4_3_fluxL_3_csr.mtx tet4_3_fluxN_0_csc.mtx tet4_3_fluxN_0_csr.mtx tet4_3_fluxN_1_csc.mtx tet4_3_fluxN_1_csr.mtx tet4_3_fluxN_10_csc.mtx tet4_3_fluxN_10_csr.mtx tet4_3_fluxN_11_csc.mtx tet4_3_fluxN_11_csr.mtx tet4_3_fluxN_2_csc.mtx tet4_3_fluxN_2_csr.mtx tet4_3_fluxN_3_csc.mtx tet4_3_fluxN_3_csr.mtx tet4_3_fluxN_4_csc.mtx tet4_3_fluxN_4_csr.mtx tet4_3_fluxN_5_csc.mtx tet4_3_fluxN_5_csr.mtx tet4_3_fluxN_6_csc.mtx tet4_3_fluxN_6_csr.mtx tet4_3_fluxN_7_csc.mtx tet4_3_fluxN_7_csr.mtx tet4_3_fluxN_8_csc.mtx tet4_3_fluxN_8_csr.mtx tet4_3_fluxN_9_csc.mtx tet4_3_fluxN_9_csr.mtx tet4_3_fluxT_0_csc.mtx tet4_3_fluxT_0_csr.mtx tet4_3_fluxT_1_csc.mtx tet4_3_fluxT_1_csr.mtx tet4_3_fluxT_2_csc.mtx tet4_3_fluxT_2_csr.mtx tet4_3_fluxT_3_csc.mtx tet4_3_fluxT_3_csr.mtx tet4_3_ma_0_csc.mtx tet4_3_ma_0_csr.mtx tet4_3_stiffT_0_csc.mtx tet4_3_stiffT_0_csr.mtx tet4_3_stiffT_1_csc.mtx tet4_3_stiffT_1_csr.mtx tet4_3_stiffT_2_csc.mtx tet4_3_stiffT_2_csr.mtx tet4_3_stiffV_0_csc.mtx tet4_3_stiffV_0_csr.mtx tet4_3_stiffV_1_csc.mtx tet4_3_stiffV_1_csr.mtx tet4_3_stiffV_2_csc.mtx 
tet4_3_stiffV_2_csr.mtx tet4_4_fluxL_0_csc.mtx tet4_4_fluxL_0_csr.mtx tet4_4_fluxL_1_csc.mtx tet4_4_fluxL_1_csr.mtx tet4_4_fluxL_2_csc.mtx tet4_4_fluxL_2_csr.mtx tet4_4_fluxL_3_csc.mtx tet4_4_fluxL_3_csr.mtx tet4_4_fluxN_0_csc.mtx tet4_4_fluxN_0_csr.mtx tet4_4_fluxN_1_csc.mtx tet4_4_fluxN_1_csr.mtx tet4_4_fluxN_10_csc.mtx tet4_4_fluxN_10_csr.mtx tet4_4_fluxN_11_csc.mtx tet4_4_fluxN_11_csr.mtx tet4_4_fluxN_2_csc.mtx tet4_4_fluxN_2_csr.mtx tet4_4_fluxN_3_csc.mtx tet4_4_fluxN_3_csr.mtx tet4_4_fluxN_4_csc.mtx tet4_4_fluxN_4_csr.mtx tet4_4_fluxN_5_csc.mtx tet4_4_fluxN_5_csr.mtx tet4_4_fluxN_6_csc.mtx tet4_4_fluxN_6_csr.mtx tet4_4_fluxN_7_csc.mtx tet4_4_fluxN_7_csr.mtx tet4_4_fluxN_8_csc.mtx tet4_4_fluxN_8_csr.mtx tet4_4_fluxN_9_csc.mtx tet4_4_fluxN_9_csr.mtx tet4_4_fluxT_0_csc.mtx tet4_4_fluxT_0_csr.mtx tet4_4_fluxT_1_csc.mtx tet4_4_fluxT_1_csr.mtx tet4_4_fluxT_2_csc.mtx tet4_4_fluxT_2_csr.mtx tet4_4_fluxT_3_csc.mtx tet4_4_fluxT_3_csr.mtx tet4_4_ma_0_csc.mtx tet4_4_ma_0_csr.mtx tet4_4_stiffT_0_csc.mtx tet4_4_stiffT_0_csr.mtx tet4_4_stiffT_1_csc.mtx tet4_4_stiffT_1_csr.mtx tet4_4_stiffT_2_csc.mtx tet4_4_stiffT_2_csr.mtx tet4_4_stiffV_0_csc.mtx tet4_4_stiffV_0_csr.mtx tet4_4_stiffV_1_csc.mtx tet4_4_stiffV_1_csr.mtx tet4_4_stiffV_2_csc.mtx tet4_4_stiffV_2_csr.mtx tet4_5_fluxL_0_csc.mtx tet4_5_fluxL_0_csr.mtx tet4_5_fluxL_1_csc.mtx tet4_5_fluxL_1_csr.mtx tet4_5_fluxL_2_csc.mtx tet4_5_fluxL_2_csr.mtx tet4_5_fluxL_3_csc.mtx tet4_5_fluxL_3_csr.mtx tet4_5_fluxN_0_csc.mtx tet4_5_fluxN_0_csr.mtx tet4_5_fluxN_1_csc.mtx tet4_5_fluxN_1_csr.mtx tet4_5_fluxN_10_csc.mtx tet4_5_fluxN_10_csr.mtx tet4_5_fluxN_11_csc.mtx tet4_5_fluxN_11_csr.mtx tet4_5_fluxN_2_csc.mtx tet4_5_fluxN_2_csr.mtx tet4_5_fluxN_3_csc.mtx tet4_5_fluxN_3_csr.mtx tet4_5_fluxN_4_csc.mtx tet4_5_fluxN_4_csr.mtx tet4_5_fluxN_5_csc.mtx tet4_5_fluxN_5_csr.mtx tet4_5_fluxN_6_csc.mtx tet4_5_fluxN_6_csr.mtx tet4_5_fluxN_7_csc.mtx tet4_5_fluxN_7_csr.mtx tet4_5_fluxN_8_csc.mtx tet4_5_fluxN_8_csr.mtx tet4_5_fluxN_9_csc.mtx 
tet4_5_fluxN_9_csr.mtx tet4_5_fluxT_0_csc.mtx tet4_5_fluxT_0_csr.mtx tet4_5_fluxT_1_csc.mtx tet4_5_fluxT_1_csr.mtx tet4_5_fluxT_2_csc.mtx tet4_5_fluxT_2_csr.mtx tet4_5_fluxT_3_csc.mtx tet4_5_fluxT_3_csr.mtx tet4_5_ma_0_csc.mtx tet4_5_ma_0_csr.mtx tet4_5_stiffT_0_csc.mtx tet4_5_stiffT_0_csr.mtx tet4_5_stiffT_1_csc.mtx tet4_5_stiffT_1_csr.mtx tet4_5_stiffT_2_csc.mtx tet4_5_stiffT_2_csr.mtx tet4_5_stiffV_0_csc.mtx tet4_5_stiffV_0_csr.mtx tet4_5_stiffV_1_csc.mtx tet4_5_stiffV_1_csr.mtx tet4_5_stiffV_2_csc.mtx tet4_5_stiffV_2_csr.mtx tet4_6_fluxL_0_csc.mtx tet4_6_fluxL_0_csr.mtx tet4_6_fluxL_1_csc.mtx tet4_6_fluxL_1_csr.mtx tet4_6_fluxL_2_csc.mtx tet4_6_fluxL_2_csr.mtx tet4_6_fluxL_3_csc.mtx tet4_6_fluxL_3_csr.mtx tet4_6_fluxN_0_csc.mtx tet4_6_fluxN_0_csr.mtx tet4_6_fluxN_1_csc.mtx tet4_6_fluxN_1_csr.mtx tet4_6_fluxN_10_csc.mtx tet4_6_fluxN_10_csr.mtx tet4_6_fluxN_11_csc.mtx tet4_6_fluxN_11_csr.mtx tet4_6_fluxN_2_csc.mtx tet4_6_fluxN_2_csr.mtx tet4_6_fluxN_3_csc.mtx tet4_6_fluxN_3_csr.mtx tet4_6_fluxN_4_csc.mtx tet4_6_fluxN_4_csr.mtx tet4_6_fluxN_5_csc.mtx tet4_6_fluxN_5_csr.mtx tet4_6_fluxN_6_csc.mtx tet4_6_fluxN_6_csr.mtx tet4_6_fluxN_7_csc.mtx tet4_6_fluxN_7_csr.mtx tet4_6_fluxN_8_csc.mtx tet4_6_fluxN_8_csr.mtx tet4_6_fluxN_9_csc.mtx tet4_6_fluxN_9_csr.mtx tet4_6_fluxT_0_csc.mtx tet4_6_fluxT_0_csr.mtx tet4_6_fluxT_1_csc.mtx tet4_6_fluxT_1_csr.mtx tet4_6_fluxT_2_csc.mtx tet4_6_fluxT_2_csr.mtx tet4_6_fluxT_3_csc.mtx tet4_6_fluxT_3_csr.mtx tet4_6_ma_0_csc.mtx tet4_6_ma_0_csr.mtx tet4_6_stiffT_0_csc.mtx tet4_6_stiffT_0_csr.mtx tet4_6_stiffT_1_csc.mtx tet4_6_stiffT_1_csr.mtx tet4_6_stiffT_2_csc.mtx tet4_6_stiffT_2_csr.mtx tet4_6_stiffV_0_csc.mtx tet4_6_stiffV_0_csr.mtx tet4_6_stiffV_1_csc.mtx tet4_6_stiffV_1_csr.mtx tet4_6_stiffV_2_csc.mtx tet4_6_stiffV_2_csr.mtx tet4_fluxMatrix_csr_de.mtx tet4_fluxMatrix_csr_sp.mtx tet4_starMatrix_csc.mtx tet4_starMatrix_csr.mtx" if [ "${MKDIR}" ] && [ "${WGET}" ]; then ${MKDIR} -p ${HERE}/mats; cd ${HERE}/mats for NAME in ${NAMES}; do 
${WGET} -N https://github.com/hfp/libxsmm/raw/master/samples/edge/mats/${NAME} done fi libxsmm-1.17/samples/edge/edge_proxy_common.c000066400000000000000000000134071415223013700213510ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "edge_proxy_common.h" void edge_sparse_csr_reader_double( const char* i_csr_file_in, unsigned int** o_row_idx, unsigned int** o_column_idx, double** o_values, unsigned int* o_row_count, unsigned int* o_column_count, unsigned int* o_element_count ) { FILE *l_csr_file_handle; const unsigned int l_line_length = 512; char l_line[512/*l_line_length*/+1]; unsigned int l_header_read = 0; unsigned int* l_row_idx_id = NULL; unsigned int l_i = 0; l_csr_file_handle = fopen( i_csr_file_in, "r" ); if ( l_csr_file_handle == NULL ) { fprintf( stderr, "cannot open CSR file!\n" ); return; } while (fgets(l_line, l_line_length, l_csr_file_handle) != NULL) { if ( strlen(l_line) == l_line_length ) { fprintf( stderr, "could not read file length!\n" ); return; } /* check if we are still reading comments header */ if ( l_line[0] == '%' ) { continue; } else { /* if we are the first line after comment header, we allocate our data structures */ if ( l_header_read == 0 ) { if (3 == sscanf(l_line, "%u %u %u", o_row_count, o_column_count, o_element_count) && 0 != *o_row_count && 0 != *o_column_count && 0 != *o_element_count) { /* allocate CSC datastructure matching mtx file */ *o_column_idx = (unsigned int*) malloc(sizeof(unsigned int) * 
(*o_element_count)); *o_row_idx = (unsigned int*) malloc(sizeof(unsigned int) * (*o_row_count + 1)); *o_values = (double*) malloc(sizeof(double) * (*o_element_count)); l_row_idx_id = (unsigned int*) malloc(sizeof(unsigned int) * (*o_row_count)); /* check if mallocs were successful */ if ( ( *o_row_idx == NULL ) || ( *o_column_idx == NULL ) || ( *o_values == NULL ) || ( l_row_idx_id == NULL ) ) { fprintf( stderr, "could not allocate sp data!\n" ); return; } /* set everything to zero for init */ memset(*o_row_idx, 0, sizeof(unsigned int)*(*o_row_count + 1)); memset(*o_column_idx, 0, sizeof(unsigned int)*(*o_element_count)); memset(*o_values, 0, sizeof(double)*(*o_element_count)); memset(l_row_idx_id, 0, sizeof(unsigned int)*(*o_row_count)); /* init column idx */ for ( l_i = 0; l_i < (*o_row_count + 1); l_i++) (*o_row_idx)[l_i] = (*o_element_count); /* init */ (*o_row_idx)[0] = 0; l_i = 0; l_header_read = 1; } else { fprintf( stderr, "could not csr description!\n" ); return; } /* now we read the actual content */ } else { unsigned int l_row, l_column; double l_value; /* read a line of content */ if ( sscanf(l_line, "%u %u %lf", &l_row, &l_column, &l_value) != 3 ) { fprintf( stderr, "could not read element!\n" ); return; } /* adjust numbers to zero termination */ l_row--; l_column--; /* add these values to row and value structure */ (*o_column_idx)[l_i] = l_column; (*o_values)[l_i] = l_value; l_i++; /* handle columns, set id to own for this column, yeah we need to handle empty columns */ l_row_idx_id[l_row] = 1; (*o_row_idx)[l_row+1] = l_i; } } } /* close mtx file */ fclose( l_csr_file_handle ); /* check if we read a file which was consistent */ if ( l_i != (*o_element_count) ) { fprintf( stderr, "we were not able to read all elements!\n" ); return; } /* let's handle empty rows */ for ( l_i = 0; l_i < (*o_row_count); l_i++) { if ( l_row_idx_id[l_i] == 0 ) { (*o_row_idx)[l_i+1] = (*o_row_idx)[l_i]; } } /* free helper data structure */ if ( l_row_idx_id != NULL ) { free( 
l_row_idx_id ); } } void edge_sparse_csr_reader_float( const char* i_csr_file_in, unsigned int** o_row_idx, unsigned int** o_column_idx, float** o_values, unsigned int* o_row_count, unsigned int* o_column_count, unsigned int* o_element_count ) { double* l_values; unsigned int i; /* read using double */ edge_sparse_csr_reader_double( i_csr_file_in, o_row_idx, o_column_idx, &l_values, o_row_count, o_column_count, o_element_count ); /* converting double values into float */ *o_values = (float*) malloc((*o_element_count)*sizeof(float)); for ( i = 0; i < (*o_element_count); ++i ) { (*o_values)[i] = (float)l_values[i]; } free(l_values); } libxsmm-1.17/samples/edge/edge_proxy_common.h000066400000000000000000000035451415223013700213600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include #include #include #ifndef EDGE_COMMON_H #define EDGE_COMMON_H void edge_sparse_csr_reader_double( const char* i_csr_file_in, unsigned int** o_row_idx, unsigned int** o_column_idx, double** o_values, unsigned int* o_row_count, unsigned int* o_column_count, unsigned int* o_element_count ); void edge_sparse_csr_reader_float( const char* i_csr_file_in, unsigned int** o_row_idx, unsigned int** o_column_idx, float** o_values, unsigned int* o_row_count, unsigned int* o_column_count, unsigned int* o_element_count ); #endif /* EDGE_COMMON_H */ libxsmm-1.17/samples/edge/edge_vol_int.c000066400000000000000000000457011415223013700202740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "edge_proxy_common.h" #include #include #include #include #include #include #if defined(_OPENMP) # include #endif /*#define EDGE_HP_1G*/ /*#define HANDLE_AMOK*/ #if defined(EDGE_HP_1G) || defined(EDGE_HP_2M) #include #include #endif LIBXSMM_INLINE void* edge_hp_malloc( size_t nbytes, size_t alignment ) { void* ret_ptr = NULL; #if defined(EDGE_HP_1G) size_t num_large_pages = nbytes / (1073741824L); if ( nbytes > num_large_pages*1073741824L ) { num_large_pages++; } nbytes = (size_t) num_large_pages * 1073741824L; printf("trying to allocate %ld 1G pages\n", num_large_pages); /*ret_ptr = mmap( NULL, nbytes, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | MAP_HUGE_1GB, -1, 0 );*/ ret_ptr = mmap( NULL, nbytes, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | MAP_HUGE_1GB, -1, 0 ); if ( (ret_ptr == (void *)(-1)) ) { fprintf(stderr,"1G mmap call failed\n"); exit(1); } #elif defined(EDGE_HP_2M) size_t num_large_pages = nbytes / (2097152UL); if ( nbytes > num_large_pages*2097152UL ) { num_large_pages++; } nbytes = (size_t) num_large_pages * 2097152UL; printf("trying to allocate %ld 2M pages\n", num_large_pages); /*ret_ptr = mmap( NULL, nbytes, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0 );*/ ret_ptr = mmap( NULL, nbytes, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0 ); if ( (ret_ptr == (void *)(-1)) ) { fprintf(stderr,"2M mmap call failed\n"); exit(1); } #else ret_ptr = libxsmm_aligned_malloc( nbytes, alignment ); #endif return ret_ptr; } LIBXSMM_INLINE void edge_hp_free( void* ptr, size_t nbytes ) { LIBXSMM_UNUSED( nbytes ); #if defined(EDGE_HP_1G) /* to be implemented */ #elif defined(EDGE_HP_2M) /* to be implemented */ #else libxsmm_free( ptr ); #endif } #if defined(__AVX512F__) LIBXSMM_INLINE void matMulFusedAC( unsigned short i_r, unsigned int i_m, unsigned int i_n, 
unsigned int i_k, unsigned int i_ldA, unsigned int i_ldB, unsigned int i_ldC, double i_beta, const double *i_a, const double *i_b, double *o_c ) { unsigned int l_m, l_n, l_k; const __m512d beta = _mm512_set1_pd( i_beta ); LIBXSMM_UNUSED(i_r); for( l_m = 0; l_m < i_m; l_m++ ) { for( l_n = 0; l_n < i_n; l_n++ ) { __m512d vc = (i_beta != 0.0) ? _mm512_mul_pd( _mm512_loadu_pd( &(o_c[l_m*i_ldC*8 + l_n*8 + 0]) ), beta ) : _mm512_setzero_pd(); _mm512_storeu_pd(&(o_c[l_m*i_ldC*8 + l_n*8 + 0]), vc); } } for( l_m = 0; l_m < i_m; l_m++ ) { for( l_n = 0; l_n < i_n; l_n++ ) { __m512d vc = _mm512_loadu_pd( &(o_c[l_m*i_ldC*8 + l_n*8 + 0]) ); for( l_k = 0; l_k < i_k; l_k++ ) { const __m512d alpha = _mm512_set1_pd( i_b[l_k*i_ldB + l_n] ); vc = _mm512_fmadd_pd( alpha, _mm512_loadu_pd( &(i_a[l_m*i_ldA*8 + l_k*8 + 0]) ), vc); } _mm512_storeu_pd( &(o_c[l_m*i_ldC*8 + l_n*8 + 0]), vc ); } } } LIBXSMM_INLINE void matMulFusedBC( unsigned short i_r, unsigned int i_m, unsigned int i_n, unsigned int i_k, unsigned int i_ldA, unsigned int i_ldB, unsigned int i_ldC, double i_beta, const double *i_a, const double *i_b, double *o_c ) { unsigned int l_m, l_n, l_k; const __m512d beta = _mm512_set1_pd( i_beta ); LIBXSMM_UNUSED(i_r); for( l_m = 0; l_m < i_m; l_m++ ) { for( l_n = 0; l_n < i_n; l_n++ ) { __m512d vc = (i_beta != 0.0) ? 
_mm512_mul_pd( _mm512_loadu_pd( &(o_c[l_m*i_ldC*8 + l_n*8 + 0]) ), beta ) : _mm512_setzero_pd(); _mm512_storeu_pd(&(o_c[l_m*i_ldC*8 + l_n*8 + 0]), vc); } } for( l_m = 0; l_m < i_m; l_m++ ) { for( l_n = 0; l_n < i_n; l_n++ ) { __m512d vc = _mm512_loadu_pd( &(o_c[l_m*i_ldC*8 + l_n*8 + 0]) ); for( l_k = 0; l_k < i_k; l_k++ ) { const __m512d alpha = _mm512_set1_pd( i_a[l_m*i_ldA + l_k] ); vc = _mm512_fmadd_pd( alpha, _mm512_loadu_pd( &(i_b[l_k*i_ldB*8 + l_n*8 + 0]) ), vc); } _mm512_storeu_pd( &(o_c[l_m*i_ldC*8 + l_n*8 + 0]), vc ); } } } #endif LIBXSMM_INLINE void amok_detect( const double* i_runtimes, size_t* io_amoks, const size_t i_workers ) { double time_avg; size_t i; time_avg = 0.0; for (i = 0; i < i_workers; i++) { if ( io_amoks[8*i] == 0 ) { time_avg += i_runtimes[8*i]; } } time_avg = time_avg/((double)(i_workers-io_amoks[8*i_workers])); /* let detect amoks */ for (i = 0; i < i_workers; i++) { if ( io_amoks[8*i] == 0 ) { if ( i_runtimes[8*i] > time_avg*1.07 ) { /* this is the amok condition */ io_amoks[8*i_workers]++; io_amoks[8*i] = 1; } } } } LIBXSMM_INLINE void amok_balance( const size_t* i_amoks, const size_t i_workers, const size_t i_worksize, const size_t i_mytid, size_t* io_chunk, size_t* io_mystart, size_t* io_myend ) { size_t l_chunk, l_start, l_end; size_t l_cur_amoks = i_amoks[8*i_workers]; size_t l_non_amoks = i_workers - l_cur_amoks; l_chunk = (i_worksize % l_non_amoks == 0) ? (i_worksize / l_non_amoks) : ((i_worksize / l_non_amoks) + 1); if (i_amoks[8*i_mytid] != 0) { l_start = 0; l_end = 0; } else { size_t l_tid_offset = 0; size_t l_z; for ( l_z = 0; l_z < i_mytid; l_z++) { if ( i_amoks[8*l_z] != 0 ) { l_tid_offset++; } } l_tid_offset = i_mytid - l_tid_offset; l_start = (l_tid_offset * l_chunk < i_worksize) ? (l_tid_offset * l_chunk) : i_worksize; l_end = ((l_tid_offset+1) * l_chunk < i_worksize) ? 
((l_tid_offset+1) * l_chunk) : i_worksize; } *io_chunk = l_chunk; *io_mystart = l_start; *io_myend = l_end; } int main(int argc, char* argv[]) { char* mat_a = 0; unsigned int *mat_a_rowptr, *mat_a_colidx; unsigned int mat_a_rowcount, mat_a_colcount, mat_a_nnz; double* mat_a_values; libxsmm_dmmfunction a_kernel; char* mat_b = 0; unsigned int *mat_b_rowptr, *mat_b_colidx; unsigned int mat_b_rowcount, mat_b_colcount, mat_b_nnz; double* mat_b_values; libxsmm_dmmfunction b_kernel; char* mat_c = 0; unsigned int *mat_c_rowptr, *mat_c_colidx; unsigned int mat_c_rowcount, mat_c_colcount, mat_c_nnz; double* mat_c_values; libxsmm_dmmfunction c_kernel; char* mat_st = 0; unsigned int *mat_st_rowptr, *mat_st_colidx; unsigned int mat_st_rowcount, mat_st_colcount, mat_st_nnz; double* mat_st_values; libxsmm_dmmfunction st_kernel; int num_modes = 9; int num_quants = 9; size_t num_elems = 0; size_t num_cfr = 8; size_t num_reps = 1; size_t elem_size; /* OpenMP: signed induction variables */ int i, j; const libxsmm_gemm_descriptor *l_xgemm_desc_stiff = 0, *l_xgemm_desc_star = 0; libxsmm_descriptor_blob l_xgemm_blob_stiff, l_xgemm_blob_star; const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); const double alpha = 1, beta = 1; double flops_vol; double* q; double* qt; double* qs; double* star; double* global; unsigned long long l_start, l_end; double l_total; unsigned int l_num_threads; unsigned int l_star_ent = num_quants*num_quants; double* l_total_thread; double* l_cur_thread_time; double time_max; double time_min; double time_avg; size_t* amoks; /* read cmd */ if ((argc > 1 && !strncmp(argv[1], "-h", 3)) || (argc != 8)) { printf("Usage: %s stif1 stif2 stif3 star nModes nElems nReps\n", argv[0]); return 0; } libxsmm_rng_set_seed(1); /* some empty lines at the beginning */ printf("\n"); i = 1; if (argc > (int)i) mat_a = argv[i++]; if (argc > (int)i) mat_b = argv[i++]; if (argc > (int)i) mat_c = argv[i++]; if (argc > 
(int)i) mat_st = argv[i++]; if (argc > (int)i) num_modes = atoi(argv[i++]); if (argc > (int)i) num_elems = atoi(argv[i++]); if (argc > (int)i) num_reps = atoi(argv[i++]); elem_size = num_modes*num_quants*num_cfr; #if defined(_OPENMP) #pragma omp parallel { #pragma omp master { l_num_threads = omp_get_num_threads(); } } #else l_num_threads = 1; #endif l_total_thread = (double*)malloc(8*l_num_threads*sizeof(double)); l_cur_thread_time = (double*)malloc(8*l_num_threads*sizeof(double)); amoks = (size_t*)malloc(8*(l_num_threads+1)*sizeof(size_t)); for ( i = 0; i < 8*((int)l_num_threads+1); i++ ) { amoks[i] = 0; } /* read matrices */ printf("reading sparse matrices... "); edge_sparse_csr_reader_double( mat_a, &mat_a_rowptr, &mat_a_colidx, &mat_a_values, &mat_a_rowcount, &mat_a_colcount, &mat_a_nnz ); edge_sparse_csr_reader_double( mat_b, &mat_b_rowptr, &mat_b_colidx, &mat_b_values, &mat_b_rowcount, &mat_b_colcount, &mat_b_nnz ); edge_sparse_csr_reader_double( mat_c, &mat_c_rowptr, &mat_c_colidx, &mat_c_values, &mat_c_rowcount, &mat_c_colcount, &mat_c_nnz ); edge_sparse_csr_reader_double( mat_st, &mat_st_rowptr, &mat_st_colidx, &mat_st_values, &mat_st_rowcount, &mat_st_colcount, &mat_st_nnz ); printf("done!\n\n"); /* generate kernels */ printf("generating code... 
"); l_xgemm_desc_stiff = libxsmm_dgemm_descriptor_init(&l_xgemm_blob_stiff, num_quants, num_modes, num_modes, num_modes, 0, num_modes, alpha, beta, flags, prefetch); l_xgemm_desc_star = libxsmm_dgemm_descriptor_init(&l_xgemm_blob_star, num_quants, num_modes, num_quants, 0, num_modes, num_modes, alpha, beta, flags, prefetch); a_kernel = libxsmm_create_xcsr_soa( l_xgemm_desc_stiff, mat_a_rowptr, mat_a_colidx, (const void*)mat_a_values, (unsigned int)num_cfr ).dmm; b_kernel = libxsmm_create_xcsr_soa( l_xgemm_desc_stiff, mat_b_rowptr, mat_b_colidx, (const void*)mat_b_values, (unsigned int)num_cfr ).dmm; c_kernel = libxsmm_create_xcsr_soa( l_xgemm_desc_stiff, mat_c_rowptr, mat_c_colidx, (const void*)mat_c_values, (unsigned int)num_cfr ).dmm; st_kernel = libxsmm_create_xcsr_soa( l_xgemm_desc_star, mat_st_rowptr, mat_st_colidx, (const void*)mat_st_values, (unsigned int)num_cfr ).dmm; if ( a_kernel == 0 ) { printf("a kernel could not be built -> exit!"); exit(-1); } if ( b_kernel == 0 ) { printf("b kernel could not be built -> exit!"); exit(-1); } if ( b_kernel == 0 ) { printf("c kernel could not be built -> exit!"); exit(-1); } if ( st_kernel == 0 ) { printf("st kernel could not be built -> exit!"); exit(-1); } printf("done!\n\n"); /* copying code to 1 GB page */ #if 0 #if defined(EDGE_HP_1G) || defined(EDGE_HP_2M) printf("copying code to 1GB page...\n"); onegcode = (void*)edge_hp_malloc( 5*1024*1024, 2097152 ); memcpy( onegcode, (void*) a_kernel, 1505 ); memcpy( onegcode+(1*1024*1024)+64, (void*) b_kernel, 2892 ); memcpy( onegcode+(2*1024*1024)+128, (void*) c_kernel, 3249 ); memcpy( onegcode+(3*1024*1024)+196, (void*)st_kernel, 11010 ); a_kernel = (libxsmm_dmmfunction)onegcode; b_kernel = (libxsmm_dmmfunction)(onegcode+(1*1024*1024)+64); c_kernel = (libxsmm_dmmfunction)(onegcode+(2*1024*1024)+128); st_kernel = (libxsmm_dmmfunction)(onegcode+(3*1024*1024)+196); printf("...done\n\n"); #endif #endif /* create unknowns and t-unknowns */ printf("allocating and initializing 
fake data... \n"); /* DoFs */ printf(" q: %f MiB\n", ((double)(num_elems*num_modes*num_quants*num_cfr*sizeof(double))) / ( 1024.0*1024.0) ); q = (double*)edge_hp_malloc( num_elems*num_modes*num_quants*num_cfr*sizeof(double), 2097152); /* tDofs */ printf(" qt: %f MiB\n", ((double)(num_elems*num_modes*num_quants*num_cfr*sizeof(double))) / ( 1024.0*1024.0) ); qt = (double*)edge_hp_malloc( num_elems*num_modes*num_quants*num_cfr*sizeof(double), 2097152); /* star matrices */ printf(" star: %f MiB\n", ((double)(num_elems*3*l_star_ent*sizeof(double))) / ( 1024.0*1024.0 ) ); star = (double*)edge_hp_malloc( num_elems*3*l_star_ent*sizeof(double), 2097152); /* stiffness matrices */ printf("global: %f MiB\n", ((double)(3*num_modes*num_modes*sizeof(double))) / ( 1024.0*1024 ) ); global = (double*)edge_hp_malloc( 3*num_modes*num_modes*sizeof(double), 2097152); /* per thread scratch */ printf(" t: %f MiB\n", ((double)(l_num_threads*num_modes*num_quants*num_cfr*sizeof(double)))/ ( 1024.0*1024.0) ); qs = (double*)edge_hp_malloc( l_num_threads*num_modes*num_quants*num_cfr*sizeof(double), 2097152); for (i = 0; i < (int)num_elems; i++) { for (j = 0; j < (int)elem_size; j++) { q[i*elem_size + j] = libxsmm_rng_f64(); } } for (i = 0; i < (int)num_elems; i++) { for (j = 0; j < (int)elem_size; j++) { qt[i*elem_size + j] = libxsmm_rng_f64(); } } for (i = 0; i < (int)l_num_threads; i++) { for (j = 0; j < (int)elem_size; j++) { qs[i*elem_size + j] = libxsmm_rng_f64(); } } for (i = 0; i < (int)num_elems; i++) { for (j = 0; j < (int)mat_st_nnz*3; j++) { star[(i*3*mat_st_nnz)+j] = libxsmm_rng_f64(); } } for (i = 0; i < 3; i++) { for (j = 0; j < num_modes*num_modes; j++) { global[(i*num_modes*num_modes)+j] = libxsmm_rng_f64(); } } printf("allocation done!\n\n"); printf("running benchmark...\n"); l_start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel private(i, j) #endif { #if defined(_OPENMP) int mytid = omp_get_thread_num(); #else int mytid = 0; #endif libxsmm_timer_tickint 
mystart, myend; #if defined(HANDLE_AMOK) size_t cur_amoks = 0; size_t non_amoks = l_num_threads; #endif size_t l_el_chunk = 0; size_t l_el_start = 0; size_t l_el_end = 0; /* initial work distribution */ amok_balance( amoks, l_num_threads, num_elems, mytid, &l_el_chunk, &l_el_start, &l_el_end ); for (i = 0; i < (int)num_reps; i++) { #if defined(HANDLE_AMOK) /* did we had an amok? */ if (cur_amoks != amoks[8*l_num_threads]) { cur_amoks = amoks[8*l_num_threads]; non_amoks = l_num_threads - cur_amoks; /* re-balance work */ amok_balance( amoks, l_num_threads, num_elems, mytid, &l_el_chunk, &l_el_start, &l_el_end ); } #endif mystart = libxsmm_timer_tick(); for (j = (int)l_el_start; j < (int)l_el_end; j++) { #if 1 st_kernel( star+(j*3*mat_st_nnz) , qt+(j*elem_size), qs+(mytid*elem_size) ); a_kernel( qs+(mytid*elem_size), global , q+(j*elem_size) ); st_kernel( star+(j*3*mat_st_nnz)+mat_st_nnz , qt+(j*elem_size), qs+(mytid*elem_size) ); b_kernel( qs+(mytid*elem_size), global+(num_modes*num_modes) , q+(j*elem_size) ); st_kernel( star+(j*3*mat_st_nnz)+(2*mat_st_nnz), qt+(j*elem_size), qs+(mytid*elem_size) ); c_kernel( qs+(mytid*elem_size), global+(2*num_modes*num_modes), q+(j*elem_size) ); #else matMulFusedBC( 8, num_quants, num_modes, num_quants, num_quants, num_modes, num_modes, 1.0, star+(j*3*mat_st_nnz), qt+(j*elem_size), qs+(mytid*elem_size) ); matMulFusedAC( 8, num_quants, num_modes, num_modes, num_modes, num_modes, num_modes, 1.0, qs+(mytid*elem_size), global, q+(j*elem_size) ); matMulFusedBC( 8, num_quants, num_modes, num_quants, num_quants, num_modes, num_modes, 1.0, star+(j*3*mat_st_nnz)+mat_st_nnz, qt+(j*elem_size), qs+(mytid*elem_size) ); matMulFusedAC( 8, num_quants, num_modes, num_modes, num_modes, num_modes, num_modes, 1.0, qs+(mytid*elem_size), global+(num_modes*num_modes) , q+(j*elem_size) ); matMulFusedBC( 8, num_quants, num_modes, num_quants, num_quants, num_modes, num_modes, 1.0, star+(j*3*mat_st_nnz)+(2*mat_st_nnz), qt+(j*elem_size), qs+(mytid*elem_size) 
); matMulFusedAC( 8, num_quants, num_modes, num_modes, num_modes, num_modes, num_modes, 1.0, qs+(mytid*elem_size), global+(2*num_modes*num_modes), q+(j*elem_size) ); #endif } myend = libxsmm_timer_tick(); l_cur_thread_time[8*mytid] = libxsmm_timer_duration( mystart, myend ); l_total_thread[8*mytid] += libxsmm_timer_duration( mystart, myend ); #if defined(_OPENMP) #pragma omp barrier #endif #if defined(HANDLE_AMOK) /* checking for amoks is centralized business */ if (mytid == 0) { /* amok check */ amok_detect( l_cur_thread_time, amoks, l_num_threads ); } #if defined(_OPENMP) #pragma omp barrier #endif #endif } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("...done!\n\n"); /* some timing stats */ time_max = 0.0; time_min = 80000000; time_avg = 0.0; for (i = 0; i < (int)l_num_threads; i++) { if( amoks[8*i] == 0 ) { if( l_total_thread[8*i] > time_max) time_max = l_total_thread[8*i]; if( l_total_thread[8*i] < time_min) time_min = l_total_thread[8*i]; time_avg += l_total_thread[8*i]; } } time_avg = time_avg/((double)(l_num_threads-amoks[8*l_num_threads])); flops_vol = (double)num_quants * (double)mat_a_nnz * (double)num_cfr * 2.0; flops_vol += (double)num_quants * (double)mat_b_nnz * (double)num_cfr * 2.0; flops_vol += (double)num_quants * (double)mat_c_nnz * (double)num_cfr * 2.0; flops_vol += (double)num_modes * (double)mat_st_nnz * (double)num_cfr * 6.0; /* 3 star matrix mul */ printf("%fs time for vol (asm), min %f, max %f, avg %f, #amoks %llu, amok-threads ", l_total, time_min, time_max, time_avg, (unsigned long long)amoks[8*l_num_threads]); for ( i = 0; i < (int)l_num_threads; i++ ) { if ( amoks[8*i] != 0 ) { printf("%i,", i); } } printf("\n"); printf("%f GFLOPS for vol (asm)\n", ((double)num_elems * (double)num_reps * flops_vol) / (l_total * 1.0e9)); printf("%f GiB/s for vol (asm)\n", (double)((double)num_elems * (double)elem_size * 8.0 * 3.0 * (double)num_reps) / (l_total * 1024.0*1024.0*1024.0) ); printf("done!\n\n"); 
/* some empty lines at the end */ printf("\n\n"); return 0; } libxsmm-1.17/samples/edge/edge_vol_int.vcxproj000066400000000000000000000542351415223013700215470ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 edge_vol_int {2432BA35-1862-4AF1-831C-6B2A7A079C84} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) 
libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/edge/test_dense_packedacrm.sh000077500000000000000000000023101415223013700223250ustar00rootroot00000000000000#!/usr/bin/env bash # Arguments M N K beta reps # l_r is fixed to 16, when we FP32 # l_r is fixed to 8, when we FP64 ITERS=10 # scatter, elemnet echo "scatter, element, f32" ./dense_packedacrm_f32 9 729 35 0.0 ${ITERS} ./dense_packedacrm_f32 9 729 35 1.0 ${ITERS} # scatter, surface echo "scatter, surface, f32" ./dense_packedacrm_f32 9 81 35 0.0 ${ITERS} ./dense_packedacrm_f32 9 81 35 1.0 ${ITERS} # gather, element echo "gather, element, f32" ./dense_packedacrm_f32 9 35 729 0.0 ${ITERS} ./dense_packedacrm_f32 9 35 729 1.0 ${ITERS} # gather, surface echo "gather, surface, f32" ./dense_packedacrm_f32 9 35 81 0.0 ${ITERS} ./dense_packedacrm_f32 9 35 81 
1.0 ${ITERS} # scatter, elemnet echo "scatter, element, f64" ./dense_packedacrm_f64 9 729 35 0.0 ${ITERS} ./dense_packedacrm_f64 9 729 35 1.0 ${ITERS} # scatter, surface echo "scatter, surface, f64" ./dense_packedacrm_f64 9 81 35 0.0 ${ITERS} ./dense_packedacrm_f64 9 81 35 1.0 ${ITERS} # gather, element echo "gather, element, f64" ./dense_packedacrm_f64 9 35 729 0.0 ${ITERS} ./dense_packedacrm_f64 9 35 729 1.0 ${ITERS} # gather, surface echo "gather, surface, f64" ./dense_packedacrm_f64 9 35 81 0.0 ${ITERS} ./dense_packedacrm_f64 9 35 81 1.0 ${ITERS} libxsmm-1.17/samples/edge/test_dense_packedbcrm.sh000077500000000000000000000023101415223013700223260ustar00rootroot00000000000000#!/usr/bin/env bash # Arguments M N K beta reps # l_r is fixed to 16, when we FP32 # l_r is fixed to 8, when we FP64 ITERS=10 # scatter, elemnet echo "scatter, element, f32" ./dense_packedbcrm_f32 9 729 35 0.0 ${ITERS} ./dense_packedbcrm_f32 9 729 35 1.0 ${ITERS} # scatter, surface echo "scatter, surface, f32" ./dense_packedbcrm_f32 9 81 35 0.0 ${ITERS} ./dense_packedbcrm_f32 9 81 35 1.0 ${ITERS} # gather, element echo "gather, element, f32" ./dense_packedbcrm_f32 9 35 729 0.0 ${ITERS} ./dense_packedbcrm_f32 9 35 729 1.0 ${ITERS} # gather, surface echo "gather, surface, f32" ./dense_packedbcrm_f32 9 35 81 0.0 ${ITERS} ./dense_packedbcrm_f32 9 35 81 1.0 ${ITERS} # scatter, elemnet echo "scatter, element, f64" ./dense_packedbcrm_f64 9 729 35 0.0 ${ITERS} ./dense_packedbcrm_f64 9 729 35 1.0 ${ITERS} # scatter, surface echo "scatter, surface, f64" ./dense_packedbcrm_f64 9 81 35 0.0 ${ITERS} ./dense_packedbcrm_f64 9 81 35 1.0 ${ITERS} # gather, element echo "gather, element, f64" ./dense_packedbcrm_f64 9 35 729 0.0 ${ITERS} ./dense_packedbcrm_f64 9 35 729 1.0 ${ITERS} # gather, surface echo "gather, surface, f64" ./dense_packedbcrm_f64 9 35 81 0.0 ${ITERS} ./dense_packedbcrm_f64 9 35 81 1.0 ${ITERS} 
libxsmm-1.17/samples/edge/test_matops.sh000077500000000000000000000071011415223013700203630ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # sde can be downloaded here SDE64_BIN=/swtools/sde/kits/latest/sde64 SDE64_ARCH="-skx" SDE64_FLAGS="-ptr_check -null_check -ptr_raise" SDE=${SDE64_BIN}" "${SDE64_FLAGS}" "${SDE64_ARCH}" -- " GREP=$(command -v grep) # iterations, order, precision, arch and format if [ $# -eq 5 ] then REPS=$1 PDEG=$2 PREC=$3 VLEN=$4 FRMT=$5 else REPS=1000 PDEG=5 PREC=f64 VLEN=64 FRMT=csc fi if [[ $PDEG == "1" ]] then K=4 N=3 elif [[ $PDEG == "2" ]] then K=10 N=6 elif [[ $PDEG == "3" ]] then K=20 N=10 elif [[ $PDEG == "4" ]] then K=35 N=15 elif [[ $PDEG == "5" ]] then K=56 N=21 elif [[ $PDEG == "6" ]] then K=84 N=28 else echo "PDEG need to be in the range of 1 to 6" return -1 fi if [[ $PREC == "f32" ]] then if [[ $VLEN == "32" ]] then CRUN=8 elif [[ $VLEN == "64" ]] then CRUN=16 elif [[ $VLEN == "16" ]] then CRUN=4 else echo "VLEN need to be either 16/32/64" return -3 fi elif [[ $PREC == "f64" ]] then if [[ $VLEN == "32" ]] then CRUN=4 elif [[ $VLEN == "64" ]] then CRUN=8 elif [[ $VLEN == "16" ]] then CRUN=2 else echo "VLEN need to be either 16/32/64" return -3 fi else echo "PREC needs to be either f32/f64" return -2 fi if [[ $VLEN == "64" ]] then #on an AVX512 platform we can run natively CPUFLAGS=$(if [ "${GREP}" ] && [ -e /proc/cpuinfo ]; then ${GREP} -m1 flags /proc/cpuinfo | cut -d: -f2-; fi) if [ "$(echo "${CPUFLAGS}" | ${GREP} -o avx512f)" ]; then SDE= fi if [ "$(echo "${CPUFLAGS}" | ${GREP} -o asimd)" ]; then SDE= fi 
else SDE= fi # number of quantities is always 9 M=9 if [[ $FRMT == "csr" ]] then # test flux matrices, CSR for i in `ls mats/tet4_${PDEG}_fluxN*_csr.mtx`; do ${SDE} ./bsparse_packed_csr_${PREC} ${M} ${N} ${K} ${CRUN} ${REPS} $i; done for i in `ls mats/tet4_${PDEG}_fluxT*_csr.mtx`; do ${SDE} ./bsparse_packed_csr_${PREC} ${M} ${K} ${N} ${CRUN} ${REPS} $i; done # test stiffness matrices, CSR for i in `ls mats/tet4_${PDEG}_stiff*_csr.mtx`; do ${SDE} ./bsparse_packed_csr_${PREC} ${M} ${K} ${K} ${CRUN} ${REPS} $i; done elif [[ $FRMT == "csc" ]] then # test flux matrices, CSC for i in `ls mats/tet4_${PDEG}_fluxN*_csc.mtx`; do ${SDE} ./bsparse_packed_csc_${PREC} ${M} ${N} ${K} ${CRUN} ${REPS} $i; done for i in `ls mats/tet4_${PDEG}_fluxT*_csc.mtx`; do ${SDE} ./bsparse_packed_csc_${PREC} ${M} ${K} ${N} ${CRUN} ${REPS} $i; done # test stiffness matrices, CSC for i in `ls mats/tet4_${PDEG}_stiff*_csc.mtx`; do ${SDE} ./bsparse_packed_csc_${PREC} ${M} ${K} ${K} ${CRUN} ${REPS} $i; done else echo "FRMT need to be either csr/csc" return -4 fi # test star matrices ${SDE} ./asparse_packed_csr_${PREC} ${M} ${K} ${M} ${CRUN} ${REPS} mats/tet4_starMatrix_csr.mtx # test flux matrices ${SDE} ./asparse_packed_csr_${PREC} ${M} ${K} ${M} ${CRUN} ${REPS} mats/tet4_fluxMatrix_csr_sp.mtx ${SDE} ./asparse_packed_csr_${PREC} ${M} ${K} ${M} ${CRUN} ${REPS} mats/tet4_fluxMatrix_csr_de.mtx libxsmm-1.17/samples/eigen/000077500000000000000000000000001415223013700156465ustar00rootroot00000000000000libxsmm-1.17/samples/eigen/Makefile000066400000000000000000000122271415223013700173120ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . 
CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = -DLIBXSMM_BLAS_CONST BLAS = 1 OMP = 1 SYM = 1 # explore AVX/ARCH=native SSE = 0 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME)_smm $(OUTDIR)/$(OUTNAME)_tensor ifneq (0,$(EIGEN)) ifeq (,$(strip $(EIGENROOT))) ifneq (,$(wildcard $(DEPDIR)/../eigen*/Eigen/Dense)) EIGENROOT = $(lastword $(sort $(wildcard 
$(DEPDIR)/../eigen*))) else ifneq (,$(wildcard $(HOME)/eigen*/Eigen/Dense)) EIGENROOT = $(lastword $(sort $(wildcard $(HOME)/eigen*))) else ifneq (,$(wildcard /usr/include/eigen3/Eigen/Dense)) EIGENROOT = /usr/include/eigen3 else ifneq (,$(wildcard /usr/local/opt/eigen/include/eigen3/Eigen/Dense)) EIGENROOT = /usr/local/opt/eigen/include/eigen3 endif endif endif ifneq (,$(EIGENROOT)) DFLAGS += -D__EIGEN IFLAGS += -I$(call quote,$(EIGENROOT)) EIGEN ?= 1 ifneq (0,$(MKL)) DFLAGS += -DEIGEN_USE_MKL_ALL endif ifneq (0,$(shell echo "$$((1 < $(BLAS) || 1 < $(EIGEN)))")) DFLAGS += -DEIGEN_USE_THREADS else DFLAGS += -DEIGEN_DONT_PARALLELIZE endif ifneq (,$(wildcard $(EIGENROOT)/unsupported/Eigen/CXX11/ThreadPool)) DFLAGS += -D__EIGEN_UNSUPPORTED endif else EIGEN := 0 endif .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) # no need to link against any of LIBXSMM's libraries since the sample code uses header-only LIBXSMM $(OUTDIR)/$(OUTNAME)_smm: $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)_smm-cpp.o $(DEPDIR)/include/libxsmm_source.h $(LD) -o $@ $(BLDDIR)/$(OUTNAME)_smm-cpp.o $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) # no need to link against any of LIBXSMM's libraries since the sample code uses header-only LIBXSMM $(OUTDIR)/$(OUTNAME)_tensor: $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)_tensor-cpp.o $(DEPDIR)/include/libxsmm_source.h $(LD) -o $@ $(BLDDIR)/$(OUTNAME)_tensor-cpp.o $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc 
$(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/eigen/eigen_smm-cp2k.plt000066400000000000000000000301251415223013700211700ustar00rootroot00000000000000MPARM = 1 NPARM = 2 KPARM = 3 FLOPS = 5 HIM = -1 HIN = HIM HIK = HIM BASENAME = "eigen_smm" FILENAME = system("sh -c \"echo ${FILENAME}\"") if (FILENAME eq "") { FILENAME = BASENAME."-cp2k.pdf" } FILECOUNT = 1 # initial file number # MULTI =-1: multiple files; no titles # MULTI = 0: multiple files with titles # MULTI = 1: single file with titles MULTI = system("sh -c \"echo ${MULTI}\"") if (MULTI eq "") { MULTI = 1 } XFLOPS(M, N, K) = 2.0 * M * N * K NFLOPS(M, N, K) = XFLOPS(column(M), column(N), column(K)) NBYTES(M, N, K, ELEMSIZE) = ELEMSIZE * (column(M) * column(K) + column(K) * column(N) + column(M) * column(N)) AI(M, N, K, ELEMSIZE) = NFLOPS(M, N, K) / NBYTES(M, N, K, ELEMSIZE) TIME(M, N, K, F) = NFLOPS(M, N, K) * 1E-9 / column(F) BW(M, N, K, F, ELEMSIZE) = (column(M) * column(K) + column(K) * column(N)) * 
ELEMSIZE / (TIME(M, N, K, F) * 1024 * 1024 * 1024) stats BASENAME."-cp2k.dat" using (column(MPARM)*column(NPARM)*column(KPARM)) nooutput; MNK = STATS_stddev**(1.0/3.0); MAXMNK = int(STATS_max) stats BASENAME."-cp2k.dat" using (log(column(FLOPS))) nooutput; NSAMPLES = STATS_records; GEOFLOPS = exp(STATS_sum/STATS_records) stats BASENAME."-cp2k.dat" using FLOPS nooutput; MEDFLOPS = STATS_median; AVGFLOPS = STATS_mean; MINFLOPS = STATS_min; MAXFLOPS = STATS_max stats BASENAME."-cp2k.dat" using NPARM nooutput; XN = int(STATS_max) stats BASENAME."-cp2k.dat" using ((NFLOPS(MPARM,NPARM,KPARM)<=XFLOPS(13,13,13))?column(FLOPS):1/0) nooutput; BIN1_FLOPS = STATS_mean; BIN1_NSAMPLES = STATS_records stats BASENAME."-cp2k.dat" using (((XFLOPS(13,13,13)-1) { set title "Performance (Selected Kernels)" } set origin -0.03, 0 set pm3d interpolate 0, 0 #set colorbox horizontal user origin 0, 0.1 size 1, 0.1 #set autoscale fix if (0HIM) { set xrange [*:MNK] } if (0>HIN) { set yrange [*:MNK] } if (0>HIK) { set zrange [*:MNK] } set xlabel "M" set ylabel "N" offset -3.0 set zlabel "K" offset 1.0 set ticslevel 0 set cblabel "GFLOP/s" offset 1.5 set format x "%g"; set format y "%g"; set format z "%g"; set format cb "%g" splot BASENAME."-cp2k.dat" using MPARM:NPARM:KPARM:FLOPS notitle with points pointtype 7 linetype palette reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Performance (K-Average for ".sprintf("%u Kernels", NSAMPLES).")" } set origin -0.02, 0 set dgrid3d #9, 9 set pm3d interpolate 0, 0 map set autoscale fix set xlabel "M" set ylabel "N" offset -1.5 set cblabel "GFLOP/s" offset 0.5 set format x "%g"; set format y "%g"; set format cb "%g" set mxtics 2 splot BASENAME."-plot-avg.dat" using (("".strcol(3)."" eq "i")?(I1($1, XN)):(1/0)):(("".strcol(3)."" eq "i")?(J1($1, XN)):(1/0)):2 notitle with pm3d reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title 
"Performance (Average per Bin)" } set style fill solid 0.4 noborder set boxwidth 0.5 set grid y2tics linecolor "grey" unset key unset xtics set xtics ("MNK <= 13^3" 0, "13^3 < MNK <= 23^3" 1, "23^3 < MNK" 2) scale 0 offset 0, 0.2 set x2tics ("Small" 0, "Medium" 1, "Larger" 2) scale 0 set xlabel "Problem Size (MNK)" set ytics format "" set y2tics nomirror set y2label "GFLOP/s" set xrange [-0.5:2.5] set yrange [0:*] set autoscale fix set label sprintf("{/=9 ".FORMAT(BIN1_FLOPS)." GFLOP/s}", BIN1_FLOPS) at 0.0, BIN1_FLOPS centre offset 0, -1 front set label sprintf("{/=9 ".FORMAT(BIN2_FLOPS)." GFLOP/s}", BIN2_FLOPS) at 1.0, BIN2_FLOPS centre offset 0, -1 front set label sprintf("{/=9 ".FORMAT(BIN3_FLOPS)." GFLOP/s}", BIN3_FLOPS) at 2.0, BIN3_FLOPS centre offset 0, -1 front set label sprintf("{/=9 (".FORMAT(BIN1_MEMBW)." GB/s)}", BIN1_MEMBW) at 0.0, BIN1_FLOPS centre offset 0, -2 front set label sprintf("{/=9 (".FORMAT(BIN2_MEMBW)." GB/s)}", BIN2_MEMBW) at 1.0, BIN2_FLOPS centre offset 0, -2 front set label sprintf("{/=9 (".FORMAT(BIN3_MEMBW)." GB/s)}", BIN3_MEMBW) at 2.0, BIN3_FLOPS centre offset 0, -2 front set label sprintf("{/=9 N=%u}", BIN1_NSAMPLES) at 0.0, 0.0 centre offset 0, 0.5 front set label sprintf("{/=9 N=%u}", BIN2_NSAMPLES) at 1.0, 0.0 centre offset 0, 0.5 front set label sprintf("{/=9 N=%u}", BIN3_NSAMPLES) at 2.0, 0.0 centre offset 0, 0.5 front plot BASENAME."-cp2k.dat" \ using (0.0):(BIN1_FLOPS) notitle smooth unique with boxes linetype 1 linecolor "grey", \ "" using (1.0):(BIN2_FLOPS) notitle smooth unique with boxes linetype 1 linecolor "grey", \ "" using (2.0):(BIN3_FLOPS) notitle smooth unique with boxes linetype 1 linecolor "grey" reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Cummulative Performance Distribution (CDF for ".sprintf("%u Kernels", NSAMPLES).")" } set xlabel "Probability\n\n{/=9 Min.: ".sprintf(FORMAT(MINFLOPS), MINFLOPS)." 
GFLOP/s Geo.: ".sprintf(FORMAT(GEOFLOPS), GEOFLOPS)." GFLOP/s Med.: ".sprintf(FORMAT(MEDFLOPS), MEDFLOPS)." GFLOP/s Avg.: ".sprintf(FORMAT(AVGFLOPS), AVGFLOPS)." GFLOP/s Max.: ".sprintf(FORMAT(MAXFLOPS), MAXFLOPS)." GFLOP/s}" set ylabel "GB/s" set y2label "GFLOP/s" set format x "%g%%" set format y "%g" set format y2 "%g" set ytics nomirror set y2tics nomirror set grid x y2 linecolor "grey" set xrange [0:100] set yrange [0:*] set y2range [0:*] set fit quiet f(x) = b * x + a fit f(x) BASENAME."-plot-cdf.dat" using (("".strcol(3)."" eq "i")?(100*$2/FREQSUM):(1/0)):1 via a, b g(x) = (x - a) / b x50 = 0.5 * (100 + MAX(0, g(0))) h(x) = d * x + c dx = 100.0 / FREQN fit [x50-2.0*dx:x50+2.0*dx] h(x) BASENAME."-plot-cdf.dat" using (("".strcol(3)."" eq "i")?(100*$2/FREQSUM):(1/0)):1 via c, d set arrow from x50, second h(x50) to x50, second 0 front set arrow from x50, second h(x50) to 100, second h(x50) front set label sprintf("%.0f%%", x50) at x50, second 0.5 * h(x50) left offset 1 front set label sprintf(FORMAT(h(x50))." GFLOP/s", h(x50)) at 0.5 * (x50 + 100.0), second h(x50) centre offset 0, -1 front set key left invert plot BASENAME."-plot-mbw.dat" using (("".strcol(3)."" eq "i")?(100*$2/FREQSUM):(1/0)):1 axes x1y1 title "Memory Bandwidth" with lines linecolor "grey", \ BASENAME."-plot-cdf.dat" using (("".strcol(3)."" eq "i")?(100*$2/FREQSUM):(1/0)):1 axes x1y2 title "Compute Performance" with lines linewidth 2 reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Arithmetic Intensity (".sprintf("%u Kernels", NSAMPLES).")" } set grid x y2 linecolor "grey" set key left #spacing 0.5 set ytics format "" set y2tics nomirror set y2label "GFLOP/s" #set xlabel "FLOPS/Byte\n\n{/=9 ".sprintf("N: %u", NSAMPLES)." Min.: ".sprintf("%.1f", MINAI)." Geo.: ".sprintf("%.1f", GEOAI)." Med.: ".sprintf("%.1f", MEDAI)." Avg.: ".sprintf("%.1f", AVGAI)." 
Max.: ".sprintf("%.1f", MAXAI)."}" set xlabel "FLOPS/Byte (Min.: ".sprintf("%.1f", MINAI)." Geo.: ".sprintf("%.1f", GEOAI)." Med.: ".sprintf("%.1f", MEDAI)." Avg.: ".sprintf("%.1f", AVGAI)." Max.: ".sprintf("%.1f", MAXAI).")" set yrange [0:*] set autoscale fix plot BASENAME."-cp2k.dat" using (AI(MPARM,NPARM,KPARM,8)):FLOPS notitle smooth sbezier with lines linecolor "grey" linewidth 2, \ "" using (AI(MPARM,NPARM,KPARM,8)):FLOPS notitle smooth unique with points pointtype 7 pointsize 0.1 reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Memory Bandwidth Consumption (".sprintf("%u Kernels", NSAMPLES).")" } set grid x y2 linecolor "grey" set key left #spacing 0.5 set ytics format "" set y2tics nomirror set y2label "GB/s" set xlabel "Problem Size (MNK^{1/3})\n\n{/=9 Min.: ".sprintf("%.0f GB/s", MINMEMBW)." Geo.: ".sprintf("%.0f GB/s", GEOMEMBW)." Med.: ".sprintf("%.0f GB/s", MEDMEMBW)." Avg.: ".sprintf("%.0f GB/s", AVGMEMBW)." Max.: ".sprintf("%.0f GB/s", MAXMEMBW)."}" set yrange [0:*] set autoscale fix plot BASENAME."-cp2k.dat" using ((column(MPARM)*column(NPARM)*column(KPARM))**(1.0/3.0)):(BW(MPARM,NPARM,KPARM,FLOPS,8)) notitle smooth sbezier with lines linecolor "grey" linewidth 2, \ "" using ((column(MPARM)*column(NPARM)*column(KPARM))**(1.0/3.0)):(BW(MPARM,NPARM,KPARM,FLOPS,8)) notitle with points pointtype 7 pointsize 0.1 reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Compute Consumption (".sprintf("%u Kernels", NSAMPLES).")" } set grid x y2 linecolor "grey" set key left #spacing 0.5 set ytics format "" set y2tics nomirror set y2label "GFLOP/s" set xlabel "Problem Size (MNK^{1/3})\n\n{/=9 Min.: ".sprintf(FORMAT(MINFLOPS), MINFLOPS)." GFLOP/s Geo.: ".sprintf(FORMAT(GEOFLOPS), GEOFLOPS)." GFLOP/s Med.: ".sprintf(FORMAT(MEDFLOPS), MEDFLOPS)." GFLOP/s Avg.: ".sprintf(FORMAT(AVGFLOPS), AVGFLOPS)." 
GFLOP/s Max.: ".sprintf(FORMAT(MAXFLOPS), MAXFLOPS)." GFLOP/s}" set yrange [0:*] set autoscale fix plot BASENAME."-cp2k.dat" using ((column(MPARM)*column(NPARM)*column(KPARM))**(1.0/3.0)):FLOPS notitle smooth sbezier with lines linecolor "grey" linewidth 2, \ "" using ((column(MPARM)*column(NPARM)*column(KPARM))**(1.0/3.0)):FLOPS notitle with points pointtype 7 pointsize 0.1 if (0!=system("sh -c \"if [ -e eigen_smm-plot-join.dat ]; then echo 1; else echo 0; fi\"")) { reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Performance (Selected Kernels)" } set style fill solid 0.4 border -1 set style data histograms set style histogram cluster #gap 2 #set boxwidth 0.5 relative set grid y2tics lc "grey" set key left #spacing 0.5 set xtics rotate by -45 scale 0; set bmargin 6 set ytics format "" set y2tics nomirror set y2label "GFLOP/s" set yrange [0:*] plot BASENAME."-plot-join.dat" using FLOPS:xtic("(".strcol(MPARM).",".strcol(NPARM).",".strcol(KPARM).")") notitle } libxsmm-1.17/samples/eigen/eigen_smm-cp2k.set000066400000000000000000000003331415223013700211620ustar00rootroot000000000000002 2 2 4 4 4 5 5 5 5 5 13 5 13 5 5 13 13 6 6 6 8 8 8 10 10 10 12 12 12 13 5 5 13 5 13 13 13 5 13 13 13 13 13 26 13 26 13 13 26 26 14 14 14 16 16 16 18 18 18 20 20 20 23 23 23 26 13 13 26 13 26 26 26 13 26 26 26 32 32 32 libxsmm-1.17/samples/eigen/eigen_smm-cp2k.sh000077500000000000000000000073161415223013700210140ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) SCRT=${HERE}/../../scripts/libxsmm_utilities.py FILE=eigen_smm-cp2k.txt RUNS0=$(${SCRT} -1 $((64*64*64-0)) 19 23, 6, 14 16 29, 5 16 13 24 26, 9 16 22, 32, 64, 78, 16 29 55 0 0) RUNS1=$(${SCRT} -1 $((64*64*64-0)) 19 23, 6, 14 16 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55 0 0) RUNS2=$(${SCRT} -1 $((64*64*64-0)) 20 23, 6, 14 16 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 12 0 0) RUNS3=$(${SCRT} -1 $((64*64*64-0)) 26 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 32 29 55, 12 0 0) RUNS4=$(${SCRT} -1 $((64*64*64-1)) 31 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 32 29 55, 12, 13 26 28 32 45 0 0) RUNS5=$(${SCRT} -1 $((64*64*64-0)) 31 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 32 29 55, 12, 13 26 28 32 45 0 0) RUNS6=$(${SCRT} -1 $((80*80*80-0)) 35 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 32 29 55, 12, 13 26 28 32 45, 7 13 25 32 0 0) RUNS7=$(${SCRT} -1 $((80*80*80-0)) 35 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45 0 0) RUNS8=$(${SCRT} -1 $((80*80*80-0)) 37 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45, 4 10 0 0) RUNS9=$(${SCRT} -1 $((80*80*80-0)) 38 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45, 4 10 15 0 0) RUNS10=$(${SCRT} -1 $((128*128*128)) 41 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45, 4 10 15, 6 7 8 0 0) RUNS11=$(${SCRT} -1 $((128*128*128)) 46 23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45, 4 10 15, 6 7 8, 13 14 25 26 32 0 0) CASE=6 if [ "$1" ]; then CASE=$1 shift fi case "$1" in "-"*) RUNS=RUNS${1:1}; shift ;; 
esac if [ -z "${RUNS}" ]; then RUNS=RUNS11 fi if [ "$1" ]; then SIZE=$1 shift else SIZE=0 fi # NREPEAT not defined if not given per CLI if [ "$1" ]; then NREPEAT=$1 fi cat /dev/null > ${FILE} NRUN=1 NMAX=$(echo ${!RUNS} | wc -w | tr -d " ") for RUN in ${!RUNS} ; do MVALUE=$(echo ${RUN} | cut --output-delimiter=' ' -d_ -f1) NVALUE=$(echo ${RUN} | cut --output-delimiter=' ' -d_ -f2) KVALUE=$(echo ${RUN} | cut --output-delimiter=' ' -d_ -f3) >&2 echo -n "${NRUN} of ${NMAX} (M=${MVALUE} N=${NVALUE} K=${KVALUE})... " ERROR=$({ ${HERE}/eigen_smm ${CASE} ${MVALUE} ${NVALUE} ${KVALUE} ${SIZE} ${NREPEAT} >> ${FILE}; } 2>&1) RESULT=$? if [ 0 != ${RESULT} ]; then echo "FAILED(${RESULT}) ${ERROR}" exit 1 else echo "OK ${ERROR}" fi echo >> ${FILE} NRUN=$((NRUN+1)) done libxsmm-1.17/samples/eigen/eigen_smm-plot.sh000077500000000000000000000074751415223013700211410ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### SORT=$(command -v sort) JOIN=$(command -v join) GREP=$(command -v grep) CAT=$(command -v cat) CUT=$(command -v cut) SED=$(command -v sed) AWK=$(command -v awk) RM=$(command -v rm) VARIANT="LIBXSMM streamed (A,B)" HERE=$(cd "$(dirname "$0")" && pwd -P) FILE=${HERE}/eigen_smm-cp2k.txt PERF=$(${GREP} -A2 "${VARIANT}" ${FILE} \ | ${GREP} -e "performance" \ | ${CUT} -d" " -f2 \ | ${SORT} -n) NUM=$(echo "${PERF}" | wc -l | tr -d " ") MIN=$(echo ${PERF} | ${CUT} -d" " -f1) MAX=$(echo ${PERF} | ${CUT} -d" " -f${NUM}) echo "num=${NUM}" echo "min=${MIN}" echo "max=${MAX}" BC=$(command -v bc) if [ "${BC}" ]; then AVG=$(echo "$(echo -n "scale=3;(${PERF})/${NUM}" | tr "\n" "+")" | ${BC}) NUM2=$((NUM / 2)) if [ "0" = "$((NUM % 2))" ]; then A=$(echo ${PERF} | ${CUT} -d" " -f${NUM2}) B=$(echo ${PERF} | ${CUT} -d" " -f$((NUM2 + 1))) MED=$(echo "$(echo -n "scale=3;(${A} + ${B})/2")" | ${BC}) else MED=$(echo ${PERF} | ${CUT} -d" " -f$((NUM2 + 1))) fi echo "avg=${AVG}" echo "med=${MED}" fi if [ -f /cygdrive/c/Program\ Files/gnuplot/bin/wgnuplot ]; then WGNUPLOT=/cygdrive/c/Program\ Files/gnuplot/bin/wgnuplot GNUPLOT=/cygdrive/c/Program\ Files/gnuplot/bin/gnuplot elif [ -f /cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/wgnuplot ]; then WGNUPLOT=/cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/wgnuplot GNUPLOT=/cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/gnuplot else GNUPLOT=$(command -v gnuplot) WGNUPLOT=${GNUPLOT} fi GNUPLOT_MAJOR=0 GNUPLOT_MINOR=0 if [ -f "${GNUPLOT}" ]; then GNUPLOT_MAJOR=$("${GNUPLOT}" --version | ${SED} "s/.\+ \([0-9]\).\([0-9]\) .*/\1/") GNUPLOT_MINOR=$("${GNUPLOT}" --version | ${SED} "s/.\+ \([0-9]\).\([0-9]\) .*/\2/") fi GNUPLOT_VERSION=$((GNUPLOT_MAJOR * 10000 + GNUPLOT_MINOR * 100)) if [ "40600" -le "${GNUPLOT_VERSION}" ]; then # determine behavior of sort command export LC_ALL=C.UTF-8 if [ "" = "$1" ]; then FILENAME=eigen_smm-cp2k.pdf else FILENAME=$1 shift fi if [ "" = "$1" 
]; then MULTI=1 else MULTI=$1 shift fi ${SED} \ -e "/^m=/,/${VARIANT}/{//!d}" \ -e "/${VARIANT}/d" \ -e "/\.\.\./,/Finished/{//!d}" \ -e "/Finished/d" \ -e "/\.\.\./d" \ -e "/^$/d" \ ${FILE} \ | ${SED} \ -e "s/m=//" -e "s/n=//" -e "s/k=//" -e "s/ (..*) / /" \ -e "s/size=//" \ -e "/duration:/d" \ | ${SED} \ -e "N;s/ memory=..*\n..*//" \ -e "N;s/\n\tperformance:\(..*\) GFLOPS\/s/\1/" \ -e "N;s/\n\tbandwidth:\(..*\) GB\/s/\1/" \ > "${HERE}/eigen_smm-cp2k.dat" if [ -f "${HERE}/eigen_smm-cp2k.set" ]; then ${JOIN} --nocheck-order \ <(${CUT} "${HERE}/eigen_smm-cp2k.set" -d" " -f1-3 | ${SORT} -nk1) \ <(${SORT} -nk1 "${HERE}/eigen_smm-cp2k.dat") \ | ${AWK} \ '{ if ($2==$4 && $3==$5) printf("%s %s %s %s %s %s\n", $1, $2, $3, $6, $7, $8) }' \ | ${SORT} \ -b -n -k1 -k2 -k3 \ > "${HERE}/eigen_smm-plot-join.dat" else ${RM} "${HERE}/eigen_smm-plot-join.dat" fi env \ GDFONTPATH=/cygdrive/c/Windows/Fonts \ FILENAME=${FILENAME} \ MULTI=${MULTI} \ "${WGNUPLOT}" eigen_smm-cp2k.plt fi libxsmm-1.17/samples/eigen/eigen_smm.cpp000066400000000000000000000475341415223013700203320ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ /** This sample uses LIBXSMM's header-only implementation. 
*/ #include #if !defined(__EIGEN) && 0 # define __EIGEN #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if defined(__EIGEN) # include #endif #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if 0 /* enable padding on a per-matrix basis */ # define PAD(TYPE, VALUE) (LIBXSMM_UP2((VALUE) * sizeof(TYPE), LIBXSMM_ALIGNMENT) / sizeof(TYPE)) #else # define PAD(TYPE, VALUE) (VALUE) #endif #if !defined(RANDOMIZED) && 0 # define RANDOMIZED #endif #if !defined(ITYPE) # define ITYPE double #endif #if !defined(OTYPE) # define OTYPE ITYPE #endif #if defined(__EIGEN) template LIBXSMM_INLINE LIBXSMM_RETARGETABLE void smm_eigen_dynamic(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const itype *LIBXSMM_RESTRICT a, const itype *LIBXSMM_RESTRICT b, otype *LIBXSMM_RESTRICT c) { typedef Eigen::Matrix matrix_itype; typedef Eigen::Matrix matrix_otype; matrix_otype::Map(c, m, n).noalias() = matrix_itype::Map(a, m, k) * matrix_itype::Map(b, k, n); } #endif /*defined(__EIGEN)*/ template LIBXSMM_INLINE LIBXSMM_RETARGETABLE void smm_xsmm_specialized(const libxsmm_mmfunction& xmm, const itype* a, const itype* b, otype* c, const itype* next_a, const itype* next_b, const otype* next_c) { #if (0 != LIBXSMM_PREFETCH) xmm(a, b, c, next_a, next_b, next_c); #else xmm(a, b, c); LIBXSMM_UNUSED(next_a); LIBXSMM_UNUSED(next_b); LIBXSMM_UNUSED(next_c); #endif } int main(int argc, char* argv[]) { int result = EXIT_SUCCESS; try { const libxsmm_blasint benchmark = (1 < argc ? std::atoi(argv[1]) : 0); const libxsmm_blasint m = (2 < argc ? std::atoi(argv[2]) : 23); const libxsmm_blasint k = (4 < argc ? std::atoi(argv[4]) : m); const libxsmm_blasint n = (3 < argc ? std::atoi(argv[3]) : k); const libxsmm_blasint q = (5 < argc ? std::atoi(argv[5]) : 0/*auto*/); const libxsmm_blasint nrepeat = (6 < argc ? std::atoi(argv[6]) : (0 >= q ? 
13 : 1)); const libxsmm_blasint lda = m, ldb = k, ldc = m; const char transa = 'N', transb = 'N'; const OTYPE alpha = 1, beta = 1; const libxsmm_blasint asize = PAD(ITYPE, lda * k), bsize = PAD(ITYPE, ldb * n), csize = PAD(OTYPE, ldc * n); const libxsmm_blasint max_size = ((2ULL << 30/*2 GB*/) / ((static_cast(asize) + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))); const libxsmm_blasint s = LIBXSMM_MIN(0 < q ? q : max_size, max_size); const libxsmm_blasint aspace = LIBXSMM_ALIGNMENT / sizeof(ITYPE); const size_t bwsize = (static_cast(asize)/*load*/ + static_cast(bsize)/*load*/) * sizeof(ITYPE) + (sizeof(OTYPE) * static_cast(csize) * 2/*RFO*/); const double gflops = 2E-9 * s * m * n * k; #if LIBXSMM_TYPEINFO(ITYPE, FP) const char ops[] = "FLOPS"; const double scale = 1.0 / s; #else const char ops[] = "OPS"; const double scale = 1; #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { #if defined(_OPENMP) const libxsmm_blasint chunksize = s / omp_get_max_threads(); #endif struct raii { // avoid std::vector (first-touch init. 
causes NUMA issue) ITYPE *a, *b; OTYPE *c; size_t m_size, m_shuffle; raii(libxsmm_blasint asize_, libxsmm_blasint bsize_, libxsmm_blasint csize_, libxsmm_blasint size_) : a(new ITYPE[static_cast(asize_)]), b(new ITYPE[static_cast(bsize_)]) , c(new OTYPE[static_cast(csize_)]) , m_size(static_cast(size_)), m_shuffle(libxsmm_shuffle(static_cast(size_))) {} ~raii() { delete[] a; delete[] b; delete[] c; } #if defined(RANDOMIZED) libxsmm_blasint shuffle(libxsmm_blasint i) const { return (i * m_shuffle) % m_size; } #else libxsmm_blasint shuffle(libxsmm_blasint i) const { return i; } #endif } helper(s * asize + aspace - 1, s * bsize + aspace - 1, s * csize + aspace - 1, s); ITYPE *const a = LIBXSMM_ALIGN(helper.a, LIBXSMM_ALIGNMENT); ITYPE *const b = LIBXSMM_ALIGN(helper.b, LIBXSMM_ALIGNMENT); OTYPE *const c = LIBXSMM_ALIGN(helper.c, LIBXSMM_ALIGNMENT); #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_MATINIT(ITYPE, 42 + helper.shuffle(i), a + static_cast(asize) * helper.shuffle(i), m, k, lda, scale); LIBXSMM_MATINIT(ITYPE, 24 + helper.shuffle(i), b + static_cast(bsize) * helper.shuffle(i), k, n, ldb, scale); LIBXSMM_MATINIT(OTYPE, 22 + i, c + static_cast(csize) * i, m, n, ldc, scale); } // initialize LIBXSMM libxsmm_init(); fprintf(stdout, "m=%lli n=%lli k=%lli size=%lli memory=%.1f MB (%s)\n\n", static_cast(m), static_cast(n), static_cast(k), static_cast(s), 1.0 * (s * ((static_cast(asize) + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))) / (1ULL << 20), 8 == sizeof(ITYPE) ? 
"DP" : "SP"); const libxsmm_mmfunction xmm(LIBXSMM_GEMM_FLAGS(transa, transb), m, n, k, lda, ldb, ldc, alpha, beta, LIBXSMM_PREFETCH_AUTO); switch (benchmark) { case 0: if (xmm) { fprintf(stdout, "LIBXSMM batched (A,B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { const ITYPE *const ai = a + static_cast(asize) * helper.shuffle(i), *const bi = b + static_cast(bsize) * helper.shuffle(i); OTYPE *const ci = c + static_cast(csize) * i; smm_xsmm_specialized(xmm, ai, bi, ci, LIBXSMM_GEMM_PREFETCH_A(ai + asize), LIBXSMM_GEMM_PREFETCH_B(bi + bsize), LIBXSMM_GEMM_PREFETCH_C(ci + csize)); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if defined(__EIGEN) case 1: { fprintf(stdout, "Eigen/dynamic batched (A,B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { const ITYPE *const ai = a + static_cast(asize) * helper.shuffle(i), *const bi = b + static_cast(bsize) * helper.shuffle(i); OTYPE *const ci = c + static_cast(csize) * i; smm_eigen_dynamic(m, n, k, ai, bi, ci); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < 
duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif /*defined(__EIGEN)*/ break; case 2: if (xmm) { fprintf(stdout, "LIBXSMM streamed (A,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { const ITYPE *const ai = a + static_cast(asize) * helper.shuffle(i); OTYPE *const ci = c + static_cast(csize) * i; smm_xsmm_specialized(xmm, ai, b, ci, LIBXSMM_GEMM_PREFETCH_A(ai + asize), LIBXSMM_GEMM_PREFETCH_B(b), LIBXSMM_GEMM_PREFETCH_C(ci + csize)); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if defined(__EIGEN) case 3: { fprintf(stdout, "Eigen/dynamic streamed (A,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { const ITYPE *const ai = a + static_cast(asize) * helper.shuffle(i); OTYPE *const ci = c + static_cast(csize) * i; smm_eigen_dynamic(m, n, k, ai, b, ci); } } const unsigned long long ncycles = 
libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif /*defined(__EIGEN)*/ break; case 4: if (xmm) { fprintf(stdout, "LIBXSMM streamed (B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { const ITYPE *const bi = b + static_cast(bsize) * helper.shuffle(i); OTYPE *const ci = c + static_cast(csize) * i; smm_xsmm_specialized(xmm, a, bi, ci, LIBXSMM_GEMM_PREFETCH_A(a), LIBXSMM_GEMM_PREFETCH_B(bi + bsize), LIBXSMM_GEMM_PREFETCH_C(ci + csize)); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if defined(__EIGEN) case 5: { fprintf(stdout, "Eigen/dynamic streamed (B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { const ITYPE *const bi = b + 
static_cast(bsize) * helper.shuffle(i); OTYPE *const ci = c + static_cast(csize) * i; smm_eigen_dynamic(m, n, k, a, bi, ci); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif /*defined(__EIGEN)*/ break; case 6: if (xmm) { fprintf(stdout, "LIBXSMM streamed (A,B)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize; #else const libxsmm_blasint j = 0; #endif const ITYPE *const ai = a + static_cast(asize) * helper.shuffle(i), *const bi = b + static_cast(bsize) * helper.shuffle(i); smm_xsmm_specialized(xmm, ai, bi, c + j, LIBXSMM_GEMM_PREFETCH_A(ai + asize), LIBXSMM_GEMM_PREFETCH_B(bi + bsize), LIBXSMM_GEMM_PREFETCH_C(c + j)); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - sizeof(OTYPE) * csize * 2) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * 
duration); } /* fallthrough */ #if defined(__EIGEN) case 7: { fprintf(stdout, "Eigen/dynamic streamed (A,B)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize; #else const libxsmm_blasint j = 0; #endif const ITYPE *const ai = a + static_cast(asize) * helper.shuffle(i), *const bi = b + static_cast(bsize) * helper.shuffle(i); smm_eigen_dynamic(m, n, k, ai, bi, c + j); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - sizeof(OTYPE) * csize * 2) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif /*defined(__EIGEN)*/ break; case 8: if (xmm) { fprintf(stdout, "LIBXSMM cached...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize; #else const libxsmm_blasint j = 0; #endif smm_xsmm_specialized(xmm, a, b, c + j, LIBXSMM_GEMM_PREFETCH_A(a), LIBXSMM_GEMM_PREFETCH_B(b), LIBXSMM_GEMM_PREFETCH_C(c + j)); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, 
ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if defined(__EIGEN) case 9: { fprintf(stdout, "Eigen/dynamic cached...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize; #else const libxsmm_blasint j = 0; #endif smm_eigen_dynamic(m, n, k, a, b, c + j); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif /*defined(__EIGEN)*/ break; default: throw "invalid case selected!"; } /*switch*/ // finalize LIBXSMM libxsmm_finalize(); fprintf(stdout, "Finished\n"); } } catch(const std::exception& e) { fprintf(stderr, "Error: %s\n", e.what()); result = EXIT_FAILURE; } catch(const char* message) { fprintf(stderr, "Error: %s\n", message); result = EXIT_FAILURE; } catch(...) 
{ fprintf(stderr, "Error: unknown exception caught!\n"); result = EXIT_FAILURE; } return result; } libxsmm-1.17/samples/eigen/eigen_smm.sh000077500000000000000000000050141415223013700201500ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/eigen/eigen_smm.vcxproj000066400000000000000000000520531415223013700212330ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 eigen_smm {D962EA7C-C1FD-460A-9478-084935ABB454} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(EIGENROOT);$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console Console MaxSpeed $(EIGENROOT);$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true Console X64 Full $(EIGENROOT);$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console Console X64 MaxSpeed $(EIGENROOT);$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true Console Disabled $(EIGENROOT);$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT Console X64 Disabled $(EIGENROOT);$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT Console 
libxsmm-1.17/samples/eigen/eigen_tensor.cpp000066400000000000000000000133071415223013700210370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ /** This sample uses LIBXSMM's header-only implementation. */ #include #if !defined(USE_LIBXSMM) # define USE_LIBXSMM #endif #if defined(USE_LIBXSMM) # if !defined(EIGEN_VECTORIZE_AVX) # define EIGEN_VECTORIZE_AVX # endif # if !defined(EIGEN_USE_LIBXSMM) # define EIGEN_USE_LIBXSMM # endif #endif #if !defined(__EIGEN) && !defined(__EIGEN_UNSUPPORTED) && 0 # define __EIGEN_UNSUPPORTED # define __EIGEN #endif #if !defined(EIGEN_USE_THREADS) && defined(__EIGEN) && (defined(_OPENMP) \ || !defined(__BLAS) || (defined(__BLAS) && 1 < (__BLAS))) # define EIGEN_USE_THREADS #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if defined(__EIGEN_UNSUPPORTED) # include # include #endif #include #include #include #include #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if !defined(ITYPE) # define ITYPE float #endif #if !defined(CHECK) && (LIBXSMM_EQUAL(ITYPE, float) || LIBXSMM_EQUAL(ITYPE, double)) # if !defined(MKL_DIRECT_CALL_SEQ) && !defined(MKL_DIRECT_CALL) LIBXSMM_BLAS_SYMBOL_DECL(ITYPE, gemm) # endif # define CHECK #endif int main(int argc, char* argv[]) { int result = EXIT_SUCCESS; try { #if !defined(__EIGEN_UNSUPPORTED) LIBXSMM_UNUSED(argc); LIBXSMM_UNUSED(argv); throw std::runtime_error("Eigen or Eigen/unsupported not found!"); 
#else LIBXSMM_BLAS_CONST libxsmm_blasint m = (1 < argc ? std::atoi(argv[1]) : 512); LIBXSMM_BLAS_CONST libxsmm_blasint k = (3 < argc ? atoi(argv[3]) : m); LIBXSMM_BLAS_CONST libxsmm_blasint n = (2 < argc ? atoi(argv[2]) : k); const int nrepeat = LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : 13 / LIBXSMM_MAX(1, libxsmm_icbrt_u64(1ULL * m * n * k) >> 10), 3); # if defined(CHECK) && (!defined(__BLAS) || (0 != __BLAS)) const double env_check = (0 == getenv("CHECK") ? 1.0 : atof(getenv("CHECK"))); const double check = LIBXSMM_ABS(env_check); # endif const double gflops = 2.0 * m * n * k * 1E-9; const int max_nthreads = Eigen::nbThreads(); const int env_nthreads = 0 == getenv("NTHREADS") ? max_nthreads : atoi(getenv("NTHREADS")); const int nthreads = LIBXSMM_CLMP(env_nthreads, 1, max_nthreads); # if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) # endif { Eigen::ThreadPool threadpool(nthreads); Eigen::ThreadPoolDevice device(&threadpool, threadpool.NumThreads()); Eigen::Tensor ta(m, k), tb(k, n), tc(m, n); LIBXSMM_BLAS_CONST char transa = 'N', transb = 'N'; LIBXSMM_BLAS_CONST ITYPE alpha(1), beta(0); unsigned long long start; double d1; { std::array,1> product_dims = { Eigen::IndexPair(1, 0), }; ta.setRandom(); tb.setRandom(); start = libxsmm_timer_tick(); for (int i = 0; i < nrepeat; ++i) { tc.device(device) = ta.contract(tb, product_dims); } d1 = libxsmm_timer_duration(start, libxsmm_timer_tick()); } libxsmm_gemm_print(stdout, libxsmm_gemm_precision_enum::value, &transa, &transb, &m, &n, &k, &alpha, ta.data(), &m, tb.data(), &k, &beta, tc.data(), &m); fprintf(stdout, "\n\n"); # if defined(CHECK) && (!defined(__BLAS) || (0 != __BLAS)) Eigen::Tensor td(m, n); double d2; { start = libxsmm_timer_tick(); for (int i = 0; i < nrepeat; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, ta.data(), &m, tb.data(), &k, &beta, td.data(), &m); } d2 = libxsmm_timer_duration(start, libxsmm_timer_tick()); } # endif if (0 < d1) { 
fprintf(stdout, "\tEigen" # if !defined(USE_LIBXSMM) "+XSMM" # endif ": %.1f GFLOPS/s\n", gflops * nrepeat / d1); } # if defined(CHECK) && (!defined(__BLAS) || (0 != __BLAS)) if (0 < d2) { fprintf(stdout, "\tBLAS: %.1f GFLOPS/s\n", gflops * nrepeat / d2); } libxsmm_matdiff_info diff; result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(ITYPE), m, n, td.data(), tc.data(), &m, &m); if (EXIT_SUCCESS == result) { fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < diff.l2_rel) { fprintf(stderr, "FAILED.\n"); result = EXIT_FAILURE; } } # endif } fprintf(stdout, "Finished\n"); #endif /*defined(__EIGEN_UNSUPPORTED)*/ } catch(const std::exception& e) { fprintf(stderr, "Error: %s\n", e.what()); result = EXIT_FAILURE; } catch(const char* message) { fprintf(stderr, "Error: %s\n", message); result = EXIT_FAILURE; } catch(...) { fprintf(stderr, "Error: unknown exception caught!\n"); result = EXIT_FAILURE; } return result; } libxsmm-1.17/samples/eigen/eigen_tensor.sh000077500000000000000000000050141415223013700206660ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/eigen/eigen_tensor.vcxproj000066400000000000000000000520641415223013700217530ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 eigen_tensor {315C606A-433F-43DB-8A26-A44AB66C99B1} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(EIGENROOT);$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console Console MaxSpeed $(EIGENROOT);$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true Console X64 Full $(EIGENROOT);$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console Console X64 MaxSpeed $(EIGENROOT);$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true Console Disabled $(EIGENROOT);$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT Console X64 Disabled $(EIGENROOT);$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT Console 
libxsmm-1.17/samples/eltwise/000077500000000000000000000000001415223013700162335ustar00rootroot00000000000000libxsmm-1.17/samples/eltwise/Makefile000066400000000000000000000053771415223013700177070ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) OMP = 1 SYM = 1 # explore AVX/ARCH=native SSE = 0 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) XFILES := $(OUTDIR)/eltwise_reduce $(OUTDIR)/eltwise_scale .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(OUTDIR)/eltwise_reduce: $(OUTDIR)/.make $(BLDDIR)/eltwise_reduce-c.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/eltwise_reduce-c.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/eltwise_scale: $(OUTDIR)/.make $(BLDDIR)/eltwise_scale-c.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/eltwise_scale-c.o $(MAINLIB) 
$(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/eltwise/eltwise_reduce.c000066400000000000000000000164251415223013700214120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.) 
******************************************************************************/ #include #include #include #include #include LIBXSMM_INLINE void sfill_matrix ( float *matrix, unsigned int ld, unsigned int m, unsigned int n ) { unsigned int i, j; double dtmp; if ( ld < m ) { fprintf(stderr,"Error is sfill_matrix: ld=%u m=%u mismatched!\n",ld,m); exit(EXIT_FAILURE); } for ( j = 1; j <= n; j++ ) { /* Fill through the leading dimension */ for ( i = 1; i <= ld; i++ ) { dtmp = 1.0 - 2.0*libxsmm_rng_f64(); matrix [ (j-1)*ld + (i-1) ] = (float) dtmp; } } } int main(int argc, char* argv[]) { unsigned int m = 64, n = 64, reduce_elts = 1, reduce_elts_squared = 1, reduce_rows = 1, result_size, i, j, k, iters = 10000; libxsmm_blasint ld_in = 64/*, ld_out = 64*/; float *sinp, *result_reduce_elts, *result_reduce_elts_squared, *ref_result_reduce_elts, *ref_result_reduce_elts_squared; libxsmm_meltw_redu_flags jit_flags = LIBXSMM_MELTW_FLAG_REDUCE_NONE; libxsmm_meltwfunction_reduce kernel; libxsmm_meltw_reduce_param params; libxsmm_matdiff_info norms_elts, norms_elts_squared; unsigned long long l_start, l_end; double l_total = 0.0, l_total2 = 0.0; libxsmm_init(); libxsmm_matdiff_clear(&norms_elts); libxsmm_matdiff_clear(&norms_elts_squared); if ( argc > 1 ) m = atoi(argv[1]); if ( argc > 2 ) n = atoi(argv[2]); if ( argc > 3 ) ld_in = atoi(argv[3]); if ( argc > 4 ) reduce_elts = atoi(argv[4]); if ( argc > 5 ) reduce_elts_squared = atoi(argv[5]); if ( argc > 6 ) reduce_rows = atoi(argv[6]); if ( argc > 7 ) iters = atoi(argv[7]); m = LIBXSMM_MAX(m,1); n = LIBXSMM_MAX(n,1); ld_in = LIBXSMM_MAX(ld_in,(libxsmm_blasint)m); result_size = (reduce_rows == 1) ? 
n : m; /* Allocate arrays */ sinp = (float*) malloc( ld_in*n*sizeof(float) ); result_reduce_elts = (float*) malloc(result_size*sizeof(float) ); result_reduce_elts_squared = (float*) malloc(result_size*sizeof(float) ); ref_result_reduce_elts = (float*) malloc(result_size*sizeof(float) ); ref_result_reduce_elts_squared = (float*) malloc(result_size*sizeof(float) ); /* Fill matrices with random data */ sfill_matrix ( sinp, ld_in, m, n ); /* Calculate reference results... */ if (reduce_rows == 1) { for (j = 0; j < n; j++) { ref_result_reduce_elts[j] = 0; ref_result_reduce_elts_squared[j] = 0; for (i = 0; i < m; i++) { ref_result_reduce_elts[j] += sinp[j*ld_in + i]; ref_result_reduce_elts_squared[j] += sinp[j*ld_in + i] * sinp[j*ld_in + i]; } } } else { /* In this case we reduce columns */ for (i = 0; i < m; i++) { ref_result_reduce_elts[i] = 0; ref_result_reduce_elts_squared[i] = 0; for (j = 0; j < n; j++) { ref_result_reduce_elts[i] += sinp[j*ld_in + i]; ref_result_reduce_elts_squared[i] += sinp[j*ld_in + i] * sinp[j*ld_in + i]; } } } /* Generate JITED kernel */ if (reduce_rows == 1) { jit_flags = LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD | LIBXSMM_MELTW_FLAG_REDUCE_ROWS; } else { jit_flags = LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD | LIBXSMM_MELTW_FLAG_REDUCE_COLS; } if (reduce_elts == 1) { jit_flags |= LIBXSMM_MELTW_FLAG_REDUCE_ELTS; } if (reduce_elts_squared == 1) { jit_flags |= LIBXSMM_MELTW_FLAG_REDUCE_ELTS_SQUARED; } printf("JITing reduce kernel... \n"); kernel = libxsmm_dispatch_meltw_reduce(m, n, &ld_in, &ld_in, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, jit_flags); /* Call JITed kernel and compare result */ printf("Calling JITed reduce kernel... 
\n"); params.in_ptr = sinp; params.out_ptr_0 = result_reduce_elts; params.out_ptr_1 = result_reduce_elts_squared; kernel( ¶ms ); /* compare */ printf("##########################################\n"); printf("# Correctness - Eltwise reduce #\n"); printf("##########################################\n"); libxsmm_matdiff(&norms_elts, LIBXSMM_DATATYPE_F32, result_size, 1, ref_result_reduce_elts, result_reduce_elts, 0, 0); printf("L1 reference : %.25g\n", norms_elts.l1_ref); printf("L1 test : %.25g\n", norms_elts.l1_tst); printf("L2 abs.error : %.24f\n", norms_elts.l2_abs); printf("L2 rel.error : %.24f\n", norms_elts.l2_rel); printf("Linf abs.error: %.24f\n", norms_elts.linf_abs); printf("Linf rel.error: %.24f\n", norms_elts.linf_rel); printf("Check-norm : %.24f\n\n", norms_elts.normf_rel); /* compare */ printf("##########################################\n"); printf("# Correctness - Eltwise-square reduce #\n"); printf("##########################################\n"); libxsmm_matdiff(&norms_elts_squared, LIBXSMM_DATATYPE_F32, result_size, 1, ref_result_reduce_elts_squared, result_reduce_elts_squared, 0, 0); printf("L1 reference : %.25g\n", norms_elts_squared.l1_ref); printf("L1 test : %.25g\n", norms_elts_squared.l1_tst); printf("L2 abs.error : %.24f\n", norms_elts_squared.l2_abs); printf("L2 rel.error : %.24f\n", norms_elts_squared.l2_rel); printf("Linf abs.error: %.24f\n", norms_elts_squared.linf_abs); printf("Linf rel.error: %.24f\n", norms_elts_squared.linf_rel); printf("Check-norm : %.24f\n\n", norms_elts_squared.normf_rel); l_start = libxsmm_timer_tick(); /* Calculate reference results... 
*/ for (k = 0; k < iters; k++) { if (reduce_rows == 1) { for (j = 0; j < n; j++) { ref_result_reduce_elts[j] = 0; ref_result_reduce_elts_squared[j] = 0; for (i = 0; i < m; i++) { ref_result_reduce_elts[j] += sinp[j*ld_in + i]; ref_result_reduce_elts_squared[j] += sinp[j*ld_in + i] * sinp[j*ld_in + i]; } } } else { /* In this case we reduce columns */ for (i = 0; i < m; i++) { ref_result_reduce_elts[i] = 0; ref_result_reduce_elts_squared[i] = 0; for (j = 0; j < n; j++) { ref_result_reduce_elts[i] += sinp[j*ld_in + i]; ref_result_reduce_elts_squared[i] += sinp[j*ld_in + i] * sinp[j*ld_in + i]; } } } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("Reference time = %.5g\n", ((double)(l_total))); l_start = libxsmm_timer_tick(); for (k = 0; k < iters; k++) { kernel( ¶ms ); } l_end = libxsmm_timer_tick(); l_total2 = libxsmm_timer_duration(l_start, l_end); printf("Optimized time = %.5g\n", ((double)(l_total2))); printf("Speedup is = %.5g\n", ((double)(l_total/l_total2))); free(sinp); free(result_reduce_elts); free(result_reduce_elts_squared); free(ref_result_reduce_elts); free(ref_result_reduce_elts_squared); return EXIT_SUCCESS; } libxsmm-1.17/samples/eltwise/eltwise_reduce.vcxproj000066400000000000000000000547361415223013700226720ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 eltwise_reduce 10.0 {A05787E8-13A7-43E3-BC15-FBCEEFC68880} Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST GenerateParallelCode true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST GenerateParallelCode true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST GenerateParallelCode true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST GenerateParallelCode true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST GenerateParallelCode true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 
Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST GenerateParallelCode true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/eltwise/eltwise_scale.c000066400000000000000000000161631415223013700212310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.) 
******************************************************************************/ #include #include #include #include #include LIBXSMM_INLINE void sfill_matrix ( float *matrix, unsigned int ld, unsigned int m, unsigned int n ) { unsigned int i, j; double dtmp; if ( ld < m ) { fprintf(stderr,"Error is sfill_matrix: ld=%u m=%u mismatched!\n",ld,m); exit(EXIT_FAILURE); } for ( j = 1; j <= n; j++ ) { /* Fill through the leading dimension */ for ( i = 1; i <= ld; i++ ) { dtmp = 1.0 - 2.0*libxsmm_rng_f64(); matrix [ (j-1)*ld + (i-1) ] = (float) dtmp; } } } int main(int argc, char* argv[]) { unsigned int m = 64, n = 64, perform_scale = 1, perform_shift = 1, perform_bias = 1, scale_rows = 1, vectors_size, i, j, k, iters = 10000; libxsmm_blasint ld_in = 64, ld_out = 64; float *sinp, *sout, *scale_vals, *shift_vals, *bias_vals, *ref_out; libxsmm_meltw_scal_flags jit_flags = 0; libxsmm_meltwfunction_scale kernel; libxsmm_meltw_scale_param params; libxsmm_matdiff_info norms_out; unsigned long long l_start, l_end; double l_total = 0.0, l_total2 = 0.0; libxsmm_init(); libxsmm_matdiff_clear(&norms_out); if ( argc > 1 ) m = atoi(argv[1]); if ( argc > 2 ) n = atoi(argv[2]); if ( argc > 3 ) ld_in = atoi(argv[3]); if ( argc > 4 ) ld_out = atoi(argv[4]); if ( argc > 5 ) perform_shift = atoi(argv[5]); if ( argc > 6 ) perform_scale = atoi(argv[6]); if ( argc > 7 ) perform_bias = atoi(argv[7]); if ( argc > 8 ) scale_rows = atoi(argv[8]); if ( argc > 9 ) iters = atoi(argv[9]); m = LIBXSMM_MAX(m,1); n = LIBXSMM_MAX(n,1); ld_in = LIBXSMM_MAX(ld_in,(libxsmm_blasint)m); ld_out = LIBXSMM_MAX(ld_out,(libxsmm_blasint)m); vectors_size = (scale_rows == 1) ? 
n : m; /* Allocate arrays */ sinp = (float*) malloc( ld_in*n*sizeof(float) ); sout = (float*) malloc( ld_out*n*sizeof(float) ); ref_out = (float*) malloc( ld_out*n*sizeof(float) ); scale_vals = (float*) malloc(vectors_size*sizeof(float) ); shift_vals = (float*) malloc(vectors_size*sizeof(float) ); bias_vals = (float*) malloc(vectors_size*sizeof(float) ); /* Fill matrices with random data */ sfill_matrix ( sinp, ld_in, m, n ); sfill_matrix ( scale_vals, vectors_size, vectors_size, 1 ); sfill_matrix ( shift_vals, vectors_size, vectors_size, 1 ); sfill_matrix ( bias_vals, vectors_size, vectors_size, 1 ); /* Calculate reference results... */ if (scale_rows == 1) { for (j = 0; j < n; j++) { float scale = scale_vals[j]; float shift = shift_vals[j]; float bias = bias_vals[j]; for (i = 0; i < m; i++) { float out; out = sinp[j*ld_in + i]; if (perform_shift) out += shift; if (perform_scale) out *= scale; if (perform_bias) out += bias; ref_out[j*ld_out + i] = out; } } } else { /* In this case we reduce columns */ for (i = 0; i < m; i++) { float scale = scale_vals[i]; float shift = shift_vals[i]; float bias = bias_vals[i]; for (j = 0; j < n; j++) { float out; out = sinp[j*ld_in + i]; if (perform_shift) out += shift; if (perform_scale) out *= scale; if (perform_bias) out += bias; ref_out[j*ld_out + i] = out; } } } /* Generate JITED kernel */ if (scale_rows == 1) { jit_flags = LIBXSMM_MELTW_FLAG_SCALE_ROWS; } else { jit_flags = LIBXSMM_MELTW_FLAG_SCALE_COLS; } if (perform_scale == 1) { jit_flags |= LIBXSMM_MELTW_FLAG_SCALE_MULT; } if (perform_shift == 1) { jit_flags |= LIBXSMM_MELTW_FLAG_SCALE_SHIFT; } if (perform_bias == 1) { jit_flags |= LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS; } printf("JITing scale kernel... \n"); kernel = libxsmm_dispatch_meltw_scale(m, n, &ld_in, &ld_out, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, jit_flags); /* Call JITed kernel and compare result */ printf("Calling JITed reduce kernel... 
\n"); params.in_ptr = sinp; params.out_ptr = sout; params.shift_vals_ptr = shift_vals; params.scale_vals_ptr = scale_vals; params.bias_vals_ptr = bias_vals; kernel( ¶ms ); /* compare */ printf("##########################################\n"); printf("# Correctness - Eltwise scale out #\n"); printf("##########################################\n"); libxsmm_matdiff(&norms_out, LIBXSMM_DATATYPE_F32, n * ld_out, 1, ref_out, sout, 0, 0); printf("L1 reference : %.25g\n", norms_out.l1_ref); printf("L1 test : %.25g\n", norms_out.l1_tst); printf("L2 abs.error : %.24f\n", norms_out.l2_abs); printf("L2 rel.error : %.24f\n", norms_out.l2_rel); printf("Linf abs.error: %.24f\n", norms_out.linf_abs); printf("Linf rel.error: %.24f\n", norms_out.linf_rel); printf("Check-norm : %.24f\n\n", norms_out.normf_rel); l_start = libxsmm_timer_tick(); /* Calculate reference results... */ for (k = 0; k < iters; k++) { /* Calculate reference results... */ if (scale_rows == 1) { for (j = 0; j < n; j++) { float scale = scale_vals[j]; float shift = shift_vals[j]; float bias = bias_vals[j]; for (i = 0; i < m; i++) { float out; out = sinp[j*ld_in + i]; if (perform_shift) out += shift; if (perform_scale) out *= scale; if (perform_bias) out += bias; ref_out[j*ld_out + i] = out; } } } else { /* In this case we reduce columns */ for (i = 0; i < m; i++) { float scale = scale_vals[i]; float shift = shift_vals[i]; float bias = bias_vals[i]; for (j = 0; j < n; j++) { float out; out = sinp[j*ld_in + i]; if (perform_shift) out += shift; if (perform_scale) out *= scale; if (perform_bias) out += bias; ref_out[j*ld_out + i] = out; } } } } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("Reference time = %.5g\n", ((double)(l_total))); l_start = libxsmm_timer_tick(); for (k = 0; k < iters; k++) { kernel( ¶ms ); } l_end = libxsmm_timer_tick(); l_total2 = libxsmm_timer_duration(l_start, l_end); printf("Optimized time = %.5g\n", ((double)(l_total2))); printf("Speedup is = %.5g\n", 
((double)(l_total/l_total2))); free(sinp); free(sout); free(ref_out); free(scale_vals); free(bias_vals); free(shift_vals); return EXIT_SUCCESS; } libxsmm-1.17/samples/generator/000077500000000000000000000000001415223013700165455ustar00rootroot00000000000000libxsmm-1.17/samples/generator/left_sparse_test_csc.mtx000066400000000000000000000356731415223013700235130ustar00rootroot00000000000000%%MatrixMarket matrix coordinate real general % 84 84 686 1 2 2 2 5 6 1 6 1 3 6 3.333333333333333 4 6 -0.3333333333333333 1 8 1 4 8 3 1 11 0.6 3 11 -0.6666666666666666 4 11 -0.3333333333333333 5 11 10 7 11 0.2 9 11 0.1333333333333333 10 11 0.06666666666666667 2 12 2 6 12 8.4 8 12 -0.4 1 13 0.6 3 13 2.666666666666667 4 13 -0.3333333333333333 7 13 4.2 9 13 -0.5333333333333333 10 13 0.06666666666666667 2 15 2 8 15 8 1 16 0.6 3 16 1.111111111111111 4 16 1.222222222222222 9 16 4.444444444444445 10 16 -0.7111111111111111 1 18 0.6 4 18 2.333333333333333 10 18 3.733333333333333 2 21 2.857142857142857 6 21 -1.5 8 21 -1 11 21 14 13 21 0.2857142857142857 16 21 0.2142857142857143 18 21 0.1428571428571428 1 22 0.4 3 22 0.5714285714285714 4 22 -0.2857142857142857 5 22 2.5 7 22 -1.3 9 22 -0.2 10 22 0.1 12 22 12.85714285714286 14 22 0.5142857142857142 15 22 -0.3571428571428572 17 22 0.1857142857142857 19 22 0.02857142857142857 20 22 -0.01428571428571429 2 23 0.8571428571428571 6 23 4.8 8 23 -0.3 13 23 10.28571428571429 16 23 -0.6857142857142857 18 23 0.04285714285714286 1 24 0.4 3 24 2 4 24 -0.2857142857142857 7 24 4.2 9 24 -0.7 10 24 0.1 14 24 4.8 17 24 -0.6 19 24 0.1 20 24 -0.01428571428571429 1 26 0.4 3 26 -0.2857142857142857 4 26 0.5714285714285714 5 26 2.5 7 26 0.05 9 26 -0.8 10 26 -0.65 15 26 12.5 17 26 0.25 19 26 0.2857142857142857 20 26 0.1785714285714286 2 27 0.8571428571428571 6 27 2.1 8 27 2.4 16 27 10.5 18 27 -0.8571428571428571 1 28 0.4 3 28 1.142857142857143 4 28 0.5714285714285714 7 28 1.05 9 28 3.2 10 28 -0.65 17 28 5.25 19 28 -1.142857142857143 20 28 0.1785714285714286 2 30 
0.8571428571428571 8 30 4.5 18 30 9.642857142857142 1 31 0.4 3 31 0.4761904761904762 4 31 1.238095238095238 9 31 2.5 10 31 1.1 19 31 5.357142857142857 20 31 -1.071428571428571 1 33 0.4 4 33 1.714285714285714 10 33 3.6 20 33 4.285714285714286 1 36 0.2857142857142857 3 36 -0.4761904761904762 4 36 -0.2380952380952381 5 36 5.833333333333333 7 36 0.3333333333333333 9 36 0.2222222222222222 10 36 0.1111111111111111 12 36 -2 14 36 -0.1142857142857143 15 36 -1.5 17 36 -0.08571428571428572 19 36 -0.05714285714285714 20 36 -0.02857142857142857 21 36 18 23 36 0.2777777777777778 25 36 0.01587301587301587 27 36 0.2222222222222222 29 36 0.0126984126984127 30 36 0.1666666666666667 32 36 0.009523809523809525 34 36 0.006349206349206349 35 36 0.003174603174603175 2 37 1.428571428571429 6 37 3.166666666666667 8 37 -0.6666666666666666 11 37 2.8 13 37 -3.085714285714286 16 37 -0.8142857142857143 18 37 0.1714285714285714 22 37 17.11111111111111 24 37 0.7857142857142857 26 37 -0.3111111111111111 28 37 0.3428571428571429 31 37 0.09047619047619047 33 37 -0.01904761904761905 1 38 0.2857142857142857 3 38 0.8095238095238095 4 38 -0.2380952380952381 5 38 0.8333333333333334 7 38 -0.06666666666666667 9 38 -0.3777777777777778 10 38 0.1111111111111111 12 38 5.714285714285714 14 38 -1.657142857142857 15 38 -0.2142857142857143 17 38 0.01714285714285714 19 38 0.09714285714285714 20 38 -0.02857142857142857 23 38 15.27777777777778 25 38 0.873015873015873 27 38 -0.6349206349206349 29 38 0.1841269841269841 30 38 0.02380952380952381 32 38 -0.001904761904761905 34 38 -0.01079365079365079 35 38 0.003174603174603175 2 39 0.4285714285714285 6 39 2.7 8 39 -0.2 13 39 7.714285714285714 16 39 -0.6942857142857143 18 39 0.05142857142857143 24 39 11.78571428571429 28 39 -0.8571428571428571 31 39 0.07714285714285714 33 39 -0.005714285714285714 1 40 0.2857142857142857 3 40 1.523809523809524 4 40 -0.2380952380952381 7 40 3.6 9 40 -0.7111111111111111 10 40 0.1111111111111111 14 40 5.485714285714286 17 40 
-0.9257142857142857 19 40 0.1828571428571429 20 40 -0.02857142857142857 25 40 5.238095238095238 29 40 -0.6095238095238096 32 40 0.1028571428571429 34 40 -0.02031746031746032 35 40 0.003174603174603175 2 42 1.428571428571429 6 42 -0.5 8 42 3 11 42 2.8 13 42 0.05714285714285714 16 42 -1.757142857142857 18 42 -2.028571428571428 26 42 16.8 28 42 0.3428571428571429 31 42 0.4571428571428571 33 42 0.4 1 43 0.2857142857142857 3 43 0.2857142857142857 4 43 0.2857142857142857 5 43 0.8333333333333334 7 43 -0.4333333333333333 9 43 0.6 10 43 -0.5 12 43 2.571428571428572 14 43 0.1028571428571429 15 43 2.928571428571428 17 43 -1.522857142857143 19 43 -0.4057142857142857 20 43 0.2542857142857143 27 43 15.42857142857143 29 43 0.6171428571428571 30 43 -0.7619047619047619 32 43 0.3961904761904762 34 43 0.08 35 43 -0.04571428571428571 2 44 0.4285714285714285 6 44 1.6 8 44 0.9 13 44 2.057142857142857 16 44 5.622857142857143 18 44 -0.6085714285714285 28 44 12.34285714285714 31 44 -1.462857142857143 33 44 0.12 1 45 0.2857142857142857 3 45 1 4 45 0.2857142857142857 7 45 1.4 9 45 2.1 10 45 -0.5 14 45 0.96 17 45 4.92 19 45 -1.42 20 45 0.2542857142857143 29 45 5.76 32 45 -1.28 34 45 0.28 35 45 -0.04571428571428571 1 47 0.2857142857142857 3 47 -0.1428571428571428 4 47 0.7142857142857143 5 47 0.8333333333333334 7 47 0.01666666666666667 9 47 -0.6 10 47 0.25 15 47 5.5 17 47 0.11 19 47 -0.8171428571428572 20 47 -0.8642857142857143 30 47 14.66666666666667 32 47 0.2933333333333333 34 47 0.44 35 47 0.3142857142857143 2 48 0.4285714285714285 6 48 0.7 8 48 1.8 16 48 4.62 18 48 2.451428571428572 31 48 12.32 33 48 -1.32 1 49 0.2857142857142857 3 49 0.5714285714285714 4 49 0.7142857142857143 7 49 0.35 9 49 2.4 10 49 0.25 17 49 2.31 19 49 3.268571428571429 20 49 -0.8642857142857143 32 49 6.16 34 49 -1.76 35 49 0.3142857142857143 2 51 0.4285714285714285 8 51 2.5 18 51 7.071428571428571 33 51 11 1 52 0.2857142857142857 3 52 0.2380952380952381 4 52 1.047619047619048 9 52 1.388888888888889 10 52 
1.611111111111111 19 52 3.928571428571428 20 52 0.7857142857142857 34 52 6.111111111111111 35 52 -1.396825396825397 1 54 0.2857142857142857 4 54 1.285714285714286 10 54 3 20 54 4.714285714285714 35 54 4.714285714285714 2 57 1.666666666666667 6 57 -1.4 8 57 -0.9333333333333333 11 57 9.163636363636364 13 57 0.6545454545454545 16 57 0.4909090909090909 18 57 0.3272727272727273 22 57 -2.333333333333333 24 57 -0.1666666666666667 26 57 -1.866666666666667 28 57 -0.1333333333333333 31 57 -0.1 33 57 -0.06666666666666667 36 57 22 38 57 0.2545454545454545 40 57 0.01818181818181818 43 57 0.2121212121212121 45 57 0.01515151515151515 47 57 0.1696969696969697 49 57 0.01212121212121212 52 57 0.00909090909090909 54 57 0.006060606060606061 1 58 0.2142857142857143 3 58 0.07936507936507936 4 58 -0.1984126984126984 5 58 2.333333333333333 7 58 -0.7333333333333333 9 58 -0.04444444444444445 10 58 0.1111111111111111 12 58 6.545454545454546 14 58 0.7792207792207793 15 58 -0.8181818181818182 17 58 0.2571428571428571 19 58 0.01558441558441558 20 58 -0.03896103896103896 21 58 3 23 58 -4.166666666666667 25 58 -0.3412698412698413 27 58 -1.333333333333333 29 58 -0.1587301587301587 30 58 0.1666666666666667 32 58 -0.05238095238095238 34 58 -0.003174603174603175 35 58 0.007936507936507936 37 58 21.27272727272727 39 58 0.7878787878787878 41 58 0.05627705627705628 42 58 -0.2727272727272727 44 58 0.3787878787878788 46 58 0.03102453102453102 48 58 0.1212121212121212 50 58 0.01443001443001443 51 58 -0.01515151515151515 53 58 0.004761904761904762 55 58 0.0002886002886002886 56 58 -0.0007215007215007215 2 59 0.7936507936507936 6 59 3 8 59 -0.4444444444444444 11 59 0.7636363636363637 13 59 2.197402597402597 16 59 -1.051948051948052 18 59 0.1558441558441558 22 59 6.222222222222222 24 59 -4.357142857142857 26 59 -0.1555555555555556 28 59 -0.4476190476190476 31 59 0.2142857142857143 33 59 -0.03174603174603174 38 59 19.85454545454546 40 59 1.418181818181818 43 59 -0.5656565656565656 45 59 0.3961038961038961 47 
59 0.01414141414141414 49 59 0.04069264069264069 52 59 -0.01948051948051948 54 59 0.002886002886002886 1 60 0.2142857142857143 3 60 0.7936507936507936 4 60 -0.1984126984126984 5 60 0.3333333333333333 7 60 0.6666666666666666 9 60 -0.4444444444444444 10 60 0.1111111111111111 12 60 2.571428571428572 14 60 -0.8571428571428571 15 60 -0.1168831168831169 17 60 -0.2337662337662338 19 60 0.1558441558441558 20 60 -0.03896103896103896 23 60 9.166666666666666 25 60 -1.746031746031746 27 60 -0.5238095238095238 29 60 0.1746031746031746 30 60 0.02380952380952381 32 60 0.04761904761904762 34 60 -0.03174603174603174 35 60 0.007936507936507936 39 60 17.33333333333333 41 60 1.238095238095238 44 60 -0.8333333333333334 46 60 0.1587301587301587 48 60 0.04761904761904762 50 60 -0.01587301587301587 51 60 -0.002164502164502165 53 60 -0.004329004329004329 55 60 0.002886002886002886 56 60 -0.0007215007215007215 2 61 0.2380952380952381 6 61 1.6 8 61 -0.1333333333333333 13 61 5.142857142857143 16 61 -0.561038961038961 18 61 0.04675324675324675 24 61 10.47619047619048 28 61 -1.047619047619048 31 61 0.1142857142857143 33 61 -0.009523809523809525 40 61 13 45 61 -0.9523809523809523 49 61 0.09523809523809523 52 61 -0.01038961038961039 54 61 0.0008658008658008658 1 62 0.2142857142857143 3 62 1.19047619047619 4 62 -0.1984126984126984 7 62 3 9 62 -0.6666666666666666 10 62 0.1111111111111111 14 62 5.142857142857143 17 62 -1.051948051948052 19 62 0.2337662337662338 20 62 -0.03896103896103896 25 62 6.547619047619047 29 62 -1.047619047619048 32 62 0.2142857142857143 34 62 -0.04761904761904762 35 62 0.007936507936507936 41 62 5.571428571428571 46 62 -0.5952380952380952 50 62 0.09523809523809523 53 62 -0.01948051948051948 55 62 0.004329004329004329 56 62 -0.0007215007215007215 1 64 0.2142857142857143 3 64 -0.2645502645502645 4 64 0.1455026455026455 5 64 2.333333333333333 7 64 0.1333333333333333 9 64 -0.4296296296296296 10 64 -0.3703703703703703 12 64 -0.5454545454545454 14 64 -0.03116883116883117 15 64 
6.272727272727272 17 64 0.3584415584415584 19 64 0.4207792207792208 20 64 0.2649350649350649 21 64 3 23 64 0.04629629629629629 25 64 0.002645502645502645 27 64 -2.296296296296296 29 64 -0.1312169312169312 30 64 -3.083333333333333 32 64 -0.1761904761904762 34 64 -0.1544973544973545 35 64 -0.08835978835978836 42 64 21 44 64 0.3240740740740741 46 64 0.01851851851851852 48 64 0.4713804713804714 50 64 0.02693602693602693 51 64 0.4772727272727273 53 64 0.02727272727272727 55 64 0.02154882154882155 56 64 0.01178451178451178 2 65 0.7936507936507936 6 65 1.266666666666667 8 65 1.288888888888889 11 65 0.7636363636363637 13 65 -0.8415584415584415 16 65 3.405194805194805 18 65 -1.262337662337662 22 65 2.851851851851852 24 65 0.130952380952381 26 65 3.214814814814815 28 65 -3.542857142857143 31 65 -1.673809523809524 33 65 0.4634920634920635 43 65 19.96296296296296 45 65 0.9166666666666666 47 65 -0.6599326599326599 49 65 0.7272727272727273 52 65 0.2590909090909091 54 65 -0.06464646464646465 1 66 0.2142857142857143 3 66 0.4497354497354497 4 66 0.1455026455026455 5 66 0.3333333333333333 7 66 -0.02666666666666667 9 66 0.7303703703703703 10 66 -0.3703703703703703 12 66 1.558441558441559 14 66 -0.4519480519480519 15 66 0.8961038961038961 17 66 -0.07168831168831169 19 66 -0.7153246753246754 20 66 0.2649350649350649 23 66 2.546296296296296 25 66 0.1455026455026455 27 66 6.560846560846561 29 66 -1.902645502645503 30 66 -0.4404761904761905 32 66 0.03523809523809524 34 66 0.2626455026455026 35 66 -0.08835978835978836 44 66 17.82407407407407 46 66 1.018518518518519 48 66 -1.346801346801347 50 66 0.3905723905723906 51 66 0.06818181818181818 53 66 -0.005454545454545455 55 66 -0.03663299663299664 56 66 0.01178451178451178 2 67 0.2380952380952381 6 67 1.08 8 67 0.3866666666666667 13 67 2.103896103896104 16 67 2.903376623376623 18 67 -0.3787012987012987 24 67 1.964285714285714 28 67 8.857142857142858 31 67 -1.427142857142857 33 67 0.1390476190476191 45 67 13.75 49 67 -1.818181818181818 52 67 
0.2209090909090909 54 67 -0.01939393939393939 1 68 0.2142857142857143 3 68 0.8465608465608465 4 68 0.1455026455026455 7 68 1.44 9 68 1.374814814814815 10 68 -0.3703703703703703 14 68 1.496103896103896 17 68 3.871168831168831 19 68 -1.346493506493506 20 68 0.2649350649350649 25 68 0.873015873015873 29 68 6.298412698412698 32 68 -1.902857142857143 34 68 0.4943915343915344 35 68 -0.08835978835978836 46 68 6.111111111111111 50 68 -1.292929292929293 53 68 0.2945454545454546 55 68 -0.06895622895622895 56 68 0.01178451178451178 2 70 0.7936507936507936 6 70 -0.2 8 70 2.755555555555556 11 70 0.7636363636363637 13 70 0.01558441558441558 16 70 -1.051948051948052 18 70 2.337662337662338 26 70 6.066666666666666 28 70 0.1238095238095238 31 70 -1.857142857142857 33 70 -2.888888888888889 47 70 19.3030303030303 49 70 0.3939393939393939 52 70 0.7090909090909091 54 70 0.7353535353535353 1 71 0.2142857142857143 3 71 0.1587301587301587 4 71 0.4365079365079365 5 71 0.3333333333333333 7 71 -0.1733333333333333 9 71 0.5511111111111111 10 71 -0.04444444444444445 12 71 0.7012987012987013 14 71 0.02805194805194805 15 71 1.753246753246753 17 71 -0.9116883116883117 19 71 0.4675324675324675 20 71 -0.5579220779220779 27 71 5.571428571428571 29 71 0.2228571428571429 30 71 3.095238095238095 32 71 -1.60952380952381 34 71 -0.5777777777777777 35 71 0.4168253968253968 48 71 17.72727272727273 50 71 0.7090909090909091 51 71 -1.181818181818182 53 71 0.6145454545454545 55 71 0.1470707070707071 56 71 -0.09191919191919191 2 72 0.2380952380952381 6 72 0.64 8 72 0.8266666666666667 13 72 0.561038961038961 16 72 3.366233766233766 18 72 0.7012987012987013 28 72 4.457142857142857 31 72 5.942857142857143 33 72 -0.8666666666666667 49 72 14.18181818181818 52 72 -2.269090909090909 54 72 0.2206060606060606 1 73 0.2142857142857143 3 73 0.5555555555555556 4 73 0.4365079365079365 7 73 0.5600000000000001 9 73 1.928888888888889 10 73 -0.04444444444444445 14 73 0.2618181818181818 17 73 2.945454545454545 19 73 
1.636363636363636 20 73 -0.5579220779220779 29 73 2.08 32 73 5.2 34 73 -2.022222222222222 35 73 0.4168253968253968 50 73 6.618181818181818 53 73 -1.985454545454545 55 73 0.5147474747474747 56 73 -0.09191919191919191 1 75 0.2142857142857143 3 75 -0.07936507936507936 4 75 0.6746031746031746 5 75 0.3333333333333333 7 75 0.006666666666666667 9 75 -0.3955555555555555 10 75 0.7222222222222222 15 75 2.454545454545455 17 75 0.04909090909090909 19 75 -0.8555844155844156 20 75 -0.1675324675324675 30 75 8.666666666666666 32 75 0.1733333333333333 34 75 -0.7511111111111111 35 75 -0.9698412698412698 51 75 16.54545454545455 53 75 0.3309090909090909 55 75 0.5882828282828283 56 75 0.4595959595959596 2 76 0.2380952380952381 6 76 0.28 8 76 1.186666666666667 16 76 2.061818181818182 18 76 2.566753246753247 31 76 7.28 33 76 2.253333333333333 52 76 13.89818181818182 54 76 -1.764848484848485 1 77 0.2142857142857143 3 77 0.3174603174603174 4 77 0.6746031746031746 7 77 0.14 9 77 1.582222222222222 10 77 0.7222222222222222 17 77 1.030909090909091 19 77 3.422337662337662 20 77 -0.1675324675324675 32 77 3.64 34 77 3.004444444444444 35 77 -0.9698412698412698 53 77 6.949090909090909 55 77 -2.353131313131313 56 77 0.4595959595959596 2 79 0.2380952380952381 8 79 1.466666666666667 18 79 4.628571428571429 33 79 9.533333333333333 54 79 12.13333333333333 1 80 0.2142857142857143 3 80 0.1322751322751323 4 80 0.8597883597883598 9 80 0.8148148148148148 10 80 1.62962962962963 19 80 2.571428571428572 20 80 1.714285714285714 34 80 5.296296296296297 35 80 0.3783068783068783 55 80 6.74074074074074 56 80 -1.685185185185185 1 82 0.2142857142857143 4 82 0.9920634920634921 10 82 2.444444444444445 20 82 4.285714285714286 35 82 5.674603174603175 56 82 5.055555555555555 libxsmm-1.17/samples/generator/left_sparse_test_csr.mtx000066400000000000000000000356731415223013700235320ustar00rootroot00000000000000%%MatrixMarket matrix coordinate real general % 84 84 686 1 2 2 1 6 1 1 8 1 1 11 0.6 1 13 0.6 1 16 0.6 1 18 0.6 1 22 
0.4 1 24 0.4 1 26 0.4 1 28 0.4 1 31 0.4 1 33 0.4 1 36 0.2857142857142857 1 38 0.2857142857142857 1 40 0.2857142857142857 1 43 0.2857142857142857 1 45 0.2857142857142857 1 47 0.2857142857142857 1 49 0.2857142857142857 1 52 0.2857142857142857 1 54 0.2857142857142857 1 58 0.2142857142857143 1 60 0.2142857142857143 1 62 0.2142857142857143 1 64 0.2142857142857143 1 66 0.2142857142857143 1 68 0.2142857142857143 1 71 0.2142857142857143 1 73 0.2142857142857143 1 75 0.2142857142857143 1 77 0.2142857142857143 1 80 0.2142857142857143 1 82 0.2142857142857143 2 5 6 2 12 2 2 15 2 2 21 2.857142857142857 2 23 0.8571428571428571 2 27 0.8571428571428571 2 30 0.8571428571428571 2 37 1.428571428571429 2 39 0.4285714285714285 2 42 1.428571428571429 2 44 0.4285714285714285 2 48 0.4285714285714285 2 51 0.4285714285714285 2 57 1.666666666666667 2 59 0.7936507936507936 2 61 0.2380952380952381 2 65 0.7936507936507936 2 67 0.2380952380952381 2 70 0.7936507936507936 2 72 0.2380952380952381 2 76 0.2380952380952381 2 79 0.2380952380952381 3 6 3.333333333333333 3 11 -0.6666666666666666 3 13 2.666666666666667 3 16 1.111111111111111 3 22 0.5714285714285714 3 24 2 3 26 -0.2857142857142857 3 28 1.142857142857143 3 31 0.4761904761904762 3 36 -0.4761904761904762 3 38 0.8095238095238095 3 40 1.523809523809524 3 43 0.2857142857142857 3 45 1 3 47 -0.1428571428571428 3 49 0.5714285714285714 3 52 0.2380952380952381 3 58 0.07936507936507936 3 60 0.7936507936507936 3 62 1.19047619047619 3 64 -0.2645502645502645 3 66 0.4497354497354497 3 68 0.8465608465608465 3 71 0.1587301587301587 3 73 0.5555555555555556 3 75 -0.07936507936507936 3 77 0.3174603174603174 3 80 0.1322751322751323 4 6 -0.3333333333333333 4 11 -0.3333333333333333 4 13 -0.3333333333333333 4 16 1.222222222222222 4 18 2.333333333333333 4 22 -0.2857142857142857 4 24 -0.2857142857142857 4 26 0.5714285714285714 4 28 0.5714285714285714 4 31 1.238095238095238 4 33 1.714285714285714 4 36 -0.2380952380952381 4 38 -0.2380952380952381 4 40 
-0.2380952380952381 4 43 0.2857142857142857 4 45 0.2857142857142857 4 47 0.7142857142857143 4 49 0.7142857142857143 4 52 1.047619047619048 4 54 1.285714285714286 4 58 -0.1984126984126984 4 60 -0.1984126984126984 4 62 -0.1984126984126984 4 64 0.1455026455026455 4 66 0.1455026455026455 4 68 0.1455026455026455 4 71 0.4365079365079365 4 73 0.4365079365079365 4 75 0.6746031746031746 4 77 0.6746031746031746 4 80 0.8597883597883598 4 82 0.9920634920634921 4 8 3 5 11 10 5 22 2.5 5 26 2.5 5 36 5.833333333333333 5 38 0.8333333333333334 5 43 0.8333333333333334 5 47 0.8333333333333334 5 58 2.333333333333333 5 60 0.3333333333333333 5 64 2.333333333333333 5 66 0.3333333333333333 5 71 0.3333333333333333 5 75 0.3333333333333333 6 12 8.4 6 21 -1.5 6 23 4.8 6 27 2.1 6 37 3.166666666666667 6 39 2.7 6 42 -0.5 6 44 1.6 6 48 0.7 6 57 -1.4 6 59 3 6 61 1.6 6 65 1.266666666666667 6 67 1.08 6 70 -0.2 6 72 0.64 6 76 0.28 7 11 0.2 7 13 4.2 7 22 -1.3 7 24 4.2 7 26 0.05 7 28 1.05 7 36 0.3333333333333333 7 38 -0.06666666666666667 7 40 3.6 7 43 -0.4333333333333333 7 45 1.4 7 47 0.01666666666666667 7 49 0.35 7 58 -0.7333333333333333 7 60 0.6666666666666666 7 62 3 7 64 0.1333333333333333 7 66 -0.02666666666666667 7 68 1.44 7 71 -0.1733333333333333 7 73 0.5600000000000001 7 75 0.006666666666666667 7 77 0.14 8 12 -0.4 8 15 8 8 21 -1 8 23 -0.3 8 27 2.4 8 30 4.5 8 37 -0.6666666666666666 8 39 -0.2 8 42 3 8 44 0.9 8 48 1.8 8 51 2.5 8 57 -0.9333333333333333 8 59 -0.4444444444444444 8 61 -0.1333333333333333 8 65 1.288888888888889 8 67 0.3866666666666667 8 70 2.755555555555556 8 72 0.8266666666666667 8 76 1.186666666666667 8 79 1.466666666666667 9 11 0.1333333333333333 9 13 -0.5333333333333333 9 16 4.444444444444445 9 22 -0.2 9 24 -0.7 9 26 -0.8 9 28 3.2 9 31 2.5 9 36 0.2222222222222222 9 38 -0.3777777777777778 9 40 -0.7111111111111111 9 43 0.6 9 45 2.1 9 47 -0.6 9 49 2.4 9 52 1.388888888888889 9 58 -0.04444444444444445 9 60 -0.4444444444444444 9 62 -0.6666666666666666 9 64 -0.4296296296296296 9 66 
0.7303703703703703 9 68 1.374814814814815 9 71 0.5511111111111111 9 73 1.928888888888889 9 75 -0.3955555555555555 9 77 1.582222222222222 9 80 0.8148148148148148 10 11 0.06666666666666667 10 13 0.06666666666666667 10 16 -0.7111111111111111 10 18 3.733333333333333 10 22 0.1 10 24 0.1 10 26 -0.65 10 28 -0.65 10 31 1.1 10 33 3.6 10 36 0.1111111111111111 10 38 0.1111111111111111 10 40 0.1111111111111111 10 43 -0.5 10 45 -0.5 10 47 0.25 10 49 0.25 10 52 1.611111111111111 10 54 3 10 58 0.1111111111111111 10 60 0.1111111111111111 10 62 0.1111111111111111 10 64 -0.3703703703703703 10 66 -0.3703703703703703 10 68 -0.3703703703703703 10 71 -0.04444444444444445 10 73 -0.04444444444444445 10 75 0.7222222222222222 10 77 0.7222222222222222 10 80 1.62962962962963 10 82 2.444444444444445 11 21 14 11 37 2.8 11 42 2.8 11 57 9.163636363636364 11 59 0.7636363636363637 11 65 0.7636363636363637 11 70 0.7636363636363637 12 22 12.85714285714286 12 36 -2 12 38 5.714285714285714 12 43 2.571428571428572 12 58 6.545454545454546 12 60 2.571428571428572 12 64 -0.5454545454545454 12 66 1.558441558441559 12 71 0.7012987012987013 13 21 0.2857142857142857 13 23 10.28571428571429 13 37 -3.085714285714286 13 39 7.714285714285714 13 42 0.05714285714285714 13 44 2.057142857142857 13 57 0.6545454545454545 13 59 2.197402597402597 13 61 5.142857142857143 13 65 -0.8415584415584415 13 67 2.103896103896104 13 70 0.01558441558441558 13 72 0.561038961038961 14 22 0.5142857142857142 14 24 4.8 14 36 -0.1142857142857143 14 38 -1.657142857142857 14 40 5.485714285714286 14 43 0.1028571428571429 14 45 0.96 14 58 0.7792207792207793 14 60 -0.8571428571428571 14 62 5.142857142857143 14 64 -0.03116883116883117 14 66 -0.4519480519480519 14 68 1.496103896103896 14 71 0.02805194805194805 14 73 0.2618181818181818 15 22 -0.3571428571428572 15 26 12.5 15 36 -1.5 15 38 -0.2142857142857143 15 43 2.928571428571428 15 47 5.5 15 58 -0.8181818181818182 15 60 -0.1168831168831169 15 64 6.272727272727272 15 66 0.8961038961038961 15 71 
1.753246753246753 15 75 2.454545454545455 16 21 0.2142857142857143 16 23 -0.6857142857142857 16 27 10.5 16 37 -0.8142857142857143 16 39 -0.6942857142857143 16 42 -1.757142857142857 16 44 5.622857142857143 16 48 4.62 16 57 0.4909090909090909 16 59 -1.051948051948052 16 61 -0.561038961038961 16 65 3.405194805194805 16 67 2.903376623376623 16 70 -1.051948051948052 16 72 3.366233766233766 16 76 2.061818181818182 17 22 0.1857142857142857 17 24 -0.6 17 26 0.25 17 28 5.25 17 36 -0.08571428571428572 17 38 0.01714285714285714 17 40 -0.9257142857142857 17 43 -1.522857142857143 17 45 4.92 17 47 0.11 17 49 2.31 17 58 0.2571428571428571 17 60 -0.2337662337662338 17 62 -1.051948051948052 17 64 0.3584415584415584 17 66 -0.07168831168831169 17 68 3.871168831168831 17 71 -0.9116883116883117 17 73 2.945454545454545 17 75 0.04909090909090909 17 77 1.030909090909091 18 21 0.1428571428571428 18 23 0.04285714285714286 18 27 -0.8571428571428571 18 30 9.642857142857142 18 37 0.1714285714285714 18 39 0.05142857142857143 18 42 -2.028571428571428 18 44 -0.6085714285714285 18 48 2.451428571428572 18 51 7.071428571428571 18 57 0.3272727272727273 18 59 0.1558441558441558 18 61 0.04675324675324675 18 65 -1.262337662337662 18 67 -0.3787012987012987 18 70 2.337662337662338 18 72 0.7012987012987013 18 76 2.566753246753247 18 79 4.628571428571429 19 22 0.02857142857142857 19 24 0.1 19 26 0.2857142857142857 19 28 -1.142857142857143 19 31 5.357142857142857 19 36 -0.05714285714285714 19 38 0.09714285714285714 19 40 0.1828571428571429 19 43 -0.4057142857142857 19 45 -1.42 19 47 -0.8171428571428572 19 49 3.268571428571429 19 52 3.928571428571428 19 58 0.01558441558441558 19 60 0.1558441558441558 19 62 0.2337662337662338 19 64 0.4207792207792208 19 66 -0.7153246753246754 19 68 -1.346493506493506 19 71 0.4675324675324675 19 73 1.636363636363636 19 75 -0.8555844155844156 19 77 3.422337662337662 19 80 2.571428571428572 20 22 -0.01428571428571429 20 24 -0.01428571428571429 20 26 0.1785714285714286 20 28 
0.1785714285714286 20 31 -1.071428571428571 20 33 4.285714285714286 20 36 -0.02857142857142857 20 38 -0.02857142857142857 20 40 -0.02857142857142857 20 43 0.2542857142857143 20 45 0.2542857142857143 20 47 -0.8642857142857143 20 49 -0.8642857142857143 20 52 0.7857142857142857 20 54 4.714285714285714 20 58 -0.03896103896103896 20 60 -0.03896103896103896 20 62 -0.03896103896103896 20 64 0.2649350649350649 20 66 0.2649350649350649 20 68 0.2649350649350649 20 71 -0.5579220779220779 20 73 -0.5579220779220779 20 75 -0.1675324675324675 20 77 -0.1675324675324675 20 80 1.714285714285714 20 82 4.285714285714286 21 36 18 21 58 3 21 64 3 22 37 17.11111111111111 22 57 -2.333333333333333 22 59 6.222222222222222 22 65 2.851851851851852 23 36 0.2777777777777778 23 38 15.27777777777778 23 58 -4.166666666666667 23 60 9.166666666666666 23 64 0.04629629629629629 23 66 2.546296296296296 24 37 0.7857142857142857 24 39 11.78571428571429 24 57 -0.1666666666666667 24 59 -4.357142857142857 24 61 10.47619047619048 24 65 0.130952380952381 24 67 1.964285714285714 25 36 0.01587301587301587 25 38 0.873015873015873 25 40 5.238095238095238 25 58 -0.3412698412698413 25 60 -1.746031746031746 25 62 6.547619047619047 25 64 0.002645502645502645 25 66 0.1455026455026455 25 68 0.873015873015873 26 37 -0.3111111111111111 26 42 16.8 26 57 -1.866666666666667 26 59 -0.1555555555555556 26 65 3.214814814814815 26 70 6.066666666666666 27 36 0.2222222222222222 27 38 -0.6349206349206349 27 43 15.42857142857143 27 58 -1.333333333333333 27 60 -0.5238095238095238 27 64 -2.296296296296296 27 66 6.560846560846561 27 71 5.571428571428571 28 37 0.3428571428571429 28 39 -0.8571428571428571 28 42 0.3428571428571429 28 44 12.34285714285714 28 57 -0.1333333333333333 28 59 -0.4476190476190476 28 61 -1.047619047619048 28 65 -3.542857142857143 28 67 8.857142857142858 28 70 0.1238095238095238 28 72 4.457142857142857 29 36 0.0126984126984127 29 38 0.1841269841269841 29 40 -0.6095238095238096 29 43 0.6171428571428571 29 45 5.76 29 
58 -0.1587301587301587 29 60 0.1746031746031746 29 62 -1.047619047619048 29 64 -0.1312169312169312 29 66 -1.902645502645503 29 68 6.298412698412698 29 71 0.2228571428571429 29 73 2.08 30 36 0.1666666666666667 30 38 0.02380952380952381 30 43 -0.7619047619047619 30 47 14.66666666666667 30 58 0.1666666666666667 30 60 0.02380952380952381 30 64 -3.083333333333333 30 66 -0.4404761904761905 30 71 3.095238095238095 30 75 8.666666666666666 31 37 0.09047619047619047 31 39 0.07714285714285714 31 42 0.4571428571428571 31 44 -1.462857142857143 31 48 12.32 31 57 -0.1 31 59 0.2142857142857143 31 61 0.1142857142857143 31 65 -1.673809523809524 31 67 -1.427142857142857 31 70 -1.857142857142857 31 72 5.942857142857143 31 76 7.28 32 36 0.009523809523809525 32 38 -0.001904761904761905 32 40 0.1028571428571429 32 43 0.3961904761904762 32 45 -1.28 32 47 0.2933333333333333 32 49 6.16 32 58 -0.05238095238095238 32 60 0.04761904761904762 32 62 0.2142857142857143 32 64 -0.1761904761904762 32 66 0.03523809523809524 32 68 -1.902857142857143 32 71 -1.60952380952381 32 73 5.2 32 75 0.1733333333333333 32 77 3.64 33 37 -0.01904761904761905 33 39 -0.005714285714285714 33 42 0.4 33 44 0.12 33 48 -1.32 33 51 11 33 57 -0.06666666666666667 33 59 -0.03174603174603174 33 61 -0.009523809523809525 33 65 0.4634920634920635 33 67 0.1390476190476191 33 70 -2.888888888888889 33 72 -0.8666666666666667 33 76 2.253333333333333 33 79 9.533333333333333 34 36 0.006349206349206349 34 38 -0.01079365079365079 34 40 -0.02031746031746032 34 43 0.08 34 45 0.28 34 47 0.44 34 49 -1.76 34 52 6.111111111111111 34 58 -0.003174603174603175 34 60 -0.03174603174603174 34 62 -0.04761904761904762 34 64 -0.1544973544973545 34 66 0.2626455026455026 34 68 0.4943915343915344 34 71 -0.5777777777777777 34 73 -2.022222222222222 34 75 -0.7511111111111111 34 77 3.004444444444444 34 80 5.296296296296297 35 36 0.003174603174603175 35 38 0.003174603174603175 35 40 0.003174603174603175 35 43 -0.04571428571428571 35 45 -0.04571428571428571 35 47 
0.3142857142857143 35 49 0.3142857142857143 35 52 -1.396825396825397 35 54 4.714285714285714 35 58 0.007936507936507936 35 60 0.007936507936507936 35 62 0.007936507936507936 35 64 -0.08835978835978836 35 66 -0.08835978835978836 35 68 -0.08835978835978836 35 71 0.4168253968253968 35 73 0.4168253968253968 35 75 -0.9698412698412698 35 77 -0.9698412698412698 35 80 0.3783068783068783 35 82 5.674603174603175 36 57 22 37 58 21.27272727272727 38 57 0.2545454545454545 38 59 19.85454545454546 39 58 0.7878787878787878 39 60 17.33333333333333 40 57 0.01818181818181818 40 59 1.418181818181818 40 61 13 41 58 0.05627705627705628 41 60 1.238095238095238 41 62 5.571428571428571 42 58 -0.2727272727272727 42 64 21 43 57 0.2121212121212121 43 59 -0.5656565656565656 43 65 19.96296296296296 44 58 0.3787878787878788 44 60 -0.8333333333333334 44 64 0.3240740740740741 44 66 17.82407407407407 45 57 0.01515151515151515 45 59 0.3961038961038961 45 61 -0.9523809523809523 45 65 0.9166666666666666 45 67 13.75 46 58 0.03102453102453102 46 60 0.1587301587301587 46 62 -0.5952380952380952 46 64 0.01851851851851852 46 66 1.018518518518519 46 68 6.111111111111111 47 57 0.1696969696969697 47 59 0.01414141414141414 47 65 -0.6599326599326599 47 70 19.3030303030303 48 58 0.1212121212121212 48 60 0.04761904761904762 48 64 0.4713804713804714 48 66 -1.346801346801347 48 71 17.72727272727273 49 57 0.01212121212121212 49 59 0.04069264069264069 49 61 0.09523809523809523 49 65 0.7272727272727273 49 67 -1.818181818181818 49 70 0.3939393939393939 49 72 14.18181818181818 50 58 0.01443001443001443 50 60 -0.01587301587301587 50 62 0.09523809523809523 50 64 0.02693602693602693 50 66 0.3905723905723906 50 68 -1.292929292929293 50 71 0.7090909090909091 50 73 6.618181818181818 51 58 -0.01515151515151515 51 60 -0.002164502164502165 51 64 0.4772727272727273 51 66 0.06818181818181818 51 71 -1.181818181818182 51 75 16.54545454545455 52 57 0.00909090909090909 52 59 -0.01948051948051948 52 61 -0.01038961038961039 52 65 
0.2590909090909091 52 67 0.2209090909090909 52 70 0.7090909090909091 52 72 -2.269090909090909 52 76 13.89818181818182 53 58 0.004761904761904762 53 60 -0.004329004329004329 53 62 -0.01948051948051948 53 64 0.02727272727272727 53 66 -0.005454545454545455 53 68 0.2945454545454546 53 71 0.6145454545454545 53 73 -1.985454545454545 53 75 0.3309090909090909 53 77 6.949090909090909 54 57 0.006060606060606061 54 59 0.002886002886002886 54 61 0.0008658008658008658 54 65 -0.06464646464646465 54 67 -0.01939393939393939 54 70 0.7353535353535353 54 72 0.2206060606060606 54 76 -1.764848484848485 54 79 12.13333333333333 55 58 0.0002886002886002886 55 60 0.002886002886002886 55 62 0.004329004329004329 55 64 0.02154882154882155 55 66 -0.03663299663299664 55 68 -0.06895622895622895 55 71 0.1470707070707071 55 73 0.5147474747474747 55 75 0.5882828282828283 55 77 -2.353131313131313 55 80 6.74074074074074 56 58 -0.0007215007215007215 56 60 -0.0007215007215007215 56 62 -0.0007215007215007215 56 64 0.01178451178451178 56 66 0.01178451178451178 56 68 0.01178451178451178 56 71 -0.09191919191919191 56 73 -0.09191919191919191 56 75 0.4595959595959596 56 77 0.4595959595959596 56 80 -1.685185185185185 56 82 5.055555555555555 libxsmm-1.17/samples/generator/right_sparse_test_csc.mtx000066400000000000000000000004031415223013700236550ustar00rootroot00000000000000%%MatrixMarket matrix coordinate real general % 9 9 24 7 1 -2.01 8 1 -0.01 9 1 -0.01 7 2 -0.01 8 2 -2.01 9 2 -0.01 7 3 -0.01 8 3 -0.01 9 3 -2.01 7 4 -1 8 4 -1 8 5 -1 9 5 -1 7 6 -1 9 6 -1 1 7 -10 4 7 -10 6 7 -10 2 8 -10 4 8 -10 5 8 -10 3 9 -10 5 9 -10 6 9 -10 libxsmm-1.17/samples/generator/test_xGEMM.sh000077500000000000000000000213151415223013700210620ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### MAKE=${MAKE:-make} M="4 10 20 35 56 84" M="4 12 20 36 56 84" #M="4 8 12 16 20 24 27 28 29 30 31 36 40 48 56 84" #M="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100" #M="8 16 24 40 56 88" #M="56" cd ./../../ #${MAKE} realclean ${MAKE} generator #exit if compiler fails rc=$?; if [ $rc != 0 ]; then exit $rc; fi cd ./samples/generator/ # select architecture for validation ARCH=hsw PREC=DP if [ $# -eq 2 ] then ARCH=$1 PREC=$2 fi # build also assembly variant ASM=1 # set SDE to run AVX512 code SDE_KNL="sde64 -knl -mix -- " # select precision if [ "${PREC}" == 'DP' ]; then DATATYPE="double" elif [ "${PREC}" == 'SP' ]; then DATATYPE="float" fi N="9" #N="1 2 3 4 5 6 7 8 9 10 30 31 60 62" for m in ${M} do for n in ${N} do K="${m} ${N}" # K="${m}" # K="1 2 3 4 5 6 7 8 9 10 30 31 60 62" for k in ${K} do lda=$m ldb=$k ldc=$m rm -rf kernel_${m}_${n}_${k}_${PREC}.* rm -rf xgemm_${m}_${n}_${k}_${PREC} ./../../bin/libxsmm_gemm_generator dense kernel_${m}_${n}_${k}_${PREC}.h dense_test_mul $m $n $k $lda $ldb $ldc 1 1 1 1 ${ARCH} nopf ${PREC} if [ $ASM -eq 1 ] then ./../../bin/libxsmm_gemm_generator dense_asm kernel_${m}_${n}_${k}_${PREC}.s dense_test_mul $m $n $k $lda $ldb $ldc 1 1 1 1 ${ARCH} nopf ${PREC} fi if [ "${ARCH}" == 'wsm' ]; then icc -O2 -msse3 -ansi-alias -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DGEMM_HEADER=\"kernel_${m}_${n}_${k}_${PREC}.h\" validation.c -o xgemm_${m}_${n}_${k}_${PREC}_icc # gcc -O2 -msse3 -fstrict-aliasing -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb 
-DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DGEMM_HEADER=\"kernel_${m}_${n}_${k}_${PREC}.h\" validation.c -o xgemm_${m}_${n}_${k}_${PREC}_gcc ./xgemm_${m}_${n}_${k}_${PREC}_icc # ./xgemm_${m}_${n}_${k}_${PREC}_gcc if [ $ASM -eq 1 ] then as kernel_${m}_${n}_${k}_${PREC}.s -o kernel_${m}_${n}_${k}_${PREC}.o icc -O2 -msse3 -ansi-alias -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DUSE_ASM_DIRECT validation.c kernel_${m}_${n}_${k}_${PREC}.o -o xgemm_${m}_${n}_${k}_${PREC}_asm_icc # gcc -O2 -msse3 -fstrict-aliasing -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DUSE_ASM_DIRECT validation.c kernel_${m}_${n}_${k}_${PREC}.o -o xgemm_${m}_${n}_${k}_${PREC}_asm_gcc ./xgemm_${m}_${n}_${k}_${PREC}_asm_icc # ./xgemm_${m}_${n}_${k}_${PREC}_asm_gcc fi elif [ "${ARCH}" == 'snb' ]; then icc -O3 -mavx -ansi-alias -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DGEMM_HEADER=\"kernel_${m}_${n}_${k}_${PREC}.h\" validation.c -o xgemm_${m}_${n}_${k}_${PREC}_icc # gcc -O3 -mavx -fstrict-aliasing -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DGEMM_HEADER=\"kernel_${m}_${n}_${k}_${PREC}.h\" validation.c -o xgemm_${m}_${n}_${k}_${PREC}_gcc ./xgemm_${m}_${n}_${k}_${PREC}_icc # ./xgemm_${m}_${n}_${k}_${PREC}_gcc if [ $ASM -eq 1 ] then as kernel_${m}_${n}_${k}_${PREC}.s -o kernel_${m}_${n}_${k}_${PREC}.o icc -O2 -ansi-alias -mavx -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DUSE_ASM_DIRECT validation.c kernel_${m}_${n}_${k}_${PREC}.o -o xgemm_${m}_${n}_${k}_${PREC}_asm_icc # gcc -O2 -fstrict-aliasing -mavx -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DUSE_ASM_DIRECT validation.c kernel_${m}_${n}_${k}_${PREC}.o -o xgemm_${m}_${n}_${k}_${PREC}_asm_gcc 
./xgemm_${m}_${n}_${k}_${PREC}_asm_icc # ./xgemm_${m}_${n}_${k}_${PREC}_asm_gcc fi elif [ "${ARCH}" == 'hsw' ]; then #icc -O3 -xCORE_AVX2 -fma -D__USE_MKL -mkl=sequential -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DGEMM_HEADER=\"kernel_${m}_${n}_${k}_${PREC}.h\" validation.c -o xgemm_${m}_${n}_${k}_${PREC}_icc icc -O2 -ansi-alias -xCORE_AVX2 -fma -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DGEMM_HEADER=\"kernel_${m}_${n}_${k}_${PREC}.h\" validation.c -o xgemm_${m}_${n}_${k}_${PREC}_icc # gcc -O2 -fstrict-aliasing -mavx2 -mfma -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DGEMM_HEADER=\"kernel_${m}_${n}_${k}_${PREC}.h\" validation.c -o xgemm_${m}_${n}_${k}_${PREC}_gcc ./xgemm_${m}_${n}_${k}_${PREC}_icc # ./xgemm_${m}_${n}_${k}_${PREC}_gcc if [ $ASM -eq 1 ] then as kernel_${m}_${n}_${k}_${PREC}.s -o kernel_${m}_${n}_${k}_${PREC}.o icc -O2 -ansi-alias -mavx -D__AVX2__ -fma -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DUSE_ASM_DIRECT validation.c kernel_${m}_${n}_${k}_${PREC}.o -o xgemm_${m}_${n}_${k}_${PREC}_asm_icc # gcc -O2 -fstrict-aliasing -mavx2 -D__AVX2__ -mfma -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DUSE_ASM_DIRECT validation.c kernel_${m}_${n}_${k}_${PREC}.o -o xgemm_${m}_${n}_${k}_${PREC}_asm_gcc ./xgemm_${m}_${n}_${k}_${PREC}_asm_icc # ./xgemm_${m}_${n}_${k}_${PREC}_asm_gcc fi elif [ "${ARCH}" == 'knl' ]; then #icc -O2 -ansi-alias -D__USE_MKL -mkl=sequential -xCOMMON-AVX512 -fma -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DGEMM_HEADER=\"kernel_${m}_${n}_${k}_${PREC}.h\" validation.c -o xgemm_${m}_${n}_${k}_${PREC}_icc icc -O2 -ansi-alias -xCOMMON-AVX512 -fma -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k 
-DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DGEMM_HEADER=\"kernel_${m}_${n}_${k}_${PREC}.h\" validation.c -o xgemm_${m}_${n}_${k}_${PREC}_icc # gcc -O2 -fstrict-aliasing -mavx512f -mfma -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DGEMM_HEADER=\"kernel_${m}_${n}_${k}_${PREC}.h\" validation.c -o xgemm_${m}_${n}_${k}_${PREC}_gcc ${SDE_KNL} ./xgemm_${m}_${n}_${k}_${PREC}_icc # ${SDE_KNL} ./xgemm_${m}_${n}_${k}_${PREC}_gcc if [ $ASM -eq 1 ] then as kernel_${m}_${n}_${k}_${PREC}.s -o kernel_${m}_${n}_${k}_${PREC}.o icc -O2 -ansi-alias -xCOMMON_AVX512 -fma -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DUSE_ASM_DIRECT validation.c kernel_${m}_${n}_${k}_${PREC}.o -o xgemm_${m}_${n}_${k}_${PREC}_asm_icc # gcc -O2 -fstrict-aliasing -mavx512f -mfma -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DUSE_ASM_DIRECT validation.c kernel_${m}_${n}_${k}_${PREC}.o -o xgemm_${m}_${n}_${k}_${PREC}_asm_gcc ${SDE_KNL} ./xgemm_${m}_${n}_${k}_${PREC}_asm_icc # ${SDE_KNL} ./xgemm_${m}_${n}_${k}_${PREC}_asm_gcc fi elif [ "${ARCH}" == 'noarch' ]; then icc -O2 -ansi-alias -xHOST -fma -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DGEMM_HEADER=\"kernel_${m}_${n}_${k}_${PREC}.h\" validation.c -o xgemm_${m}_${n}_${k}_${PREC}_icc # gcc -O2 -ansi-alias -DNDEBUG -DMY_M=$m -DMY_N=$n -DMY_K=$k -DMY_LDA=$lda -DMY_LDB=$ldb -DMY_LDC=$ldc -DREALTYPE=${DATATYPE} -DGEMM_HEADER=\"kernel_${m}_${n}_${k}_${PREC}.h\" validation.c -o xgemm_${m}_${n}_${k}_${PREC}_gcc ./xgemm_${m}_${n}_${k}_${PREC}_icc # ./xgemm_${m}_${n}_${k}_${PREC}_gcc else echo "unsupported architecture!" 
fi done done done libxsmm-1.17/samples/generator/validation.c000066400000000000000000000227141415223013700210510ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include #include #ifdef __USE_MKL #define MKL_DIRECT_CALL_SEQ #include #endif /*#define STREAM_A_B*/ #ifdef STREAM_A_B #define STREAM_A_B_SIZE 1000 #define STREAM_A_B_PREFETCH #endif #ifdef USE_ASM_DIRECT void dense_test_mul(const REALTYPE* a, const REALTYPE* b, REALTYPE* c); #else #include GEMM_HEADER #endif #ifndef MY_M #define MY_M 20 #endif #ifndef MY_N #define MY_N 9 #endif #ifndef MY_K #define MY_K MY_N #endif #ifndef MY_LDA #define MY_LDA MY_M #endif #ifndef MY_LDB #define MY_LDB MY_K #endif #ifndef MY_LDC #define MY_LDC MY_M #endif #define REPS 100000 /*#define REPS 1*/ static double sec(struct timeval start, struct timeval end) { return ((double)(((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)))) / 1.0e6; } void run_test(void) { /* allocate */ #ifdef STREAM_A_B REALTYPE* l_a = (REALTYPE*)_mm_malloc(MY_LDA * MY_K * sizeof(REALTYPE) * STREAM_A_B_SIZE, 64); REALTYPE* l_b = (REALTYPE*)_mm_malloc(MY_LDB * MY_N * sizeof(REALTYPE) * STREAM_A_B_SIZE, 64); unsigned int l_s; #else REALTYPE* l_a = (REALTYPE*)_mm_malloc(MY_LDA * MY_K * sizeof(REALTYPE), 64); REALTYPE* l_b = (REALTYPE*)_mm_malloc(MY_LDB * MY_N * sizeof(REALTYPE), 64); #endif REALTYPE* l_c = (REALTYPE*)_mm_malloc(MY_LDC * MY_N * sizeof(REALTYPE), 64); REALTYPE* l_c_gold = 
(REALTYPE*)_mm_malloc(MY_LDC * MY_N * sizeof(REALTYPE), 64); REALTYPE l_max_error = 0.0; unsigned int l_i; unsigned int l_j; unsigned int l_t; unsigned int l_m; unsigned int l_n; unsigned int l_k; struct timeval l_start, l_end; double l_total; #ifdef STREAM_A_B for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) { REALTYPE* l_p_a = l_a + (l_s * MY_K * MY_LDA); #else REALTYPE* l_p_a = l_a; #endif /* touch A */ for ( l_i = 0; l_i < MY_LDA; l_i++) { for ( l_j = 0; l_j < MY_K; l_j++) { #if REPS==1 l_p_a[(l_j * MY_LDA) + l_i] = (REALTYPE)libxsmm_rng_f64(); #else l_p_a[(l_j * MY_LDA) + l_i] = (REALTYPE)(l_i + (l_j * MY_M)); #endif } } #ifdef STREAM_A_B } #endif #ifdef STREAM_A_B for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) { REALTYPE* l_p_b = l_b + (l_s * MY_N * MY_LDB); #else { REALTYPE* l_p_b = l_b; #endif /* touch B */ for ( l_i = 0; l_i < MY_LDB; l_i++ ) { for ( l_j = 0; l_j < MY_N; l_j++ ) { #if REPS==1 l_p_b[(l_j * MY_LDB) + l_i] = (REALTYPE)libxsmm_rng_f64(); #else l_p_b[(l_j * MY_LDB) + l_i] = (REALTYPE)(l_i + (l_j * MY_K)); #endif } } } #ifdef STREAM_A_B } #endif /* touch C */ for ( l_i = 0; l_i < MY_LDC; l_i++) { for ( l_j = 0; l_j < MY_N; l_j++) { l_c[(l_j * MY_LDC) + l_i] = (REALTYPE)0.0; l_c_gold[(l_j * MY_LDC) + l_i] = (REALTYPE)0.0; } } #ifdef __USE_MKL { char l_trans = 'N'; int l_M = MY_M; int l_N = MY_N; int l_K = MY_K; int l_lda = MY_LDA; int l_ldb = MY_LDB; int l_ldc = MY_LDC; if (sizeof(REALTYPE) == sizeof(double)) { double l_one = 1.0; dgemm(&l_trans, &l_trans, &l_M, &l_N, &l_K, &l_one, (double*)l_a, &l_lda, (double*)l_b, &l_ldb, &l_one, (double*)l_c_gold, &l_ldc); } else { float l_one = 1.0f; sgemm(&l_trans, &l_trans, &l_M, &l_N, &l_K, &l_one, (float*)l_a, &l_lda, (float*)l_b, &l_ldb, &l_one, (float*)l_c_gold, &l_ldc); } } /* touch C */ for ( l_i = 0; l_i < MY_LDC; l_i++) { for ( l_j = 0; l_j < MY_N; l_j++) { l_c[(l_j * MY_LDC) + l_i] = (REALTYPE)0.0; l_c_gold[(l_j * MY_LDC) + l_i] = (REALTYPE)0.0; } } #endif /* C routine */ gettimeofday(&l_start, NULL); 
#ifndef __USE_MKL #pragma nounroll_and_jam for ( l_t = 0; l_t < REPS; l_t++ ) { #ifdef STREAM_A_B REALTYPE* l_p_a = l_a - (MY_K * MY_LDA); REALTYPE* l_p_b = l_b - (MY_N * MY_LDB); for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) { l_p_a += (MY_K * MY_LDA); l_p_b += (MY_N * MY_LDB); #else REALTYPE* l_p_a = l_a; REALTYPE* l_p_b = l_b; #endif for ( l_n = 0; l_n < MY_N; l_n++ ) { for ( l_k = 0; l_k < MY_K; l_k++ ) { #pragma vector always for ( l_m = 0; l_m < MY_M; l_m++ ) { l_c_gold[(l_n * MY_LDC) + l_m] += l_p_a[(l_k * MY_LDA) + l_m] * l_p_b[(l_n * MY_LDB) + l_k]; } } } #ifdef STREAM_A_B } #endif } #else char l_trans = 'N'; int l_M = MY_M; int l_N = MY_N; int l_K = MY_K; int l_lda = MY_LDA; int l_ldb = MY_LDB; int l_ldc = MY_LDC; if (sizeof(REALTYPE) == sizeof(double)) { double l_one = 1.0; for ( l_t = 0; l_t < REPS; l_t++ ) { #ifdef STREAM_A_B REALTYPE* l_p_a = l_a - (MY_K * MY_LDA); REALTYPE* l_p_b = l_b - (MY_N * MY_LDB); for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) { l_p_a += (MY_K * MY_LDA); l_p_b += (MY_N * MY_LDB); #else REALTYPE* l_p_a = l_a; REALTYPE* l_p_b = l_b; #endif dgemm(&l_trans, &l_trans, &l_M, &l_N, &l_K, &l_one, (double*)l_p_a, &l_lda, (double*)l_p_b, &l_ldb, &l_one, (double*)l_c_gold, &l_ldc); #ifdef STREAM_A_B } #endif } } else { float l_one = 1.0f; for ( l_t = 0; l_t < REPS; l_t++ ) { #ifdef STREAM_A_B REALTYPE* l_p_a = l_a - (MY_K * MY_LDA); REALTYPE* l_p_b = l_b - (MY_N * MY_LDB); for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) { l_p_a += (MY_K * MY_LDA); l_p_b += (MY_N * MY_LDB); #else REALTYPE* l_p_a = l_a; REALTYPE* l_p_b = l_b; #endif sgemm(&l_trans, &l_trans, &l_M, &l_N, &l_K, &l_one, (float*)l_p_a, &l_lda, (float*)l_p_b, &l_ldb, &l_one, (float*)l_c_gold, &l_ldc); #ifdef STREAM_A_B } #endif } } #endif gettimeofday(&l_end, NULL); l_total = sec(l_start, l_end); #ifndef __USE_MKL printf("%fs for C\n", l_total); #ifdef STREAM_A_B printf("%f GFLOPS for C\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0 * 
((double)STREAM_A_B_SIZE)) / (l_total * 1.0e9)); #else printf("%f GFLOPS for C\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0) / (l_total * 1.0e9)); #endif #else printf("%fs for MKL\n", l_total); #ifdef STREAM_A_B printf("%f GFLOPS for MKL\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0 * ((double)STREAM_A_B_SIZE)) / (l_total * 1.0e9)); #else printf("%f GFLOPS for MKL\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0) / (l_total * 1.0e9)); #endif #endif gettimeofday(&l_start, NULL); libxsmm_timer_tickint l_cyc_start = libxsmm_timer_cycles(); for ( l_t = 0; l_t < REPS; l_t++ ) { #ifdef STREAM_A_B REALTYPE* l_p_a = l_a - (MY_K * MY_LDA); REALTYPE* l_p_b = l_b - (MY_N * MY_LDB); for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) { l_p_a += (MY_K * MY_LDA); l_p_b += (MY_N * MY_LDB); #else REALTYPE* l_p_a = l_a; REALTYPE* l_p_b = l_b; #endif #ifdef STREAM_A_B_PREFETCH dense_test_mul(l_p_a, l_p_b, l_c, l_p_a + (MY_K * MY_LDA), l_p_b + (MY_N * MY_LDB), NULL); #else dense_test_mul(l_p_a, l_p_b, l_c); #endif #ifdef STREAM_A_B } #endif } libxsmm_timer_tickint l_cyc_end = libxsmm_timer_cycles(); gettimeofday(&l_end, NULL); l_total = sec(l_start, l_end); printf("%fs for assembly\n", l_total); #ifdef STREAM_A_B printf("%f GFLOPS for assembly\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0 * ((double)STREAM_A_B_SIZE)) / (l_total * 1.0e9)); #else printf("%f GFLOPS for assembly\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0) / (l_total * 1.0e9)); printf("%f FLOPS/cycle for assembly (using libxsmm_timer_cycles())\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0) / ((double)(l_cyc_end - l_cyc_start))); #endif /* check result */ for ( l_i = 0; l_i < MY_M; l_i++) { for ( l_j = 0; l_j < MY_N; l_j++) { #if 0 printf("Entries in row %i, column %i, gold: %f, assembly: %f\n", l_i+1, l_j+1, 
l_c_gold[(l_j*MY_M)+l_i], l_c[(l_j*MY_M)+l_i]); #endif if (l_max_error < fabs( l_c_gold[(l_j * MY_LDC) + l_i] - l_c[(l_j * MY_LDC) + l_i])) l_max_error = fabs( l_c_gold[(l_j * MY_LDC) + l_i] - l_c[(l_j * MY_LDC) + l_i]); } } printf("max. error: %f\n", l_max_error); /* free */ _mm_free(l_a); _mm_free(l_b); _mm_free(l_c); _mm_free(l_c_gold); } int main(int argc, char* argv[]) { printf("------------------------------------------------\n"); printf("RUNNING (%ix%i) X (%ix%i) = (%ix%i)", MY_M, MY_K, MY_K, MY_N, MY_M, MY_N); #ifdef STREAM_A_B printf(", STREAM_A_B"); #endif if (sizeof(REALTYPE) == sizeof(double)) { printf(", DP\n"); } else { printf(", SP\n"); } printf("------------------------------------------------\n"); run_test(); printf("------------------------------------------------\n"); return 0; } libxsmm-1.17/samples/hello/000077500000000000000000000000001415223013700156625ustar00rootroot00000000000000libxsmm-1.17/samples/hello/BUILD000066400000000000000000000001461415223013700164450ustar00rootroot00000000000000cc_binary( name = "hello", srcs = ["hello.cpp"], deps = [ "@xsmm//:xsmm" ], ) libxsmm-1.17/samples/hello/Makefile000066400000000000000000000107161415223013700173270ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . 
CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 2 OMP = 0 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) $(OUTDIR)/$(OUTNAME)c $(OUTDIR)/$(OUTNAME)f .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (,$(strip $(FC))) ifneq (0,$(shell echo "$$((3>$(XSMM_GCC) || 40600<=$(FC_VERSION_NUM)))")) 
$(OUTDIR)/$(OUTNAME)f: $(OUTDIR)/.make $(FTNOBJS) $(FORTDEP) $(LIBDEP) $(EXTDEP) $(FLD) -o $@ $(FTNOBJS) $(FORTLIB) $(EXTLIB) $(MAINLIB) $(FCMTFLAGS) $(SLDFLAGS) $(LDFLAGS) $(FLDFLAGS) $(ELDFLAGS) else .PHONY: $(OUTDIR)/$(OUTNAME)f endif else .PHONY: $(OUTDIR)/$(OUTNAME)f endif $(OUTDIR)/$(OUTNAME)c: $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)-c.o $(DEPDIR)/include/libxsmm_source.h $(LD) -o $@ $(BLDDIR)/$(OUTNAME)-c.o $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)-cpp.o $(DEPDIR)/include/libxsmm_source.h $(XLD) -o $@ $(BLDDIR)/$(OUTNAME)-cpp.o $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-cc.o: $(SRCDIR)/%.cc .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) 
*__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/hello/README.md000066400000000000000000000023261415223013700171440ustar00rootroot00000000000000# Hello LIBXSMM This example is focused on a specific functionality but may be considered as "Hello LIBXSMM". Copy and paste the example code and build it either manually and as described in our [main documentation](https://libxsmm.readthedocs.io/#hello-libxsmm) (see underneath the source code), or use GNU Make: ```bash cd /path/to/libxsmm make cd /path/to/libxsmm/samples/hello make ./hello ``` Alternatively, one can use the Bazel build system. To further simplify, [Bazelisk](https://github.com/bazelbuild/bazelisk) is used to boot-strap [Bazel](https://bazel.build/): ```bash cd /path/to/libxsmm/samples/hello bazelisk build //... ./bazel-bin/hello ``` The [C/C++ code](https://github.com/hfp/libxsmm/blob/master/samples/hello/hello.cpp) given here uses LIBXSMM in header-only form (`#include `), which is in contrast to the code shown in the [main documentation](https://libxsmm.readthedocs.io/#hello-libxsmm). The [Fortran code](https://github.com/hfp/libxsmm/blob/master/samples/hello/hello.f) (`hello.f`) can be manually compiled like `gfortran -I/path/to/libxsmm/include hello.f -L/path/to/libxsmm/lib -libxsmmf -lxsmm -lxsmmnoblas -o hello` or as part of the above described invocation of GNU Make. 
libxsmm-1.17/samples/hello/WORKSPACE000066400000000000000000000000731415223013700171430ustar00rootroot00000000000000local_repository( name = "xsmm", path = "../..", ) libxsmm-1.17/samples/hello/hello.c000066400000000000000000000033641415223013700171370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include int main(/*int argc, char* argv[]*/) { const size_t t = sizeof(double); int flags = LIBXSMM_GEMM_FLAG_NONE, batchsize = 1000, m = 13, n = 5, k = 7, ki, i, j; double *a = malloc(t * batchsize * m * k), *b = malloc(t * batchsize * k * n); double *c = malloc(t * m * n), alpha = 1, beta = 1; /* generates and dispatches a matrix multiplication kernel */ libxsmm_dmmfunction kernel = libxsmm_dmmdispatch( m, n, k, NULL /*lda*/, NULL /*ldb*/, NULL /*ldc*/, &alpha, &beta, &flags, NULL /*prefetch*/); assert(kernel); for (i = 0; i < batchsize; ++i) { /* initialize input */ for (ki = 0; ki < k; ++ki) { for (j = 0; j < m; ++j) a[i * j * ki] = ((double)1) / ((i + j + ki) % 25); for (j = 0; j < n; ++j) b[i * j * ki] = ((double)7) / ((i + j + ki) % 75); } } memset(c, 0, t * m * n); /* kernel multiplies and accumulates matrices: C += Ai * Bi */ for (i = 0; i < batchsize; ++i) kernel(a + i * m * k, b + i * k * n, c); free(a), free(b), free(c); return 0; } libxsmm-1.17/samples/hello/hello.cpp000066400000000000000000000032341415223013700174730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel 
Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include int main(/*int argc, char* argv[]*/) { typedef double T; int batchsize = 1000, m = 13, n = 5, k = 7; std::vector a(batchsize * m * k), b(batchsize * k * n), c(m * n, 0); /* C/C++ and Fortran interfaces are available */ typedef libxsmm_mmfunction kernel_type; /* generates and dispatches a matrix multiplication kernel (C++ functor) */ kernel_type kernel(LIBXSMM_GEMM_FLAG_NONE, m, n, k, 1.0 /*alpha*/, 1.0 /*beta*/); assert(kernel); for (int i = 0; i < batchsize; ++i) { /* initialize input */ for (int ki = 0; ki < k; ++ki) { for (int j = 0; j < m; ++j) a[i * j * ki] = static_cast(1) / ((i + j + ki) % 25); for (int j = 0; j < n; ++j) b[i * j * ki] = static_cast(7) / ((i + j + ki) % 75); } } /* kernel multiplies and accumulates matrices: C += Ai * Bi */ for (int i = 0; i < batchsize; ++i) kernel(&a[i * m * k], &b[i * k * n], &c[0]); } libxsmm-1.17/samples/hello/hello.f000066400000000000000000000041151415223013700171350ustar00rootroot00000000000000!=======================================================================! ! Copyright (c) Intel Corporation - All rights reserved. ! ! This file is part of the LIBXSMM library. ! ! ! ! For information on the license, see the LICENSE file. ! ! Further information: https://github.com/hfp/libxsmm/ ! ! SPDX-License-Identifier: BSD-3-Clause ! !=======================================================================! ! Hans Pabst (Intel Corp.) !=======================================================================! 
PROGRAM hello USE :: LIBXSMM, ONLY: LIBXSMM_BLASINT_KIND, & & LIBXSMM_MMFUNCTION => LIBXSMM_DMMFUNCTION,& & libxsmm_mmdispatch => libxsmm_dmmdispatch,& & libxsmm_mmcall => libxsmm_dmmcall IMPLICIT NONE INTEGER, PARAMETER :: T = KIND(0D0) INTEGER :: batchsize = 1000, i INTEGER(LIBXSMM_BLASINT_KIND) :: j, ki INTEGER(LIBXSMM_BLASINT_KIND) :: m = 13, n = 5, k = 7 REAL(T), ALLOCATABLE :: a(:,:,:), b(:,:,:), c(:,:) TYPE(LIBXSMM_MMFUNCTION) :: xmm ALLOCATE(a(m,k,batchsize), b(k,n,batchsize), c(m,n)) ! initialize input DO i = 1, batchsize DO ki = 1, k DO j = 1, m a(j,ki,i) = REAL(1, T) / REAL(MOD(i+j+ki, 25), T) END DO DO j = 1, n b(ki,j,i) = REAL(7, T) / REAL(MOD(i+j+ki, 75), T) END DO END DO END DO c(:,:) = REAL(0, T) ! generates and dispatches a matrix multiplication kernel CALL libxsmm_mmdispatch(xmm, m, n, k, & & alpha=REAL(1, T), beta=REAL(1, T)) ! kernel multiplies and accumulates matrices: C += Ai * Bi DO i = 1, batchsize CALL libxsmm_mmcall(xmm, a(:,:,i), b(:,:,i), c) END DO DEALLOCATE(a, b, c) END PROGRAM libxsmm-1.17/samples/hello/hello.vcxproj000066400000000000000000000536041415223013700204120ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 hello 10.0 {558B85BC-AA23-4049-B721-41E4F5C814B2} Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST 
true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console 
libxsmm-1.17/samples/magazine/000077500000000000000000000000001415223013700163525ustar00rootroot00000000000000libxsmm-1.17/samples/magazine/Makefile000066400000000000000000000144511415223013700200170ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 1 SYM = 1 OMP = 0 # explore AVX/ARCH=native SSE = 0 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix 
.mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME)_xsmm $(OUTDIR)/$(OUTNAME)_batch \ $(OUTDIR)/$(OUTNAME)_blas \ $(OUTDIR)/$(OUTNAME)_blaze \ $(OUTDIR)/$(OUTNAME)_eigen ifneq (0,$(BLAZE)) ifeq (,$(strip $(BLAZEROOT))) ifneq (,$(wildcard $(DEPDIR)/../blaze*/blaze/Blaze.h)) BLAZEROOT = $(lastword $(sort $(wildcard $(DEPDIR)/../blaze*))) else ifneq (,$(wildcard $(HOME)/blaze*/blaze/Blaze.h)) BLAZEROOT = $(lastword $(sort $(wildcard $(HOME)/blaze*))) endif endif endif ifneq (,$(BLAZEROOT)) DFLAGS += -D__BLAZE IFLAGS += -I$(call quote,$(BLAZEROOT)) BLAZE ?= 1 DFLAGS += -DBLAZE_USE_SHARED_MEMORY_PARALLELIZATION=$(shell echo "$$(($(BLAZE)-1))") else BLAZE := 0 endif ifneq (0,$(EIGEN)) ifeq (,$(strip $(EIGENROOT))) ifneq (,$(wildcard $(DEPDIR)/../eigen*/Eigen/Dense)) EIGENROOT = $(lastword $(sort $(wildcard $(DEPDIR)/../eigen*))) else ifneq (,$(wildcard $(HOME)/eigen*/Eigen/Dense)) EIGENROOT = $(lastword $(sort $(wildcard $(HOME)/eigen*))) else ifneq (,$(wildcard /usr/include/eigen3/Eigen/Dense)) EIGENROOT = /usr/include/eigen3 else ifneq (,$(wildcard /usr/local/opt/eigen/include/eigen3/Eigen/Dense)) EIGENROOT = /usr/local/opt/eigen/include/eigen3 endif endif endif ifneq (,$(EIGENROOT)) DFLAGS += -D__EIGEN IFLAGS += -I$(call quote,$(EIGENROOT)) EIGEN ?= 1 #ifneq (0,$(MKL)) #DFLAGS += -DEIGEN_USE_MKL_ALL #endif ifneq (0,$(shell echo "$$((1 < $(BLAS) || 1 < $(EIGEN)))")) DFLAGS += -DEIGEN_USE_THREADS else DFLAGS += -DEIGEN_DONT_PARALLELIZE endif ifneq (,$(wildcard $(EIGENROOT)/unsupported/Eigen/CXX11/ThreadPool)) DFLAGS += -D__EIGEN_UNSUPPORTED endif else EIGEN := 0 endif .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) $(OUTDIR)/$(OUTNAME)_xsmm: $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)_xsmm-c.o $(LIBDEP) $(EXTDEP) $(NOBLASDEP) $(LD) -o $@ $(BLDDIR)/$(OUTNAME)_xsmm-c.o $(call cleanld,$(MAINLIB) $(NOBLASLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) 
$(OUTDIR)/$(OUTNAME)_batch: $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)_batch-c.o $(LIBDEP) $(EXTDEP) $(NOBLASDEP) $(LD) -o $@ $(BLDDIR)/$(OUTNAME)_batch-c.o $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(OUTDIR)/$(OUTNAME)_blas: $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)_blas-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(BLDDIR)/$(OUTNAME)_blas-c.o $(call cleanld,$(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) ifneq (,$(BLAZEROOT)) $(OUTDIR)/$(OUTNAME)_blaze: $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)_blaze-cpp.o $(LD) -o $@ $(BLDDIR)/$(OUTNAME)_blaze-cpp.o $(call cleanld,$(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) else .PHONY: $(OUTDIR)/$(OUTNAME)_blaze endif ifneq (,$(EIGENROOT)) $(OUTDIR)/$(OUTNAME)_eigen: $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)_eigen-cpp.o $(LD) -o $@ $(BLDDIR)/$(OUTNAME)_eigen-cpp.o $(call cleanld,$(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) else .PHONY: $(OUTDIR)/$(OUTNAME)_eigen endif $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif 
endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/magazine/README.md000066400000000000000000000064631415223013700176420ustar00rootroot00000000000000# Magazine ## Overview This collection of code samples accompany an article written for [issue #34](https://software.intel.com/sites/default/files/parallel-universe-issue-34.pdf) of the magazine [The Parallel Universe](https://software.intel.com/en-us/download/parallel-universe-magazine-issue-34-october-2018), an Intel publication. The articles focuses on Blaze-, Eigen-, and LIBXSMM-variants of Small Matrix Multiplications (SMMs). The set of sample codes now also includes a variant relying on BLAS and a variant that showcases LIBXSMM's explicit batch-interface. The baseline requirements are libraries that can operate on column-major storage order, "zero copy" when using existing memory buffers, and an API that is powerful enough to describe leading dimensions. Typically a library-internal parallelization of matrix multiplication is desired. However, for the magazine sample collection there is no performance gain expected since the matrices are small, and nested parallelism may only add overhead. Hence library-internal parallelism is disabled (BLAZE_USE_SHARED_MEMORY_PARALLELIZATION=0, EIGEN_DONT_PARALLELIZE). LIBXSMM provides parallelization on a per-functions basis and no global toggle is needed. 
The sample codes rely on the minimum programming language supported by the library in question (API): C++ in case of Blaze and Eigen, and C in case of LIBXSMM (both C++ and Fortran interfaces are available as well). For Blaze and Eigen, the build-system ensures to not map implementation into a BLAS library (normally desired but this would not test the library-native implementation). ## Results To reproduce or repeat the performance measurements on a system of choice, all matrix operands are streamed by default. The file [magazine.h](https://github.com/hfp/libxsmm/blob/master/samples/magazine/magazine.h) can be edited to reproduce the desired combination (STREAM_A, STREAM_B, and STREAM_C). Whether or not matrix operands are streamed is motivated in publication. To reduce dependency on the compiler's OpenMP implementation, the benchmarks run single-threaded by default (`make OMP=1` can parallelize the batch of matrix multiplications). The outer/batch-level parallelization is also disabled to avoid accounting for proper first-touch memory population on multi-socket systems (NUMA). For the latter, the init-function (located in magazine.h) is not parallelized for simplicity. ```bash cd libxsmm; make cd samples/magazine; make ``` To run the benchmark kernels presented by the article: ```bash ./benchmark.sh ``` Please note that if multiple threads are enabled and used, an appropriate pin-strategy should be used (OMP_PLACES=threads, OMP_PROC_BIND=TRUE). To finally produce the benchmark charts: ```bash ./benchmark-plot.sh blaze ./benchmark-plot.sh eigen ./benchmark-plot.sh xsmm ``` The plot script relies at least on Gnuplot. ImageMagick (mogrify) can be also useful if PNGs are created, e.g., `./benchmark-plot.sh xsmm png 0` (the last argument disables single-file charts in contrast to multi-page PDFs created by default, the option also disables chart titles). 
The set of kernels executed during the benchmark can be larger than the kernels presented by the plots: [benchmark.set](https://github.com/hfp/libxsmm/blob/master/samples/magazine/benchmark.set) selects the kernels independent of the kernels executed (union). libxsmm-1.17/samples/magazine/benchmark-plot-all.sh000077500000000000000000000017511415223013700223710ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) "${HERE}"/benchmark-plot.sh eigen "$@" "${HERE}"/benchmark-plot.sh blaze "$@" "${HERE}"/benchmark-plot.sh xsmm "$@" "${HERE}"/benchmark-plot.sh xbat "$@" "${HERE}"/benchmark-plot.sh blas "$@" libxsmm-1.17/samples/magazine/benchmark-plot.sh000077500000000000000000000053411415223013700216220ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) FIND=$(command -v find) SORT=$(command -v sort) JOIN=$(command -v join) CUT=$(command -v cut) SED=$(command -v sed) AWK=$(command -v awk) RM=$(command -v rm) if [ "" = "$1" ]; then KIND=xsmm else KIND=$1 shift fi if [ "" = "$1" ]; then FILEEXT=pdf else FILEEXT=$1 shift fi if [ "" = "$1" ]; then MULTI=1 else MULTI=$1 shift fi if [ -f /cygdrive/c/Program\ Files/gnuplot/bin/wgnuplot ]; then WGNUPLOT=/cygdrive/c/Program\ Files/gnuplot/bin/wgnuplot GNUPLOT=/cygdrive/c/Program\ Files/gnuplot/bin/gnuplot elif [ -f /cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/wgnuplot ]; then WGNUPLOT=/cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/wgnuplot GNUPLOT=/cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/gnuplot else GNUPLOT=$(command -v gnuplot) WGNUPLOT=${GNUPLOT} fi GNUPLOT_MAJOR=0 GNUPLOT_MINOR=0 if [ -f "${GNUPLOT}" ]; then GNUPLOT_MAJOR=$("${GNUPLOT}" --version | ${SED} "s/.\+ \([0-9]\).\([0-9]\) .*/\1/") GNUPLOT_MINOR=$("${GNUPLOT}" --version | ${SED} "s/.\+ \([0-9]\).\([0-9]\) .*/\2/") fi GNUPLOT_VERSION=$((GNUPLOT_MAJOR * 10000 + GNUPLOT_MINOR * 100)) if [ "40600" -le "${GNUPLOT_VERSION}" ]; then if [ -f "${HERE}/benchmark.set" ]; then # determine behavior of sort command export LC_ALL=C.UTF-8 ${JOIN} --nocheck-order \ <(${CUT} "${HERE}/benchmark.set" -d" " -f1-3 | ${SORT} -nk1) \ <(${SORT} -nk1 benchmark-${KIND}.txt) \ | ${AWK} \ '{ if ($2==$4 && $3==$5) printf("%s %s %s %s %s\n", $1, $2, $3, $6, $8) }' \ | ${SORT} \ -b -n -k1 -k2 -k3 \ > benchmark-${KIND}.join fi env GDFONTPATH=/cygdrive/c/Windows/Fonts \ FILEEXT=${FILEEXT} KIND=${KIND} MULTI=${MULTI} \ "${WGNUPLOT}" "${HERE}/benchmark.plt" if [ "1" != "${MULTI}" ] && [ "pdf" != "${FILEEXT}" ] && [ "$(command -v mogrify)" ]; then ${FIND} . -name "benchmark*.${FILEEXT}" -type f -exec mogrify -trim -transparent-color white {} \; fi else >&2 echo "Error: missing prerequisites!" 
exit 1 fi libxsmm-1.17/samples/magazine/benchmark.plt000066400000000000000000000315341415223013700210330ustar00rootroot00000000000000MPARM = 1 NPARM = 2 KPARM = 3 FLOPS = 4 HIM = -1 HIN = HIM HIK = HIM KIND = system("sh -c \"echo ${KIND}\"") if (KIND eq "") { KIND = "xsmm" } FILEEXT = system("sh -c \"echo ${FILEEXT}\"") if (FILEEXT eq "") { FILEEXT = "pdf" } BASENAME = "benchmark" FILEINP = BASENAME."-".KIND.".txt" FILEOUT = BASENAME."-".KIND.".".FILEEXT FILECOUNT = 1 # initial file number # MULTI =-1: multiple files; no titles # MULTI = 0: multiple files with titles # MULTI = 1: single file with titles MULTI = system("sh -c \"echo ${MULTI}\"") if (MULTI eq "") { MULTI = 1 } XFLOPS(M, N, K) = 2.0 * M * N * K NFLOPS(M, N, K) = XFLOPS(column(M), column(N), column(K)) NBYTES(M, N, K, ELEMSIZE) = ELEMSIZE * (column(M) * column(K) + column(K) * column(N) + column(M) * column(N)) AI(M, N, K, ELEMSIZE) = NFLOPS(M, N, K) / NBYTES(M, N, K, ELEMSIZE) TIME(M, N, K, F) = NFLOPS(M, N, K) * 1E-9 / column(F) BW(M, N, K, F, ELEMSIZE) = (column(M) * column(K) + column(K) * column(N)) * ELEMSIZE / (TIME(M, N, K, F) * 1024 * 1024 * 1024) stats FILEINP using (column(MPARM)*column(NPARM)*column(KPARM)) nooutput; MNK = STATS_stddev**(1.0/3.0); MAXMNK = int(STATS_max) stats FILEINP using (log(column(FLOPS))) nooutput; NSAMPLES = STATS_records; GEOFLOPS = exp(STATS_sum/STATS_records) stats FILEINP using FLOPS nooutput; MEDFLOPS = STATS_median; AVGFLOPS = STATS_mean; MINFLOPS = STATS_min; MAXFLOPS = STATS_max stats FILEINP using NPARM nooutput; XN = int(STATS_max) stats FILEINP using ((NFLOPS(MPARM,NPARM,KPARM)<=XFLOPS(13,13,13))?column(FLOPS):1/0) nooutput; BIN1_FLOPS = STATS_mean; BIN1_NSAMPLES = STATS_records stats FILEINP using (((XFLOPS(13,13,13)-1) { set title "Performance (Selected Kernels)" } set origin -0.03, 0 set pm3d interpolate 0, 0 #set colorbox horizontal user origin 0, 0.1 size 1, 0.1 #set autoscale fix if (0HIM) { set xrange [*:MNK] } if (0>HIN) { set yrange [*:MNK] } if 
(0>HIK) { set zrange [*:MNK] } set xlabel "M" set ylabel "N" offset -3.0 set zlabel "K" offset 1.0 set ticslevel 0 set cblabel "GFLOP/s" offset 1.5 set format x "%g"; set format y "%g"; set format z "%g"; set format cb "%g" splot FILEINP using MPARM:NPARM:KPARM:FLOPS notitle with points pointtype 7 linetype palette reset if (MULTI<=0) { set output BASENAME."-".KIND."-".FILECOUNT.".".FILEEXT; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Performance (K-Average for ".sprintf("%u Kernels", NSAMPLES).")" } set origin -0.02, 0 set dgrid3d #9, 9 set pm3d interpolate 0, 0 map set autoscale fix set xlabel "M" set ylabel "N" offset -1.5 set cblabel "GFLOP/s" offset 0.5 set format x "%g"; set format y "%g"; set format cb "%g" set mxtics 2 splot BASENAME."-avg.dat" using (("".strcol(3)."" eq "i")?(I1($1, XN)):(1/0)):(("".strcol(3)."" eq "i")?(J1($1, XN)):(1/0)):2 notitle with pm3d reset if (MULTI<=0) { set output BASENAME."-".KIND."-".FILECOUNT.".".FILEEXT; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Performance (Average per Bin)" } set style fill solid 0.4 noborder set boxwidth 0.5 set grid y2tics linecolor "grey" unset key unset xtics set xtics ("MNK <= 13^3" 0, "13^3 < MNK <= 23^3" 1, "23^3 < MNK" 2) scale 0 offset 0, 0.2 set x2tics ("Small" 0, "Medium" 1, "Larger" 2) scale 0 set xlabel "Problem Size (MNK)" set ytics format "" set y2tics nomirror set y2label "GFLOP/s" set xrange [-0.5:2.5] set yrange [0:*] set autoscale fix set label sprintf("{/=9 ".FORMAT(BIN1_FLOPS)." GFLOP/s}", BIN1_FLOPS) at 0.0, BIN1_FLOPS centre offset 0, -1 front set label sprintf("{/=9 ".FORMAT(BIN2_FLOPS)." GFLOP/s}", BIN2_FLOPS) at 1.0, BIN2_FLOPS centre offset 0, -1 front set label sprintf("{/=9 ".FORMAT(BIN3_FLOPS)." GFLOP/s}", BIN3_FLOPS) at 2.0, BIN3_FLOPS centre offset 0, -1 front set label sprintf("{/=9 (".FORMAT(BIN1_MEMBW)." GB/s)}", BIN1_MEMBW) at 0.0, BIN1_FLOPS centre offset 0, -2 front set label sprintf("{/=9 (".FORMAT(BIN2_MEMBW)." 
GB/s)}", BIN2_MEMBW) at 1.0, BIN2_FLOPS centre offset 0, -2 front set label sprintf("{/=9 (".FORMAT(BIN3_MEMBW)." GB/s)}", BIN3_MEMBW) at 2.0, BIN3_FLOPS centre offset 0, -2 front set label sprintf("{/=9 N=%u}", BIN1_NSAMPLES) at 0.0, 0.0 centre offset 0, 0.5 front set label sprintf("{/=9 N=%u}", BIN2_NSAMPLES) at 1.0, 0.0 centre offset 0, 0.5 front set label sprintf("{/=9 N=%u}", BIN3_NSAMPLES) at 2.0, 0.0 centre offset 0, 0.5 front plot FILEINP \ using (0.0):(BIN1_FLOPS) notitle smooth unique with boxes linetype 1 linecolor "grey", \ "" using (1.0):(BIN2_FLOPS) notitle smooth unique with boxes linetype 1 linecolor "grey", \ "" using (2.0):(BIN3_FLOPS) notitle smooth unique with boxes linetype 1 linecolor "grey" reset if (MULTI<=0) { set output BASENAME."-".KIND."-".FILECOUNT.".".FILEEXT; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Cummulative Performance Distribution (CDF for ".sprintf("%u Kernels", NSAMPLES).")" } set xlabel "Probability\n\n{/=9 Min.: ".sprintf(FORMAT(MINFLOPS), MINFLOPS)." GFLOP/s Geo.: ".sprintf(FORMAT(GEOFLOPS), GEOFLOPS)." GFLOP/s Med.: ".sprintf(FORMAT(MEDFLOPS), MEDFLOPS)." GFLOP/s Avg.: ".sprintf(FORMAT(AVGFLOPS), AVGFLOPS)." GFLOP/s Max.: ".sprintf(FORMAT(MAXFLOPS), MAXFLOPS)." 
GFLOP/s}" set ylabel "GB/s" set y2label "GFLOP/s" set format x "%g%%" set format y "%g" set format y2 "%g" set ytics nomirror set y2tics nomirror set grid x y2 linecolor "grey" set xrange [0:100] set yrange [0:*] set y2range [0:*] set fit quiet f(x) = b * x + a fit f(x) BASENAME."-cdf.dat" using (("".strcol(3)."" eq "i")?(100*$2/FREQSUM):(1/0)):1 via a, b g(x) = (x - a) / b x50 = 0.5 * (100 + MAX(0, g(0))) h(x) = d * x + c dx = 100.0 / FREQN fit [x50-3.0*dx:x50+3.0*dx] h(x) BASENAME."-cdf.dat" using (("".strcol(3)."" eq "i")?(100*$2/FREQSUM):(1/0)):1 via c, d set arrow from x50, second h(x50) to x50, second 0 front set arrow from x50, second h(x50) to 100, second h(x50) front set label sprintf("%.0f%%", x50) at x50, second 0.5 * h(x50) left offset 1 front set label sprintf(FORMAT(h(x50))." GFLOP/s", h(x50)) at 0.5 * (x50 + 100.0), second h(x50) centre offset 0, -1 front set key left invert plot BASENAME."-mbw.dat" using (("".strcol(3)."" eq "i")?(100*$2/FREQSUM):(1/0)):1 axes x1y1 title "Memory Bandwidth" with lines linecolor "grey", \ BASENAME."-cdf.dat" using (("".strcol(3)."" eq "i")?(100*$2/FREQSUM):(1/0)):1 axes x1y2 title "Compute Performance" with lines linewidth 2 reset if (MULTI<=0) { set output BASENAME."-".KIND."-".FILECOUNT.".".FILEEXT; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Arithmetic Intensity (".sprintf("%u Kernels", NSAMPLES).")" } set grid x y2 linecolor "grey" set key left #spacing 0.5 set ytics format "" set y2tics nomirror set y2label "GFLOP/s" #set xlabel "FLOPS/Byte\n\n{/=9 ".sprintf("N: %u", NSAMPLES)." Min.: ".sprintf("%.1f", MINAI)." Geo.: ".sprintf("%.1f", GEOAI)." Med.: ".sprintf("%.1f", MEDAI)." Avg.: ".sprintf("%.1f", AVGAI)." Max.: ".sprintf("%.1f", MAXAI)."}" set xlabel "FLOPS/Byte (Min.: ".sprintf("%.1f", MINAI)." Geo.: ".sprintf("%.1f", GEOAI)." Med.: ".sprintf("%.1f", MEDAI)." Avg.: ".sprintf("%.1f", AVGAI)." 
Max.: ".sprintf("%.1f", MAXAI).")" set yrange [0:*] set autoscale fix plot FILEINP using (AI(MPARM,NPARM,KPARM,8)):FLOPS notitle smooth sbezier with lines linecolor "grey" linewidth 2, \ "" using (AI(MPARM,NPARM,KPARM,8)):FLOPS notitle smooth unique with points pointtype 7 pointsize 0.1 reset if (MULTI<=0) { set output BASENAME."-".KIND."-".FILECOUNT.".".FILEEXT; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Memory Bandwidth Consumption (".sprintf("%u Kernels", NSAMPLES).")" } set grid x y2 linecolor "grey" set key left #spacing 0.5 set ytics format "" set y2tics nomirror set y2label "GB/s" set xlabel "Problem Size (MNK^{1/3})\n\n{/=9 Min.: ".sprintf("%.0f GB/s", MINMEMBW)." Geo.: ".sprintf("%.0f GB/s", GEOMEMBW)." Med.: ".sprintf("%.0f GB/s", MEDMEMBW)." Avg.: ".sprintf("%.0f GB/s", AVGMEMBW)." Max.: ".sprintf("%.0f GB/s", MAXMEMBW)."}" set yrange [0:*] set autoscale fix plot FILEINP using ((column(MPARM)*column(NPARM)*column(KPARM))**(1.0/3.0)):(BW(MPARM,NPARM,KPARM,FLOPS,8)) notitle smooth sbezier with lines linecolor "grey" linewidth 2, \ "" using ((column(MPARM)*column(NPARM)*column(KPARM))**(1.0/3.0)):(BW(MPARM,NPARM,KPARM,FLOPS,8)) notitle with points pointtype 7 pointsize 0.1 reset if (MULTI<=0) { set output BASENAME."-".KIND."-".FILECOUNT.".".FILEEXT; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Compute Consumption (".sprintf("%u Kernels", NSAMPLES).")" } set grid x y2 linecolor "grey" set key left #spacing 0.5 set ytics format "" set y2tics nomirror set y2label "GFLOP/s" set xlabel "Problem Size (MNK^{1/3})\n\n{/=9 Min.: ".sprintf(FORMAT(MINFLOPS), MINFLOPS)." GFLOP/s Geo.: ".sprintf(FORMAT(GEOFLOPS), GEOFLOPS)." GFLOP/s Med.: ".sprintf(FORMAT(MEDFLOPS), MEDFLOPS)." GFLOP/s Avg.: ".sprintf(FORMAT(AVGFLOPS), AVGFLOPS)." GFLOP/s Max.: ".sprintf(FORMAT(MAXFLOPS), MAXFLOPS)." 
GFLOP/s}" set yrange [0:*] set autoscale fix plot FILEINP using ((column(MPARM)*column(NPARM)*column(KPARM))**(1.0/3.0)):FLOPS notitle smooth sbezier with lines linecolor "grey" linewidth 2, \ "" using ((column(MPARM)*column(NPARM)*column(KPARM))**(1.0/3.0)):FLOPS notitle with points pointtype 7 pointsize 0.1 if (0!=system("sh -c \"if [ -e ".BASENAME."-".KIND.".join ]; then echo 1; else echo 0; fi\"")) { reset if (MULTI<=0) { set output BASENAME."-".KIND."-".FILECOUNT.".".FILEEXT; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Performance (Selected Kernels)" } set style fill solid 0.9 border -1 set style data histograms set style histogram cluster #gap 2 #set boxwidth 0.5 relative set grid y2tics lc "grey" set key left #spacing 0.5 set xtics rotate by -45 scale 0; set bmargin 12 set ytics format "" set y2tics nomirror set y2label "GFLOP/s" set yrange [0:*] plot BASENAME."-".KIND.".join" using FLOPS:xtic("(".strcol(MPARM).",".strcol(NPARM).",".strcol(KPARM).")") notitle if (0!=system("sh -c \"if [ -e ".BASENAME."-eigen.join ]; then echo 1; else echo 0; fi\"")) { if (0!=system("sh -c \"if [ -e ".BASENAME."-blaze.join ]; then echo 1; else echo 0; fi\"")) { if (0!=system("sh -c \"if [ -e ".BASENAME."-xbat.join ]; then echo 1; else echo 0; fi\"")) { if (0!=system("sh -c \"if [ -e ".BASENAME."-xsmm.join ]; then echo 1; else echo 0; fi\"")) { if (0!=system("sh -c \"if [ -e ".BASENAME."-blas.join ]; then echo 1; else echo 0; fi\"")) { set output BASENAME.".".FILEEXT plot BASENAME."-eigen.join" using FLOPS:xtic("{/=8 (".strcol(MPARM).",".strcol(NPARM).",".strcol(KPARM).")}") title "Eigen", \ BASENAME."-blaze.join" using FLOPS title "Blaze", \ BASENAME."-xbat.join" using FLOPS title "Xbatch", \ BASENAME."-xsmm.join" using FLOPS title "Xsmm", \ BASENAME."-blas.join" using FLOPS title "Blas" }}}}} } libxsmm-1.17/samples/magazine/benchmark.set000066400000000000000000000004601415223013700210210ustar00rootroot000000000000002 2 2 3 3 3 4 4 4 5 5 5 5 5 13 5 13 5 5 13 13 6 6 
6 8 8 8 10 10 10 12 12 12 13 5 5 13 5 7 13 5 13 13 13 5 13 13 13 13 13 26 13 26 13 13 26 26 14 14 14 15 15 15 16 16 16 18 18 18 20 20 20 23 23 23 24 24 24 25 25 25 26 13 13 26 13 26 26 26 13 26 26 26 28 28 28 30 30 30 32 32 32 35 35 35 36 36 36 40 40 40 libxsmm-1.17/samples/magazine/benchmark.sh000077500000000000000000000070241415223013700206460ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) CAT=$(command -v cat) TR=$(command -v tr) # setup thread affinity export OMP_SCHEDULE=static OMP_PROC_BIND=TRUE OUT_BLAZE=benchmark-blaze.txt OUT_EIGEN=benchmark-eigen.txt OUT_XSMM=benchmark-xsmm.txt OUT_XBAT=benchmark-xbat.txt OUT_BLAS=benchmark-blas.txt SCRT=${HERE}/../../scripts/libxsmm_utilities.py # MNK: comma separated numbers are on its own others are combined into triplets RUNS1=$(${SCRT} -1 $((128*128*128)) 21 \ 2, 3, 4, 5, 8, 10, 15, 16, 20, 23, 24, 25, 28, 30, 32, 35, 36, 40, \ 5 7 13, \ 0 0) RUNS2=$(${SCRT} -1 $((128*128*128)) 46 \ 4 5 7 9 13 25 26 28 32 45, \ 13 14 25 26 32, \ 5 32 13 24 26, \ 14 16 29, \ 14 32 29, \ 16 29 55, \ 32 29 55, \ 9 32 22, \ 4 10 15, \ 6 7 8, \ 23, \ 64, \ 78, \ 12, \ 6, \ 0 0) if [ "$1" ]; then SIZE=$1 shift else SIZE=0 fi if [ "$1" ]; then RUNS=RUNS$1 shift else RUNS=RUNS1 fi ${CAT} /dev/null > ${OUT_BLAZE} ${CAT} /dev/null > ${OUT_EIGEN} ${CAT} /dev/null > ${OUT_XSMM} ${CAT} /dev/null > ${OUT_XBAT} ${CAT} /dev/null > ${OUT_BLAS} NRUN=1 NMAX=$(echo ${!RUNS} | wc -w | tr -d " 
") for RUN in ${!RUNS} ; do MVALUE=$(echo ${RUN} | cut --output-delimiter=' ' -d_ -f1) NVALUE=$(echo ${RUN} | cut --output-delimiter=' ' -d_ -f2) KVALUE=$(echo ${RUN} | cut --output-delimiter=' ' -d_ -f3) echo "${NRUN} of ${NMAX} (M=${MVALUE} N=${NVALUE} K=${KVALUE})... " echo -n "${MVALUE} ${NVALUE} ${KVALUE} " >> ${OUT_BLAZE} ${HERE}/magazine_blaze ${SIZE} ${MVALUE} ${NVALUE} ${KVALUE} | ${TR} "\n" " " >> ${OUT_BLAZE} echo >> ${OUT_BLAZE} echo -n "${MVALUE} ${NVALUE} ${KVALUE} " >> ${OUT_EIGEN} ${HERE}/magazine_eigen ${SIZE} ${MVALUE} ${NVALUE} ${KVALUE} | ${TR} "\n" " " >> ${OUT_EIGEN} echo >> ${OUT_EIGEN} echo -n "${MVALUE} ${NVALUE} ${KVALUE} " >> ${OUT_XSMM} ${HERE}/magazine_xsmm ${SIZE} ${MVALUE} ${NVALUE} ${KVALUE} | ${TR} "\n" " " >> ${OUT_XSMM} echo >> ${OUT_XSMM} echo -n "${MVALUE} ${NVALUE} ${KVALUE} " >> ${OUT_XBAT} ${HERE}/magazine_batch ${SIZE} ${MVALUE} ${NVALUE} ${KVALUE} | ${TR} "\n" " " >> ${OUT_XBAT} echo >> ${OUT_XBAT} echo -n "${MVALUE} ${NVALUE} ${KVALUE} " >> ${OUT_BLAS} ${HERE}/magazine_blas ${SIZE} ${MVALUE} ${NVALUE} ${KVALUE} | ${TR} "\n" " " >> ${OUT_BLAS} echo >> ${OUT_BLAS} NRUN=$((NRUN+1)) done libxsmm-1.17/samples/magazine/magazine.h000066400000000000000000000055751415223013700203320ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #ifndef MAGAZINE_H #define MAGAZINE_H #include #if defined(_OPENMP) # include #endif #if defined(_WIN32) # include #else # include #endif #if !defined(TYPE) # define TYPE double #endif #if 1 # define STREAM_A(EXPR) (EXPR) #else # define STREAM_A(EXPR) 0 #endif #if 1 # define STREAM_B(EXPR) (EXPR) #else # define STREAM_B(EXPR) 0 #endif #if 0 # define STREAM_C(EXPR) (EXPR) # define SYNC(IDX, INC, END) ((IDX) * (INC)) #elif defined(_OPENMP) # define STREAM_C(EXPR) (EXPR) # define SYNC(IDX, INC, END) (((1048573 * omp_get_thread_num()) % (END)) * (INC)) #else # define STREAM_C(EXPR) 0 /* synchronization among C matrices */ # define SYNC(IDX, INC, END) 0 #endif /** * Permuting the data introduces a dependency to LIBXSMM * even for the Eigen/Blaze/Blas based sample code. */ #if 0 /* process batch of A, B, and C in "random" order */ # define SHUFFLE #endif #if 0 /* PAD (alignment) must be power of two */ # define PAD 64 #else # define PAD 1 #endif #if defined(SHUFFLE) # include #endif static void init(int seed, TYPE* dst, int nrows, int ncols, int ld, double scale) { const double seed1 = scale * seed + scale; int i, j; for (i = 0; i < ncols; ++i) { for (j = 0; j < nrows; ++j) { const int k = i * ld + j; dst[k] = (TYPE)(seed1 * (1.0 + i * nrows + j)); } for (; j < ld; ++j) { const int k = i * ld + j; dst[k] = (TYPE)(seed); } } } static double norm(const TYPE* src, int nrows, int ncols, int ld) { int i, j; double result = 0, comp = 0; for (i = 0; i < ncols; ++i) { for (j = 0; j < nrows; ++j) { const int k = i * ld + j; const double v = src[k], a = (0 <= v ? 
v : -v) - comp, b = result + a; comp = (b - result) - a; result = b; } } return result; } static double seconds(void) { #if defined(_OPENMP) return omp_get_wtime(); #elif defined(_WIN32) LARGE_INTEGER t, f; QueryPerformanceCounter(&t); QueryPerformanceFrequency(&f); return (double)t.QuadPart / f.QuadPart; #else struct timeval t; gettimeofday(&t, 0); return 1E-6 * (1000000ULL * t.tv_sec + t.tv_usec); #endif } #endif /*MAGAZINE_H*/ libxsmm-1.17/samples/magazine/magazine_batch.c000066400000000000000000000115651415223013700214620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include "magazine.h" #if !defined(SHUFFLE) # include #endif #if defined(_OPENMP) # define USEOMP(FUNCTION) LIBXSMM_USEOMP(FUNCTION) #else # define USEOMP(FUNCTION) (FUNCTION) #endif int main(int argc, char* argv[]) { /* batch-size is used to stream matrix-operands from memory */ const int batchsize = (1 < argc ? atoi(argv[1]) : 0/*auto*/); /* default: M, N, and K are 13, 5, and 7 respectively */ const libxsmm_blasint m = (2 < argc ? atoi(argv[2]) : 13); const libxsmm_blasint n = (3 < argc ? atoi(argv[3]) : 5); const libxsmm_blasint k = (4 < argc ? atoi(argv[4]) : 7); /* leading dimensions are made multiples of the size of a cache-line */ const libxsmm_blasint lda = (5 < argc ? LIBXSMM_MAX(atoi(argv[5]), m) : (libxsmm_blasint)(LIBXSMM_UP2(sizeof(TYPE) * m, PAD) / sizeof(TYPE))); const libxsmm_blasint ldb = (6 < argc ? 
LIBXSMM_MAX(atoi(argv[6]), k) : (libxsmm_blasint)(LIBXSMM_UP2(sizeof(TYPE) * k, PAD) / sizeof(TYPE))); const libxsmm_blasint ldc = (7 < argc ? LIBXSMM_MAX(atoi(argv[7]), m) : (libxsmm_blasint)(LIBXSMM_UP2(sizeof(TYPE) * m, PAD) / sizeof(TYPE))); /* micro-kernels are limited to certain alpha- and beta-values */ const char transa = 'n', transb = 'n'; const TYPE alpha = 1, beta = 1; /* calculate matrix sizes incl. padded elements */ const size_t na = LIBXSMM_UP2(sizeof(TYPE) * lda * k, PAD) / sizeof(TYPE); const size_t nb = LIBXSMM_UP2(sizeof(TYPE) * ldb * n, PAD) / sizeof(TYPE); const size_t nc = LIBXSMM_UP2(sizeof(TYPE) * ldc * n, PAD) / sizeof(TYPE); /* calculate default batch-size to hit work-set size of approx. 2 GB */ const int size = (0 >= batchsize ? (int)((2ULL << 30/*2 GB*/) / (sizeof(TYPE) * (na + nb + nc))) : batchsize); #if defined(SHUFFLE) const size_t shuffle = libxsmm_shuffle((unsigned int)size); #endif /* allocate A, B, and C matrix buffers */ TYPE *const a = (TYPE*)libxsmm_aligned_malloc(sizeof(TYPE) * na * size, LIBXSMM_CACHELINE); TYPE *const b = (TYPE*)libxsmm_aligned_malloc(sizeof(TYPE) * nb * size, LIBXSMM_CACHELINE); TYPE *const c = (TYPE*)libxsmm_aligned_malloc(sizeof(TYPE) * nc * size, LIBXSMM_CACHELINE); libxsmm_blasint *const ia = (libxsmm_blasint*)libxsmm_malloc(sizeof(libxsmm_blasint) * size); libxsmm_blasint *const ib = (libxsmm_blasint*)libxsmm_malloc(sizeof(libxsmm_blasint) * size); libxsmm_blasint *const ic = (libxsmm_blasint*)libxsmm_malloc(sizeof(libxsmm_blasint) * size); const double scale = 1.0 / size; libxsmm_timer_tickint start; double duration; #if 1 /* synchronize among C-locations */ const libxsmm_blasint xsize = size; #else /* assume no data race (C-index) */ const libxsmm_blasint xsize = -size; #endif int i; /* initialize data according to touch-first policy */ #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < size; ++i) { #if defined(SHUFFLE) const int j = (i * shuffle) % size; #else const 
int j = i; #endif init(25 + i, a + j * na, (int)m, (int)k, (int)lda, scale); init(75 + i, b + j * nb, (int)k, (int)n, (int)ldb, scale); if (LIBXSMM_NEQ(0, beta)) { /* no need to initialize for beta=0 */ init(42 + i, c + j * nc, (int)m, (int)n, (int)ldc, scale); } ia[i] = (int)STREAM_A(j * na); ib[i] = (int)STREAM_B(j * nb); ic[i] = (int)STREAM_C(SYNC(j, nc, size)); } start = libxsmm_timer_tick(); USEOMP(libxsmm_gemm_batch)(LIBXSMM_GEMM_PRECISION(TYPE), LIBXSMM_GEMM_PRECISION(TYPE), &transa, &transb, m, n, k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 0/*index_base*/, sizeof(int)/*index_stride*/, ia, ib, ic, xsize); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { const double gflops = 2.0 * m * n * k * 1E-9; printf("%.1f GFLOPS/s\n", gflops / duration * size); } printf("%.1f ms\n", 1000.0 * duration); { /* calculate checksum */ double check = 0; for (i = 0; i < size; ++i) { const double cn = norm(c + STREAM_C(SYNC(i, nc, size)), (int)m, (int)n, (int)ldc); if (check < cn) check = cn; } printf("\n%f (check)\n", check); } libxsmm_free(ia); libxsmm_free(ib); libxsmm_free(ic); libxsmm_free(a); libxsmm_free(b); libxsmm_free(c); return EXIT_SUCCESS; } libxsmm-1.17/samples/magazine/magazine_batch.vcxproj000066400000000000000000000544051415223013700227330ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 magazine_batch {3853732E-7E62-4029-9998-96CF710CC9F5} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/magazine/magazine_blas.c000066400000000000000000000151271415223013700213200ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "magazine.h" #if defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL) # include #define GEMM_float sgemm #define GEMM_double dgemm #else /* prototypes for GEMM */ #define GEMM_float sgemm_ #define GEMM_double dgemm_ void dgemm_(const char*, const char*, const int*, const int*, const int*, const double*, const double*, const int*, const double*, const int*, const double*, double*, const int*); void sgemm_(const char*, const char*, const int*, const int*, const int*, const float*, const float*, const int*, const float*, const int*, const float*, float*, const int*); #endif #include #include #if !defined(GEMM) # define CONCATENATE_AUX(A, B) A##B # define CONCATENATE(A, B) CONCATENATE_AUX(A, B) # define GEMM CONCATENATE(GEMM_, TYPE) #endif #if !defined(__INTEL_MKL__) || (20190003 <= (10000*__INTEL_MKL__+__INTEL_MKL_UPDATE__)) # define NOFALLBACK #endif int main(int argc, char* argv[]) { /* batch-size is used to stream matrix-operands from memory */ const int batchsize = (1 < argc ? atoi(argv[1]) : 0/*auto*/); /* default: M, N, and K are 13, 5, and 7 respectively */ const int m = (2 < argc ? atoi(argv[2]) : 13); const int n = (3 < argc ? atoi(argv[3]) : 5); const int k = (4 < argc ? atoi(argv[4]) : 7); /* leading dimensions are made multiples of the size of a cache-line */ const int lda = (5 < argc ? (m < atoi(argv[5]) ? atoi(argv[5]) : m) : (int)(((sizeof(TYPE) * m + PAD - 1) & ~(PAD - 1)) / sizeof(TYPE))); const int ldb = (6 < argc ? (k < atoi(argv[6]) ? atoi(argv[6]) : k) : (int)(((sizeof(TYPE) * k + PAD - 1) & ~(PAD - 1)) / sizeof(TYPE))); const int ldc = (7 < argc ? (m < atoi(argv[7]) ? atoi(argv[7]) : m) : (int)(((sizeof(TYPE) * m + PAD - 1) & ~(PAD - 1)) / sizeof(TYPE))); /* micro-kernels are limited to certain alpha- and beta-values */ const char transa = 'n', transb = 'n'; const TYPE alpha = 1, beta = 1; /* calculate matrix sizes incl. 
padded elements */ const size_t na = ((sizeof(TYPE) * lda * k + PAD - 1) & ~(PAD - 1)) / sizeof(TYPE); const size_t nb = ((sizeof(TYPE) * ldb * n + PAD - 1) & ~(PAD - 1)) / sizeof(TYPE); const size_t nc = ((sizeof(TYPE) * ldc * n + PAD - 1) & ~(PAD - 1)) / sizeof(TYPE); /* calculate default batch-size to hit work-set size of approx. 2 GB */ const int size = (0 >= batchsize ? (int)((2ULL << 30/*2 GB*/) / (sizeof(TYPE) * (na + nb + nc))) : batchsize); #if defined(SHUFFLE) const size_t shuffle = libxsmm_shuffle((unsigned int)size); #endif /* allocate A, B, and C matrix buffers */ void *const va = malloc(sizeof(TYPE) * na * size + PAD - 1); void *const vb = malloc(sizeof(TYPE) * nb * size + PAD - 1); void *const vc = malloc(sizeof(TYPE) * nc * size + PAD - 1); /* align memory according to PAD */ TYPE *const a = (TYPE*)(((uintptr_t)va + PAD - 1) & ~(PAD - 1)); TYPE *const b = (TYPE*)(((uintptr_t)vb + PAD - 1) & ~(PAD - 1)); TYPE *const c = (TYPE*)(((uintptr_t)vc + PAD - 1) & ~(PAD - 1)); const double scale = 1.0 / size; double duration = 0; int i; #if defined(mkl_jit_create_sgemm) && defined(mkl_jit_create_dgemm) void* jitter; CONCATENATE(GEMM, _jit_kernel_t) kernel = NULL; if (MKL_JIT_SUCCESS == CONCATENATE(mkl_cblas_jit_create_, GEMM)(&jitter, MKL_COL_MAJOR, ('N' == transa || 'n' == transa) ? MKL_NOTRANS : MKL_TRANS, ('N' == transb || 'n' == transb) ? 
MKL_NOTRANS : MKL_TRANS, m, n, k, alpha, lda, ldb, beta, ldc)) { /* explicitly dispatch a kernel according to parameters */ kernel = CONCATENATE(CONCATENATE(mkl_jit_get_, GEMM), _ptr)(jitter); } else jitter = NULL; #endif /* initialize data according to touch-first policy */ #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < size; ++i) { #if defined(SHUFFLE) const int j = (i * shuffle) % size; #else const int j = i; #endif init(25 + i, a + j * na, m, k, lda, scale); init(75 + i, b + j * nb, k, n, ldb, scale); if (0 != beta) { /* no need to initialize for beta=0 */ init(42 + i, c + j * nc, m, n, ldc, scale); } } #if defined(mkl_jit_create_sgemm) && defined(mkl_jit_create_dgemm) if (NULL != jitter) { # if defined(_OPENMP) # pragma omp parallel # endif { /* OpenMP thread pool is already populated (parallel region) */ # if defined(_OPENMP) # pragma omp single # endif duration = seconds(); # if defined(_OPENMP) # pragma omp for private(i) # endif for (i = 0; i < size; ++i) { #if defined(SHUFFLE) const int j = (i * shuffle) % size; #else const int j = i; #endif kernel(jitter, a + STREAM_A(j * na), b + STREAM_B(j * nb), c + STREAM_C(j * nc)); } } duration = seconds() - duration; } else # if defined(NOFALLBACK) if (0/*false*/) # endif #endif { #if defined(_OPENMP) # pragma omp parallel #endif { /* OpenMP thread pool is already populated (parallel region) */ #if defined(_OPENMP) # pragma omp single #endif duration = seconds(); #if defined(_OPENMP) # pragma omp for private(i) #endif for (i = 0; i < size; ++i) { #if defined(SHUFFLE) const int j = (i * shuffle) % size; #else const int j = i; #endif GEMM(&transa, &transb, &m, &n, &k, &alpha, a + STREAM_A(j * na), &lda, b + STREAM_B(j * nb), &ldb, &beta, c + STREAM_C(j * nc), &ldc); } } duration = seconds() - duration; } if (0 < duration) { const double gflops = 2.0 * m * n * k * 1E-9; printf("%.1f GFLOPS/s\n", gflops / duration * size); } printf("%.1f ms\n", 1000.0 * duration); { /* calculate 
checksum */ double check = 0; for (i = 0; i < size; ++i) { const double cn = norm(c + STREAM_C(i * nc), m, n, ldc); if (check < cn) check = cn; } printf("\n%f (check)\n", check); } #if defined(mkl_jit_create_sgemm) && defined(mkl_jit_create_dgemm) mkl_jit_destroy(jitter); #endif free(va); free(vb); free(vc); return EXIT_SUCCESS; } libxsmm-1.17/samples/magazine/magazine_blas.vcxproj000066400000000000000000000536361415223013700226000ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 magazine_blas {D014A987-5908-4BF1-9561-601A3301D103} 10.0 Application Disabled Disabled v142 true Sequential Application true true Disabled Disabled v142 Sequential Application true Disabled Disabled v142 true Sequential Application Disabled Disabled v142 true Sequential true Application true Disabled Disabled v142 Sequential Application true Disabled Disabled true v142 Sequential <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(MKLROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;MKL_DIRECT_CALL_SEQ;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;MKL_DIRECT_CALL_SEQ;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;MKL_DIRECT_CALL_SEQ;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;MKL_DIRECT_CALL_SEQ;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 
SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;MKL_DIRECT_CALL_SEQ;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;MKL_DIRECT_CALL_SEQ;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/magazine/magazine_blaze.cpp000066400000000000000000000142511415223013700220310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "magazine.h" #if !defined(__BLAZE) && 0 # define __BLAZE #endif #if defined(__BLAZE) # if !defined(BLAZE_USE_SHARED_MEMORY_PARALLELIZATION) /* Example uses outer parallelism hence Blaze-internal parallelism is disabled */ # define BLAZE_USE_SHARED_MEMORY_PARALLELIZATION 0 # endif # define _mm512_setzero_epi16 _mm512_setzero_si512 # define _mm512_setzero_epi8 _mm512_setzero_si512 # include #endif #include #include int main(int argc, char* argv[]) { #if defined(__BLAZE) typedef TYPE T; typedef blaze::CustomMatrix matrix_type; /* batch-size is used to stream matrix-operands from memory */ const int batchsize = (1 < argc ? atoi(argv[1]) : 0/*auto*/); /* default: M, N, and K are 13, 5, and 7 respectively */ const int m = (2 < argc ? atoi(argv[2]) : 13); const int n = (3 < argc ? atoi(argv[3]) : 5); const int k = (4 < argc ? atoi(argv[4]) : 7); /* leading dimensions are used to each pad (row-major!) */ const int lda = (5 < argc ? (m < atoi(argv[5]) ? atoi(argv[5]) : m) : static_cast(((sizeof(T) * m + PAD - 1) & ~(PAD - 1)) / sizeof(T))); const int ldb = (6 < argc ? (k < atoi(argv[6]) ? atoi(argv[6]) : k) : static_cast(((sizeof(T) * k + PAD - 1) & ~(PAD - 1)) / sizeof(T))); const int ldc = (7 < argc ? (m < atoi(argv[7]) ? atoi(argv[7]) : m) : static_cast(((sizeof(T) * m + PAD - 1) & ~(PAD - 1)) / sizeof(T))); #if 0 const char transa = 'n', transb = 'n'; #endif const T alpha = 1, beta = 1; /* calculate matrix sizes incl. padded elements */ const size_t na = ((sizeof(T) * lda * k + PAD - 1) & ~(PAD - 1)) / sizeof(T); const size_t nb = ((sizeof(T) * ldb * n + PAD - 1) & ~(PAD - 1)) / sizeof(T); const size_t nc = ((sizeof(T) * ldc * n + PAD - 1) & ~(PAD - 1)) / sizeof(T); /* calculate default batch-size to hit work-set size of approx. 2 GB */ const int size = (0 >= batchsize ? 
static_cast((2ULL << 30/*2 GB*/) / (sizeof(T) * (na + nb + nc))) : batchsize); #if defined(SHUFFLE) const size_t shuffle = libxsmm_shuffle((unsigned int)size); #endif size_t sa = sizeof(T) * na * size + PAD - 1; size_t sb = sizeof(T) * nb * size + PAD - 1; size_t sc = sizeof(T) * nc * size + PAD - 1; /* allocate A, B, and C matrix buffers */ void *const va = malloc(sa), *const vb = malloc(sb), *const vc = malloc(sc), *wa = va, *wb = vb, *wc = vc; /* align memory according to PAD */ #if defined(PAD) && (1 < (PAD)) T *const pa = static_cast(std::align(PAD, sa - PAD + 1, wa, sa)); T *const pb = static_cast(std::align(PAD, sb - PAD + 1, wb, sb)); T *const pc = static_cast(std::align(PAD, sc - PAD + 1, wc, sc)); #else T *const pa = static_cast(wa); T *const pb = static_cast(wb); T *const pc = static_cast(wc); #endif const double scale = 1.0 / size; blaze::timing::WcTimer timer; /* initialize data according to touch-first policy */ #if defined(_OPENMP) # pragma omp parallel for #endif for (int i = 0; i < size; ++i) { #if defined(SHUFFLE) const int j = (i * shuffle) % size; #else const int j = i; #endif init(25 + i, pa + j * na, m, k, lda, scale); init(75 + i, pb + j * nb, k, n, ldb, scale); if (0 != beta) { /* no need to initialize for beta=0 */ init(42 + i, pc + j * nc, m, n, ldc, scale); } } #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) # pragma omp single #endif timer.start(); #if defined(_OPENMP) # pragma omp for #endif for (int i = 0; i < size; ++i) { #if defined(SHUFFLE) const int j = (i * shuffle) % size; #else const int j = i; #endif const matrix_type a(pa + STREAM_A(j * na), m, k, lda); const matrix_type b(pb + STREAM_B(j * nb), k, n, ldb); matrix_type c(pc + STREAM_C(j * nc), m, n, ldc); /** * Expression templates attempt to delay evaluation until the sequence point * is reached, or an "expression object" goes out of scope and hence must * materialize the effect. 
Ideally, a complex expression is mapped to the * best possible implementation, e.g., c = alpha * a * b + beta * c may be * mapped to GEMM or definitely omits alpha*a in case of alpha=1, or similar * for special cases for beta=0 and beta=1. * However, to not rely on an ideal transformation a *manually specialized* * expression is written for, e.g., alpha=1 and beta=1 (c += a * b). * NOTE: changing alpha or beta from above may not have an effect * depending on what is selected below (expression). */ #if 0 /* alpha=1 anyway */ c = alpha * a * b + beta * c; #elif 0 (void)alpha; /* unused */ c = a * b + beta * c; #elif 0 /* beta=0 */ (void)alpha; /* unused */ (void)beta; /* unused */ c = a * b; #else /* beta=1 */ (void)alpha; /* unused */ (void)beta; /* unused */ c += a * b; #endif } } timer.end(); if (0 < timer.total()) { const double gflops = 2.0 * m * n * k * 1E-9; printf("%.1f GFLOPS/s\n", gflops / timer.total() * size); } printf("%.1f ms\n", 1000.0 * timer.total()); { /* calculate checksum */ double check = 0; for (int i = 0; i < size; ++i) { const double cn = norm(pc + STREAM_C(i * nc), m, n, ldc); if (check < cn) check = cn; } printf("\n%f (check)\n", check); } free(va); free(vb); free(vc); return EXIT_SUCCESS; #else (void)argc; /* unused */ (void)argv; /* unused */ return EXIT_FAILURE; #endif } libxsmm-1.17/samples/magazine/magazine_blaze.vcxproj000066400000000000000000000477431415223013700227560ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 magazine_blaze {9D98361B-9BB5-485C-BAC7-AFBA216B973E} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(BLAZEROOT);%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console Console MaxSpeed $(BLAZEROOT);%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true Console X64 Full $(BLAZEROOT);%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console Console X64 
MaxSpeed $(BLAZEROOT);%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true Console Disabled $(BLAZEROOT);%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT Console X64 Disabled $(BLAZEROOT);%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT Console libxsmm-1.17/samples/magazine/magazine_eigen.cpp000066400000000000000000000157441415223013700220330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "magazine.h" #if !defined(__EIGEN) && 0 # define __EIGEN #endif #if defined(__EIGEN) # if !defined(EIGEN_DONT_PARALLELIZE) # define EIGEN_DONT_PARALLELIZE # endif # if defined(EIGEN_USE_MKL_ALL) # undef EIGEN_USE_MKL_ALL # endif # include # if defined(__EIGEN_TIMER) # include # endif #endif #include #include #if defined(__EIGEN) template struct stride_helper { stride_helper(int pad_a, int pad_b, int pad_c): a(pad_a), b(pad_b), c(pad_c) {} /* dynamic strides may slow-down also if lda == m, etc. */ Eigen::OuterStride a, b, c; }; template<> struct stride_helper { stride_helper(...) {} Eigen::OuterStride<0> a, b, c; }; #endif int main(int argc, char* argv[]) { #if defined(__EIGEN) typedef TYPE T; typedef Eigen::Matrix matrix_type; /* batch-size is used to stream matrix-operands from memory */ const int batchsize = (1 < argc ? atoi(argv[1]) : 0/*auto*/); /* default: M, N, and K are 13, 5, and 7 respectively */ const int m = (2 < argc ? atoi(argv[2]) : 13); const int n = (3 < argc ? atoi(argv[3]) : 5); const int k = (4 < argc ? atoi(argv[4]) : 7); /* leading dimensions are used to each pad (row-major!) */ const int lda = (5 < argc ? (m < atoi(argv[5]) ? atoi(argv[5]) : m) : static_cast(((sizeof(T) * m + PAD - 1) & ~(PAD - 1)) / sizeof(T))); const int ldb = (6 < argc ? (k < atoi(argv[6]) ? atoi(argv[6]) : k) : static_cast(((sizeof(T) * k + PAD - 1) & ~(PAD - 1)) / sizeof(T))); const int ldc = (7 < argc ? (m < atoi(argv[7]) ? atoi(argv[7]) : m) : static_cast(((sizeof(T) * m + PAD - 1) & ~(PAD - 1)) / sizeof(T))); /* Eigen specifies leading dimensions per "outer stride" */ stride_helper<(sizeof(T) stride(lda, ldb, ldc); #if 0 const char transa = 'n', transb = 'n'; #endif const T alpha = 1, beta = 1; /* calculate matrix sizes incl. 
padded elements */ const size_t na = ((sizeof(T) * lda * k + PAD - 1) & ~(PAD - 1)) / sizeof(T); const size_t nb = ((sizeof(T) * ldb * n + PAD - 1) & ~(PAD - 1)) / sizeof(T); const size_t nc = ((sizeof(T) * ldc * n + PAD - 1) & ~(PAD - 1)) / sizeof(T); /* calculate default batch-size to hit work-set size of approx. 2 GB */ const int size = (0 >= batchsize ? static_cast((2ULL << 30/*2 GB*/) / (sizeof(T) * (na + nb + nc))) : batchsize); #if defined(SHUFFLE) const size_t shuffle = libxsmm_shuffle((unsigned int)size); #endif size_t sa = sizeof(T) * na * size + PAD - 1; size_t sb = sizeof(T) * nb * size + PAD - 1; size_t sc = sizeof(T) * nc * size + PAD - 1; /* allocate A, B, and C matrix buffers */ void *const va = malloc(sa), *const vb = malloc(sb), *const vc = malloc(sc), *wa = va, *wb = vb, *wc = vc; /* align memory according to PAD */ #if defined(PAD) && (1 < (PAD)) T *const pa = static_cast(std::align(PAD, sa - PAD + 1, wa, sa)); T *const pb = static_cast(std::align(PAD, sb - PAD + 1, wb, sb)); T *const pc = static_cast(std::align(PAD, sc - PAD + 1, wc, sc)); #else T *const pa = static_cast(wa); T *const pb = static_cast(wb); T *const pc = static_cast(wc); #endif const double scale = 1.0 / size; double duration = 0; #if defined(__EIGEN_TIMER) Eigen::BenchTimer timer; #endif /* initialize data according to touch-first policy */ #if defined(_OPENMP) # pragma omp parallel for #endif for (int i = 0; i < size; ++i) { #if defined(SHUFFLE) const int j = (i * shuffle) % size; #else const int j = i; #endif init(25 + i, pa + j * na, m, k, lda, scale); init(75 + i, pb + j * nb, k, n, ldb, scale); if (0 != beta) { /* no need to initialize for beta=0 */ init(42 + i, pc + j * nc, m, n, ldc, scale); } } #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) # pragma omp single #endif #if defined(__EIGEN_TIMER) timer.start(); #else duration = seconds(); #endif #if defined(_OPENMP) # pragma omp for #endif for (int i = 0; i < size; ++i) { #if defined(SHUFFLE) 
const int j = (i * shuffle) % size; #else const int j = i; #endif /* using "matrix_type" instead of "auto" induces an unnecessary copy */ const auto a = matrix_type::Map/*Aligned*/(pa + STREAM_A(j * na), m, k, stride.a); const auto b = matrix_type::Map/*Aligned*/(pb + STREAM_B(j * nb), k, n, stride.b); auto c = matrix_type::Map/*Aligned*/(pc + STREAM_C(j * nc), m, n, stride.c); /** * Expression templates attempt to delay evaluation until the sequence point * is reached, or an "expression object" goes out of scope and hence must * materialize the effect. Ideally, a complex expression is mapped to the * best possible implementation, e.g., c = alpha * a * b + beta * c may be * mapped to GEMM or definitely omits alpha*a in case of alpha=1, or similar * for special cases for beta=0 and beta=1. However, to not rely on an ideal * transformation a *manually specialized* expression is written for, e.g., * alpha=1 and beta=1 (c += a * b) or tweaked manually ("noalias"). * NOTE: changing alpha or beta from above may not have an effect * depending on what is selected below (expression). 
*/ #if 0 /* alpha=1 anyway */ c.noalias() = alpha * a * b + beta * c; #elif 0 (void)alpha; /* unused */ c.noalias() = a * b + beta * c; #elif 0 /* beta=0 */ (void)alpha; /* unused */ (void)beta; /* unused */ c.noalias() = a * b; #else /* beta=1 */ (void)alpha; /* unused */ (void)beta; /* unused */ c.noalias() += a * b; #endif } } #if defined(__EIGEN_TIMER) timer.stop(); duration = timer.total(); #else duration = seconds() - duration; #endif if (0 < duration) { const double gflops = 2.0 * m * n * k * 1E-9; printf("%.1f GFLOPS/s\n", gflops / duration * size); printf("%.1f ms\n", 1000.0 * duration); } { /* calculate checksum */ double check = 0; for (int i = 0; i < size; ++i) { const double cn = norm(pc + STREAM_C(i * nc), m, n, ldc); if (check < cn) check = cn; } printf("\n%f (check)\n", check); } free(va); free(vb); free(vc); return EXIT_SUCCESS; #else (void)argc; /* unused */ (void)argv; /* unused */ return EXIT_FAILURE; #endif } libxsmm-1.17/samples/magazine/magazine_eigen.vcxproj000066400000000000000000000477431415223013700227500ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 magazine_eigen {AE28E17A-6A33-463E-B586-F7B7501256F1} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(EIGENROOT);%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console Console MaxSpeed $(EIGENROOT);%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true Console X64 Full $(EIGENROOT);%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console Console X64 MaxSpeed $(EIGENROOT);%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 
3948,10373,10382 HOST 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true Console Disabled $(EIGENROOT);%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT Console X64 Disabled $(EIGENROOT);%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT Console libxsmm-1.17/samples/magazine/magazine_xsmm.c000066400000000000000000000155341415223013700213650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include "magazine.h" #if !defined(SHUFFLE) # include #endif #if 0 /* auto-dispatch SMM kernel */ # define AUTO #endif #if 0 /* disable auto-prefetch */ # define NOPREFETCH #endif int main(int argc, char* argv[]) { /* batch-size is used to stream matrix-operands from memory */ const int batchsize = (1 < argc ? atoi(argv[1]) : 0/*auto*/); /* default: M, N, and K are 13, 5, and 7 respectively */ const libxsmm_blasint m = (2 < argc ? 
atoi(argv[2]) : 13); const libxsmm_blasint n = (3 < argc ? atoi(argv[3]) : 5); const libxsmm_blasint k = (4 < argc ? atoi(argv[4]) : 7); /* leading dimensions are made multiples of the size of a cache-line */ const libxsmm_blasint lda = (5 < argc ? LIBXSMM_MAX(atoi(argv[5]), m) : (libxsmm_blasint)(LIBXSMM_UP2(sizeof(TYPE) * m, PAD) / sizeof(TYPE))); const libxsmm_blasint ldb = (6 < argc ? LIBXSMM_MAX(atoi(argv[6]), k) : (libxsmm_blasint)(LIBXSMM_UP2(sizeof(TYPE) * k, PAD) / sizeof(TYPE))); const libxsmm_blasint ldc = (7 < argc ? LIBXSMM_MAX(atoi(argv[7]), m) : (libxsmm_blasint)(LIBXSMM_UP2(sizeof(TYPE) * m, PAD) / sizeof(TYPE))); /* micro-kernels are limited to certain alpha- and beta-values */ const char transa = 'n', transb = 'n'; const TYPE alpha = 1, beta = 1; /* calculate matrix sizes incl. padded elements */ const size_t na = LIBXSMM_UP2(sizeof(TYPE) * lda * k, PAD) / sizeof(TYPE); const size_t nb = LIBXSMM_UP2(sizeof(TYPE) * ldb * n, PAD) / sizeof(TYPE); const size_t nc = LIBXSMM_UP2(sizeof(TYPE) * ldc * n, PAD) / sizeof(TYPE); /* calculate default batch-size to hit work-set size of approx. 2 GB */ const int size = (0 >= batchsize ? (int)((2ULL << 30/*2 GB*/) / (sizeof(TYPE) * (na + nb + nc))) : batchsize); #if defined(SHUFFLE) const size_t shuffle = libxsmm_shuffle((unsigned int)size); #endif /* allocate A, B, and C matrix buffers */ TYPE *const a = (TYPE*)libxsmm_aligned_malloc(sizeof(TYPE) * na * size, LIBXSMM_CACHELINE); TYPE *const b = (TYPE*)libxsmm_aligned_malloc(sizeof(TYPE) * nb * size, LIBXSMM_CACHELINE); TYPE *const c = (TYPE*)libxsmm_aligned_malloc(sizeof(TYPE) * nc * size, LIBXSMM_CACHELINE); const double scale = 1.0 / size; libxsmm_timer_tickint start; double duration; int i, j; /** * LIBXSMM's C interface really is type-specific, and the helper macros (such as LIBXSMM_MMFUNCTION_TYPE) * are only for "entertainment". 
The C++ interface on the other hand provides overloaded functions * and some helpers for more type-generic programming tasks (e.g., libxsmm_mmfunction). */ #if !defined(AUTO) /* explicitly dispatch a kernel according to parameters */ const int flags = LIBXSMM_GEMM_FLAGS(transa, transb); # if !defined(NOPREFETCH) && (STREAM_A(1) || STREAM_B(1) || STREAM_C(1)) /* prefetch */ const int prefetch = LIBXSMM_PREFETCH_AUTO; # else const int prefetch = LIBXSMM_PREFETCH_NONE; # endif union { /* convert between fn.ptr and (data)pointer */ LIBXSMM_MMFUNCTION_TYPE(TYPE) fun; const void* ptr; } xmm; xmm.fun = LIBXSMM_MMDISPATCH_SYMBOL(TYPE)(m, n, k, &lda, &ldb, &ldc, &alpha, &beta, &flags, &prefetch); #endif /* initialize data according to touch-first policy */ #if defined(_OPENMP) # pragma omp parallel for private(i, j) #endif for (i = 0; i < size; ++i) { #if defined(SHUFFLE) j = (i * shuffle) % size; #else j = i; #endif init(25 + i, a + j * na, (int)m, (int)k, (int)lda, scale); init(75 + i, b + j * nb, (int)k, (int)n, (int)ldb, scale); if (LIBXSMM_NEQ(0, beta)) { /* no need to initialize for beta=0 */ init(42 + i, c + j * nc, (int)m, (int)n, (int)ldc, scale); } } #if defined(_OPENMP) # pragma omp parallel #endif { /* OpenMP thread pool is already populated (parallel region) */ #if defined(_OPENMP) # pragma omp single #endif start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp for private(i, j) #endif for (i = 0; i < size - 1; ++i) { #if defined(SHUFFLE) # if !defined(AUTO) && !defined(NOPREFETCH) && (STREAM_A(1) || STREAM_B(1) || STREAM_C(1)) /* prefetch */ const int p = ((i + 1) * shuffle) % size; # endif j = (i * shuffle) % size; #else # if !defined(AUTO) && !defined(NOPREFETCH) && (STREAM_A(1) || STREAM_B(1) || STREAM_C(1)) /* prefetch */ const int p = i + 1; /* next location */ # endif j = i; #endif #if defined(AUTO) libxsmm_dgemm(&transa, &transb, &m, &n, &k, &alpha, a + STREAM_A(j * na), &lda, b + STREAM_B(j * nb), &ldb, &beta, c + STREAM_C(SYNC(j, nc, size)), 
&ldc); #elif !defined(NOPREFETCH) && (STREAM_A(1) || STREAM_B(1) || STREAM_C(1)) /* prefetch */ xmm.fun(a + STREAM_A(j * na), b + STREAM_B(j * nb), c + STREAM_C(SYNC(j, nc, size)), a + STREAM_A(p * na), b + STREAM_B(p * nb), c + STREAM_C(SYNC(p, nc, size))); #else xmm.fun(a + STREAM_A(j * na), b + STREAM_B(j * nb), c + STREAM_C(SYNC(j, nc, size))); #endif } } #if defined(SHUFFLE) j = ((size - 1) * shuffle) % size; #else j = size - 1; #endif #if defined(AUTO) libxsmm_dgemm(&transa, &transb, &m, &n, &k, &alpha, a + STREAM_A(j * na), &lda, b + STREAM_B(j * nb), &ldb, &beta, c + STREAM_C(SYNC(j, nc, size)), &ldc); #elif !defined(NOPREFETCH) && (STREAM_A(1) || STREAM_B(1) || STREAM_C(1)) /* prefetch */ xmm.fun(a + STREAM_A(j * na), b + STREAM_B(j * nb), c + STREAM_C(SYNC(j, nc, size)), a + STREAM_A(j * na), b + STREAM_B(j * nb), c + STREAM_C(SYNC(j, nc, size))); #else xmm.fun(a + STREAM_A(j * na), b + STREAM_B(j * nb), c + STREAM_C(SYNC(j, nc, size))); #endif duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { libxsmm_kernel_info info; #if defined(AUTO) /* no explicit kernel hence no query */ info.nflops = 2 * m * n * k; #else libxsmm_get_kernel_info(xmm.ptr, &info); #endif printf("%.1f GFLOPS/s\n", (1E-9 * info.nflops) / duration * size); } printf("%.1f ms\n", 1000.0 * duration); { /* calculate checksum */ double check = 0; for (i = 0; i < size; ++i) { const double cn = norm(c + STREAM_C(i * nc), (int)m, (int)n, (int)ldc); if (check < cn) check = cn; } printf("\n%f (check)\n", check); } libxsmm_free(a); libxsmm_free(b); libxsmm_free(c); return EXIT_SUCCESS; } libxsmm-1.17/samples/magazine/magazine_xsmm.vcxproj000066400000000000000000000541451415223013700226370ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 magazine_xsmm {0EC5E28C-54A4-403B-AA7F-541007E74116} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled 
v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 
0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) 
libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/matcopy/000077500000000000000000000000001415223013700162335ustar00rootroot00000000000000libxsmm-1.17/samples/matcopy/Makefile000066400000000000000000000101231415223013700176700ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . 
CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) OMP = 1 SYM = 1 # explore AVX/ARCH=native SSE = 0 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) $(OUTDIR)/$(OUTNAME)f .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (,$(strip $(FC))) ifneq (0,$(shell echo "$$((3>$(XSMM_GCC) || 40600<=$(FC_VERSION_NUM)))")) 
$(OUTDIR)/$(OUTNAME)f: $(OUTDIR)/.make $(FTNOBJS) $(FORTDEP) $(LIBDEP) $(EXTDEP) $(FLD) -o $@ $(FTNOBJS) $(FORTLIB) $(EXTLIB) $(MAINLIB) $(FCMTFLAGS) $(SLDFLAGS) $(LDFLAGS) $(FLDFLAGS) $(ELDFLAGS) else .PHONY: $(OUTDIR)/$(OUTNAME)f endif else .PHONY: $(OUTDIR)/$(OUTNAME)f endif $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(OBJECTS) $(LIBDEP) $(LD) -o $@ $(OBJECTS) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f 
$(XFILES) $(MODULES) endif libxsmm-1.17/samples/matcopy/matcopy.c000066400000000000000000000073731415223013700200650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include #include #include #include #if !defined(ELEM_TYPE) # define ELEM_TYPE float #endif int main(int argc, char* argv[]) { const unsigned int m = (unsigned int)LIBXSMM_MAX(1 < argc ? atoi(argv[1]) : 16, 0); const unsigned int n = (unsigned int)LIBXSMM_MAX(2 < argc ? atoi(argv[2]) : 0, (int)m); const unsigned int ldi = (unsigned int)LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 0, (int)m); const unsigned int ldo = (unsigned int)LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : 0, (int)m); const int unroll = (5 < argc ? atoi(argv[5]) : 1), prefetch = (6 < argc ? atoi(argv[6]) : 0); const int flags = ((7 < argc && 0 != atoi(argv[7])) ? LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE : 0); const unsigned int iters = (unsigned int)LIBXSMM_MAX(8 < argc ? atoi(argv[8]) : 0, 1); /* we should modify to test all data-types */ const libxsmm_mcopy_descriptor* desc; libxsmm_xmcopyfunction kernel; libxsmm_descriptor_blob blob; libxsmm_timer_tickint l_start; libxsmm_timer_tickint l_end; unsigned int error = 0, i, j; ELEM_TYPE *a, *b; double copy_time; libxsmm_init(); printf("This is a tester for JIT matcopy kernels!\n"); desc = libxsmm_mcopy_descriptor_init(&blob, sizeof(ELEM_TYPE), m, n, ldo, ldi, flags, prefetch, &unroll); a = (ELEM_TYPE*)((0 < n && 0 < ldi) ? 
malloc(sizeof(ELEM_TYPE) * n * ldi) : NULL); b = (ELEM_TYPE*)((0 < n && 0 < ldo) ? malloc(sizeof(ELEM_TYPE) * n * ldo) : NULL); if (NULL == a || NULL == b) { printf("buffer allocation failed!\n"); free(a); free(b); exit(EXIT_FAILURE); } assert(NULL != a && NULL != b); for (i = 0; i < n; ++i) { for (j = 0; j < ldi; ++j) { a[j+ldi*i] = (ELEM_TYPE)rand(); if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & flags) && j < m) { b[j+ldo*i] = (ELEM_TYPE)rand(); } } for (j = m; j < ldo; ++j) { b[j+ldo*i] = (ELEM_TYPE)0xCD; } } /* test dispatch call */ kernel = libxsmm_dispatch_mcopy(desc); if (kernel == 0) { printf("JIT error -> exit!!!!\n"); exit(EXIT_FAILURE); } /* let's call */ kernel(a, &ldi, b, &ldo, &a[128]); l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { kernel(a, &ldi, b, &ldo, &a[128]); } l_end = libxsmm_timer_tick(); copy_time = libxsmm_timer_duration(l_start, l_end); for (i = 0; i < n; ++i) { for (j = 0; j < m; ++j) { if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & flags)) { if (LIBXSMM_NEQ(0, b[j+ldo*i])) { printf("ERROR!!!\n"); error = 1; i = n; break; } } else if (LIBXSMM_NEQ(a[j+ldi*i], b[j+ldo*i])) { printf("ERROR!!!\n"); error = 1; i = n; break; } } for (j = m; j < ldo; ++j) { if (LIBXSMM_NEQ((ELEM_TYPE)0xCD, b[j+ldo*i])) { printf("ERROR!!!\n"); error = 1; i = n; break; } } } if (error == 0) { printf("CORRECT copy!!!!\n"); printf("Time taken is\t%.5f seconds\n", copy_time); return EXIT_SUCCESS; } else return EXIT_FAILURE; } libxsmm-1.17/samples/matcopy/matcopy.f000066400000000000000000000247311415223013700200650ustar00rootroot00000000000000!=======================================================================! ! Copyright (c) Intel Corporation - All rights reserved. ! ! This file is part of the LIBXSMM library. ! ! ! ! For information on the license, see the LICENSE file. ! ! Further information: https://github.com/hfp/libxsmm/ ! ! SPDX-License-Identifier: BSD-3-Clause ! !=======================================================================! ! 
Hans Pabst (Intel Corp.) !=======================================================================! PROGRAM matcopy USE :: LIBXSMM, ONLY: LIBXSMM_BLASINT_KIND, & & libxsmm_timer_duration, & & libxsmm_timer_tick, & & libxsmm_init, & & xcopy => libxsmm_xmatcopy, & & ptr0 => libxsmm_ptr_null, & & ptr => libxsmm_ptr IMPLICIT NONE INTEGER, PARAMETER :: T = KIND(0D0) INTEGER, PARAMETER :: S = T INTEGER, PARAMETER :: W = 50 REAL(T), PARAMETER :: X = REAL(-1, T) ! pattern REAL(T), PARAMETER :: Z = REAL( 0, T) ! zero REAL(T), ALLOCATABLE, TARGET :: a1(:), b1(:) !DIR$ ATTRIBUTES ALIGN:64 :: a1, b1 INTEGER(LIBXSMM_BLASINT_KIND) :: m, n, ldi, ldo, h, i, j REAL(T), POINTER :: an(:,:,:), bn(:,:,:) DOUBLE PRECISION :: d, duration(4) INTEGER(8) :: start INTEGER :: r, nrepeat, ncount, error INTEGER :: k, nmb INTEGER :: nbytes INTEGER :: argc, check, zero CHARACTER(32) :: argv ! CHECK: 0 (OFF), 1 (ON) CALL GET_ENVIRONMENT_VARIABLE("CHECK", argv, check) IF (0.EQ.check) THEN ! check length check = 1 ! default state ELSE ! read given value READ(argv, "(I32)") check END IF ! ZERO: 0 (OFF), 1 (ZERO), 2 (COPY+ZERO) CALL GET_ENVIRONMENT_VARIABLE("ZERO", argv, zero) IF (0.EQ.zero) THEN ! check length zero = 0 ! default state ELSE ! read given value READ(argv, "(I32)") zero END IF argc = COMMAND_ARGUMENT_COUNT() IF (1 <= argc) THEN CALL GET_COMMAND_ARGUMENT(1, argv) READ(argv, "(I32)") m ELSE m = 4096 END IF IF (2 <= argc) THEN CALL GET_COMMAND_ARGUMENT(2, argv) READ(argv, "(I32)") n ELSE n = m END IF IF (3 <= argc) THEN CALL GET_COMMAND_ARGUMENT(3, argv) READ(argv, "(I32)") ldi ldi = MAX(ldi, m) ELSE ldi = m END IF IF (4 <= argc) THEN CALL GET_COMMAND_ARGUMENT(4, argv) READ(argv, "(I32)") ldo ldo = MAX(ldi, m) ELSE ldo = ldi END IF IF (5 <= argc) THEN CALL GET_COMMAND_ARGUMENT(5, argv) READ(argv, "(I32)") nrepeat ELSE nrepeat = 6 END IF IF (6 <= argc) THEN CALL GET_COMMAND_ARGUMENT(6, argv) READ(argv, "(I32)") nmb IF (0.GE.nmb) nmb = 2048 ELSE ! 
2 GB by default nmb = 2048 END IF nbytes = m * n * S ! size in Byte k = INT(ISHFT(INT(nmb,8), 20) / INT(nbytes,8)) IF (0.GE.k) k = 1 WRITE(*, "(3(A,I0),2(A,I0),A,I0,A)") & & "m=", m, " n=", n, " k=", k, " ldi=", ldi, " ldo=", ldo, & & " size=", INT(k,8) * INT(nbytes,8) / ISHFT(1, 20), "MB" CALL libxsmm_init() ALLOCATE(a1(ldi*n*k), b1(ldo*n*k)) an(1:ldi,1:n, 1:k) => a1 bn(1:ldo,1:n, 1:k) => b1 !$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(h, i, j) & !$OMP SHARED(n, k, ldi, ldo, an, bn, check) DO h = 1, k DO j = 1, n DO i = 1, ldi an(i,j,h) = initial_value(i-1, j-1, ldi*h) END DO DO i = 1, MAX(MIN(check,1),ldo) bn(i,j,h) = X END DO END DO END DO !$OMP END PARALLEL DO error = 0 duration = 0D0 ! matcopy bandwidth assumes NTS in case of copy WRITE(*, "(A)") REPEAT("-", W) DO r = 1, nrepeat IF (0.NE.zero) THEN start = libxsmm_timer_tick() DO h = 1, k !CALL libxsmm_xmatcopy(bn(:,:,h), m=m,n=n, ldi=ldi,ldo=ldo) CALL xcopy(ptr(bn(:,:,h)), ptr0(), S, m, n, ldi, ldo) END DO d = libxsmm_timer_duration(start, libxsmm_timer_tick()) IF ((0.GE.d).OR.(0.LT.diff(check, m, bn))) THEN error = 1 EXIT END IF IF (1.LT.r) duration(1) = duration(1) + d IF (0.NE.check) THEN WRITE(*, "(A,F10.1,A,1A,F10.1,A)") "LIBXSMM (zero):", 1D3 & & * d, " ms", CHAR(9), REAL(1 * k, 8) * REAL(nbytes, 8) & & / (REAL(ISHFT(1, 20), 8) * d), " MB/s" END IF END IF IF ((0.EQ.zero).OR.(1.LT.zero)) THEN start = libxsmm_timer_tick() DO h = 1, k !CALL libxsmm_xmatcopy(bn(:,:,h), an(:,:,h), m,n, ldi,ldo) CALL xcopy(ptr(bn(:,:,h)), ptr(an(:,:,h)), & & S, m, n, ldi, ldo) END DO d = libxsmm_timer_duration(start, libxsmm_timer_tick()) IF ((0.GE.d).OR.(0.LT.diff(check, m, bn, an))) THEN error = 2 EXIT END IF IF (1.LT.r) duration(2) = duration(2) + d IF (0.NE.check) THEN WRITE(*, "(A,F10.1,A,1A,F10.1,A)") "LIBXSMM (copy):", 1D3 & & * d, " ms", CHAR(9), REAL(2 * k, 8) * REAL(nbytes, 8) & & / (REAL(ISHFT(1, 20), 8) * d), " MB/s" END IF END IF ! 
skip non-LIBXSMM measurements IF (0.EQ.check) CYCLE IF (0.NE.zero) THEN start = libxsmm_timer_tick() DO h = 1, k bn(1:m,:,h) = Z END DO d = libxsmm_timer_duration(start, libxsmm_timer_tick()) IF ((0.GE.d).OR.(0.LT.diff(check, m, bn))) THEN error = 3 EXIT END IF IF (1.LT.r) duration(3) = duration(3) + d WRITE(*, "(A,F10.1,A,1A,F10.1,A)") "FORTRAN (zero):", 1D3 & & * d, " ms", CHAR(9), REAL(1 * k, 8) * REAL(nbytes, 8) & & / (REAL(ISHFT(1, 20), 8) * d), " MB/s" END IF IF ((0.EQ.zero).OR.(1.LT.zero)) THEN start = libxsmm_timer_tick() DO h = 1, k bn(1:m,:,h) = an(1:m,:,h) END DO d = libxsmm_timer_duration(start, libxsmm_timer_tick()) IF ((0.GE.d).OR.(0.LT.diff(check, m, bn, an))) THEN error = 4 EXIT END IF IF (1.LT.r) duration(4) = duration(4) + d WRITE(*, "(A,F10.1,A,1A,F10.1,A)") "FORTRAN (copy):", 1D3 & & * d, " ms", CHAR(9), REAL(2 * k, 8) * REAL(nbytes, 8) & & / (REAL(ISHFT(1, 20), 8) * d), " MB/s" END IF WRITE(*, "(A)") REPEAT("-", W) END DO DEALLOCATE(a1, b1) IF (0.EQ.error) THEN IF ((1.LT.nrepeat).OR.(0.EQ.check)) THEN ncount = MERGE(nrepeat - 1, nrepeat, 2.LT.nrepeat) IF (1.LT.ncount) THEN WRITE(*, "(A,I0,A)") "Arithmetic average of ", & & ncount, " iterations" WRITE(*, "(A)") REPEAT("-", W) END IF IF (0.LT.duration(1)) THEN WRITE(*, "(A,F10.1,A)") "LIBXSMM (zero):", & & (REAL(1*k*ncount, 8) * REAL(nbytes, 8)) & & / (REAL(ISHFT(1, 20), 8) * duration(1)), " MB/s" END IF IF (0.LT.duration(2)) THEN WRITE(*, "(A,F10.1,A)") "LIBXSMM (copy):", & & (REAL(2*k*ncount, 8) * REAL(nbytes, 8)) & & / (REAL(ISHFT(1, 20), 8) * duration(2)), " MB/s" END IF IF (0.LT.duration(3)) THEN WRITE(*, "(A,F10.1,A)") "FORTRAN (zero):", & & (REAL(1*k*ncount, 8) * REAL(nbytes, 8)) & & / (REAL(ISHFT(1, 20), 8) * duration(3)), " MB/s" END IF IF (0.LT.duration(4)) THEN WRITE(*, "(A,F10.1,A)") "FORTRAN (copy):", & & (REAL(2*k*ncount, 8) * REAL(nbytes, 8)) & & / (REAL(ISHFT(1, 20), 8) * duration(4)), " MB/s" END IF WRITE(*, "(A)") REPEAT("-", W) END IF ELSE SELECT CASE (error) CASE (1) WRITE(*, 
"(A)") "Error: LIBXSMM-zero failed!" CASE (2) WRITE(*, "(A)") "Error: LIBXSMM-copy failed!" CASE (3) WRITE(*, "(A)") "Error: FORTRAN-zero failed!" CASE (4) WRITE(*, "(A)") "Error: FORTRAN-copy failed!" CASE DEFAULT WRITE(*, "(A)") "Unknown error!" END SELECT END IF CONTAINS PURE REAL(T) FUNCTION initial_value(i, j, m) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: i, j, m initial_value = REAL(j * m + i, T) END FUNCTION PURE REAL(T) FUNCTION diff(check, m, mat, ref) INTEGER, INTENT(IN) :: check REAL(T), INTENT(IN) :: mat(:,:,:) REAL(T), INTENT(IN), OPTIONAL :: ref(:,:,:) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m INTEGER(LIBXSMM_BLASINT_KIND) :: h, i, j diff = Z IF (0.NE.check) THEN DO h = LBOUND(mat,3), UBOUND(mat,3) DO j = LBOUND(mat,2), UBOUND(mat,2) DO i = LBOUND(mat,1), m IF (PRESENT(ref)) THEN ! copy diff = MAX(diff, ABS(mat(i,j,h) - ref(i,j,h))) ELSE ! zero diff = MAX(diff, ABS(mat(i,j,h) - Z)) END IF END DO DO i = m+1, UBOUND(mat,1) diff = MAX(diff, ABS(mat(i,j,h) - X)) END DO END DO END DO END IF END FUNCTION END PROGRAM libxsmm-1.17/samples/matcopy/matcopy.vcxproj000066400000000000000000000546441415223013700213410ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 matcopy {917B25D4-AA0C-4C7F-ADF2-44847CF5864C} 10.0 Application Disabled Disabled v142 Sequential true Application true true Disabled Disabled v142 Sequential Application true Disabled Disabled v142 Sequential true Application Disabled Disabled v142 Sequential true true Application true Disabled Disabled v142 Sequential Application true Disabled Disabled true v142 Sequential <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 ProgramDatabase None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 ProgramDatabase None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/matcopy/matcopy_opentuner.py000077500000000000000000000150771415223013700223750ustar00rootroot00000000000000#!/usr/bin/env python3 ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### # # This script is based on OpenTuner's tutorial: # "Optimizing Block Matrix Multiplication". 
# import opentuner from opentuner import ConfigurationManipulator from opentuner import IntegerParameter from opentuner import MeasurementInterface from opentuner import Result import random import json import time import sys import re class MatcopyTune(MeasurementInterface): def manipulator(self): """ Define the search space by creating a ConfigurationManipulator """ self.mintilesize = 2 self.granularity = 1 assert(0 < self.granularity) minsize = max(self.mintilesize / self.granularity, 1) maxsize = minsize + self.granularity m_max = max(min(self.args.maxm, self.args.end), maxsize) n_max = max(min(self.args.maxn, self.args.end), maxsize) m_max = (m_max + self.granularity - 1) / self.granularity n_max = (n_max + self.granularity - 1) / self.granularity m_param = IntegerParameter("M", minsize, m_max) n_param = IntegerParameter("N", minsize, n_max) manipulator = ConfigurationManipulator() manipulator.add_parameter(m_param) manipulator.add_parameter(n_param) return manipulator def seed_configurations(self): m_seed = [self.args.n, self.args.m][0 != self.args.m] n_seed = [self.args.m, self.args.n][0 != self.args.n] if 0 == m_seed or 0 == n_seed: return [] else: return [{"M": max(m_seed, self.mintilesize), "N": max(n_seed, self.mintilesize)}] def objective(self): return opentuner.search.objective.MaximizeAccuracyMinimizeSize() def run(self, desired_result, input, limit): """ Compile and run a given configuration then return performance """ cfg = desired_result.configuration.data nruns = max(self.args.nruns, 1) begin = max(self.args.begin, self.mintilesize) end = max(self.args.end, self.mintilesize) m = random.randint(begin, end) n = random.randint(begin, end) if (self.args.tight): ldi = ldo = m else: ldi = max(random.randint(begin, end), m) ldo = max(random.randint(begin, end), m) kind = ["COPY", "ZERO"][self.args.zero] run_cmd = ( "CHECK=0 " + # no checks and only LIBXSMM measurement ["ZERO=0", "ZERO=1"][self.args.zero] + " LIBXSMM_M" + kind + "_M=" + 
str(self.granularity * cfg["M"]) + " LIBXSMM_M" + kind + "_N=" + str(self.granularity * cfg["N"]) + " ./matcopyf " + str(m) + " " + str(n) + " " + str(ldi) + " " + str(ldo) + " " + str(nruns)) + " " + str(self.args.nmb) run_result = self.call_program(run_cmd) if (0 == run_result["returncode"]): match = re.search( "LIBXSMM \\(" + kind.lower() + "\\):\\s+([0-9]+(\\.[0-9]*)*)", str(run_result["stdout"])) assert(match is not None) bandwidth = float(match.group(1)) assert(0 < bandwidth) kernelsize = (self.granularity**2) * cfg["M"] * cfg["N"] return Result(time=1/bandwidth, accuracy=bandwidth, size=kernelsize) else: sys.tracebacklimit = 0 raise RuntimeError("Execution failed for \"" + run_cmd + "\"!") def save_final_config(self, configuration): """ called at the end of tuning """ filename = ( "matcopy-" + str(max(self.args.begin, 1)) + "_" + str(max(self.args.end, 1)) + ["_", "_zero_"][self.args.zero] + ["_", "_tight_"][self.args.tight] + str(max(self.args.nruns, 1)) + "_" + str(self.args.nmb) + time.strftime("-%Y%m%d-%H%M%S") + ".json") print("Optimal block size written to " + filename + ": ", configuration.data) # self.manipulator().save_to_file(configuration.data, filename) with open(filename, 'w') as fd: json.dump(configuration.data, fd) # https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse def str2bool(v): argparser = opentuner.default_argparser() if isinstance(v, bool): return v if v.lower() in ('yes', 'true', 't', 'y', '1'): return True elif v.lower() in ('no', 'false', 'f', 'n', '0'): return False else: raise argparser.ArgumentTypeError('Boolean value expected.') if __name__ == "__main__": argparser = opentuner.default_argparser() argparser.add_argument( "begin", type=int, help="Begin of the range (min. 
M and N)") argparser.add_argument( "end", type=int, help="End of the range (exclusive)") argparser.add_argument( "m", type=int, default=0, nargs='?', help="Initial tile size (M)") argparser.add_argument( "n", type=int, default=0, nargs='?', help="Initial tile size (N)") argparser.add_argument( "nruns", type=int, default=1, nargs='?', help="Number of experiments per epoch") argparser.add_argument( "nmb", type=int, default=512, nargs='?', help="Problem size (MB)") argparser.add_argument( "maxm", type=int, default=160, nargs='?', help="Max. tile size (M)") argparser.add_argument( "maxn", type=int, default=160, nargs='?', help="Max. tile size (N)") argparser.add_argument( "zero", type=str2bool, nargs='?', const=True, default=False, help="Zeroing instead of copy") argparser.add_argument( "tight", type=str2bool, nargs='?', const=True, default=True, help="Use tight leading dimension") MatcopyTune.main(argparser.parse_args()) libxsmm-1.17/samples/nek/000077500000000000000000000000001415223013700153345ustar00rootroot00000000000000libxsmm-1.17/samples/nek/Makefile000066400000000000000000000121241415223013700167740ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . 
CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) # Fortran code here does not allow for PEDANTIC=2 override PEDANTIC = 1 BLAS = 1 OMP = 1 SYM = 1 # explore AVX/ARCH=native SSE = 0 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(call qndir,$(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(call qndir,$(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(call qndir,$(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(call qndir,$(CSOURCS:.c=-c.o))) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(call qndir,$(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(call qndir,$(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(call qndir,$(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(call qndir,$(F90OBJS:.F90=-f90.o))) SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) ifneq (,$(strip $(FC))) XFILES := $(OUTDIR)/axhm $(OUTDIR)/grad $(OUTDIR)/rstr $(OUTDIR)/libstream_update_kernels.$(SLIBEXT) endif .PHONY: all all: $(XFILES) 
.PHONY: compile compile: $(OBJECTS) $(FTNOBJS) $(OUTDIR)/libstream_update_kernels.$(SLIBEXT): $(OUTDIR)/.make $(BLDDIR)/stream_update_kernels-c.o $(BLDDIR)/stream_update_kernels-f.o $(AR) -rs $@ $(BLDDIR)/stream_update_kernels-c.o $(BLDDIR)/stream_update_kernels-f.o $(OUTDIR)/axhm: $(OUTDIR)/.make $(OUTDIR)/libstream_update_kernels.$(SLIBEXT) $(BLDDIR)/axhm-f.o $(BLDDIR)/mxm_std-f.o $(FORTDEP) $(LIBDEP) $(FLD) -o $@ $(BLDDIR)/axhm-f.o $(BLDDIR)/mxm_std-f.o $(OUTDIR)/libstream_update_kernels.$(SLIBEXT) \ $(FORTLIB) $(MAINLIB) $(FCMTFLAGS) $(SLDFLAGS) $(LDFLAGS) $(FLDFLAGS) $(ELDFLAGS) $(OUTDIR)/grad: $(OUTDIR)/.make $(OUTDIR)/libstream_update_kernels.$(SLIBEXT) $(BLDDIR)/grad-f.o $(BLDDIR)/mxm_std-f.o $(FORTDEP) $(LIBDEP) $(FLD) -o $@ $(BLDDIR)/grad-f.o $(BLDDIR)/mxm_std-f.o $(OUTDIR)/libstream_update_kernels.$(SLIBEXT) \ $(FORTLIB) $(MAINLIB) $(FCMTFLAGS) $(SLDFLAGS) $(LDFLAGS) $(FLDFLAGS) $(ELDFLAGS) $(OUTDIR)/rstr: $(OUTDIR)/.make $(OUTDIR)/libstream_update_kernels.$(SLIBEXT) $(BLDDIR)/rstr-f.o $(BLDDIR)/mxm_std-f.o $(FORTDEP) $(LIBDEP) $(FLD) -o $@ $(BLDDIR)/rstr-f.o $(BLDDIR)/mxm_std-f.o $(OUTDIR)/libstream_update_kernels.$(SLIBEXT) \ $(FORTLIB) $(MAINLIB) $(FCMTFLAGS) $(SLDFLAGS) $(LDFLAGS) $(FLDFLAGS) $(ELDFLAGS) $(BLDDIR)/mxm_std-f.o: $(SRCDIR)/mxm_std.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/stream_update_kernels-f.o: $(SRCDIR)/stream_update_kernels.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/axhm-f.o: $(SRCDIR)/axhm.f $(BLDDIR)/stream_update_kernels-f.o .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/grad-f.o: $(SRCDIR)/grad.f $(BLDDIR)/stream_update_kernels-f.o .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) 
-c $< -o $@ $(BLDDIR)/rstr-f.o: $(SRCDIR)/rstr.f $(BLDDIR)/stream_update_kernels-f.o .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/nek/README.md000066400000000000000000000017751415223013700166250ustar00rootroot00000000000000# NEK Sample Collection This directory contains kernels taken from Nek{Box,5000}. They aim to represent most of the matrix-matrix workloads. Please note that the [mxm_std.f](https://github.com/hfp/libxsmm/blob/master/samples/nek/mxm_std.f) source code is protected by an (US) GOVERNMENT LICENSE, and under the copyright of the University of Chicago. ## stpm Small tensor-product multiple (stpm) replicates the axhelm kernel, which computes the Laplacian with spectral elements. Usage: ```bash ./stpm m n k size1 size ``` The elements are m-by-n-by-k, mode picks the LIBXSMM interface used, and size scales the number of spectral elements. ## rstr Restriction operator transforms elements from one size to another. This occurs in multi-grid, the convection operator, and, when the sizes are the same, the local Schwarz solves. 
Usage: ```bash ./rstr m n k mm nn kk size1 size ``` The input elements are m-by-n-by-k and the output elements are mm-by-nn-by-kk. When m=mm, n=nn, k=kk, this half of a Schwarz solve. libxsmm-1.17/samples/nek/axhm.f000066400000000000000000000367231415223013700164530ustar00rootroot00000000000000!=======================================================================! ! Copyright (c) Intel Corporation - All rights reserved. ! ! This file is part of the LIBXSMM library. ! ! ! ! For information on the license, see the LICENSE file. ! ! Further information: https://github.com/hfp/libxsmm/ ! ! SPDX-License-Identifier: BSD-3-Clause ! !=======================================================================! ! Hans Pabst (Intel Corp.), Alexander Heinecke (Intel Corp.), and ! Maxwell Hutchinson (University of Chicago) !=======================================================================! PROGRAM stpm USE :: LIBXSMM, libxsmm_mmcall => libxsmm_dmmcall_abc USE :: STREAM_UPDATE_KERNELS !$ USE omp_lib IMPLICIT NONE INTEGER, PARAMETER :: T = KIND(0D0) REAL(T), PARAMETER :: alpha = 1, beta = 0 REAL(T), ALLOCATABLE, DIMENSION(:,:,:,:), TARGET :: a, b, c, d !DIR$ ATTRIBUTES ALIGN:64 :: a, b, c, d REAL(T), ALLOCATABLE, DIMENSION(:,:,:,:), TARGET :: g1, g2, g3 !DIR$ ATTRIBUTES ALIGN:64 :: g1, g2, g3 REAL(T), ALLOCATABLE, TARGET :: dx(:,:), dy(:,:), dz(:,:) REAL(T), ALLOCATABLE, TARGET, SAVE :: tm1(:,:,:) !DIR$ ATTRIBUTES ALIGN:64 :: tm1 REAL(T), ALLOCATABLE, TARGET, SAVE :: tm2(:,:,:) !DIR$ ATTRIBUTES ALIGN:64 :: tm2 REAL(T), ALLOCATABLE, TARGET, SAVE :: tm3(:,:,:) !DIR$ ATTRIBUTES ALIGN:64 :: tm3 !$OMP THREADPRIVATE(tm1, tm2, tm3) TYPE(LIBXSMM_DMMFUNCTION) :: xmm1, xmm2, xmm3 DOUBLE PRECISION :: duration, max_diff, h1, h2 INTEGER :: argc, m, n, k, routine, check INTEGER(8) :: i, j, ix, iy, iz, r, s INTEGER(8) :: size0, size1, size INTEGER(8) :: repetitions, start CHARACTER(32) :: argv argc = COMMAND_ARGUMENT_COUNT() IF (1 <= argc) THEN CALL GET_COMMAND_ARGUMENT(1, argv) READ(argv, 
"(I32)") m ELSE m = 8 END IF IF (3 <= argc) THEN CALL GET_COMMAND_ARGUMENT(3, argv) READ(argv, "(I32)") k ELSE k = m END IF IF (2 <= argc) THEN CALL GET_COMMAND_ARGUMENT(2, argv) READ(argv, "(I32)") n ELSE n = k END IF IF (4 <= argc) THEN CALL GET_COMMAND_ARGUMENT(4, argv) READ(argv, "(I32)") size1 ELSE size1 = 0 END IF IF (5 <= argc) THEN CALL GET_COMMAND_ARGUMENT(5, argv) READ(argv, "(I32)") size ELSE size = 0 ! 1 repetition by default END IF ! Initialize LIBXSMM CALL libxsmm_init() ! workload is about 2 GByte in memory by default size0 = (m * n * k) * T * 6 ! size of single element in Byte size1 = MERGE(2048_8, MERGE(size1, ISHFT(ABS(size0 * size1) & & + ISHFT(1, 20) - 1, -20), 0.LE.size1), 0.EQ.size1) size = ISHFT(MERGE(MAX(size, size1), ISHFT(ABS(size) * size0 & & + ISHFT(1, 20) - 1, -20), 0.LE.size), 20) / size0 s = ISHFT(size1, 20) / size0 repetitions = size / s duration = 0 max_diff = 0 ALLOCATE(a(m,n,k,s)) ALLOCATE(b(m,n,k,s)) ALLOCATE(c(m,n,k,s)) ALLOCATE(g1(m,n,k,s), g2(m,n,k,s), g3(m,n,k,s)) ALLOCATE(dx(m,m), dy(n,n), dz(k,k)) ! Initialize !$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(a, b, c, g1, g2, g3, m, n, k, s) DO i = 1, s DO ix = 1, m DO iy = 1, n DO iz = 1, k a(ix,iy,iz,i) = (ix + iy * m + iz * m * n) b(ix,iy,iz,i) = -(ix + iy * m + iz * m * n) c(ix,iy,iz,i) = 0. g1(ix,iy,iz,i) = 1. g2(ix,iy,iz,i) = 1. g3(ix,iy,iz,i) = 1. END DO END DO END DO END DO dx = 1.; dy = 1.; dz = 1. h1 = 1.; h2 = 1. WRITE(*, "(3(A,I0),A,I0,A,I0,A,I0)") & & "m=", m, " n=", n, " k=", k, " elements=", UBOUND(a, 4), & & " size=", size1, "MB repetitions=", repetitions CALL GETENV("CHECK", argv) READ(argv, "(I32)") check IF (0.NE.check) THEN WRITE(*, "(A)") "Calculating check..." ALLOCATE(d(m,n,k,s)) ! 
Initialize !$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(d, m, n, k, s) DO i = 1, s DO ix = 1, m DO iy = 1, n DO iz = 1, k d(ix,iy,iz,i) = REAL(0, T) END DO END DO END DO END DO !$OMP PARALLEL PRIVATE(i, j, r) DEFAULT(NONE) & !$OMP SHARED(a, b, d, dx, dy, dz, g1, g2, g3, m, n, k, h1, h2, repetitions) ALLOCATE(tm1(m,n,k), tm2(m,n,k), tm3(m*n,k,1)) tm1 = 0; tm2 = 0; tm3 = 0 DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) ! PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=m, n=n*k, k=m, & & a=dx, b=a(:,:,1,i), c=tm1(:,:,1), & & alpha=alpha, beta=beta) DO j = 1, k ! PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=m, n=n, k=n, & & a=a(:,:,j,i), b=dy, c=tm2(:,:,j), & & alpha=alpha, beta=beta) END DO ! PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=m*n, n=k, k=k, & & a=a(:,:,1,i), b=dz, c=tm3(:,:,1), & & alpha=alpha, beta=beta) !DEC$ vector aligned nontemporal d(:,:,:,i) = h1 * (g1(:,:,:,i) * tm1 & & + g2(:,:,:,i) * tm2 & & + g3(:,:,:,i) * RESHAPE(tm3, (/m,n,k/)))& & + h2 * b(:,:,:,i) * a(:,:,:,i) END DO END DO ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL END IF c(:,:,:,:) = REAL(0, T) WRITE(*, "(A)") "Streamed... (BLAS)" !$OMP PARALLEL PRIVATE(i, j, r, start) DEFAULT(NONE) & !$OMP SHARED(a, dx, dy, dz, g1, g2, g3, b, c, m, n, k, h1, h2, duration, repetitions) ALLOCATE(tm1(m,n,k), tm2(m,n,k), tm3(m,n,k)) tm1 = 0; tm2 = 0; tm3 = 0 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) ! PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=m, n=n*k, k=m, & & a=dx, b=a(:,:,1,i), c=tm1(:,:,1), & & alpha=alpha, beta=beta) DO j = 1, k ! PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=m, n=n, k=n, & & a=a(:,:,j,i), b=dy, c=tm2(:,:,j), & & alpha=alpha, beta=beta) END DO ! 
PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=m*n, n=k, k=k, & & a=a(:,:,1,i), b=dz, c=tm3(:,:,1), & & alpha=alpha, beta=beta) CALL stream_update_helmholtz( & & g1(1,1,1,i), g2(1,1,1,i), g3(1,1,1,i), & & tm1(1,1,1), tm2(1,1,1), tm3(1,1,1), & & a(1,1,1,i), b(1,1,1,i), c(1,1,1,i), & & h1, h2, m*n*k) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL ! Print Performance Summary and check results CALL performance(duration, m, n, k, size) IF (check.NE.0) max_diff = MAX(max_diff, validate(d, c)) c(:,:,:,:) = REAL(0, T) WRITE(*, "(A)") "Streamed... (mxm)" !$OMP PARALLEL PRIVATE(i, j, r, start) DEFAULT(NONE) & !$OMP SHARED(a, dx, dy, dz, g1, g2, g3, b, c, m, n, k, h1, h2, duration, repetitions) ALLOCATE(tm1(m,n,k), tm2(m,n,k), tm3(m,n,k)) tm1 = 0; tm2 = 0; tm3 = 0 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) CALL mxmf2(dx, m, a(:,:,:,i), m, tm1, n*k) DO j = 1, k CALL mxmf2(a(:,:,j,i), m, dy, n, tm2(:,:,j), n) END DO CALL mxmf2(a(:,:,:,i), m*n, dz, k, tm3, k) CALL stream_update_helmholtz( & & g1(1,1,1,i), g2(1,1,1,i), g3(1,1,1,i), & & tm1(1,1,1), tm2(1,1,1), tm3(1,1,1), & & a(1,1,1,i), b(1,1,1,i), c(1,1,1,i), & & h1, h2, m*n*k) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL ! Print Performance Summary and check results CALL performance(duration, m, n, k, size) IF (check.NE.0) max_diff = MAX(max_diff, validate(d, c)) c(:,:,:,:) = REAL(0, T) WRITE(*, "(A)") "Streamed... 
(auto-dispatched)" !$OMP PARALLEL PRIVATE(i, j, r, start) DEFAULT(NONE) & !$OMP SHARED(a, b, dx, dy, dz, g1, g2, g3, c, m, n, k, h1, h2, duration, repetitions) ALLOCATE(tm1(m,n,k), tm2(m,n,k), tm3(m,n,k)) tm1 = 0; tm2 = 0; tm3 = 0 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) ! PGI: cannot deduce generic procedure (libxsmm_gemm) CALL libxsmm_dgemm(m=m, n=n*k, k=m, & & a=dx, b=a(:,:,1,i), c=tm1(:,:,1), & & alpha=alpha, beta=beta) DO j = 1, k ! PGI: cannot deduce generic procedure (libxsmm_gemm) CALL libxsmm_dgemm(m=m, n=n, k=n, & & a=a(:,:,j,i), b=dy, c=tm2(:,:,j), & & alpha=alpha, beta=beta) END DO ! PGI: cannot deduce generic procedure (libxsmm_gemm) CALL libxsmm_dgemm(m=m*n, n=k, k=k, & & a=a(:,:,1,i), b=dz, c=tm3(:,:,1), & & alpha=alpha, beta=beta) CALL stream_update_helmholtz( & & g1(1,1,1,i), g2(1,1,1,i), g3(1,1,1,i), & & tm1(1,1,1), tm2(1,1,1), tm3(1,1,1), & & a(1,1,1,i), b(1,1,1,i), c(1,1,1,i), & & h1, h2, m*n*k) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL ! Print Performance Summary and check results CALL performance(duration, m, n, k, size) IF (check.NE.0) max_diff = MAX(max_diff, validate(d, c)) c(:,:,:,:) = REAL(0, T) WRITE(*, "(A)") "Streamed... (specialized)" CALL libxsmm_dispatch(xmm1, m, n*k, m, alpha=alpha, beta=beta) CALL libxsmm_dispatch(xmm2, m, n, n, alpha=alpha, beta=beta) CALL libxsmm_dispatch(xmm3, m*n, k, k, alpha=alpha, beta=beta) IF (libxsmm_available(xmm1).AND. & & libxsmm_available(xmm2).AND. 
& & libxsmm_available(xmm3)) & & THEN !$OMP PARALLEL PRIVATE(i, j, r, start) & !DEFAULT(NONE) !$OMP SHARED(a, dx, dy, dz, g1, g2, g3, b, c, m, n, k, h1, h2, duration, repetitions, xmm1, xmm2, xmm3) ALLOCATE(tm1(m,n,k), tm2(m,n,k), tm3(m,n,k)) tm1 = 0; tm2 = 0; tm3 = 0 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) CALL libxsmm_mmcall(xmm1, dx, a(1,1,1,i), tm1) DO j = 1, k CALL libxsmm_mmcall(xmm2, a(1,1,j,i), dy, tm2(1,1,j)) END DO CALL libxsmm_mmcall(xmm3, a(1,1,1,i), dz, tm3) CALL stream_update_helmholtz( & & g1(1,1,1,i), g2(1,1,1,i), g3(1,1,1,i), & & tm1(1,1,1), tm2(1,1,1), tm3(1,1,1), & & a(1,1,1,i), b(1,1,1,i), c(1,1,1,i), & & h1, h2, m*n*k) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL ! Print Performance Summary and check results CALL performance(duration, m, n, k, size) IF (check.NE.0) max_diff = MAX(max_diff, validate(d, c)) ELSE WRITE(*,*) "Could not build specialized function(s)!" END IF ! Deallocate global arrays IF (0.NE.check) DEALLOCATE(d) DEALLOCATE(dx, dy, dz) DEALLOCATE(g1, g2, g3) DEALLOCATE(a, b, c) ! 
finalize LIBXSMM CALL libxsmm_finalize() IF ((0.NE.check).AND.(1.LT.max_diff)) STOP 1 CONTAINS FUNCTION validate(ref, test) RESULT(diff) REAL(T), DIMENSION(:,:,:,:), INTENT(IN) :: ref, test REAL(T) :: diff diff = MAXVAL((ref - test) * (ref - test)) WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "diff: ", diff END FUNCTION SUBROUTINE performance(duration, m, n, k, size) DOUBLE PRECISION, INTENT(IN) :: duration INTEGER, INTENT(IN) :: m, n, k INTEGER(8), INTENT(IN) :: size IF (0.LT.duration) THEN WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "performance:", & & size * m * n * k * (2*(m+n+k) + 2 + 4) * 1D-9 / duration, & & " GFLOPS/s" WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "bandwidth: ", & & size * m * n * k * (6) * T / (duration * ISHFT(1_8, 30)), & & " GB/s" END IF WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "duration: ", & & (1D3 * duration) / repetitions, " ms" END SUBROUTINE END PROGRAM libxsmm-1.17/samples/nek/axhm.sh000077500000000000000000000050141415223013700166300ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/nek/grad.f000066400000000000000000000351361415223013700164300ustar00rootroot00000000000000!=======================================================================! ! Copyright (c) Intel Corporation - All rights reserved. ! ! This file is part of the LIBXSMM library. ! ! ! ! For information on the license, see the LICENSE file. ! ! Further information: https://github.com/hfp/libxsmm/ ! ! SPDX-License-Identifier: BSD-3-Clause ! !=======================================================================! ! Hans Pabst (Intel Corp.), Alexander Heinecke (Intel Corp.), and ! 
Maxwell Hutchinson (University of Chicago) !=======================================================================! PROGRAM grad USE :: LIBXSMM, libxsmm_mmcall => libxsmm_dmmcall_abc USE :: STREAM_UPDATE_KERNELS !$ USE omp_lib IMPLICIT NONE INTEGER, PARAMETER :: T = KIND(0D0) REAL(T), PARAMETER :: alpha = 1, beta = 0 REAL(T), ALLOCATABLE, DIMENSION(:,:,:,:), TARGET :: a !DIR$ ATTRIBUTES ALIGN:64 :: a REAL(T), ALLOCATABLE, DIMENSION(:,:,:,:), TARGET :: cx, cy, cz !DIR$ ATTRIBUTES ALIGN:64 :: cx, cy, cz REAL(T), ALLOCATABLE, DIMENSION(:,:,:,:), TARGET :: rx, ry, rz !DIR$ ATTRIBUTES ALIGN:64 :: rx, ry, rz REAL(T), ALLOCATABLE, TARGET :: dx(:,:), dy(:,:), dz(:,:) REAL(T), ALLOCATABLE, TARGET, SAVE :: tm1(:,:,:) !DIR$ ATTRIBUTES ALIGN:64 :: tm1 REAL(T), ALLOCATABLE, TARGET, SAVE :: tm2(:,:,:) !DIR$ ATTRIBUTES ALIGN:64 :: tm2 REAL(T), ALLOCATABLE, TARGET, SAVE :: tm3(:,:,:) !DIR$ ATTRIBUTES ALIGN:64 :: tm3 !$OMP THREADPRIVATE(tm1, tm2, tm3) TYPE(LIBXSMM_DMMFUNCTION) :: xmm1, xmm2, xmm3 DOUBLE PRECISION :: duration, max_diff INTEGER :: argc, m, n, k, routine, check INTEGER(8) :: i, j, ix, iy, iz, r, s INTEGER(8) :: size0, size1, size INTEGER(8) :: repetitions, start CHARACTER(32) :: argv argc = COMMAND_ARGUMENT_COUNT() IF (1 <= argc) THEN CALL GET_COMMAND_ARGUMENT(1, argv) READ(argv, "(I32)") m ELSE m = 8 END IF IF (3 <= argc) THEN CALL GET_COMMAND_ARGUMENT(3, argv) READ(argv, "(I32)") k ELSE k = m END IF IF (2 <= argc) THEN CALL GET_COMMAND_ARGUMENT(2, argv) READ(argv, "(I32)") n ELSE n = k END IF IF (4 <= argc) THEN CALL GET_COMMAND_ARGUMENT(4, argv) READ(argv, "(I32)") size1 ELSE size1 = 0 END IF IF (5 <= argc) THEN CALL GET_COMMAND_ARGUMENT(5, argv) READ(argv, "(I32)") size ELSE size = 0 ! 1 repetition by default END IF ! Initialize LIBXSMM CALL libxsmm_init() ! workload is about 2 GByte in memory by default size0 = (m * n * k) * T * 5 ! 
size of a single stream element in Byte size1 = MERGE(2048_8, MERGE(size1, ISHFT(ABS(size0 * size1) & & + ISHFT(1, 20) - 1, -20), 0.LE.size1), 0.EQ.size1) size = ISHFT(MERGE(MAX(size, size1), ISHFT(ABS(size) * size0 & & + ISHFT(1, 20) - 1, -20), 0.LE.size), 20) / size0 s = ISHFT(size1, 20) / size0 repetitions = size / s duration = 0 max_diff = 0 ALLOCATE(cx(m,n,k,s), cy(m,n,k,s), cz(m,n,k,s)) ALLOCATE(dx(m,m), dy(n,n), dz(k,k)) ALLOCATE(a(m,n,k,s)) ! Initialize !$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(a, cx, cy, cz, m, n, k, s) DO i = 1, s DO ix = 1, m DO iy = 1, n DO iz = 1, k a(ix,iy,iz,i) = ix + iy * m + iz * m * n cx(ix,iy,iz,i) = REAL(0, T) cy(ix,iy,iz,i) = REAL(0, T) cz(ix,iy,iz,i) = REAL(0, T) END DO END DO END DO END DO dx = 1.; dy = 2.; dz = 3. WRITE(*, "(3(A,I0),A,I0,A,I0,A,I0)") & & "m=", m, " n=", n, " k=", k, " elements=", UBOUND(a, 4), & & " size=", size1, "MB repetitions=", repetitions CALL GETENV("CHECK", argv) READ(argv, "(I32)") check IF (0.NE.check) THEN ALLOCATE(rx(m,n,k,s), ry(m,n,k,s), rz(m,n,k,s)) !$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(rx, ry, rz, m, n, k, s) DO i = 1, s DO ix = 1, m DO iy = 1, n DO iz = 1, k rx(ix,iy,iz,i) = REAL(0, T) ry(ix,iy,iz,i) = REAL(0, T) rz(ix,iy,iz,i) = REAL(0, T) END DO END DO END DO END DO WRITE(*, "(A)") "Calculating check..." !$OMP PARALLEL PRIVATE(i, j, r) DEFAULT(NONE) & !$OMP SHARED(a, dx, dy, dz, rx, ry, rz, m, n, k, repetitions) DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) rx(:,:,:,i) = RESHAPE( & & MATMUL(dx, RESHAPE(a(:,:,:,i), (/m,n*k/))), & & (/m,n,k/)) DO j = 1, k ry(:,:,j,i) = MATMUL(a(:,:,j,i), dy) END DO rz(:,:,:,i) = RESHAPE( & & MATMUL(RESHAPE(a(:,:,:,i), (/m*n,k/)), dz), & & (/m,n,k/)) END DO END DO ! Deallocate thread-local arrays !$OMP END PARALLEL END IF WRITE(*, "(A)") "Streamed... 
(BLAS)" !$OMP PARALLEL PRIVATE(i, j, r, start) DEFAULT(NONE) & !$OMP SHARED(a, dx, dy, dz, cx, cy, cz, m, n, k, duration, repetitions) ALLOCATE(tm1(m,n,k), tm2(m,n,k), tm3(m,n,k)) tm1 = 0; tm2 = 0; tm3 = 0 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) ! PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=m, n=n*k, k=m, & & a=dx, b=a(:,:,1,i), c=tm1(:,:,1), & & alpha=alpha, beta=beta) CALL stream_vector_copy(tm1(1,1,1), cx(1,1,1,i), m*n*k) DO j = 1, k ! PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=m, n=n, k=n, & & a=a(:,:,j,i), b=dy, c=tm2(:,:,j), & & alpha=alpha, beta=beta) END DO CALL stream_vector_copy(tm2(1,1,1), cy(1,1,1,i), m*n*k) ! PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=m*n, n=k, k=k, & & a=a(:,:,1,i), b=dz, c=tm3(:,:,1), & & alpha=alpha, beta=beta) CALL stream_vector_copy(tm3(1,1,1), cz(1,1,1,i), m*n*k) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL ! Print Performance Summary and check results call performance(duration, m, n, k, size) IF (check.NE.0) THEN max_diff = MAX(max_diff, validate(rx, ry, rz, cx, cy, cz)) END IF WRITE(*, "(A)") "Streamed... 
(mxm)" !$OMP PARALLEL PRIVATE(i, j, r, start) DEFAULT(NONE) & !$OMP SHARED(a, dx, dy, dz, cx, cy, cz, m, n, k, duration, repetitions) ALLOCATE(tm1(m,n,k), tm2(m,n,k), tm3(m,n,k)) tm1 = 0; tm2 = 0; tm3 = 0 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) CALL mxmf2(dx, m, a(:,:,:,i), m, tm1(:,:,:), n*k) CALL stream_vector_copy(tm1(1,1,1), cx(1,1,1,i), m*n*k) DO j = 1, k CALL mxmf2(a(:,:,j,i), m, dy, n, tm2(:,:,j), n) END DO CALL stream_vector_copy(tm2(1,1,1), cy(1,1,1,i), m*n*k) CALL mxmf2(a(:,:,:,i), m*n, dz, k, tm3(:,:,:), k) CALL stream_vector_copy(tm3(1,1,1), cz(1,1,1,i), m*n*k) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL ! Print Performance Summary and check results call performance(duration, m, n, k, size) IF (check.NE.0) THEN max_diff = MAX(max_diff, validate(rx, ry, rz, cx, cy, cz)) END IF WRITE(*, "(A)") "Streamed... (auto-dispatched)" !$OMP PARALLEL PRIVATE(i, j, r, start) DEFAULT(NONE) & !$OMP SHARED(a, dx, dy, dz, cx, cy, cz, m, n, k, duration, repetitions) ALLOCATE(tm1(m,n,k), tm2(m,n,k), tm3(m,n,k)) tm1 = 0; tm2 = 0; tm3 = 0 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) ! PGI: cannot deduce generic procedure (libxsmm_gemm) CALL libxsmm_dgemm(m=m, n=n*k, k=m, & & a=dx, b=a(:,:,1,i), c=tm1(:,:,1), & & alpha=alpha, beta=beta) CALL stream_vector_copy(tm1(1,1,1), cx(1,1,1,i), m*n*k) DO j = 1, k ! PGI: cannot deduce generic procedure (libxsmm_gemm) CALL libxsmm_dgemm(m=m, n=n, k=n, & & a=a(:,:,j,i), b=dy, c=tm2(:,:,j), & & alpha=alpha, beta=beta) END DO CALL stream_vector_copy(tm2(1,1,1), cy(1,1,1,i), m*n*k) ! 
PGI: cannot deduce generic procedure (libxsmm_gemm) CALL libxsmm_dgemm(m=m*n, n=k, k=k, & & a=a(:,:,1,i), b=dz, c=tm3(:,:,1), & & alpha=alpha, beta=beta) CALL stream_vector_copy(tm3(1,1,1), cz(1,1,1,i), m*n*k) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL ! Print Performance Summary and check results call performance(duration, m, n, k, size) IF (check.NE.0) THEN max_diff = MAX(max_diff, validate(rx, ry, rz, cx, cy, cz)) END IF WRITE(*, "(A)") "Streamed... (specialized)" CALL libxsmm_dispatch(xmm1, m, n*k, m, alpha=alpha, beta=beta) CALL libxsmm_dispatch(xmm2, m, n, n, alpha=alpha, beta=beta) CALL libxsmm_dispatch(xmm3, m*n, k, k, alpha=alpha, beta=beta) IF (libxsmm_available(xmm1).AND. & & libxsmm_available(xmm2).AND. & & libxsmm_available(xmm3)) & & THEN !$OMP PARALLEL PRIVATE(i, j, r, start) & !DEFAULT(NONE) !$OMP SHARED(a, dx, dy, dz, cx, cy, cz, m, n, k, duration, repetitions, xmm1, xmm2, xmm3) ALLOCATE(tm1(m,n,k), tm2(m,n,k), tm3(m,n,k)) tm1 = 0; tm2 = 0; tm3 = 0 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) CALL libxsmm_mmcall(xmm1, dx, a(1,1,1,i), tm1(1,1,1)) CALL stream_vector_copy(tm1(1,1,1), cx(1,1,1,i), m*n*k) DO j = 1, k CALL libxsmm_mmcall(xmm2, a(1,1,j,i), dy, tm2(1,1,j)) END DO CALL stream_vector_copy(tm2(1,1,1), cy(1,1,1,i), m*n*k) CALL libxsmm_mmcall(xmm3, a(1,1,1,i), dz, tm3(1,1,1)) CALL stream_vector_copy(tm3(1,1,1), cz(1,1,1,i), m*n*k) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL ! 
Print Performance Summary and check results call performance(duration, m, n, k, size) IF (check.NE.0) THEN max_diff = MAX(max_diff, validate(rx, ry, rz, cx, cy, cz)) END IF ELSE WRITE(*,*) "Could not build specialized function(s)!" END IF ! Deallocate global arrays IF (0.NE.check) DEALLOCATE(rx, ry, rz) DEALLOCATE(dx, dy, dz) DEALLOCATE(cx, cy, cz) DEALLOCATE(a) ! finalize LIBXSMM CALL libxsmm_finalize() IF ((0.NE.check).AND.(1.LT.max_diff)) STOP 1 CONTAINS FUNCTION validate( & & refx, refy, refz, & & testx, testy, testz) & & RESULT(diff) REAL(T), DIMENSION(:,:,:,:), INTENT(IN) :: refx, refy, refz REAL(T), DIMENSION(:,:,:,:), INTENT(IN) :: testx, testy, testz real(T) :: diff diff = MAXVAL((refx - testx) * (refx - testx)) diff = MAX(MAXVAL((refy - testy) * (refy - testy)), diff) diff = MAX(MAXVAL((refz - testz) * (refz - testz)), diff) WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "diff: ", diff END FUNCTION SUBROUTINE performance(duration, m, n, k, size) DOUBLE PRECISION, INTENT(IN) :: duration INTEGER, INTENT(IN) :: m, n, k INTEGER(8), INTENT(IN) :: size IF (0.LT.duration) THEN WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "performance:", & & size * m * n * k * (2*(m+n+k) - 3) * 1D-9 / duration, & & " GFLOPS/s" WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "bandwidth: ", & & size * m * n * k * (4) * T / (duration * ISHFT(1_8, 30)), & & " GB/s" END IF WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "duration: ", & & (1D3 * duration)/repetitions, " ms" END SUBROUTINE END PROGRAM libxsmm-1.17/samples/nek/grad.sh000077500000000000000000000050141415223013700166100ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi 
libxsmm-1.17/samples/nek/mxm_std.f000066400000000000000000004657431415223013700172010ustar00rootroot00000000000000!> unrolled loop version ! ! COPYRIGHT ! !The following is a notice of limited availability of the code, and disclaimer !which must be included in the prologue of the code and in all source listings !of the code. ! !Copyright Notice ! + 2012 University of Chicago ! !Permission is hereby granted to use, reproduce, prepare derivative works, and !to redistribute to others. This software was authored by: ! ! P. Fischer: (630) 252-6018; FAX: (630) 252-5986; email: fischer@mcs.anl.gov ! Mathematics and Computer Science Division ! Argonne National Laboratory, Argonne IL 60439 ! ! M Hutchinson: maxhutch@gmail.com ! ! GOVERNMENT LICENSE ! !Portions of this material resulted from work developed under a U.S. !Government Contract and are subject to the following license: the Government !is granted for itself and others acting on its behalf a paid-up, nonexclusive, !irrevocable worldwide license in this computer software to reproduce, prepare !derivative works, and perform publicly and display publicly. ! ! DISCLAIMER ! !This computer code material was prepared, in part, as an account of work !sponsored by an agency of the United States Government. Neither the United !States, nor the University of Chicago, nor any of their employees, makes any !warranty express or implied, or assumes any legal liability or responsibility !for the accuracy, completeness, or usefulness of any information, apparatus, !product, or process disclosed, or represents that its use would not infringe !privately owned rights. ! ! 
subroutine mxmf2(A,N1,B,N2,C,N3) integer :: n1, n2, n3 real(8) :: a(n1,n2),b(n2,n3),c(n1,n3) select case (n2) case (1 : 8) select case (n2) case (8) call mxf8(a,n1,b,n2,c,n3) case (1) call mxf1(a,n1,b,n2,c,n3) case (2) call mxf2(a,n1,b,n2,c,n3) case (3) call mxf3(a,n1,b,n2,c,n3) case (4) call mxf4(a,n1,b,n2,c,n3) case (5) call mxf5(a,n1,b,n2,c,n3) case (6) call mxf6(a,n1,b,n2,c,n3) case (7) call mxf7(a,n1,b,n2,c,n3) end select case (9 : 16) select case (n2) case (12) call mxf12(a,n1,b,n2,c,n3) case (9) call mxf9(a,n1,b,n2,c,n3) case (10) call mxf10(a,n1,b,n2,c,n3) case (11) call mxf11(a,n1,b,n2,c,n3) case (13) call mxf13(a,n1,b,n2,c,n3) case (14) call mxf14(a,n1,b,n2,c,n3) case (15) call mxf15(a,n1,b,n2,c,n3) case (16) call mxf16(a,n1,b,n2,c,n3) end select case (17 : 24) select case (n2) case (17) call mxf17(a,n1,b,n2,c,n3) case (18) call mxf18(a,n1,b,n2,c,n3) case (19) call mxf19(a,n1,b,n2,c,n3) case (20) call mxf20(a,n1,b,n2,c,n3) case (21) call mxf21(a,n1,b,n2,c,n3) case (22) call mxf22(a,n1,b,n2,c,n3) case (23) call mxf23(a,n1,b,n2,c,n3) case (24) call mxf24(a,n1,b,n2,c,n3) end select case default call mxm44_0(a,n1,b,n2,c,n3) end select return end subroutine mxmf2 !----------------------------------------------------------------------- subroutine mxf1(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,1),b(1,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) enddo enddo return end subroutine mxf1 !----------------------------------------------------------------------- subroutine mxf2(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,2),b(2,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) enddo enddo return end subroutine mxf2 !----------------------------------------------------------------------- subroutine mxf3(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,3),b(3,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) enddo enddo return end subroutine mxf3 !----------------------------------------------------------------------- subroutine mxf4(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,4),b(4,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) enddo enddo return end subroutine mxf4 !----------------------------------------------------------------------- subroutine mxf5(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,5),b(5,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) enddo enddo return end subroutine mxf5 !----------------------------------------------------------------------- subroutine mxf6(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,6),b(6,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) enddo enddo return end subroutine mxf6 !----------------------------------------------------------------------- subroutine mxf7(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,7),b(7,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) enddo enddo return end subroutine mxf7 !----------------------------------------------------------------------- subroutine mxf8(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,8),b(8,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) enddo enddo return end subroutine mxf8 !----------------------------------------------------------------------- subroutine mxf9(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,9),b(9,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) enddo enddo return end subroutine mxf9 !----------------------------------------------------------------------- subroutine mxf10(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,10),b(10,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) enddo enddo return end subroutine mxf10 !----------------------------------------------------------------------- subroutine mxf11(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,11),b(11,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) enddo enddo return end subroutine mxf11 !----------------------------------------------------------------------- subroutine mxf12(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,12),b(12,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) enddo enddo return end subroutine mxf12 !----------------------------------------------------------------------- subroutine mxf13(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,13),b(13,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) enddo enddo return end subroutine mxf13 !----------------------------------------------------------------------- subroutine mxf14(a,n1,b,n2,c,n3) !use kinds, only : DP real(8) :: a(n1,14),b(14,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) enddo enddo return end subroutine mxf14 !----------------------------------------------------------------------- subroutine mxf15(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,15),b(15,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) enddo enddo return end subroutine mxf15 !----------------------------------------------------------------------- subroutine mxf16(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,16),b(16,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) enddo enddo return end subroutine mxf16 !----------------------------------------------------------------------- subroutine mxf17(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,17),b(17,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) enddo enddo return end subroutine mxf17 !----------------------------------------------------------------------- subroutine mxf18(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,18),b(18,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) enddo enddo return end subroutine mxf18 !----------------------------------------------------------------------- subroutine mxf19(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,19),b(19,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) enddo enddo return end subroutine mxf19 !----------------------------------------------------------------------- subroutine mxf20(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,20),b(20,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) enddo enddo return end subroutine mxf20 !----------------------------------------------------------------------- subroutine mxf21(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,21),b(21,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) & & + a(i,21)*b(21,j) enddo enddo return end subroutine mxf21 !----------------------------------------------------------------------- subroutine mxf22(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,22),b(22,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) & & + a(i,21)*b(21,j) & & + a(i,22)*b(22,j) enddo enddo return end subroutine mxf22 !----------------------------------------------------------------------- subroutine mxf23(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,23),b(23,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) & & + a(i,21)*b(21,j) & & + a(i,22)*b(22,j) & & + a(i,23)*b(23,j) enddo enddo return end subroutine mxf23 !----------------------------------------------------------------------- subroutine mxf24(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,24),b(24,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) & & + a(i,21)*b(21,j) & & + a(i,22)*b(22,j) & & + a(i,23)*b(23,j) & & + a(i,24)*b(24,j) enddo enddo return end subroutine mxf24 !----------------------------------------------------------------------- subroutine mxm44_0(a, m, b, k, c, n) ! matrix multiply with a 4x4 pencil ! 
use kinds,, only : DP real(8) :: a(m,k), b(k,n), c(m,n) real(8) :: s11, s12, s13, s14, s21, s22, s23, s24 real(8) :: s31, s32, s33, s34, s41, s42, s43, s44 mresid = iand(m,3) nresid = iand(n,3) m1 = m - mresid + 1 n1 = n - nresid + 1 do i=1,m-mresid,4 do j=1,n-nresid,4 s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s41 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s32 = 0.0d0 s42 = 0.0d0 s13 = 0.0d0 s23 = 0.0d0 s33 = 0.0d0 s43 = 0.0d0 s14 = 0.0d0 s24 = 0.0d0 s34 = 0.0d0 s44 = 0.0d0 do l=1,k s11 = s11 + a(i,l)*b(l,j) s12 = s12 + a(i,l)*b(l,j+1) s13 = s13 + a(i,l)*b(l,j+2) s14 = s14 + a(i,l)*b(l,j+3) s21 = s21 + a(i+1,l)*b(l,j) s22 = s22 + a(i+1,l)*b(l,j+1) s23 = s23 + a(i+1,l)*b(l,j+2) s24 = s24 + a(i+1,l)*b(l,j+3) s31 = s31 + a(i+2,l)*b(l,j) s32 = s32 + a(i+2,l)*b(l,j+1) s33 = s33 + a(i+2,l)*b(l,j+2) s34 = s34 + a(i+2,l)*b(l,j+3) s41 = s41 + a(i+3,l)*b(l,j) s42 = s42 + a(i+3,l)*b(l,j+1) s43 = s43 + a(i+3,l)*b(l,j+2) s44 = s44 + a(i+3,l)*b(l,j+3) enddo c(i,j) = s11 c(i,j+1) = s12 c(i,j+2) = s13 c(i,j+3) = s14 c(i+1,j) = s21 c(i+2,j) = s31 c(i+3,j) = s41 c(i+1,j+1) = s22 c(i+2,j+1) = s32 c(i+3,j+1) = s42 c(i+1,j+2) = s23 c(i+2,j+2) = s33 c(i+3,j+2) = s43 c(i+1,j+3) = s24 c(i+2,j+3) = s34 c(i+3,j+3) = s44 enddo ! 
Residual when n is not multiple of 4 if (nresid /= 0) then if (nresid == 1) then s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s41 = 0.0d0 do l=1,k s11 = s11 + a(i,l)*b(l,n) s21 = s21 + a(i+1,l)*b(l,n) s31 = s31 + a(i+2,l)*b(l,n) s41 = s41 + a(i+3,l)*b(l,n) enddo c(i,n) = s11 c(i+1,n) = s21 c(i+2,n) = s31 c(i+3,n) = s41 elseif (nresid == 2) then s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s41 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s32 = 0.0d0 s42 = 0.0d0 do l=1,k s11 = s11 + a(i,l)*b(l,j) s12 = s12 + a(i,l)*b(l,j+1) s21 = s21 + a(i+1,l)*b(l,j) s22 = s22 + a(i+1,l)*b(l,j+1) s31 = s31 + a(i+2,l)*b(l,j) s32 = s32 + a(i+2,l)*b(l,j+1) s41 = s41 + a(i+3,l)*b(l,j) s42 = s42 + a(i+3,l)*b(l,j+1) enddo c(i,j) = s11 c(i,j+1) = s12 c(i+1,j) = s21 c(i+2,j) = s31 c(i+3,j) = s41 c(i+1,j+1) = s22 c(i+2,j+1) = s32 c(i+3,j+1) = s42 else s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s41 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s32 = 0.0d0 s42 = 0.0d0 s13 = 0.0d0 s23 = 0.0d0 s33 = 0.0d0 s43 = 0.0d0 do l=1,k s11 = s11 + a(i,l)*b(l,j) s12 = s12 + a(i,l)*b(l,j+1) s13 = s13 + a(i,l)*b(l,j+2) s21 = s21 + a(i+1,l)*b(l,j) s22 = s22 + a(i+1,l)*b(l,j+1) s23 = s23 + a(i+1,l)*b(l,j+2) s31 = s31 + a(i+2,l)*b(l,j) s32 = s32 + a(i+2,l)*b(l,j+1) s33 = s33 + a(i+2,l)*b(l,j+2) s41 = s41 + a(i+3,l)*b(l,j) s42 = s42 + a(i+3,l)*b(l,j+1) s43 = s43 + a(i+3,l)*b(l,j+2) enddo c(i,j) = s11 c(i+1,j) = s21 c(i+2,j) = s31 c(i+3,j) = s41 c(i,j+1) = s12 c(i+1,j+1) = s22 c(i+2,j+1) = s32 c(i+3,j+1) = s42 c(i,j+2) = s13 c(i+1,j+2) = s23 c(i+2,j+2) = s33 c(i+3,j+2) = s43 endif endif enddo ! Residual when m is not multiple of 4 if (mresid == 0) then return elseif (mresid == 1) then do j=1,n-nresid,4 s11 = 0.0d0 s12 = 0.0d0 s13 = 0.0d0 s14 = 0.0d0 do l=1,k s11 = s11 + a(m,l)*b(l,j) s12 = s12 + a(m,l)*b(l,j+1) s13 = s13 + a(m,l)*b(l,j+2) s14 = s14 + a(m,l)*b(l,j+3) enddo c(m,j) = s11 c(m,j+1) = s12 c(m,j+2) = s13 c(m,j+3) = s14 enddo ! 
mresid is 1, check nresid if (nresid == 0) then return elseif (nresid == 1) then s11 = 0.0d0 do l=1,k s11 = s11 + a(m,l)*b(l,n) enddo c(m,n) = s11 return elseif (nresid == 2) then s11 = 0.0d0 s12 = 0.0d0 do l=1,k s11 = s11 + a(m,l)*b(l,n-1) s12 = s12 + a(m,l)*b(l,n) enddo c(m,n-1) = s11 c(m,n) = s12 return else s11 = 0.0d0 s12 = 0.0d0 s13 = 0.0d0 do l=1,k s11 = s11 + a(m,l)*b(l,n-2) s12 = s12 + a(m,l)*b(l,n-1) s13 = s13 + a(m,l)*b(l,n) enddo c(m,n-2) = s11 c(m,n-1) = s12 c(m,n) = s13 return endif elseif (mresid == 2) then do j=1,n-nresid,4 s11 = 0.0d0 s12 = 0.0d0 s13 = 0.0d0 s14 = 0.0d0 s21 = 0.0d0 s22 = 0.0d0 s23 = 0.0d0 s24 = 0.0d0 do l=1,k s11 = s11 + a(m-1,l)*b(l,j) s12 = s12 + a(m-1,l)*b(l,j+1) s13 = s13 + a(m-1,l)*b(l,j+2) s14 = s14 + a(m-1,l)*b(l,j+3) s21 = s21 + a(m,l)*b(l,j) s22 = s22 + a(m,l)*b(l,j+1) s23 = s23 + a(m,l)*b(l,j+2) s24 = s24 + a(m,l)*b(l,j+3) enddo c(m-1,j) = s11 c(m-1,j+1) = s12 c(m-1,j+2) = s13 c(m-1,j+3) = s14 c(m,j) = s21 c(m,j+1) = s22 c(m,j+2) = s23 c(m,j+3) = s24 enddo ! mresid is 2, check nresid if (nresid == 0) then return elseif (nresid == 1) then s11 = 0.0d0 s21 = 0.0d0 do l=1,k s11 = s11 + a(m-1,l)*b(l,n) s21 = s21 + a(m,l)*b(l,n) enddo c(m-1,n) = s11 c(m,n) = s21 return elseif (nresid == 2) then s11 = 0.0d0 s21 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 do l=1,k s11 = s11 + a(m-1,l)*b(l,n-1) s12 = s12 + a(m-1,l)*b(l,n) s21 = s21 + a(m,l)*b(l,n-1) s22 = s22 + a(m,l)*b(l,n) enddo c(m-1,n-1) = s11 c(m-1,n) = s12 c(m,n-1) = s21 c(m,n) = s22 return else s11 = 0.0d0 s21 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s13 = 0.0d0 s23 = 0.0d0 do l=1,k s11 = s11 + a(m-1,l)*b(l,n-2) s12 = s12 + a(m-1,l)*b(l,n-1) s13 = s13 + a(m-1,l)*b(l,n) s21 = s21 + a(m,l)*b(l,n-2) s22 = s22 + a(m,l)*b(l,n-1) s23 = s23 + a(m,l)*b(l,n) enddo c(m-1,n-2) = s11 c(m-1,n-1) = s12 c(m-1,n) = s13 c(m,n-2) = s21 c(m,n-1) = s22 c(m,n) = s23 return endif else ! 
mresid is 3 do j=1,n-nresid,4 s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s32 = 0.0d0 s13 = 0.0d0 s23 = 0.0d0 s33 = 0.0d0 s14 = 0.0d0 s24 = 0.0d0 s34 = 0.0d0 do l=1,k s11 = s11 + a(m-2,l)*b(l,j) s12 = s12 + a(m-2,l)*b(l,j+1) s13 = s13 + a(m-2,l)*b(l,j+2) s14 = s14 + a(m-2,l)*b(l,j+3) s21 = s21 + a(m-1,l)*b(l,j) s22 = s22 + a(m-1,l)*b(l,j+1) s23 = s23 + a(m-1,l)*b(l,j+2) s24 = s24 + a(m-1,l)*b(l,j+3) s31 = s31 + a(m,l)*b(l,j) s32 = s32 + a(m,l)*b(l,j+1) s33 = s33 + a(m,l)*b(l,j+2) s34 = s34 + a(m,l)*b(l,j+3) enddo c(m-2,j) = s11 c(m-2,j+1) = s12 c(m-2,j+2) = s13 c(m-2,j+3) = s14 c(m-1,j) = s21 c(m-1,j+1) = s22 c(m-1,j+2) = s23 c(m-1,j+3) = s24 c(m,j) = s31 c(m,j+1) = s32 c(m,j+2) = s33 c(m,j+3) = s34 enddo ! mresid is 3, check nresid if (nresid == 0) then return elseif (nresid == 1) then s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 do l=1,k s11 = s11 + a(m-2,l)*b(l,n) s21 = s21 + a(m-1,l)*b(l,n) s31 = s31 + a(m,l)*b(l,n) enddo c(m-2,n) = s11 c(m-1,n) = s21 c(m,n) = s31 return elseif (nresid == 2) then s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s32 = 0.0d0 do l=1,k s11 = s11 + a(m-2,l)*b(l,n-1) s12 = s12 + a(m-2,l)*b(l,n) s21 = s21 + a(m-1,l)*b(l,n-1) s22 = s22 + a(m-1,l)*b(l,n) s31 = s31 + a(m,l)*b(l,n-1) s32 = s32 + a(m,l)*b(l,n) enddo c(m-2,n-1) = s11 c(m-2,n) = s12 c(m-1,n-1) = s21 c(m-1,n) = s22 c(m,n-1) = s31 c(m,n) = s32 return else s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s32 = 0.0d0 s13 = 0.0d0 s23 = 0.0d0 s33 = 0.0d0 do l=1,k s11 = s11 + a(m-2,l)*b(l,n-2) s12 = s12 + a(m-2,l)*b(l,n-1) s13 = s13 + a(m-2,l)*b(l,n) s21 = s21 + a(m-1,l)*b(l,n-2) s22 = s22 + a(m-1,l)*b(l,n-1) s23 = s23 + a(m-1,l)*b(l,n) s31 = s31 + a(m,l)*b(l,n-2) s32 = s32 + a(m,l)*b(l,n-1) s33 = s33 + a(m,l)*b(l,n) enddo c(m-2,n-2) = s11 c(m-2,n-1) = s12 c(m-2,n) = s13 c(m-1,n-2) = s21 c(m-1,n-1) = s22 c(m-1,n) = s23 c(m,n-2) = s31 c(m,n-1) = s32 c(m,n) = s33 return endif endif return end subroutine mxm44_0 
!----------------------------------------------------------------------- subroutine mxm44_2(a, m, b, k, c, n) ! use kinds,, only : DP real(8) :: a(m,2), b(2,n), c(m,n) nresid = iand(n,3) n1 = n - nresid + 1 do j=1,n-nresid,4 do i=1,m c(i,j+0) = a(i,1)*b(1,j+0) + a(i,2)*b(2,j+0) c(i,j+1) = a(i,1)*b(1,j+1) + a(i,2)*b(2,j+1) c(i,j+2) = a(i,1)*b(1,j+2) + a(i,2)*b(2,j+2) c(i,j+3) = a(i,1)*b(1,j+3) + a(i,2)*b(2,j+3) enddo enddo if (nresid == 0) then return elseif (nresid == 1) then do i=1,m c(i,n) = a(i,1)*b(1,n) + a(i,2)*b(2,n) enddo elseif (nresid == 2) then do i=1,m c(i,n-1) = a(i,1)*b(1,n-1) + a(i,2)*b(2,n-1) c(i,n-0) = a(i,1)*b(1,n-0) + a(i,2)*b(2,n-0) enddo else do i=1,m c(i,n-2) = a(i,1)*b(1,n-2) + a(i,2)*b(2,n-2) c(i,n-1) = a(i,1)*b(1,n-1) + a(i,2)*b(2,n-1) c(i,n-0) = a(i,1)*b(1,n-0) + a(i,2)*b(2,n-0) enddo endif return end subroutine mxm44_2 !----------------------------------------------------------------------- subroutine initab(a,b,n) ! use kinds,, only : DP real(8) :: a(2),b(2) do i=1,n-1 x = i k = mod(i,19) + 2 l = mod(i,17) + 5 m = mod(i,31) + 3 a(i) = -.25*(a(i)+a(i+1)) + (x*x + k + l)/(x*x+m) b(i) = -.25*(b(i)+b(i+1)) + (x*x + k + m)/(x*x+l) enddo a(n) = -.25*(a(n)+a(n)) + (x*x + k + l)/(x*x+m) b(n) = -.25*(b(n)+b(n)) + (x*x + k + m)/(x*x+l) return end subroutine initab !----------------------------------------------------------------------- subroutine mxms(a,n1,b,n2,c,n3) !---------------------------------------------------------------------- ! Matrix-vector product routine. ! NOTE: Use assembly coded routine if available. !--------------------------------------------------------------------- ! use kinds,, only : DP DOUBLE PRECISION :: A(N1,N2),B(N2,N3),C(N1,N3) N0=N1*N3 DO I=1,N0 C(I,1)=0. 
END DO DO J=1,N3 DO K=1,N2 BB=B(K,J) DO I=1,N1 C(I,J)=C(I,J)+A(I,K)*BB END DO END DO END DO return end subroutine mxms !----------------------------------------------------------------------- subroutine mxmu4(a,n1,b,n2,c,n3) !---------------------------------------------------------------------- ! Matrix-vector product routine. ! NOTE: Use assembly coded routine if available. !--------------------------------------------------------------------- ! use kinds,, only : DP DOUBLE PRECISION :: A(N1,N2),B(N2,N3),C(N1,N3) N0=N1*N3 DO I=1,N0 C(I,1)=0. END DO i1 = n1 - mod(n1,4) + 1 DO J=1,N3 DO K=1,N2 BB=B(K,J) DO I=1,N1-3,4 C(I ,J)=C(I ,J)+A(I ,K)*BB C(I+1,J)=C(I+1,J)+A(I+1,K)*BB C(I+2,J)=C(I+2,J)+A(I+2,K)*BB C(I+3,J)=C(I+3,J)+A(I+3,K)*BB END DO DO i=i1,N1 C(I ,J)=C(I ,J)+A(I ,K)*BB END DO END DO END DO return end subroutine mxmu4 !----------------------------------------------------------------------- subroutine madd (a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,n2),b(n2,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,j)+b(i,j) enddo enddo return end subroutine madd !----------------------------------------------------------------------- subroutine mxmUR2(a,n1,b,n2,c,n3) !---------------------------------------------------------------------- ! Matrix-vector product routine. ! NOTE: Use assembly coded routine if available. !--------------------------------------------------------------------- ! 
use kinds,, only : DP DOUBLE PRECISION :: A(N1,N2),B(N2,N3),C(N1,N3) if (n2 <= 8) then if (n2 == 1) then call mxmur2_1(a,n1,b,n2,c,n3) elseif (n2 == 2) then call mxmur2_2(a,n1,b,n2,c,n3) elseif (n2 == 3) then call mxmur2_3(a,n1,b,n2,c,n3) elseif (n2 == 4) then call mxmur2_4(a,n1,b,n2,c,n3) elseif (n2 == 5) then call mxmur2_5(a,n1,b,n2,c,n3) elseif (n2 == 6) then call mxmur2_6(a,n1,b,n2,c,n3) elseif (n2 == 7) then call mxmur2_7(a,n1,b,n2,c,n3) else call mxmur2_8(a,n1,b,n2,c,n3) endif elseif (n2 <= 16) then if (n2 == 9) then call mxmur2_9(a,n1,b,n2,c,n3) elseif (n2 == 10) then call mxmur2_10(a,n1,b,n2,c,n3) elseif (n2 == 11) then call mxmur2_11(a,n1,b,n2,c,n3) elseif (n2 == 12) then call mxmur2_12(a,n1,b,n2,c,n3) elseif (n2 == 13) then call mxmur2_13(a,n1,b,n2,c,n3) elseif (n2 == 14) then call mxmur2_14(a,n1,b,n2,c,n3) elseif (n2 == 15) then call mxmur2_15(a,n1,b,n2,c,n3) else call mxmur2_16(a,n1,b,n2,c,n3) endif else N0=N1*N3 DO I=1,N0 C(I,1)=0. END DO DO J=1,N3 DO K=1,N2 BB=B(K,J) DO I=1,N1 C(I,J)=C(I,J)+A(I,K)*BB END DO END DO END DO endif return end subroutine mxmUR2 subroutine mxmur2_1(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,1),b(1,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) enddo enddo return end subroutine mxmur2_1 subroutine mxmur2_2(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,2),b(2,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) enddo enddo return end subroutine mxmur2_2 subroutine mxmur2_3(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,3),b(3,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) enddo enddo return end subroutine mxmur2_3 subroutine mxmur2_4(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,4),b(4,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) enddo enddo return end subroutine mxmur2_4 subroutine mxmur2_5(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,5),b(5,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) enddo enddo return end subroutine mxmur2_5 subroutine mxmur2_6(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,6),b(6,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) enddo enddo return end subroutine mxmur2_6 subroutine mxmur2_7(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,7),b(7,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) enddo enddo return end subroutine mxmur2_7 subroutine mxmur2_8(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,8),b(8,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) enddo enddo return end subroutine mxmur2_8 subroutine mxmur2_9(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,9),b(9,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) enddo enddo return end subroutine mxmur2_9 subroutine mxmur2_10(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,10),b(10,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) enddo enddo return end subroutine mxmur2_10 subroutine mxmur2_11(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,11),b(11,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) enddo enddo return end subroutine mxmur2_11 subroutine mxmur2_12(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,12),b(12,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) enddo enddo return end subroutine mxmur2_12 subroutine mxmur2_13(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,13),b(13,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) enddo enddo return end subroutine mxmur2_13 subroutine mxmur2_14(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,14),b(14,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) enddo enddo return end subroutine mxmur2_14 subroutine mxmur2_15(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,15),b(15,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) enddo enddo return end subroutine mxmur2_15 subroutine mxmur2_16(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,16),b(16,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) enddo enddo return end subroutine mxmur2_16 !----------------------------------------------------------------------- subroutine mxmUR3(a,n1,b,n2,c,n3) !---------------------------------------------------------------------- ! Matrix-vector product routine. ! NOTE: Use assembly coded routine if available. !--------------------------------------------------------------------- ! use kinds,, only : DP DOUBLE PRECISION :: A(N1,N2),B(N2,N3),C(N1,N3) N0=N1*N3 DO I=1,N0 C(I,1)=0. 
END DO if (n3 <= 8) then if (n3 == 1) then call mxmur3_1(a,n1,b,n2,c,n3) elseif (n3 == 2) then call mxmur3_2(a,n1,b,n2,c,n3) elseif (n3 == 3) then call mxmur3_3(a,n1,b,n2,c,n3) elseif (n3 == 4) then call mxmur3_4(a,n1,b,n2,c,n3) elseif (n3 == 5) then call mxmur3_5(a,n1,b,n2,c,n3) elseif (n3 == 6) then call mxmur3_6(a,n1,b,n2,c,n3) elseif (n3 == 7) then call mxmur3_7(a,n1,b,n2,c,n3) else call mxmur3_8(a,n1,b,n2,c,n3) endif elseif (n3 <= 16) then if (n3 == 9) then call mxmur3_9(a,n1,b,n2,c,n3) elseif (n3 == 10) then call mxmur3_10(a,n1,b,n2,c,n3) elseif (n3 == 11) then call mxmur3_11(a,n1,b,n2,c,n3) elseif (n3 == 12) then call mxmur3_12(a,n1,b,n2,c,n3) elseif (n3 == 13) then call mxmur3_13(a,n1,b,n2,c,n3) elseif (n3 == 14) then call mxmur3_14(a,n1,b,n2,c,n3) elseif (n3 == 15) then call mxmur3_15(a,n1,b,n2,c,n3) else call mxmur3_16(a,n1,b,n2,c,n3) endif else DO J=1,N3 DO K=1,N2 BB=B(K,J) DO I=1,N1 C(I,J)=C(I,J)+A(I,K)*BB END DO END DO END DO endif return end subroutine mxmUR3 subroutine mxmur3_16(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,n2),b(n2,16),c(n1,16) do k=1,n2 tmp1 = b(k, 1) tmp2 = b(k, 2) tmp3 = b(k, 3) tmp4 = b(k, 4) tmp5 = b(k, 5) tmp6 = b(k, 6) tmp7 = b(k, 7) tmp8 = b(k, 8) tmp9 = b(k, 9) tmp10 = b(k,10) tmp11 = b(k,11) tmp12 = b(k,12) tmp13 = b(k,13) tmp14 = b(k,14) tmp15 = b(k,15) tmp16 = b(k,16) do i=1,n1 c(i, 1) = c(i, 1) + a(i,k) * tmp1 c(i, 2) = c(i, 2) + a(i,k) * tmp2 c(i, 3) = c(i, 3) + a(i,k) * tmp3 c(i, 4) = c(i, 4) + a(i,k) * tmp4 c(i, 5) = c(i, 5) + a(i,k) * tmp5 c(i, 6) = c(i, 6) + a(i,k) * tmp6 c(i, 7) = c(i, 7) + a(i,k) * tmp7 c(i, 8) = c(i, 8) + a(i,k) * tmp8 c(i, 9) = c(i, 9) + a(i,k) * tmp9 c(i,10) = c(i,10) + a(i,k) * tmp10 c(i,11) = c(i,11) + a(i,k) * tmp11 c(i,12) = c(i,12) + a(i,k) * tmp12 c(i,13) = c(i,13) + a(i,k) * tmp13 c(i,14) = c(i,14) + a(i,k) * tmp14 c(i,15) = c(i,15) + a(i,k) * tmp15 c(i,16) = c(i,16) + a(i,k) * tmp16 enddo enddo return end subroutine mxmur3_16 subroutine mxmur3_15(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,n2),b(n2,15),c(n1,15) do k=1,n2 tmp1 = b(k, 1) tmp2 = b(k, 2) tmp3 = b(k, 3) tmp4 = b(k, 4) tmp5 = b(k, 5) tmp6 = b(k, 6) tmp7 = b(k, 7) tmp8 = b(k, 8) tmp9 = b(k, 9) tmp10 = b(k,10) tmp11 = b(k,11) tmp12 = b(k,12) tmp13 = b(k,13) tmp14 = b(k,14) tmp15 = b(k,15) do i=1,n1 c(i, 1) = c(i, 1) + a(i,k) * tmp1 c(i, 2) = c(i, 2) + a(i,k) * tmp2 c(i, 3) = c(i, 3) + a(i,k) * tmp3 c(i, 4) = c(i, 4) + a(i,k) * tmp4 c(i, 5) = c(i, 5) + a(i,k) * tmp5 c(i, 6) = c(i, 6) + a(i,k) * tmp6 c(i, 7) = c(i, 7) + a(i,k) * tmp7 c(i, 8) = c(i, 8) + a(i,k) * tmp8 c(i, 9) = c(i, 9) + a(i,k) * tmp9 c(i,10) = c(i,10) + a(i,k) * tmp10 c(i,11) = c(i,11) + a(i,k) * tmp11 c(i,12) = c(i,12) + a(i,k) * tmp12 c(i,13) = c(i,13) + a(i,k) * tmp13 c(i,14) = c(i,14) + a(i,k) * tmp14 c(i,15) = c(i,15) + a(i,k) * tmp15 enddo enddo return end subroutine mxmur3_15 subroutine mxmur3_14(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,n2),b(n2,14),c(n1,14) do k=1,n2 tmp1 = b(k, 1) tmp2 = b(k, 2) tmp3 = b(k, 3) tmp4 = b(k, 4) tmp5 = b(k, 5) tmp6 = b(k, 6) tmp7 = b(k, 7) tmp8 = b(k, 8) tmp9 = b(k, 9) tmp10 = b(k,10) tmp11 = b(k,11) tmp12 = b(k,12) tmp13 = b(k,13) tmp14 = b(k,14) do i=1,n1 c(i, 1) = c(i, 1) + a(i,k) * tmp1 c(i, 2) = c(i, 2) + a(i,k) * tmp2 c(i, 3) = c(i, 3) + a(i,k) * tmp3 c(i, 4) = c(i, 4) + a(i,k) * tmp4 c(i, 5) = c(i, 5) + a(i,k) * tmp5 c(i, 6) = c(i, 6) + a(i,k) * tmp6 c(i, 7) = c(i, 7) + a(i,k) * tmp7 c(i, 8) = c(i, 8) + a(i,k) * tmp8 c(i, 9) = c(i, 9) + a(i,k) * tmp9 c(i,10) = c(i,10) + a(i,k) * tmp10 c(i,11) = c(i,11) + a(i,k) * tmp11 c(i,12) = c(i,12) + a(i,k) * tmp12 c(i,13) = c(i,13) + a(i,k) * tmp13 c(i,14) = c(i,14) + a(i,k) * tmp14 enddo enddo return end subroutine mxmur3_14 subroutine mxmur3_13(a,n1,b,n2,c,n3) ! 
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,13),c(n1,13)

!     Unrolled rank-1 updates: C(:,1:13) = C(:,1:13) + A(:,k)*B(k,1:13), k=1..n2.
!     NOTE(review): no IMPLICIT NONE -- tmp* are default REAL (single precision
!     unless reals are promoted via -r8 / -fdefault-real-8) -- confirm flags.
      do k=1,n2
         tmp1  = b(k, 1)
         tmp2  = b(k, 2)
         tmp3  = b(k, 3)
         tmp4  = b(k, 4)
         tmp5  = b(k, 5)
         tmp6  = b(k, 6)
         tmp7  = b(k, 7)
         tmp8  = b(k, 8)
         tmp9  = b(k, 9)
         tmp10 = b(k,10)
         tmp11 = b(k,11)
         tmp12 = b(k,12)
         tmp13 = b(k,13)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
            c(i, 2) = c(i, 2) + a(i,k) * tmp2
            c(i, 3) = c(i, 3) + a(i,k) * tmp3
            c(i, 4) = c(i, 4) + a(i,k) * tmp4
            c(i, 5) = c(i, 5) + a(i,k) * tmp5
            c(i, 6) = c(i, 6) + a(i,k) * tmp6
            c(i, 7) = c(i, 7) + a(i,k) * tmp7
            c(i, 8) = c(i, 8) + a(i,k) * tmp8
            c(i, 9) = c(i, 9) + a(i,k) * tmp9
            c(i,10) = c(i,10) + a(i,k) * tmp10
            c(i,11) = c(i,11) + a(i,k) * tmp11
            c(i,12) = c(i,12) + a(i,k) * tmp12
            c(i,13) = c(i,13) + a(i,k) * tmp13
         enddo
      enddo
      return
      end subroutine mxmur3_13
!-----------------------------------------------------------------------
!     C(n1,12) = C(n1,12) + A(n1,n2)*B(n2,12); B columns cached in scalars.
      subroutine mxmur3_12(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,12),c(n1,12)

      do k=1,n2
         tmp1  = b(k, 1)
         tmp2  = b(k, 2)
         tmp3  = b(k, 3)
         tmp4  = b(k, 4)
         tmp5  = b(k, 5)
         tmp6  = b(k, 6)
         tmp7  = b(k, 7)
         tmp8  = b(k, 8)
         tmp9  = b(k, 9)
         tmp10 = b(k,10)
         tmp11 = b(k,11)
         tmp12 = b(k,12)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
            c(i, 2) = c(i, 2) + a(i,k) * tmp2
            c(i, 3) = c(i, 3) + a(i,k) * tmp3
            c(i, 4) = c(i, 4) + a(i,k) * tmp4
            c(i, 5) = c(i, 5) + a(i,k) * tmp5
            c(i, 6) = c(i, 6) + a(i,k) * tmp6
            c(i, 7) = c(i, 7) + a(i,k) * tmp7
            c(i, 8) = c(i, 8) + a(i,k) * tmp8
            c(i, 9) = c(i, 9) + a(i,k) * tmp9
            c(i,10) = c(i,10) + a(i,k) * tmp10
            c(i,11) = c(i,11) + a(i,k) * tmp11
            c(i,12) = c(i,12) + a(i,k) * tmp12
         enddo
      enddo
      return
      end subroutine mxmur3_12
!-----------------------------------------------------------------------
      subroutine mxmur3_11(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,11),c(n1,11)

!     Unrolled rank-1 updates: C(:,1:11) = C(:,1:11) + A(:,k)*B(k,1:11), k=1..n2.
!     NOTE(review): no IMPLICIT NONE -- tmp* are default REAL (single precision
!     unless reals are promoted via -r8 / -fdefault-real-8) -- confirm flags.
      do k=1,n2
         tmp1  = b(k, 1)
         tmp2  = b(k, 2)
         tmp3  = b(k, 3)
         tmp4  = b(k, 4)
         tmp5  = b(k, 5)
         tmp6  = b(k, 6)
         tmp7  = b(k, 7)
         tmp8  = b(k, 8)
         tmp9  = b(k, 9)
         tmp10 = b(k,10)
         tmp11 = b(k,11)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
            c(i, 2) = c(i, 2) + a(i,k) * tmp2
            c(i, 3) = c(i, 3) + a(i,k) * tmp3
            c(i, 4) = c(i, 4) + a(i,k) * tmp4
            c(i, 5) = c(i, 5) + a(i,k) * tmp5
            c(i, 6) = c(i, 6) + a(i,k) * tmp6
            c(i, 7) = c(i, 7) + a(i,k) * tmp7
            c(i, 8) = c(i, 8) + a(i,k) * tmp8
            c(i, 9) = c(i, 9) + a(i,k) * tmp9
            c(i,10) = c(i,10) + a(i,k) * tmp10
            c(i,11) = c(i,11) + a(i,k) * tmp11
         enddo
      enddo
      return
      end subroutine mxmur3_11
!-----------------------------------------------------------------------
!     C(n1,10) = C(n1,10) + A(n1,n2)*B(n2,10); B columns cached in scalars.
      subroutine mxmur3_10(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,10),c(n1,10)

      do k=1,n2
         tmp1  = b(k, 1)
         tmp2  = b(k, 2)
         tmp3  = b(k, 3)
         tmp4  = b(k, 4)
         tmp5  = b(k, 5)
         tmp6  = b(k, 6)
         tmp7  = b(k, 7)
         tmp8  = b(k, 8)
         tmp9  = b(k, 9)
         tmp10 = b(k,10)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
            c(i, 2) = c(i, 2) + a(i,k) * tmp2
            c(i, 3) = c(i, 3) + a(i,k) * tmp3
            c(i, 4) = c(i, 4) + a(i,k) * tmp4
            c(i, 5) = c(i, 5) + a(i,k) * tmp5
            c(i, 6) = c(i, 6) + a(i,k) * tmp6
            c(i, 7) = c(i, 7) + a(i,k) * tmp7
            c(i, 8) = c(i, 8) + a(i,k) * tmp8
            c(i, 9) = c(i, 9) + a(i,k) * tmp9
            c(i,10) = c(i,10) + a(i,k) * tmp10
         enddo
      enddo
      return
      end subroutine mxmur3_10
!-----------------------------------------------------------------------
!     C(n1,9) = C(n1,9) + A(n1,n2)*B(n2,9); B columns cached in scalars.
      subroutine mxmur3_9(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,9),c(n1,9)

      do k=1,n2
         tmp1 = b(k, 1)
         tmp2 = b(k, 2)
         tmp3 = b(k, 3)
         tmp4 = b(k, 4)
         tmp5 = b(k, 5)
         tmp6 = b(k, 6)
         tmp7 = b(k, 7)
         tmp8 = b(k, 8)
         tmp9 = b(k, 9)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
            c(i, 2) = c(i, 2) + a(i,k) * tmp2
            c(i, 3) = c(i, 3) + a(i,k) * tmp3
            c(i, 4) = c(i, 4) + a(i,k) * tmp4
            c(i, 5) = c(i, 5) + a(i,k) * tmp5
            c(i, 6) = c(i, 6) + a(i,k) * tmp6
            c(i, 7) = c(i, 7) + a(i,k) * tmp7
            c(i, 8) = c(i, 8) + a(i,k) * tmp8
            c(i, 9) = c(i, 9) + a(i,k) * tmp9
         enddo
      enddo
      return
      end subroutine mxmur3_9
!-----------------------------------------------------------------------
      subroutine mxmur3_8(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,8),c(n1,8)

!     Unrolled rank-1 updates: C(:,1:8) = C(:,1:8) + A(:,k)*B(k,1:8), k=1..n2.
!     NOTE(review): no IMPLICIT NONE -- tmp* are default REAL (single precision
!     unless reals are promoted via -r8 / -fdefault-real-8) -- confirm flags.
      do k=1,n2
         tmp1 = b(k, 1)
         tmp2 = b(k, 2)
         tmp3 = b(k, 3)
         tmp4 = b(k, 4)
         tmp5 = b(k, 5)
         tmp6 = b(k, 6)
         tmp7 = b(k, 7)
         tmp8 = b(k, 8)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
            c(i, 2) = c(i, 2) + a(i,k) * tmp2
            c(i, 3) = c(i, 3) + a(i,k) * tmp3
            c(i, 4) = c(i, 4) + a(i,k) * tmp4
            c(i, 5) = c(i, 5) + a(i,k) * tmp5
            c(i, 6) = c(i, 6) + a(i,k) * tmp6
            c(i, 7) = c(i, 7) + a(i,k) * tmp7
            c(i, 8) = c(i, 8) + a(i,k) * tmp8
         enddo
      enddo
      return
      end subroutine mxmur3_8
!-----------------------------------------------------------------------
!     C(n1,7) = C(n1,7) + A(n1,n2)*B(n2,7); B columns cached in scalars.
      subroutine mxmur3_7(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,7),c(n1,7)

      do k=1,n2
         tmp1 = b(k, 1)
         tmp2 = b(k, 2)
         tmp3 = b(k, 3)
         tmp4 = b(k, 4)
         tmp5 = b(k, 5)
         tmp6 = b(k, 6)
         tmp7 = b(k, 7)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
            c(i, 2) = c(i, 2) + a(i,k) * tmp2
            c(i, 3) = c(i, 3) + a(i,k) * tmp3
            c(i, 4) = c(i, 4) + a(i,k) * tmp4
            c(i, 5) = c(i, 5) + a(i,k) * tmp5
            c(i, 6) = c(i, 6) + a(i,k) * tmp6
            c(i, 7) = c(i, 7) + a(i,k) * tmp7
         enddo
      enddo
      return
      end subroutine mxmur3_7
!-----------------------------------------------------------------------
!     C(n1,6) = C(n1,6) + A(n1,n2)*B(n2,6); B columns cached in scalars.
      subroutine mxmur3_6(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,6),c(n1,6)

      do k=1,n2
         tmp1 = b(k, 1)
         tmp2 = b(k, 2)
         tmp3 = b(k, 3)
         tmp4 = b(k, 4)
         tmp5 = b(k, 5)
         tmp6 = b(k, 6)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
            c(i, 2) = c(i, 2) + a(i,k) * tmp2
            c(i, 3) = c(i, 3) + a(i,k) * tmp3
            c(i, 4) = c(i, 4) + a(i,k) * tmp4
            c(i, 5) = c(i, 5) + a(i,k) * tmp5
            c(i, 6) = c(i, 6) + a(i,k) * tmp6
         enddo
      enddo
      return
      end subroutine mxmur3_6
!-----------------------------------------------------------------------
!     C(n1,5) = C(n1,5) + A(n1,n2)*B(n2,5); B columns cached in scalars.
      subroutine mxmur3_5(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,5),c(n1,5)

      do k=1,n2
         tmp1 = b(k, 1)
         tmp2 = b(k, 2)
         tmp3 = b(k, 3)
         tmp4 = b(k, 4)
         tmp5 = b(k, 5)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
            c(i, 2) = c(i, 2) + a(i,k) * tmp2
            c(i, 3) = c(i, 3) + a(i,k) * tmp3
            c(i, 4) = c(i, 4) + a(i,k) * tmp4
            c(i, 5) = c(i, 5) + a(i,k) * tmp5
         enddo
      enddo
      return
      end subroutine mxmur3_5
!-----------------------------------------------------------------------
      subroutine mxmur3_4(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,4),c(n1,4)

!     Unrolled rank-1 updates: C(:,1:4) = C(:,1:4) + A(:,k)*B(k,1:4), k=1..n2.
!     NOTE(review): no IMPLICIT NONE -- tmp* are default REAL (single precision
!     unless reals are promoted via -r8 / -fdefault-real-8) -- confirm flags.
      do k=1,n2
         tmp1 = b(k, 1)
         tmp2 = b(k, 2)
         tmp3 = b(k, 3)
         tmp4 = b(k, 4)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
            c(i, 2) = c(i, 2) + a(i,k) * tmp2
            c(i, 3) = c(i, 3) + a(i,k) * tmp3
            c(i, 4) = c(i, 4) + a(i,k) * tmp4
         enddo
      enddo
      return
      end subroutine mxmur3_4
!-----------------------------------------------------------------------
!     C(n1,3) = C(n1,3) + A(n1,n2)*B(n2,3); B columns cached in scalars.
      subroutine mxmur3_3(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,3),c(n1,3)

      do k=1,n2
         tmp1 = b(k, 1)
         tmp2 = b(k, 2)
         tmp3 = b(k, 3)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
            c(i, 2) = c(i, 2) + a(i,k) * tmp2
            c(i, 3) = c(i, 3) + a(i,k) * tmp3
         enddo
      enddo
      return
      end subroutine mxmur3_3
!-----------------------------------------------------------------------
!     C(n1,2) = C(n1,2) + A(n1,n2)*B(n2,2); B columns cached in scalars.
      subroutine mxmur3_2(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,2),c(n1,2)

      do k=1,n2
         tmp1 = b(k, 1)
         tmp2 = b(k, 2)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
            c(i, 2) = c(i, 2) + a(i,k) * tmp2
         enddo
      enddo
      return
      end subroutine mxmur3_2
!-----------------------------------------------------------------------
!     C(n1,1) = C(n1,1) + A(n1,n2)*B(n2,1); single B column cached in a scalar.
      subroutine mxmur3_1(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,n2),b(n2,1),c(n1,1)

      do k=1,n2
         tmp1 = b(k, 1)
         do i=1,n1
            c(i, 1) = c(i, 1) + a(i,k) * tmp1
         enddo
      enddo
      return
      end subroutine mxmur3_1
!----------------------------------------------------------------------
      subroutine mxmd(a,n1,b,n2,c,n3)
!     Matrix-vector product routine.
!     NOTE: Use assembly coded routine if available.
!     Computes C(n1,n3) = A(n1,n2)*B(n2,n3) via BLAS dgemm (beta = 0).
!---------------------------------------------------------------------
!  use kinds, only : DP
      DOUBLE PRECISION :: A(N1,N2),B(N2,N3),C(N1,N3)
      DOUBLE PRECISION :: ONE,ZERO

      one=1.0
      zero=0.0
      call dgemm('N','N',n1,n3,n2, &
     &           REAL(ONE, KIND=8),A,N1,B,N2, &
     &           REAL(ZERO,KIND=8),C,N1)
      return
      end subroutine mxmd
!-----------------------------------------------------------------------
      subroutine mxmfb(a,n1,b,n2,c,n3)
!-----------------------------------------------------------------------
!     Matrix-vector product routine.
!     NOTE: Use assembly coded routine if available.
!     Computes C(n1,n3) = A(n1,n2)*B(n2,n3); dispatches on n2 to hand-unrolled
!     kernels (n2 <= 24) or falls back to BLAS gemm.
!----------------------------------------------------------------------
!  use kinds, only : DP
      DOUBLE PRECISION :: A(N1,N2),B(N2,N3),C(N1,N3)
      integer :: wdsize
      save wdsize
      data wdsize/0/

!     First call: determine word size for dgemm/sgemm discrimination, below.
!     1 + 1e-12 == 1 exactly when the default REAL is single precision
!     (i.e. reals were NOT promoted with -r8 / -fdefault-real-8).
      if (wdsize == 0) then
         one = 1.0
         eps = 1.e-12
         wdsize = 8
         if (one+eps == 1.0) wdsize = 4
      endif

!     Dispatch on the contraction dimension n2.
      if (n2 <= 8) then
         if (n2 == 1) then
            call mxmfb_1(a,n1,b,n2,c,n3)
         elseif (n2 == 2) then
            call mxmfb_2(a,n1,b,n2,c,n3)
         elseif (n2 == 3) then
            call mxmfb_3(a,n1,b,n2,c,n3)
         elseif (n2 == 4) then
            call mxmfb_4(a,n1,b,n2,c,n3)
         elseif (n2 == 5) then
            call mxmfb_5(a,n1,b,n2,c,n3)
         elseif (n2 == 6) then
            call mxmfb_6(a,n1,b,n2,c,n3)
         elseif (n2 == 7) then
            call mxmfb_7(a,n1,b,n2,c,n3)
         else
            call mxmfb_8(a,n1,b,n2,c,n3)
         endif
      elseif (n2 <= 16) then
         if (n2 == 9) then
            call mxmfb_9(a,n1,b,n2,c,n3)
         elseif (n2 == 10) then
            call mxmfb_10(a,n1,b,n2,c,n3)
         elseif (n2 == 11) then
            call mxmfb_11(a,n1,b,n2,c,n3)
         elseif (n2 == 12) then
            call mxmfb_12(a,n1,b,n2,c,n3)
         elseif (n2 == 13) then
            call mxmfb_13(a,n1,b,n2,c,n3)
         elseif (n2 == 14) then
            call mxmfb_14(a,n1,b,n2,c,n3)
         elseif (n2 == 15) then
            call mxmfb_15(a,n1,b,n2,c,n3)
         else
            call mxmfb_16(a,n1,b,n2,c,n3)
         endif
      elseif (n2 <= 24) then
         if (n2 == 17) then
            call mxmfb_17(a,n1,b,n2,c,n3)
         elseif (n2 == 18) then
            call mxmfb_18(a,n1,b,n2,c,n3)
         elseif (n2 == 19) then
            call mxmfb_19(a,n1,b,n2,c,n3)
         elseif (n2 == 20) then
            call mxmfb_20(a,n1,b,n2,c,n3)
         elseif (n2 == 21) then
            call mxmfb_21(a,n1,b,n2,c,n3)
         elseif (n2 == 22) then
            call mxmfb_22(a,n1,b,n2,c,n3)
         elseif (n2 == 23) then
            call mxmfb_23(a,n1,b,n2,c,n3)
         elseif (n2 == 24) then
            call mxmfb_24(a,n1,b,n2,c,n3)
         endif
      else
         one=1.0
         zero=0.0
!        NOTE(review): A/B/C are declared DOUBLE PRECISION above, yet the
!        wdsize==4 path hands them to sgemm (single-precision BLAS).  This is
!        only consistent if the build never takes that path (reals promoted
!        to 8 bytes) -- confirm intended declaration/kind.
         if (wdsize == 4) then
            call sgemm('N','N',n1,n3,n2,ONE,A,N1,B,N2,ZERO,C,N1)
         else
            call dgemm('N','N',n1,n3,n2, &
     &                 REAL(ONE, KIND=8),A,N1,B,N2, &
     &                 REAL(ZERO,KIND=8),C,N1)
         endif
      endif
      return
      end subroutine mxmfb
!-----------------------------------------------------------------------
      subroutine mxmfb_1(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,1),b(1,n3),c(n1,n3)

!     C(i,j) = sum_k a(i,k)*b(k,j), k=1 fully unrolled; C is OVERWRITTEN
!     (unlike the accumulating mxmur3_* family).  j outer, i inner.
      do j=1,n3
         do i=1,n1
            c(i,j) = a(i,1)*b(1,j)
         enddo
      enddo
      return
      end subroutine mxmfb_1
!-----------------------------------------------------------------------
!     C(n1,n3) = A(n1,2)*B(2,n3), inner product unrolled; j outer, i inner.
      subroutine mxmfb_2(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,2),b(2,n3),c(n1,n3)

      do j=1,n3
         do i=1,n1
            c(i,j) = a(i,1)*b(1,j) &
     &             + a(i,2)*b(2,j)
         enddo
      enddo
      return
      end subroutine mxmfb_2
!-----------------------------------------------------------------------
!     C(n1,n3) = A(n1,3)*B(3,n3), inner product unrolled; j outer, i inner.
      subroutine mxmfb_3(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,3),b(3,n3),c(n1,n3)

      do j=1,n3
         do i=1,n1
            c(i,j) = a(i,1)*b(1,j) &
     &             + a(i,2)*b(2,j) &
     &             + a(i,3)*b(3,j)
         enddo
      enddo
      return
      end subroutine mxmfb_3
!-----------------------------------------------------------------------
!     C(n1,n3) = A(n1,4)*B(4,n3), inner product unrolled; j outer, i inner.
      subroutine mxmfb_4(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,4),b(4,n3),c(n1,n3)

      do j=1,n3
         do i=1,n1
            c(i,j) = a(i,1)*b(1,j) &
     &             + a(i,2)*b(2,j) &
     &             + a(i,3)*b(3,j) &
     &             + a(i,4)*b(4,j)
         enddo
      enddo
      return
      end subroutine mxmfb_4
!-----------------------------------------------------------------------
!     C(n1,n3) = A(n1,5)*B(5,n3), inner product unrolled; j outer, i inner.
      subroutine mxmfb_5(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,5),b(5,n3),c(n1,n3)

      do j=1,n3
         do i=1,n1
            c(i,j) = a(i,1)*b(1,j) &
     &             + a(i,2)*b(2,j) &
     &             + a(i,3)*b(3,j) &
     &             + a(i,4)*b(4,j) &
     &             + a(i,5)*b(5,j)
         enddo
      enddo
      return
      end subroutine mxmfb_5
!-----------------------------------------------------------------------
!     C(n1,n3) = A(n1,6)*B(6,n3), inner product unrolled; j outer, i inner.
      subroutine mxmfb_6(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,6),b(6,n3),c(n1,n3)

      do j=1,n3
         do i=1,n1
            c(i,j) = a(i,1)*b(1,j) &
     &             + a(i,2)*b(2,j) &
     &             + a(i,3)*b(3,j) &
     &             + a(i,4)*b(4,j) &
     &             + a(i,5)*b(5,j) &
     &             + a(i,6)*b(6,j)
         enddo
      enddo
      return
      end subroutine mxmfb_6
!-----------------------------------------------------------------------
      subroutine mxmfb_7(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,7),b(7,n3),c(n1,n3)

!     C(n1,n3) = A(n1,7)*B(7,n3), inner product unrolled; C is OVERWRITTEN.
!     j outer, i inner.
      do j=1,n3
         do i=1,n1
            c(i,j) = a(i,1)*b(1,j) &
     &             + a(i,2)*b(2,j) &
     &             + a(i,3)*b(3,j) &
     &             + a(i,4)*b(4,j) &
     &             + a(i,5)*b(5,j) &
     &             + a(i,6)*b(6,j) &
     &             + a(i,7)*b(7,j)
         enddo
      enddo
      return
      end subroutine mxmfb_7
!-----------------------------------------------------------------------
!     C(n1,n3) = A(n1,8)*B(8,n3), inner product unrolled; j outer, i inner.
      subroutine mxmfb_8(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,8),b(8,n3),c(n1,n3)

      do j=1,n3
         do i=1,n1
            c(i,j) = a(i,1)*b(1,j) &
     &             + a(i,2)*b(2,j) &
     &             + a(i,3)*b(3,j) &
     &             + a(i,4)*b(4,j) &
     &             + a(i,5)*b(5,j) &
     &             + a(i,6)*b(6,j) &
     &             + a(i,7)*b(7,j) &
     &             + a(i,8)*b(8,j)
         enddo
      enddo
      return
      end subroutine mxmfb_8
!-----------------------------------------------------------------------
!     C(n1,n3) = A(n1,9)*B(9,n3), inner product unrolled; j outer, i inner.
      subroutine mxmfb_9(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,9),b(9,n3),c(n1,n3)

      do j=1,n3
         do i=1,n1
            c(i,j) = a(i,1)*b(1,j) &
     &             + a(i,2)*b(2,j) &
     &             + a(i,3)*b(3,j) &
     &             + a(i,4)*b(4,j) &
     &             + a(i,5)*b(5,j) &
     &             + a(i,6)*b(6,j) &
     &             + a(i,7)*b(7,j) &
     &             + a(i,8)*b(8,j) &
     &             + a(i,9)*b(9,j)
         enddo
      enddo
      return
      end subroutine mxmfb_9
!-----------------------------------------------------------------------
!     C(n1,n3) = A(n1,10)*B(10,n3), inner product unrolled; j outer, i inner.
      subroutine mxmfb_10(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,10),b(10,n3),c(n1,n3)

      do j=1,n3
         do i=1,n1
            c(i,j) = a(i,1)*b(1,j) &
     &             + a(i,2)*b(2,j) &
     &             + a(i,3)*b(3,j) &
     &             + a(i,4)*b(4,j) &
     &             + a(i,5)*b(5,j) &
     &             + a(i,6)*b(6,j) &
     &             + a(i,7)*b(7,j) &
     &             + a(i,8)*b(8,j) &
     &             + a(i,9)*b(9,j) &
     &             + a(i,10)*b(10,j)
         enddo
      enddo
      return
      end subroutine mxmfb_10
!-----------------------------------------------------------------------
      subroutine mxmfb_11(a,n1,b,n2,c,n3)
use kinds,, only : DP real(8) :: a(n1,11),b(11,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) enddo enddo return end subroutine mxmfb_11 !----------------------------------------------------------------------- subroutine mxmfb_12(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,12),b(12,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) enddo enddo return end subroutine mxmfb_12 !----------------------------------------------------------------------- subroutine mxmfb_13(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,13),b(13,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) enddo enddo return end subroutine mxmfb_13 !----------------------------------------------------------------------- subroutine mxmfb_14(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,14),b(14,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) enddo enddo return end subroutine mxmfb_14 !----------------------------------------------------------------------- subroutine mxmfb_15(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,15),b(15,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) enddo enddo return end subroutine mxmfb_15 !----------------------------------------------------------------------- subroutine mxmfb_16(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,16),b(16,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) enddo enddo return end subroutine mxmfb_16 !----------------------------------------------------------------------- subroutine mxmfb_17(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,17),b(17,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) enddo enddo return end subroutine mxmfb_17 !----------------------------------------------------------------------- subroutine mxmfb_18(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,18),b(18,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) enddo enddo return end subroutine mxmfb_18 !----------------------------------------------------------------------- subroutine mxmfb_19(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,19),b(19,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) enddo enddo return end subroutine mxmfb_19 !----------------------------------------------------------------------- subroutine mxmfb_20(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,20),b(20,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) enddo enddo return end subroutine mxmfb_20 !----------------------------------------------------------------------- subroutine mxmfb_21(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,21),b(21,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) & & + a(i,21)*b(21,j) enddo enddo return end subroutine mxmfb_21 !----------------------------------------------------------------------- subroutine mxmfb_22(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,22),b(22,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) & & + a(i,21)*b(21,j) & & + a(i,22)*b(22,j) enddo enddo return end subroutine mxmfb_22 !----------------------------------------------------------------------- subroutine mxmfb_23(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,23),b(23,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) & & + a(i,21)*b(21,j) & & + a(i,22)*b(22,j) & & + a(i,23)*b(23,j) enddo enddo return end subroutine mxmfb_23 !----------------------------------------------------------------------- subroutine mxmfb_24(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,24),b(24,n3),c(n1,n3) do j=1,n3 do i=1,n1 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) & & + a(i,21)*b(21,j) & & + a(i,22)*b(22,j) & & + a(i,23)*b(23,j) & & + a(i,24)*b(24,j) enddo enddo return end subroutine mxmfb_24 !----------------------------------------------------------------------- subroutine mxmf3(a,n1,b,n2,c,n3) !----------------------------------------------------------------------- ! Matrix-vector product routine. ! NOTE: Use assembly coded routine if available. !---------------------------------------------------------------------- ! use kinds,, only : DP DOUBLE PRECISION :: A(N1,N2),B(N2,N3),C(N1,N3) integer :: wdsize save wdsize data wdsize/0/ ! First call: determine word size for dgemm/sgemm discrimination, below. 
!     First call: determine word size for dgemm/sgemm discrimination.
!     1 + 1e-12 == 1 exactly when the default REAL is single precision
!     (i.e. reals were NOT promoted with -r8 / -fdefault-real-8).
      if (wdsize == 0) then
         one = 1.0
         eps = 1.e-12
         wdsize = 8
         if (one+eps == 1.0) wdsize = 4
      endif

!     Dispatch on the contraction dimension n2: hand-unrolled kernels for
!     n2 <= 24, BLAS gemm otherwise.  Same structure as mxmfb, but the
!     mxmf3_* kernels run i as the outer loop and j inner.
      if (n2 <= 8) then
         if (n2 == 1) then
            call mxmf3_1(a,n1,b,n2,c,n3)
         elseif (n2 == 2) then
            call mxmf3_2(a,n1,b,n2,c,n3)
         elseif (n2 == 3) then
            call mxmf3_3(a,n1,b,n2,c,n3)
         elseif (n2 == 4) then
            call mxmf3_4(a,n1,b,n2,c,n3)
         elseif (n2 == 5) then
            call mxmf3_5(a,n1,b,n2,c,n3)
         elseif (n2 == 6) then
            call mxmf3_6(a,n1,b,n2,c,n3)
         elseif (n2 == 7) then
            call mxmf3_7(a,n1,b,n2,c,n3)
         else
            call mxmf3_8(a,n1,b,n2,c,n3)
         endif
      elseif (n2 <= 16) then
         if (n2 == 9) then
            call mxmf3_9(a,n1,b,n2,c,n3)
         elseif (n2 == 10) then
            call mxmf3_10(a,n1,b,n2,c,n3)
         elseif (n2 == 11) then
            call mxmf3_11(a,n1,b,n2,c,n3)
         elseif (n2 == 12) then
            call mxmf3_12(a,n1,b,n2,c,n3)
         elseif (n2 == 13) then
            call mxmf3_13(a,n1,b,n2,c,n3)
         elseif (n2 == 14) then
            call mxmf3_14(a,n1,b,n2,c,n3)
         elseif (n2 == 15) then
            call mxmf3_15(a,n1,b,n2,c,n3)
         else
            call mxmf3_16(a,n1,b,n2,c,n3)
         endif
      elseif (n2 <= 24) then
         if (n2 == 17) then
            call mxmf3_17(a,n1,b,n2,c,n3)
         elseif (n2 == 18) then
            call mxmf3_18(a,n1,b,n2,c,n3)
         elseif (n2 == 19) then
            call mxmf3_19(a,n1,b,n2,c,n3)
         elseif (n2 == 20) then
            call mxmf3_20(a,n1,b,n2,c,n3)
         elseif (n2 == 21) then
            call mxmf3_21(a,n1,b,n2,c,n3)
         elseif (n2 == 22) then
            call mxmf3_22(a,n1,b,n2,c,n3)
         elseif (n2 == 23) then
            call mxmf3_23(a,n1,b,n2,c,n3)
         elseif (n2 == 24) then
            call mxmf3_24(a,n1,b,n2,c,n3)
         endif
      else
         one=1.0
         zero=0.0
!        NOTE(review): A/B/C are DOUBLE PRECISION but the wdsize==4 path
!        passes them to single-precision sgemm -- only consistent when the
!        build promotes default reals to 8 bytes; confirm build flags.
         if (wdsize == 4) then
            call sgemm('N','N',n1,n3,n2,ONE,A,N1,B,N2,ZERO,C,N1)
         else
            call dgemm('N','N',n1,n3,n2, &
     &                 REAL(ONE, KIND=8),A,N1,B,N2, &
     &                 REAL(ZERO,KIND=8),C,N1)
         endif
      endif
      return
      end subroutine mxmf3
!-----------------------------------------------------------------------
!     C(n1,n3) = A(n1,1)*B(1,n3); C is OVERWRITTEN.  i outer, j inner
!     (traversal order is the distinguishing feature of the mxmf3_* family).
      subroutine mxmf3_1(a,n1,b,n2,c,n3)
!  use kinds, only : DP
      real(8) :: a(n1,1),b(1,n3),c(n1,n3)

      do i=1,n1
         do j=1,n3
            c(i,j) = a(i,1)*b(1,j)
         enddo
      enddo
      return
      end subroutine mxmf3_1
!-----------------------------------------------------------------------
      subroutine mxmf3_2(a,n1,b,n2,c,n3)
use kinds,, only : DP real(8) :: a(n1,2),b(2,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) enddo enddo return end subroutine mxmf3_2 !----------------------------------------------------------------------- subroutine mxmf3_3(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,3),b(3,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) enddo enddo return end subroutine mxmf3_3 !----------------------------------------------------------------------- subroutine mxmf3_4(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,4),b(4,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) enddo enddo return end subroutine mxmf3_4 !----------------------------------------------------------------------- subroutine mxmf3_5(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,5),b(5,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) enddo enddo return end subroutine mxmf3_5 !----------------------------------------------------------------------- subroutine mxmf3_6(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,6),b(6,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) enddo enddo return end subroutine mxmf3_6 !----------------------------------------------------------------------- subroutine mxmf3_7(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,7),b(7,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) enddo enddo return end subroutine mxmf3_7 !----------------------------------------------------------------------- subroutine mxmf3_8(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,8),b(8,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) enddo enddo return end subroutine mxmf3_8 !----------------------------------------------------------------------- subroutine mxmf3_9(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,9),b(9,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) enddo enddo return end subroutine mxmf3_9 !----------------------------------------------------------------------- subroutine mxmf3_10(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,10),b(10,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) enddo enddo return end subroutine mxmf3_10 !----------------------------------------------------------------------- subroutine mxmf3_11(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,11),b(11,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) enddo enddo return end subroutine mxmf3_11 !----------------------------------------------------------------------- subroutine mxmf3_12(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,12),b(12,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) enddo enddo return end subroutine mxmf3_12 !----------------------------------------------------------------------- subroutine mxmf3_13(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,13),b(13,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) enddo enddo return end subroutine mxmf3_13 !----------------------------------------------------------------------- subroutine mxmf3_14(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,14),b(14,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) enddo enddo return end subroutine mxmf3_14 !----------------------------------------------------------------------- subroutine mxmf3_15(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,15),b(15,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) enddo enddo return end subroutine mxmf3_15 !----------------------------------------------------------------------- subroutine mxmf3_16(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,16),b(16,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) enddo enddo return end subroutine mxmf3_16 !----------------------------------------------------------------------- subroutine mxmf3_17(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,17),b(17,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) enddo enddo return end subroutine mxmf3_17 !----------------------------------------------------------------------- subroutine mxmf3_18(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,18),b(18,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) enddo enddo return end subroutine mxmf3_18 !----------------------------------------------------------------------- subroutine mxmf3_19(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,19),b(19,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) enddo enddo return end subroutine mxmf3_19 !----------------------------------------------------------------------- subroutine mxmf3_20(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,20),b(20,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) enddo enddo return end subroutine mxmf3_20 !----------------------------------------------------------------------- subroutine mxmf3_21(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,21),b(21,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) & & + a(i,21)*b(21,j) enddo enddo return end subroutine mxmf3_21 !----------------------------------------------------------------------- subroutine mxmf3_22(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,22),b(22,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) & & + a(i,21)*b(21,j) & & + a(i,22)*b(22,j) enddo enddo return end subroutine mxmf3_22 !----------------------------------------------------------------------- subroutine mxmf3_23(a,n1,b,n2,c,n3) ! 
use kinds,, only : DP real(8) :: a(n1,23),b(23,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) & & + a(i,21)*b(21,j) & & + a(i,22)*b(22,j) & & + a(i,23)*b(23,j) enddo enddo return end subroutine mxmf3_23 !----------------------------------------------------------------------- subroutine mxmf3_24(a,n1,b,n2,c,n3) ! use kinds,, only : DP real(8) :: a(n1,24),b(24,n3),c(n1,n3) do i=1,n1 do j=1,n3 c(i,j) = a(i,1)*b(1,j) & & + a(i,2)*b(2,j) & & + a(i,3)*b(3,j) & & + a(i,4)*b(4,j) & & + a(i,5)*b(5,j) & & + a(i,6)*b(6,j) & & + a(i,7)*b(7,j) & & + a(i,8)*b(8,j) & & + a(i,9)*b(9,j) & & + a(i,10)*b(10,j) & & + a(i,11)*b(11,j) & & + a(i,12)*b(12,j) & & + a(i,13)*b(13,j) & & + a(i,14)*b(14,j) & & + a(i,15)*b(15,j) & & + a(i,16)*b(16,j) & & + a(i,17)*b(17,j) & & + a(i,18)*b(18,j) & & + a(i,19)*b(19,j) & & + a(i,20)*b(20,j) & & + a(i,21)*b(21,j) & & + a(i,22)*b(22,j) & & + a(i,23)*b(23,j) & & + a(i,24)*b(24,j) enddo enddo return end subroutine mxmf3_24 !----------------------------------------------------------------------- subroutine mxm44(a,n1,b,n2,c,n3) !----------------------------------------------------------------------- ! NOTE -- this code has been set up with the "mxmf3" routine ! referenced in memtime.f. On most machines, the f2 ! and f3 versions give the same performance (f2 is the ! nekton standard). On the t3e, f3 is noticeably faster. ! pff 10/5/98 ! Matrix-vector product routine. ! NOTE: Use assembly coded routine if available. !---------------------------------------------------------------------- ! 
use kinds,, only : DP DOUBLE PRECISION :: A(N1,N2),B(N2,N3),C(N1,N3) if (n2 == 1) then call mxm44_2_t(a,n1,b,2,c,n3) elseif (n2 == 2) then call mxm44_2_t(a,n1,b,n2,c,n3) else call mxm44_0_t(a,n1,b,n2,c,n3) endif return end subroutine mxm44 !----------------------------------------------------------------------- subroutine mxm44_0_t(a, m, b, k, c, n) ! subroutine matmul44(m, n, k, a, lda, b, ldb, c, ldc) ! use kinds,, only : DP ! real*8 a(lda,k), b(ldb,n), c(ldc,n) real(8) :: a(m,k), b(k,n), c(m,n) real(8) :: s11, s12, s13, s14, s21, s22, s23, s24 real(8) :: s31, s32, s33, s34, s41, s42, s43, s44 ! matrix multiply with a 4x4 pencil mresid = iand(m,3) nresid = iand(n,3) m1 = m - mresid + 1 n1 = n - nresid + 1 do i=1,m-mresid,4 do j=1,n-nresid,4 s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s41 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s32 = 0.0d0 s42 = 0.0d0 s13 = 0.0d0 s23 = 0.0d0 s33 = 0.0d0 s43 = 0.0d0 s14 = 0.0d0 s24 = 0.0d0 s34 = 0.0d0 s44 = 0.0d0 do l=1,k s11 = s11 + a(i,l)*b(l,j) s12 = s12 + a(i,l)*b(l,j+1) s13 = s13 + a(i,l)*b(l,j+2) s14 = s14 + a(i,l)*b(l,j+3) s21 = s21 + a(i+1,l)*b(l,j) s22 = s22 + a(i+1,l)*b(l,j+1) s23 = s23 + a(i+1,l)*b(l,j+2) s24 = s24 + a(i+1,l)*b(l,j+3) s31 = s31 + a(i+2,l)*b(l,j) s32 = s32 + a(i+2,l)*b(l,j+1) s33 = s33 + a(i+2,l)*b(l,j+2) s34 = s34 + a(i+2,l)*b(l,j+3) s41 = s41 + a(i+3,l)*b(l,j) s42 = s42 + a(i+3,l)*b(l,j+1) s43 = s43 + a(i+3,l)*b(l,j+2) s44 = s44 + a(i+3,l)*b(l,j+3) enddo c(i,j) = s11 c(i,j+1) = s12 c(i,j+2) = s13 c(i,j+3) = s14 c(i+1,j) = s21 c(i+2,j) = s31 c(i+3,j) = s41 c(i+1,j+1) = s22 c(i+2,j+1) = s32 c(i+3,j+1) = s42 c(i+1,j+2) = s23 c(i+2,j+2) = s33 c(i+3,j+2) = s43 c(i+1,j+3) = s24 c(i+2,j+3) = s34 c(i+3,j+3) = s44 enddo ! 
Residual when n is not multiple of 4 if (nresid /= 0) then if (nresid == 1) then s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s41 = 0.0d0 do l=1,k s11 = s11 + a(i,l)*b(l,n) s21 = s21 + a(i+1,l)*b(l,n) s31 = s31 + a(i+2,l)*b(l,n) s41 = s41 + a(i+3,l)*b(l,n) enddo c(i,n) = s11 c(i+1,n) = s21 c(i+2,n) = s31 c(i+3,n) = s41 elseif (nresid == 2) then s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s41 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s32 = 0.0d0 s42 = 0.0d0 do l=1,k s11 = s11 + a(i,l)*b(l,j) s12 = s12 + a(i,l)*b(l,j+1) s21 = s21 + a(i+1,l)*b(l,j) s22 = s22 + a(i+1,l)*b(l,j+1) s31 = s31 + a(i+2,l)*b(l,j) s32 = s32 + a(i+2,l)*b(l,j+1) s41 = s41 + a(i+3,l)*b(l,j) s42 = s42 + a(i+3,l)*b(l,j+1) enddo c(i,j) = s11 c(i,j+1) = s12 c(i+1,j) = s21 c(i+2,j) = s31 c(i+3,j) = s41 c(i+1,j+1) = s22 c(i+2,j+1) = s32 c(i+3,j+1) = s42 else s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s41 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s32 = 0.0d0 s42 = 0.0d0 s13 = 0.0d0 s23 = 0.0d0 s33 = 0.0d0 s43 = 0.0d0 do l=1,k s11 = s11 + a(i,l)*b(l,j) s12 = s12 + a(i,l)*b(l,j+1) s13 = s13 + a(i,l)*b(l,j+2) s21 = s21 + a(i+1,l)*b(l,j) s22 = s22 + a(i+1,l)*b(l,j+1) s23 = s23 + a(i+1,l)*b(l,j+2) s31 = s31 + a(i+2,l)*b(l,j) s32 = s32 + a(i+2,l)*b(l,j+1) s33 = s33 + a(i+2,l)*b(l,j+2) s41 = s41 + a(i+3,l)*b(l,j) s42 = s42 + a(i+3,l)*b(l,j+1) s43 = s43 + a(i+3,l)*b(l,j+2) enddo c(i,j) = s11 c(i+1,j) = s21 c(i+2,j) = s31 c(i+3,j) = s41 c(i,j+1) = s12 c(i+1,j+1) = s22 c(i+2,j+1) = s32 c(i+3,j+1) = s42 c(i,j+2) = s13 c(i+1,j+2) = s23 c(i+2,j+2) = s33 c(i+3,j+2) = s43 endif endif enddo ! Residual when m is not multiple of 4 if (mresid == 0) then return elseif (mresid == 1) then do j=1,n-nresid,4 s11 = 0.0d0 s12 = 0.0d0 s13 = 0.0d0 s14 = 0.0d0 do l=1,k s11 = s11 + a(m,l)*b(l,j) s12 = s12 + a(m,l)*b(l,j+1) s13 = s13 + a(m,l)*b(l,j+2) s14 = s14 + a(m,l)*b(l,j+3) enddo c(m,j) = s11 c(m,j+1) = s12 c(m,j+2) = s13 c(m,j+3) = s14 enddo ! 
mresid is 1, check nresid if (nresid == 0) then return elseif (nresid == 1) then s11 = 0.0d0 do l=1,k s11 = s11 + a(m,l)*b(l,n) enddo c(m,n) = s11 return elseif (nresid == 2) then s11 = 0.0d0 s12 = 0.0d0 do l=1,k s11 = s11 + a(m,l)*b(l,n-1) s12 = s12 + a(m,l)*b(l,n) enddo c(m,n-1) = s11 c(m,n) = s12 return else s11 = 0.0d0 s12 = 0.0d0 s13 = 0.0d0 do l=1,k s11 = s11 + a(m,l)*b(l,n-2) s12 = s12 + a(m,l)*b(l,n-1) s13 = s13 + a(m,l)*b(l,n) enddo c(m,n-2) = s11 c(m,n-1) = s12 c(m,n) = s13 return endif elseif (mresid == 2) then do j=1,n-nresid,4 s11 = 0.0d0 s12 = 0.0d0 s13 = 0.0d0 s14 = 0.0d0 s21 = 0.0d0 s22 = 0.0d0 s23 = 0.0d0 s24 = 0.0d0 do l=1,k s11 = s11 + a(m-1,l)*b(l,j) s12 = s12 + a(m-1,l)*b(l,j+1) s13 = s13 + a(m-1,l)*b(l,j+2) s14 = s14 + a(m-1,l)*b(l,j+3) s21 = s21 + a(m,l)*b(l,j) s22 = s22 + a(m,l)*b(l,j+1) s23 = s23 + a(m,l)*b(l,j+2) s24 = s24 + a(m,l)*b(l,j+3) enddo c(m-1,j) = s11 c(m-1,j+1) = s12 c(m-1,j+2) = s13 c(m-1,j+3) = s14 c(m,j) = s21 c(m,j+1) = s22 c(m,j+2) = s23 c(m,j+3) = s24 enddo ! mresid is 2, check nresid if (nresid == 0) then return elseif (nresid == 1) then s11 = 0.0d0 s21 = 0.0d0 do l=1,k s11 = s11 + a(m-1,l)*b(l,n) s21 = s21 + a(m,l)*b(l,n) enddo c(m-1,n) = s11 c(m,n) = s21 return elseif (nresid == 2) then s11 = 0.0d0 s21 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 do l=1,k s11 = s11 + a(m-1,l)*b(l,n-1) s12 = s12 + a(m-1,l)*b(l,n) s21 = s21 + a(m,l)*b(l,n-1) s22 = s22 + a(m,l)*b(l,n) enddo c(m-1,n-1) = s11 c(m-1,n) = s12 c(m,n-1) = s21 c(m,n) = s22 return else s11 = 0.0d0 s21 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s13 = 0.0d0 s23 = 0.0d0 do l=1,k s11 = s11 + a(m-1,l)*b(l,n-2) s12 = s12 + a(m-1,l)*b(l,n-1) s13 = s13 + a(m-1,l)*b(l,n) s21 = s21 + a(m,l)*b(l,n-2) s22 = s22 + a(m,l)*b(l,n-1) s23 = s23 + a(m,l)*b(l,n) enddo c(m-1,n-2) = s11 c(m-1,n-1) = s12 c(m-1,n) = s13 c(m,n-2) = s21 c(m,n-1) = s22 c(m,n) = s23 return endif else ! 
mresid is 3 do j=1,n-nresid,4 s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s32 = 0.0d0 s13 = 0.0d0 s23 = 0.0d0 s33 = 0.0d0 s14 = 0.0d0 s24 = 0.0d0 s34 = 0.0d0 do l=1,k s11 = s11 + a(m-2,l)*b(l,j) s12 = s12 + a(m-2,l)*b(l,j+1) s13 = s13 + a(m-2,l)*b(l,j+2) s14 = s14 + a(m-2,l)*b(l,j+3) s21 = s21 + a(m-1,l)*b(l,j) s22 = s22 + a(m-1,l)*b(l,j+1) s23 = s23 + a(m-1,l)*b(l,j+2) s24 = s24 + a(m-1,l)*b(l,j+3) s31 = s31 + a(m,l)*b(l,j) s32 = s32 + a(m,l)*b(l,j+1) s33 = s33 + a(m,l)*b(l,j+2) s34 = s34 + a(m,l)*b(l,j+3) enddo c(m-2,j) = s11 c(m-2,j+1) = s12 c(m-2,j+2) = s13 c(m-2,j+3) = s14 c(m-1,j) = s21 c(m-1,j+1) = s22 c(m-1,j+2) = s23 c(m-1,j+3) = s24 c(m,j) = s31 c(m,j+1) = s32 c(m,j+2) = s33 c(m,j+3) = s34 enddo ! mresid is 3, check nresid if (nresid == 0) then return elseif (nresid == 1) then s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 do l=1,k s11 = s11 + a(m-2,l)*b(l,n) s21 = s21 + a(m-1,l)*b(l,n) s31 = s31 + a(m,l)*b(l,n) enddo c(m-2,n) = s11 c(m-1,n) = s21 c(m,n) = s31 return elseif (nresid == 2) then s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s32 = 0.0d0 do l=1,k s11 = s11 + a(m-2,l)*b(l,n-1) s12 = s12 + a(m-2,l)*b(l,n) s21 = s21 + a(m-1,l)*b(l,n-1) s22 = s22 + a(m-1,l)*b(l,n) s31 = s31 + a(m,l)*b(l,n-1) s32 = s32 + a(m,l)*b(l,n) enddo c(m-2,n-1) = s11 c(m-2,n) = s12 c(m-1,n-1) = s21 c(m-1,n) = s22 c(m,n-1) = s31 c(m,n) = s32 return else s11 = 0.0d0 s21 = 0.0d0 s31 = 0.0d0 s12 = 0.0d0 s22 = 0.0d0 s32 = 0.0d0 s13 = 0.0d0 s23 = 0.0d0 s33 = 0.0d0 do l=1,k s11 = s11 + a(m-2,l)*b(l,n-2) s12 = s12 + a(m-2,l)*b(l,n-1) s13 = s13 + a(m-2,l)*b(l,n) s21 = s21 + a(m-1,l)*b(l,n-2) s22 = s22 + a(m-1,l)*b(l,n-1) s23 = s23 + a(m-1,l)*b(l,n) s31 = s31 + a(m,l)*b(l,n-2) s32 = s32 + a(m,l)*b(l,n-1) s33 = s33 + a(m,l)*b(l,n) enddo c(m-2,n-2) = s11 c(m-2,n-1) = s12 c(m-2,n) = s13 c(m-1,n-2) = s21 c(m-1,n-1) = s22 c(m-1,n) = s23 c(m,n-2) = s31 c(m,n-1) = s32 c(m,n) = s33 return endif endif return end subroutine mxm44_0_t 
!----------------------------------------------------------------------- subroutine mxm44_2_t(a, m, b, k, c, n) ! use kinds,, only : DP real(8) :: a(m,2), b(2,n), c(m,n) nresid = iand(n,3) n1 = n - nresid + 1 do j=1,n-nresid,4 do i=1,m c(i,j+0) = a(i,1)*b(1,j+0) + a(i,2)*b(2,j+0) c(i,j+1) = a(i,1)*b(1,j+1) + a(i,2)*b(2,j+1) c(i,j+2) = a(i,1)*b(1,j+2) + a(i,2)*b(2,j+2) c(i,j+3) = a(i,1)*b(1,j+3) + a(i,2)*b(2,j+3) enddo enddo if (nresid == 0) then return elseif (nresid == 1) then do i=1,m c(i,n) = a(i,1)*b(1,n) + a(i,2)*b(2,n) enddo elseif (nresid == 2) then do i=1,m c(i,n-1) = a(i,1)*b(1,n-1) + a(i,2)*b(2,n-1) c(i,n-0) = a(i,1)*b(1,n-0) + a(i,2)*b(2,n-0) enddo else do i=1,m c(i,n-2) = a(i,1)*b(1,n-2) + a(i,2)*b(2,n-2) c(i,n-1) = a(i,1)*b(1,n-1) + a(i,2)*b(2,n-1) c(i,n-0) = a(i,1)*b(1,n-0) + a(i,2)*b(2,n-0) enddo endif return end subroutine mxm44_2_t !----------------------------------------------------------------------- libxsmm-1.17/samples/nek/rstr.f000066400000000000000000000347631415223013700165120ustar00rootroot00000000000000!=======================================================================! ! Copyright (c) Intel Corporation - All rights reserved. ! ! This file is part of the LIBXSMM library. ! ! ! ! For information on the license, see the LICENSE file. ! ! Further information: https://github.com/hfp/libxsmm/ ! ! SPDX-License-Identifier: BSD-3-Clause ! !=======================================================================! ! Hans Pabst (Intel Corp.), Alexander Heinecke (Intel Corp.), and ! Maxwell Hutchinson (University of Chicago) !=======================================================================! 
PROGRAM stpm USE :: LIBXSMM, libxsmm_mmcall => libxsmm_dmmcall_abc USE :: STREAM_UPDATE_KERNELS !$ USE omp_lib IMPLICIT NONE INTEGER, PARAMETER :: T = KIND(0D0) REAL(T), PARAMETER :: alpha = 1, beta = 0 REAL(T), ALLOCATABLE, DIMENSION(:,:,:,:), TARGET :: a, c, d !DIR$ ATTRIBUTES ALIGN:64 :: a, c, d REAL(T), ALLOCATABLE, TARGET :: dx(:,:), dy(:,:), dz(:,:) REAL(T), ALLOCATABLE, TARGET, SAVE :: tm1(:,:,:) REAL(T), ALLOCATABLE, TARGET, SAVE :: tm2(:,:,:) REAL(T), ALLOCATABLE, TARGET, SAVE :: tm3(:,:,:) !$OMP THREADPRIVATE(tm1, tm2, tm3) TYPE(LIBXSMM_DMMFUNCTION) :: xmm1, xmm2, xmm3 DOUBLE PRECISION :: duration, max_diff INTEGER :: argc, m, n, k, routine, check, mm, nn, kk INTEGER(8) :: i, j, ix, iy, iz, r, s INTEGER(8) :: size0, size1, size INTEGER(8) :: repetitions, start CHARACTER(32) :: argv argc = COMMAND_ARGUMENT_COUNT() IF (1 <= argc) THEN CALL GET_COMMAND_ARGUMENT(1, argv) READ(argv, "(I32)") m ELSE m = 8 END IF IF (3 <= argc) THEN CALL GET_COMMAND_ARGUMENT(3, argv) READ(argv, "(I32)") k ELSE k = m END IF IF (2 <= argc) THEN CALL GET_COMMAND_ARGUMENT(2, argv) READ(argv, "(I32)") n ELSE n = k END IF mm = 0 IF (4 <= argc) THEN CALL GET_COMMAND_ARGUMENT(4, argv) READ(argv, "(I32)") mm END IF mm = MERGE(10, mm, 0.EQ.mm) nn = 0 IF (5 <= argc) THEN CALL GET_COMMAND_ARGUMENT(5, argv) READ(argv, "(I32)") nn END IF nn = MERGE(mm, nn, 0.EQ.nn) kk = 0 IF (6 <= argc) THEN CALL GET_COMMAND_ARGUMENT(6, argv) READ(argv, "(I32)") kk END IF kk = MERGE(mm, kk, 0.EQ.kk) IF (7 <= argc) THEN CALL GET_COMMAND_ARGUMENT(7, argv) READ(argv, "(I32)") size1 ELSE size1 = 0 END IF IF (8 <= argc) THEN CALL GET_COMMAND_ARGUMENT(8, argv) READ(argv, "(I32)") size ELSE size = 0 ! 1 repetition by default END IF ! Initialize LIBXSMM CALL libxsmm_init() ! workload is about 2 GByte in memory by default size0 = ((m * n * k) + (nn * mm * kk)) * T ! 
size of single stream element in Byte size1 = MERGE(2048_8, MERGE(size1, ISHFT(ABS(size0 * size1) & & + ISHFT(1, 20) - 1, -20), 0.LE.size1), 0.EQ.size1) size = ISHFT(MERGE(MAX(size, size1), ISHFT(ABS(size) * size0 & & + ISHFT(1, 20) - 1, -20), 0.LE.size), 20) / size0 s = ISHFT(size1, 20) / size0 repetitions = size / s duration = 0 max_diff = 0 ALLOCATE(a(m,n,k,s)) ALLOCATE(c(mm,nn,kk,s)) ALLOCATE(dx(mm,m), dy(n,nn), dz(k,kk)) ! Initialize !$OMP PARALLEL DO PRIVATE(i, ix, iy, iz) DEFAULT(NONE) & !$OMP SHARED(a, m, mm, n, nn, k, kk, s) DO i = 1, s DO ix = 1, m DO iy = 1, n DO iz = 1, k a(ix,iy,iz,i) = ix + iy*m + iz*m*n END DO END DO END DO END DO !$OMP PARALLEL DO PRIVATE(i, ix, iy, iz) DEFAULT(NONE) & !$OMP SHARED(c, m, mm, n, nn, k, kk, s) DO i = 1, s DO ix = 1, mm DO iy = 1, nn DO iz = 1, kk c(ix,iy,iz,i) = REAL(0, T) END DO END DO END DO END DO dx = 1. dy = 1. dz = 1. WRITE(*, "(6(A,I0),A,I0,A,I0,A,I0)") & & "m=", m, " n=", n, " k=", k, & & " mm=", mm, " nn=", nn, " kk=", kk, & & " elements=", UBOUND(a, 4), & & " size=", size1, "MB repetitions=", repetitions CALL GETENV("CHECK", argv) READ(argv, "(I32)") check IF (0.NE.check) THEN ALLOCATE(d(mm,nn,kk,s)) !$OMP PARALLEL DO PRIVATE(i, ix, iy, iz) DEFAULT(NONE) & !$OMP SHARED(d, m, mm, n, nn, k, kk, s) DO i = 1, s DO ix = 1, mm DO iy = 1, nn DO iz = 1, kk d(ix,iy,iz,i) = REAL(0, T) END DO END DO END DO END DO WRITE(*, "(A)") "Calculating check..." !$OMP PARALLEL PRIVATE(i, j, r) DEFAULT(NONE) & !$OMP SHARED(a, dx, dy, dz, d, m, n, k, mm, nn, kk, & !$OMP repetitions) ALLOCATE(tm1(mm,n,k), tm2(mm,nn,k)) tm1 = 0; tm2 = 0; DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) tm1 = RESHAPE( & & MATMUL(dx, RESHAPE(a(:,:,:,i), (/m,n*k/))), & & (/mm, n, k/)) ! [mm,m]x[m,n*k]->[mm,n*k] DO j = 1, k tm2(:,:,j) = MATMUL(tm1(:,:,j), dy) ! [mm,n]x[n,nn]->[mm,nn] END DO ! because we can't RESHAPE d d(:,:,:,i) = RESHAPE( & & MATMUL(RESHAPE(tm2, (/mm*nn, k/)), dz), & & (/mm,nn,kk/)) ! 
[mm*nn,k]x[k,kk]->[mm*nn,kk] END DO END DO ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2) !$OMP END PARALLEL END IF WRITE(*, "(A)") "Streamed... (BLAS)" !$OMP PARALLEL PRIVATE(i, j, r, start) DEFAULT(NONE) & !$OMP SHARED(a, dx, dy, dz, c, m, n, k, mm, nn, kk, & !$OMP duration, repetitions) ALLOCATE(tm1(mm,n,k), tm2(mm,nn,k), tm3(mm,nn,kk)) tm1 = 0; tm2 = 0; tm3 = 3 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) ! PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=mm, n=n*k, k=m, & & a=dx, b=a(:,:,1,i), c=tm1(:,:,1), & & alpha=alpha, beta=beta) DO j = 1, k ! PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=mm, n=nn, k=n, & & a=tm1(:,:,j), b=dy, c=tm2(:,:,j), & & alpha=alpha, beta=beta) END DO ! PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=mm*nn, n=kk, k=k, & & a=tm2(:,:,1), b=dz, c=tm3(:,:,1), & & alpha=alpha, beta=beta) CALL stream_vector_copy(tm3(1,1,1), c(1,1,1,i), mm*nn*kk) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL CALL performance(duration, m, n, k, mm, nn, kk, size) IF (check.NE.0) max_diff = MAX(max_diff, validate(c, d)) WRITE(*, "(A)") "Streamed... 
(mxm)" !$OMP PARALLEL PRIVATE(i, j, r, start) DEFAULT(NONE) & !$OMP SHARED(a, dx, dy, dz, c, m, n, k, mm, nn, kk, & !$OMP duration, repetitions) ALLOCATE(tm1(mm,n,k), tm2(mm,nn,k), tm3(mm,nn,kk)) tm1 = 0; tm2 = 0; tm3 = 3 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) CALL mxmf2(dx, mm, a(:,:,:,i), m, tm1, n*k) DO j = 1, k CALL mxmf2(tm1(:,:,j), mm, dy, n, tm2(:,:,j), nn) END DO CALL mxmf2(tm2, mm*nn, dz, k, tm3, kk) CALL stream_vector_copy(tm3(1,1,1), c(1,1,1,i), mm*nn*kk) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL CALL performance(duration, m, n, k, mm, nn, kk, size) IF (check.NE.0) max_diff = MAX(max_diff, validate(c, d)) WRITE(*, "(A)") "Streamed... (auto-dispatched)" !$OMP PARALLEL PRIVATE(i, j, r, start) DEFAULT(NONE) & !$OMP SHARED(a, dx, dy, dz, c, m, n, k, mm, nn, kk, & !$OMP duration, repetitions) ALLOCATE(tm1(mm,n,k), tm2(mm,nn,k), tm3(mm,nn,kk)) tm1 = 0; tm2 = 0; tm3 = 3 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) ! PGI: cannot deduce generic procedure (libxsmm_gemm) CALL libxsmm_dgemm(m=mm, n=n*k, k=m, & & a=dx, b=a(:,:,1,i), c=tm1(:,:,1), & & alpha=alpha, beta=beta) DO j = 1, k ! PGI: cannot deduce generic procedure (libxsmm_gemm) CALL libxsmm_dgemm(m=mm, n=nn, k=n, & & a=tm1(:,:,j), b=dy, c=tm2(:,:,j), & & alpha=alpha, beta=beta) END DO ! PGI: cannot deduce generic procedure (libxsmm_gemm) CALL libxsmm_dgemm(m=mm*nn, n=kk, k=k, & & a=tm2(:,:,1), b=dz, c=tm3(:,:,1), & & alpha=alpha, beta=beta) CALL stream_vector_copy(tm3(1,1,1), c(1,1,1,i), mm*nn*kk) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER ! 
Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL CALL performance(duration, m, n, k, mm, nn, kk, size) IF (check.NE.0) max_diff = MAX(max_diff, validate(c, d)) WRITE(*, "(A)") "Streamed... (specialized)" CALL libxsmm_dispatch(xmm1, mm, n*k, m, & & alpha=alpha, beta=beta) CALL libxsmm_dispatch(xmm2, mm, nn, n, & & alpha=alpha, beta=beta) CALL libxsmm_dispatch(xmm3, mm*nn, kk, k, & & alpha=alpha, beta=beta) IF (libxsmm_available(xmm1).AND. & & libxsmm_available(xmm2).AND. & & libxsmm_available(xmm3)) & & THEN !$OMP PARALLEL PRIVATE(i, j, r, start) & !DEFAULT(NONE) !$OMP SHARED(a, dx, dy, dz, c, m, n, k, mm, nn, kk, & !$OMP duration, repetitions, xmm1, xmm2, xmm3) ALLOCATE(tm1(mm,n,k), tm2(mm,nn,k), tm3(mm,nn,kk)) tm1 = 0; tm2 = 0; tm3 = 3 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 4), UBOUND(a, 4) ! [mm,m]x[m,n*k]->[mm,n*k] CALL libxsmm_mmcall(xmm1, dx, a(1,1,1,i), tm1) DO j = 1, k ! [mm,n]x[n,nn]->[mm,nn] CALL libxsmm_mmcall(xmm2, tm1(1,1,j), dy, tm2(1,1,j)) END DO ! [mm*nn,k]x[k,kk]->[mm*nn,kk] CALL libxsmm_mmcall(xmm3, tm2, dz, tm3(1,1,1)) CALL stream_vector_copy( & & tm3(1,1,1), c(1,1,1,i), mm*nn*kk) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER ! Deallocate thread-local arrays DEALLOCATE(tm1, tm2, tm3) !$OMP END PARALLEL CALL performance(duration, m, n, k, mm, nn, kk, size) IF (check.NE.0) max_diff = MAX(max_diff, validate(c, d)) ELSE WRITE(*,*) "Could not build specialized function(s)!" END IF ! Deallocate global arrays IF (check.NE.0) DEALLOCATE(d) DEALLOCATE(dx, dy, dz) DEALLOCATE(a, c) ! 
finalize LIBXSMM CALL libxsmm_finalize() IF ((0.NE.check).AND.(1.LT.max_diff)) STOP 1 CONTAINS FUNCTION validate(ref, test) RESULT(diff) REAL(T), DIMENSION(:,:,:,:), intent(in) :: ref, test REAL(T) :: diff diff = MAXVAL((ref - test) * (ref - test)) WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "diff: ", diff END FUNCTION SUBROUTINE performance(duration, m, n, k, mm, nn, kk, size) DOUBLE PRECISION, INTENT(IN) :: duration INTEGER, INTENT(IN) :: m, n, k, mm, nn, kk INTEGER(8), INTENT(IN) :: size IF (0.LT.duration) THEN WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "performance:", (size & & * ((2*m-1)*mm*n*k + mm*(2*n-1)*nn*k + mm*nn*(2*k-1)*kk) & & * 1D-9 / duration), " GFLOPS/s" WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "bandwidth: ", (size & & * ((m*n*k) + (mm*nn*kk)) & & * T / (duration * LSHIFT(1_8, 30))), " GB/s" END IF WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "duration: ", & & (1D3 * duration) / repetitions, " ms" END SUBROUTINE END PROGRAM libxsmm-1.17/samples/nek/rstr.sh000077500000000000000000000050141415223013700166650ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/nek/stream_update_kernels.c000066400000000000000000000643161415223013700220720ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include /*#define DISABLE_NONTEMPORAL_STORES*/ LIBXSMM_INLINE void stream_init(int i_length, size_t i_start_address, int* o_trip_prolog, int* o_trip_stream) { /* let's calculate the prologue until C is cacheline aligned */ /* @TODO we need to add shifts */ if ( (i_start_address % 64) != 0 ) { *o_trip_prolog = (64 - (i_start_address % 64))/sizeof(double); } /* let's calculate the end of the streaming part */ /* @TODO we need to add shifts */ *o_trip_stream = (((i_length-(*o_trip_prolog))/sizeof(double))*sizeof(double))+(*o_trip_prolog); /* some bound checks */ *o_trip_prolog = ((*o_trip_prolog) > i_length) ? i_length : (*o_trip_prolog); *o_trip_stream = ((*o_trip_stream) > i_length) ? (*o_trip_prolog) : (*o_trip_stream); } /* avoid warning about external function definition with no prior declaration */ void LIBXSMM_FSYMBOL(stream_vector_copy)(const double* /*i_a*/, double* /*io_c*/, const int* /*i_length*/); void LIBXSMM_FSYMBOL(stream_vector_copy)(const double* i_a, double* io_c, const int* i_length) { int l_n = 0; int l_trip_prolog = 0; int l_trip_stream = 0; assert(0 != i_length); /* init the trip counts */ stream_init( *i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream ); /* run the prologue */ for (; l_n < l_trip_prolog; l_n++) { io_c[l_n] = i_a[l_n]; } /* run the bulk, hopefully using streaming stores */ #if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__) && !defined(LIBXSMM_INTRINSICS_STATIC) { /* we need manual unrolling as the compiler otherwise generates too many dependencies */ for (; l_n < l_trip_stream; l_n+=8) { # ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n]), _mm256_loadu_pd(&(i_a[l_n])) ); _mm256_store_pd( &(io_c[l_n+4]), _mm256_loadu_pd(&(i_a[l_n+4])) ); # else _mm256_stream_pd( &(io_c[l_n]), _mm256_loadu_pd(&(i_a[l_n])) ); _mm256_stream_pd( &(io_c[l_n+4]), _mm256_loadu_pd(&(i_a[l_n+4])) ); # endif } } #elif defined(__SSE3__) && 
defined(__AVX__) && defined(__AVX512F__) { for (; l_n < l_trip_stream; l_n+=8) { # ifdef DISABLE_NONTEMPORAL_STORES _mm512_store_pd( &(io_c[l_n]), _mm512_loadu_pd(&(i_a[l_n])) ); # else LIBXSMM_INTRINSICS_MM512_STREAM_PD( &(io_c[l_n]), _mm512_loadu_pd(&(i_a[l_n])) ); # endif } } #else for (; l_n < l_trip_stream; l_n++) { io_c[l_n] = i_a[l_n]; } #endif /* run the epilogue */ for (; l_n < *i_length; l_n++) { io_c[l_n] = i_a[l_n]; } } void LIBXSMM_FSYMBOL(stream_vector_set)(const double* /*i_scalar*/, double* /*io_c*/, const int* /*i_length*/); void LIBXSMM_FSYMBOL(stream_vector_set)(const double* i_scalar, double* io_c, const int* i_length) { int l_n = 0; int l_trip_prolog = 0; int l_trip_stream = 0; assert(0 != i_length); assert(0 != i_scalar); /* init the trip counts */ stream_init( *i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream ); /* run the prologue */ for (; l_n < l_trip_prolog; l_n++) { io_c[l_n] = *i_scalar; } /* run the bulk, hopefully using streaming stores */ #if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__) && !defined(LIBXSMM_INTRINSICS_STATIC) { /* we need manual unrolling as the compiler otherwise generates too many dependencies */ const __m256d vec_scalar = _mm256_broadcast_sd(i_scalar); for (; l_n < l_trip_stream; l_n+=8) { # ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n]), vec_scalar ); _mm256_store_pd( &(io_c[l_n+4]), vec_scalar ); # else _mm256_stream_pd( &(io_c[l_n]), vec_scalar ); _mm256_stream_pd( &(io_c[l_n+4]), vec_scalar ); # endif } } #elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__) { const __m512d vec_scalar = _mm512_broadcastsd_pd(_mm_load_sd(i_scalar)); for (; l_n < l_trip_stream; l_n+=8) { # ifdef DISABLE_NONTEMPORAL_STORES _mm512_store_pd( &(io_c[l_n]), vec_scalar ); # else LIBXSMM_INTRINSICS_MM512_STREAM_PD( &(io_c[l_n]), vec_scalar ); # endif } } #else for (; l_n < l_trip_stream; l_n++) { io_c[l_n] = *i_scalar; } #endif /* run the epilogue */ for (; l_n < *i_length; l_n++) { 
io_c[l_n] = *i_scalar; } } void LIBXSMM_FSYMBOL(stream_vector_compscale)(const double* /*i_a*/, const double* /*i_b*/, double* /*io_c*/, const int* /*i_length*/); void LIBXSMM_FSYMBOL(stream_vector_compscale)(const double* i_a, const double* i_b, double* io_c, const int* i_length) { int l_n = 0; int l_trip_prolog = 0; int l_trip_stream = 0; assert(0 != i_length); /* init the trip counts */ stream_init( *i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream ); /* run the prologue */ for (; l_n < l_trip_prolog; l_n++) { io_c[l_n] = i_a[l_n]*i_b[l_n]; } /* run the bulk, hopefully using streaming stores */ #if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__) && !defined(LIBXSMM_INTRINSICS_STATIC) { /* we need manual unrolling as the compiler otherwise generates too many dependencies */ for (; l_n < l_trip_stream; l_n+=8) { __m256d vec_a_1, vec_b_1; __m256d vec_a_2, vec_b_2; vec_a_1 = _mm256_loadu_pd(&(i_a[l_n])); vec_a_2 = _mm256_loadu_pd(&(i_a[l_n+4])); vec_b_1 = _mm256_loadu_pd(&(i_b[l_n])); vec_b_2 = _mm256_loadu_pd(&(i_b[l_n+4])); # ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n]), _mm256_mul_pd( vec_a_1, vec_b_1 ) ); _mm256_store_pd( &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) ); # else _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd( vec_a_1, vec_b_1 ) ); _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) ); # endif } } #elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__) { for (; l_n < l_trip_stream; l_n+=8) { __m512d vec_a, vec_b; vec_a = _mm512_loadu_pd(&(i_a[l_n])); vec_b = _mm512_loadu_pd(&(i_b[l_n])); # ifdef DISABLE_NONTEMPORAL_STORES _mm512_store_pd( &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) ); # else LIBXSMM_INTRINSICS_MM512_STREAM_PD( &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) ); # endif } } #else for (; l_n < l_trip_stream; l_n++) { io_c[l_n] = i_a[l_n]*i_b[l_n]; } #endif /* run the epilogue */ for (; l_n < *i_length; l_n++) { io_c[l_n] = i_a[l_n]*i_b[l_n]; } } void 
LIBXSMM_FSYMBOL(stream_update_helmholtz)( const double* i_g1, const double* i_g2, const double* i_g3, const double* i_tm1, const double* i_tm2, const double* i_tm3, const double* i_a, const double* i_b, double* io_c, const double* i_h1, const double* i_h2, const int* i_length); void LIBXSMM_FSYMBOL(stream_update_helmholtz)( const double* i_g1, const double* i_g2, const double* i_g3, const double* i_tm1, const double* i_tm2, const double* i_tm3, const double* i_a, const double* i_b, double* io_c, const double* i_h1, const double* i_h2, const int* i_length) { int l_n = 0; int l_trip_prolog = 0; int l_trip_stream = 0; assert(0 != i_length); assert(0 != i_h1); assert(0 != i_h2); /* init the trip counts */ stream_init( *i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream ); /* run the prologue */ /* #if !defined(__SSE3__) */ { for (; l_n < l_trip_prolog; l_n++) { io_c[l_n] = (*i_h1)*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]) + (*i_h2)*(i_b[l_n]*i_a[l_n]); } } /* #else { const __m128d vec_h1 = _mm_loaddup_pd(i_h1); const __m128d vec_h2 = _mm_loaddup_pd(i_h2); const __m128i mask = _mm_set_epi32(0,0,-1,-1); for (; l_n < l_trip_prolog; l_n++) { __m128d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3, vec_a, vec_b; vec_g1 = _mm_load_sd(&(i_g1[l_n])); vec_tm1 = _mm_load_sd(&(i_tm1[l_n])); vec_g1 = _mm_mul_sd(vec_g1, vec_tm1); vec_g2 = _mm_load_sd(&(i_g2[l_n])); vec_tm2 = _mm_load_sd(&(i_tm2[l_n])); vec_g2 = _mm_mul_sd(vec_g2, vec_tm2); vec_g3 = _mm_load_sd(&(i_g3[l_n])); vec_tm3 = _mm_load_sd(&(i_tm3[l_n])); vec_g3 = _mm_mul_sd(vec_g3, vec_tm3); vec_a = _mm_load_sd(&(i_a[l_n])); vec_b = _mm_load_sd(&(i_b[l_n])); vec_a = _mm_mul_sd(vec_a, vec_b); vec_g1 = _mm_add_sd(vec_g1, vec_g2); vec_a = _mm_mul_sd(vec_a, vec_h2); vec_g1 = _mm_add_sd(vec_g1, vec_g3); vec_g1 = _mm_mul_sd(vec_g1, vec_h1); _mm_maskmoveu_si128(_mm_castpd_si128(_mm_add_pd( vec_g1, vec_a )), mask, (char*)(&(io_c[l_n]))); } } #endif */ /* run the bulk, hopefully using streaming 
stores */ #if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__) && !defined(LIBXSMM_INTRINSICS_STATIC) { const __m256d vec_h1 = _mm256_broadcast_sd(i_h1); const __m256d vec_h2 = _mm256_broadcast_sd(i_h2); /* we need manual unrolling as the compiler otherwise generates too many dependencies */ for (; l_n < l_trip_stream; l_n+=8) { __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1, vec_a_1, vec_b_1; __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2, vec_a_2, vec_b_2; vec_g1_1 = _mm256_loadu_pd(&(i_g1[l_n])); vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n])); vec_g1_2 = _mm256_loadu_pd(&(i_g1[l_n+4])); vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4])); vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_tm1_1); vec_g2_1 = _mm256_loadu_pd(&(i_g2[l_n])); vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_tm1_2); vec_g2_2 = _mm256_loadu_pd(&(i_g2[l_n+4])); vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n])); vec_g2_1 = _mm256_mul_pd(vec_g2_1, vec_tm2_1); vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4])); vec_g2_2 = _mm256_mul_pd(vec_g2_2, vec_tm2_2); vec_g3_1 = _mm256_loadu_pd(&(i_g3[l_n])); vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n])); vec_g3_2 = _mm256_loadu_pd(&(i_g3[l_n+4])); vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4])); vec_g3_1 = _mm256_mul_pd(vec_g3_1, vec_tm3_1); vec_a_1 = _mm256_loadu_pd(&(i_a[l_n])); vec_g3_2 = _mm256_mul_pd(vec_g3_2, vec_tm3_2); vec_a_2 = _mm256_loadu_pd(&(i_a[l_n+4])); vec_b_1 = _mm256_loadu_pd(&(i_b[l_n])); vec_a_1 = _mm256_mul_pd(vec_a_1, vec_b_1); vec_b_2 = _mm256_loadu_pd(&(i_b[l_n+4])); vec_a_2 = _mm256_mul_pd(vec_a_2, vec_b_2); vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g2_1); vec_a_1 = _mm256_mul_pd(vec_a_1, vec_h2); vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g2_2); vec_a_2 = _mm256_mul_pd(vec_a_2, vec_h2); vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g3_1); vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_h1); # ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n]), _mm256_add_pd( vec_g1_1, vec_a_1 ) ); # else _mm256_stream_pd( &(io_c[l_n]), 
_mm256_add_pd( vec_g1_1, vec_a_1 ) ); # endif vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g3_2); vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_h1); # ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n+4]), _mm256_add_pd( vec_g1_2, vec_a_2 ) ); # else _mm256_stream_pd( &(io_c[l_n+4]), _mm256_add_pd( vec_g1_2, vec_a_2 ) ); # endif } } #elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__) { const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(i_h1)); const __m512d vec_h2 = _mm512_broadcastsd_pd(_mm_load_sd(i_h2)); for (; l_n < l_trip_stream; l_n+=8) { __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3, vec_a, vec_b; vec_g1 = _mm512_loadu_pd(&(i_g1[l_n])); vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n])); vec_g1 = _mm512_mul_pd(vec_g1, vec_tm1); vec_g2 = _mm512_loadu_pd(&(i_g2[l_n])); vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n])); vec_g2 = _mm512_mul_pd(vec_g2, vec_tm2); vec_g3 = _mm512_loadu_pd(&(i_g3[l_n])); vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n])); vec_g3 = _mm512_mul_pd(vec_g3, vec_tm3); vec_a = _mm512_loadu_pd(&(i_a[l_n])); vec_b = _mm512_loadu_pd(&(i_b[l_n])); vec_a = _mm512_mul_pd(vec_a, vec_b); vec_g1 = _mm512_add_pd(vec_g1, vec_g2); vec_a = _mm512_mul_pd(vec_a, vec_h2); vec_g1 = _mm512_add_pd(vec_g1, vec_g3); vec_g1 = _mm512_mul_pd(vec_g1, vec_h1); # ifdef DISABLE_NONTEMPORAL_STORES _mm512_store_pd( &(io_c[l_n]), _mm512_add_pd( vec_g1, vec_a ) ); # else LIBXSMM_INTRINSICS_MM512_STREAM_PD( &(io_c[l_n]), _mm512_add_pd( vec_g1, vec_a ) ); # endif } } #else for (; l_n < l_trip_stream; l_n++) { io_c[l_n] = (*i_h1)*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]) + (*i_h2)*(i_b[l_n]*i_a[l_n]); } #endif /* run the epilogue */ /* #if !defined(__SSE3__) */ { for (; l_n < *i_length; l_n++) { io_c[l_n] = (*i_h1)*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]) + (*i_h2)*(i_b[l_n]*i_a[l_n]); } } /* #else { const __m128d vec_h1 = _mm_loaddup_pd(i_h1); const __m128d vec_h2 = _mm_loaddup_pd(i_h2); const __m128i mask = 
_mm_set_epi32(0,0,-1,-1); for (; l_n < *i_length; l_n++) { __m128d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3, vec_a, vec_b; vec_g1 = _mm_load_sd(&(i_g1[l_n])); vec_tm1 = _mm_load_sd(&(i_tm1[l_n])); vec_g1 = _mm_mul_sd(vec_g1, vec_tm1); vec_g2 = _mm_load_sd(&(i_g2[l_n])); vec_tm2 = _mm_load_sd(&(i_tm2[l_n])); vec_g2 = _mm_mul_sd(vec_g2, vec_tm2); vec_g3 = _mm_load_sd(&(i_g3[l_n])); vec_tm3 = _mm_load_sd(&(i_tm3[l_n])); vec_g3 = _mm_mul_sd(vec_g3, vec_tm3); vec_a = _mm_load_sd(&(i_a[l_n])); vec_b = _mm_load_sd(&(i_b[l_n])); vec_a = _mm_mul_sd(vec_a, vec_b); vec_g1 = _mm_add_sd(vec_g1, vec_g2); vec_a = _mm_mul_sd(vec_a, vec_h2); vec_g1 = _mm_add_sd(vec_g1, vec_g3); vec_g1 = _mm_mul_sd(vec_g1, vec_h1); _mm_maskmoveu_si128(_mm_castpd_si128(_mm_add_pd( vec_g1, vec_a )), mask, (char*)(&(io_c[l_n]))); } } #endif */ } void LIBXSMM_FSYMBOL(stream_update_helmholtz_no_h2)( const double* i_g1, const double* i_g2, const double* i_g3, const double* i_tm1, const double* i_tm2, const double* i_tm3, double* io_c, const double* i_h1, const int* i_length); void LIBXSMM_FSYMBOL(stream_update_helmholtz_no_h2)( const double* i_g1, const double* i_g2, const double* i_g3, const double* i_tm1, const double* i_tm2, const double* i_tm3, double* io_c, const double* i_h1, const int* i_length) { int l_n = 0; int l_trip_prolog = 0; int l_trip_stream = 0; assert(0 != i_length); assert(0 != i_h1); /* init the trip counts */ stream_init( *i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream ); /* run the prologue */ for (; l_n < l_trip_prolog; l_n++) { io_c[l_n] = (*i_h1)*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]); } /* run the bulk, hopefully using streaming stores */ #if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__) && !defined(LIBXSMM_INTRINSICS_STATIC) { const __m256d vec_h1 = _mm256_broadcast_sd(i_h1); /* we need manual unrolling as the compiler otherwise generates too many dependencies */ for (; l_n < l_trip_stream; l_n+=8) { __m256d 
vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1; __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2; vec_g1_1 = _mm256_loadu_pd(&(i_g1[l_n])); vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n])); vec_g1_2 = _mm256_loadu_pd(&(i_g1[l_n+4])); vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4])); vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_tm1_1); vec_g2_1 = _mm256_loadu_pd(&(i_g2[l_n])); vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_tm1_2); vec_g2_2 = _mm256_loadu_pd(&(i_g2[l_n+4])); vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n])); vec_g2_1 = _mm256_mul_pd(vec_g2_1, vec_tm2_1); vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4])); vec_g2_2 = _mm256_mul_pd(vec_g2_2, vec_tm2_2); vec_g3_1 = _mm256_loadu_pd(&(i_g3[l_n])); vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n])); vec_g3_2 = _mm256_loadu_pd(&(i_g3[l_n+4])); vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4])); vec_g3_1 = _mm256_mul_pd(vec_g3_1, vec_tm3_1); vec_g3_2 = _mm256_mul_pd(vec_g3_2, vec_tm3_2); vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g2_1); vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g2_2); vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g3_1); # ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) ); # else _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) ); # endif vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g3_2); # ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) ); # else _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) ); # endif } } #elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__) { const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(i_h1)); for (; l_n < l_trip_stream; l_n+=8) { __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3; vec_g1 = _mm512_loadu_pd(&(i_g1[l_n])); vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n])); vec_g1 = _mm512_mul_pd(vec_g1, vec_tm1); vec_g2 = _mm512_loadu_pd(&(i_g2[l_n])); vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n])); vec_g2 = _mm512_mul_pd(vec_g2, vec_tm2); vec_g3 = 
_mm512_loadu_pd(&(i_g3[l_n])); vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n])); vec_g3 = _mm512_mul_pd(vec_g3, vec_tm3); vec_g1 = _mm512_add_pd(vec_g1, vec_g2); vec_g1 = _mm512_add_pd(vec_g1, vec_g3); # ifdef DISABLE_NONTEMPORAL_STORES _mm512_store_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) ); # else LIBXSMM_INTRINSICS_MM512_STREAM_PD( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) ); # endif } } #else for (; l_n < l_trip_stream; l_n++) { io_c[l_n] = (*i_h1)*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]); } #endif /* run the epilogue */ for (; l_n < *i_length; l_n++) { io_c[l_n] = (*i_h1)*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]); } } void LIBXSMM_FSYMBOL(stream_update_var_helmholtz)( const double* i_g1, const double* i_g2, const double* i_g3, const double* i_tm1, const double* i_tm2, const double* i_tm3, const double* i_a, const double* i_b, double* io_c, const double* i_h1, const double* i_h2, const int* i_length); void LIBXSMM_FSYMBOL(stream_update_var_helmholtz)( const double* i_g1, const double* i_g2, const double* i_g3, const double* i_tm1, const double* i_tm2, const double* i_tm3, const double* i_a, const double* i_b, double* io_c, const double* i_h1, const double* i_h2, const int* i_length) { int l_n = 0; int l_trip_prolog = 0; int l_trip_stream = 0; assert(0 != i_length); /* init the trip counts */ stream_init( *i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream ); /* run the prologue */ /* #if !defined(__SSE3__) */ { for (; l_n < l_trip_prolog; l_n++) { io_c[l_n] = i_h1[l_n]*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]) + i_h2[l_n]*(i_b[l_n]*i_a[l_n]); } } /* #else { const __m128i mask = _mm_set_epi32(0,0,-1,-1); for (; l_n < l_trip_prolog; l_n++) { __m128d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3, vec_a, vec_b, vec_h1, vec_h2; vec_g1 = _mm_load_sd(&(i_g1[l_n])); vec_tm1 = _mm_load_sd(&(i_tm1[l_n])); vec_g1 = _mm_mul_sd(vec_g1, vec_tm1); vec_g2 = _mm_load_sd(&(i_g2[l_n])); 
vec_tm2 = _mm_load_sd(&(i_tm2[l_n])); vec_g2 = _mm_mul_sd(vec_g2, vec_tm2); vec_g3 = _mm_load_sd(&(i_g3[l_n])); vec_tm3 = _mm_load_sd(&(i_tm3[l_n])); vec_g3 = _mm_mul_sd(vec_g3, vec_tm3); vec_a = _mm_load_sd(&(i_a[l_n])); vec_b = _mm_load_sd(&(i_b[l_n])); vec_a = _mm_mul_sd(vec_a, vec_b); vec_g1 = _mm_add_sd(vec_g1, vec_g2); vec_h2 = _mm_load_sd(&(i_h2[l_n])); vec_a = _mm_mul_sd(vec_a, vec_h2); vec_g1 = _mm_add_sd(vec_g1, vec_g3); vec_h1 = _mm_load_sd(&(i_h1[l_n])); vec_g1 = _mm_mul_sd(vec_g1, vec_h1); _mm_maskmoveu_si128(_mm_castpd_si128(_mm_add_pd( vec_g1, vec_a )), mask, (char*)(&(io_c[l_n]))); } } #endif */ /* run the bulk, hopefully using streaming stores */ #if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__) && !defined(LIBXSMM_INTRINSICS_STATIC) { /* we need manual unrolling as the compiler otherwise generates too many dependencies */ for (; l_n < l_trip_stream; l_n+=8) { __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1, vec_a_1, vec_b_1, vec_h1_1, vec_h2_1; __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2, vec_a_2, vec_b_2, vec_h1_2, vec_h2_2; vec_g1_1 = _mm256_loadu_pd(&(i_g1[l_n])); vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n])); vec_g1_2 = _mm256_loadu_pd(&(i_g1[l_n+4])); vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4])); vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_tm1_1); vec_g2_1 = _mm256_loadu_pd(&(i_g2[l_n])); vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_tm1_2); vec_g2_2 = _mm256_loadu_pd(&(i_g2[l_n+4])); vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n])); vec_g2_1 = _mm256_mul_pd(vec_g2_1, vec_tm2_1); vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4])); vec_g2_2 = _mm256_mul_pd(vec_g2_2, vec_tm2_2); vec_g3_1 = _mm256_loadu_pd(&(i_g3[l_n])); vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n])); vec_g3_2 = _mm256_loadu_pd(&(i_g3[l_n+4])); vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4])); vec_g3_1 = _mm256_mul_pd(vec_g3_1, vec_tm3_1); vec_a_1 = _mm256_loadu_pd(&(i_a[l_n])); vec_g3_2 = _mm256_mul_pd(vec_g3_2, vec_tm3_2); vec_a_2 = 
_mm256_loadu_pd(&(i_a[l_n+4])); vec_b_1 = _mm256_loadu_pd(&(i_b[l_n])); vec_a_1 = _mm256_mul_pd(vec_a_1, vec_b_1); vec_b_2 = _mm256_loadu_pd(&(i_b[l_n+4])); vec_a_2 = _mm256_mul_pd(vec_a_2, vec_b_2); vec_h2_1 = _mm256_loadu_pd(&(i_h2[l_n])); vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g2_1); vec_a_1 = _mm256_mul_pd(vec_a_1, vec_h2_1); vec_h2_2 = _mm256_loadu_pd(&(i_h2[l_n+4])); vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g2_2); vec_a_2 = _mm256_mul_pd(vec_a_2, vec_h2_2); vec_h1_1 = _mm256_loadu_pd(&(i_h1[l_n])); vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g3_1); vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_h1_1); # ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n]), _mm256_add_pd( vec_g1_1, vec_a_1 ) ); # else _mm256_stream_pd( &(io_c[l_n]), _mm256_add_pd( vec_g1_1, vec_a_1 ) ); # endif vec_h1_2 = _mm256_loadu_pd(&(i_h1[l_n+4])); vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g3_2); vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_h1_2); # ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n+4]), _mm256_add_pd( vec_g1_2, vec_a_2 ) ); # else _mm256_stream_pd( &(io_c[l_n+4]), _mm256_add_pd( vec_g1_2, vec_a_2 ) ); # endif } } #elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__) { for (; l_n < l_trip_stream; l_n+=8) { __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3, vec_a, vec_b, vec_h1, vec_h2; vec_g1 = _mm512_loadu_pd(&(i_g1[l_n])); vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n])); vec_g1 = _mm512_mul_pd(vec_g1, vec_tm1); vec_g2 = _mm512_loadu_pd(&(i_g2[l_n])); vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n])); vec_g2 = _mm512_mul_pd(vec_g2, vec_tm2); vec_g3 = _mm512_loadu_pd(&(i_g3[l_n])); vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n])); vec_g3 = _mm512_mul_pd(vec_g3, vec_tm3); vec_a = _mm512_loadu_pd(&(i_a[l_n])); vec_b = _mm512_loadu_pd(&(i_b[l_n])); vec_a = _mm512_mul_pd(vec_a, vec_b); vec_g1 = _mm512_add_pd(vec_g1, vec_g2); vec_h2 = _mm512_loadu_pd(&(i_h2[l_n])); vec_a = _mm512_mul_pd(vec_a, vec_h2); vec_g1 = _mm512_add_pd(vec_g1, vec_g3); vec_h1 = _mm512_loadu_pd(&(i_h1[l_n])); 
vec_g1 = _mm512_mul_pd(vec_g1, vec_h1); # ifdef DISABLE_NONTEMPORAL_STORES _mm512_store_pd( &(io_c[l_n]), _mm512_add_pd( vec_g1, vec_a ) ); # else LIBXSMM_INTRINSICS_MM512_STREAM_PD( &(io_c[l_n]), _mm512_add_pd( vec_g1, vec_a ) ); # endif } } #else for (; l_n < l_trip_stream; l_n++) { io_c[l_n] = i_h1[l_n]*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]) + i_h2[l_n]*(i_b[l_n]*i_a[l_n]); } #endif /* run the epilogue */ /* #if !defined(__SSE3__) */ { for (; l_n < *i_length; l_n++) { io_c[l_n] = i_h1[l_n]*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]) + i_h2[l_n]*(i_b[l_n]*i_a[l_n]); } } /* #else { const __m128i mask = _mm_set_epi32(0,0,-1,-1); for (; l_n < *i_length; l_n++) { __m128d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3, vec_a, vec_b; vec_g1 = _mm_load_sd(&(i_g1[l_n])); vec_tm1 = _mm_load_sd(&(i_tm1[l_n])); vec_g1 = _mm_mul_sd(vec_g1, vec_tm1); vec_g2 = _mm_load_sd(&(i_g2[l_n])); vec_tm2 = _mm_load_sd(&(i_tm2[l_n])); vec_g2 = _mm_mul_sd(vec_g2, vec_tm2); vec_g3 = _mm_load_sd(&(i_g3[l_n])); vec_tm3 = _mm_load_sd(&(i_tm3[l_n])); vec_g3 = _mm_mul_sd(vec_g3, vec_tm3); vec_a = _mm_load_sd(&(i_a[l_n])); vec_b = _mm_load_sd(&(i_b[l_n])); vec_a = _mm_mul_sd(vec_a, vec_b); vec_g1 = _mm_add_sd(vec_g1, vec_g2); vec_h2 = _mm_load_sd(&(i_h2[l_n])); vec_a = _mm_mul_sd(vec_a, vec_h2); vec_g1 = _mm_add_sd(vec_g1, vec_g3); vec_h1 = _mm_load_sd(&(i_h1[l_n])); vec_g1 = _mm_mul_sd(vec_g1, vec_h1); _mm_maskmoveu_si128(_mm_castpd_si128(_mm_add_pd( vec_g1, vec_a )), mask, (char*)(&(io_c[l_n]))); } } #endif */ } libxsmm-1.17/samples/nek/stream_update_kernels.f000066400000000000000000000124141415223013700220650ustar00rootroot00000000000000!=======================================================================! ! Copyright (c) Intel Corporation - All rights reserved. ! ! This file is part of the LIBXSMM library. ! ! ! ! For information on the license, see the LICENSE file. ! ! Further information: https://github.com/hfp/libxsmm/ ! ! 
SPDX-License-Identifier: BSD-3-Clause ! !=======================================================================! ! Alexander Heinecke (Intel Corp.) !=======================================================================! MODULE STREAM_UPDATE_KERNELS USE, INTRINSIC :: ISO_C_BINDING IMPLICIT NONE INTERFACE SUBROUTINE stream_update_helmholtz( i_g1, i_g2, i_g3, & & i_tm1, i_tm2, i_tm3, & & i_a, i_b, io_c, & & i_h1, i_h2, i_length ) IMPORT :: C_DOUBLE, C_INT REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_g1 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_g2 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_g3 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_tm1 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_tm2 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_tm3 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_a REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_b REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(INOUT) :: io_c REAL(KIND=C_DOUBLE), INTENT(IN) :: i_h1 REAL(KIND=C_DOUBLE), INTENT(IN) :: i_h2 INTEGER(C_INT), INTENT(IN) :: i_length END SUBROUTINE SUBROUTINE stream_update_helmholtz_no_h2( & & i_g1, i_g2, i_g3, & & i_tm1, i_tm2, i_tm3, & & io_c, i_h1, i_length ) IMPORT :: C_DOUBLE, C_INT REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_g1 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_g2 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_g3 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_tm1 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_tm2 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_tm3 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(INOUT) :: io_c REAL(KIND=C_DOUBLE), INTENT(IN) :: i_h1 INTEGER(C_INT), INTENT(IN) :: i_length END SUBROUTINE SUBROUTINE stream_update_var_helmholtz( & & i_g1, i_g2, i_g3, & & i_tm1, i_tm2, i_tm3, & & i_a, i_b, io_c, & & i_h1, i_h2, i_length ) IMPORT :: C_DOUBLE, C_INT REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_g1 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_g2 REAL(KIND=C_DOUBLE), 
DIMENSION(*), INTENT(IN) :: i_g3 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_tm1 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_tm2 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_tm3 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_a REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_b REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(INOUT) :: io_c REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_h1 REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_h2 INTEGER(C_INT), INTENT(IN) :: i_length END SUBROUTINE SUBROUTINE stream_vector_compscale( i_a, i_b, io_c, i_length ) IMPORT :: C_DOUBLE, C_INT REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_a REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_b REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(INOUT) :: io_c INTEGER(C_INT), INTENT(IN) :: i_length END SUBROUTINE SUBROUTINE stream_vector_copy( i_a, io_c, i_length ) IMPORT :: C_DOUBLE, C_INT REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(IN) :: i_a REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(INOUT) :: io_c INTEGER(C_INT), INTENT(IN) :: i_length END SUBROUTINE SUBROUTINE stream_vector_set( i_scalar, io_c, i_length ) IMPORT :: C_DOUBLE, C_INT REAL(KIND=C_DOUBLE), INTENT(IN) :: i_scalar REAL(KIND=C_DOUBLE), DIMENSION(*), INTENT(INOUT) :: io_c INTEGER(C_INT), INTENT(IN) :: i_length END SUBROUTINE END INTERFACE END MODULE libxsmm-1.17/samples/nek/torture.sh000077500000000000000000000015061415223013700174010ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### for i in `seq 1 1 300` do ./rstr 32 32 32 32 32 32 16 >/dev/null done libxsmm-1.17/samples/packed/000077500000000000000000000000001415223013700160065ustar00rootroot00000000000000libxsmm-1.17/samples/packed/gemm/000077500000000000000000000000001415223013700167335ustar00rootroot00000000000000libxsmm-1.17/samples/packed/gemm/Makefile000066400000000000000000000074731415223013700204060ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) # Fortran code here does not allow for PEDANTIC=2 # override PEDANTIC = 1 PEDANTIC = 0 BLAS = 0 OMP = 0 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) 
F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (,$(strip $(FC))) $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ endif $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(OBJECTS) $(FTNOBJS) $(LIBDEP) $(LIB_FLD) -o $@ $(OBJECTS) $(FTNOBJS) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno 
$(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/packed/gemm/blas_aux.c000066400000000000000000000011741415223013700207000ustar00rootroot00000000000000/* Optionally link-in the BLAS routines lsame_() and xerbla_() */ #if !defined(__BLAS) || (0 != __BLAS) #include int lsame_(const char* ca, const char* cb) { if ( *ca == *cb ) return 1; if ( (*cb >= 'a') && (*cb <= 'z') ) { if ( *ca == *cb + 32 ) return 1; } else if ( (*cb >= 'A') && (*cb <= 'Z') ) { if ( *ca == *cb - 32 ) return 1; } return 0; } void xerbla_(const char* c, const int* info) { printf(" ** On entry to %s parameter number %02d had an illegal value\n", c, *info); } int ilaenv_ ( int *ispec, char *name, char *opts, int *n1, int *n2, int *n3, int *n4 ) { return ( 1 ); } #endif libxsmm-1.17/samples/packed/gemm/dgemm.f000066400000000000000000000130431415223013700201740ustar00rootroot00000000000000 SUBROUTINE DGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) ! ! -- Reference BLAS level3 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. DOUBLE PRECISION ALPHA,BETA INTEGER K,LDA,LDB,LDC,M,N CHARACTER TRANSA,TRANSB ! .. ! .. Array Arguments .. DOUBLE PRECISION A(LDA,*),B(LDB,*),C(LDC,*) ! .. ! ! ===================================================================== ! ! .. External Functions .. LOGICAL LSAME EXTERNAL LSAME ! .. ! .. External Subroutines .. EXTERNAL XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX ! .. ! .. Local Scalars .. 
DOUBLE PRECISION TEMP INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB LOGICAL NOTA,NOTB ! .. ! .. Parameters .. DOUBLE PRECISION ONE,ZERO PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) ! .. ! ! Set NOTA and NOTB as true if A and B respectively are not ! transposed and set NROWA, NCOLA and NROWB as the number of rows ! and columns of A and the number of rows of B respectively. ! NOTA = LSAME(TRANSA,'N') NOTB = LSAME(TRANSB,'N') IF (NOTA) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF (NOTB) THEN NROWB = K ELSE NROWB = N END IF ! ! Test the input parameters. ! INFO = 0 IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANSA,'C')) .AND. & (.NOT.LSAME(TRANSA,'T'))) THEN INFO = 1 ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANSB,'C')) .AND. & (.NOT.LSAME(TRANSB,'T'))) THEN INFO = 2 ELSE IF (M.LT.0) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 8 ELSE IF (LDB.LT.MAX(1,NROWB)) THEN INFO = 10 ELSE IF (LDC.LT.MAX(1,M)) THEN INFO = 13 END IF IF (INFO.NE.0) THEN CALL XERBLA('DGEMM ',INFO) RETURN END IF ! ! Quick return if possible. ! IF ((M.EQ.0) .OR. (N.EQ.0) .OR. & (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN ! ! And if alpha.eq.zero. ! IF (ALPHA.EQ.ZERO) THEN IF (BETA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M C(I,J) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1,N DO 30 I = 1,M C(I,J) = BETA*C(I,J) 30 CONTINUE 40 CONTINUE END IF RETURN END IF ! ! Start the operations. ! IF (NOTB) THEN IF (NOTA) THEN ! ! Form C := alpha*A*B + beta*C. ! DO 90 J = 1,N IF (BETA.EQ.ZERO) THEN DO 50 I = 1,M C(I,J) = ZERO 50 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 60 I = 1,M C(I,J) = BETA*C(I,J) 60 CONTINUE END IF DO 80 L = 1,K IF (B(L,J).NE.ZERO) THEN TEMP = ALPHA*B(L,J) DO 70 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 70 CONTINUE END IF 80 CONTINUE 90 CONTINUE ELSE ! ! Form C := alpha*A**T*B + beta*C ! 
DO 120 J = 1,N DO 110 I = 1,M TEMP = ZERO DO 100 L = 1,K TEMP = TEMP + A(L,I)*B(L,J) 100 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 110 CONTINUE 120 CONTINUE END IF ELSE IF (NOTA) THEN ! ! Form C := alpha*A*B**T + beta*C ! DO 170 J = 1,N IF (BETA.EQ.ZERO) THEN DO 130 I = 1,M C(I,J) = ZERO 130 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 140 I = 1,M C(I,J) = BETA*C(I,J) 140 CONTINUE END IF DO 160 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*B(J,L) DO 150 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 150 CONTINUE END IF 160 CONTINUE 170 CONTINUE ELSE ! ! Form C := alpha*A**T*B**T + beta*C ! DO 200 J = 1,N DO 190 I = 1,M TEMP = ZERO DO 180 L = 1,K TEMP = TEMP + A(L,I)*B(J,L) 180 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 190 CONTINUE 200 CONTINUE END IF END IF ! RETURN ! ! End of DGEMM . ! END libxsmm-1.17/samples/packed/gemm/gemm.c000066400000000000000000000672441415223013700200410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Greg Henry, Hans Pabst, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #if 0 #define USE_KERNEL_GENERATION_DIRECTLY #endif #if 0 #define USE_PREDEFINED_ASSEMBLY #define USE_XSMM_GENERATED #define TIME_MKL #endif #if 0 #define TEST_SINGLE #endif #if !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_XSMM_GENERATED) && !defined(TIME_MKL) && \ (!defined(__linux__) || !defined(USE_KERNEL_GENERATION_DIRECTLY)) # define USE_XSMM_GENERATED # include #else # include # include # include # include # include #endif #include #include #include #include #define BUFSIZE 32*32 #define BUFSIZE2 64000 #if 0 #define TRIANGLE_IS_IDENTITY #endif #if 1 #define AVX2_TESTING #endif #if 0 #define AVX512_TESTING #endif #if !defined(AVX2_TESTING) && !defined(AVX512_TESTING) #define AVX2_TESTING #endif #if defined(AVX2_TESTING) && defined(AVX512_TESTING) #error Compile with either AVX2_TESTING or AVX512_TESTING never both #endif LIBXSMM_INLINE void dcopy_to_temp ( int layout, double *A, int lda, int m, int n, double *Atemp, unsigned int VLEN ) { int i, j; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); exit(-1); } if ( layout == 102 ) { /* printf("Column major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { Atemp[i+j*m] = A[i*VLEN+j*lda*VLEN]; } } #if EVENTUALLY_USE_THIS_LOOP_IT_SHOULD_BE_FASTER for ( j = 0; j < n; j++ ) { for ( i = 0, ia = 0; i < m; i++, ia+=VLEN ) { Atemp[i+j*m] = A[ ia+j*lda*VLEN ]; } } #endif } else { /* printf("Row major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ Atemp[i+j*m] = A[j*VLEN+i*lda*VLEN]; } } } } LIBXSMM_INLINE void scopy_to_temp ( int layout, float *A, int lda, int m, int n, float *Atemp, unsigned int VLEN ) { int i, j; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); exit(-1); } if ( layout == 102 ) { /* printf("Column major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { Atemp[i+j*m] = 
A[i*VLEN+j*lda*VLEN]; } } } else { /* printf("Row major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ Atemp[i+j*m] = A[j*VLEN+i*lda*VLEN]; } } } } LIBXSMM_INLINE void dcopy_from_temp ( int layout, double *A, int lda, int m, int n, double *Atemp, unsigned int VLEN ) { int i, j, ia; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); } if ( layout == 102 ) { for ( j = 0; j < n; j++ ) { for ( i = 0, ia = 0; i < m; i++, ia+=VLEN ) { A[ia+j*lda*VLEN] = Atemp[i+j*m]; } } } else { for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ A[j*VLEN+i*lda*VLEN] = Atemp[i+j*m]; } } } } LIBXSMM_INLINE void scopy_from_temp ( int layout, float *A, int lda, int m, int n, float *Atemp, unsigned int VLEN ) { int i, j, ia; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); } if ( layout == 102 ) { for ( j = 0; j < n; j++ ) { for ( i = 0, ia = 0; i < m; i++, ia+=VLEN ) { A[ia+j*lda*VLEN] = Atemp[i+j*m]; } } } else { for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ A[j*VLEN+i*lda*VLEN] = Atemp[i+j*m]; } } } } #if !defined(USE_MKL_FOR_REFERENCE) && !defined(LIBXSMM_NOFORTRAN) && (!defined(__BLAS) || (0 != __BLAS)) extern void dgemm_(); /* Reference code for compact dgemm. Note that this just copies data into a buffer from the compact storage and calls the regular dgemm code. 
This is very naive reference code just used for testing purposes */ LIBXSMM_INLINE void compact_dgemm_ ( unsigned int *layout, char *transa, char *transb, unsigned int *m, unsigned int *n, unsigned int *k, double *alpha, double *A, unsigned int *lda, double *B, unsigned int *ldb, double *beta, double *C, unsigned int *ldc, unsigned int *nmat, unsigned int *VLEN ) { unsigned int i, j, num, info; double *Ap, Atemp[BUFSIZE]; double *Bp, Btemp[BUFSIZE]; double *Cp, Ctemp[BUFSIZE]; static int ntimes = 0; char ntrans='N'; if ( ++ntimes < 3 ) printf("Inside reference compact_dgemm_()\n"); if ( ++ntimes < 3 ) printf("layout=%d m/n/k=%d %d %d lda/b/c=%d %d %d nmat=%d VLEN=%d\n",*layout,*m,*n,*k,*lda,*ldb,*ldc,*nmat,*VLEN); for ( i = 0, num = 0; i < (*nmat); i+= *VLEN, num++ ) { for ( j = 0; j < *VLEN; j++ ) { /* Unpack the data, call a reference DGEMM, repack the data */ Ap = &A[j+num*(*lda)*(*k)*(*VLEN)]; Bp = &B[j+num*(*ldb)*(*n)*(*VLEN)]; Cp = &C[j+num*(*ldc)*(*n)*(*VLEN)]; if (++ntimes < 3 ) printf("Doing a dgemm at place i=%d j=%d num=%d Ap[%d]=%g\n",i,j,num,j+num*(*lda)*(*k)*(*VLEN),Ap[0]); dcopy_to_temp ( *layout, Ap, *lda, *m, *k, Atemp, *VLEN ); dcopy_to_temp ( *layout, Bp, *ldb, *k, *n, Btemp, *VLEN ); dcopy_to_temp ( *layout, Cp, *ldc, *m, *n, Ctemp, *VLEN ); dgemm_ ( transa, transb, m, n, k, alpha, Atemp, m, Btemp, k, beta, Ctemp, m ); dcopy_from_temp ( *layout, Cp, *ldc, *m, *n, Ctemp, *VLEN ); } } } extern void sgemm_(); /* Reference code for compact sgemm. Note that this just copies data into a buffer from the compact storage and calls the regular sgemm code. This is very naive reference code just used for testing purposes */ /* Note: if layout==101 (row major), then this code is known to only work when * nmat == VLEN. 
To check for accuracy otherwise, transpose everything */ LIBXSMM_INLINE void compact_sgemm_ ( char *transa, char *transb, unsigned int *layout, unsigned int *m, unsigned int *n, unsigned int *k, float *alpha, float *A, unsigned int *lda, float *B, unsigned int *ldb, float *beta, float *C, unsigned int *ldc, unsigned int *nmat, unsigned int *VLEN ) { unsigned int i, j, num, info; float *Ap, Atemp[BUFSIZE]; float *Bp, Btemp[BUFSIZE]; float *Cp, Ctemp[BUFSIZE]; static int ntimes = 0; char ntrans='N'; if ( ++ntimes < 3 ) printf("Inside reference compact_sgemm_()\n"); if ( ++ntimes < 3 ) printf("layout=%d m/n/k=%d %d %d lda/b/c=%d %d %d nmat=%d VLEN=%d\n",*layout,*m,*n,*k,*lda,*ldb,*ldc,*nmat,*VLEN); for ( i = 0, num = 0; i < (*nmat); i+= *VLEN, num++ ) { for ( j = 0; j < *VLEN; j++ ) { /* Unpack the data, call a reference DGEMM, repack the data */ Ap = &A[j+num*(*lda)*(*k)*(*VLEN)]; Bp = &B[j+num*(*ldb)*(*n)*(*VLEN)]; Cp = &C[j+num*(*ldc)*(*n)*(*VLEN)]; if (++ntimes < 3 ) printf("Doing a sgemm at place i=%d j=%d num=%d Ap[%d]=%g\n",i,j,num,j+num*(*lda)*(*k)*(*VLEN),Ap[0]); scopy_to_temp ( *layout, Ap, *lda, *m, *k, Atemp, *VLEN ); scopy_to_temp ( *layout, Bp, *ldb, *k, *n, Btemp, *VLEN ); scopy_to_temp ( *layout, Cp, *ldc, *m, *n, Ctemp, *VLEN ); sgemm_ ( transa, transb, m, n, k, alpha, Atemp, m, Btemp, k, beta, Ctemp, m ); scopy_from_temp ( *layout, Cp, *ldc, *m, *n, Ctemp, *VLEN ); } } } #endif LIBXSMM_INLINE void dfill_matrix ( double *matrix, unsigned int ld, unsigned int m, unsigned int n ) { unsigned int i, j; double dtmp; if ( ld < m ) { fprintf(stderr,"Error in dfill_matrix: ld=%u m=%u mismatched!\n",ld,m); exit(-1); } for ( j = 1; j <= n; j++ ) { /* Fill through the leading dimension */ for ( i = 1; i <= ld; i++ ) { dtmp = 1.0 - 2.0*libxsmm_rng_f64(); matrix [ (j-1)*ld + (i-1) ] = dtmp; } } } LIBXSMM_INLINE void dfill_identity ( double *matrix, unsigned int ld, unsigned int m, unsigned int n, int VLEN, int number_of_cases ) { unsigned int h, i, j, k, ia; 
double dtmp; if ( ld < m ) { fprintf(stderr,"Error in dfill_identity: ld=%u m=%u mismatched!\n",ld,m); exit(-1); } for ( h = 0; h < (unsigned int)number_of_cases; h++ ) { ia = h*ld*n*VLEN; for ( j = 1; j <= n; j++ ) { for ( i = 1; i <= ld; i++ ) { if ( i == j ) dtmp = 1.0; else dtmp = 0.0; for ( k = 0; k < (unsigned int)VLEN; k++ ) matrix[ia++] = dtmp; } } } } LIBXSMM_INLINE void sfill_matrix ( float *matrix, unsigned int ld, unsigned int m, unsigned int n ) { unsigned int i, j; double dtmp; if ( ld < m ) { fprintf(stderr,"Error is sfill_matrix: ld=%u m=%u mismatched!\n",ld,m); exit(-1); } for ( j = 1; j <= n; j++ ) { /* Fill through the leading dimension */ for ( i = 1; i <= ld; i++ ) { dtmp = 1.0 - 2.0*libxsmm_rng_f64(); matrix [ (j-1)*ld + (i-1) ] = (float) dtmp; } } } LIBXSMM_INLINE double residual_d ( double *A, unsigned int lda, unsigned int m, unsigned int n, double *B, unsigned int ldb, unsigned int *nerrs, unsigned int *ncorr ) { unsigned int i, j, address, i4, j4, k4, i8, j8, k8; double atmp, btmp, dtmp, ref, derror; static int ntimes = 0; *nerrs = 0; *ncorr = 0; derror = 0.0; for ( j = 1; j<= n; j++ ) { for ( i = 1; i <= m; i++ ) { atmp = A[ (j-1)*lda + (i-1)]; btmp = B[ (j-1)*ldb + (i-1)]; ref = LIBXSMM_MAX(atmp,-atmp); if ( atmp >= btmp ) { dtmp = atmp - btmp; } else { dtmp = btmp - atmp; } if ( isnan(dtmp) || isinf(dtmp) ) { if ( ++ntimes < 15 ) { printf("Denormal bug: A(%u,%u) is %g B(%u,%u) is %g\n",i,j,atmp,i,j,btmp); } } if ( (dtmp / ref > 1.0e-12) && (dtmp > 1.0e-15) ) { *nerrs = *nerrs + 1; if ( ++ntimes < 15 ) { address = (j-1)*lda + (i-1); j4 = (int)(address/(lda*4)) + 1; i4 = (int)((address-(j4-1)*lda*4) / 4) + 1; k4 = (address-(j4-1)*lda*4 - (i4-1)*4) + 1; j8 = (int)(address/(lda*8)) + 1; i8 = (int)((address-(j8-1)*lda*8) / 8) + 1; k8 = (address-(j8-1)*lda*8 - (i8-1)*8) + 1; printf("Bug #%i: A[%u]=A(%u,%u)=A4(%u,%u,%u)=A8(%u,%u,%u) expected=%g instead=%g err=%g\n",ntimes,address,i,j,i4,j4,k4,i8,j8,k8,atmp,btmp,dtmp); } } else { if ( (*nerrs 
> 0) && (ntimes < 10) && (*ncorr < 40) ) { printf("Cor #%u: A[%u]=A(%u,%u) expected=%g\n",*ncorr+1,(j-1)*lda+(i-1),i,j,atmp); } *ncorr = *ncorr + 1; } derror += dtmp; } } return ( derror ); } LIBXSMM_INLINE double residual_s ( float *A, unsigned int lda, unsigned int m, unsigned int n, float *B, unsigned int ldb, unsigned int *nerrs, unsigned int *ncorr ) { unsigned int i, j, address, i4, j4, k4, i8, j8, k8; double atmp, btmp, dtmp, ref, derror; static int ntimes = 0; *nerrs = 0; *ncorr = 0; derror = 0.0; for ( j = 1; j<= n; j++ ) { for ( i = 1; i <= m; i++ ) { atmp = (double) A[ (j-1)*lda + (i-1)]; btmp = (double) B[ (j-1)*ldb + (i-1)]; ref = LIBXSMM_MAX(atmp,-atmp); if ( atmp >= btmp ) { dtmp = atmp - btmp; } else { dtmp = btmp - atmp; } if ( isnan(dtmp) || isinf(dtmp) ) { if ( ++ntimes < 15 ) { printf("Denormal bug: A(%u,%u) is %g B(%u,%u) is %g\n",i,j,atmp,i,j,btmp); } } if ( (dtmp / ref > 1.0e-4) && (dtmp > 1.0e-7) ) { *nerrs = *nerrs + 1; if ( ++ntimes < 15 ) { address = (j-1)*lda + (i-1); j4 = (int)(address/(lda*4)) + 1; i4 = (int)((address-(j4-1)*lda*4) / 4) + 1; k4 = (address-(j4-1)*lda*4 - (i4-1)*4) + 1; j8 = (int)(address/(lda*8)) + 1; i8 = (int)((address-(j8-1)*lda*8) / 8) + 1; k8 = (address-(j8-1)*lda*8 - (i8-1)*8) + 1; printf("Bug #%i: A[%u]=A(%u,%u)=A4(%u,%u,%u)=A8(%u,%u,%u) expected=%g instead=%g err=%g\n",ntimes,address,i,j,i4,j4,k4,i8,j8,k8,atmp,btmp,dtmp); } } else { if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) ) { printf("Cor #%u: A(%u,%u) expected=%g\n",*ncorr+1,i,j,atmp); } *ncorr = *ncorr + 1; } derror += dtmp; } } return ( derror ); } #ifdef USE_PREDEFINED_ASSEMBLY extern void gemm_(); #endif #ifdef MKL_TIMER extern double dsecnd_(); #endif int main(int argc, char* argv[]) { unsigned int m=8, n=8, k=8, lda=8, ldb=8, ldc=8, nerrs, num, nmat; unsigned int layout, asize, bsize, ntest, ncorr; #ifdef AVX512_TESTING unsigned int VLEND=8, VLENS=16; int arch=LIBXSMM_X86_AVX512_CORE; #else unsigned int VLEND=4, VLENS=8; int 
arch=LIBXSMM_X86_AVX2; #endif unsigned int nmats, nmatd; unsigned int i, j, l, iunroll, junroll, loopi, loopj; char side='L', uplo='U', transa='N', transb='N', diag='N'; unsigned int typesize8 = 8; unsigned int typesize4 = 4; float *sa, *sb, *sc, *sd, *sc1; double *da, *db, *dc, *dd, *dc1; double dalpha = 1.0; float salpha = (float)dalpha; double dbeta = 1.0; float sbeta = (float)dbeta; double dtmp; const unsigned char *cptr = NULL; unsigned long op_count; const libxsmm_pgemm_descriptor* desc8 = NULL; const libxsmm_pgemm_descriptor* desc4 = NULL; #ifdef USE_XSMM_GENERATED libxsmm_descriptor_blob blob; libxsmm_pgemm_xfunction mykernel = NULL; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) void (*opcode_routine)(); unsigned char *routine_output; libxsmm_generated_code io_generated_code; int pagesize = sysconf(_SC_PAGE_SIZE); if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n"); routine_output = (unsigned char *) mmap(NULL, BUFSIZE2, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0,0); if (mprotect(routine_output, BUFSIZE2, PROT_EXEC | PROT_READ | PROT_WRITE ) == -1) fprintf(stderr,"mprotect\n"); printf("Routine ready\n"); io_generated_code.generated_code = &routine_output[0]; io_generated_code.buffer_size = BUFSIZE2; io_generated_code.code_size = 0; io_generated_code.code_type = 2; io_generated_code.last_error = 0; io_generated_code.sf_size = 0; #endif printf("\nUSAGE: %s m n k lda ldb ldc nmat layout ntest transa transb iunroll junroll loopj loopi\n",argv[0]); if ( argc <= 3 ) { #ifdef TEST_SINGLE printf("Compact SGEMM a C_mxn<-C_mxn+A_mxk*B_kxn matrix of leading dims lda/b/c\n"); printf("This will test the jit of 1 VLEN=%d ",VLENS); if ( VLENS==8 ) printf("(AVX2)"); else printf("(AVX512)"); #else printf("Compact DGEMM a C_mxn<-C_mxn+A_mxk*B_kxn matrix of leading dims lda/b/c\n"); printf("This will test the jit of 1 VLEN=%d ",VLEND); if ( VLEND==4 ) printf("(AVX2)"); else printf("(AVX512)"); #endif printf(" work of nmat at a time\n"); 
printf("Configurable: M-loop controlled by iunroll & loopi. N-loop by junroll & loopj\n"); printf("Defaults: m=n=k=lda=ldb=ldc=nmat=8, layout=102 (col major), transa=/b='N', ntest=1\n"); } if ( argc > 1 ) m = atoi(argv[1]); else m = 8; if ( argc > 2 ) n = atoi(argv[2]); else n = 8; if ( argc > 3 ) k = atoi(argv[3]); else k = 8; if ( argc > 4 ) lda= atoi(argv[4]); else lda = 8; if ( argc > 5 ) ldb= atoi(argv[5]); else ldb = 8; if ( argc > 6 ) ldc= atoi(argv[6]); else ldc = 8; if ( argc > 7 ) nmat = atoi(argv[7]); else nmat = 8; if ( argc > 8 ) layout = atoi(argv[8]); else layout=102; if ( argc > 9 ) ntest = atoi(argv[9]); else ntest = 1; if ( argc > 10 ) transa = argv[10][0]; else transa = 'N'; if ( argc > 11 ) transb = argv[11][0]; else transb = 'N'; if ( argc > 12 ) iunroll=atoi(argv[12]); else iunroll=0; if ( argc > 13 ) junroll=atoi(argv[13]); else junroll=0; if ( argc > 14 ) loopj=atoi(argv[14]); else loopj=0; if ( argc > 15 ) loopi=atoi(argv[15]); else loopi=0; salpha = (float)dalpha; m = LIBXSMM_MAX(m,1); n = LIBXSMM_MAX(n,1); k = LIBXSMM_MAX(k,1); ntest = LIBXSMM_MAX(ntest,1); nmat = LIBXSMM_MAX(nmat,VLEND); layout = LIBXSMM_MAX(LIBXSMM_MIN(layout,102),101); if ( transa!='N' && transa!='n' && transa!='T' && transa!='t' ) transa='N'; if ( transb!='N' && transb!='n' && transb!='T' && transb!='t' ) transb='N'; lda = LIBXSMM_MAX(lda,m); ldb = LIBXSMM_MAX(ldb,k); ldc = LIBXSMM_MAX(ldc,m); nmats = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS)); nmatd = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND)); #ifdef TEST_SINGLE nmat = nmats; #else nmat = nmatd; #endif op_count = (unsigned long)(nmat * 2.0 * (double)m * (double)n * (double)k); #ifdef TEST_SINGLE printf("This is a real*%d tester for JIT compact SGEMM %c%c kernels! 
(m=%u n=%u k=%u lda=%u ldb=%u ldc=%u layout=%d nmat=%d alpha=%g beta=%g iun=%d jun=%d loopi=%d loopj=%d VLEN=%d)\n",typesize4,transa,transb,m,n,k,lda,ldb,ldc,layout,nmat,dalpha,dbeta,iunroll,junroll,loopi,loopj,VLENS); #else printf("This is a real*%d tester for JIT compact DGEMM %c%c kernels! (m=%u n=%u k=%u lda=%u ldb=%u ldc=%u layout=%d nmat=%d alpha=%g beta=%g iun=%d jun=%d loopi=%d loopj=%d VLEN=%d)\n",typesize8,transa,transb,m,n,k,lda,ldb,ldc,layout,nmat,dalpha,dbeta,iunroll,junroll,loopi,loopj,VLEND); #endif #ifdef USE_XSMM_GENERATED printf("This code tests the LIBXSMM generated kernels\n"); #endif #ifdef USE_PREDEFINED_ASSEMBLY printf("This code tests some predefined assembly kernel\n"); #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) printf("This code tests kernel generation directly\n"); #endif #ifdef TIME_MKL printf("This code tests MKL compact batch directly\n"); #endif #ifdef AVX512_TESTING printf("This tests AVX512 binaries\n"); #endif #ifdef AVX2_TESTING printf("This tests AVX2 binaries\n"); #endif desc8 = libxsmm_pgemm_descriptor_init(&blob, typesize8, m, n, k, lda, ldb, ldc, &dalpha, transa, transb, layout ); #ifdef TEST_SINGLE desc4 = libxsmm_pgemm_descriptor_init(&blob, typesize4, m, n, k, lda, ldb, ldc, &dalpha, transa, transb, layout ); #endif printf("Descriptor set\n"); #ifdef USE_XSMM_GENERATED printf("calling libxsmm_dispatch_pgemm: typesize8=%u\n",typesize8); mykernel = libxsmm_dispatch_pgemm(desc8); printf("done calling libxsmm_dispatch_pgemm: typesize8=%u\n",typesize8); if ( mykernel == NULL ) printf("R8 Kernel after the create call is null\n"); #ifdef TEST_SINGLE mykernel = libxsmm_dispatch_pgemm(desc4); if ( mykernel == NULL ) printf("R4 kernel after the create call is null\n"); #endif #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) libxsmm_generator_pgemm_kernel( &io_generated_code, desc8, arch, iunroll, junroll, loopi, loopj ); #endif #ifndef NO_ACCURACY_CHECK printf("mallocing 
matrices\n"); #endif sa = (float *) malloc ( lda*k*nmat*sizeof(float) ); da = (double *) malloc ( lda*k*nmat*sizeof(double) ); sb = (float *) malloc ( ldb*n*nmat*sizeof(float) ); db = (double *) malloc ( ldb*n*nmat*sizeof(double) ); sc1 = (float *) malloc ( ldc*n*nmat*sizeof(float) ); dc1 = (double *) malloc ( ldc*n*nmat*sizeof(double) ); sc = (float *) malloc ( ldc*n*nmat*sizeof(float) ); dc = (double *) malloc ( ldc*n*nmat*sizeof(double) ); sd = (float *) malloc ( ldc*n*nmat*sizeof(float) ); dd = (double *) malloc ( ldc*n*nmat*sizeof(double) ); #ifndef NO_ACCURACY_CHECK printf("filling matrices\n"); #endif sfill_matrix ( sa, lda, m, k*nmat ); sfill_matrix ( sb, ldb, k, n*nmat ); sfill_matrix ( sc, ldc, m, n*nmat ); dfill_matrix ( da, lda, m, k*nmat ); dfill_matrix ( db, ldb, k, n*nmat ); dfill_matrix ( dc, ldc, m, n*nmat ); #ifndef NO_ACCURACY_CHECK for ( i = 0; i < ldc*n*nmat; i++ ) sd[i]=sc[i]; for ( i = 0; i < ldc*n*nmat; i++ ) dd[i]=dc[i]; for ( i = 0; i < ldc*n*nmat; i++ ) sc1[i]=sc[i]; for ( i = 0; i < ldc*n*nmat; i++ ) dc1[i]=dc[i]; printf("Pointing at the kernel now\n"); #endif #ifdef USE_XSMM_GENERATED cptr = (const unsigned char*) mykernel; #endif #ifdef USE_PREDEFINED_ASSEMBLY cptr = (const unsigned char*) gemm_; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) cptr = (const unsigned char*) &routine_output[0]; opcode_routine = (void *) &cptr[0]; #endif #ifndef TIME_MKL # define DUMP_ASSEMBLY_FILE #endif #ifdef DUMP_ASSEMBLY_FILE printf("Dumping assembly file\n"); FILE *fp = fopen("foo.s","w"); char buffer[80]; fputs("\t.text\n",fp); fputs("\t.align 256\n",fp); fputs("\t.globl gemm_\n",fp); fputs("gemm_:\n",fp); for (i = 0; i < 7000; i+=4 ) { sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]); fputs(buffer,fp); } fputs("\tretq\n",fp); fputs("\t.type gemm_,@function\n",fp); fputs("\t.size gemm_,.-gemm_\n",fp); fclose(fp); #endif #if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL) # 
include MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR; MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT; MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER; MKL_TRANSPOSE TRANSA = (transa == 'N' || transa == 'n') ? MKL_NOTRANS : MKL_TRANS; MKL_TRANSPOSE TRANSB = (transb == 'N' || transb == 'n') ? MKL_NOTRANS : MKL_TRANS; MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? MKL_NONUNIT : MKL_UNIT; MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact(); #if 0 MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX; #endif #endif #ifndef NO_ACCURACY_CHECK printf("Before routine, initial A(1,1)=%g A[256]=%g\n",da[0],da[256]); #endif #ifdef USE_PREDEFINED_ASSEMBLY double one = 1.0; #endif double timer, firsttime = 0; #ifdef MKL_TIMER double tmptimer; tmptimer = dsecnd_(); #else unsigned long long l_start, l_end; #endif timer = 0.0; for ( j = 0; j < (int)ntest; j++ ) { for ( i = 0; i < ldc*n*nmat; i++ ) dc[i]=dc1[i]; for ( i = 0 , num = 0; i < (int)nmat; i+= (int)VLEND, num++ ) { double *Ap = &da[num*lda*k*VLEND]; double *Bp = &db[num*ldb*n*VLEND]; double *Cp = &dc[num*ldc*n*VLEND]; #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #if !defined(USE_XSMM_GENERATED) && !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_KERNEL_GENERATION_DIRECTLY) && !defined(TIME_MKL) && !defined(USE_PREDEFINED_ASSEMBLY_XCT) gen_compact_dgemm_ ( &layout, &m, &n, &k, &dalpha, Ap, &lda, Bp, &ldb, &dbeta, Cp, &ldc, &VLEND ); #endif #ifdef USE_XSMM_GENERATED mykernel ( Ap, Bp, Cp ); #endif #ifdef USE_PREDEFINED_ASSEMBLY gemm_ ( Ap, Bp, Cp ); #endif #ifdef USE_KERNEL_GENERATION_DIRECTLY (*opcode_routine)( Ap, Bp, Cp ); #endif #ifdef TIME_MKL mkl_dgemm_compact ( CLAYOUT, TRANSA, TRANSB, m, n, k, dalpha, da, lda, db, ldb, dbeta, dc, ldc, CMP_FORMAT, nmat ); i+=nmatd; /* Because MKL will do everything */ #endif #ifdef MKL_TIMER dtmp = dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); dtmp = 
libxsmm_timer_duration(l_start,l_end); #endif if ( j == 0 ) firsttime=dtmp; timer += dtmp; } } if ( ntest >= 100 ) { /* Skip the first timing: super necessary if using MKL */ timer = (timer-firsttime)/((double)(ntest-1)); } else { timer /= ((double)ntest); } #ifndef NO_ACCURACY_CHECK printf("Average time to get through %u matrices: %g\n",nmat,timer); printf("Gflops: %g\n",(double)op_count/(timer*1.0e9)); printf("after routine, new C(1,1)=%g C[256]=%g\n",dc[0],dc[256]); #endif #ifdef TEST_SINGLE printf("Before r4 routine, initial C(1,1)=%g C[256]=%g\n",sc[0],sc[256]); for ( i = 0 , num = 0; i < nmats; i+= VLENS, num++ ) { float *Ap = &sa[num*lda*k*VLENS]; float *Bp = &sb[num*ldb*n*VLENS]; float *Cp = &sc[num*ldc*n*VLENS]; #ifdef USE_XSMM_GENERATED mykernel ( Ap, Bp, Cp ); #endif } printf("after r4 routine, new C(1,1)=%g C]256]=%g\n",dc[0],dc[256]); #endif #ifndef NO_ACCURACY_CHECK /* Call some reference code now on a copy of the B matrix (C) */ double timer2 = 0.0; for ( j = 0; j < (int)ntest; j++ ) { for ( i = 0; i < ldc*n*nmat; i++ ) dd[i]=dc1[i]; #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #ifndef USE_MKL_FOR_REFERENCE compact_dgemm_ ( &layout, &transa, &transb, &m, &n, &k, &dalpha, da, &lda, db, &ldb, &dbeta, dd, &ldc, &nmat, &VLEND ); #else mkl_dgemm_compact ( CLAYOUT, TRANSA, TRANSB, m, n, k, dalpha, da, lda, db, ldb, dbeta, dd, ldc, CMP_FORMAT, nmat ); #endif #ifdef MKL_TIMER timer2 += dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); timer2 += libxsmm_timer_duration(l_start,l_end); #endif } timer2 /= ((double)ntest); printf("Reference time=%g Reference Gflops=%g\n",timer2,op_count/(timer2*1.0e9)); /* Compute the residual between B and C */ dtmp = residual_d ( dc, ldc, m, n*nmat, dd, ldc, &nerrs, &ncorr ); printf("R8 mnk=%u %u %u ldabc=%u %u %u error: %g number of errors: %u corrects: %u",m,n,k,lda,ldb,ldc,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout); printf("\n"); 
#ifdef TEST_SINGLE /* Call some reference code now on a copy of the B matrix (C) */ compact_dgemm_ ( &layout, &transa, &transb, &m, &n, &k, &salpha, sa, &lda, sb, &ldb, &sbeta, sd, &ldc, &nmat, &VLENS ); /* Compute the residual between C and D */ dtmp = residual_s ( sc, ldc, m, n*nmat, sd, ldc, &nerrs, &ncorr ); printf("R4 mnk=%u %u %u ldabc=%u %u %u error: %g number of errors: %u corrects: %u",m,n,k,lda,ldb,ldc,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n); printf("\n"); #endif #else for ( j = 0, nerrs = 0; j < lda*n*nmat; j++ ) { if ( isnan(dc[j]) || isinf(dc[j]) ) { if ( ++nerrs < 10 ) { printf("WARNING: dc[%d]=%g\n",j,dc[j]); } } } printf("%g,real*8 m/n/k=%u %u %u lda-c=%u %u %u Denormals=%u Time=%g Gflops=%g",op_count/(timer*1.0e9),m,n,k,lda,ldb,ldc,nerrs,timer,op_count/(timer*1.0e9)); if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n); printf("\n"); #endif free(dd); free(sd); free(dc); free(sc); free(dc1); free(sc1); free(db); free(sb); free(da); free(sa); return 0; } libxsmm-1.17/samples/packed/gemm/gemm.vcxproj000066400000000000000000000547371415223013700213150ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 gemm {2FB902F2-8622-43F0-9AE4-A2EA228CA0F8} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/packed/gemm/sgemm.f000066400000000000000000000130431415223013700202130ustar00rootroot00000000000000 SUBROUTINE SGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) ! ! -- Reference BLAS level3 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. REAL*4 ALPHA,BETA INTEGER K,LDA,LDB,LDC,M,N CHARACTER TRANSA,TRANSB ! .. ! .. Array Arguments .. REAL*4 A(LDA,*),B(LDB,*),C(LDC,*) ! .. ! ! ===================================================================== ! ! .. External Functions .. LOGICAL LSAME EXTERNAL LSAME ! .. ! .. External Subroutines .. EXTERNAL XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX ! .. ! .. Local Scalars .. REAL*4 TEMP INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB LOGICAL NOTA,NOTB ! .. ! .. Parameters .. REAL*4 ONE,ZERO PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) ! .. ! ! Set NOTA and NOTB as true if A and B respectively are not ! transposed and set NROWA, NCOLA and NROWB as the number of rows ! and columns of A and the number of rows of B respectively. ! NOTA = LSAME(TRANSA,'N') NOTB = LSAME(TRANSB,'N') IF (NOTA) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF (NOTB) THEN NROWB = K ELSE NROWB = N END IF ! ! Test the input parameters. ! INFO = 0 IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANSA,'C')) .AND. 
& (.NOT.LSAME(TRANSA,'T'))) THEN INFO = 1 ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANSB,'C')) .AND. & (.NOT.LSAME(TRANSB,'T'))) THEN INFO = 2 ELSE IF (M.LT.0) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 8 ELSE IF (LDB.LT.MAX(1,NROWB)) THEN INFO = 10 ELSE IF (LDC.LT.MAX(1,M)) THEN INFO = 13 END IF IF (INFO.NE.0) THEN CALL XERBLA('SGEMM ',INFO) RETURN END IF ! ! Quick return if possible. ! IF ((M.EQ.0) .OR. (N.EQ.0) .OR. & (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN ! ! And if alpha.eq.zero. ! IF (ALPHA.EQ.ZERO) THEN IF (BETA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M C(I,J) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1,N DO 30 I = 1,M C(I,J) = BETA*C(I,J) 30 CONTINUE 40 CONTINUE END IF RETURN END IF ! ! Start the operations. ! IF (NOTB) THEN IF (NOTA) THEN ! ! Form C := alpha*A*B + beta*C. ! DO 90 J = 1,N IF (BETA.EQ.ZERO) THEN DO 50 I = 1,M C(I,J) = ZERO 50 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 60 I = 1,M C(I,J) = BETA*C(I,J) 60 CONTINUE END IF DO 80 L = 1,K IF (B(L,J).NE.ZERO) THEN TEMP = ALPHA*B(L,J) DO 70 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 70 CONTINUE END IF 80 CONTINUE 90 CONTINUE ELSE ! ! Form C := alpha*A**T*B + beta*C ! DO 120 J = 1,N DO 110 I = 1,M TEMP = ZERO DO 100 L = 1,K TEMP = TEMP + A(L,I)*B(L,J) 100 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 110 CONTINUE 120 CONTINUE END IF ELSE IF (NOTA) THEN ! ! Form C := alpha*A*B**T + beta*C ! DO 170 J = 1,N IF (BETA.EQ.ZERO) THEN DO 130 I = 1,M C(I,J) = ZERO 130 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 140 I = 1,M C(I,J) = BETA*C(I,J) 140 CONTINUE END IF DO 160 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*B(J,L) DO 150 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 150 CONTINUE END IF 160 CONTINUE 170 CONTINUE ELSE ! ! Form C := alpha*A**T*B**T + beta*C ! 
DO 200 J = 1,N DO 190 I = 1,M TEMP = ZERO DO 180 L = 1,K TEMP = TEMP + A(L,I)*B(J,L) 180 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 190 CONTINUE 200 CONTINUE END IF END IF ! RETURN ! ! End of SGEMM . ! END libxsmm-1.17/samples/packed/getrf/000077500000000000000000000000001415223013700171155ustar00rootroot00000000000000libxsmm-1.17/samples/packed/getrf/Makefile000066400000000000000000000074731415223013700205700ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) # Fortran code here does not allow for PEDANTIC=2 # override PEDANTIC = 1 PEDANTIC = 0 BLAS = 0 OMP = 0 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) 
F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (,$(strip $(FC))) $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ endif $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(OBJECTS) $(FTNOBJS) $(LIBDEP) $(LIB_FLD) -o $@ $(OBJECTS) $(FTNOBJS) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno 
$(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/packed/getrf/blas_aux.c000066400000000000000000000011741415223013700210620ustar00rootroot00000000000000/* Optionally link-in the BLAS routines lsame_() and xerbla_() */ #if !defined(__BLAS) || (0 != __BLAS) #include int lsame_(const char* ca, const char* cb) { if ( *ca == *cb ) return 1; if ( (*cb >= 'a') && (*cb <= 'z') ) { if ( *ca == *cb + 32 ) return 1; } else if ( (*cb >= 'A') && (*cb <= 'Z') ) { if ( *ca == *cb - 32 ) return 1; } return 0; } void xerbla_(const char* c, const int* info) { printf(" ** On entry to %s parameter number %02d had an illegal value\n", c, *info); } int ilaenv_ ( int *ispec, char *name, char *opts, int *n1, int *n2, int *n3, int *n4 ) { return ( 1 ); } #endif libxsmm-1.17/samples/packed/getrf/dgetrf.f000066400000000000000000001017611415223013700205450ustar00rootroot00000000000000 SUBROUTINE DGETRFNP( M, N, A, LDA, INFO ) ! ! -- LAPACK computational routine (version 3.4.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. INTEGER INFO, LDA, M, N ! .. ! .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) ! .. ! ! ===================================================================== ! ! .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) ! .. ! .. Local Scalars .. DOUBLE PRECISION SFMIN INTEGER I, J, JP ! .. ! .. External Functions .. DOUBLE PRECISION DLAMCH INTEGER IDAMAX EXTERNAL DLAMCH, IDAMAX ! .. ! .. External Subroutines .. EXTERNAL DGER, DSCAL, DSWAP, XERBLA ! .. ! .. Intrinsic Functions .. 
INTRINSIC MAX, MIN ! .. ! .. Executable Statements .. ! ! Test the input parameters. ! INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGETRFNP', -INFO ) RETURN END IF ! ! Quick return if possible ! IF( M.EQ.0 .OR. N.EQ.0 ) RETURN ! ! Compute machine safe minimum ! SFMIN = DLAMCH('S') ! DO 10 J = 1, MIN( M, N ) JP = J IF( A( JP, J ).NE.ZERO ) THEN ! ! Compute elements J+1:M of J-th column. ! IF( J.LT.M ) THEN IF( ABS(A( J, J )) .GE. SFMIN ) THEN CALL DSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) ELSE DO 20 I = 1, M-J A( J+I, J ) = A( J+I, J ) / A( J, J ) 20 CONTINUE END IF END IF ! ELSE IF( INFO.EQ.0 ) THEN ! INFO = J END IF ! IF( J.LT.MIN( M, N ) ) THEN ! ! Update trailing submatrix. ! CALL DGER( M-J, N-J, -ONE, A( J+1, J ), 1,& A( J, J+1 ),LDA,A( J+1, J+1 ), LDA ) END IF 10 CONTINUE RETURN ! ! End of DGETRFNP ! END SUBROUTINE DGETRF( M, N, A, LDA, IPIV, INFO ) ! ! -- LAPACK computational routine (version 3.4.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. INTEGER INFO, LDA, M, N ! .. ! .. Array Arguments .. INTEGER IPIV( * ) DOUBLE PRECISION A( LDA, * ) ! .. ! ! ===================================================================== ! ! .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) ! .. ! .. Local Scalars .. INTEGER I, IINFO, J, JB, NB ! .. ! .. External Subroutines .. EXTERNAL DGEMM, DGETF2, DLASWP, DTRSM, XERBLA ! .. ! .. External Functions .. INTEGER ILAENV EXTERNAL ILAENV ! .. ! .. Intrinsic Functions .. INTRINSIC MAX, MIN ! .. ! .. Executable Statements .. ! ! Test the input parameters. ! INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGETRF', -INFO ) RETURN END IF ! ! 
Quick return if possible ! IF( M.EQ.0 .OR. N.EQ.0 ) RETURN ! ! Determine the block size for this environment. ! NB = ILAENV( 1, 'DGETRF', ' ', M, N, -1, -1 ) IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN ! ! Use unblocked code. ! CALL DGETF2( M, N, A, LDA, IPIV, INFO ) ELSE ! ! Use blocked code. ! DO 20 J = 1, MIN( M, N ), NB JB = MIN( MIN( M, N )-J+1, NB ) ! ! Factor diagonal and subdiagonal blocks and test for exact ! singularity. ! CALL DGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) ! ! Adjust INFO and the pivot indices. ! IF( INFO.EQ.0 .AND. IINFO.GT.0 ) INFO = IINFO + J - 1 DO 10 I = J, MIN( M, J+JB-1 ) IPIV( I ) = J - 1 + IPIV( I ) 10 CONTINUE ! ! Apply interchanges to columns 1:J-1. ! CALL DLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) ! IF( J+JB.LE.N ) THEN ! ! Apply interchanges to columns J+JB:N. ! CALL DLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, & IPIV, 1 ) ! ! Compute block row of U. ! CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit',JB,& N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ),& LDA ) IF( J+JB.LE.M ) THEN ! ! Update trailing submatrix. ! CALL DGEMM( 'No transpose', 'No transpose', M-J-JB+1,& N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA,& A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ),& LDA ) END IF END IF 20 CONTINUE END IF RETURN ! ! End of DGETRF ! END SUBROUTINE DGETF2( M, N, A, LDA, IPIV, INFO ) ! ! -- LAPACK computational routine (version 3.4.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. INTEGER INFO, LDA, M, N ! .. ! .. Array Arguments .. INTEGER IPIV( * ) DOUBLE PRECISION A( LDA, * ) ! .. ! ! ===================================================================== ! ! .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) ! .. ! .. Local Scalars .. DOUBLE PRECISION SFMIN INTEGER I, J, JP ! .. ! .. External Functions .. 
DOUBLE PRECISION DLAMCH INTEGER IDAMAX EXTERNAL DLAMCH, IDAMAX ! .. ! .. External Subroutines .. EXTERNAL DGER, DSCAL, DSWAP, XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX, MIN ! .. ! .. Executable Statements .. ! ! Test the input parameters. ! INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGETF2', -INFO ) RETURN END IF ! ! Quick return if possible ! IF( M.EQ.0 .OR. N.EQ.0 ) RETURN ! ! Compute machine safe minimum ! SFMIN = DLAMCH('S') ! DO 10 J = 1, MIN( M, N ) ! ! Find pivot and test for singularity. ! JP = J - 1 + IDAMAX( M-J+1, A( J, J ), 1 ) IPIV( J ) = JP IF( A( JP, J ).NE.ZERO ) THEN ! ! Apply the interchange to columns 1:N. ! IF(JP.NE.J) CALL DSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) ! ! Compute elements J+1:M of J-th column. ! IF( J.LT.M ) THEN IF( ABS(A( J, J )) .GE. SFMIN ) THEN CALL DSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) ELSE DO 20 I = 1, M-J A( J+I, J ) = A( J+I, J ) / A( J, J ) 20 CONTINUE END IF END IF ! ELSE IF( INFO.EQ.0 ) THEN ! INFO = J END IF ! IF( J.LT.MIN( M, N ) ) THEN ! ! Update trailing submatrix. ! CALL DGER( M-J, N-J, -ONE, A( J+1, J ), 1,A( J, J+1 ),LDA,& A( J+1, J+1 ), LDA ) END IF 10 CONTINUE RETURN ! ! End of DGETF2 ! END SUBROUTINE DLASWP( N, A, LDA, K1, K2, IPIV, INCX ) ! ! -- LAPACK auxiliary routine (version 3.4.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. INTEGER INCX, K1, K2, LDA, N ! .. ! .. Array Arguments .. INTEGER IPIV( * ) DOUBLE PRECISION A( LDA, * ) ! .. ! ! ===================================================================== ! ! .. Local Scalars .. INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 DOUBLE PRECISION TEMP ! .. ! .. Executable Statements .. ! ! Interchange row I with row IPIV(I) for each of rows K1 through K2. ! 
IF( INCX.GT.0 ) THEN IX0 = K1 I1 = K1 I2 = K2 INC = 1 ELSE IF( INCX.LT.0 ) THEN IX0 = 1 + ( 1-K2 )*INCX I1 = K2 I2 = K1 INC = -1 ELSE RETURN END IF ! N32 = ( N / 32 )*32 IF( N32.NE.0 ) THEN DO 30 J = 1, N32, 32 IX = IX0 DO 20 I = I1, I2, INC IP = IPIV( IX ) IF( IP.NE.I ) THEN DO 10 K = J, J + 31 TEMP = A( I, K ) A( I, K ) = A( IP, K ) A( IP, K ) = TEMP 10 CONTINUE END IF IX = IX + INCX 20 CONTINUE 30 CONTINUE END IF IF( N32.NE.N ) THEN N32 = N32 + 1 IX = IX0 DO 50 I = I1, I2, INC IP = IPIV( IX ) IF( IP.NE.I ) THEN DO 40 K = N32, N TEMP = A( I, K ) A( I, K ) = A( IP, K ) A( IP, K ) = TEMP 40 CONTINUE END IF IX = IX + INCX 50 CONTINUE END IF ! RETURN ! ! End of DLASWP ! END SUBROUTINE DTRSM(SIDE,UPLO,TRANSA,DIAG,M,N,ALPHA,A,LDA,B,LDB) ! ! -- Reference BLAS level3 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. DOUBLE PRECISION ALPHA INTEGER LDA,LDB,M,N CHARACTER DIAG,SIDE,TRANSA,UPLO ! .. ! .. Array Arguments .. DOUBLE PRECISION A(LDA,*),B(LDB,*) ! .. ! ! ===================================================================== ! ! .. External Functions .. LOGICAL LSAME EXTERNAL LSAME ! .. ! .. External Subroutines .. EXTERNAL XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX ! .. ! .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I,INFO,J,K,NROWA LOGICAL LSIDE,NOUNIT,UPPER ! .. ! .. Parameters .. DOUBLE PRECISION ONE,ZERO PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) ! .. ! ! Test the input parameters. ! LSIDE = LSAME(SIDE,'L') IF (LSIDE) THEN NROWA = M ELSE NROWA = N END IF NOUNIT = LSAME(DIAG,'N') UPPER = LSAME(UPLO,'U') ! INFO = 0 IF ((.NOT.LSIDE) .AND. (.NOT.LSAME(SIDE,'R'))) THEN INFO = 1 ELSE IF ((.NOT.UPPER) .AND. (.NOT.LSAME(UPLO,'L'))) THEN INFO = 2 ELSE IF ((.NOT.LSAME(TRANSA,'N')) .AND. & (.NOT.LSAME(TRANSA,'T')) .AND. 
& (.NOT.LSAME(TRANSA,'C'))) THEN INFO = 3 ELSE IF ((.NOT.LSAME(DIAG,'U')) .AND. (.NOT.LSAME(DIAG,'N'))) THEN INFO = 4 ELSE IF (M.LT.0) THEN INFO = 5 ELSE IF (N.LT.0) THEN INFO = 6 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 9 ELSE IF (LDB.LT.MAX(1,M)) THEN INFO = 11 END IF IF (INFO.NE.0) THEN CALL XERBLA('DTRSM ',INFO) RETURN END IF ! ! Quick return if possible. ! IF (M.EQ.0 .OR. N.EQ.0) RETURN ! ! And when alpha.eq.zero. ! IF (ALPHA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M B(I,J) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF ! ! Start the operations. ! IF (LSIDE) THEN IF (LSAME(TRANSA,'N')) THEN ! ! Form B := alpha*inv( A )*B. ! IF (UPPER) THEN DO 60 J = 1,N IF (ALPHA.NE.ONE) THEN DO 30 I = 1,M B(I,J) = ALPHA*B(I,J) 30 CONTINUE END IF DO 50 K = M,1,-1 IF (B(K,J).NE.ZERO) THEN IF (NOUNIT) B(K,J) = B(K,J)/A(K,K) DO 40 I = 1,K - 1 B(I,J) = B(I,J) - B(K,J)*A(I,K) 40 CONTINUE END IF 50 CONTINUE 60 CONTINUE ELSE DO 100 J = 1,N IF (ALPHA.NE.ONE) THEN DO 70 I = 1,M B(I,J) = ALPHA*B(I,J) 70 CONTINUE END IF DO 90 K = 1,M IF (B(K,J).NE.ZERO) THEN IF (NOUNIT) B(K,J) = B(K,J)/A(K,K) DO 80 I = K + 1,M B(I,J) = B(I,J) - B(K,J)*A(I,K) 80 CONTINUE END IF 90 CONTINUE 100 CONTINUE END IF ELSE ! ! Form B := alpha*inv( A**T )*B. ! IF (UPPER) THEN DO 130 J = 1,N DO 120 I = 1,M TEMP = ALPHA*B(I,J) DO 110 K = 1,I - 1 TEMP = TEMP - A(K,I)*B(K,J) 110 CONTINUE IF (NOUNIT) TEMP = TEMP/A(I,I) B(I,J) = TEMP 120 CONTINUE 130 CONTINUE ELSE DO 160 J = 1,N DO 150 I = M,1,-1 TEMP = ALPHA*B(I,J) DO 140 K = I + 1,M TEMP = TEMP - A(K,I)*B(K,J) 140 CONTINUE IF (NOUNIT) TEMP = TEMP/A(I,I) B(I,J) = TEMP 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF (LSAME(TRANSA,'N')) THEN ! ! Form B := alpha*B*inv( A ). ! 
IF (UPPER) THEN DO 210 J = 1,N IF (ALPHA.NE.ONE) THEN DO 170 I = 1,M B(I,J) = ALPHA*B(I,J) 170 CONTINUE END IF DO 190 K = 1,J - 1 IF (A(K,J).NE.ZERO) THEN DO 180 I = 1,M B(I,J) = B(I,J) - A(K,J)*B(I,K) 180 CONTINUE END IF 190 CONTINUE IF (NOUNIT) THEN TEMP = ONE/A(J,J) DO 200 I = 1,M B(I,J) = TEMP*B(I,J) 200 CONTINUE END IF 210 CONTINUE ELSE DO 260 J = N,1,-1 IF (ALPHA.NE.ONE) THEN DO 220 I = 1,M B(I,J) = ALPHA*B(I,J) 220 CONTINUE END IF DO 240 K = J + 1,N IF (A(K,J).NE.ZERO) THEN DO 230 I = 1,M B(I,J) = B(I,J) - A(K,J)*B(I,K) 230 CONTINUE END IF 240 CONTINUE IF (NOUNIT) THEN TEMP = ONE/A(J,J) DO 250 I = 1,M B(I,J) = TEMP*B(I,J) 250 CONTINUE END IF 260 CONTINUE END IF ELSE ! ! Form B := alpha*B*inv( A**T ). ! IF (UPPER) THEN DO 310 K = N,1,-1 IF (NOUNIT) THEN TEMP = ONE/A(K,K) DO 270 I = 1,M B(I,K) = TEMP*B(I,K) 270 CONTINUE END IF DO 290 J = 1,K - 1 IF (A(J,K).NE.ZERO) THEN TEMP = A(J,K) DO 280 I = 1,M B(I,J) = B(I,J) - TEMP*B(I,K) 280 CONTINUE END IF 290 CONTINUE IF (ALPHA.NE.ONE) THEN DO 300 I = 1,M B(I,K) = ALPHA*B(I,K) 300 CONTINUE END IF 310 CONTINUE ELSE DO 360 K = 1,N IF (NOUNIT) THEN TEMP = ONE/A(K,K) DO 320 I = 1,M B(I,K) = TEMP*B(I,K) 320 CONTINUE END IF DO 340 J = K + 1,N IF (A(J,K).NE.ZERO) THEN TEMP = A(J,K) DO 330 I = 1,M B(I,J) = B(I,J) - TEMP*B(I,K) 330 CONTINUE END IF 340 CONTINUE IF (ALPHA.NE.ONE) THEN DO 350 I = 1,M B(I,K) = ALPHA*B(I,K) 350 CONTINUE END IF 360 CONTINUE END IF END IF END IF ! RETURN ! ! End of DTRSM . ! END SUBROUTINE DGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) ! ! -- Reference BLAS level3 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. DOUBLE PRECISION ALPHA,BETA INTEGER K,LDA,LDB,LDC,M,N CHARACTER TRANSA,TRANSB ! .. ! .. Array Arguments .. DOUBLE PRECISION A(LDA,*),B(LDB,*),C(LDC,*) ! .. ! ! 
===================================================================== ! ! .. External Functions .. LOGICAL LSAME EXTERNAL LSAME ! .. ! .. External Subroutines .. EXTERNAL XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX ! .. ! .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB LOGICAL NOTA,NOTB ! .. ! .. Parameters .. DOUBLE PRECISION ONE,ZERO PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) ! .. ! ! Set NOTA and NOTB as true if A and B respectively are not ! transposed and set NROWA, NCOLA and NROWB as the number of rows ! and columns of A and the number of rows of B respectively. ! NOTA = LSAME(TRANSA,'N') NOTB = LSAME(TRANSB,'N') IF (NOTA) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF (NOTB) THEN NROWB = K ELSE NROWB = N END IF ! ! Test the input parameters. ! INFO = 0 IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANSA,'C')) .AND. & (.NOT.LSAME(TRANSA,'T'))) THEN INFO = 1 ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANSB,'C')) .AND. & (.NOT.LSAME(TRANSB,'T'))) THEN INFO = 2 ELSE IF (M.LT.0) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 8 ELSE IF (LDB.LT.MAX(1,NROWB)) THEN INFO = 10 ELSE IF (LDC.LT.MAX(1,M)) THEN INFO = 13 END IF IF (INFO.NE.0) THEN CALL XERBLA('DGEMM ',INFO) RETURN END IF ! ! Quick return if possible. ! IF ((M.EQ.0) .OR. (N.EQ.0) .OR. & (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN ! ! And if alpha.eq.zero. ! IF (ALPHA.EQ.ZERO) THEN IF (BETA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M C(I,J) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1,N DO 30 I = 1,M C(I,J) = BETA*C(I,J) 30 CONTINUE 40 CONTINUE END IF RETURN END IF ! ! Start the operations. ! IF (NOTB) THEN IF (NOTA) THEN ! ! Form C := alpha*A*B + beta*C. ! 
DO 90 J = 1,N IF (BETA.EQ.ZERO) THEN DO 50 I = 1,M C(I,J) = ZERO 50 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 60 I = 1,M C(I,J) = BETA*C(I,J) 60 CONTINUE END IF DO 80 L = 1,K IF (B(L,J).NE.ZERO) THEN TEMP = ALPHA*B(L,J) DO 70 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 70 CONTINUE END IF 80 CONTINUE 90 CONTINUE ELSE ! ! Form C := alpha*A**T*B + beta*C ! DO 120 J = 1,N DO 110 I = 1,M TEMP = ZERO DO 100 L = 1,K TEMP = TEMP + A(L,I)*B(L,J) 100 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 110 CONTINUE 120 CONTINUE END IF ELSE IF (NOTA) THEN ! ! Form C := alpha*A*B**T + beta*C ! DO 170 J = 1,N IF (BETA.EQ.ZERO) THEN DO 130 I = 1,M C(I,J) = ZERO 130 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 140 I = 1,M C(I,J) = BETA*C(I,J) 140 CONTINUE END IF DO 160 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*B(J,L) DO 150 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 150 CONTINUE END IF 160 CONTINUE 170 CONTINUE ELSE ! ! Form C := alpha*A**T*B**T + beta*C ! DO 200 J = 1,N DO 190 I = 1,M TEMP = ZERO DO 180 L = 1,K TEMP = TEMP + A(L,I)*B(J,L) 180 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 190 CONTINUE 200 CONTINUE END IF END IF ! RETURN ! ! End of DGEMM . ! END SUBROUTINE DSWAP(N,DX,INCX,DY,INCY) ! ! -- Reference BLAS level1 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. INTEGER INCX,INCY,N ! .. ! .. Array Arguments .. DOUBLE PRECISION DX(*),DY(*) ! .. ! ! ===================================================================== ! ! .. Local Scalars .. DOUBLE PRECISION DTEMP INTEGER I,IX,IY,M,MP1 ! .. ! .. Intrinsic Functions .. INTRINSIC MOD ! .. IF (N.LE.0) RETURN IF (INCX.EQ.1 .AND. INCY.EQ.1) THEN ! ! code for both increments equal to 1 ! ! ! clean-up loop ! 
M = MOD(N,3) IF (M.NE.0) THEN DO I = 1,M DTEMP = DX(I) DX(I) = DY(I) DY(I) = DTEMP END DO IF (N.LT.3) RETURN END IF MP1 = M + 1 DO I = MP1,N,3 DTEMP = DX(I) DX(I) = DY(I) DY(I) = DTEMP DTEMP = DX(I+1) DX(I+1) = DY(I+1) DY(I+1) = DTEMP DTEMP = DX(I+2) DX(I+2) = DY(I+2) DY(I+2) = DTEMP END DO ELSE ! ! code for unequal increments or equal increments not equal ! to 1 ! IX = 1 IY = 1 IF (INCX.LT.0) IX = (-N+1)*INCX + 1 IF (INCY.LT.0) IY = (-N+1)*INCY + 1 DO I = 1,N DTEMP = DX(IX) DX(IX) = DY(IY) DY(IY) = DTEMP IX = IX + INCX IY = IY + INCY END DO END IF RETURN END SUBROUTINE DSCAL(N,DA,DX,INCX) ! ! -- Reference BLAS level1 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. DOUBLE PRECISION DA INTEGER INCX,N ! .. ! .. Array Arguments .. DOUBLE PRECISION DX(*) ! .. ! ! ===================================================================== ! ! .. Local Scalars .. INTEGER I,M,MP1,NINCX ! .. ! .. Intrinsic Functions .. INTRINSIC MOD ! .. IF (N.LE.0 .OR. INCX.LE.0) RETURN IF (INCX.EQ.1) THEN ! ! code for increment equal to 1 ! ! ! clean-up loop ! M = MOD(N,5) IF (M.NE.0) THEN DO I = 1,M DX(I) = DA*DX(I) END DO IF (N.LT.5) RETURN END IF MP1 = M + 1 DO I = MP1,N,5 DX(I) = DA*DX(I) DX(I+1) = DA*DX(I+1) DX(I+2) = DA*DX(I+2) DX(I+3) = DA*DX(I+3) DX(I+4) = DA*DX(I+4) END DO ELSE ! ! code for increment not equal to 1 ! NINCX = N*INCX DO I = 1,NINCX,INCX DX(I) = DA*DX(I) END DO END IF RETURN END SUBROUTINE DGER(M,N,ALPHA,X,INCX,Y,INCY,A,LDA) ! ! -- Reference BLAS level2 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. DOUBLE PRECISION ALPHA INTEGER INCX,INCY,LDA,M,N ! .. ! .. Array Arguments .. DOUBLE PRECISION A(LDA,*),X(*),Y(*) ! .. ! ! 
===================================================================== ! ! .. Parameters .. DOUBLE PRECISION ZERO PARAMETER (ZERO=0.0D+0) ! .. ! .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I,INFO,IX,J,JY,KX ! .. ! .. External Subroutines .. EXTERNAL XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX ! .. ! ! Test the input parameters. ! INFO = 0 IF (M.LT.0) THEN INFO = 1 ELSE IF (N.LT.0) THEN INFO = 2 ELSE IF (INCX.EQ.0) THEN INFO = 5 ELSE IF (INCY.EQ.0) THEN INFO = 7 ELSE IF (LDA.LT.MAX(1,M)) THEN INFO = 9 END IF IF (INFO.NE.0) THEN CALL XERBLA('DGER ',INFO) RETURN END IF ! ! Quick return if possible. ! IF ((M.EQ.0) .OR. (N.EQ.0) .OR. (ALPHA.EQ.ZERO)) RETURN ! ! Start the operations. In this version the elements of A are ! accessed sequentially with one pass through A. ! IF (INCY.GT.0) THEN JY = 1 ELSE JY = 1 - (N-1)*INCY END IF IF (INCX.EQ.1) THEN DO 20 J = 1,N IF (Y(JY).NE.ZERO) THEN TEMP = ALPHA*Y(JY) DO 10 I = 1,M A(I,J) = A(I,J) + X(I)*TEMP 10 CONTINUE END IF JY = JY + INCY 20 CONTINUE ELSE IF (INCX.GT.0) THEN KX = 1 ELSE KX = 1 - (M-1)*INCX END IF DO 40 J = 1,N IF (Y(JY).NE.ZERO) THEN TEMP = ALPHA*Y(JY) IX = KX DO 30 I = 1,M A(I,J) = A(I,J) + X(IX)*TEMP IX = IX + INCX 30 CONTINUE END IF JY = JY + INCY 40 CONTINUE END IF ! RETURN ! ! End of DGER . ! END INTEGER FUNCTION IDAMAX(N,DX,INCX) ! ! -- Reference BLAS level1 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. INTEGER INCX,N ! .. ! .. Array Arguments .. DOUBLE PRECISION DX(*) ! .. ! ! ===================================================================== ! ! .. Local Scalars .. DOUBLE PRECISION DMAX INTEGER I,IX ! .. ! .. Intrinsic Functions .. INTRINSIC DABS ! .. IDAMAX = 0 IF (N.LT.1 .OR. INCX.LE.0) RETURN IDAMAX = 1 IF (N.EQ.1) RETURN IF (INCX.EQ.1) THEN ! ! code for increment equal to 1 ! 
DMAX = DABS(DX(1)) DO I = 2,N IF (DABS(DX(I)).GT.DMAX) THEN IDAMAX = I DMAX = DABS(DX(I)) END IF END DO ELSE ! ! code for increment not equal to 1 ! IX = 1 DMAX = DABS(DX(1)) IX = IX + INCX DO I = 2,N IF (DABS(DX(IX)).GT.DMAX) THEN IDAMAX = I DMAX = DABS(DX(IX)) END IF IX = IX + INCX END DO END IF RETURN END DOUBLE PRECISION FUNCTION DLAMCH( CMACH ) ! ! -- LAPACK auxiliary routine (version 3.4.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. CHARACTER CMACH ! .. ! ! .. Scalar Arguments .. DOUBLE PRECISION A, B ! .. ! ! ===================================================================== ! ! .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) ! .. ! .. Local Scalars .. DOUBLE PRECISION RND, EPS, SFMIN, SMALL, RMACH ! .. ! .. External Functions .. LOGICAL LSAME EXTERNAL LSAME ! .. ! .. Intrinsic Functions .. INTRINSIC DIGITS, EPSILON, HUGE, MAXEXPONENT INTRINSIC MINEXPONENT, RADIX, TINY ! .. ! .. Executable Statements .. ! ! ! Assume rounding, not chopping. Always. ! RND = ONE ! IF( ONE.EQ.RND ) THEN EPS = EPSILON(ZERO) * 0.5 ELSE EPS = EPSILON(ZERO) END IF ! IF( LSAME( CMACH, 'E' ) ) THEN RMACH = EPS ELSE IF( LSAME( CMACH, 'S' ) ) THEN SFMIN = TINY(ZERO) SMALL = ONE / HUGE(ZERO) IF( SMALL.GE.SFMIN ) THEN ! ! Use SMALL plus a bit, to avoid the possibility of rounding ! causing overflow when computing 1/sfmin. ! 
SFMIN = SMALL*( ONE+EPS ) END IF RMACH = SFMIN ELSE IF( LSAME( CMACH, 'B' ) ) THEN RMACH = RADIX(ZERO) ELSE IF( LSAME( CMACH, 'P' ) ) THEN RMACH = EPS * RADIX(ZERO) ELSE IF( LSAME( CMACH, 'N' ) ) THEN RMACH = DIGITS(ZERO) ELSE IF( LSAME( CMACH, 'R' ) ) THEN RMACH = RND ELSE IF( LSAME( CMACH, 'M' ) ) THEN RMACH = MINEXPONENT(ZERO) ELSE IF( LSAME( CMACH, 'U' ) ) THEN RMACH = tiny(zero) ELSE IF( LSAME( CMACH, 'L' ) ) THEN RMACH = MAXEXPONENT(ZERO) ELSE IF( LSAME( CMACH, 'O' ) ) THEN RMACH = HUGE(ZERO) ELSE RMACH = ZERO END IF ! DLAMCH = RMACH RETURN ! ! End of DLAMCH ! END DOUBLE PRECISION FUNCTION DLAMC3( A, B ) ! ! -- LAPACK auxiliary routine (version 3.4.0) -- ! Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. ! November 2010 ! ! .. Scalar Arguments .. DOUBLE PRECISION A, B ! .. ! ===================================================================== ! ! .. Executable Statements .. ! DLAMC3 = A + B ! RETURN ! ! End of DLAMC3 ! END libxsmm-1.17/samples/packed/getrf/getrf.c000066400000000000000000000661631415223013700204040ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Greg Henry, Hans Pabst, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #if 0 #define USE_KERNEL_GENERATION_DIRECTLY #endif #if 0 #define USE_PREDEFINED_ASSEMBLY #define USE_XSMM_GENERATED #define TIME_MKL #endif #if 0 #define TEST_SINGLE #endif #if !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_XSMM_GENERATED) && !defined(TIME_MKL) && \ (!defined(__linux__) || !defined(USE_KERNEL_GENERATION_DIRECTLY)) # define USE_XSMM_GENERATED # include #else # include # include # include # include # include #endif #include #include #include #include #define BUFSIZE 32*32 #define BUFSIZE2 64000 #if 0 #define TRIANGLE_IS_IDENTITY #endif LIBXSMM_INLINE void dcopy_to_temp ( int layout, double *A, int lda, int m, int n, double *Atemp, unsigned int VLEN ) { int i, j; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); exit(-1); } if ( layout == 102 ) { /* printf("Column major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { Atemp[i+j*m] = A[i*VLEN+j*lda*VLEN]; } } #if EVENTUALLY_USE_THIS_LOOP_IT_SHOULD_BE_FASTER for ( j = 0; j < n; j++ ) { for ( i = 0, ia = 0; i < m; i++, ia+=VLEN ) { Atemp[i+j*m] = A[ ia+j*lda*VLEN ]; } } #endif } else { /* printf("Row major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ Atemp[i+j*m] = A[j*VLEN+i*lda*VLEN]; } } } } LIBXSMM_INLINE void scopy_to_temp ( int layout, float *A, int lda, int m, int n, float *Atemp, unsigned int VLEN ) { int i, j; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); exit(-1); } if ( layout == 102 ) { /* printf("Column major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { Atemp[i+j*m] = A[i*VLEN+j*lda*VLEN]; } } } else { /* printf("Row major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ Atemp[i+j*m] = A[j*VLEN+i*lda*VLEN]; } } } } LIBXSMM_INLINE void dcopy_from_temp ( int layout, double *A, int lda, int m, int n, double *Atemp, 
unsigned int VLEN ) { int i, j, ia; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); } if ( layout == 102 ) { for ( j = 0; j < n; j++ ) { for ( i = 0, ia = 0; i < m; i++, ia+=VLEN ) { A[ia+j*lda*VLEN] = Atemp[i+j*m]; } } } else { for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ A[j*VLEN+i*lda*VLEN] = Atemp[i+j*m]; } } } } LIBXSMM_INLINE void scopy_from_temp ( int layout, float *A, int lda, int m, int n, float *Atemp, unsigned int VLEN ) { int i, j, ia; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); } if ( layout == 102 ) { for ( j = 0; j < n; j++ ) { for ( i = 0, ia = 0; i < m; i++, ia+=VLEN ) { A[ia+j*lda*VLEN] = Atemp[i+j*m]; } } } else { for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ A[j*VLEN+i*lda*VLEN] = Atemp[i+j*m]; } } } } void show_real_matrix ( unsigned int m, unsigned int n, double *A, unsigned int lda ) { unsigned int i, j; for ( i = 1; i <= m; i++ ) { for ( j = 1; j <= n; j++ ) { printf("%g ",A[(j-1)*lda+(i-1)]); } printf("\n"); } } #if !defined(USE_MKL_FOR_REFERENCE) && !defined(LIBXSMM_NOFORTRAN) && (!defined(__BLAS) || (0 != __BLAS)) extern void dgetrf_(); extern void dgetrfnp_(); /* Reference code for compact dgetrf. Note that this just copies data into a buffer from the compact storage and calls the regular dgetrf code. 
This is very naive reference code just used for testing purposes */ LIBXSMM_INLINE void compact_dgetrf_ ( unsigned int *layout, unsigned int *m, unsigned int *n, double *A, unsigned int *lda, unsigned int *nmat, unsigned int *VLEN ) { unsigned int i, j, num, info, col; double *Ap, Atemp[BUFSIZE]; static int ntimes = 0; if ( ++ntimes < 3 ) printf("Inside reference compact_dgetrf_()\n"); if ( ++ntimes < 3 ) printf("layout=%d m=%d n=%d lda=%d nmat=%d VLEN=%d\n",*layout,*m,*n,*lda,*nmat,*VLEN); if ( *layout == 102 ) col = *n; else col = *m; for ( i = 0, num = 0; i < (*nmat); i+= *VLEN, num++ ) { for ( j = 0; j < *VLEN; j++ ) { /* Unpack the data, call a reference DGETRF, repack the data */ Ap = &A[j+num*(*lda)*col*(*VLEN)]; if (++ntimes < 6 ) printf("Doing a dgetrf at place i=%d j=%d num=%d Ap[%d]=%g\n",i,j,num,j+num*(*lda)*col*(*VLEN),Ap[0]); dcopy_to_temp ( *layout, Ap, *lda, *m, *n, Atemp, *VLEN ); #if 0 if ( *m <= 4 && *n <= 4 ) { printf("Matrix with i=%d j=%d num=%d loc=%ld lda=%d\n",i,j,num,j+num*(*lda)*col*(*VLEN),*lda); show_real_matrix ( *m, *n, Atemp, *m ); } #endif info = 0; dgetrfnp_ ( m, n, Atemp, m, &info ); #if 0 if ( *m <= 4 && *n <= 4 ) { printf("Result with i=%d j=%d num=%d loc=%ld\n",i,j,num,j+num*(*lda)*col*(*VLEN)); show_real_matrix ( *m, *n, Atemp, *m ); } #endif if ( info != 0 ) printf("*** BAD news reference code got info=%d in case i=%d num=%d j=%d\n",info,i,num,j); dcopy_from_temp ( *layout, Ap, *lda, *m, *n, Atemp, *VLEN ); #if 0 printf("i=%d num=%d j=%d Ap[20]=%g\n",i,num,j,Ap[20]); #endif } } } extern void sgetrf_(); extern void sgetrfnp_(); /* Reference code for compact sgetrf. Note that this just copies data into a buffer from the compact storage and calls the regular sgetrf code. This is very naive reference code just used for testing purposes */ /* Note: if layout==101 (row major), then this code is known to only work when * nmat == VLEN. 
To check for accuracy otherwise, transpose everything */ LIBXSMM_INLINE void compact_sgetrf_ ( unsigned int *layout, unsigned int *m, unsigned int *n, float *A, unsigned int *lda, unsigned int *nmat, unsigned int *VLEN ) { unsigned int i, j, num, info; float *Ap, Atemp[BUFSIZE]; static int ntimes = 0; if ( ++ntimes < 3 ) printf("Inside reference compact_sgetrf_()\n"); if ( ++ntimes < 3 ) printf("layout=%d VLEN=%d nmat=%d\n",*layout, *VLEN, *nmat ); for ( i = 0, num = 0; i < (*nmat); i+= *VLEN, num++ ) { for ( j = 0; j < *VLEN; j++ ) { /* Unpack the data, call a reference SGETRF, repack the data */ Ap = &A[j+num*(*lda)*(*n)*(*VLEN)]; if (++ntimes < 3 ) printf("Doing a sgetrf at place i=%d j=%d num=%d Ap[%d]=%g\n",i,j,num,j+num*(*lda)*(*n)*(*VLEN),Ap[0]); scopy_to_temp ( *layout, Ap, *lda, *m, *n, Atemp, *VLEN ); sgetrfnp_ ( m, n, Atemp, m, &info ); if ( info != 0 ) printf("Bad news! Serial reference got info=%d\n",info); scopy_from_temp ( *layout, Ap, *lda, *m, *n, Atemp, *VLEN ); } } } #endif #define DUPLICATE_ELEMENTS_ACROSS LIBXSMM_INLINE void dfill_matrix ( int layout, double *matrix, unsigned int nmat, unsigned int ld, unsigned int m, unsigned int n, unsigned int VLEN ) { unsigned int i, j, k, k1, row, col; size_t address; double dtmp = 0; if ( layout == 102 ) { row = m; col = n; } else { row = n; col = m; } if ( ld < row ) { fprintf(stderr,"Error is dfill_matrix: ld=%u row=%u (m=%u n=%u) mismatched!\n",ld,row, m, n); exit(-1); } for ( k1 = 1; k1 <= nmat/VLEN; k1++ ) { for ( j = 1; j <= col; j++ ) { for ( i = 1; i <= ld; i++ ) { for ( k = 1; k <= VLEN; k++ ) { address = (k1-1)*col*ld*VLEN + (j-1)*ld*VLEN + (i-1)*VLEN + (k-1); #ifdef DUPLICATE_ELEMENTS_ACROSS if ( k == 1 ) #endif if ( i <= row ) dtmp = 1.0 - 2.0*libxsmm_rng_f64(); else dtmp = -99.9; matrix [ address ] = dtmp; } } } } } LIBXSMM_INLINE void dfill_identity ( double *matrix, unsigned int ld, unsigned int m, unsigned int n, int VLEN, int number_of_cases ) { unsigned int h, i, j, k, ia; double dtmp = 
0; if ( ld < m ) { fprintf(stderr,"Error in dfill_identity: ld=%u m=%u mismatched!\n",ld,m); exit(-1); } for ( h = 0; h < (unsigned int)number_of_cases; h++ ) { ia = h*ld*n*VLEN; for ( j = 1; j <= n; j++ ) { for ( i = 1; i <= ld; i++ ) { if ( i == j ) dtmp = 1.0; else dtmp = 0.0; for ( k = 0; k < (unsigned int)VLEN; k++ ) matrix[ia++] = dtmp; } } } } LIBXSMM_INLINE void sfill_matrix ( int layout, float *matrix, unsigned int nmat, unsigned int ld, unsigned int m, unsigned int n, unsigned int VLEN ) { unsigned int i, j, k, k1, row, col; size_t address; double dtmp = 0; if ( layout == 102 ) { row = m; col = n; } else { row = n; col = m; } if ( ld < row ) { fprintf(stderr,"Error is sfill_matrix: ld=%u row=%u (m=%u n=%u) mismatched!\n",ld,row, m, n); exit(-1); } for ( k1 = 1; k1 <= nmat/VLEN; k1++ ) { for ( j = 1; j <= col; j++ ) { for ( i = 1; i <= ld; i++ ) { for ( k = 1; k <= VLEN; k++ ) { address = (k1-1)*col*ld*VLEN + (j-1)*ld*VLEN + (i-1)*VLEN + (k-1); #ifdef DUPLICATE_ELEMENTS_ACROSS if ( k == 1 ) #endif if ( i <= row ) dtmp = 1.0 - 2.0*libxsmm_rng_f64(); else dtmp = -99.9; matrix [ address ] = (float) dtmp; } } } } } LIBXSMM_INLINE double residual_s ( unsigned int layout, float *A, unsigned int nmat, unsigned int VLEN, unsigned int lda, unsigned int m, unsigned int n, float *B, unsigned int ldb, unsigned int *nerrs, unsigned int *ncorr ) { unsigned int i, j, k, k1, row, col; double atmp, btmp, dtmp, ref, derror; static int ntimes = 0; size_t address; *nerrs = 0; *ncorr = 0; derror = 0.0; if ( layout == 102 ) { row = m; col = n; } else { row = n; col = m; } for ( k1 = 1; k1 <= nmat/VLEN; k1++ ) { for ( j = 1; j <= col; j++ ) { for ( i = 1; i <= row; i++ ) { for ( k = 1; k <= VLEN; k++ ) { address= (k1-1)*col*lda*VLEN + (j-1)*lda*VLEN + (i-1)*VLEN + (k-1); atmp = (double) A[ address ]; address= (k1-1)*col*ldb*VLEN + (j-1)*ldb*VLEN + (i-1)*VLEN + (k-1); btmp = (double) B[ address ]; ref = LIBXSMM_MAX(atmp,-atmp); if ( atmp > btmp ) { dtmp = atmp - btmp; } else { 
dtmp = btmp - atmp; } if ( isnan(dtmp) || isinf(dtmp) ) { if ( ++ntimes < 15 ) { printf("Denormal bug: A[%ld]=A(%u,%u,%u,%u) is %g B(%u,%u,%u,%u) is %g\n",address,k,i,j,k1,atmp,k,i,j,k1,btmp); } } if ( (dtmp / ref > 1.0e-4) && (dtmp > 1.0e-7) ) { *nerrs = *nerrs + 1; if ( ++ntimes < 15 ) { printf("Bug #%i: A[%ld]=A(%u,%u,%u,%u) expected=%g instead=%g err=%g\n",ntimes,address,k,i,j,k1,atmp,btmp,dtmp); } } else { if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) ) { printf("Cor #%u: A[%ld]=A(%u,%u,%u,%u) expected=%g\n",*ncorr+1,address,k,i,j,k1,atmp); } *ncorr = *ncorr + 1; } derror += dtmp; } } } } return ( derror ); } LIBXSMM_INLINE double residual_d ( unsigned int layout, double *A, unsigned int nmat, unsigned int VLEN, unsigned int lda, unsigned int m, unsigned int n, double *B, unsigned int ldb, unsigned int *nerrs, unsigned int *ncorr ) { unsigned int i, j; double atmp, btmp, dtmp, ref, derror; static int ntimes = 0; *nerrs = 0; *ncorr = 0; derror = 0.0; for ( j = 1; j<= n; j++ ) { for ( i = 1; i <= m; i++ ) { atmp = (double) A[ (j-1)*lda + (i-1)]; btmp = (double) B[ (j-1)*ldb + (i-1)]; ref = LIBXSMM_MAX(atmp,-atmp); if ( atmp >= btmp ) { dtmp = atmp - btmp; } else { dtmp = btmp - atmp; } if ( isnan(dtmp) || isinf(dtmp) ) { if ( ++ntimes < 15 ) { printf("Denormal bug: A(%u,%u) is %g B(%u,%u) is %g\n",i,j,atmp,i,j,btmp); } } if ( (dtmp / ref > 1.0e-12) && (dtmp > 1.0e-15) ) { *nerrs = *nerrs + 1; if ( ++ntimes < 15 ) { printf("Bug #%d: A(%u,%u) expected=%g instead=%g err=%g\n",ntimes,i,j,atmp,btmp,dtmp); } } else { if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) ) { printf("Cor #%u: A(%u,%u) expected=%g\n",*ncorr+1,i,j,atmp); } *ncorr = *ncorr + 1; } derror += dtmp; } } return ( derror ); } #ifdef USE_PREDEFINED_ASSEMBLY extern void getrf_(); #endif #ifdef MKL_TIMER extern double dsecnd_(); #endif #if 1 #ifndef AVX2_TESTING #define AVX2_TESTING #endif #else #ifndef AVX512_TESTING #define AVX512_TESTING #endif #endif #if !defined(AVX2_TESTING) && 
!defined(AVX512_TESTING) #define AVX2_TESTING #endif #if defined(AVX2_TESTING) && defined(AVX512_TESTING) #error Compile with either AVX2_TESTING or AVX512_TESTING never both #endif int main(int argc, char* argv[]) { unsigned int m=8, n=8, lda=8, ldb=8, nerrs, num, nmat, ntest; unsigned int layout, asize, bsize; #ifdef AVX512_TESTING unsigned int VLEND=8, VLENS=16; int arch=LIBXSMM_X86_AVX512_CORE; #else unsigned int VLEND=4, VLENS=8; int arch=LIBXSMM_X86_AVX2; #endif unsigned int ncorr; unsigned int i, j, large_entry; char side='L', uplo='L', trans='N', diag='N'; float *sa, *sb, *sc, *sd; double *da, *db, *dc, *dd, *tmpbuf; double dalpha = 1.0; float salpha; double dtmp; size_t sizea; const unsigned char *cptr; unsigned long op_count; unsigned int typesize8 = 8; const libxsmm_getrf_descriptor* desc8 = NULL; #ifdef TEST_SINGLE unsigned int typesize4 = 4; const libxsmm_getrf_descriptor* desc4 = NULL; #endif #ifdef USE_XSMM_GENERATED libxsmm_descriptor_blob blob; libxsmm_getrf_xfunction mykernel = NULL; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) void (*opcode_routine)(); unsigned char *routine_output; libxsmm_generated_code io_generated_code; int pagesize = sysconf(_SC_PAGE_SIZE); if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n"); routine_output = (unsigned char *) mmap(NULL, BUFSIZE2, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0,0); if (mprotect(routine_output, BUFSIZE2, PROT_EXEC | PROT_READ | PROT_WRITE ) == -1) fprintf(stderr,"mprotect\n"); printf("Routine ready\n"); io_generated_code.generated_code = &routine_output[0]; io_generated_code.buffer_size = BUFSIZE2; io_generated_code.code_size = 0; io_generated_code.code_type = 2; io_generated_code.last_error = 0; io_generated_code.sf_size = 0; #endif printf("\nUSAGE: %s m n lda nmat layout ntest\n",argv[0]); if ( argc <= 3 ) { printf("Compact LU (GETRF, no pivots) a mxn matrix of leading dim lda\n"); printf("This will test the jit of 1 VLEN work of nmat at a time\n"); 
printf("Defaults: m=n=lda=nmat=8, layout=102 (col major), ntest=1\n"); } if ( argc > 1 ) m = atoi(argv[1]); else m = 8; if ( argc > 2 ) n = atoi(argv[2]); else n = 8; if ( argc > 3 ) lda= atoi(argv[3]); else lda = 8; if ( argc > 4 ) nmat = atoi(argv[4]); else nmat = 8; if ( argc > 5 ) layout = atoi(argv[5]); else layout=102; if ( argc > 6 ) ntest = atoi(argv[6]); else ntest = 1; salpha = (float)dalpha; m = LIBXSMM_MAX(m,1); n = LIBXSMM_MAX(n,1); ntest = LIBXSMM_MAX(ntest,1); #ifdef TEST_SINGLE nmat = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS)); #else nmat = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND)); #endif layout = LIBXSMM_MAX(LIBXSMM_MIN(layout,102),101); if ( layout == 102 ) lda = LIBXSMM_MAX(lda,m); else lda = LIBXSMM_MAX(lda,n); if ( m >= n ) { op_count = nmat * (double)n * (double)n * (3.0*(double)m-(double)n) / 3.0; } else { op_count = nmat * (double)m * (double)m * (3.0*(double)n-(double)m) / 3.0; } printf("This is a real*%d tester for JIT compact DGETRF kernels! (m=%u n=%u lda=%u layout=%d nmat=%d)\n",typesize8,m,n,lda,layout,nmat); #ifdef USE_XSMM_GENERATED printf("This code tests the LIBXSMM generated kernels\n"); #endif #ifdef USE_PREDEFINED_ASSEMBLY printf("This code tests some predefined assembly kernel\n"); #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) printf("This code tests kernel generation directly\n"); #endif #ifdef TIME_MKL printf("This code tests MKL compact batch directly\n"); #endif #ifdef AVX512_TESTING printf("This binary tests only AVX512 codes\n"); #endif #ifdef AVX2_TESTING printf("This binary tests only AVX2 codes\n"); #endif desc8 = libxsmm_getrf_descriptor_init(&blob, typesize8, m, n, lda, layout); #ifdef TEST_SINGLE desc4 = libxsmm_getrf_descriptor_init(&blob, typesize4, m, n, lda, layout); #endif #ifdef USE_XSMM_GENERATED printf("calling libxsmm_dispatch_getrf: typesize8=%u\n",typesize8); mykernel = libxsmm_dispatch_getrf(desc8); printf("done calling libxsmm_dispatch_getrf: typesize8=%u\n",typesize8); if ( 
mykernel == NULL ) printf("R8 Kernel after the create call is null\n"); #ifdef TEST_SINGLE mykernel = libxsmm_dispatch_getrf(desc4); if ( mykernel == NULL ) printf("R4 kernel after the create call is null\n"); #endif #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) libxsmm_generator_getrf_kernel( &io_generated_code, desc8, arch ); #endif #ifndef NO_ACCURACY_CHECK printf("mallocing matrices\n"); #endif if ( layout == 102 ) sizea = lda*n*nmat; else sizea = lda*m*nmat; sa = (float *) malloc ( sizea*sizeof(float) ); da = (double *) malloc ( sizea*sizeof(double) ); sc = (float *) malloc ( sizea*sizeof(float) ); dc = (double *) malloc ( sizea*sizeof(double) ); sd = (float *) malloc ( sizea*sizeof(float) ); dd = (double *) malloc ( sizea*sizeof(double) ); large_entry = LIBXSMM_MIN(256,sizea); large_entry = large_entry - (large_entry%16); while ( large_entry > m*n*nmat ) { large_entry /= 2; } large_entry = LIBXSMM_MAX(large_entry,4); #ifndef NO_ACCURACY_CHECK printf("filling matrices\n"); #endif sfill_matrix ( layout, sa, nmat, lda, m, n, VLEND ); #ifdef TRIANGLE_IS_IDENTITY printf("Warning: setting triangular matrix to identity. 
Not good for accuracy testing\n"); dfill_identity ( da, lda, m, m, VLEND, nmat/VLEND ); #else dfill_matrix ( layout, da, nmat, lda, m, n, VLEND ); #endif #ifndef NO_ACCURACY_CHECK for ( i = 0; i < sizea; i++ ) sc[i]=sa[i]; for ( i = 0; i < sizea; i++ ) dc[i]=da[i]; for ( i = 0; i < sizea; i++ ) sd[i]=sa[i]; for ( i = 0; i < sizea; i++ ) dd[i]=da[i]; printf("Pointing at the kernel now\n"); #endif #ifdef USE_XSMM_GENERATED cptr = (const unsigned char*) mykernel; #endif #ifdef USE_PREDEFINED_ASSEMBLY cptr = (const unsigned char*) getrf_; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) cptr = (const unsigned char*) &routine_output[0]; opcode_routine = (void *) &cptr[0]; #endif #ifndef TIME_MKL # define DUMP_ASSEMBLY_FILE #endif #ifdef DUMP_ASSEMBLY_FILE #define ASSEMBLY_DUMP_SIZE 4000 printf("Dumping assembly file (first %d bytes)\n",ASSEMBLY_DUMP_SIZE); FILE *fp = fopen("foo.s","w"); char buffer[80]; fputs("\t.text\n",fp); fputs("\t.align 256\n",fp); fputs("\t.globl getrf_\n",fp); fputs("getrf_:\n",fp); for (i = 0; i < ASSEMBLY_DUMP_SIZE; i+=4 ) { sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]); fputs(buffer,fp); } fputs("\tretq\n",fp); fputs("\t.type getrf_,@function\n",fp); fputs("\t.size getrf_,.-getrf_\n",fp); fclose(fp); #endif #if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL) # include int info; MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR; MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT; MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER; MKL_TRANSPOSE TRANSA = (trans == 'N' || trans == 'n') ? MKL_NOTRANS : MKL_TRANS; MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? 
MKL_NONUNIT : MKL_UNIT; MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact(); #if 0 MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX; #endif #endif #ifndef NO_ACCURACY_CHECK printf("Before routine, initial A(1,1)=%g A[%d]=%g\n",da[0],large_entry,da[large_entry]); #endif #ifdef USE_PREDEFINED_ASSEMBLY double one = 1.0; #endif double timer, firsttime = 0; #ifdef MKL_TIMER double tmptimer; tmptimer = dsecnd_(); #else unsigned long long l_start, l_end; #endif timer = 0.0; for ( j = 0; j < (int)ntest; j++ ) { #ifndef TRIANGLE_IS_IDENTITY for ( i = 0; i < (int)sizea; i++ ) dc[i]=da[i]; #endif for ( i = 0 , num = 0; i < (int)nmat; i+= (int)VLEND, num++ ) { double *Ap; if ( layout == 102 ) Ap = &dc[num*lda*n*VLEND]; else Ap = &dc[num*lda*m*VLEND]; #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #if !defined(USE_XSMM_GENERATED) && !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_KERNEL_GENERATION_DIRECTLY) && !defined(TIME_MKL) && !defined(USE_PREDEFINED_ASSEMBLY_XCT) gen_compact_dgetrf_ ( &layout, &m, &n, Ap, &lda, &VLEND ); #endif #ifdef USE_XSMM_GENERATED mykernel ( Ap, Ap, NULL ); #endif #ifdef USE_PREDEFINED_ASSEMBLY getrf_ ( Ap, Ap, &one ); #endif #ifdef USE_KERNEL_GENERATION_DIRECTLY (*opcode_routine)( Ap ); #endif #ifdef TIME_MKL #if 1 info = 0; mkl_dgetrfnp_compact ( CLAYOUT, m, n, dc, lda, &info, CMP_FORMAT, nmat ); i+=nmat; /* Because MKL will do everything */ #else mkl_dgetrfnp_compact ( CLAYOUT, m, n, Ap, lda, &info, CMP_FORMAT, VLEND ); #endif #endif #ifdef USE_PREDEFINED_ASSEMBLY_XCT getrf_xct_ ( Ap, &one ); #endif #ifdef MKL_TIMER dtmp = dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); dtmp = libxsmm_timer_duration(l_start,l_end); #endif if ( j == 0 ) firsttime=dtmp; timer += dtmp; } } if ( ntest >= 100 ) { /* Skip the first timing: super necessary if using MKL */ timer = (timer-firsttime)/((double)(ntest-1)); } else { timer /= ((double)ntest); } #ifndef NO_ACCURACY_CHECK printf("Average time to get through %u 
matrices: %g\n",nmat,timer); printf("Gflops: %g\n",(double)op_count/(timer*1.0e9)); printf("after routine, new C(1,1)=%g C[%d]=%g\n",dc[0],large_entry,dc[large_entry]); #endif #ifdef TEST_SINGLE printf("Before r4 routine, initial C(1,1)=%g C[%d]=%g\n",sc[0],large_entry,sc[large_entry]); for ( i = 0 , num = 0; i < nmat; i+= VLENS, num++ ) { float *Ap; if ( layout == 102 ) Ap = &sc[num*lda*n*VLENS]; else Ap = &sc[num*lda*m*VLENS]; #ifdef USE_XSMM_GENERATED mykernel ( Ap, Ap, NULL ); #endif #ifdef USE_KERNEL_GENERATION_DIRECTLY (*opcode_routine)( Ap ); #endif #ifdef TIME_MKL info = 0; mkl_sgetrfnp_compact ( CLAYOUT, m, n, sc, lda, &info, CMP_FORMAT, nmat ); i+=nmat; /* Because MKL will do everything */ #endif } printf("after r4 routine, new C(1,1)=%g C[%d]=%g\n",dc[0],large_entry,dc[large_entry]); #endif #ifndef NO_ACCURACY_CHECK /* Call some reference code now on a copy of the B matrix (C) */ double timer2 = 0.0; for ( j = 0; j < (int)ntest; j++ ) { #ifndef TRIANGLE_IS_IDENTITY for ( i = 0; i < (int)sizea; i++ ) dd[i]=da[i]; #endif #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #if !defined(USE_MKL_FOR_REFERENCE) && !defined(LIBXSMM_NOFORTRAN) && (!defined(__BLAS) || (0 != __BLAS)) compact_dgetrf_ ( &layout, &m, &n, dd, &lda, &nmat, &VLEND ); #elif defined(USE_MKL_FOR_REFERENCE) mkl_dgetrfnp_compact ( CLAYOUT, m, n, dd, lda, info, CMP_FORMAT, nmat ); #endif #ifdef MKL_TIMER timer2 += dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); timer2 += libxsmm_timer_duration(l_start,l_end); #endif } timer2 /= ((double)ntest); printf("Reference time=%g Reference Gflops=%g\n",timer2,op_count/(timer2*1.0e9)); #ifndef TEST_SINGLE /* Compute the residual between B and C */ dtmp = residual_d ( layout, dc, nmat, VLEND, lda, m, n, dd, lda, &nerrs, &ncorr ); printf("R8 m=%u n=%u lda=%u error: %g number of errors: %u corrects: %u",m,n,lda,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout); printf("\n"); 
#endif #ifdef TEST_SINGLE /* Call some reference code now on a copy of the B matrix (C) */ for ( i = 0; i < lda*n*nmat; i++ ) sd[i]=sa[i]; compact_sgetrf_ ( &layout, &m, &n, sd, &lda, &nmat, &VLENS ); /* Compute the residual between C and D */ dtmp = residual_s ( layout, sc, nmat, VLENS, lda, m, n, sd, lda, &nerrs, &ncorr ); printf("float m=%u n=%u lda=%u error: %g number of errors: %u corrects: %u\n",m,n,lda,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n); printf("\n"); #endif #else for ( j = 0, nerrs = 0; j < lda*n*nmat; j++ ) { if ( isnan(dc[j]) || isinf(dc[j]) ) { if ( ++nerrs < 10 ) { printf("WARNING: dc[%d]=%g\n",j,dc[j]); } } } printf("%g,real*8 m=%u n=%u lda=%u Denormals=%u Time=%g Gflops=%g",op_count/(timer*1.0e9),m,n,lda,nerrs,timer,op_count/(timer*1.0e9)); if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n); printf("\n"); #endif free(dd); free(sd); free(dc); free(sc); free(da); free(sa); return 0; } libxsmm-1.17/samples/packed/getrf/getrf.vcxproj000066400000000000000000000551151415223013700216500ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 getrf {8DD27DEF-EFA5-4CC3-87E6-A5D804C744A6} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_NOFORTRAN;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_NOFORTRAN;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_NOFORTRAN;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_NOFORTRAN;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_NOFORTRAN;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;LIBXSMM_NOFORTRAN;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/packed/getrf/sgetrf.f000066400000000000000000001014511415223013700205600ustar00rootroot00000000000000 SUBROUTINE SGETRFNP( M, N, A, LDA, INFO ) ! ! -- LAPACK computational routine (version 3.4.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. INTEGER INFO, LDA, M, N ! .. ! .. Array Arguments .. REAL*4 A( LDA, * ) ! .. ! ! ===================================================================== ! ! .. Parameters .. REAL*4 ONE, ZERO PARAMETER ( ONE = 1.0+0, ZERO = 0.0+0 ) ! .. ! .. Local Scalars .. REAL*4 SFMIN INTEGER I, J, JP ! .. ! .. External Functions .. REAL*4 SLAMCH INTEGER IDAMAX EXTERNAL SLAMCH, IDAMAX ! .. ! .. External Subroutines .. EXTERNAL SGER, SSCAL, SSWAP, XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX, MIN ! .. ! .. Executable Statements .. ! ! Test the input parameters. ! INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGETRFNP', -INFO ) RETURN END IF ! ! Quick return if possible ! IF( M.EQ.0 .OR. N.EQ.0 ) RETURN ! ! Compute machine safe minimum ! SFMIN = SLAMCH('S') ! DO 10 J = 1, MIN ( M, N ) JP = J IF( A( JP, J ).NE.ZERO ) THEN ! ! Compute elements J+1:M of J-th column. ! IF( J.LT.M ) THEN IF( ABS(A( J, J )) .GE. 
SFMIN ) THEN CALL SSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) ELSE DO 20 I = 1, M-J A( J+I, J ) = A( J+I, J ) / A( J, J ) 20 CONTINUE END IF END IF ! ELSE IF( INFO.EQ.0 ) THEN ! INFO = J END IF ! IF( J.LT.MIN( M, N ) ) THEN ! ! Update trailing submatrix. ! CALL SGER( M-J, N-J, -ONE, A( J+1, J ), 1,& A( J, J+1 ),LDA,A( J+1, J+1 ), LDA ) END IF 10 CONTINUE RETURN ! ! End of SGETRFNP ! END SUBROUTINE SGETRF( M, N, A, LDA, IPIV, INFO ) ! ! -- LAPACK computational routine (version 3.4.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. INTEGER INFO, LDA, M, N ! .. ! .. Array Arguments .. INTEGER IPIV( * ) REAL*4 A( LDA, * ) ! .. ! ! ===================================================================== ! ! .. Parameters .. REAL*4 ONE PARAMETER ( ONE = 1.0E+0 ) ! .. ! .. Local Scalars .. INTEGER I, IINFO, J, JB, NB ! .. ! .. External Subroutines .. EXTERNAL SGEMM, SGETF2, SLASWP, STRSM, XERBLA ! .. ! .. External Functions .. INTEGER ILAENV EXTERNAL ILAENV ! .. ! .. Intrinsic Functions .. INTRINSIC MAX, MIN ! .. ! .. Executable Statements .. ! ! Test the input parameters. ! INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGETRF', -INFO ) RETURN END IF ! ! Quick return if possible ! IF( M.EQ.0 .OR. N.EQ.0 ) RETURN ! ! Determine the block size for this environment. ! NB = ILAENV( 1, 'SGETRF', ' ', M, N, -1, -1 ) IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN ! ! Use unblocked code. ! CALL SGETF2( M, N, A, LDA, IPIV, INFO ) ELSE ! ! Use blocked code. ! DO 20 J = 1, MIN( M, N ), NB JB = MIN( MIN( M, N )-J+1, NB ) ! ! Factor diagonal and subdiagonal blocks and test for exact ! singularity. ! CALL SGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) ! ! Adjust INFO and the pivot indices. ! IF( INFO.EQ.0 .AND. 
IINFO.GT.0 ) INFO = IINFO + J - 1 DO 10 I = J, MIN( M, J+JB-1 ) IPIV( I ) = J - 1 + IPIV( I ) 10 CONTINUE ! ! Apply interchanges to columns 1:J-1. ! CALL SLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) ! IF( J+JB.LE.N ) THEN ! ! Apply interchanges to columns J+JB:N. ! CALL SLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1,& IPIV, 1 ) ! ! Compute block row of U. ! CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB,& N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ),& LDA ) IF( J+JB.LE.M ) THEN ! ! Update trailing submatrix. ! CALL SGEMM( 'No transpose', 'No transpose', M-J-JB+1,& N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA,& A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ),& LDA ) END IF END IF 20 CONTINUE END IF RETURN ! ! End of SGETRF ! END SUBROUTINE SGETF2( M, N, A, LDA, IPIV, INFO ) ! ! -- LAPACK computational routine (version 3.4.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. INTEGER INFO, LDA, M, N ! .. ! .. Array Arguments .. INTEGER IPIV( * ) REAL*4 A( LDA, * ) ! .. ! ! ===================================================================== ! ! .. Parameters .. REAL*4 ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) ! .. ! .. Local Scalars .. REAL*4 SFMIN INTEGER I, J, JP ! .. ! .. External Functions .. REAL*4 SLAMCH INTEGER IDAMAX EXTERNAL SLAMCH, IDAMAX ! .. ! .. External Subroutines .. EXTERNAL SGER, SSCAL, SSWAP, XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX, MIN ! .. ! .. Executable Statements .. ! ! Test the input parameters. ! INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGETF2', -INFO ) RETURN END IF ! ! Quick return if possible ! IF( M.EQ.0 .OR. N.EQ.0 ) RETURN ! ! Compute machine safe minimum ! SFMIN = SLAMCH('S') ! DO 10 J = 1, MIN( M, N ) ! ! Find pivot and test for singularity. ! 
JP = J - 1 + IDAMAX( M-J+1, A( J, J ), 1 ) IPIV( J ) = JP IF( A( JP, J ).NE.ZERO ) THEN ! ! Apply the interchange to columns 1:N. ! IF(JP.NE.J) CALL SSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) ! ! Compute elements J+1:M of J-th column. ! IF( J.LT.M ) THEN IF( ABS(A( J, J )) .GE. SFMIN ) THEN CALL SSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) ELSE DO 20 I = 1, M-J A( J+I, J ) = A( J+I, J ) / A( J, J ) 20 CONTINUE END IF END IF ! ELSE IF( INFO.EQ.0 ) THEN ! INFO = J END IF ! IF( J.LT.MIN( M, N ) ) THEN ! ! Update trailing submatrix. ! CALL SGER( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), & LDA, A( J+1, J+1 ), LDA ) END IF 10 CONTINUE RETURN ! ! End of SGETF2 ! END SUBROUTINE SLASWP( N, A, LDA, K1, K2, IPIV, INCX ) ! ! -- LAPACK auxiliary routine (version 3.4.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. INTEGER INCX, K1, K2, LDA, N ! .. ! .. Array Arguments .. INTEGER IPIV( * ) REAL*4 A( LDA, * ) ! .. ! ! ===================================================================== ! ! .. Local Scalars .. INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 REAL*4 TEMP ! .. ! .. Executable Statements .. ! ! Interchange row I with row IPIV(I) for each of rows K1 through K2. ! IF( INCX.GT.0 ) THEN IX0 = K1 I1 = K1 I2 = K2 INC = 1 ELSE IF( INCX.LT.0 ) THEN IX0 = 1 + ( 1-K2 )*INCX I1 = K2 I2 = K1 INC = -1 ELSE RETURN END IF ! N32 = ( N / 32 )*32 IF( N32.NE.0 ) THEN DO 30 J = 1, N32, 32 IX = IX0 DO 20 I = I1, I2, INC IP = IPIV( IX ) IF( IP.NE.I ) THEN DO 10 K = J, J + 31 TEMP = A( I, K ) A( I, K ) = A( IP, K ) A( IP, K ) = TEMP 10 CONTINUE END IF IX = IX + INCX 20 CONTINUE 30 CONTINUE END IF IF( N32.NE.N ) THEN N32 = N32 + 1 IX = IX0 DO 50 I = I1, I2, INC IP = IPIV( IX ) IF( IP.NE.I ) THEN DO 40 K = N32, N TEMP = A( I, K ) A( I, K ) = A( IP, K ) A( IP, K ) = TEMP 40 CONTINUE END IF IX = IX + INCX 50 CONTINUE END IF ! RETURN ! ! 
End of SLASWP ! END SUBROUTINE STRSM(SIDE,UPLO,TRANSA,DIAG,M,N,ALPHA,A,LDA,B,LDB) ! ! -- Reference BLAS level3 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. REAL*4 ALPHA INTEGER LDA,LDB,M,N CHARACTER DIAG,SIDE,TRANSA,UPLO ! .. ! .. Array Arguments .. REAL*4 A(LDA,*),B(LDB,*) ! .. ! ! ===================================================================== ! ! .. External Functions .. LOGICAL LSAME EXTERNAL LSAME ! .. ! .. External Subroutines .. EXTERNAL XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX ! .. ! .. Local Scalars .. REAL*4 TEMP INTEGER I,INFO,J,K,NROWA LOGICAL LSIDE,NOUNIT,UPPER ! .. ! .. Parameters .. REAL*4 ONE,ZERO PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) ! .. ! ! Test the input parameters. ! LSIDE = LSAME(SIDE,'L') IF (LSIDE) THEN NROWA = M ELSE NROWA = N END IF NOUNIT = LSAME(DIAG,'N') UPPER = LSAME(UPLO,'U') ! INFO = 0 IF ((.NOT.LSIDE) .AND. (.NOT.LSAME(SIDE,'R'))) THEN INFO = 1 ELSE IF ((.NOT.UPPER) .AND. (.NOT.LSAME(UPLO,'L'))) THEN INFO = 2 ELSE IF ((.NOT.LSAME(TRANSA,'N')) .AND. & (.NOT.LSAME(TRANSA,'T')) .AND. & (.NOT.LSAME(TRANSA,'C'))) THEN INFO = 3 ELSE IF ((.NOT.LSAME(DIAG,'U')) .AND. (.NOT.LSAME(DIAG,'N'))) THEN INFO = 4 ELSE IF (M.LT.0) THEN INFO = 5 ELSE IF (N.LT.0) THEN INFO = 6 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 9 ELSE IF (LDB.LT.MAX(1,M)) THEN INFO = 11 END IF IF (INFO.NE.0) THEN CALL XERBLA('STRSM ',INFO) RETURN END IF ! ! Quick return if possible. ! IF (M.EQ.0 .OR. N.EQ.0) RETURN ! ! And when alpha.eq.zero. ! IF (ALPHA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M B(I,J) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF ! ! Start the operations. ! IF (LSIDE) THEN IF (LSAME(TRANSA,'N')) THEN ! ! Form B := alpha*inv( A )*B. ! 
IF (UPPER) THEN DO 60 J = 1,N IF (ALPHA.NE.ONE) THEN DO 30 I = 1,M B(I,J) = ALPHA*B(I,J) 30 CONTINUE END IF DO 50 K = M,1,-1 IF (B(K,J).NE.ZERO) THEN IF (NOUNIT) B(K,J) = B(K,J)/A(K,K) DO 40 I = 1,K - 1 B(I,J) = B(I,J) - B(K,J)*A(I,K) 40 CONTINUE END IF 50 CONTINUE 60 CONTINUE ELSE DO 100 J = 1,N IF (ALPHA.NE.ONE) THEN DO 70 I = 1,M B(I,J) = ALPHA*B(I,J) 70 CONTINUE END IF DO 90 K = 1,M IF (B(K,J).NE.ZERO) THEN IF (NOUNIT) B(K,J) = B(K,J)/A(K,K) DO 80 I = K + 1,M B(I,J) = B(I,J) - B(K,J)*A(I,K) 80 CONTINUE END IF 90 CONTINUE 100 CONTINUE END IF ELSE ! ! Form B := alpha*inv( A**T )*B. ! IF (UPPER) THEN DO 130 J = 1,N DO 120 I = 1,M TEMP = ALPHA*B(I,J) DO 110 K = 1,I - 1 TEMP = TEMP - A(K,I)*B(K,J) 110 CONTINUE IF (NOUNIT) TEMP = TEMP/A(I,I) B(I,J) = TEMP 120 CONTINUE 130 CONTINUE ELSE DO 160 J = 1,N DO 150 I = M,1,-1 TEMP = ALPHA*B(I,J) DO 140 K = I + 1,M TEMP = TEMP - A(K,I)*B(K,J) 140 CONTINUE IF (NOUNIT) TEMP = TEMP/A(I,I) B(I,J) = TEMP 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF (LSAME(TRANSA,'N')) THEN ! ! Form B := alpha*B*inv( A ). ! IF (UPPER) THEN DO 210 J = 1,N IF (ALPHA.NE.ONE) THEN DO 170 I = 1,M B(I,J) = ALPHA*B(I,J) 170 CONTINUE END IF DO 190 K = 1,J - 1 IF (A(K,J).NE.ZERO) THEN DO 180 I = 1,M B(I,J) = B(I,J) - A(K,J)*B(I,K) 180 CONTINUE END IF 190 CONTINUE IF (NOUNIT) THEN TEMP = ONE/A(J,J) DO 200 I = 1,M B(I,J) = TEMP*B(I,J) 200 CONTINUE END IF 210 CONTINUE ELSE DO 260 J = N,1,-1 IF (ALPHA.NE.ONE) THEN DO 220 I = 1,M B(I,J) = ALPHA*B(I,J) 220 CONTINUE END IF DO 240 K = J + 1,N IF (A(K,J).NE.ZERO) THEN DO 230 I = 1,M B(I,J) = B(I,J) - A(K,J)*B(I,K) 230 CONTINUE END IF 240 CONTINUE IF (NOUNIT) THEN TEMP = ONE/A(J,J) DO 250 I = 1,M B(I,J) = TEMP*B(I,J) 250 CONTINUE END IF 260 CONTINUE END IF ELSE ! ! Form B := alpha*B*inv( A**T ). ! 
IF (UPPER) THEN DO 310 K = N,1,-1 IF (NOUNIT) THEN TEMP = ONE/A(K,K) DO 270 I = 1,M B(I,K) = TEMP*B(I,K) 270 CONTINUE END IF DO 290 J = 1,K - 1 IF (A(J,K).NE.ZERO) THEN TEMP = A(J,K) DO 280 I = 1,M B(I,J) = B(I,J) - TEMP*B(I,K) 280 CONTINUE END IF 290 CONTINUE IF (ALPHA.NE.ONE) THEN DO 300 I = 1,M B(I,K) = ALPHA*B(I,K) 300 CONTINUE END IF 310 CONTINUE ELSE DO 360 K = 1,N IF (NOUNIT) THEN TEMP = ONE/A(K,K) DO 320 I = 1,M B(I,K) = TEMP*B(I,K) 320 CONTINUE END IF DO 340 J = K + 1,N IF (A(J,K).NE.ZERO) THEN TEMP = A(J,K) DO 330 I = 1,M B(I,J) = B(I,J) - TEMP*B(I,K) 330 CONTINUE END IF 340 CONTINUE IF (ALPHA.NE.ONE) THEN DO 350 I = 1,M B(I,K) = ALPHA*B(I,K) 350 CONTINUE END IF 360 CONTINUE END IF END IF END IF ! RETURN ! ! End of STRSM . ! END SUBROUTINE SGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) ! ! -- Reference BLAS level3 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. REAL*4 ALPHA,BETA INTEGER K,LDA,LDB,LDC,M,N CHARACTER TRANSA,TRANSB ! .. ! .. Array Arguments .. REAL*4 A(LDA,*),B(LDB,*),C(LDC,*) ! .. ! ! ===================================================================== ! ! .. External Functions .. LOGICAL LSAME EXTERNAL LSAME ! .. ! .. External Subroutines .. EXTERNAL XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX ! .. ! .. Local Scalars .. REAL*4 TEMP INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB LOGICAL NOTA,NOTB ! .. ! .. Parameters .. REAL*4 ONE,ZERO PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) ! .. ! ! Set NOTA and NOTB as true if A and B respectively are not ! transposed and set NROWA, NCOLA and NROWB as the number of rows ! and columns of A and the number of rows of B respectively. ! NOTA = LSAME(TRANSA,'N') NOTB = LSAME(TRANSB,'N') IF (NOTA) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF (NOTB) THEN NROWB = K ELSE NROWB = N END IF ! ! Test the input parameters. ! 
INFO = 0 IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANSA,'C')) .AND. & (.NOT.LSAME(TRANSA,'T'))) THEN INFO = 1 ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANSB,'C')) .AND. & (.NOT.LSAME(TRANSB,'T'))) THEN INFO = 2 ELSE IF (M.LT.0) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 8 ELSE IF (LDB.LT.MAX(1,NROWB)) THEN INFO = 10 ELSE IF (LDC.LT.MAX(1,M)) THEN INFO = 13 END IF IF (INFO.NE.0) THEN CALL XERBLA('SGEMM ',INFO) RETURN END IF ! ! Quick return if possible. ! IF ((M.EQ.0) .OR. (N.EQ.0) .OR. & (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN ! ! And if alpha.eq.zero. ! IF (ALPHA.EQ.ZERO) THEN IF (BETA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M C(I,J) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1,N DO 30 I = 1,M C(I,J) = BETA*C(I,J) 30 CONTINUE 40 CONTINUE END IF RETURN END IF ! ! Start the operations. ! IF (NOTB) THEN IF (NOTA) THEN ! ! Form C := alpha*A*B + beta*C. ! DO 90 J = 1,N IF (BETA.EQ.ZERO) THEN DO 50 I = 1,M C(I,J) = ZERO 50 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 60 I = 1,M C(I,J) = BETA*C(I,J) 60 CONTINUE END IF DO 80 L = 1,K IF (B(L,J).NE.ZERO) THEN TEMP = ALPHA*B(L,J) DO 70 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 70 CONTINUE END IF 80 CONTINUE 90 CONTINUE ELSE ! ! Form C := alpha*A**T*B + beta*C ! DO 120 J = 1,N DO 110 I = 1,M TEMP = ZERO DO 100 L = 1,K TEMP = TEMP + A(L,I)*B(L,J) 100 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 110 CONTINUE 120 CONTINUE END IF ELSE IF (NOTA) THEN ! ! Form C := alpha*A*B**T + beta*C ! DO 170 J = 1,N IF (BETA.EQ.ZERO) THEN DO 130 I = 1,M C(I,J) = ZERO 130 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 140 I = 1,M C(I,J) = BETA*C(I,J) 140 CONTINUE END IF DO 160 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*B(J,L) DO 150 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 150 CONTINUE END IF 160 CONTINUE 170 CONTINUE ELSE ! ! Form C := alpha*A**T*B**T + beta*C ! 
DO 200 J = 1,N DO 190 I = 1,M TEMP = ZERO DO 180 L = 1,K TEMP = TEMP + A(L,I)*B(J,L) 180 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 190 CONTINUE 200 CONTINUE END IF END IF ! RETURN ! ! End of SGEMM . ! END SUBROUTINE SSWAP(N,SX,INCX,SY,INCY) ! ! -- Reference BLAS level1 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. INTEGER INCX,INCY,N ! .. ! .. Array Arguments .. REAL SX(*),SY(*) ! .. ! ! ===================================================================== ! ! .. Local Scalars .. REAL STEMP INTEGER I,IX,IY,M,MP1 ! .. ! .. Intrinsic Functions .. INTRINSIC MOD ! .. IF (N.LE.0) RETURN IF (INCX.EQ.1 .AND. INCY.EQ.1) THEN ! ! code for both increments equal to 1 ! ! ! clean-up loop ! M = MOD(N,3) IF (M.NE.0) THEN DO I = 1,M STEMP = SX(I) SX(I) = SY(I) SY(I) = STEMP END DO IF (N.LT.3) RETURN END IF MP1 = M + 1 DO I = MP1,N,3 STEMP = SX(I) SX(I) = SY(I) SY(I) = STEMP STEMP = SX(I+1) SX(I+1) = SY(I+1) SY(I+1) = STEMP STEMP = SX(I+2) SX(I+2) = SY(I+2) SY(I+2) = STEMP END DO ELSE ! ! code for unequal increments or equal increments not equal ! to 1 ! IX = 1 IY = 1 IF (INCX.LT.0) IX = (-N+1)*INCX + 1 IF (INCY.LT.0) IY = (-N+1)*INCY + 1 DO I = 1,N STEMP = SX(IX) SX(IX) = SY(IY) SY(IY) = STEMP IX = IX + INCX IY = IY + INCY END DO END IF RETURN END SUBROUTINE SSCAL(N,SA,SX,INCX) ! ! -- Reference BLAS level1 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. REAL SA INTEGER INCX,N ! .. ! .. Array Arguments .. REAL SX(*) ! .. ! ! ===================================================================== ! ! .. Local Scalars .. INTEGER I,M,MP1,NINCX ! .. ! .. Intrinsic Functions .. 
INTRINSIC MOD ! .. IF (N.LE.0 .OR. INCX.LE.0) RETURN IF (INCX.EQ.1) THEN ! ! code for increment equal to 1 ! ! ! clean-up loop ! M = MOD(N,5) IF (M.NE.0) THEN DO I = 1,M SX(I) = SA*SX(I) END DO IF (N.LT.5) RETURN END IF MP1 = M + 1 DO I = MP1,N,5 SX(I) = SA*SX(I) SX(I+1) = SA*SX(I+1) SX(I+2) = SA*SX(I+2) SX(I+3) = SA*SX(I+3) SX(I+4) = SA*SX(I+4) END DO ELSE ! ! code for increment not equal to 1 ! NINCX = N*INCX DO I = 1,NINCX,INCX SX(I) = SA*SX(I) END DO END IF RETURN END SUBROUTINE SGER(M,N,ALPHA,X,INCX,Y,INCY,A,LDA) ! ! -- Reference BLAS level2 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. REAL ALPHA INTEGER INCX,INCY,LDA,M,N ! .. ! .. Array Arguments .. REAL A(LDA,*),X(*),Y(*) ! .. ! ! ===================================================================== ! ! .. Parameters .. REAL ZERO PARAMETER (ZERO=0.0E+0) ! .. ! .. Local Scalars .. REAL TEMP INTEGER I,INFO,IX,J,JY,KX ! .. ! .. External Subroutines .. EXTERNAL XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX ! .. ! ! Test the input parameters. ! INFO = 0 IF (M.LT.0) THEN INFO = 1 ELSE IF (N.LT.0) THEN INFO = 2 ELSE IF (INCX.EQ.0) THEN INFO = 5 ELSE IF (INCY.EQ.0) THEN INFO = 7 ELSE IF (LDA.LT.MAX(1,M)) THEN INFO = 9 END IF IF (INFO.NE.0) THEN CALL XERBLA('SGER ',INFO) RETURN END IF ! ! Quick return if possible. ! IF ((M.EQ.0) .OR. (N.EQ.0) .OR. (ALPHA.EQ.ZERO)) RETURN ! ! Start the operations. In this version the elements of A are ! accessed sequentially with one pass through A. ! 
IF (INCY.GT.0) THEN JY = 1 ELSE JY = 1 - (N-1)*INCY END IF IF (INCX.EQ.1) THEN DO 20 J = 1,N IF (Y(JY).NE.ZERO) THEN TEMP = ALPHA*Y(JY) DO 10 I = 1,M A(I,J) = A(I,J) + X(I)*TEMP 10 CONTINUE END IF JY = JY + INCY 20 CONTINUE ELSE IF (INCX.GT.0) THEN KX = 1 ELSE KX = 1 - (M-1)*INCX END IF DO 40 J = 1,N IF (Y(JY).NE.ZERO) THEN TEMP = ALPHA*Y(JY) IX = KX DO 30 I = 1,M A(I,J) = A(I,J) + X(IX)*TEMP IX = IX + INCX 30 CONTINUE END IF JY = JY + INCY 40 CONTINUE END IF ! RETURN ! ! End of SGER . ! END INTEGER FUNCTION ISAMAX(N,SX,INCX) ! ! -- Reference BLAS level1 routine (version 3.4.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. INTEGER INCX,N ! .. ! .. Array Arguments .. REAL SX(*) ! .. ! ! ===================================================================== ! ! .. Local Scalars .. REAL SMAX INTEGER I,IX ! .. ! .. Intrinsic Functions .. INTRINSIC ABS ! .. ISAMAX = 0 IF (N.LT.1 .OR. INCX.LE.0) RETURN ISAMAX = 1 IF (N.EQ.1) RETURN IF (INCX.EQ.1) THEN ! ! code for increment equal to 1 ! SMAX = ABS(SX(1)) DO I = 2,N IF (ABS(SX(I)).GT.SMAX) THEN ISAMAX = I SMAX = ABS(SX(I)) END IF END DO ELSE ! ! code for increment not equal to 1 ! IX = 1 SMAX = ABS(SX(1)) IX = IX + INCX DO I = 2,N IF (ABS(SX(IX)).GT.SMAX) THEN ISAMAX = I SMAX = ABS(SX(IX)) END IF IX = IX + INCX END DO END IF RETURN END REAL FUNCTION SLAMCH( CMACH ) ! ! -- LAPACK auxiliary routine (version 3.4.0) -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! November 2011 ! ! .. Scalar Arguments .. CHARACTER CMACH ! .. ! ! ===================================================================== ! ! .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) ! .. ! .. Local Scalars .. REAL RND, EPS, SFMIN, SMALL, RMACH ! .. ! .. External Functions .. 
LOGICAL LSAME EXTERNAL LSAME ! .. ! .. Intrinsic Functions .. INTRINSIC DIGITS, EPSILON, HUGE, MAXEXPONENT INTRINSIC MINEXPONENT, RADIX, TINY ! .. ! .. Executable Statements .. ! ! ! Assume rounding, not chopping. Always. ! RND = ONE ! IF( ONE.EQ.RND ) THEN EPS = EPSILON(ZERO) * 0.5 ELSE EPS = EPSILON(ZERO) END IF ! IF( LSAME( CMACH, 'E' ) ) THEN RMACH = EPS ELSE IF( LSAME( CMACH, 'S' ) ) THEN SFMIN = TINY(ZERO) SMALL = ONE / HUGE(ZERO) IF( SMALL.GE.SFMIN ) THEN ! ! Use SMALL plus a bit, to avoid the possibility of rounding ! causing overflow when computing 1/sfmin. ! SFMIN = SMALL*( ONE+EPS ) END IF RMACH = SFMIN ELSE IF( LSAME( CMACH, 'B' ) ) THEN RMACH = RADIX(ZERO) ELSE IF( LSAME( CMACH, 'P' ) ) THEN RMACH = EPS * RADIX(ZERO) ELSE IF( LSAME( CMACH, 'N' ) ) THEN RMACH = DIGITS(ZERO) ELSE IF( LSAME( CMACH, 'R' ) ) THEN RMACH = RND ELSE IF( LSAME( CMACH, 'M' ) ) THEN RMACH = MINEXPONENT(ZERO) ELSE IF( LSAME( CMACH, 'U' ) ) THEN RMACH = tiny(zero) ELSE IF( LSAME( CMACH, 'L' ) ) THEN RMACH = MAXEXPONENT(ZERO) ELSE IF( LSAME( CMACH, 'O' ) ) THEN RMACH = HUGE(ZERO) ELSE RMACH = ZERO END IF ! SLAMCH = RMACH RETURN ! ! End of SLAMCH ! END REAL FUNCTION SLAMC3( A, B ) ! ! -- LAPACK auxiliary routine (version 3.4.0) -- ! Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. ! November 2010 ! ! .. Scalar Arguments .. REAL A, B ! .. ! ===================================================================== ! ! .. Executable Statements .. ! SLAMC3 = A + B ! RETURN ! ! End of SLAMC3 ! END libxsmm-1.17/samples/packed/trmm/000077500000000000000000000000001415223013700167655ustar00rootroot00000000000000libxsmm-1.17/samples/packed/trmm/Makefile000066400000000000000000000074731415223013700204400ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . 
CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) # Fortran code here does not allow for PEDANTIC=2 # override PEDANTIC = 1 PEDANTIC = 0 BLAS = 0 OMP = 0 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (,$(strip $(FC))) $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state 
$(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ endif $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(OBJECTS) $(FTNOBJS) $(LIBDEP) $(LIB_FLD) -o $@ $(OBJECTS) $(FTNOBJS) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/packed/trmm/blas_aux.c000066400000000000000000000010171415223013700207260ustar00rootroot00000000000000/* Optionally link-in the BLAS routines lsame_() and xerbla_() */ #if !defined(__BLAS) || (0 != __BLAS) #include int lsame_(const char* ca, 
const char* cb) { if ( *ca == *cb ) return 1; if ( (*cb >= 'a') && (*cb <= 'z') ) { if ( *ca == *cb + 32 ) return 1; } else if ( (*cb >= 'A') && (*cb <= 'Z') ) { if ( *ca == *cb - 32 ) return 1; } return 0; } void xerbla_(const char* c, const int* info) { printf(" ** On entry to %s parameter number %02d had an illegal value\n", c, *info); } #endif libxsmm-1.17/samples/packed/trmm/dtrmm.f000066400000000000000000000165041415223013700202650ustar00rootroot00000000000000 SUBROUTINE DTRMM(SIDE,UPLO,TRANSA,DIAG,M,N,ALPHA,A,LDA,B,LDB) ! ! -- Reference BLAS level3 routine (version 3.7.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! December 2016 ! ! .. Scalar Arguments .. DOUBLE PRECISION ALPHA INTEGER LDA,LDB,M,N CHARACTER DIAG,SIDE,TRANSA,UPLO ! .. ! .. Array Arguments .. DOUBLE PRECISION A(LDA,*),B(LDB,*) ! .. ! ! ===================================================================== ! ! .. External Functions .. LOGICAL LSAME EXTERNAL LSAME ! .. ! .. External Subroutines .. EXTERNAL XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX ! .. ! .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I,INFO,J,K,NROWA LOGICAL LSIDE,NOUNIT,UPPER ! .. ! .. Parameters .. DOUBLE PRECISION ONE,ZERO PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) ! .. ! ! Test the input parameters. ! LSIDE = LSAME(SIDE,'L') IF (LSIDE) THEN NROWA = M ELSE NROWA = N END IF NOUNIT = LSAME(DIAG,'N') UPPER = LSAME(UPLO,'U') ! INFO = 0 IF ((.NOT.LSIDE) .AND. (.NOT.LSAME(SIDE,'R'))) THEN INFO = 1 ELSE IF ((.NOT.UPPER) .AND. (.NOT.LSAME(UPLO,'L'))) THEN INFO = 2 ELSE IF ((.NOT.LSAME(TRANSA,'N')) .AND. & (.NOT.LSAME(TRANSA,'T')) .AND. & (.NOT.LSAME(TRANSA,'C'))) THEN INFO = 3 ELSE IF ((.NOT.LSAME(DIAG,'U')) .AND. 
(.NOT.LSAME(DIAG,'N'))) THEN INFO = 4 ELSE IF (M.LT.0) THEN INFO = 5 ELSE IF (N.LT.0) THEN INFO = 6 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 9 ELSE IF (LDB.LT.MAX(1,M)) THEN INFO = 11 END IF IF (INFO.NE.0) THEN CALL XERBLA('DTRMM ',INFO) RETURN END IF ! ! Quick return if possible. ! IF (M.EQ.0 .OR. N.EQ.0) RETURN ! ! And when alpha.eq.zero. ! IF (ALPHA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M B(I,J) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF ! ! Start the operations. ! IF (LSIDE) THEN IF (LSAME(TRANSA,'N')) THEN ! ! Form B := alpha*A*B. ! IF (UPPER) THEN DO 50 J = 1,N DO 40 K = 1,M IF (B(K,J).NE.ZERO) THEN TEMP = ALPHA*B(K,J) DO 30 I = 1,K - 1 B(I,J) = B(I,J) + TEMP*A(I,K) 30 CONTINUE IF (NOUNIT) TEMP = TEMP*A(K,K) B(K,J) = TEMP END IF 40 CONTINUE 50 CONTINUE ELSE DO 80 J = 1,N DO 70 K = M,1,-1 IF (B(K,J).NE.ZERO) THEN TEMP = ALPHA*B(K,J) B(K,J) = TEMP IF (NOUNIT) B(K,J) = B(K,J)*A(K,K) DO 60 I = K + 1,M B(I,J) = B(I,J) + TEMP*A(I,K) 60 CONTINUE END IF 70 CONTINUE 80 CONTINUE END IF ELSE ! ! Form B := alpha*A**T*B. ! IF (UPPER) THEN DO 110 J = 1,N DO 100 I = M,1,-1 TEMP = B(I,J) IF (NOUNIT) TEMP = TEMP*A(I,I) DO 90 K = 1,I - 1 TEMP = TEMP + A(K,I)*B(K,J) 90 CONTINUE B(I,J) = ALPHA*TEMP 100 CONTINUE 110 CONTINUE ELSE DO 140 J = 1,N DO 130 I = 1,M TEMP = B(I,J) IF (NOUNIT) TEMP = TEMP*A(I,I) DO 120 K = I + 1,M TEMP = TEMP + A(K,I)*B(K,J) 120 CONTINUE B(I,J) = ALPHA*TEMP 130 CONTINUE 140 CONTINUE END IF END IF ELSE IF (LSAME(TRANSA,'N')) THEN ! ! Form B := alpha*B*A. ! 
IF (UPPER) THEN DO 180 J = N,1,-1 TEMP = ALPHA IF (NOUNIT) TEMP = TEMP*A(J,J) DO 150 I = 1,M B(I,J) = TEMP*B(I,J) 150 CONTINUE DO 170 K = 1,J - 1 IF (A(K,J).NE.ZERO) THEN TEMP = ALPHA*A(K,J) DO 160 I = 1,M B(I,J) = B(I,J) + TEMP*B(I,K) 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE ELSE DO 220 J = 1,N TEMP = ALPHA IF (NOUNIT) TEMP = TEMP*A(J,J) DO 190 I = 1,M B(I,J) = TEMP*B(I,J) 190 CONTINUE DO 210 K = J + 1,N IF (A(K,J).NE.ZERO) THEN TEMP = ALPHA*A(K,J) DO 200 I = 1,M B(I,J) = B(I,J) + TEMP*B(I,K) 200 CONTINUE END IF 210 CONTINUE 220 CONTINUE END IF ELSE ! ! Form B := alpha*B*A**T. ! IF (UPPER) THEN DO 260 K = 1,N DO 240 J = 1,K - 1 IF (A(J,K).NE.ZERO) THEN TEMP = ALPHA*A(J,K) DO 230 I = 1,M B(I,J) = B(I,J) + TEMP*B(I,K) 230 CONTINUE END IF 240 CONTINUE TEMP = ALPHA IF (NOUNIT) TEMP = TEMP*A(K,K) IF (TEMP.NE.ONE) THEN DO 250 I = 1,M B(I,K) = TEMP*B(I,K) 250 CONTINUE END IF 260 CONTINUE ELSE DO 300 K = N,1,-1 DO 280 J = K + 1,N IF (A(J,K).NE.ZERO) THEN TEMP = ALPHA*A(J,K) DO 270 I = 1,M B(I,J) = B(I,J) + TEMP*B(I,K) 270 CONTINUE END IF 280 CONTINUE TEMP = ALPHA IF (NOUNIT) TEMP = TEMP*A(K,K) IF (TEMP.NE.ONE) THEN DO 290 I = 1,M B(I,K) = TEMP*B(I,K) 290 CONTINUE END IF 300 CONTINUE END IF END IF END IF ! RETURN ! ! End of DTRMM . ! END libxsmm-1.17/samples/packed/trmm/strmm.f000066400000000000000000000164241415223013700203050ustar00rootroot00000000000000 SUBROUTINE STRMM(SIDE,UPLO,TRANSA,DIAG,M,N,ALPHA,A,LDA,B,LDB) ! ! -- Reference BLAS level3 routine (version 3.7.0) -- ! -- Reference BLAS is a software package provided by Univ. of Tennessee, -- ! -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- ! December 2016 ! ! .. Scalar Arguments .. REAL ALPHA INTEGER LDA,LDB,M,N CHARACTER DIAG,SIDE,TRANSA,UPLO ! .. ! .. Array Arguments .. REAL A(LDA,*),B(LDB,*) ! .. ! ! ===================================================================== ! ! .. External Functions .. LOGICAL LSAME EXTERNAL LSAME ! .. ! .. External Subroutines .. EXTERNAL XERBLA ! .. ! 
.. Intrinsic Functions .. INTRINSIC MAX ! .. ! .. Local Scalars .. REAL TEMP INTEGER I,INFO,J,K,NROWA LOGICAL LSIDE,NOUNIT,UPPER ! .. ! .. Parameters .. REAL ONE,ZERO PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) ! .. ! ! Test the input parameters. ! LSIDE = LSAME(SIDE,'L') IF (LSIDE) THEN NROWA = M ELSE NROWA = N END IF NOUNIT = LSAME(DIAG,'N') UPPER = LSAME(UPLO,'U') ! INFO = 0 IF ((.NOT.LSIDE) .AND. (.NOT.LSAME(SIDE,'R'))) THEN INFO = 1 ELSE IF ((.NOT.UPPER) .AND. (.NOT.LSAME(UPLO,'L'))) THEN INFO = 2 ELSE IF ((.NOT.LSAME(TRANSA,'N')) .AND. & (.NOT.LSAME(TRANSA,'T')) .AND. & (.NOT.LSAME(TRANSA,'C'))) THEN INFO = 3 ELSE IF ((.NOT.LSAME(DIAG,'U')) .AND. (.NOT.LSAME(DIAG,'N'))) THEN INFO = 4 ELSE IF (M.LT.0) THEN INFO = 5 ELSE IF (N.LT.0) THEN INFO = 6 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 9 ELSE IF (LDB.LT.MAX(1,M)) THEN INFO = 11 END IF IF (INFO.NE.0) THEN CALL XERBLA('STRMM ',INFO) RETURN END IF ! ! Quick return if possible. ! IF (M.EQ.0 .OR. N.EQ.0) RETURN ! ! And when alpha.eq.zero. ! IF (ALPHA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M B(I,J) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF ! ! Start the operations. ! IF (LSIDE) THEN IF (LSAME(TRANSA,'N')) THEN ! ! Form B := alpha*A*B. ! IF (UPPER) THEN DO 50 J = 1,N DO 40 K = 1,M IF (B(K,J).NE.ZERO) THEN TEMP = ALPHA*B(K,J) DO 30 I = 1,K - 1 B(I,J) = B(I,J) + TEMP*A(I,K) 30 CONTINUE IF (NOUNIT) TEMP = TEMP*A(K,K) B(K,J) = TEMP END IF 40 CONTINUE 50 CONTINUE ELSE DO 80 J = 1,N DO 70 K = M,1,-1 IF (B(K,J).NE.ZERO) THEN TEMP = ALPHA*B(K,J) B(K,J) = TEMP IF (NOUNIT) B(K,J) = B(K,J)*A(K,K) DO 60 I = K + 1,M B(I,J) = B(I,J) + TEMP*A(I,K) 60 CONTINUE END IF 70 CONTINUE 80 CONTINUE END IF ELSE ! ! Form B := alpha*A**T*B. ! 
IF (UPPER) THEN DO 110 J = 1,N DO 100 I = M,1,-1 TEMP = B(I,J) IF (NOUNIT) TEMP = TEMP*A(I,I) DO 90 K = 1,I - 1 TEMP = TEMP + A(K,I)*B(K,J) 90 CONTINUE B(I,J) = ALPHA*TEMP 100 CONTINUE 110 CONTINUE ELSE DO 140 J = 1,N DO 130 I = 1,M TEMP = B(I,J) IF (NOUNIT) TEMP = TEMP*A(I,I) DO 120 K = I + 1,M TEMP = TEMP + A(K,I)*B(K,J) 120 CONTINUE B(I,J) = ALPHA*TEMP 130 CONTINUE 140 CONTINUE END IF END IF ELSE IF (LSAME(TRANSA,'N')) THEN ! ! Form B := alpha*B*A. ! IF (UPPER) THEN DO 180 J = N,1,-1 TEMP = ALPHA IF (NOUNIT) TEMP = TEMP*A(J,J) DO 150 I = 1,M B(I,J) = TEMP*B(I,J) 150 CONTINUE DO 170 K = 1,J - 1 IF (A(K,J).NE.ZERO) THEN TEMP = ALPHA*A(K,J) DO 160 I = 1,M B(I,J) = B(I,J) + TEMP*B(I,K) 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE ELSE DO 220 J = 1,N TEMP = ALPHA IF (NOUNIT) TEMP = TEMP*A(J,J) DO 190 I = 1,M B(I,J) = TEMP*B(I,J) 190 CONTINUE DO 210 K = J + 1,N IF (A(K,J).NE.ZERO) THEN TEMP = ALPHA*A(K,J) DO 200 I = 1,M B(I,J) = B(I,J) + TEMP*B(I,K) 200 CONTINUE END IF 210 CONTINUE 220 CONTINUE END IF ELSE ! ! Form B := alpha*B*A**T. ! IF (UPPER) THEN DO 260 K = 1,N DO 240 J = 1,K - 1 IF (A(J,K).NE.ZERO) THEN TEMP = ALPHA*A(J,K) DO 230 I = 1,M B(I,J) = B(I,J) + TEMP*B(I,K) 230 CONTINUE END IF 240 CONTINUE TEMP = ALPHA IF (NOUNIT) TEMP = TEMP*A(K,K) IF (TEMP.NE.ONE) THEN DO 250 I = 1,M B(I,K) = TEMP*B(I,K) 250 CONTINUE END IF 260 CONTINUE ELSE DO 300 K = N,1,-1 DO 280 J = K + 1,N IF (A(J,K).NE.ZERO) THEN TEMP = ALPHA*A(J,K) DO 270 I = 1,M B(I,J) = B(I,J) + TEMP*B(I,K) 270 CONTINUE END IF 280 CONTINUE TEMP = ALPHA IF (NOUNIT) TEMP = TEMP*A(K,K) IF (TEMP.NE.ONE) THEN DO 290 I = 1,M B(I,K) = TEMP*B(I,K) 290 CONTINUE END IF 300 CONTINUE END IF END IF END IF ! RETURN ! ! End of STRMM . ! END libxsmm-1.17/samples/packed/trmm/trmm.c000066400000000000000000000616311415223013700201170ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. 
* * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Greg Henry, Hans Pabst, Alexander Heinecke (Intel Corp.) ******************************************************************************/ #if 0 #define USE_KERNEL_GENERATION_DIRECTLY #endif #if 0 #define USE_PREDEFINED_ASSEMBLY #define USE_XSMM_GENERATED #define TIME_MKL #endif #if 0 #define TEST_SINGLE #endif #if !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_XSMM_GENERATED) && !defined(TIME_MKL) && \ (!defined(__linux__) || !defined(USE_KERNEL_GENERATION_DIRECTLY)) # define USE_XSMM_GENERATED # include #else # include # include # include # include # include #endif #include #include #include #include #define BUFSIZE 32*32 #define BUFSIZE2 64000 #if 0 #define TRIANGLE_IS_IDENTITY #endif LIBXSMM_INLINE void dcopy_to_temp ( int layout, double *A, int lda, int m, int n, double *Atemp, unsigned int VLEN ) { int i, j; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); exit(-1); } if ( layout == 102 ) { /* printf("Column major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { Atemp[i+j*m] = A[i*VLEN+j*lda*VLEN]; } } #if EVENTUALLY_USE_THIS_LOOP_IT_SHOULD_BE_FASTER for ( j = 0; j < n; j++ ) { for ( i = 0, ia = 0; i < m; i++, ia+=VLEN ) { Atemp[i+j*m] = A[ ia+j*lda*VLEN ]; } } #endif } else { /* printf("Row major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ Atemp[i+j*m] = A[j*VLEN+i*lda*VLEN]; } } } } LIBXSMM_INLINE void scopy_to_temp ( int layout, float *A, int lda, int m, int n, float *Atemp, unsigned int VLEN ) { int i, j; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); exit(-1); } if ( layout == 102 ) { /* printf("Column major\n"); */ for ( j = 
0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { Atemp[i+j*m] = A[i*VLEN+j*lda*VLEN]; } } } else { /* printf("Row major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ Atemp[i+j*m] = A[j*VLEN+i*lda*VLEN]; } } } } LIBXSMM_INLINE void dcopy_from_temp ( int layout, double *A, int lda, int m, int n, double *Atemp, unsigned int VLEN ) { int i, j, ia; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); } if ( layout == 102 ) { for ( j = 0; j < n; j++ ) { for ( i = 0, ia = 0; i < m; i++, ia+=VLEN ) { A[ia+j*lda*VLEN] = Atemp[i+j*m]; } } } else { for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ A[j*VLEN+i*lda*VLEN] = Atemp[i+j*m]; } } } } LIBXSMM_INLINE void scopy_from_temp ( int layout, float *A, int lda, int m, int n, float *Atemp, unsigned int VLEN ) { int i, j, ia; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); } if ( layout == 102 ) { for ( j = 0; j < n; j++ ) { for ( i = 0, ia = 0; i < m; i++, ia+=VLEN ) { A[ia+j*lda*VLEN] = Atemp[i+j*m]; } } } else { for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ A[j*VLEN+i*lda*VLEN] = Atemp[i+j*m]; } } } } #if !defined(USE_MKL_FOR_REFERENCE) && !defined(LIBXSMM_NOFORTRAN) && (!defined(__BLAS) || (0 != __BLAS)) extern void dtrmm_(); /* Reference code for compact dtrmm. Note that this just copies data into a buffer from the compact storage and calls the regular dtrmm code. This is very naive reference code just used for testing purposes */ /* Note: if layout==101 (row major), then this code is known to only work when * nmat == VLEN. 
To check for accuracy otherwise, transpose everything */ LIBXSMM_INLINE void compact_dtrmm_ ( unsigned int *layout, char *side, char *uplo, char *transa, char *diag, unsigned int *m, unsigned int *n, double *alpha, double *A, unsigned int *lda, double *B, unsigned int *ldb, unsigned int *nmat, unsigned int *VLEN ) { int i, j, num, asize, offseta, offsetb; double *Ap, *Bp, Atemp[BUFSIZE], Btemp[BUFSIZE]; static int ntimes = 0; if ( ++ntimes < 3 ) printf("Inside reference compact_dtrmm_()\n"); if ( *layout == 102 ) { if ( (*side == 'L') || (*side == 'l') ) asize = *m; else asize = *n; offsetb = (*ldb)*(*n)*(*VLEN); } else { if ( (*side == 'L') || (*side == 'l') ) asize = *n; else asize = *m; offsetb = (*ldb)*(*m)*(*VLEN); } offseta = (*lda)*asize*(*VLEN); if ( ++ntimes < 3 ) printf("m/n=%u,%u layout=%u asize=%i VLEN=%u nmat=%u offseta=%i offsetb=%i\n",*m,*n,*layout, asize, *VLEN, *nmat, offseta, offsetb ); for ( i = 0, num = 0; i < (int)(*nmat); i+= *VLEN, num++ ) { for ( j = 0; j < (int)*VLEN; j++ ) { /* Unpack the data, call a reference DTRMM, repack the data */ Ap = &A[j+num*offseta]; Bp = &B[j+num*offsetb]; if (++ntimes < 15 ) printf("Doing a dtrmm at place i=%d j=%d num=%d Ap[%d]=%g Bp[%d]=%g\n",i,j,num,j+num*offseta,Ap[0],j+num*offsetb,Bp[0]); dcopy_to_temp ( *layout, Ap, *lda, asize, asize, Atemp, *VLEN ); dcopy_to_temp ( *layout, Bp, *ldb, *m, *n, Btemp, *VLEN ); dtrmm_ ( side, uplo, transa, diag, m, n, alpha, Atemp, &asize, Btemp, m); dcopy_from_temp ( *layout, Bp, *ldb, *m, *n, Btemp, *VLEN ); } } } extern void strmm_(); /* Reference code for compact strmm. Note that this just copies data into a buffer from the compact storage and calls the regular strmm code. This is very naive reference code just used for testing purposes */ /* Note: if layout==101 (row major), then this code is known to only work when * nmat == VLEN. 
To check for accuracy otherwise, transpose everything */ LIBXSMM_INLINE void compact_strmm_ ( unsigned int *layout, char *side, char *uplo, char *transa, char *diag, unsigned int *m, unsigned int *n, float *alpha, float *A, unsigned int *lda, float *B, unsigned int *ldb, unsigned int *nmat, unsigned int *VLEN ) { int i, j, num, asize; float *Ap, *Bp, Atemp[BUFSIZE], Btemp[BUFSIZE]; if ( (*side == 'L') || (*side == 'l') ) asize = *m; else asize = *n; for ( i = 0, num = 0; i < (int)(*nmat); i+= *VLEN, num++ ) { for ( j = 0; j < (int)*VLEN; j++ ) { /* Unpack the data, call a reference DTRMM, repack the data */ Ap = &A[j+num*(*lda)*asize*(*VLEN)]; Bp = &B[j+num*(*ldb)*(*n)*(*VLEN)]; scopy_to_temp ( *layout, Ap, *lda, asize, asize, Atemp, *VLEN ); scopy_to_temp ( *layout, Bp, *ldb, *m, *n, Btemp, *VLEN ); strmm_ ( side, uplo, transa, diag, m, n, alpha, Atemp, &asize, Btemp, m); scopy_from_temp ( *layout, Bp, *ldb, *m, *n, Btemp, *VLEN ); } } } #endif LIBXSMM_INLINE void dfill_matrix ( double *matrix, unsigned int ld, unsigned int m, unsigned int n ) { unsigned int i, j; double dtmp; if ( ld < m ) { fprintf(stderr,"Error in dfill_matrix: ld=%u m=%u mismatched!\n",ld,m); exit(-1); } for ( j = 1; j <= n; j++ ) { /* Fill through the leading dimension */ for ( i = 1; i <= ld; i++ ) { dtmp = 1.0 - 2.0*libxsmm_rng_f64(); matrix [ (j-1)*ld + (i-1) ] = dtmp; } } } LIBXSMM_INLINE void dfill_identity ( double *matrix, unsigned int ld, unsigned int m, unsigned int n, int VLEN, int number_of_cases ) { unsigned int h, i, j, k, ia; double dtmp; if ( ld < m ) { fprintf(stderr,"Error in dfill_identity: ld=%u m=%u mismatched!\n",ld,m); exit(-1); } for ( h = 0; h < (unsigned int)number_of_cases; h++ ) { ia = h*ld*n*VLEN; for ( j = 1; j <= n; j++ ) { for ( i = 1; i <= ld; i++ ) { if ( i == j ) dtmp = 1.0; else dtmp = 0.0; for ( k = 0; k < (unsigned int)VLEN; k++ ) matrix[ia++] = dtmp; } } } } LIBXSMM_INLINE void sfill_matrix ( float *matrix, unsigned int ld, unsigned int m, unsigned int n 
) { unsigned int i, j; double dtmp; if ( ld < m ) { fprintf(stderr,"Error is sfill_matrix: ld=%u m=%u mismatched!\n",ld,m); exit(-1); } for ( j = 1; j <= n; j++ ) { /* Fill through the leading dimension */ for ( i = 1; i <= ld; i++ ) { dtmp = 1.0 - 2.0*libxsmm_rng_f64(); matrix [ (j-1)*ld + (i-1) ] = (float) dtmp; } } } LIBXSMM_INLINE double residual_d ( double *A, unsigned int lda, unsigned int m, unsigned int n, double *B, unsigned int ldb, unsigned int *nerrs, unsigned int *ncorr ) { unsigned int i, j; double atmp, btmp, dtmp, ref, derror; static int ntimes = 0; *nerrs = 0; *ncorr = 0; derror = 0.0; for ( j = 1; j<= n; j++ ) { for ( i = 1; i <= m; i++ ) { atmp = A[ (j-1)*lda + (i-1)]; btmp = B[ (j-1)*ldb + (i-1)]; ref = LIBXSMM_MAX(atmp,-atmp); if ( atmp >= btmp ) { dtmp = atmp - btmp; } else { dtmp = btmp - atmp; } if ( isnan(dtmp) || isinf(dtmp) ) { if ( ++ntimes < 15 ) { printf("Denormal bug: A(%u,%u) is %g B(%u,%u) is %g\n",i,j,atmp,i,j,btmp); } } if ( dtmp / ref > 1.0e-12 ) { *nerrs = *nerrs + 1; if ( ++ntimes < 15 ) { printf("Bug #%i: A[%u]=A(%u,%u) expected=%g instead=%g err=%g\n",ntimes,(j-1)*lda+(i-1),i,j,atmp,btmp,dtmp); } } else { if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) ) { printf("Cor #%u: A[%u]=A(%u,%u) expected=%g\n",*ncorr+1,(j-1)*lda+(i-1),i,j,atmp); } *ncorr = *ncorr + 1; } derror += dtmp; } } return derror; } LIBXSMM_INLINE double residual_s ( float *A, unsigned int lda, unsigned int m, unsigned int n, float *B, unsigned int ldb, unsigned int *nerrs, unsigned int *ncorr ) { unsigned int i, j; double atmp, btmp, dtmp, ref, derror; static int ntimes = 0; *nerrs = 0; *ncorr = 0; derror = 0.0; for ( j = 1; j<= n; j++ ) { for ( i = 1; i <= m; i++ ) { atmp = (double) A[ (j-1)*lda + (i-1)]; btmp = (double) B[ (j-1)*ldb + (i-1)]; ref = LIBXSMM_MAX(atmp,-atmp); if ( atmp >= btmp ) { dtmp = atmp - btmp; } else { dtmp = btmp - atmp; } if ( isnan(dtmp) || isinf(dtmp) ) { if ( ++ntimes < 15 ) { printf("Denormal bug: A(%u,%u) is %g B(%u,%u) is 
%g\n",i,j,atmp,i,j,btmp); } } if ( dtmp / ref > 1.0e-4 ) { *nerrs = *nerrs + 1; if ( ++ntimes < 15 ) { printf("Bug #%d: A(%u,%u) expected=%g instead=%g err=%g\n",ntimes,i,j,atmp,btmp,dtmp); } } else { if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) ) { printf("Cor #%u: A(%u,%u) expected=%g\n",*ncorr+1,i,j,atmp); } *ncorr = *ncorr + 1; } derror += dtmp; } } return derror; } #ifdef USE_PREDEFINED_ASSEMBLY extern void trmm_(); #endif #ifdef MKL_TIMER extern double dsecnd_(); #endif int main(int argc, char* argv[]) { unsigned int m=8, n=8, lda=8, ldb=8, nerrs, num, nmat, nmats, nmatd, ntest; unsigned int layout, asize, VLEND=4, VLENS=8, bsize; unsigned int ncorr; unsigned int i, j; char side, uplo, trans, diag; float *sa, *sb, *sc, *sd; double *da, *db, *dc, *dd, *tmpbuf; double dalpha = 1.0; float salpha; double dtmp; const unsigned char *cptr = NULL; unsigned long op_count; unsigned int typesize8 = 8; const libxsmm_trmm_descriptor* desc8 = NULL; #ifdef TEST_SINGLE unsigned int typesize4 = 4; const libxsmm_trmm_descriptor* desc4 = NULL; #endif #ifdef USE_XSMM_GENERATED libxsmm_descriptor_blob blob; libxsmm_trmm_xfunction mykernel = NULL; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) void (*opcode_routine)(); unsigned char *routine_output; libxsmm_generated_code io_generated_code; int pagesize = sysconf(_SC_PAGE_SIZE); if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n"); routine_output = (unsigned char *) mmap(NULL, BUFSIZE2, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0,0); if (mprotect(routine_output, BUFSIZE2, PROT_EXEC | PROT_READ | PROT_WRITE ) == -1) fprintf(stderr,"mprotect\n"); printf("Routine ready\n"); io_generated_code.generated_code = &routine_output[0]; io_generated_code.buffer_size = BUFSIZE2; io_generated_code.code_size = 0; io_generated_code.code_type = 2; io_generated_code.last_error = 0; io_generated_Code.sf_size = 0; #endif if ( argc <= 3 ) { printf("\nUSAGE: %s m n lda ldb nmat side uplo trans diag layout 
ntest alpha\n",argv[0]); printf("Compact TRMM a mxn matrix of leading dimension ldb\n"); printf("This will test the jit of 1 VLEN work of nmat at a time\n"); printf("Defaults: m=n=lda=ldb=nmat=8, alpha=1.0, side=uplo='L',trans=diag='N',layout=102,ntest=1\n"); } if ( argc > 1 ) m = atoi(argv[1]); else m = 8; if ( argc > 2 ) n = atoi(argv[2]); else n = 8; if ( argc > 3 ) lda= atoi(argv[3]); else lda = 8; if ( argc > 4 ) ldb = atoi(argv[4]); else ldb = 8; if ( argc > 5 ) nmat = atoi(argv[5]); else nmat = 8; if ( argc > 6 ) side = argv[6][0]; else side = 'L'; if ( argc > 7 ) uplo = argv[7][0]; else uplo = 'L'; if ( argc > 8 ) trans = argv[8][0]; else trans = 'N'; if ( argc > 9 ) diag = argv[9][0]; else diag = 'N'; if ( argc > 10 ) layout = atoi(argv[10]); else layout=102; if ( argc > 11 ) ntest = atoi(argv[11]); else ntest = 1; if ( argc > 12 ) dalpha = atof(argv[12]); else dalpha = 1.0; salpha = (float)dalpha; m = LIBXSMM_MAX(m,1); n = LIBXSMM_MAX(n,1); /* A is either mxm or nxn depending on side */ if ( (side == 'L') || (side=='l') ) asize = m; else asize = n; lda = LIBXSMM_MAX(lda,asize); if ( layout == 102 ) { /* Column major: B is mxn, and stored in B format */ ldb = LIBXSMM_MAX(ldb,m); bsize = ldb*n; } else { /* Row major: B is mxn, and stored in B^T format */ ldb = LIBXSMM_MAX(ldb,n); bsize = ldb*m; } nmats = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS)); nmatd = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND)); nmat = LIBXSMM_MAX(nmats,nmatd); op_count = n * m * asize; printf("This is a real*%u tester for JIT compact TRMM kernels! 
(%c%c%c%c m=%u n=%u lda=%u ldb=%u layout=%u nmat=%u)\n",typesize8,side,uplo,trans,diag,m,n,lda,ldb,layout,nmat); #ifdef USE_XSMM_GENERATED printf("This code tests the LIBXSMM generated kernels\n"); #endif #ifdef USE_PREDEFINED_ASSEMBLY printf("This code tests some predefined assembly kernel\n"); #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) printf("This code tests kernel generation directly\n"); #endif #ifdef TIME_MKL printf("This code tests MKL compact batch directly\n"); #endif desc8 = libxsmm_trmm_descriptor_init(&blob, typesize8, m, n, lda, ldb, &dalpha, trans, diag, side, uplo, layout); #ifdef TEST_SINGLE desc4 = libxsmm_trmm_descriptor_init(&blob, typesize4, m, n, lda, ldb, &salpha, trans, diag, side, uplo, layout); #endif #ifdef USE_XSMM_GENERATED printf("calling libxsmm_dispatch_trmm: typesize8=%u\n",typesize8); mykernel = libxsmm_dispatch_trmm(desc8); printf("done calling libxsmm_dispatch_trmm: typesize8=%u\n",typesize8); if ( mykernel == NULL ) printf("R8 Kernel after the create call is null\n"); #ifdef TEST_SINGLE mykernel = libxsmm_dispatch_trmm(desc4); if ( mykernel == NULL ) printf("R4 kernel after the create call is null\n"); #endif #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) libxsmm_generator_trmm_kernel ( &io_generated_code, desc8, "hsw" ); #endif #ifndef NO_ACCURACY_CHECK printf("mallocing matrices\n"); #endif sa = (float *) malloc ( lda*asize*nmats*sizeof(float) ); da = (double *) malloc ( lda*asize*nmatd*sizeof(double) ); sb = (float *) malloc ( bsize*nmats*sizeof(float) ); db = (double *) malloc ( bsize*nmatd*sizeof(double) ); sc = (float *) malloc ( bsize*nmats*sizeof(float) ); dc = (double *) malloc ( bsize*nmatd*sizeof(double) ); sd = (float *) malloc ( bsize*nmats*sizeof(float) ); dd = (double *) malloc ( bsize*nmatd*sizeof(double) ); tmpbuf = (double *) malloc ( asize*VLEND*sizeof(double) ); #ifndef NO_ACCURACY_CHECK printf("filling matrices\n"); #endif sfill_matrix ( sa, lda, asize, 
asize*nmats ); #ifdef TRIANGLE_IS_IDENTITY printf("Warning: setting triangular matrix to identity. Not good for accuracy testing\n"); dfill_identity ( da, lda, asize, asize, VLEND, nmatd/VLEND ); #else dfill_matrix ( da, lda, asize, asize*nmatd ); #endif sfill_matrix ( sb, bsize, bsize, nmats ); dfill_matrix ( db, bsize, bsize, nmatd ); #ifndef NO_ACCURACY_CHECK for ( i = 0; i < (int)(bsize*nmats); i++ ) sc[i]=sb[i]; for ( i = 0; i < (int)(bsize*nmatd); i++ ) dc[i]=db[i]; for ( i = 0; i < (int)(bsize*nmats); i++ ) sd[i]=sb[i]; for ( i = 0; i < (int)(bsize*nmatd); i++ ) dd[i]=db[i]; printf("Pointing at the kernel now\n"); #endif #ifdef USE_XSMM_GENERATED cptr = (const unsigned char*)mykernel; #endif #ifdef USE_PREDEFINED_ASSEMBLY cptr = (const unsigned char*) trmm_; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) cptr = (const unsigned char*) &routine_output[0]; opcode_routine = (void *) &cptr[0]; #endif #ifndef TIME_MKL # define DUMP_ASSEMBLY_FILE #endif #ifdef DUMP_ASSEMBLY_FILE printf("Dumping assembly file\n"); FILE *fp = fopen("foo.s","w"); char buffer[80]; fputs("\t.text\n",fp); fputs("\t.align 256\n",fp); fputs("\t.globl trmm_\n",fp); fputs("trmm_:\n",fp); for (i = 0; i < 4000; i+=4 ) { sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]); fputs(buffer,fp); } fputs("\tretq\n",fp); fputs("\t.type trmm_,@function\n",fp); fputs("\t.size trmm_,.-trmm_\n",fp); fclose(fp); #endif #if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL) # include MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR; MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT; MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER; MKL_TRANSPOSE TRANSA = (trans == 'N' || trans == 'n') ? MKL_NOTRANS : MKL_TRANS; MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? 
MKL_NONUNIT : MKL_UNIT; MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact(); #if 0 MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX; #endif #endif #ifndef NO_ACCURACY_CHECK printf("Before routine, initial B(1,1)=%g B[256]=%g\n",db[0],db[256]); #endif #ifdef USE_PREDEFINED_ASSEMBLY double one = 1.0; #endif double timer, firsttime = 0; #ifdef MKL_TIMER double tmptimer; tmptimer = dsecnd_(); #else unsigned long long l_start, l_end; #endif timer = 0.0; for ( j = 0; j < (int)ntest; j++ ) { #ifndef TRIANGLE_IS_IDENTITY for ( i = 0; i < (int)(bsize*nmatd); i++ ) db[i]=dd[i]; #endif for ( i = 0 , num = 0; i < (int)nmatd; i+= (int)VLEND, num++ ) { double *Ap = &da[num*lda*asize*VLEND]; double *Bp = &db[num*bsize*VLEND]; #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #ifdef USE_XSMM_GENERATED mykernel ( Ap, Bp, tmpbuf ); #endif #ifdef USE_PREDEFINED_ASSEMBLY trmm_ ( Ap, Bp, &one ); #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) (*opcode_routine)( Ap, Bp ); #endif #ifdef TIME_MKL mkl_dtrmm_compact ( CLAYOUT, SIDE, UPLO, TRANSA, DIAG, m, n, dalpha, da, lda, db, ldb, CMP_FORMAT, nmatd ); i+=nmatd; /* Because MKL will do everything */ #endif #ifdef MKL_TIMER dtmp = dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); dtmp = libxsmm_timer_duration(l_start,l_end); #endif if ( j == 0 ) firsttime=dtmp; timer += dtmp; } } if ( ntest >= 100 ) { /* Skip the first timing: super necessary if using MKL */ timer = (timer-firsttime)/((double)(ntest-1)); } else { timer /= ((double)ntest); } #ifndef NO_ACCURACY_CHECK printf("Average time to get through %u matrices: %g\n",nmatd,timer); printf("Gflops: %g\n",(double)(op_count*nmatd)/(timer*1.0e9)); printf("after routine, new B(1,1)=%g B[256]=%g\n",db[0],db[256]); #endif #ifdef TEST_SINGLE printf("Before r4 routine, initial B(1,1)=%g B[256]=%g\n",sb[0],sb[256]); for ( i = 0 , num = 0; i < nmats; i+= VLENS, num++ ) { float *Ap = &sa[num*lda*asize*VLENS]; float *Bp = 
&sb[num*bsize*VLENS]; #ifdef USE_XSMM_GENERATED mykernel ( Ap, Bp, NULL ); #endif } printf("after r4 routine, new B(1,1)=%g B]256]=%g\n",db[0],db[256]); #endif #ifndef NO_ACCURACY_CHECK /* Call some reference code now on a copy of the B matrix (C) */ double timer2 = 0.0; for ( j = 0; j < (int)ntest; j++ ) { #ifndef TRIANGLE_IS_IDENTITY for ( i = 0; i < (int)(bsize*nmatd); i++ ) dc[i]=dd[i]; #endif #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #ifdef USE_MKL_FOR_REFERENCE mkl_dtrmm_compact ( CLAYOUT, SIDE, UPLO, TRANSA, DIAG, m, n, dalpha, da, lda, dc, ldb, CMP_FORMAT, nmatd ); #elif !defined(LIBXSMM_NOFORTRAN) && (!defined(__BLAS) || (0 != __BLAS)) if ( (layout == 101) && (nmatd!=VLEND) ) { unsigned int lay = 102, m1 = n, n1 = m; char side1='L', uplo1='L'; if ( side == 'L' || side == 'l' ) side1 = 'R'; if ( uplo == 'L' || uplo == 'l' ) uplo1 = 'U'; compact_dtrmm_ ( &lay, &side1, &uplo1, &trans, &diag, &m1, &n1, &dalpha, da, &lda, dc, &ldb, &nmatd, &VLEND ); } else { compact_dtrmm_ ( &layout, &side, &uplo, &trans, &diag, &m, &n, &dalpha, da, &lda, dc, &ldb, &nmatd, &VLEND ); } #endif #ifdef MKL_TIMER timer2 += dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); timer2 += libxsmm_timer_duration(l_start,l_end); #endif } timer2 /= ((double)ntest); printf("Reference time=%g Reference Gflops=%g\n",timer2,(op_count*nmatd)/(timer2*1.0e9)); /* Compute the residual between B and C */ dtmp = residual_d ( dc, bsize, bsize, nmatd, db, bsize, &nerrs, &ncorr ); printf("R8 %c%c%c%c m=%u n=%u lda=%u ldb=%u error: %g number of errors: %u corrects: %u",side,uplo,trans,diag,m,n,lda,ldb,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout); printf("\n"); #ifdef TEST_SINGLE /* Call some reference code now on a copy of the B matrix (C) */ compact_strmm_ ( &layout, &side, &uplo, &trans, &diag, &m, &n, &salpha, sa, &lda, sc, &ldb, &nmats, &VLENS ); /* Compute the residual between B and C */ dtmp = residual_s ( 
sc, bsize, bsize, nmats, sb, bsize, &nerrs, &ncorr ); printf("R4 %c%c%c%c m=%u n=%u lda=%u ldb=%u error: %g number of errors: %u corrects: %u\n",side,uplo,trans,diag,m,n,lda,ldb,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n); printf("\n"); #endif #else for ( j = 0, nerrs = 0; j < bsize*nmatd; j++ ) { if ( isnan(db[j]) || isinf(db[j]) ) { if ( ++nerrs < 10 ) { printf("WARNING: db[%d]=%g\n",j,db[j]); } } } printf("%g,real*8 %c%c%c%c m=%u n=%u lda=%u ldb=%u Denormals=%u Time=%g Gflops=%g",(op_count*nmatd)/(timer*1.0e9),side,uplo,trans,diag,m,n,lda,ldb,nerrs,timer,(op_count*nmatd)/(timer*1.0e9)); if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n); printf("\n"); #endif free(dd); free(sd); free(dc); free(sc); free(db); free(sb); free(da); free(sa); return 0; } libxsmm-1.17/samples/packed/trmm/trmm.vcxproj000066400000000000000000000547371415223013700214010ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 trmm {0AAD9667-E74F-4898-8CC0-68E31607BB41} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
$(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 
$(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console 
$(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/packed/trsm/000077500000000000000000000000001415223013700167735ustar00rootroot00000000000000libxsmm-1.17/samples/packed/trsm/Makefile000066400000000000000000000074731415223013700204460ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) # Fortran code here does not allow for PEDANTIC=2 # override PEDANTIC = 1 PEDANTIC = 0 BLAS = 0 OMP = 0 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) 
F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (,$(strip $(FC))) $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ endif $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(OBJECTS) $(FTNOBJS) $(LIBDEP) $(LIB_FLD) -o $@ $(OBJECTS) $(FTNOBJS) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean 
realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/packed/trsm/blas_aux.c000066400000000000000000000010171415223013700207340ustar00rootroot00000000000000/* Optionally link-in the BLAS routines lsame_() and xerbla_() */ #if !defined(__BLAS) || (0 != __BLAS) #include int lsame_(const char* ca, const char* cb) { if ( *ca == *cb ) return 1; if ( (*cb >= 'a') && (*cb <= 'z') ) { if ( *ca == *cb + 32 ) return 1; } else if ( (*cb >= 'A') && (*cb <= 'Z') ) { if ( *ca == *cb - 32 ) return 1; } return 0; } void xerbla_(const char* c, const int* info) { printf(" ** On entry to %s parameter number %02d had an illegal value\n", c, *info); } #endif libxsmm-1.17/samples/packed/trsm/dtrsm.f000066400000000000000000000276411415223013700203050ustar00rootroot00000000000000 SUBROUTINE DTRSM(SIDE,UPLO,TRANSA,DIAG,M,N,ALPHA,A,LDA,B,LDB) ! .. Scalar Arguments .. DOUBLE PRECISION ALPHA INTEGER LDA,LDB,M,N CHARACTER DIAG,SIDE,TRANSA,UPLO ! .. ! .. Array Arguments .. DOUBLE PRECISION A(LDA,*),B(LDB,*) ! .. ! ! Purpose ! ======= ! ! DTRSM solves one of the matrix equations ! ! op( A )*X = alpha*B, or X*op( A ) = alpha*B, ! ! where alpha is a scalar, X and B are m by n matrices, A is a unit, or ! non-unit, upper or lower triangular matrix and op( A ) is one of ! ! op( A ) = A or op( A ) = A'. ! ! The matrix X is overwritten on B. ! ! Arguments ! ========== ! ! SIDE - CHARACTER*1. ! On entry, SIDE specifies whether op( A ) appears on the left ! or right of X as follows: ! ! SIDE = 'L' or 'l' op( A )*X = alpha*B. ! ! SIDE = 'R' or 'r' X*op( A ) = alpha*B. ! ! Unchanged on exit. ! ! UPLO - CHARACTER*1. ! On entry, UPLO specifies whether the matrix A is an upper or ! lower triangular matrix as follows: ! ! 
UPLO = 'U' or 'u' A is an upper triangular matrix. ! ! UPLO = 'L' or 'l' A is a lower triangular matrix. ! ! Unchanged on exit. ! ! TRANSA - CHARACTER*1. ! On entry, TRANSA specifies the form of op( A ) to be used in ! the matrix multiplication as follows: ! ! TRANSA = 'N' or 'n' op( A ) = A. ! ! TRANSA = 'T' or 't' op( A ) = A'. ! ! TRANSA = 'C' or 'c' op( A ) = A'. ! ! Unchanged on exit. ! ! DIAG - CHARACTER*1. ! On entry, DIAG specifies whether or not A is unit triangular ! as follows: ! ! DIAG = 'U' or 'u' A is assumed to be unit triangular. ! ! DIAG = 'N' or 'n' A is not assumed to be unit ! triangular. ! ! Unchanged on exit. ! ! M - INTEGER. ! On entry, M specifies the number of rows of B. M must be at ! least zero. ! Unchanged on exit. ! ! N - INTEGER. ! On entry, N specifies the number of columns of B. N must be ! at least zero. ! Unchanged on exit. ! ! ALPHA - DOUBLE PRECISION. ! On entry, ALPHA specifies the scalar alpha. When alpha is ! zero then A is not referenced and B need not be set before ! entry. ! Unchanged on exit. ! ! A - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m ! when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. ! Before entry with UPLO = 'U' or 'u', the leading k by k ! upper triangular part of the array A must contain the upper ! triangular matrix and the strictly lower triangular part of ! A is not referenced. ! Before entry with UPLO = 'L' or 'l', the leading k by k ! lower triangular part of the array A must contain the lower ! triangular matrix and the strictly upper triangular part of ! A is not referenced. ! Note that when DIAG = 'U' or 'u', the diagonal elements of ! A are not referenced either, but are assumed to be unity. ! Unchanged on exit. ! ! LDA - INTEGER. ! On entry, LDA specifies the first dimension of A as declared ! in the calling (sub) program. When SIDE = 'L' or 'l' then ! LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' ! then LDA must be at least max( 1, n ). ! Unchanged on exit. ! ! 
B - DOUBLE PRECISION array of DIMENSION ( LDB, n ). ! Before entry, the leading m by n part of the array B must ! contain the right-hand side matrix B, and on exit is ! overwritten by the solution matrix X. ! ! LDB - INTEGER. ! On entry, LDB specifies the first dimension of B as declared ! in the calling (sub) program. LDB must be at least ! max( 1, m ). ! Unchanged on exit. ! ! ! Level 3 Blas routine. ! ! ! -- Written on 8-February-1989. ! Jack Dongarra, Argonne National Laboratory. ! Iain Duff, AERE Harwell. ! Jeremy Du Croz, Numerical Algorithms Group Ltd. ! Sven Hammarling, Numerical Algorithms Group Ltd. ! ! ! .. External Functions .. LOGICAL LSAME EXTERNAL LSAME ! .. ! .. External Subroutines .. EXTERNAL XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX ! .. ! .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I,INFO,J,K,NROWA LOGICAL LSIDE,NOUNIT,UPPER ! .. ! .. Parameters .. DOUBLE PRECISION ONE,ZERO PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) ! .. ! ! Test the input parameters. ! LSIDE = LSAME(SIDE,'L') IF (LSIDE) THEN NROWA = M ELSE NROWA = N END IF NOUNIT = LSAME(DIAG,'N') UPPER = LSAME(UPLO,'U') ! INFO = 0 IF ((.NOT.LSIDE) .AND. (.NOT.LSAME(SIDE,'R'))) THEN INFO = 1 ELSE IF ((.NOT.UPPER) .AND. (.NOT.LSAME(UPLO,'L'))) THEN INFO = 2 ELSE IF ((.NOT.LSAME(TRANSA,'N')) .AND. & (.NOT.LSAME(TRANSA,'T')) .AND. & (.NOT.LSAME(TRANSA,'C'))) THEN INFO = 3 ELSE IF ((.NOT.LSAME(DIAG,'U')) .AND. (.NOT.LSAME(DIAG,'N'))) THEN INFO = 4 ELSE IF (M.LT.0) THEN INFO = 5 ELSE IF (N.LT.0) THEN INFO = 6 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 9 ELSE IF (LDB.LT.MAX(1,M)) THEN INFO = 11 END IF IF (INFO.NE.0) THEN CALL XERBLA('DTRSM ',INFO) RETURN END IF ! ! Quick return if possible. ! IF (N.EQ.0) RETURN ! ! And when alpha.eq.zero. ! IF (ALPHA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M B(I,J) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF ! ! Start the operations. ! IF (LSIDE) THEN IF (LSAME(TRANSA,'N')) THEN ! ! Form B := alpha*inv( A )*B. ! 
IF (UPPER) THEN DO 60 J = 1,N IF (ALPHA.NE.ONE) THEN DO 30 I = 1,M B(I,J) = ALPHA*B(I,J) 30 CONTINUE END IF DO 50 K = M,1,-1 IF (B(K,J).NE.ZERO) THEN IF (NOUNIT) B(K,J) = B(K,J)/A(K,K) DO 40 I = 1,K - 1 B(I,J) = B(I,J) - B(K,J)*A(I,K) 40 CONTINUE END IF 50 CONTINUE 60 CONTINUE ELSE DO 100 J = 1,N IF (ALPHA.NE.ONE) THEN DO 70 I = 1,M B(I,J) = ALPHA*B(I,J) 70 CONTINUE END IF DO 90 K = 1,M IF (B(K,J).NE.ZERO) THEN IF (NOUNIT) B(K,J) = B(K,J)/A(K,K) DO 80 I = K + 1,M B(I,J) = B(I,J) - B(K,J)*A(I,K) 80 CONTINUE END IF 90 CONTINUE 100 CONTINUE END IF ELSE ! ! Form B := alpha*inv( A' )*B. ! IF (UPPER) THEN DO 130 J = 1,N DO 120 I = 1,M TEMP = ALPHA*B(I,J) DO 110 K = 1,I - 1 TEMP = TEMP - A(K,I)*B(K,J) 110 CONTINUE IF (NOUNIT) TEMP = TEMP/A(I,I) B(I,J) = TEMP 120 CONTINUE 130 CONTINUE ELSE DO 160 J = 1,N DO 150 I = M,1,-1 TEMP = ALPHA*B(I,J) DO 140 K = I + 1,M TEMP = TEMP - A(K,I)*B(K,J) 140 CONTINUE IF (NOUNIT) TEMP = TEMP/A(I,I) B(I,J) = TEMP 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF (LSAME(TRANSA,'N')) THEN ! ! Form B := alpha*B*inv( A ). ! IF (UPPER) THEN DO 210 J = 1,N IF (ALPHA.NE.ONE) THEN DO 170 I = 1,M B(I,J) = ALPHA*B(I,J) 170 CONTINUE END IF DO 190 K = 1,J - 1 IF (A(K,J).NE.ZERO) THEN DO 180 I = 1,M B(I,J) = B(I,J) - A(K,J)*B(I,K) 180 CONTINUE END IF 190 CONTINUE IF (NOUNIT) THEN TEMP = ONE/A(J,J) DO 200 I = 1,M B(I,J) = TEMP*B(I,J) 200 CONTINUE END IF 210 CONTINUE ELSE DO 260 J = N,1,-1 IF (ALPHA.NE.ONE) THEN DO 220 I = 1,M B(I,J) = ALPHA*B(I,J) 220 CONTINUE END IF DO 240 K = J + 1,N IF (A(K,J).NE.ZERO) THEN DO 230 I = 1,M B(I,J) = B(I,J) - A(K,J)*B(I,K) 230 CONTINUE END IF 240 CONTINUE IF (NOUNIT) THEN TEMP = ONE/A(J,J) DO 250 I = 1,M B(I,J) = TEMP*B(I,J) 250 CONTINUE END IF 260 CONTINUE END IF ELSE ! ! Form B := alpha*B*inv( A' ). ! 
IF (UPPER) THEN DO 310 K = N,1,-1 IF (NOUNIT) THEN TEMP = ONE/A(K,K) DO 270 I = 1,M B(I,K) = TEMP*B(I,K) 270 CONTINUE END IF DO 290 J = 1,K - 1 IF (A(J,K).NE.ZERO) THEN TEMP = A(J,K) DO 280 I = 1,M B(I,J) = B(I,J) - TEMP*B(I,K) 280 CONTINUE END IF 290 CONTINUE IF (ALPHA.NE.ONE) THEN DO 300 I = 1,M B(I,K) = ALPHA*B(I,K) 300 CONTINUE END IF 310 CONTINUE ELSE DO 360 K = 1,N IF (NOUNIT) THEN TEMP = ONE/A(K,K) DO 320 I = 1,M B(I,K) = TEMP*B(I,K) 320 CONTINUE END IF DO 340 J = K + 1,N IF (A(J,K).NE.ZERO) THEN TEMP = A(J,K) DO 330 I = 1,M B(I,J) = B(I,J) - TEMP*B(I,K) 330 CONTINUE END IF 340 CONTINUE IF (ALPHA.NE.ONE) THEN DO 350 I = 1,M B(I,K) = ALPHA*B(I,K) 350 CONTINUE END IF 360 CONTINUE END IF END IF END IF ! RETURN ! ! End of DTRSM . ! END libxsmm-1.17/samples/packed/trsm/strsm.f000066400000000000000000000276001415223013700203170ustar00rootroot00000000000000 SUBROUTINE STRSM(SIDE,UPLO,TRANSA,DIAG,M,N,ALPHA,A,LDA,B,LDB) ! .. Scalar Arguments .. REAL*4 ALPHA INTEGER LDA,LDB,M,N CHARACTER DIAG,SIDE,TRANSA,UPLO ! .. ! .. Array Arguments .. REAL*4 A(LDA,*),B(LDB,*) ! .. ! ! Purpose ! ======= ! ! STRSM solves one of the matrix equations ! ! op( A )*X = alpha*B, or X*op( A ) = alpha*B, ! ! where alpha is a scalar, X and B are m by n matrices, A is a unit, or ! non-unit, upper or lower triangular matrix and op( A ) is one of ! ! op( A ) = A or op( A ) = A'. ! ! The matrix X is overwritten on B. ! ! Arguments ! ========== ! ! SIDE - CHARACTER*1. ! On entry, SIDE specifies whether op( A ) appears on the left ! or right of X as follows: ! ! SIDE = 'L' or 'l' op( A )*X = alpha*B. ! ! SIDE = 'R' or 'r' X*op( A ) = alpha*B. ! ! Unchanged on exit. ! ! UPLO - CHARACTER*1. ! On entry, UPLO specifies whether the matrix A is an upper or ! lower triangular matrix as follows: ! ! UPLO = 'U' or 'u' A is an upper triangular matrix. ! ! UPLO = 'L' or 'l' A is a lower triangular matrix. ! ! Unchanged on exit. ! ! TRANSA - CHARACTER*1. ! 
On entry, TRANSA specifies the form of op( A ) to be used in ! the matrix multiplication as follows: ! ! TRANSA = 'N' or 'n' op( A ) = A. ! ! TRANSA = 'T' or 't' op( A ) = A'. ! ! TRANSA = 'C' or 'c' op( A ) = A'. ! ! Unchanged on exit. ! ! DIAG - CHARACTER*1. ! On entry, DIAG specifies whether or not A is unit triangular ! as follows: ! ! DIAG = 'U' or 'u' A is assumed to be unit triangular. ! ! DIAG = 'N' or 'n' A is not assumed to be unit ! triangular. ! ! Unchanged on exit. ! ! M - INTEGER. ! On entry, M specifies the number of rows of B. M must be at ! least zero. ! Unchanged on exit. ! ! N - INTEGER. ! On entry, N specifies the number of columns of B. N must be ! at least zero. ! Unchanged on exit. ! ! ALPHA - REAL*4 ! On entry, ALPHA specifies the scalar alpha. When alpha is ! zero then A is not referenced and B need not be set before ! entry. ! Unchanged on exit. ! ! A - REAL*4 array of DIMENSION ( LDA, k ), where k is m ! when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. ! Before entry with UPLO = 'U' or 'u', the leading k by k ! upper triangular part of the array A must contain the upper ! triangular matrix and the strictly lower triangular part of ! A is not referenced. ! Before entry with UPLO = 'L' or 'l', the leading k by k ! lower triangular part of the array A must contain the lower ! triangular matrix and the strictly upper triangular part of ! A is not referenced. ! Note that when DIAG = 'U' or 'u', the diagonal elements of ! A are not referenced either, but are assumed to be unity. ! Unchanged on exit. ! ! LDA - INTEGER. ! On entry, LDA specifies the first dimension of A as declared ! in the calling (sub) program. When SIDE = 'L' or 'l' then ! LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' ! then LDA must be at least max( 1, n ). ! Unchanged on exit. ! ! B - REAL*4 array of DIMENSION ( LDB, n ). ! Before entry, the leading m by n part of the array B must ! contain the right-hand side matrix B, and on exit is ! 
overwritten by the solution matrix X. ! ! LDB - INTEGER. ! On entry, LDB specifies the first dimension of B as declared ! in the calling (sub) program. LDB must be at least ! max( 1, m ). ! Unchanged on exit. ! ! ! Level 3 Blas routine. ! ! ! -- Written on 8-February-1989. ! Jack Dongarra, Argonne National Laboratory. ! Iain Duff, AERE Harwell. ! Jeremy Du Croz, Numerical Algorithms Group Ltd. ! Sven Hammarling, Numerical Algorithms Group Ltd. ! ! ! .. External Functions .. LOGICAL LSAME EXTERNAL LSAME ! .. ! .. External Subroutines .. EXTERNAL XERBLA ! .. ! .. Intrinsic Functions .. INTRINSIC MAX ! .. ! .. Local Scalars .. REAL*4 TEMP INTEGER I,INFO,J,K,NROWA LOGICAL LSIDE,NOUNIT,UPPER ! .. ! .. Parameters .. REAL*4 ONE,ZERO PARAMETER (ONE=1.0+0,ZERO=0.0+0) ! .. ! ! Test the input parameters. ! LSIDE = LSAME(SIDE,'L') IF (LSIDE) THEN NROWA = M ELSE NROWA = N END IF NOUNIT = LSAME(DIAG,'N') UPPER = LSAME(UPLO,'U') ! INFO = 0 IF ((.NOT.LSIDE) .AND. (.NOT.LSAME(SIDE,'R'))) THEN INFO = 1 ELSE IF ((.NOT.UPPER) .AND. (.NOT.LSAME(UPLO,'L'))) THEN INFO = 2 ELSE IF ((.NOT.LSAME(TRANSA,'N')) .AND. & (.NOT.LSAME(TRANSA,'T')) .AND. & (.NOT.LSAME(TRANSA,'C'))) THEN INFO = 3 ELSE IF ((.NOT.LSAME(DIAG,'U')) .AND. (.NOT.LSAME(DIAG,'N'))) THEN INFO = 4 ELSE IF (M.LT.0) THEN INFO = 5 ELSE IF (N.LT.0) THEN INFO = 6 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 9 ELSE IF (LDB.LT.MAX(1,M)) THEN INFO = 11 END IF IF (INFO.NE.0) THEN CALL XERBLA('STRSM ',INFO) RETURN END IF ! ! Quick return if possible. ! IF (N.EQ.0) RETURN ! ! And when alpha.eq.zero. ! IF (ALPHA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M B(I,J) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF ! ! Start the operations. ! IF (LSIDE) THEN IF (LSAME(TRANSA,'N')) THEN ! ! Form B := alpha*inv( A )*B. ! 
IF (UPPER) THEN DO 60 J = 1,N IF (ALPHA.NE.ONE) THEN DO 30 I = 1,M B(I,J) = ALPHA*B(I,J) 30 CONTINUE END IF DO 50 K = M,1,-1 IF (B(K,J).NE.ZERO) THEN IF (NOUNIT) B(K,J) = B(K,J)/A(K,K) DO 40 I = 1,K - 1 B(I,J) = B(I,J) - B(K,J)*A(I,K) 40 CONTINUE END IF 50 CONTINUE 60 CONTINUE ELSE DO 100 J = 1,N IF (ALPHA.NE.ONE) THEN DO 70 I = 1,M B(I,J) = ALPHA*B(I,J) 70 CONTINUE END IF DO 90 K = 1,M IF (B(K,J).NE.ZERO) THEN IF (NOUNIT) B(K,J) = B(K,J)/A(K,K) DO 80 I = K + 1,M B(I,J) = B(I,J) - B(K,J)*A(I,K) 80 CONTINUE END IF 90 CONTINUE 100 CONTINUE END IF ELSE ! ! Form B := alpha*inv( A' )*B. ! IF (UPPER) THEN DO 130 J = 1,N DO 120 I = 1,M TEMP = ALPHA*B(I,J) DO 110 K = 1,I - 1 TEMP = TEMP - A(K,I)*B(K,J) 110 CONTINUE IF (NOUNIT) TEMP = TEMP/A(I,I) B(I,J) = TEMP 120 CONTINUE 130 CONTINUE ELSE DO 160 J = 1,N DO 150 I = M,1,-1 TEMP = ALPHA*B(I,J) DO 140 K = I + 1,M TEMP = TEMP - A(K,I)*B(K,J) 140 CONTINUE IF (NOUNIT) TEMP = TEMP/A(I,I) B(I,J) = TEMP 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF (LSAME(TRANSA,'N')) THEN ! ! Form B := alpha*B*inv( A ). ! IF (UPPER) THEN DO 210 J = 1,N IF (ALPHA.NE.ONE) THEN DO 170 I = 1,M B(I,J) = ALPHA*B(I,J) 170 CONTINUE END IF DO 190 K = 1,J - 1 IF (A(K,J).NE.ZERO) THEN DO 180 I = 1,M B(I,J) = B(I,J) - A(K,J)*B(I,K) 180 CONTINUE END IF 190 CONTINUE IF (NOUNIT) THEN TEMP = ONE/A(J,J) DO 200 I = 1,M B(I,J) = TEMP*B(I,J) 200 CONTINUE END IF 210 CONTINUE ELSE DO 260 J = N,1,-1 IF (ALPHA.NE.ONE) THEN DO 220 I = 1,M B(I,J) = ALPHA*B(I,J) 220 CONTINUE END IF DO 240 K = J + 1,N IF (A(K,J).NE.ZERO) THEN DO 230 I = 1,M B(I,J) = B(I,J) - A(K,J)*B(I,K) 230 CONTINUE END IF 240 CONTINUE IF (NOUNIT) THEN TEMP = ONE/A(J,J) DO 250 I = 1,M B(I,J) = TEMP*B(I,J) 250 CONTINUE END IF 260 CONTINUE END IF ELSE ! ! Form B := alpha*B*inv( A' ). ! 
IF (UPPER) THEN DO 310 K = N,1,-1 IF (NOUNIT) THEN TEMP = ONE/A(K,K) DO 270 I = 1,M B(I,K) = TEMP*B(I,K) 270 CONTINUE END IF DO 290 J = 1,K - 1 IF (A(J,K).NE.ZERO) THEN TEMP = A(J,K) DO 280 I = 1,M B(I,J) = B(I,J) - TEMP*B(I,K) 280 CONTINUE END IF 290 CONTINUE IF (ALPHA.NE.ONE) THEN DO 300 I = 1,M B(I,K) = ALPHA*B(I,K) 300 CONTINUE END IF 310 CONTINUE ELSE DO 360 K = 1,N IF (NOUNIT) THEN TEMP = ONE/A(K,K) DO 320 I = 1,M B(I,K) = TEMP*B(I,K) 320 CONTINUE END IF DO 340 J = K + 1,N IF (A(J,K).NE.ZERO) THEN TEMP = A(J,K) DO 330 I = 1,M B(I,J) = B(I,J) - TEMP*B(I,K) 330 CONTINUE END IF 340 CONTINUE IF (ALPHA.NE.ONE) THEN DO 350 I = 1,M B(I,K) = ALPHA*B(I,K) 350 CONTINUE END IF 360 CONTINUE END IF END IF END IF ! RETURN ! ! End of STRSM . ! END libxsmm-1.17/samples/packed/trsm/trsm.c000066400000000000000000000613301415223013700201270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Greg Henry, Hans Pabst, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #if 0 #define USE_KERNEL_GENERATION_DIRECTLY #endif #if 0 #define USE_PREDEFINED_ASSEMBLY #define USE_XSMM_GENERATED #define TIME_MKL #endif #if 0 #define TEST_SINGLE #endif #if !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_XSMM_GENERATED) && !defined(TIME_MKL) && \ (!defined(__linux__) || !defined(USE_KERNEL_GENERATION_DIRECTLY)) # define USE_XSMM_GENERATED # include #else # include # include # include # include # include #endif #include #include #include #include #define BUFSIZE 32*32 #define BUFSIZE2 64000 #if 0 #define TRIANGLE_IS_IDENTITY #endif LIBXSMM_INLINE void dcopy_to_temp ( int layout, double *A, int lda, int m, int n, double *Atemp, unsigned int VLEN ) { int i, j; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); exit(-1); } if ( layout == 102 ) { /* printf("Column major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { Atemp[i+j*m] = A[i*VLEN+j*lda*VLEN]; } } #if EVENTUALLY_USE_THIS_LOOP_IT_SHOULD_BE_FASTER for ( j = 0; j < n; j++ ) { for ( i = 0, ia = 0; i < m; i++, ia+=VLEN ) { Atemp[i+j*m] = A[ ia+j*lda*VLEN ]; } } #endif } else { /* printf("Row major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ Atemp[i+j*m] = A[j*VLEN+i*lda*VLEN]; } } } } LIBXSMM_INLINE void scopy_to_temp ( int layout, float *A, int lda, int m, int n, float *Atemp, unsigned int VLEN ) { int i, j; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); exit(-1); } if ( layout == 102 ) { /* printf("Column major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { Atemp[i+j*m] = A[i*VLEN+j*lda*VLEN]; } } } else { /* printf("Row major\n"); */ for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ Atemp[i+j*m] = A[j*VLEN+i*lda*VLEN]; } } } } LIBXSMM_INLINE void dcopy_from_temp ( int layout, double *A, int lda, int m, int n, double *Atemp, 
unsigned int VLEN ) { int i, j, ia; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); } if ( layout == 102 ) { for ( j = 0; j < n; j++ ) { for ( i = 0, ia = 0; i < m; i++, ia+=VLEN ) { A[ia+j*lda*VLEN] = Atemp[i+j*m]; } } } else { for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ A[j*VLEN+i*lda*VLEN] = Atemp[i+j*m]; } } } } LIBXSMM_INLINE void scopy_from_temp ( int layout, float *A, int lda, int m, int n, float *Atemp, unsigned int VLEN ) { int i, j, ia; if ( lda*n > BUFSIZE ) { printf("Reference routine not set up for matrices so large\n"); } if ( layout == 102 ) { for ( j = 0; j < n; j++ ) { for ( i = 0, ia = 0; i < m; i++, ia+=VLEN ) { A[ia+j*lda*VLEN] = Atemp[i+j*m]; } } } else { for ( j = 0; j < n; j++ ) { for ( i = 0; i < m; i++ ) { /* Transpose the data */ A[j*VLEN+i*lda*VLEN] = Atemp[i+j*m]; } } } } #if !defined(USE_MKL_FOR_REFERENCE) && !defined(LIBXSMM_NOFORTRAN) && (!defined(__BLAS) || (0 != __BLAS)) extern void dtrsm_(); /* Reference code for compact dtrsm. Note that this just copies data into a buffer from the compact storage and calls the regular dtrsm code. This is very naive reference code just used for testing purposes */ /* Note: if layout==101 (row major), then this code is known to only work when * nmat == VLEN. 
To check for accuracy otherwise, transpose everything */ LIBXSMM_INLINE void compact_dtrsm_ ( unsigned int *layout, char *side, char *uplo, char *transa, char *diag, unsigned int *m, unsigned int *n, double *alpha, double *A, unsigned int *lda, double *B, unsigned int *ldb, unsigned int *nmat, unsigned int *VLEN ) { int i, j, num, asize, offseta, offsetb; double *Ap, *Bp, Atemp[BUFSIZE], Btemp[BUFSIZE]; static int ntimes = 0; if ( ++ntimes < 3 ) printf("Inside reference compact_dtrsm_()\n"); if ( *layout == 102 ) { if ( (*side == 'L') || (*side == 'l') ) asize = *m; else asize = *n; offsetb = (*ldb)*(*n)*(*VLEN); } else { if ( (*side == 'L') || (*side == 'l') ) asize = *n; else asize = *m; offsetb = (*ldb)*(*m)*(*VLEN); } offseta = (*lda)*asize*(*VLEN); if ( ++ntimes < 3 ) printf("m/n=%u,%u layout=%u asize=%i VLEN=%u nmat=%u offseta=%i offsetb=%i\n",*m,*n,*layout, asize, *VLEN, *nmat, offseta, offsetb ); for ( i = 0, num = 0; i < (int)(*nmat); i+= *VLEN, num++ ) { for ( j = 0; j < (int)*VLEN; j++ ) { /* Unpack the data, call a reference DTRSM, repack the data */ Ap = &A[j+num*offseta]; Bp = &B[j+num*offsetb]; if (++ntimes < 15 ) printf("Doing a dtrsm at place i=%d j=%d num=%d Ap[%d]=%g Bp[%d]=%g\n",i,j,num,j+num*offseta,Ap[0],j+num*offsetb,Bp[0]); dcopy_to_temp ( *layout, Ap, *lda, asize, asize, Atemp, *VLEN ); dcopy_to_temp ( *layout, Bp, *ldb, *m, *n, Btemp, *VLEN ); dtrsm_ ( side, uplo, transa, diag, m, n, alpha, Atemp, &asize, Btemp, m); dcopy_from_temp ( *layout, Bp, *ldb, *m, *n, Btemp, *VLEN ); } } } extern void strsm_(); /* Reference code for compact strsm. Note that this just copies data into a buffer from the compact storage and calls the regular strsm code. This is very naive reference code just used for testing purposes */ /* Note: if layout==101 (row major), then this code is known to only work when * nmat == VLEN. 
To check for accuracy otherwise, transpose everything */ LIBXSMM_INLINE void compact_strsm_ ( unsigned int *layout, char *side, char *uplo, char *transa, char *diag, unsigned int *m, unsigned int *n, float *alpha, float *A, unsigned int *lda, float *B, unsigned int *ldb, unsigned int *nmat, unsigned int *VLEN ) { int i, j, num, asize; float *Ap, *Bp, Atemp[BUFSIZE], Btemp[BUFSIZE]; if ( (*side == 'L') || (*side == 'l') ) asize = *m; else asize = *n; for ( i = 0, num = 0; i < (int)(*nmat); i+= *VLEN, num++ ) { for ( j = 0; j < (int)*VLEN; j++ ) { /* Unpack the data, call a reference DTRSM, repack the data */ Ap = &A[j+num*(*lda)*asize*(*VLEN)]; Bp = &B[j+num*(*ldb)*(*n)*(*VLEN)]; scopy_to_temp ( *layout, Ap, *lda, asize, asize, Atemp, *VLEN ); scopy_to_temp ( *layout, Bp, *ldb, *m, *n, Btemp, *VLEN ); strsm_ ( side, uplo, transa, diag, m, n, alpha, Atemp, &asize, Btemp, m); scopy_from_temp ( *layout, Bp, *ldb, *m, *n, Btemp, *VLEN ); } } } #endif LIBXSMM_INLINE void dfill_matrix ( double *matrix, unsigned int ld, unsigned int m, unsigned int n ) { unsigned int i, j; double dtmp; if ( ld < m ) { fprintf(stderr,"Error in dfill_matrix: ld=%u m=%u mismatched!\n",ld,m); exit(-1); } for ( j = 1; j <= n; j++ ) { /* Fill through the leading dimension */ for ( i = 1; i <= ld; i++ ) { dtmp = 1.0 - 2.0*libxsmm_rng_f64(); matrix [ (j-1)*ld + (i-1) ] = dtmp; } } } LIBXSMM_INLINE void dfill_identity ( double *matrix, unsigned int ld, unsigned int m, unsigned int n, int VLEN, int number_of_cases ) { unsigned int h, i, j, k, ia; double dtmp; if ( ld < m ) { fprintf(stderr,"Error in dfill_identity: ld=%u m=%u mismatched!\n",ld,m); exit(-1); } for ( h = 0; h < (unsigned int)number_of_cases; h++ ) { ia = h*ld*n*VLEN; for ( j = 1; j <= n; j++ ) { for ( i = 1; i <= ld; i++ ) { if ( i == j ) dtmp = 1.0; else dtmp = 0.0; for ( k = 0; k < (unsigned int)VLEN; k++ ) matrix[ia++] = dtmp; } } } } LIBXSMM_INLINE void sfill_matrix ( float *matrix, unsigned int ld, unsigned int m, unsigned int n 
) { unsigned int i, j; double dtmp; if ( ld < m ) { fprintf(stderr,"Error is sfill_matrix: ld=%u m=%u mismatched!\n",ld,m); exit(-1); } for ( j = 1; j <= n; j++ ) { /* Fill through the leading dimension */ for ( i = 1; i <= ld; i++ ) { dtmp = 1.0 - 2.0*libxsmm_rng_f64(); matrix [ (j-1)*ld + (i-1) ] = (float) dtmp; } } } LIBXSMM_INLINE double residual_d ( double *A, unsigned int lda, unsigned int m, unsigned int n, double *B, unsigned int ldb, unsigned int *nerrs, unsigned int *ncorr ) { unsigned int i, j; double atmp, btmp, dtmp, ref, derror; static int ntimes = 0; *nerrs = 0; *ncorr = 0; derror = 0.0; for ( j = 1; j<= n; j++ ) { for ( i = 1; i <= m; i++ ) { atmp = A[ (j-1)*lda + (i-1)]; btmp = B[ (j-1)*ldb + (i-1)]; ref = LIBXSMM_MAX(atmp,-atmp); if ( atmp >= btmp ) { dtmp = atmp - btmp; } else { dtmp = btmp - atmp; } if ( isnan(dtmp) || isinf(dtmp) ) { if ( ++ntimes < 15 ) { printf("Denormal bug: A(%u,%u) is %g B(%u,%u) is %g\n",i,j,atmp,i,j,btmp); } } if ( dtmp / ref > 1.0e-12 ) { *nerrs = *nerrs + 1; if ( ++ntimes < 15 ) { printf("Bug #%i: A[%u]=A(%u,%u) expected=%g instead=%g err=%g\n",ntimes,(j-1)*lda+(i-1),i,j,atmp,btmp,dtmp); } } else { if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) ) { printf("Cor #%u: A[%u]=A(%u,%u) expected=%g\n",*ncorr+1,(j-1)*lda+(i-1),i,j,atmp); } *ncorr = *ncorr + 1; } derror += dtmp; } } return derror; } LIBXSMM_INLINE double residual_s ( float *A, unsigned int lda, unsigned int m, unsigned int n, float *B, unsigned int ldb, unsigned int *nerrs, unsigned int *ncorr ) { unsigned int i, j; double atmp, btmp, dtmp, ref, derror; static int ntimes = 0; *nerrs = 0; *ncorr = 0; derror = 0.0; for ( j = 1; j<= n; j++ ) { for ( i = 1; i <= m; i++ ) { atmp = (double) A[ (j-1)*lda + (i-1)]; btmp = (double) B[ (j-1)*ldb + (i-1)]; ref = LIBXSMM_MAX(atmp,-atmp); if ( atmp >= btmp ) { dtmp = atmp - btmp; } else { dtmp = btmp - atmp; } if ( isnan(dtmp) || isinf(dtmp) ) { if ( ++ntimes < 15 ) { printf("Denormal bug: A(%u,%u) is %g B(%u,%u) is 
%g\n",i,j,atmp,i,j,btmp); } } if ( dtmp / ref > 1.0e-4 ) { *nerrs = *nerrs + 1; if ( ++ntimes < 15 ) { printf("Bug #%d: A(%u,%u) expected=%g instead=%g err=%g\n",ntimes,i,j,atmp,btmp,dtmp); } } else { if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) ) { printf("Cor #%u: A(%u,%u) expected=%g\n",*ncorr+1,i,j,atmp); } *ncorr = *ncorr + 1; } derror += dtmp; } } return derror; } #ifdef USE_PREDEFINED_ASSEMBLY extern void trsm_xct_(); #endif #ifdef MKL_TIMER extern double dsecnd_(); #endif int main(int argc, char* argv[]) { unsigned int m=8, n=8, lda=8, ldb=8, nerrs, num, nmat, nmats, nmatd, ntest; unsigned int layout, asize, VLEND=4, VLENS=8, bsize; unsigned int ncorr; unsigned int i, j; char side, uplo, trans, diag; float *sa, *sb, *sc, *sd; double *da, *db, *dc, *dd, *tmpbuf; double dalpha = 1.0; float salpha; double dtmp; const unsigned char *cptr = NULL; unsigned long op_count; unsigned int typesize8 = 8; const libxsmm_trsm_descriptor* desc8 = NULL; #ifdef TEST_SINGLE unsigned int typesize4 = 4; const libxsmm_trsm_descriptor* desc4 = NULL; #endif #ifdef USE_XSMM_GENERATED libxsmm_descriptor_blob blob; libxsmm_trsm_xfunction mykernel = NULL; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) void (*opcode_routine)(); unsigned char *routine_output; libxsmm_generated_code io_generated_code; int pagesize = sysconf(_SC_PAGE_SIZE); if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n"); routine_output = (unsigned char *) mmap(NULL, BUFSIZE2, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0,0); if (mprotect(routine_output, BUFSIZE2, PROT_EXEC | PROT_READ | PROT_WRITE ) == -1) fprintf(stderr,"mprotect\n"); printf("Routine ready\n"); io_generated_code.generated_code = &routine_output[0]; io_generated_code.buffer_size = BUFSIZE2; io_generated_code.code_size = 0; io_generated_code.code_type = 2; io_generated_code.last_error = 0; io_generated_code.sf_size = 0; #endif if ( argc <= 3 ) { printf("\nUSAGE: %s m n lda ldb nmat side uplo trans diag layout 
ntest alpha\n",argv[0]); printf("Compact TRSM a mxn matrix of leading dimension ldb\n"); printf("This will test the jit of 1 VLEN work of nmat at a time\n"); printf("Defaults: m=n=lda=ldb=nmat=8, alpha=1.0, side=uplo='L',trans=diag='N',layout=102,ntest=1\n"); } if ( argc > 1 ) m = atoi(argv[1]); else m = 8; if ( argc > 2 ) n = atoi(argv[2]); else n = 8; if ( argc > 3 ) lda= atoi(argv[3]); else lda = 8; if ( argc > 4 ) ldb = atoi(argv[4]); else ldb = 8; if ( argc > 5 ) nmat = atoi(argv[5]); else nmat = 8; if ( argc > 6 ) side = argv[6][0]; else side = 'L'; if ( argc > 7 ) uplo = argv[7][0]; else uplo = 'L'; if ( argc > 8 ) trans = argv[8][0]; else trans = 'N'; if ( argc > 9 ) diag = argv[9][0]; else diag = 'N'; if ( argc > 10 ) layout = atoi(argv[10]); else layout=102; if ( argc > 11 ) ntest = atoi(argv[11]); else ntest = 1; if ( argc > 12 ) dalpha = atof(argv[12]); else dalpha = 1.0; salpha = (float)dalpha; m = LIBXSMM_MAX(m,1); n = LIBXSMM_MAX(n,1); /* A is either mxm or nxn depending on side */ if ( (side == 'L') || (side=='l') ) asize = m; else asize = n; lda = LIBXSMM_MAX(lda,asize); if ( layout == 102 ) { /* Column major: B is mxn, and stored in B format */ ldb = LIBXSMM_MAX(ldb,m); bsize = ldb*n; } else { /* Row major: B is mxn, and stored in B^T format */ ldb = LIBXSMM_MAX(ldb,n); bsize = ldb*m; } nmats = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS)); nmatd = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND)); nmat = LIBXSMM_MAX(nmats,nmatd); op_count = n * m * asize; printf("This is a real*%u tester for JIT compact TRSM kernels! 
(%c%c%c%c m=%u n=%u lda=%u ldb=%u layout=%u nmat=%u)\n",typesize8,side,uplo,trans,diag,m,n,lda,ldb,layout,nmat); #ifdef USE_XSMM_GENERATED printf("This code tests the LIBXSMM generated kernels\n"); #endif #ifdef USE_PREDEFINED_ASSEMBLY printf("This code tests some predefined assembly kernel\n"); #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) printf("This code tests kernel generation directly\n"); #endif #ifdef TIME_MKL printf("This code tests MKL compact batch directly\n"); #endif desc8 = libxsmm_trsm_descriptor_init(&blob, typesize8, m, n, lda, ldb, &dalpha, trans, diag, side, uplo, layout); #ifdef TEST_SINGLE desc4 = libxsmm_trsm_descriptor_init(&blob, typesize4, m, n, lda, ldb, &salpha, trans, diag, side, uplo, layout); #endif #ifdef USE_XSMM_GENERATED printf("calling libxsmm_dispatch_trsm: typesize8=%u\n",typesize8); mykernel = libxsmm_dispatch_trsm(desc8); printf("done calling libxsmm_dispatch_trsm: typesize8=%u\n",typesize8); if ( mykernel == NULL ) printf("R8 Kernel after the create call is null\n"); #ifdef TEST_SINGLE mykernel = libxsmm_dispatch_trsm(desc4); if ( mykernel == NULL ) printf("R4 kernel after the create call is null\n"); #endif #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) libxsmm_generator_trsm_kernel ( &io_generated_code, desc8, "hsw" ); #endif #ifndef NO_ACCURACY_CHECK printf("mallocing matrices\n"); #endif sa = (float *) malloc ( lda*asize*nmats*sizeof(float) ); da = (double *) malloc ( lda*asize*nmatd*sizeof(double) ); sb = (float *) malloc ( bsize*nmats*sizeof(float) ); db = (double *) malloc ( bsize*nmatd*sizeof(double) ); sc = (float *) malloc ( bsize*nmats*sizeof(float) ); dc = (double *) malloc ( bsize*nmatd*sizeof(double) ); sd = (float *) malloc ( bsize*nmats*sizeof(float) ); dd = (double *) malloc ( bsize*nmatd*sizeof(double) ); tmpbuf = (double *) malloc ( asize*VLEND*sizeof(double) ); #ifndef NO_ACCURACY_CHECK printf("filling matrices\n"); #endif sfill_matrix ( sa, lda, asize, 
asize*nmats ); #ifdef TRIANGLE_IS_IDENTITY printf("Warning: setting triangular matrix to identity. Not good for accuracy testing\n"); dfill_identity ( da, lda, asize, asize, VLEND, nmatd/VLEND ); #else dfill_matrix ( da, lda, asize, asize*nmatd ); #endif sfill_matrix ( sb, bsize, bsize, nmats ); dfill_matrix ( db, bsize, bsize, nmatd ); #ifndef NO_ACCURACY_CHECK for ( i = 0; i < (int)(bsize*nmats); i++ ) sc[i]=sb[i]; for ( i = 0; i < (int)(bsize*nmatd); i++ ) dc[i]=db[i]; for ( i = 0; i < (int)(bsize*nmats); i++ ) sd[i]=sb[i]; for ( i = 0; i < (int)(bsize*nmatd); i++ ) dd[i]=db[i]; printf("Pointing at the kernel now\n"); #endif #ifdef USE_XSMM_GENERATED cptr = (const unsigned char*) mykernel; #endif #ifdef USE_PREDEFINED_ASSEMBLY cptr = (const unsigned char*) trsm_xct_; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) cptr = (const unsigned char*) &routine_output[0]; opcode_routine = (void *) &cptr[0]; #endif #ifndef TIME_MKL # define DUMP_ASSEMBLY_FILE #endif #ifdef DUMP_ASSEMBLY_FILE printf("Dumping assembly file\n"); FILE *fp = fopen("foo.s","w"); char buffer[80]; fputs("\t.text\n",fp); fputs("\t.align 256\n",fp); fputs("\t.globl trsm_xct_\n",fp); fputs("trsm_xct_:\n",fp); for (i = 0; i < 4000; i+=4 ) { sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]); fputs(buffer,fp); } fputs("\tretq\n",fp); fputs("\t.type trsm_xct_,@function\n",fp); fputs("\t.size trsm_xct_,.-trsm_xct_\n",fp); fclose(fp); #endif #if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL) # include MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR; MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT; MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER; MKL_TRANSPOSE TRANSA = (trans == 'N' || trans == 'n') ? MKL_NOTRANS : MKL_TRANS; MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? 
MKL_NONUNIT : MKL_UNIT; MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact(); #if 0 MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX; #endif #endif #ifndef NO_ACCURACY_CHECK printf("Before routine, initial B(1,1)=%g B[256]=%g\n",db[0],db[256]); #endif #ifdef USE_PREDEFINED_ASSEMBLY double one = 1.0; #endif double timer; #ifdef MKL_TIMER double tmptimer; tmptimer = dsecnd_(); #else unsigned long long l_start, l_end; #endif timer = 0.0; for ( j = 0; j < (int)ntest; j++ ) { #ifndef TRIANGLE_IS_IDENTITY for ( i = 0; i < (int)(bsize*nmatd); i++ ) db[i]=dd[i]; #endif for ( i = 0 , num = 0; i < (int)nmatd; i+= (int)VLEND, num++ ) { double *Ap = &da[num*lda*asize*VLEND]; double *Bp = &db[num*bsize*VLEND]; #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #ifdef USE_XSMM_GENERATED mykernel ( Ap, Bp, tmpbuf ); #endif #ifdef USE_PREDEFINED_ASSEMBLY trsm_xct_ ( Ap, Bp, &one ); #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) (*opcode_routine)( Ap, Bp ); #endif #ifdef TIME_MKL mkl_dtrsm_compact ( CLAYOUT, SIDE, UPLO, TRANSA, DIAG, m, n, dalpha, da, lda, db, ldb, CMP_FORMAT, nmatd ); i+=nmatd; /* Because MKL will do everything */ #endif #ifdef MKL_TIMER timer += dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); timer += libxsmm_timer_duration(l_start,l_end); #endif } } timer /= ((double)ntest); #ifndef NO_ACCURACY_CHECK printf("Average time to get through %u matrices: %g\n",nmatd,timer); printf("Gflops: %g\n",(double)(op_count*nmatd)/(timer*1.0e9)); printf("after routine, new B(1,1)=%g B[256]=%g\n",db[0],db[256]); #endif #ifdef TEST_SINGLE printf("Before r4 routine, initial B(1,1)=%g B[256]=%g\n",sb[0],sb[256]); for ( i = 0 , num = 0; i < nmats; i+= VLENS, num++ ) { float *Ap = &sa[num*lda*asize*VLENS]; float *Bp = &sb[num*bsize*VLENS]; #ifdef USE_XSMM_GENERATED mykernel ( Ap, Bp, NULL ); #endif } printf("after r4 routine, new B(1,1)=%g B]256]=%g\n",db[0],db[256]); #endif #ifndef NO_ACCURACY_CHECK /* Call some 
reference code now on a copy of the B matrix (C) */ double timer2 = 0.0; for ( j = 0; j < (int)ntest; j++ ) { #ifndef TRIANGLE_IS_IDENTITY for ( i = 0; i < (int)(bsize*nmatd); i++ ) dc[i]=dd[i]; #endif #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #ifdef USE_MKL_FOR_REFERENCE mkl_dtrsm_compact ( CLAYOUT, SIDE, UPLO, TRANSA, DIAG, m, n, dalpha, da, lda, dc, ldb, CMP_FORMAT, nmatd ); #elif !defined(LIBXSMM_NOFORTRAN) && (!defined(__BLAS) || (0 != __BLAS)) if ( (layout == 101) && (nmatd!=VLEND) ) { unsigned int lay = 102, m1 = n, n1 = m; char side1='L', uplo1='L'; if ( side == 'L' || side == 'l' ) side1 = 'R'; if ( uplo == 'L' || uplo == 'l' ) uplo1 = 'U'; compact_dtrsm_ ( &lay, &side1, &uplo1, &trans, &diag, &m1, &n1, &dalpha, da, &lda, dc, &ldb, &nmatd, &VLEND ); } else { compact_dtrsm_ ( &layout, &side, &uplo, &trans, &diag, &m, &n, &dalpha, da, &lda, dc, &ldb, &nmatd, &VLEND ); } #endif #ifdef MKL_TIMER timer2 += dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); timer2 += libxsmm_timer_duration(l_start,l_end); #endif } timer2 /= ((double)ntest); printf("Reference time=%g Reference Gflops=%g\n",timer2,(op_count*nmatd)/(timer2*1.0e9)); /* Compute the residual between B and C */ dtmp = residual_d ( dc, bsize, bsize, nmatd, db, bsize, &nerrs, &ncorr ); printf("R8 %c%c%c%c m=%u n=%u lda=%u ldb=%u error: %g number of errors: %u corrects: %u",side,uplo,trans,diag,m,n,lda,ldb,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout); printf("\n"); #ifdef TEST_SINGLE /* Call some reference code now on a copy of the B matrix (C) */ compact_strsm_ ( &layout, &side, &uplo, &trans, &diag, &m, &n, &salpha, sa, &lda, sc, &ldb, &nmats, &VLENS ); /* Compute the residual between B and C */ dtmp = residual_s ( sc, bsize, bsize, nmats, sb, bsize, &nerrs, &ncorr ); printf("R4 %c%c%c%c m=%u n=%u lda=%u ldb=%u error: %g number of errors: %u corrects: %u\n",side,uplo,trans,diag,m,n,lda,ldb,dtmp,nerrs,ncorr); 
if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n); printf("\n"); #endif #else for ( j = 0, nerrs = 0; j < bsize*nmatd; j++ ) { if ( isnan(db[j]) || isinf(db[j]) ) { if ( ++nerrs < 10 ) { printf("WARNING: db[%d]=%g\n",j,db[j]); } } } printf("%g,real*8 %c%c%c%c m=%u n=%u lda=%u ldb=%u Denormals=%u Time=%g Gflops=%g",(op_count*nmatd)/(timer*1.0e9),side,uplo,trans,diag,m,n,lda,ldb,nerrs,timer,(op_count*nmatd)/(timer*1.0e9)); if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n); printf("\n"); #endif free(dd); free(sd); free(dc); free(sc); free(db); free(sb); free(da); free(sa); return 0; } libxsmm-1.17/samples/packed/trsm/trsm.vcxproj000066400000000000000000000547371415223013700214150ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 trsm {3D150C8C-0FF4-47D5-A466-C3D76A5FB183} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) 
$(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
libxsmm.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console 
libxsmm-1.17/samples/pyfr/000077500000000000000000000000001415223013700155375ustar00rootroot00000000000000libxsmm-1.17/samples/pyfr/Makefile000066400000000000000000000047661415223013700172140ustar00rootroot00000000000000 # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of Intel Corporation nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ################################################################################## # Alexander Heinecke (Intel Corp.) 
################################################################################## AVX?=3 CC=icc CFLAGS=-qopenmp -O2 -mkl -I./../../include/ -c LDFLAGS=-mkl -L./../../lib -lxsmm ifeq ($(AVX), 1) CXXFLAGS+=-mavx endif ifeq ($(AVX), 2) CXXFLAGS+=-xCORE-AVX2 -fma endif ifeq ($(AVX), 3) CXXFLAGS+=-xCOMMON-AVX512 -fma endif default: pyfr_gemm_cm pyfr_gemm_rm pyfr_driver_asp_reg pyfr_gemm_cm.o: pyfr_gemm_cm.c $(CC) $(CFLAGS) pyfr_gemm_cm.c pyfr_gemm_rm.o: pyfr_gemm_rm.c $(CC) $(CFLAGS) pyfr_gemm_rm.c pyfr_driver_asp_reg.o : pyfr_driver_asp_reg.c $(CC) $(CFLAGS) pyfr_driver_asp_reg.c pyfr_gemm_cm: pyfr_gemm_cm.o $(CC) pyfr_gemm_cm.o $(LDFLAGS) -o pyfr_gemm_cm pyfr_gemm_rm: pyfr_gemm_rm.o $(CC) pyfr_gemm_rm.o $(LDFLAGS) -o pyfr_gemm_rm pyfr_driver_asp_reg: pyfr_driver_asp_reg.o $(CC) pyfr_driver_asp_reg.o $(LDFLAGS) -mkl -o pyfr_driver_asp_reg clean: rm -rf *.o kernel.c rm -rf pyfr_gemm_cm pyfr_gemm_rm libxsmm-1.17/samples/pyfr/pyfr_download_mats.sh000077500000000000000000000025521415223013700217750ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) MKDIR=$(command -v mkdir) WGET=$(command -v wget) DATASET="p1 p2 p3 p4 p5 p6" KINDS="hex pri quad tet tri" FILES="m0-de m0-sp m132-de m132-sp m3-de m3-sp m460-de m460-sp m6-de m6-sp" if [ "${MKDIR}" ] && [ "${WGET}" ]; then ${MKDIR} -p ${HERE}/mats; cd ${HERE}/mats for DATA in ${DATASET}; do mkdir ${DATA}; cd ${DATA} for KIND in ${KINDS}; do mkdir ${KIND}; cd ${KIND} for FILE in ${FILES}; do ${WGET} -N https://github.com/hfp/libxsmm/raw/master/samples/pyfr/mats/${DATA}/${KIND}/${FILE}.mtx done cd .. done cd .. done fi libxsmm-1.17/samples/pyfr/pyfr_driver_asp_reg.c000066400000000000000000000354061415223013700217460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include #include #include #include #include #include #define REPS 100 #define REALTYPE double static double sec(struct timeval start, struct timeval end) { return ((double)(((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)))) / 1.0e6; } int my_csr_reader( const char* i_csr_file_in, unsigned int** o_row_idx, unsigned int** o_column_idx, REALTYPE** o_values, unsigned int* o_row_count, unsigned int* o_column_count, unsigned int* o_element_count ) { FILE *l_csr_file_handle; const unsigned int l_line_length = 512; char l_line[512/*l_line_length*/+1]; unsigned int l_header_read = 0; unsigned int* l_row_idx_id = NULL; unsigned int l_i = 0; l_csr_file_handle = fopen( i_csr_file_in, "r" ); if ( l_csr_file_handle == NULL ) { fprintf( stderr, "cannot open CSR file!\n" ); return -1; } while (fgets(l_line, l_line_length, l_csr_file_handle) != NULL) { if ( strlen(l_line) == l_line_length ) { fprintf( stderr, "could not read file length!\n" ); return -1; } /* check if we are still reading comments header */ if ( l_line[0] == '%' ) { continue; } else { /* if we are the first line after comment header, we allocate our data structures */ if ( l_header_read == 0 ) { if (3 == sscanf(l_line, "%u %u %u", o_row_count, o_column_count, o_element_count) && 0 != *o_row_count && 0 != *o_column_count && 0 != *o_element_count) { /* allocate CSC datastructure matching mtx file */ *o_column_idx = (unsigned int*) malloc(sizeof(unsigned int) * (*o_element_count)); *o_row_idx = (unsigned int*) malloc(sizeof(unsigned int) * (*o_row_count + 1)); *o_values = (REALTYPE*) malloc(sizeof(double) * (*o_element_count)); l_row_idx_id = (unsigned int*) malloc(sizeof(unsigned int) * (*o_row_count)); /* check if mallocs were successful */ if ( ( *o_row_idx == NULL ) || ( *o_column_idx == NULL ) || ( *o_values == NULL ) || ( l_row_idx_id == NULL ) ) { fprintf( stderr, "could not allocate sp data!\n" ); 
return -1; } /* set everything to zero for init */ memset(*o_row_idx, 0, sizeof(unsigned int)*(*o_row_count + 1)); memset(*o_column_idx, 0, sizeof(unsigned int)*(*o_element_count)); memset(*o_values, 0, sizeof(double)*(*o_element_count)); memset(l_row_idx_id, 0, sizeof(unsigned int)*(*o_row_count)); /* init column idx */ for ( l_i = 0; l_i < (*o_row_count + 1); l_i++) (*o_row_idx)[l_i] = (*o_element_count); /* init */ (*o_row_idx)[0] = 0; l_i = 0; l_header_read = 1; } else { fprintf( stderr, "could not csr description!\n" ); return -1; } /* now we read the actual content */ } else { unsigned int l_row, l_column; REALTYPE l_value; /* read a line of content */ if ( sscanf(l_line, "%u %u %lf", &l_row, &l_column, &l_value) != 3 ) { fprintf( stderr, "could not read element!\n" ); return -1; } /* adjust numbers to zero termination */ l_row--; l_column--; /* add these values to row and value structure */ (*o_column_idx)[l_i] = l_column; (*o_values)[l_i] = l_value; l_i++; /* handle columns, set id to own for this column, yeah we need to handle empty columns */ l_row_idx_id[l_row] = 1; (*o_row_idx)[l_row+1] = l_i; } } } /* close mtx file */ fclose( l_csr_file_handle ); /* check if we read a file which was consistent */ if ( l_i != (*o_element_count) ) { fprintf( stderr, "we were not able to read all elements!\n" ); return -1; } /* let's handle empty rows */ for ( l_i = 0; l_i < (*o_row_count); l_i++) { if ( l_row_idx_id[l_i] == 0 ) { (*o_row_idx)[l_i+1] = (*o_row_idx)[l_i]; } } /* free helper data structure */ if ( l_row_idx_id != NULL ) { free( l_row_idx_id ); } return 0; } int main(int argc, char* argv[]) { char* l_csr_file; REALTYPE* l_a_sp; unsigned int* l_rowptr; unsigned int* l_colidx; unsigned int l_rowcount, l_colcount, l_elements; REALTYPE* l_a_dense; REALTYPE* l_b; REALTYPE* l_c_betaone; REALTYPE* l_c_betazero; REALTYPE* l_c_gold_betaone; REALTYPE* l_c_gold_betazero; REALTYPE* l_c_dense_betaone; REALTYPE* l_c_dense_betazero; REALTYPE l_max_error = 0.0; unsigned 
int l_m; unsigned int l_n; unsigned int l_k; unsigned int l_i; unsigned int l_j; unsigned int l_z; unsigned int l_elems; unsigned int l_reps; unsigned int l_n_block; struct timeval l_start, l_end; double l_total; double alpha = 1.0; double beta = 1.0; char trans = 'N'; libxsmm_dfsspmdm* gemm_op_betazero = NULL; libxsmm_dfsspmdm* gemm_op_betaone = NULL; if (argc != 4) { fprintf( stderr, "need csr-filename N reps!\n" ); exit(-1); } /* read sparse A */ l_csr_file = argv[1]; l_n = atoi(argv[2]); l_reps = atoi(argv[3]); if (my_csr_reader( l_csr_file, &l_rowptr, &l_colidx, &l_a_sp, &l_rowcount, &l_colcount, &l_elements ) != 0 ) { exit(-1); } l_m = l_rowcount; l_k = l_colcount; printf("CSR matrix data structure we just read:\n"); printf("rows: %u, columns: %u, elements: %u\n", l_rowcount, l_colcount, l_elements); /* allocate dense matrices */ l_a_dense = (REALTYPE*)_mm_malloc(l_k * l_m * sizeof(REALTYPE), 64); l_b = (REALTYPE*)_mm_malloc(l_k * l_n * sizeof(REALTYPE), 64); l_c_betazero = (REALTYPE*)_mm_malloc(l_m * l_n * sizeof(REALTYPE), 64); l_c_betaone = (REALTYPE*)_mm_malloc(l_m * l_n * sizeof(REALTYPE), 64); l_c_gold_betazero = (REALTYPE*)_mm_malloc(l_m * l_n * sizeof(REALTYPE), 64); l_c_gold_betaone = (REALTYPE*)_mm_malloc(l_m * l_n * sizeof(REALTYPE), 64); l_c_dense_betazero = (REALTYPE*)_mm_malloc(l_m * l_n * sizeof(REALTYPE), 64); l_c_dense_betaone = (REALTYPE*)_mm_malloc(l_m * l_n * sizeof(REALTYPE), 64); /* touch B */ for ( l_i = 0; l_i < l_k*l_n; l_i++) { l_b[l_i] = (REALTYPE)libxsmm_rng_f64(); } /* touch dense A */ for ( l_i = 0; l_i < l_k*l_m; l_i++) { l_a_dense[l_i] = (REALTYPE)0.0; } /* init dense A using sparse A */ for ( l_i = 0; l_i < l_m; l_i++ ) { l_elems = l_rowptr[l_i+1] - l_rowptr[l_i]; for ( l_z = 0; l_z < l_elems; l_z++ ) { l_a_dense[(l_i*l_k)+l_colidx[l_rowptr[l_i]+l_z]] = l_a_sp[l_rowptr[l_i]+l_z]; } } /* touch C */ for ( l_i = 0; l_i < l_m*l_n; l_i++) { l_c_gold_betaone[l_i] = (REALTYPE)libxsmm_rng_f64(); } for ( l_i = 0; l_i < l_m*l_n; l_i++) 
{ l_c_betaone[l_i] = l_c_gold_betaone[l_i]; } for ( l_i = 0; l_i < l_m*l_n; l_i++) { l_c_dense_betaone[l_i] = l_c_gold_betaone[l_i]; } for ( l_i = 0; l_i < l_m*l_n; l_i++) { l_c_betazero[l_i] = l_c_betaone[l_i]; } for ( l_i = 0; l_i < l_m*l_n; l_i++) { l_c_gold_betazero[l_i] = l_c_gold_betaone[l_i]; } for ( l_i = 0; l_i < l_m*l_n; l_i++) { l_c_dense_betazero[l_i] = l_c_dense_betaone[l_i]; } /* setting up fsspmdm */ l_n_block = 48; beta = 0.0; gemm_op_betazero = libxsmm_dfsspmdm_create( l_m, l_n_block, l_k, l_k, l_n, l_n, 1.0, beta, 1, l_a_dense ); beta = 1.0; gemm_op_betaone = libxsmm_dfsspmdm_create( l_m, l_n_block, l_k, l_k, l_n, l_n, 1.0, beta, 0, l_a_dense ); /* compute golden results */ printf("computing golden solution...\n"); for ( l_j = 0; l_j < l_n; l_j++ ) { for (l_i = 0; l_i < l_m; l_i++ ) { l_elems = l_rowptr[l_i+1] - l_rowptr[l_i]; l_c_gold_betazero[(l_n*l_i) + l_j] = 0.0; for (l_z = 0; l_z < l_elems; l_z++) { l_c_gold_betazero[(l_n*l_i) + l_j] += l_a_sp[l_rowptr[l_i]+l_z] * l_b[(l_n*l_colidx[l_rowptr[l_i]+l_z])+l_j]; } } } for ( l_j = 0; l_j < l_n; l_j++ ) { for (l_i = 0; l_i < l_m; l_i++ ) { l_elems = l_rowptr[l_i+1] - l_rowptr[l_i]; for (l_z = 0; l_z < l_elems; l_z++) { l_c_gold_betaone[(l_n*l_i) + l_j] += l_a_sp[l_rowptr[l_i]+l_z] * l_b[(l_n*l_colidx[l_rowptr[l_i]+l_z])+l_j]; } } } printf("...done!\n"); /* libxsmm generated code */ printf("computing libxsmm (A sparse) solution...\n"); #ifdef _OPENMP #pragma omp parallel for private(l_z) #endif for (l_z = 0; l_z < l_n; l_z+=l_n_block) { libxsmm_dfsspmdm_execute( gemm_op_betazero, l_b+l_z, l_c_betazero+l_z ); } #ifdef _OPENMP #pragma omp parallel for private(l_z) #endif for (l_z = 0; l_z < l_n; l_z+=l_n_block) { libxsmm_dfsspmdm_execute( gemm_op_betaone, l_b+l_z, l_c_betaone+l_z ); } printf("...done!\n"); /* BLAS code */ printf("computing BLAS (A dense) solution...\n"); beta = 0.0; dgemm(&trans, &trans, &l_n, &l_m, &l_k, &alpha, l_b, &l_n, l_a_dense, &l_k, &beta, l_c_dense_betazero, &l_n ); beta = 
1.0; dgemm(&trans, &trans, &l_n, &l_m, &l_k, &alpha, l_b, &l_n, l_a_dense, &l_k, &beta, l_c_dense_betaone, &l_n ); printf("...done!\n"); /* check for errors */ l_max_error = (REALTYPE)0.0; for ( l_i = 0; l_i < l_m*l_n; l_i++) { if (fabs(l_c_betazero[l_i]-l_c_gold_betazero[l_i]) > l_max_error ) { l_max_error = fabs(l_c_betazero[l_i]-l_c_gold_betazero[l_i]); } } printf("max error beta=0 (libxmm vs. gold): %f\n", l_max_error); l_max_error = (REALTYPE)0.0; for ( l_i = 0; l_i < l_m*l_n; l_i++) { if (fabs(l_c_betaone[l_i]-l_c_gold_betaone[l_i]) > l_max_error ) { l_max_error = fabs(l_c_betaone[l_i]-l_c_gold_betaone[l_i]); } } printf("max error beta=1 (libxmm vs. gold): %f\n", l_max_error); l_max_error = (REALTYPE)0.0; for ( l_i = 0; l_i < l_m*l_n; l_i++) { if (fabs(l_c_dense_betazero[l_i]-l_c_gold_betazero[l_i]) > l_max_error ) { l_max_error = fabs(l_c_dense_betazero[l_i]-l_c_gold_betazero[l_i]); } } printf("max error beta=0 (dense vs. gold): %f\n", l_max_error); l_max_error = (REALTYPE)0.0; for ( l_i = 0; l_i < l_m*l_n; l_i++) { if (fabs(l_c_dense_betaone[l_i]-l_c_gold_betaone[l_i]) > l_max_error ) { l_max_error = fabs(l_c_dense_betaone[l_i]-l_c_gold_betaone[l_i]); } } printf("max error beta=1 (dense vs. 
gold): %f\n", l_max_error); /* Let's measure performance */ gettimeofday(&l_start, NULL); for ( l_j = 0; l_j < l_reps; l_j++ ) { #ifdef _OPENMP #pragma omp parallel for private(l_z) #endif for (l_z = 0; l_z < l_n; l_z+=l_n_block) { libxsmm_dfsspmdm_execute( gemm_op_betazero, l_b+l_z, l_c_betazero+l_z ); } } gettimeofday(&l_end, NULL); l_total = sec(l_start, l_end); fprintf(stdout, "time[s] LIBXSMM (RM, M=%i, N=%i, K=%i, beta=0): %f\n", l_m, l_n, l_k, l_total/(double)l_reps ); fprintf(stdout, "GFLOPS LIBXSMM (RM, M=%i, N=%i, K=%i, beta=0): %f (sparse)\n", l_m, l_n, l_k, (2.0 * (double)l_elements * (double)l_n * (double)l_reps * 1.0e-9) / l_total ); fprintf(stdout, "GFLOPS LIBXSMM (RM, M=%i, N=%i, K=%i, beta=0): %f (dense)\n", l_m, l_n, l_k, (2.0 * (double)l_m * (double)l_n * (double)l_k * (double)l_reps * 1.0e-9) / l_total ); fprintf(stdout, "GB/s LIBXSMM (RM, M=%i, N=%i, K=%i, beta=0): %f\n", l_m, l_n, l_k, ((double)sizeof(double) * (((double)l_m * (double)l_n) + ((double)l_k * (double)l_n)) * (double)l_reps * 1.0e-9) / l_total ); gettimeofday(&l_start, NULL); for ( l_j = 0; l_j < l_reps; l_j++ ) { #ifdef _OPENMP #pragma omp parallel for private(l_z) #endif for (l_z = 0; l_z < l_n; l_z+=l_n_block) { libxsmm_dfsspmdm_execute( gemm_op_betaone, l_b+l_z, l_c_betaone+l_z ); } } gettimeofday(&l_end, NULL); l_total = sec(l_start, l_end); fprintf(stdout, "time[s] LIBXSMM (RM, M=%i, N=%i, K=%i, beta=1): %f\n", l_m, l_n, l_k, l_total/(double)l_reps ); fprintf(stdout, "GFLOPS LIBXSMM (RM, M=%i, N=%i, K=%i, beta=1): %f (sparse)\n", l_m, l_n, l_k, (2.0 * (double)l_elements * (double)l_n * (double)l_reps * 1.0e-9) / l_total ); fprintf(stdout, "GFLOPS LIBXSMM (RM, M=%i, N=%i, K=%i, beta=1): %f (dense)\n", l_m, l_n, l_k, (2.0 * (double)l_m * (double)l_n * (double)l_k * (double)l_reps * 1.0e-9) / l_total ); fprintf(stdout, "GB/s LIBXSMM (RM, M=%i, N=%i, K=%i, beta=1): %f\n", l_m, l_n, l_k, ((double)sizeof(double) * ((2.0*(double)l_m * (double)l_n) + ((double)l_k * (double)l_n)) * 
(double)l_reps * 1.0e-9) / l_total ); gettimeofday(&l_start, NULL); beta = 0.0; for ( l_j = 0; l_j < l_reps; l_j++ ) { dgemm(&trans, &trans, &l_n, &l_m, &l_k, &alpha, l_b, &l_n, l_a_dense, &l_k, &beta, l_c_dense_betazero, &l_n ); } gettimeofday(&l_end, NULL); l_total = sec(l_start, l_end); fprintf(stdout, "time[s] MKL (RM, M=%i, N=%i, K=%i, beta=0): %f\n", l_m, l_n, l_k, l_total/(double)l_reps ); fprintf(stdout, "GFLOPS MKL (RM, M=%i, N=%i, K=%i, beta=0): %f\n", l_m, l_n, l_k, (2.0 * (double)l_m * (double)l_n * (double)l_k * (double)l_reps * 1.0e-9) / l_total ); fprintf(stdout, "GB/s MKL (RM, M=%i, N=%i, K=%i, beta=0): %f\n", l_m, l_n, l_k, ((double)sizeof(double) * ((2.0*(double)l_m * (double)l_n) + ((double)l_k * (double)l_n)) * (double)l_reps * 1.0e-9) / l_total ); gettimeofday(&l_start, NULL); beta = 1.0; for ( l_j = 0; l_j < l_reps; l_j++ ) { dgemm(&trans, &trans, &l_n, &l_m, &l_k, &alpha, l_b, &l_n, l_a_dense, &l_k, &beta, l_c_dense_betaone, &l_n ); } gettimeofday(&l_end, NULL); l_total = sec(l_start, l_end); fprintf(stdout, "time[s] MKL (RM, M=%i, N=%i, K=%i, beta=1): %f\n", l_m, l_n, l_k, l_total/(double)l_reps ); fprintf(stdout, "GFLOPS MKL (RM, M=%i, N=%i, K=%i, beta=1): %f\n", l_m, l_n, l_k, (2.0 * (double)l_m * (double)l_n * (double)l_k * (double)l_reps * 1.0e-9) / l_total ); fprintf(stdout, "GB/s MKL (RM, M=%i, N=%i, K=%i, beta=1): %f\n", l_m, l_n, l_k, ((double)sizeof(double) * ((2.0*(double)l_m * (double)l_n) + ((double)l_k * (double)l_n)) * (double)l_reps * 1.0e-9) / l_total ); /* free */ libxsmm_dfsspmdm_destroy( gemm_op_betazero ); libxsmm_dfsspmdm_destroy( gemm_op_betaone ); } libxsmm-1.17/samples/pyfr/pyfr_gemm_cm.c000066400000000000000000000102141415223013700203450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include #include static double sec(struct timeval start, struct timeval end) { return ((double)(((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)))) / 1.0e6; } int main(int argc, char *argv[]) { int n,m,k; int lda,ldb,ldc; double* a; double* b; double* c1; double* c2; struct timeval l_start, l_end; double l_total = 0.0; int reps, i, j; const int nblock = 16; double alpha = 1.0, beta = 1.0; char transa = 'N', transb = 'N'; libxsmm_gemm_prefetch_type l_prefetch_op = LIBXSMM_PREFETCH_NONE; libxsmm_dmmfunction kernel = NULL; if (argc != 5) { fprintf(stderr, "Invalid ./a,out M N K reps\n"); exit(-1); } m = atoi(argv[1]); n = atoi(argv[2]); k = atoi(argv[3]); reps = atoi(argv[4]); /* this is col-major what you want to use for the sizes in question */ lda = m; ldb = k; ldc = m; if (n % nblock != 0) { fprintf(stderr, "N needs to be divisable by %i\n", nblock); exit(-1); } a = (double*)_mm_malloc(lda*k*sizeof(double), 64); b = (double*)_mm_malloc(ldb*n*sizeof(double), 64); c1 = (double*)_mm_malloc(ldc*n*sizeof(double), 64); c2 = (double*)_mm_malloc(ldc*n*sizeof(double), 64); #pragma omp parallel for for (i = 0; i < lda*k; i++) { a[i] = libxsmm_rng_f64(); } #pragma omp parallel for for (i = 0; i < ldb*n; i++) { b[i] = libxsmm_rng_f64(); } #pragma omp parallel for for (i = 0; i < ldc*n; i++) { c1[i] = 0; c2[i] = 0; } /* JIT Kernel */ kernel = libxsmm_dmmdispatch(m, nblock, k, NULL, NULL, NULL, NULL, NULL, NULL, &l_prefetch_op ); /* init MKL */ dgemm(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c1, &ldc); #pragma omp parallel for for (i = 0; i < ldc*n; i++) { c1[i] = 0; c2[i] = 0; } gettimeofday(&l_start, 
NULL); for ( j = 0; j < reps; j++ ) { dgemm(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c1, &ldc); } gettimeofday(&l_end, NULL); l_total = sec(l_start, l_end); fprintf(stdout, "time[s] MKL (CM, M=%i, N=%i, K=%i): %f\n", m, n, k, l_total/(double)reps ); fprintf(stdout, "GFLOPS MKL (CM, M=%i, N=%i, K=%i): %f\n", m, n, k, (2.0 * (double)m * (double)n * (double)k * (double)reps * 1.0e-9) / l_total ); fprintf(stdout, "GB/s MKL (CM, M=%i, N=%i, K=%i): %f\n", m, n, k, ((double)sizeof(double) * (((double)m * (double)n) + ((double)k * (double)n)) * (double)reps * 1.0e-9) / l_total ); gettimeofday(&l_start, NULL); for ( j = 0; j < reps; j++ ) { #pragma omp parallel for private(i) for ( i = 0; i < n; i+=nblock) { kernel( a, b+(ldb*i), c2+(ldc*i), NULL, NULL, NULL ); } gettimeofday(&l_end, NULL); } l_total = sec(l_start, l_end); fprintf(stdout, "time[s] libxsmm (CM, M=%i, N=%i, K=%i): %f\n", m, n, k, l_total/(double)reps ); fprintf(stdout, "GFLOPS libxsmm (CM, M=%i, N=%i, K=%i): %f\n", m, n, k, (2.0 * (double)m * (double)n * (double)k * (double)reps * 1.0e-9) / l_total ); fprintf(stdout, "GB/s libxsmm (CM, M=%i, N=%i, K=%i): %f\n", m, n, k, ((double)sizeof(double) * (((double)m * (double)n) + ((double)k * (double)n)) * (double)reps * 1.0e-9) / l_total ); /* test result */ double max_error = 0.0; for ( i = 0; i < ldc*n; i++) { if (max_error < fabs(c1[i] - c2[i])) { max_error = fabs(c1[i] - c2[i]); } } printf("max error: %f\n\n", max_error); } libxsmm-1.17/samples/pyfr/pyfr_gemm_rm.c000066400000000000000000000103141415223013700203650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include #include static double sec(struct timeval start, struct timeval end) { return ((double)(((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)))) / 1.0e6; } int main(int argc, char *argv[]) { int n,m,k; int lda,ldb,ldc; double* a; double* b; double* c1; double* c2; struct timeval l_start, l_end; double l_total = 0.0; int reps, i, j; const int nblock = 16; double alpha = 1.0, beta = 1.0; char transa = 'N', transb = 'N'; libxsmm_gemm_prefetch_type l_prefetch_op = LIBXSMM_PREFETCH_NONE; libxsmm_dmmfunction kernel = NULL; if (argc != 5) { fprintf(stderr, "Invalid ./a,out M N K reps\n"); exit(-1); } m = atoi(argv[1]); n = atoi(argv[2]); k = atoi(argv[3]); reps = atoi(argv[4]); /* this is col-major what you want to use for the sizes in question */ lda = k; ldb = n; ldc = n; if (n % nblock != 0) { fprintf(stderr, "N needs to be divisable by %i\n", nblock); exit(-1); } a = (double*)_mm_malloc(lda*m*sizeof(double), 64); b = (double*)_mm_malloc(ldb*k*sizeof(double), 64); c1 = (double*)_mm_malloc(ldc*m*sizeof(double), 64); c2 = (double*)_mm_malloc(ldc*m*sizeof(double), 64); #pragma omp parallel for for (i = 0; i < lda*m; i++) { a[i] = libxsmm_rng_f64(); } #pragma omp parallel for for (i = 0; i < ldb*k; i++) { b[i] = libxsmm_rng_f64(); } #pragma omp parallel for for (i = 0; i < ldc*m; i++) { c1[i] = 0; c2[i] = 0; } /* JIT Kernel */ kernel = libxsmm_dmmdispatch(nblock, m, k, &ldb, &lda, &ldc, NULL, NULL, NULL, &l_prefetch_op ); if (kernel == 0) { printf("JIT failed, exiting\n"); exit(-1); } /* init MKL */ dgemm(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c1, &ldc); #pragma omp parallel for for (i = 0; i < 
ldc*m; i++) { c1[i] = 0; c2[i] = 0; } gettimeofday(&l_start, NULL); for ( j = 0; j < reps; j++ ) { dgemm(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c1, &ldc); } gettimeofday(&l_end, NULL); l_total = sec(l_start, l_end); fprintf(stdout, "time[s] MKL (RM, M=%i, N=%i, K=%i): %f\n", m, n, k, l_total/(double)reps ); fprintf(stdout, "GFLOPS MKL (RM, M=%i, N=%i, K=%i): %f\n", m, n, k, (2.0 * (double)m * (double)n * (double)k * (double)reps * 1.0e-9) / l_total ); fprintf(stdout, "GB/s MKL (RM, M=%i, N=%i, K=%i): %f\n", m, n, k, ((double)sizeof(double) * (((double)m * (double)n) + ((double)k * (double)n)) * (double)reps * 1.0e-9) / l_total ); gettimeofday(&l_start, NULL); for ( j = 0; j < reps; j++ ) { #pragma omp parallel for private(i) for ( i = 0; i < n; i+=nblock) { kernel( b+i, a, c2+i, NULL, NULL, NULL ); } gettimeofday(&l_end, NULL); } l_total = sec(l_start, l_end); fprintf(stdout, "time[s] libxsmm (RM, M=%i, N=%i, K=%i): %f\n", m, n, k, l_total/(double)reps ); fprintf(stdout, "GFLOPS libxsmm (RM, M=%i, N=%i, K=%i): %f\n", m, n, k, (2.0 * (double)m * (double)n * (double)k * (double)reps * 1.0e-9) / l_total ); fprintf(stdout, "GB/s libxsmm (RM, M=%i, N=%i, K=%i): %f\n", m, n, k, ((double)sizeof(double) * (((double)m * (double)n) + ((double)k * (double)n)) * (double)reps * 1.0e-9) / l_total ); /* test result */ double max_error = 0.0; for ( i = 0; i < ldc*m; i++) { if (max_error < fabs(c1[i] - c2[i])) { max_error = fabs(c1[i] - c2[i]); } } printf("max error: %f\n\n", max_error); } libxsmm-1.17/samples/pyfr/test.sh000077500000000000000000000033501415223013700170560ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Alexander Heinecke (Intel Corp.) ############################################################################### echo "Please use sufficient affinities when running this benchmark" echo "e.g.:" echo "export OMP_NUM_THREADS=X" echo "export KMP_AFFINITY=granularity=fine,compact,1,0" export OMP_NUM_THREADS=67 export KMP_AFFINITY=granularity=fine,compact,1,0 numactl --preferred=1 ./pyfr_gemm_rm 150 2048 125 1000 numactl --preferred=1 ./pyfr_gemm_rm 150 48000 125 1000 numactl --preferred=1 ./pyfr_gemm_rm 150 96000 125 1000 numactl --preferred=1 ./pyfr_gemm_cm 150 2048 125 1000 numactl --preferred=1 ./pyfr_gemm_cm 150 48000 125 1000 numactl --preferred=1 ./pyfr_gemm_cm 150 96000 125 1000 numactl --preferred=1 ./pyfr_gemm_rm 105 2048 75 1000 numactl --preferred=1 ./pyfr_gemm_rm 105 48000 75 1000 numactl --preferred=1 ./pyfr_gemm_rm 105 96000 75 1000 numactl --preferred=1 ./pyfr_gemm_cm 105 2048 75 1000 numactl --preferred=1 ./pyfr_gemm_cm 105 48000 75 1000 numactl --preferred=1 ./pyfr_gemm_cm 105 96000 75 1000 numactl --preferred=1 ./pyfr_driver_asp_reg ./mats/p3/hex/m6-sp.mtx 48000 10000 libxsmm-1.17/samples/seissol/000077500000000000000000000000001415223013700162405ustar00rootroot00000000000000libxsmm-1.17/samples/seissol/proxy_download_data.sh000077500000000000000000000020001415223013700226300ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### WGET=$(command -v wget) DATASET="LOH1_small merapi_15e5" KINDS="bound neigh orient sides size" for DATA in ${DATASET} ; do for KIND in ${KINDS} ; do ${WGET} -N https://github.com/hfp/libxsmm/raw/master/samples/seissol/${DATA}.nc.${KIND} done done libxsmm-1.17/samples/seissol/proxy_extract_neigh_information_nc.sh000077500000000000000000000027671415223013700257650ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Alexander Heinecke (Intel Corp.) ############################################################################### if [ $# -ne 1 ] then echo "Usage: $(basename $0) mesh.nc" exit fi NCFILE=$1 ncdump -c ${NCFILE} | grep elements | head -n1 | awk '{print $3}' > ${NCFILE}.size ncdump -v element_neighbors ${NCFILE} | sed -e '1,/data:/d' -e '$d' | grep "," | sed 's/,//g' | sed 's/;//g' | sed 's/ //g' > ${NCFILE}.neigh ncdump -v element_boundaries ${NCFILE} | sed -e '1,/data:/d' -e '$d' | grep "," | sed 's/,//g' | sed 's/;//g' | sed 's/ //g' > ${NCFILE}.bound ncdump -v element_neighbor_sides ${NCFILE} | sed -e '1,/data:/d' -e '$d' | grep "," | sed 's/,//g' | sed 's/;//g' | sed 's/ //g' > ${NCFILE}.sides ncdump -v element_side_orientations ${NCFILE} | sed -e '1,/data:/d' -e '$d' | grep "," | sed 's/,//g' | sed 's/;//g' | sed 's/ //g' > ${NCFILE}.orient libxsmm-1.17/samples/seissol/proxy_launcher.sh000077500000000000000000000163511415223013700216470ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # 
Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### MAKE=${MAKE:-make} # Please adjust SEISSOL_KERNELS_CONFIG=sparse_dense #define this if you want to generate a sparse-dense tuned backend #SEISSOL_KERNELS_CONFIG=all_dense #define this if you want to generate an all-dense backend #SEISSOL_KERNELS_CONFIG=all_sparse #define this if you want to generate an all-sparse backend MEMKIND_ROOT_DIR=/swtools/memkind/latest # some defaults SIMARCH=snb_dp KERNEL_FILE=dgemm_snb.cpp KERNEL=all CORES=16 NELEM=386518 TIMESTEPS=100 ORDER=6 ARCH_FLAGS="-mavx -DALIGNMENT=32 -DDSNB -fopenmp" SEISSOL_PROXY_ROOT=`pwd` CONF="default" GENCONF="default" GENCODE="0" PREFETCH="0" DERS="0" MEMKIND="0" # some relative pathes LIBXSMM_ROOT=${SEISSOL_PROXY_ROOT}/../../ SEISSOL_KERNELS_ROOT=${SEISSOL_PROXY_ROOT}/seissol_kernels #SEISSOL_KERNELS_ROOT=/nfs_home/aheineck/Projects/SeisSol_workspace/seissol_kernels # test for seissol kernels and clone from git hub if needed. if [ ! 
-d "${SEISSOL_KERNELS_ROOT}" ]; then git clone --recursive https://github.com/TUM-I5/seissol_kernels.git fi while getopts a:k:c:n:t:o:p:g:s:d:m: opts; do case ${opts} in a) SIMARCH=${OPTARG} ;; k) KERNEL=${OPTARG} ;; c) CORES=${OPTARG} ;; n) NELEM=${OPTARG} ;; t) TIMESTEPS=${OPTARG} ;; o) ORDER=${OPTARG} ;; s) CONF=${OPTARG} ;; p) PREFETCH=${OPTARG} ;; g) GENCODE=${OPTARG} ;; d) DERS=${OPTARG} ;; m) MEMKIND=${OPTARG} ;; esac done case ${SIMARCH} in wsm_dp) ARCH_FLAGS="-msse3 -DALIGNMENT=16 -DDWSM -qopenmp -static-libgcc -static-libstdc++ -static-intel"; MATMUL_KERNEL_DENSE_FILE=dgemm_wsm.cpp; MATMUL_KERNEL_SPARSE_FILE=sparse_dwsm.cpp ;; snb_dp) ARCH_FLAGS="-mavx -DALIGNMENT=32 -DDSNB -qopenmp -static-libgcc -static-libstdc++ -static-intel"; MATMUL_KERNEL_DENSE_FILE=dgemm_snb.cpp; MATMUL_KERNEL_SPARSE_FILE=sparse_dsnb.cpp ;; hsw_dp) ARCH_FLAGS="-xCORE_AVX2 -fma -DALIGNMENT=32 -DDHSW -qopenmp -static-libgcc -static-libstdc++ -static-intel"; MATMUL_KERNEL_DENSE_FILE=dgemm_hsw.cpp; MATMUL_KERNEL_SPARSE_FILE=sparse_dhsw.cpp ;; wsm_sp) ARCH_FLAGS="-msse3 -DALIGNMENT=16 -DSWSM -qopenmp -static-libgcc -static-libstdc++ -static-intel"; MATMUL_KERNEL_DENSE_FILE=sgemm_wsm.cpp; MATMUL_KERNEL_SPARSE_FILE=sparse_swsm.cpp ;; snb_sp) ARCH_FLAGS="-mavx -DALIGNMENT=32 -DSSNB -qopenmp -static-libgcc -static-libstdc++ -static-intel"; MATMUL_KERNEL_DENSE_FILE=sgemm_snb.cpp; MATMUL_KERNEL_SPARSE_FILE=sparse_ssnb.cpp ;; hsw_sp) ARCH_FLAGS="-xCORE_AVX2 -fma -DALIGNMENT=32 -DSHSW -qopenmp -static-libgcc -static-libstdc++ -static-intel"; MATMUL_KERNEL_DENSE_FILE=sgemm_hsw.cpp; MATMUL_KERNEL_SPARSE_FILE=sparse_shsw.cpp ;; knl_dp) ARCH_FLAGS="-xMIC-AVX512 -fma -DALIGNMENT=64 -DDKNL -qopenmp -static-libgcc -static-libstdc++ -static-intel"; MATMUL_KERNEL_DENSE_FILE=dgemm_knl.cpp; MATMUL_KERNEL_SPARSE_FILE=sparse_dknl.cpp ;; knl_sp) ARCH_FLAGS="-xMIC-AVX512 -fma -DALIGNMENT=64 -SDKNL -qopenmp -static-libgcc -static-libstdc++ -static-intel"; MATMUL_KERNEL_DENSE_FILE=sgemm_knl.cpp; 
MATMUL_KERNEL_SPARSE_FILE=sparse_sknl.cpp ;; noarch_dp) ARCH_FLAGS="-DALIGNMENT=16 -DDNOARCH -qopenmp -mkl=sequential -static-libgcc -static-libstdc++ -static-intel"; MATMUL_KERNEL_DENSE_FILE=dgemm_noarch.cpp; MATMUL_KERNEL_SPARSE_FILE=sparse_dnoarch.cpp ;; noarch_sp) ARCH_FLAGS="-DALIGNMENT=16 -DDNOARCH -qopenmp -mkl=sequential -static-libgcc -static-libstdc++ -static-intel"; MATMUL_KERNEL_DENSE_FILE=sgemm_noarch.cpp; MATMUL_KERNEL_SPARSE_FILE=sparse_snoarch.cpp ;; *) echo "Unsupported architecture -> Exit Launcher! Supported architectures are: wsm_dp, snb_dp, hsw_dp, wsm_sp, snb_sp, hsw_sp, knl_dp, knl_sp, noarch_dp, noarch_sp!"; exit ;; esac case ${KERNEL} in all) ;; local) ;; neigh) ;; ader) ;; vol) ;; bndlocal) ;; *) echo "Unsupported Kernel -> Exit Launcher! Supported kernels are: all, local, neigh, ader, vol, bndlocal!"; exit ;; esac case ${ORDER} in 7) ;; 6) ;; 5) ;; 4) ;; 3) ;; 2) ;; *) echo "Unsupported Order -> Exit Launcher! Supported orders are: 2,3,4,5,6,7!"; exit ;; esac case ${GENCODE} in 0) ;; 1) ;; *) echo "Unsupported Generation switch -> Exit Launcher! 
Supported switches are: 0(off) 1(on)!"; exit ;; esac if [ "${KERNEL}" == 'local' ]; then GENCONF="local_"${CONF} fi if [ "${KERNEL}" == 'ader' ]; then GENCONF="local_"${CONF} fi if [ "${KERNEL}" == 'vol' ]; then GENCONF="local_"${CONF} fi if [ "${KERNEL}" == 'bndlocal' ]; then GENCONF="local_"${CONF} fi if [ "${KERNEL}" == 'neigh' ]; then GENCONF="neighboring_"${CONF} fi set -x if [ "${GENCODE}" == '1' ]; then # build libxsmm generator backend cd ${LIBXSMM_ROOT} # ${MAKE} realclean ${MAKE} generator cd ${SEISSOL_KERNELS_ROOT}/preprocessing rm -rf generated_code/* if [ "${CONF}" == 'default' ]; then python scripts/offlineAssembly.py --generateMatrixKernels ./matrices ./${SEISSOL_KERNELS_CONFIG} ${LIBXSMM_ROOT}/bin/libxsmm_gemm_generator ./generated_code >/dev/null else python scripts/offlineAssembly.py --generateMatrixKernels ./matrices ./../auto_tuning/sparse_dense/${GENCONF} ${LIBXSMM}/bin/libxsmm_gemm_generator ./generated_code >/dev/null fi cd ${SEISSOL_PROXY_ROOT} fi # added prefetch flag if [ "${PREFETCH}" == '1' ]; then ARCH_FLAGS="${ARCH_FLAGS} -DENABLE_MATRIX_PREFETCH" fi if [ "${PREFETCH}" == '2' ]; then ARCH_FLAGS="${ARCH_FLAGS} -DENABLE_MATRIX_PREFETCH -DENABLE_STREAM_MATRIX_PREFETCH" fi # added use derivations flag if [ "${DERS}" == '1' ]; then ARCH_FLAGS="${ARCH_FLAGS} -D__USE_DERS" fi # check for memkind if [ "${MEMKIND}" == '1' ]; then ARCH_FLAGS="${ARCH_FLAGS} -DUSE_MEMKIND -I${MEMKIND_ROOT_DIR}/include -L${MEMKIND_ROOT_DIR}/lib -lmemkind" fi # compile proxy app rm -rf driver_${SIMARCH}_${ORDER}.exe icpc -O3 -ip -ipo -DNDEBUG ${ARCH_FLAGS} -DCONVERGENCE_ORDER=${ORDER} -DNUMBER_OF_QUANTITIES=9 -I${SEISSOL_KERNELS_ROOT}/src -I${SEISSOL_KERNELS_ROOT}/preprocessing/generated_code ${SEISSOL_KERNELS_ROOT}/src/Volume.cpp ${SEISSOL_KERNELS_ROOT}/src/Time.cpp ${SEISSOL_KERNELS_ROOT}/src/Boundary.cpp ${SEISSOL_KERNELS_ROOT}/preprocessing/generated_code/matrix_kernels/${MATMUL_KERNEL_DENSE_FILE} 
${SEISSOL_KERNELS_ROOT}/preprocessing/generated_code/matrix_kernels/${MATMUL_KERNEL_SPARSE_FILE} proxy_seissol.cpp -o driver_${SIMARCH}_${ORDER}.exe # run SeisSol Scenario converter #./proxy_extract_neigh_information_nc.sh ${SEISSOL_PROXY_SCENARIO} # running on regular CPU export OMP_NUM_THREADS=${CORES} NPROCS=`cat /proc/cpuinfo | grep "core id" | wc -l` if [ "${NPROCS}" = "${CORES}" ]; then export KMP_AFFINITY=compact,granularity=thread,explicit,verbose else export KMP_AFFINITY=proclist=[0-$((CORES-1))],granularity=thread,explicit,verbose fi ./driver_${SIMARCH}_${ORDER}.exe ${NELEM} ${TIMESTEPS} ${KERNEL} set +x libxsmm-1.17/samples/seissol/proxy_seissol.cpp000066400000000000000000000332041415223013700216700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* * Copyright (c) 2013-2014, SeisSol Group * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. **/ #include #include #include #include #include #include #include #include #include #include #include #ifdef _OPENMP #include #endif #ifdef USE_MEMKIND #include //#define USE_HBM_DOFS #define USE_HBM_TDOFS #define USE_HBM_DERS //#define USE_HBM_CELLLOCAL_LOCAL //#define USE_HBM_CELLLOCAL_NEIGH #define USE_HBM_GLOBALDATA #endif #ifdef __MIC__ #define __USE_RDTSC #endif double derive_cycles_from_time(double time) { // first try to read proxy env variable with freq char* p_freq; double d_freq; double cycles = 1.0; p_freq = getenv ("SEISSOL_PROXY_FREQUENCY"); if (p_freq !=NULL ) { d_freq = atof(p_freq); printf("detected frequency (SEISSOL_PROXY_FREQUENCY): %f\n", d_freq); cycles = time * d_freq * 1.0e6; } else { FILE* fp; fp = popen("lscpu | grep MHz | awk '{print $3}'", "r"); if (fp > 0) { char tmp_buffer[20]; fread(tmp_buffer, 20, 1, fp); d_freq = atof(tmp_buffer); printf("detected frequency (lscpu): %f\n", d_freq); cycles = time * d_freq * 1.0e6; pclose(fp); } else { cycles = 1.0; printf("detected frequency (lscpu) FAILED!\n"); } } return cycles; } // seissol_kernel includes #include #include #include #include #include #include "proxy_seissol_allocator.hpp" #include "proxy_seissol_flops.hpp" #include 
"proxy_seissol_bytes.hpp" #include "proxy_seissol_integrators.hpp" inline double sec(struct timeval start, struct timeval end) { return ((double)(((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)))) / 1.0e6; } int main(int argc, char* argv[]) { if (argc != 4) { printf("Wrong parameters!\n"); printf(" #cells #timesteps kernel\n"); printf(" kernel-values: all, local, neigh, ader, vol, bndlocal\n"); return -1; } unsigned int i_cells = atoi(argv[1]); unsigned int i_timesteps = atoi(argv[2]); std::string s_part; s_part.assign(argv[3]); // double-check if the selected kernel exists if ( (s_part.compare("all") != 0) && (s_part.compare("local") != 0) && (s_part.compare("neigh") != 0) && (s_part.compare("ader") != 0) && (s_part.compare("vol") != 0) && (s_part.compare("bndlocal") != 0) ) { printf("Wrong parameters!\n"); printf(" #cells #timesteps kernel\n"); printf(" kernel-values: all, local, neigh, ader, vol, bndlocal\n"); return -1; } printf("Allocating fake data...\n"); i_cells = init_data_structures(i_cells); printf("...done\n\n"); struct timeval start_time, end_time; size_t cycles_start, cycles_end; double total = 0.0; double total_cycles = 0.0; // init OpenMP and LLC if (s_part.compare("all") == 0) { computeLocalIntegration(); computeNeighboringIntegration(); } else if (s_part.compare("local") == 0) { computeLocalIntegration(); } else if (s_part.compare("neigh") == 0) { computeNeighboringIntegration(); } else if (s_part.compare("ader") == 0) { computeAderIntegration(); } else if (s_part.compare("vol") == 0) { computeVolumeIntegration(); } else { computeLocalBoundaryIntegration(); } gettimeofday(&start_time, NULL); #ifdef __USE_RDTSC cycles_start = _libxsmm_timer_cycles(); #endif if (s_part.compare("all") == 0) { for (unsigned int t = 0; t < i_timesteps; t++) { computeLocalIntegration(); computeNeighboringIntegration(); } } else if (s_part.compare("local") == 0) { for (unsigned int t = 0; t < i_timesteps; t++) { computeLocalIntegration(); 
} } else if (s_part.compare("neigh") == 0) { for (unsigned int t = 0; t < i_timesteps; t++) { computeNeighboringIntegration(); } } else if (s_part.compare("ader") == 0) { for (unsigned int t = 0; t < i_timesteps; t++) { computeAderIntegration(); } } else if (s_part.compare("vol") == 0) { for (unsigned int t = 0; t < i_timesteps; t++) { computeVolumeIntegration(); } } else { for (unsigned int t = 0; t < i_timesteps; t++) { computeLocalBoundaryIntegration(); } } #ifdef __USE_RDTSC cycles_end = _libxsmm_timer_cycles(); #endif gettimeofday(&end_time, NULL); total = sec(start_time, end_time); #ifdef __USE_RDTSC printf("Cycles via _libxsmm_timer_cycles()!\n"); total_cycles = (double)(cycles_end-cycles_start); #else total_cycles = derive_cycles_from_time(total); #endif printf("=================================================\n"); printf("=== PERFORMANCE SUMMARY ===\n"); printf("=================================================\n"); printf("seissol proxy mode : %s\n", s_part.c_str()); printf("time for seissol proxy : %f\n", total); printf("cycles : %f\n\n", total_cycles); seissol_flops actual_flops; if (s_part.compare("all") == 0) { actual_flops = flops_all_actual(i_timesteps); printf("GFLOP (non-zero) for seissol proxy : %f\n", actual_flops.d_nonZeroFlops/(1e9)); printf("GFLOP (hardware) for seissol proxy : %f\n", actual_flops.d_hardwareFlops/(1e9)); //printf("GFLOP (estimate) for seissol proxy : %f\n", flops_all(i_timesteps)/(1e9)); printf("GiB (estimate) for seissol proxy : %f\n\n", bytes_all(i_timesteps)/(1024.0*1024.0*1024.0)); printf("FLOPS/cycle (non-zero) : %f\n", actual_flops.d_nonZeroFlops/total_cycles); printf("FLOPS/cycle (hardware) : %f\n", actual_flops.d_hardwareFlops/total_cycles); printf("Bytes/cycle (estimate) : %f\n\n", bytes_all(i_timesteps)/total_cycles); printf("GFLOPS (non-zero) for seissol proxy : %f\n", (actual_flops.d_nonZeroFlops/(1e9))/total); printf("GFLOPS (hardware) for seissol proxy : %f\n", (actual_flops.d_hardwareFlops/(1e9))/total); 
printf("GiB/s (estimate) for seissol proxy : %f\n", (bytes_all(i_timesteps)/(1024.0*1024.0*1024.0))/total); } else if (s_part.compare("local") == 0) { actual_flops = flops_local_actual(i_timesteps); printf("GFLOP (non-zero) for seissol proxy : %f\n", actual_flops.d_nonZeroFlops/(1e9)); printf("GFLOP (hardware) for seissol proxy : %f\n", actual_flops.d_hardwareFlops/(1e9)); //printf("GFLOP (estimate) for seissol proxy : %f\n", flops_local(i_timesteps)/(1e9)); printf("GiB (estimate) for seissol proxy : %f\n\n", bytes_local(i_timesteps)/(1024.0*1024.0*1024.0)); printf("FLOPS/cycle (non-zero) : %f\n", actual_flops.d_nonZeroFlops/total_cycles); printf("FLOPS/cycle (hardware) : %f\n", actual_flops.d_hardwareFlops/total_cycles); printf("Bytes/cycle (estimate) : %f\n\n", bytes_local(i_timesteps)/total_cycles); printf("GFLOPS (non-zero) for seissol proxy : %f\n", (actual_flops.d_nonZeroFlops/(1e9))/total); printf("GFLOPS (hardware) for seissol proxy : %f\n", (actual_flops.d_hardwareFlops/(1e9))/total); printf("GiB/s (estimate) for seissol proxy : %f\n", (bytes_local(i_timesteps)/(1024.0*1024.0*1024.0))/total); } else if (s_part.compare("neigh") == 0) { actual_flops = flops_bndneigh_actual(i_timesteps); printf("GFLOP (non-zero) for seissol proxy : %f\n", actual_flops.d_nonZeroFlops/(1e9)); printf("GFLOP (hardware) for seissol proxy : %f\n", actual_flops.d_hardwareFlops/(1e9)); //printf("GFLOP (estimate) for seissol proxy : %f\n", flops_bndneigh(i_timesteps)/(1e9)); printf("GiB (estimate) for seissol proxy : %f\n\n", bytes_bndneigh(i_timesteps)/(1024.0*1024.0*1024.0)); printf("FLOPS/cycle (non-zero) : %f\n", actual_flops.d_nonZeroFlops/total_cycles); printf("FLOPS/cycle (hardware) : %f\n", actual_flops.d_hardwareFlops/total_cycles); printf("Bytes/cycle (estimate) : %f\n\n", bytes_bndneigh(i_timesteps)/total_cycles); printf("GFLOPS (non-zero) for seissol proxy : %f\n", (actual_flops.d_nonZeroFlops/(1e9))/total); printf("GFLOPS (hardware) for seissol proxy : %f\n", 
(actual_flops.d_hardwareFlops/(1e9))/total); printf("GiB/s (estimate) for seissol proxy : %f\n", (bytes_bndneigh(i_timesteps)/(1024.0*1024.0*1024.0))/total); } else if (s_part.compare("ader") == 0) { actual_flops = flops_ader_actual(i_timesteps); printf("GFLOP (non-zero) for seissol proxy : %f\n", actual_flops.d_nonZeroFlops/(1e9)); printf("GFLOP (hardware) for seissol proxy : %f\n", actual_flops.d_hardwareFlops/(1e9)); //printf("GFLOP (estimate) for seissol proxy : %f\n", flops_ader(i_timesteps)/(1e9)); printf("GiB (estimate) for seissol proxy : %f\n\n", bytes_ader(i_timesteps)/(1024.0*1024.0*1024.0)); printf("FLOPS/cycle (non-zero) : %f\n", actual_flops.d_nonZeroFlops/total_cycles); printf("FLOPS/cycle (hardware) : %f\n", actual_flops.d_hardwareFlops/total_cycles); printf("Bytes/cycle (estimate) : %f\n\n", bytes_ader(i_timesteps)/total_cycles); printf("GFLOPS (non-zero) for seissol proxy : %f\n", (actual_flops.d_nonZeroFlops/(1e9))/total); printf("GFLOPS (hardware) for seissol proxy : %f\n", (actual_flops.d_hardwareFlops/(1e9))/total); printf("GiB/s (estimate) for seissol proxy : %f\n", (bytes_ader(i_timesteps)/(1024.0*1024.0*1024.0))/total); } else if (s_part.compare("vol") == 0) { actual_flops = flops_vol_actual(i_timesteps); printf("GFLOP (non-zero) for seissol proxy : %f\n", actual_flops.d_nonZeroFlops/(1e9)); printf("GFLOP (hardware) for seissol proxy : %f\n", actual_flops.d_hardwareFlops/(1e9)); //printf("GFLOP (estimate) for seissol proxy : %f\n", flops_vol(i_timesteps)/(1e9)); printf("GiB (estimate) for seissol proxy : %f\n\n", bytes_vol(i_timesteps)/(1024.0*1024.0*1024.0)); printf("FLOPS/cycle (non-zero) : %f\n", actual_flops.d_nonZeroFlops/total_cycles); printf("FLOPS/cycle (hardware) : %f\n", actual_flops.d_hardwareFlops/total_cycles); printf("Bytes/cycle (estimate) : %f\n\n", bytes_vol(i_timesteps)/total_cycles); printf("GFLOPS (non-zero) for seissol proxy : %f\n", (actual_flops.d_nonZeroFlops/(1e9))/total); printf("GFLOPS (hardware) for seissol proxy 
: %f\n", (actual_flops.d_hardwareFlops/(1e9))/total); printf("GiB/s (estimate) for seissol proxy : %f\n", (bytes_vol(i_timesteps)/(1024.0*1024.0*1024.0))/total); } else { actual_flops = flops_bndlocal_actual(i_timesteps); printf("GFLOP (non-zero) for seissol proxy : %f\n", actual_flops.d_nonZeroFlops/(1e9)); printf("GFLOP (hardware) for seissol proxy : %f\n", actual_flops.d_hardwareFlops/(1e9)); //printf("GFLOP (estimate) for seissol proxy : %f\n", flops_bndlocal(i_timesteps)/(1e9)); printf("GiB (estimate) for seissol proxy : %f\n\n", bytes_bndlocal(i_timesteps)/(1024.0*1024.0*1024.0)); printf("FLOPS/cycle (non-zero) : %f\n", actual_flops.d_nonZeroFlops/total_cycles); printf("FLOPS/cycle (hardware) : %f\n", actual_flops.d_hardwareFlops/total_cycles); printf("Bytes/cycle (estimate) : %f\n\n", bytes_bndlocal(i_timesteps)/total_cycles); printf("GFLOPS (non-zero) for seissol proxy : %f\n", (actual_flops.d_nonZeroFlops/(1e9))/total); printf("GFLOPS (hardware) for seissol proxy : %f\n", (actual_flops.d_hardwareFlops/(1e9))/total); printf("GiB/s (estimate) for seissol proxy : %f\n", (bytes_bndlocal(i_timesteps)/(1024.0*1024.0*1024.0))/total); } printf("=================================================\n"); printf("\n"); free_data_structures(); return 0; } libxsmm-1.17/samples/seissol/proxy_seissol_allocator.hpp000066400000000000000000000435121415223013700237400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* * Copyright (c) 2013-2014, SeisSol Group * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
**/ #ifndef PROXY_SEISSOL_ALLOCATOR_HPP #define PROXY_SEISSOL_ALLOCATOR_HPP struct CellLocalInformation { enum faceType faceTypes[4]; int faceRelations[4][2]; unsigned int faceNeighborIds[4]; unsigned int ltsSetup; double currentTime[5]; }; struct GlobalData { real *stiffnessMatricesTransposed[3]; real *stiffnessMatrices[3]; real *fluxMatrices[52]; }; struct LocalIntegrationData { real starMatrices[3][STAR_NNZ]; real nApNm1[4][NUMBER_OF_QUANTITIES*NUMBER_OF_QUANTITIES]; }; struct NeighboringIntegrationData { real nAmNm1[4][NUMBER_OF_QUANTITIES*NUMBER_OF_QUANTITIES]; }; struct CellData { struct LocalIntegrationData *localIntegration; struct NeighboringIntegrationData *neighboringIntegration; }; struct Cells { unsigned int numberOfCells; real (*dofs)[NUMBER_OF_ALIGNED_DOFS]; real **buffers; real **derivatives; real *(*faceNeighbors)[4]; }; struct GlobalData **m_globalDataArray; struct GlobalData *m_globalData; struct CellLocalInformation *m_cellInformation; struct CellData *m_cellData; struct Cells *m_cells; struct LocalIntegrationData *m_localIntegration; struct NeighboringIntegrationData * m_neighboringIntegration; seissol::kernels::Time m_timeKernel; seissol::kernels::Volume m_volumeKernel; seissol::kernels::Boundary m_boundaryKernel; /* This option is needed to avoid pollution of low-level caches */ #define NUMBER_OF_THREADS_PER_GLOBALDATA_COPY 4 #ifndef NUMBER_OF_THREADS_PER_GLOBALDATA_COPY #define NUMBER_OF_THREADS_PER_GLOBALDATA_COPY 16383 #endif real m_timeStepWidthSimulation = (real)1.0; real* m_dofs; real* m_tdofs; #ifdef __USE_DERS real* m_ders; #endif real** m_ptdofs; real** m_pder; real* m_faceNeighbors; real** m_globalPointerArray; real* m_globalPointer; unsigned int init_data_structures(unsigned int i_cells) { // check if we have to read on scenario char* pScenario; std::string s_scenario; bool bUseScenario = false; unsigned int (*scenario_faceType)[4]; unsigned int (*scenario_neighbor)[4]; unsigned int (*scenario_side)[4]; unsigned int 
(*scenario_orientation)[4]; pScenario = getenv ("SEISSOL_PROXY_SCENARIO"); if (pScenario !=NULL ) { bUseScenario = true; s_scenario.assign(pScenario); std::string file; std::ifstream data; size_t reads; unsigned int value; // read scenario size file = s_scenario + ".size"; data.open(file.c_str()); reads = 0; if (!data) { printf("size of scenario couldn't be read!\n"); exit(-1); } while (data >> i_cells) { printf("scenario name is: %s\n", s_scenario.c_str()); printf("scenario has %i cells\n", i_cells); reads++; } data.close(); if (reads != 1) { printf("wrong number of sizes (%i) in scenario were read!\n", reads); exit(-1); } scenario_neighbor = (unsigned int(*)[4]) malloc(i_cells*sizeof(unsigned int[4])); scenario_faceType = (unsigned int(*)[4]) malloc(i_cells*sizeof(unsigned int[4])); scenario_side = (unsigned int(*)[4]) malloc(i_cells*sizeof(unsigned int[4])); scenario_orientation = (unsigned int(*)[4]) malloc(i_cells*sizeof(unsigned int[4])); // read neighbors file = s_scenario + ".neigh"; data.open(file.c_str()); if (!data) { printf("neigh of scenario couldn't be read!\n"); exit(-1); } reads = 0; while (data >> value) { scenario_neighbor[reads/4][reads%4] = value; reads++; } data.close(); if (reads != i_cells*4) { printf("wrong neigh (%i) in scenario were read!\n", reads); exit(-1); } // read faceTypes file = s_scenario + ".bound"; data.open(file.c_str()); if (!data) { printf("bound of scenario couldn't be read!\n"); exit(-1); } reads = 0; while (data >> value) { scenario_faceType[reads/4][reads%4] = value; reads++; } data.close(); if (reads != i_cells*4) { printf("wrong faceType (%i) in scenario were read!\n", reads); exit(-1); } // read sides file = s_scenario + ".sides"; data.open(file.c_str()); if (!data) { printf("sides of scenario couldn't be read!\n"); exit(-1); } reads = 0; while (data >> value) { scenario_side[reads/4][reads%4] = value; reads++; } data.close(); if (reads != i_cells*4) { printf("wrong sides (%i) in scenario were read!\n", reads); 
exit(-1); } // read orientation file = s_scenario + ".orient"; data.open(file.c_str()); if (!data) { printf("orientations of scenario couldn't be read!\n"); exit(-1); } reads = 0; while (data >> value) { scenario_orientation[reads/4][reads%4] = value; reads++; } data.close(); if (reads != i_cells*4) { printf("wrong orientations (%i) in scenario were read!\n", reads); exit(-1); } } // init RNG libxsmm_rng_set_seed(i_cells); // cell information m_cellInformation = (CellLocalInformation*)malloc(i_cells*sizeof(CellLocalInformation)); for (unsigned int l_cell = 0; l_cell < i_cells; l_cell++) { for (unsigned int f = 0; f < 4; f++) { if (bUseScenario == true ) { switch (scenario_faceType[l_cell][f]) { case 0: m_cellInformation[l_cell].faceTypes[f] = regular; break; case 1: m_cellInformation[l_cell].faceTypes[f] = freeSurface; break; case 3: m_cellInformation[l_cell].faceTypes[f] = dynamicRupture; break; case 5: m_cellInformation[l_cell].faceTypes[f] = outflow; break; case 6: m_cellInformation[l_cell].faceTypes[f] = periodic; break; default: printf("unsupported faceType (%i)!\n", scenario_faceType[l_cell][f]); exit(-1); break; } m_cellInformation[l_cell].faceRelations[f][0] = scenario_side[l_cell][f]; m_cellInformation[l_cell].faceRelations[f][1] = scenario_orientation[l_cell][f]; m_cellInformation[l_cell].faceNeighborIds[f] = scenario_neighbor[l_cell][f]; } else { m_cellInformation[l_cell].faceTypes[f] = regular; m_cellInformation[l_cell].faceRelations[f][0] = (libxsmm_rng_u32(4)); m_cellInformation[l_cell].faceRelations[f][1] = (libxsmm_rng_u32(3)); m_cellInformation[l_cell].faceNeighborIds[f] = (libxsmm_rng_u32(i_cells)); } } #ifdef __USE_DERS m_cellInformation[l_cell].ltsSetup = 4095; #else m_cellInformation[l_cell].ltsSetup = 0; #endif for (unsigned int f = 0; f < 5; f++) { m_cellInformation[l_cell].currentTime[f] = 0.0; } } // DOFs, tIntegrated buffer #ifdef USE_HBM_DOFS hbw_posix_memalign( (void**) &m_dofs, 2097152, sizeof(real[NUMBER_OF_ALIGNED_DOFS])*i_cells ); 
#else posix_memalign( (void**) &m_dofs, 2097152, sizeof(real[NUMBER_OF_ALIGNED_DOFS])*i_cells ); #endif #ifdef USE_HBM_TDOFS hbw_posix_memalign( (void**) &m_tdofs, 2097152, sizeof(real[NUMBER_OF_ALIGNED_DOFS])*i_cells ); #else posix_memalign( (void**) &m_tdofs, 2097152, sizeof(real[NUMBER_OF_ALIGNED_DOFS])*i_cells ); #endif #ifdef __USE_DERS #ifdef USE_HBM_DERS hbw_posix_memalign( (void**) &m_ders, 2097152, sizeof(real[NUMBER_OF_ALIGNED_DERS])*i_cells ); #else posix_memalign( (void**) &m_ders, 2097152, sizeof(real[NUMBER_OF_ALIGNED_DERS])*i_cells ); #endif #endif m_ptdofs = (real**)malloc(sizeof(real*)*i_cells); m_pder = (real**)malloc(sizeof(real*)*i_cells); m_faceNeighbors = (real*)malloc(sizeof(real*[4])*i_cells); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif for (unsigned int l_cell = 0; l_cell < i_cells; l_cell++) { for (unsigned int i = 0; i < NUMBER_OF_ALIGNED_DOFS; i++) { m_dofs[(l_cell*NUMBER_OF_ALIGNED_DOFS)+i] = (real)libxsmm_rng_f64(); } for (unsigned int i = 0; i < NUMBER_OF_ALIGNED_DOFS; i++) { m_tdofs[(l_cell*NUMBER_OF_ALIGNED_DOFS)+i] = (real)libxsmm_rng_f64(); } } #ifdef __USE_DERS #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif for (unsigned int l_cell = 0; l_cell < i_cells; l_cell++) { for (unsigned int i = 0; i < NUMBER_OF_ALIGNED_DERS; i++) { m_ders[(l_cell*NUMBER_OF_ALIGNED_DERS)+i] = (real)libxsmm_rng_f64(); } } #endif for (unsigned int l_cell = 0; l_cell < i_cells; l_cell++) { m_ptdofs[l_cell] = &(m_tdofs[(l_cell*NUMBER_OF_ALIGNED_DOFS)]); #ifdef __USE_DERS m_pder[l_cell] = &(m_ders[(l_cell*NUMBER_OF_ALIGNED_DERS)]); #else m_pder[l_cell] = NULL; #endif } m_cells = (Cells*)malloc(sizeof(Cells)); m_cells->numberOfCells = i_cells; m_cells->dofs = (real(*)[NUMBER_OF_ALIGNED_DOFS])m_dofs; m_cells->buffers = m_ptdofs; m_cells->derivatives = m_pder; m_cells->faceNeighbors = (real*(*)[4])m_faceNeighbors; for (unsigned int l_cell = 0; l_cell < i_cells; l_cell++) { for (unsigned int f = 0; f < 4; f++) { if 
(m_cellInformation[l_cell].faceTypes[f] == outflow) { m_cells->faceNeighbors[l_cell][f] = NULL; } else if (m_cellInformation[l_cell].faceTypes[f] == freeSurface) { #ifdef __USE_DERS m_cells->faceNeighbors[l_cell][f] = m_cells->derivatives[l_cell]; #else m_cells->faceNeighbors[l_cell][f] = m_cells->buffers[l_cell]; #endif } else if (m_cellInformation[l_cell].faceTypes[f] == periodic || m_cellInformation[l_cell].faceTypes[f] == regular) { #ifdef __USE_DERS m_cells->faceNeighbors[l_cell][f] = m_cells->derivatives[m_cellInformation[l_cell].faceNeighborIds[f]]; #else m_cells->faceNeighbors[l_cell][f] = m_cells->buffers[m_cellInformation[l_cell].faceNeighborIds[f]]; #endif } else { printf("unsupported boundary type -> exit\n"); exit(-1); } } } // local integration #ifdef USE_HBM_CELLLOCAL_LOCAL hbw_posix_memalign( (void**) &m_localIntegration, 2097152, i_cells*sizeof(LocalIntegrationData) ); #else posix_memalign( (void**) &m_localIntegration, 2097152, i_cells*sizeof(LocalIntegrationData) ); #endif #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif for (unsigned int l_cell = 0; l_cell < i_cells; l_cell++) { // init star matrices for (size_t m = 0; m < 3; m++) { for (size_t j = 0; j < STAR_NNZ; j++) { m_localIntegration[l_cell].starMatrices[m][j] = (real)libxsmm_rng_f64(); } } // init flux solver for (size_t m = 0; m < 4; m++) { for (size_t j = 0; j < NUMBER_OF_QUANTITIES*NUMBER_OF_QUANTITIES; j++) { m_localIntegration[l_cell].nApNm1[m][j] = (real)libxsmm_rng_f64(); } } } // neighbor integration #ifdef USE_HBM_CELLLOCAL_NEIGH hbw_posix_memalign( (void**) &m_neighboringIntegration, 2097152, i_cells*sizeof(NeighboringIntegrationData) ); #else posix_memalign( (void**) &m_neighboringIntegration, 2097152, i_cells*sizeof(NeighboringIntegrationData) ); #endif #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif for (unsigned int l_cell = 0; l_cell < i_cells; l_cell++) { // init flux solver for (size_t m = 0; m < 4; m++) { for (size_t j = 0; j < 
NUMBER_OF_QUANTITIES*NUMBER_OF_QUANTITIES; j++) { m_neighboringIntegration[l_cell].nAmNm1[m][j] = (real)libxsmm_rng_f64(); } } } // CellData m_cellData = (CellData*)malloc(sizeof(CellData)); m_cellData->localIntegration = m_localIntegration; m_cellData->neighboringIntegration = m_neighboringIntegration; // Global matrices unsigned int l_globalMatrices = NUMBER_OF_ALIGNED_BASIS_FUNCTIONS * seissol::kernels::getNumberOfBasisFunctions( CONVERGENCE_ORDER-1 ) * 3; l_globalMatrices += seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER-1 ) * NUMBER_OF_BASIS_FUNCTIONS * 3; l_globalMatrices += NUMBER_OF_ALIGNED_BASIS_FUNCTIONS * NUMBER_OF_BASIS_FUNCTIONS * 52; l_globalMatrices *= sizeof(real); // determine number of global data copies unsigned int l_numberOfThreads = 1; #ifdef _OPENMP #pragma omp parallel { #pragma omp master { l_numberOfThreads = omp_get_num_threads(); } } #endif unsigned int l_numberOfCopiesCeil = (l_numberOfThreads%NUMBER_OF_THREADS_PER_GLOBALDATA_COPY == 0) ? 
0 : 1; unsigned int l_numberOfCopies = (l_numberOfThreads/NUMBER_OF_THREADS_PER_GLOBALDATA_COPY) + l_numberOfCopiesCeil; m_globalPointerArray = (real**) malloc(l_numberOfCopies*sizeof(real*)); m_globalDataArray = (GlobalData**) malloc(l_numberOfCopies*sizeof(GlobalData*)); // @TODO: for NUMA we need to bind this for (unsigned int l_globalDataCount = 0; l_globalDataCount < l_numberOfCopies; l_globalDataCount++) { #ifdef USE_HBM_GLOBALDATA hbw_posix_memalign( (void**) &(m_globalPointerArray[l_globalDataCount]), 2097152, l_globalMatrices ); #else posix_memalign( (void**) &(m_globalPointerArray[l_globalDataCount]), 2097152, l_globalMatrices ); #endif m_globalPointer = m_globalPointerArray[l_globalDataCount]; m_globalDataArray[l_globalDataCount] = (GlobalData*) malloc(sizeof(GlobalData)); m_globalData = m_globalDataArray[l_globalDataCount]; for (unsigned int i = 0; i < (l_globalMatrices/sizeof(real)); i++) { m_globalPointer[i] = (real)libxsmm_rng_f64(); } real* tmp_pointer = m_globalPointer; // stiffness for time integration for( unsigned int l_transposedStiffnessMatrix = 0; l_transposedStiffnessMatrix < 3; l_transposedStiffnessMatrix++ ) { m_globalData->stiffnessMatricesTransposed[l_transposedStiffnessMatrix] = tmp_pointer; tmp_pointer += seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER-1 ) * NUMBER_OF_BASIS_FUNCTIONS; } // stiffness for volume integration for( unsigned int l_stiffnessMatrix = 0; l_stiffnessMatrix < 3; l_stiffnessMatrix++ ) { m_globalData->stiffnessMatrices[l_stiffnessMatrix] = tmp_pointer; tmp_pointer += NUMBER_OF_ALIGNED_BASIS_FUNCTIONS * seissol::kernels::getNumberOfBasisFunctions( CONVERGENCE_ORDER-1 ); } // flux matrices for boundary integration for( unsigned int l_fluxMatrix = 0; l_fluxMatrix < 52; l_fluxMatrix++ ) { m_globalData->fluxMatrices[l_fluxMatrix] = tmp_pointer; tmp_pointer += NUMBER_OF_ALIGNED_BASIS_FUNCTIONS * NUMBER_OF_BASIS_FUNCTIONS; } } // set default to first chunk m_globalPointer = m_globalPointerArray[0]; 
m_globalData = m_globalDataArray[0]; if (bUseScenario == true ) { free(scenario_faceType); free(scenario_neighbor); free(scenario_side); free(scenario_orientation); } return i_cells; } void free_data_structures() { unsigned int l_numberOfThreads = 1; #ifdef _OPENMP #pragma omp parallel { #pragma omp master { l_numberOfThreads = omp_get_num_threads(); } } #endif unsigned int l_numberOfCopiesCeil = (l_numberOfThreads%NUMBER_OF_THREADS_PER_GLOBALDATA_COPY == 0) ? 0 : 1; unsigned int l_numberOfCopies = (l_numberOfThreads/NUMBER_OF_THREADS_PER_GLOBALDATA_COPY) + l_numberOfCopiesCeil; for (unsigned int l_globalDataCount = 0; l_globalDataCount < l_numberOfCopies; l_globalDataCount++) { m_globalData = m_globalDataArray[l_globalDataCount]; free(m_globalData); } free(m_globalDataArray); free(m_cellInformation); free(m_cellData); free(m_cells); #ifdef USE_HBM_CELLLOCAL_LOCAL hbw_free(m_localIntegration); #else free(m_localIntegration); #endif #ifdef USE_HBM_CELLLOCAL_NEIGH hbw_free(m_neighboringIntegration); #else free(m_neighboringIntegration); #endif #ifdef USE_HBM_DOFS hbw_free(m_dofs); #else free(m_dofs); #endif #ifdef USE_HBM_TDOFS hbw_free(m_tdofs); #else free(m_tdofs); #endif #ifdef __USE_DERS #ifdef USE_HBM_DERS hbw_free(m_ders); #else free(m_ders); #endif #endif free(m_ptdofs); free(m_pder); free(m_faceNeighbors); for (unsigned int l_globalDataCount = 0; l_globalDataCount < l_numberOfCopies; l_globalDataCount++) { m_globalPointer = m_globalPointerArray[l_globalDataCount]; #ifdef USE_HBM_GLOBALDATA hbw_free(m_globalPointer); #else free(m_globalPointer); #endif } free(m_globalPointerArray); } #endif /*PROXY_SEISSOL_ALLOCATOR_HPP*/ libxsmm-1.17/samples/seissol/proxy_seissol_bytes.hpp000066400000000000000000000077731415223013700231170ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. 
* * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ #ifndef PROXY_SEISSOL_BYTES_HPP #define PROXY_SEISSOL_BYTES_HPP double bytes_ader(unsigned int i_timesteps) { double d_elems = (double)m_cells->numberOfCells; double d_timesteps = (double)i_timesteps; double bytes = 0; // DOFs load and tDOFs write #ifdef __USE_DERS for (int o = CONVERGENCE_ORDER; o > 0; o--) { bytes += (double)sizeof(real) * 1.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( o ) * (double)NUMBER_OF_QUANTITIES; } #endif bytes += (double)sizeof(real) * 3.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER ) * (double)NUMBER_OF_QUANTITIES; // star bytes += (double)sizeof(real) * 3.0 * (double)STAR_NNZ; bytes *= d_elems; bytes *= d_timesteps; return bytes; } double bytes_vol(unsigned int i_timesteps) { double d_elems = (double)m_cells->numberOfCells; double d_timesteps = (double)i_timesteps; double bytes = 0; // tDOFs load, DOFs write bytes += (double)sizeof(real) * 3.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER ) * (double)NUMBER_OF_QUANTITIES; // star bytes += (double)sizeof(real) * 3.0 * (double)STAR_NNZ; bytes *= d_elems; bytes *= d_timesteps; return bytes; } double bytes_bndlocal(unsigned int i_timesteps) { double d_elems = (double)m_cells->numberOfCells; double d_timesteps = (double)i_timesteps; double bytes = 0; // tDOFs load, DOFs write bytes += (double)sizeof(real) * 3.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER ) * (double)NUMBER_OF_QUANTITIES; // flux bytes += (double)sizeof(real) * 4.0 * (double)NUMBER_OF_QUANTITIES * (double)NUMBER_OF_QUANTITIES; bytes *= d_elems; bytes *= d_timesteps; return bytes; } double bytes_local(unsigned int i_timesteps) { double d_elems = 
(double)m_cells->numberOfCells; double d_timesteps = (double)i_timesteps; double bytes = 0; // DOFs load, tDOFs sum of ader, vol, boundary bytes += (double)sizeof(real) * 3.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER ) * (double)NUMBER_OF_QUANTITIES; // star bytes += (double)sizeof(real) * 3.0 * (double)STAR_NNZ; // flux solver bytes += (double)sizeof(real) * 4.0 * (double)NUMBER_OF_QUANTITIES * (double)NUMBER_OF_QUANTITIES; bytes *= d_elems; bytes *= d_timesteps; return bytes; } double bytes_bndneigh(unsigned int i_timesteps) { double d_elems = (double)m_cells->numberOfCells; double d_timesteps = (double)i_timesteps; double bytes = 0; // 4 tDOFs/DERs load, DOFs write #ifdef __USE_DERS bytes += (double)sizeof(real) * 2.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER ) * (double)NUMBER_OF_QUANTITIES; // load neighbors' DERs for (int o = CONVERGENCE_ORDER; o > 0; o--) { bytes += (double)sizeof(real) * 4.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( o ) * (double)NUMBER_OF_QUANTITIES; } #else bytes += (double)sizeof(real) * 6.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER ) * (double)NUMBER_OF_QUANTITIES; #endif // flux bytes += (double)sizeof(real) * 4.0 * (double)NUMBER_OF_QUANTITIES * (double)NUMBER_OF_QUANTITIES; bytes *= d_elems; bytes *= d_timesteps; return bytes; } double bytes_all(unsigned int i_timesteps) { return (bytes_local(i_timesteps) + bytes_bndneigh(i_timesteps)); } #endif /*PROXY_SEISSOL_BYTES_HPP*/ libxsmm-1.17/samples/seissol/proxy_seissol_flops.hpp000066400000000000000000000156251415223013700231070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ #ifndef PROXY_SEISSOL_FLOPS_HPP #define PROXY_SEISSOL_FLOPS_HPP typedef struct seissol_flops { double d_nonZeroFlops; double d_hardwareFlops; } seissol_flops; seissol_flops flops_ader_actual(unsigned int i_timesteps) { seissol_flops ret; ret.d_nonZeroFlops = 0.0; ret.d_hardwareFlops = 0.0; // iterate over cells for( unsigned int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) { unsigned int l_nonZeroFlops, l_hardwareFlops; // get flops m_timeKernel.flopsAder( l_nonZeroFlops, l_hardwareFlops ); ret.d_nonZeroFlops += (double)l_nonZeroFlops; ret.d_hardwareFlops += (double)l_hardwareFlops; } ret.d_nonZeroFlops *= (double)i_timesteps; ret.d_hardwareFlops *= (double)i_timesteps; return ret; } seissol_flops flops_vol_actual(unsigned int i_timesteps) { seissol_flops ret; ret.d_nonZeroFlops = 0.0; ret.d_hardwareFlops = 0.0; // iterate over cells for( unsigned int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) { unsigned int l_nonZeroFlops, l_hardwareFlops; // get flops m_volumeKernel.flopsIntegral( l_nonZeroFlops, l_hardwareFlops ); ret.d_nonZeroFlops += (double)l_nonZeroFlops; ret.d_hardwareFlops += (double)l_hardwareFlops; } ret.d_nonZeroFlops *= (double)i_timesteps; ret.d_hardwareFlops *= (double)i_timesteps; return ret; } seissol_flops flops_bndlocal_actual(unsigned int i_timesteps) { seissol_flops ret; ret.d_nonZeroFlops = 0.0; ret.d_hardwareFlops = 0.0; // iterate over cells for( unsigned int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) { unsigned int l_nonZeroFlops, l_hardwareFlops; // get flops m_boundaryKernel.flopsLocalIntegral( m_cellInformation[l_cell].faceTypes, l_nonZeroFlops, l_hardwareFlops ); ret.d_nonZeroFlops += (double)l_nonZeroFlops; ret.d_hardwareFlops += (double)l_hardwareFlops; } ret.d_nonZeroFlops *= (double)i_timesteps; ret.d_hardwareFlops *= 
(double)i_timesteps; return ret; } seissol_flops flops_bndneigh_actual(unsigned int i_timesteps) { seissol_flops ret; ret.d_nonZeroFlops = 0.0; ret.d_hardwareFlops = 0.0; // iterate over cells for( unsigned int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) { unsigned int l_nonZeroFlops, l_hardwareFlops; // get flops m_boundaryKernel.flopsNeighborsIntegral( m_cellInformation[l_cell].faceTypes, m_cellInformation[l_cell].faceRelations, l_nonZeroFlops, l_hardwareFlops ); ret.d_nonZeroFlops += (double)l_nonZeroFlops; ret.d_hardwareFlops += (double)l_hardwareFlops; } ret.d_nonZeroFlops *= (double)i_timesteps; ret.d_hardwareFlops *= (double)i_timesteps; return ret; } double flops_ader(unsigned int i_timesteps) { double d_elems = (double)m_cells->numberOfCells; double d_timesteps = (double)i_timesteps; double flops = 0; for (unsigned int o = CONVERGENCE_ORDER; o > 1; o--) { // stiffness flops += 6.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( o-1 ) * (double)seissol::kernels::getNumberOfBasisFunctions( o ) * (double)NUMBER_OF_QUANTITIES; // star flops += 6.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( o-1 ) * (double)STAR_NNZ; // integration flops += 2.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( o-1 ) * (double)NUMBER_OF_QUANTITIES; } flops *= d_elems; flops *= d_timesteps; return flops; } double flops_vol(unsigned int i_timesteps) { double d_elems = (double)m_cells->numberOfCells; double d_timesteps = (double)i_timesteps; double flops = 0; // stiffness flops += 6.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER ) * (double)seissol::kernels::getNumberOfBasisFunctions( CONVERGENCE_ORDER-1 ) * (double)NUMBER_OF_QUANTITIES; // star flops += 6.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER ) * (double)STAR_NNZ; flops *= d_elems; flops *= d_timesteps; return flops; } double flops_bndlocal(unsigned int i_timesteps) { double d_elems = 
(double)m_cells->numberOfCells; double d_timesteps = (double)i_timesteps; double flops = 0; // flux flops += 8.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER ) * (double)seissol::kernels::getNumberOfBasisFunctions( CONVERGENCE_ORDER ) * (double)NUMBER_OF_QUANTITIES; // flux solver flops += 8.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER ) * (double)NUMBER_OF_QUANTITIES * (double)NUMBER_OF_QUANTITIES; flops *= d_elems; flops *= d_timesteps; return flops; } double flops_bndneigh(unsigned int i_timesteps) { double d_elems = (double)m_cells->numberOfCells; double d_timesteps = (double)i_timesteps; double flops = 0; // flux flops += 8.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER ) * (double)seissol::kernels::getNumberOfBasisFunctions( CONVERGENCE_ORDER ) * (double)NUMBER_OF_QUANTITIES; // flux solver flops += 8.0 * (double)seissol::kernels::getNumberOfAlignedBasisFunctions( CONVERGENCE_ORDER ) * (double)NUMBER_OF_QUANTITIES * (double)NUMBER_OF_QUANTITIES; flops *= d_elems; flops *= d_timesteps; return flops; } double flops_local(unsigned int i_timesteps) { return (flops_ader(i_timesteps) + flops_vol(i_timesteps) + flops_bndlocal(i_timesteps)); } double flops_all(unsigned int i_timesteps) { return (flops_local(i_timesteps) + flops_bndneigh(i_timesteps)); } seissol_flops flops_local_actual(unsigned int i_timesteps) { seissol_flops ret; seissol_flops tmp; tmp = flops_ader_actual(i_timesteps); ret.d_nonZeroFlops = tmp.d_nonZeroFlops; ret.d_hardwareFlops = tmp.d_hardwareFlops; tmp = flops_vol_actual(i_timesteps); ret.d_nonZeroFlops += tmp.d_nonZeroFlops; ret.d_hardwareFlops += tmp.d_hardwareFlops; tmp = flops_bndlocal_actual(i_timesteps); ret.d_nonZeroFlops += tmp.d_nonZeroFlops; ret.d_hardwareFlops += tmp.d_hardwareFlops; return ret; } seissol_flops flops_all_actual(unsigned int i_timesteps) { seissol_flops ret; seissol_flops tmp; tmp = 
flops_local_actual(i_timesteps); ret.d_nonZeroFlops = tmp.d_nonZeroFlops; ret.d_hardwareFlops = tmp.d_hardwareFlops; tmp = flops_bndneigh_actual(i_timesteps); ret.d_nonZeroFlops += tmp.d_nonZeroFlops; ret.d_hardwareFlops += tmp.d_hardwareFlops; return ret; } #endif /*PROXY_SEISSOL_FLOPS_HPP*/ libxsmm-1.17/samples/seissol/proxy_seissol_integrators.hpp000066400000000000000000000237741415223013700243310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ #ifndef PROXY_SEISSOL_INTEGRATORS_HPP #define PROXY_SEISSOL_INTEGRATORS_HPP #if defined(_OPENMP) # include #endif void computeAderIntegration() { #ifdef _OPENMP # pragma omp parallel { #if NUMBER_OF_THREADS_PER_GLOBALDATA_COPY < 512 //GlobalData* l_globalData = m_globalDataArray[omp_get_thread_num()/NUMBER_OF_THREADS_PER_GLOBALDATA_COPY]; GlobalData* l_globalData = m_globalDataArray[0]; #else GlobalData* l_globalData = m_globalData; #endif #pragma omp for schedule(static) #else GlobalData* l_globalData = m_globalData; #endif for( unsigned int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) { m_timeKernel.computeAder( m_timeStepWidthSimulation, l_globalData->stiffnessMatricesTransposed, m_cells->dofs[l_cell], m_cellData->localIntegration[l_cell].starMatrices, m_cells->buffers[l_cell], m_cells->derivatives[l_cell] ); } #ifdef _OPENMP } #endif } void computeVolumeIntegration() { #ifdef _OPENMP # pragma omp parallel { #if NUMBER_OF_THREADS_PER_GLOBALDATA_COPY < 512 //GlobalData* l_globalData = m_globalDataArray[omp_get_thread_num()/NUMBER_OF_THREADS_PER_GLOBALDATA_COPY]; GlobalData* l_globalData = m_globalDataArray[0]; 
#else GlobalData* l_globalData = m_globalData; #endif #pragma omp for schedule(static) #else GlobalData* l_globalData = m_globalData; #endif for( unsigned int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) { m_volumeKernel.computeIntegral( l_globalData->stiffnessMatrices, m_cells->buffers[l_cell], m_cellData->localIntegration[l_cell].starMatrices, m_cells->dofs[l_cell] ); } #ifdef _OPENMP } #endif } void computeLocalBoundaryIntegration() { #ifdef _OPENMP #pragma omp parallel { #if NUMBER_OF_THREADS_PER_GLOBALDATA_COPY < 512 //GlobalData* l_globalData = m_globalDataArray[omp_get_thread_num()/NUMBER_OF_THREADS_PER_GLOBALDATA_COPY]; GlobalData* l_globalData = m_globalDataArray[0]; #else GlobalData* l_globalData = m_globalData; #endif #pragma omp for schedule(static) #else GlobalData* l_globalData = m_globalData; #endif for( unsigned int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) { m_boundaryKernel.computeLocalIntegral( m_cellInformation[l_cell].faceTypes, l_globalData->fluxMatrices, m_cells->buffers[l_cell], m_cellData->localIntegration[l_cell].nApNm1, #ifdef ENABLE_STREAM_MATRIX_PREFETCH m_cells->dofs[l_cell], m_cells->buffers[l_cell+1], m_cells->dofs[l_cell+1] ); #else m_cells->dofs[l_cell] ); #endif } #ifdef _OPENMP } #endif } void computeLocalIntegration() { #ifdef _OPENMP #pragma omp parallel { #if NUMBER_OF_THREADS_PER_GLOBALDATA_COPY < 512 //GlobalData* l_globalData = m_globalDataArray[omp_get_thread_num()/NUMBER_OF_THREADS_PER_GLOBALDATA_COPY]; GlobalData* l_globalData = m_globalDataArray[0]; #else GlobalData* l_globalData = m_globalData; #endif #pragma omp for schedule(static) #else GlobalData* l_globalData = m_globalData; #endif for( unsigned int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) { m_timeKernel.computeAder( (double)m_timeStepWidthSimulation, l_globalData->stiffnessMatricesTransposed, m_cells->dofs[l_cell], m_cellData->localIntegration[l_cell].starMatrices, m_cells->buffers[l_cell], m_cells->derivatives[l_cell] ); 
m_volumeKernel.computeIntegral( l_globalData->stiffnessMatrices, m_cells->buffers[l_cell], m_cellData->localIntegration[l_cell].starMatrices, m_cells->dofs[l_cell] ); m_boundaryKernel.computeLocalIntegral( m_cellInformation[l_cell].faceTypes, l_globalData->fluxMatrices, m_cells->buffers[l_cell], m_cellData->localIntegration[l_cell].nApNm1, #ifdef ENABLE_STREAM_MATRIX_PREFETCH m_cells->dofs[l_cell], m_cells->buffers[l_cell+1], m_cells->dofs[l_cell+1] ); #else m_cells->dofs[l_cell] ); #endif } #ifdef _OPENMP } #endif } void computeNeighboringIntegration() { real l_integrationBuffer[4][NUMBER_OF_ALIGNED_DOFS] __attribute__((aligned(4096))); real *l_timeIntegrated[4]; #ifdef ENABLE_MATRIX_PREFETCH real *l_faceNeighbors_prefetch[4]; real *l_fluxMatricies_prefetch[4]; #endif #ifdef _OPENMP #ifdef ENABLE_MATRIX_PREFETCH #pragma omp parallel private(l_integrationBuffer, l_timeIntegrated, l_faceNeighbors_prefetch, l_fluxMatricies_prefetch) #else #pragma omp parallel private(l_integrationBuffer, l_timeIntegrated) #endif { #if NUMBER_OF_THREADS_PER_GLOBALDATA_COPY < 512 GlobalData* l_globalData = m_globalDataArray[omp_get_thread_num()/NUMBER_OF_THREADS_PER_GLOBALDATA_COPY]; #else GlobalData* l_globalData = m_globalData; #endif #pragma omp for schedule(static) #else GlobalData* l_globalData = m_globalData; #endif for( int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) { m_timeKernel.computeIntegrals( m_cellInformation[l_cell].ltsSetup, m_cellInformation[l_cell].faceTypes, m_cellInformation[l_cell].currentTime, (double)m_timeStepWidthSimulation, m_cells->faceNeighbors[l_cell], l_integrationBuffer, l_timeIntegrated ); #ifdef ENABLE_MATRIX_PREFETCH #pragma message("the current prefetch structure (flux matrices and tDOFs is tuned for higher order and shouldn't be harmful for lower orders") int l_face = 1; l_faceNeighbors_prefetch[0] = m_cells->faceNeighbors[l_cell][l_face]; l_fluxMatricies_prefetch[0] = l_globalData->fluxMatrices[4+(l_face*12) 
+(m_cellInformation[l_cell].faceRelations[l_face][0]*3) +(m_cellInformation[l_cell].faceRelations[l_face][1])]; l_face = 2; l_faceNeighbors_prefetch[1] = m_cells->faceNeighbors[l_cell][l_face]; l_fluxMatricies_prefetch[1] = l_globalData->fluxMatrices[4+(l_face*12) +(m_cellInformation[l_cell].faceRelations[l_face][0]*3) +(m_cellInformation[l_cell].faceRelations[l_face][1])]; l_face = 3; l_faceNeighbors_prefetch[2] = m_cells->faceNeighbors[l_cell][l_face]; l_fluxMatricies_prefetch[2] = l_globalData->fluxMatrices[4+(l_face*12) +(m_cellInformation[l_cell].faceRelations[l_face][0]*3) +(m_cellInformation[l_cell].faceRelations[l_face][1])]; l_face = 0; if (l_cell < (m_cells->numberOfCells-1) ) { l_faceNeighbors_prefetch[3] = m_cells->faceNeighbors[l_cell+1][l_face]; l_fluxMatricies_prefetch[3] = l_globalData->fluxMatrices[4+(l_face*12) +(m_cellInformation[l_cell+1].faceRelations[l_face][0]*3) +(m_cellInformation[l_cell+1].faceRelations[l_face][1])]; } else { l_faceNeighbors_prefetch[3] = m_cells->faceNeighbors[l_cell][3]; l_fluxMatricies_prefetch[3] = l_globalData->fluxMatrices[4+(3*12) +(m_cellInformation[l_cell].faceRelations[l_face][0]*3) +(m_cellInformation[l_cell].faceRelations[l_face][1])]; } #endif m_boundaryKernel.computeNeighborsIntegral( m_cellInformation[l_cell].faceTypes, m_cellInformation[l_cell].faceRelations, l_globalData->fluxMatrices, l_timeIntegrated, m_cellData->neighboringIntegration[l_cell].nAmNm1, #ifdef ENABLE_MATRIX_PREFETCH m_cells->dofs[l_cell], l_faceNeighbors_prefetch, l_fluxMatricies_prefetch ); #else m_cells->dofs[l_cell]); #endif } #ifdef _OPENMP } #endif } #endif /*PROXY_SEISSOL_INTEGRATORS_HPP*/ libxsmm-1.17/samples/smm/000077500000000000000000000000001415223013700153535ustar00rootroot00000000000000libxsmm-1.17/samples/smm/Makefile000066400000000000000000000112171415223013700170150ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . 
CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = -DLIBXSMM_BLAS_CONST # Fortran code here does not allow for PEDANTIC=2 override PEDANTIC = 1 MKL_DIRECT = 1 OMP = 1 SYM = 1 ifneq (0,$(OMP)) BLAS = 2 else BLAS = 1 endif # explore AVX/ARCH=native SSE = 0 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) $(OUTDIR)/blas $(OUTDIR)/dispatched $(OUTDIR)/inlined 
$(OUTDIR)/specialized .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (,$(strip $(FC))) $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(FTNOBJS) $(FORTDEP) $(LIBDEP) $(FLD) -o $@ $(FTNOBJS) $(FORTLIB) $(MAINLIB) $(FCMTFLAGS) $(SLDFLAGS) $(LDFLAGS) $(FLDFLAGS) $(ELDFLAGS) else .PHONY: $(OUTDIR)/$(OUTNAME) endif $(OUTDIR)/blas: $(BLDDIR)/blas-cpp.o $(OUTDIR)/.make $(DEPDIR)/include/libxsmm_source.h $(LD) -o $@ $< $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/dispatched: $(BLDDIR)/dispatched-cpp.o $(OUTDIR)/.make $(LIBDEP) $(EXTDEP) $(LD) -o $@ $< $(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/inlined: $(BLDDIR)/inlined-cpp.o $(OUTDIR)/.make $(DEPDIR)/include/libxsmm_source.h $(LD) -o $@ $< $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/specialized: $(BLDDIR)/specialized-cpp.o $(OUTDIR)/.make $(LIBDEP) $(EXTDEP) $(LD) -o $@ $< $(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ ifneq (,$(strip $(FC))) $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ endif .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq 
($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/smm/README.md000066400000000000000000000051051415223013700166330ustar00rootroot00000000000000# SMM Sample Collection This collection of code samples exercises different memory streaming cases when performing the matrix multiplication *Cm x n = alpha · Am x k · Bk x n + beta · Cm x n*: (1) streaming the matrices A, B, and C which is usually referred as batched matrix multiplication, (2) streaming the inputs A and B but accumulating C within cache, (3) streaming the A and C matrices while B is kept in cache, (4) streaming the B and C matrices while A is kept in cache, and (4) not streaming any of the operands but repeating the very same multiplication until the requested number of matrix multiplications has been completed. Beside of measuring the duration of a test case, the performance is presented in GFLOPS/s. As an alternative metric, the memory bandwidth is given (the artificial "cached" case omits to present the cache-memory bandwidth). The "pseudo-performance" given in FLOPS/cycle is an artificial scoring, it not only uses a non-standard formula for calculating the FLOPS (*2 \* M \* N \* K - M \* N* rather than *2 \* M \* N \* K*) but also relies on (pseudo-)clock cycles: ``` $ ./specialized.sh 0 m=32 n=32 k=32 size=87381 memory=2048.0 MB (DP) Batched (A,B,C)... 
pseudo-perf.: 10.7 FLOPS/cycle performance: 23.9 GFLOPS/s bandwidth: 11.1 GB/s duration: 239 ms Finished ``` There are two sub collections of samples codes: (1) a collection of C++ code samples showing either BLAS, Compiler-generated code (inlined code), LIBXSMM/dispatched, LIBXSMM/specialized functions to carry out the multiplication, and (2) a Fortran sample code showing BLAS versus LIBXSMM including some result validation. **C/C++ Code Samples: Command Line Interface (CLI)** * Takes an optional number (1st arg.) to select the streaming-case (0...8) * Optionally takes the M, N, and K parameter of the GEMM in this order * If only M is supplied, the N and K "inherit" the M-value * Example I (A,B,C): ./specialized.sh 0 16 8 9 * Example II (A,B): ./specialized.sh 6 16 **Fortran Code Sample: Command Line Interface (CLI)** * Optionally takes the M, N, and K parameter of the GEMM in this order * Optional problem size (in MB) of the workload; M/N/K must have been supplied * Optional total problem size (in MB) implying the number of repeated run * If only M is supplied, the N and K are "inheriting" the M-value * Shows the performance of each of the streaming cases * Example I: ./smm.sh 16 8 9 1024 16384 * Example II: ./smm.sh 16 libxsmm-1.17/samples/smm/blas.cpp000066400000000000000000000616161415223013700170120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #include #include #include #include #if defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL) # include # include #endif #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if 0 /* enable padding on a per-matrix basis */ # define PAD(TYPE, VALUE) (LIBXSMM_UP2((VALUE) * sizeof(TYPE), LIBXSMM_ALIGNMENT) / sizeof(TYPE)) #else # define PAD(TYPE, VALUE) (VALUE) #endif #if !defined(RANDOMIZED) && 0 # define RANDOMIZED #endif #if !defined(ITYPE) # define ITYPE double #endif #if !defined(OTYPE) # define OTYPE ITYPE #endif #if (LIBXSMM_EQUAL(ITYPE, float) || LIBXSMM_EQUAL(ITYPE, double)) \ && !defined(MKL_DIRECT_CALL_SEQ) && !defined(MKL_DIRECT_CALL) LIBXSMM_BLAS_SYMBOL_DECL(ITYPE, gemm) #endif int main(int argc, char* argv[]) { int result = EXIT_SUCCESS; try { #if defined(__BLAS) && (0 == __BLAS) LIBXSMM_UNUSED(argc); LIBXSMM_UNUSED(argv); throw "LAPACK/BLAS library must be available for this sample code!"; #else /* BLAS available */ const libxsmm_blasint benchmark = (1 < argc ? std::atoi(argv[1]) : 0); LIBXSMM_BLAS_CONST libxsmm_blasint m = (2 < argc ? std::atoi(argv[2]) : 23); LIBXSMM_BLAS_CONST libxsmm_blasint k = (4 < argc ? std::atoi(argv[4]) : m); LIBXSMM_BLAS_CONST libxsmm_blasint n = (3 < argc ? std::atoi(argv[3]) : k); const libxsmm_blasint q = (5 < argc ? std::atoi(argv[5]) : 0/*auto*/); const libxsmm_blasint nrepeat = (6 < argc ? std::atoi(argv[6]) : (0 >= q ? 
13 : 1)); LIBXSMM_BLAS_CONST libxsmm_blasint lda = m, ldb = k, ldc = m; LIBXSMM_BLAS_CONST char transa = 'N', transb = 'N'; LIBXSMM_BLAS_CONST OTYPE alpha = 1, beta = 1; const libxsmm_blasint asize = PAD(ITYPE, lda * k), bsize = PAD(ITYPE, ldb * n), csize = PAD(OTYPE, ldc * n); const libxsmm_blasint max_size = ((2ULL << 30/*2 GB*/) / ((static_cast(asize) + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))); const libxsmm_blasint s = LIBXSMM_MIN(0 < q ? q : max_size, max_size); const libxsmm_blasint aspace = LIBXSMM_ALIGNMENT / sizeof(ITYPE); const size_t bwsize = (static_cast(asize)/*load*/ + static_cast(bsize)/*load*/) * sizeof(ITYPE) + (sizeof(OTYPE) * static_cast(csize) * 2/*RFO*/); const double gflops = 2E-9 * s * m * n * k; #if LIBXSMM_TYPEINFO(ITYPE, FP) const char ops[] = "FLOPS"; const double scale = 1.0 / s; #else const char ops[] = "OPS"; const double scale = 1; #endif #if defined(_OPENMP) && !defined(_DEBUG) const char *const env_check = getenv("CHECK"); const int check = (NULL == env_check ? 0 : atoi(env_check)); #elif (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION2(11, 3) <= INTEL_MKL_VERSION) /*const*/ int check = 1; #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { #if defined(_OPENMP) const libxsmm_blasint chunksize = s / omp_get_max_threads(); #endif struct raii { // avoid std::vector (first-touch init. 
causes NUMA issue) ITYPE *a, *b; OTYPE *c, *d; size_t m_size, m_shuffle; raii(libxsmm_blasint asize_, libxsmm_blasint bsize_, libxsmm_blasint csize_, libxsmm_blasint size_) : a(new ITYPE[static_cast(asize_)]), b(new ITYPE[static_cast(bsize_)]) , c(new OTYPE[static_cast(csize_)]), d(new OTYPE[static_cast(csize_)]) , m_size(static_cast(size_)), m_shuffle(libxsmm_shuffle(static_cast(size_))) {} ~raii() { delete[] a; delete[] b; delete[] c; delete[] d; } #if defined(RANDOMIZED) libxsmm_blasint shuffle(libxsmm_blasint i) const { return (i * m_shuffle) % m_size; } #else libxsmm_blasint shuffle(libxsmm_blasint i) const { return i; } #endif } helper(s * asize + aspace - 1, s * bsize + aspace - 1, s * csize + aspace - 1, s); ITYPE *const a = LIBXSMM_ALIGN(helper.a, LIBXSMM_ALIGNMENT); ITYPE *const b = LIBXSMM_ALIGN(helper.b, LIBXSMM_ALIGNMENT); OTYPE *const c = LIBXSMM_ALIGN(helper.c, LIBXSMM_ALIGNMENT); OTYPE *const d = LIBXSMM_ALIGN(helper.d, LIBXSMM_ALIGNMENT); #if defined(_OPENMP) const int nthreads = omp_get_max_threads(); # pragma omp parallel for num_threads(nthreads) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_MATINIT(ITYPE, 42 + helper.shuffle(i), a + static_cast(asize) * helper.shuffle(i), m, k, lda, scale); LIBXSMM_MATINIT(ITYPE, 24 + helper.shuffle(i), b + static_cast(bsize) * helper.shuffle(i), k, n, ldb, scale); LIBXSMM_MATINIT(OTYPE, 22 + i, c + static_cast(csize) * i, m, n, ldc, scale); LIBXSMM_MATINIT(OTYPE, 22 + i, d + static_cast(csize) * i, m, n, ldc, scale); } #if defined(MKL_ENABLE_AVX512) mkl_enable_instructions(MKL_ENABLE_AVX512); #endif // initialize LIBXSMM libxsmm_init(); fprintf(stdout, "m=%lli n=%lli k=%lli size=%lli memory=%.1f MB (input=%s output=%s)\n\n", static_cast(m), static_cast(n), static_cast(k), static_cast(s), 1.0 * (s * ((static_cast(asize) + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))) / (1ULL << 20), LIBXSMM_TYPENAME(ITYPE), LIBXSMM_TYPENAME(OTYPE)); // LAPACK/BLAS3 (warm-up BLAS Library) #if 
defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a + static_cast(asize) * helper.shuffle(i), &lda, b + static_cast(bsize) * helper.shuffle(i), &ldb, &beta, c + static_cast(csize) * i, &ldc); } #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION2(11, 3) <= INTEL_MKL_VERSION) std::vector va_array(static_cast(s)), vb_array(static_cast(s)); std::vector vc_array(static_cast(s)); const ITYPE* *const a_array = &va_array[0]; const ITYPE* *const b_array = &vb_array[0]; OTYPE* *const c_array = &vc_array[0]; const libxsmm_blasint group_count = 1; for (libxsmm_blasint i = 0; i < s; ++i) { // setup batched (A,B,C) a_array[i] = a + static_cast(asize) * helper.shuffle(i); b_array[i] = b + static_cast(bsize) * helper.shuffle(i); c_array[i] = d + static_cast(csize) * i; } // additional warm-up (also to eventually match the Gold result) LIBXSMM_TPREFIX(ITYPE,gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); #endif switch (benchmark) { case 0: { // batched fprintf(stdout, "Batched (A,B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a + static_cast(asize) * helper.shuffle(i), &lda, b + static_cast(bsize) * helper.shuffle(i), &ldb, &beta, c + static_cast(csize) * i, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, 
ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION2(11, 3) <= INTEL_MKL_VERSION) case 1: { // batched indirect fprintf(stdout, "Indirect (A,B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE,gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (0 == (benchmark & 1) && 0 != check) { /* Gold result is available */ libxsmm_matdiff_info diff; libxsmm_matdiff_clear(&diff); for (libxsmm_blasint h = 0; h < s; ++h) { const OTYPE *const u = c + static_cast(csize) * h, *const v = c_array[h]; libxsmm_matdiff_info dv; result = libxsmm_matdiff(&dv, LIBXSMM_DATATYPE(OTYPE), m, n, u, v, &ldc, &ldc); if (EXIT_SUCCESS == result) libxsmm_matdiff_reduce(&diff, &dv); } fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < diff.l2_rel) { fprintf(stderr, "FAILED.\n"); result = EXIT_FAILURE; } } } #endif break; case 2: { // streaming A and C fprintf(stdout, "Streamed (A,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { 
#if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a + static_cast(asize) * helper.shuffle(i), &lda, b, &ldb, &beta, c + static_cast(csize) * i, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION2(11, 3) <= INTEL_MKL_VERSION) case 3: { // indirect A and C fprintf(stdout, "Indirect (A,C)...\n"); for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a + static_cast(asize) * helper.shuffle(i); b_array[i] = b; c_array[i] = d + static_cast(csize) * i; } const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, 
"\tduration: %.0f ms\n", 1000.0 * duration); if (0 == (benchmark & 1) && 0 != check) { /* Gold result is available */ libxsmm_matdiff_info diff; libxsmm_matdiff_clear(&diff); for (libxsmm_blasint h = 0; h < s; ++h) { const OTYPE *const u = c + static_cast(csize) * h, *const v = c_array[h]; libxsmm_matdiff_info dv; result = libxsmm_matdiff(&dv, LIBXSMM_DATATYPE(OTYPE), m, n, u, v, &ldc, &ldc); if (EXIT_SUCCESS == result) libxsmm_matdiff_reduce(&diff, &dv); } fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < diff.l2_rel) { fprintf(stderr, "FAILED.\n"); result = EXIT_FAILURE; } } } #endif break; case 4: { // streaming B and C fprintf(stdout, "Streamed (B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b + static_cast(bsize) * helper.shuffle(i), &ldb, &beta, c + static_cast(csize) * i, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION2(11, 3) <= INTEL_MKL_VERSION) case 5: { // indirect B and C fprintf(stdout, "Indirect (B,C)...\n"); for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a; b_array[i] = b + static_cast(bsize) * 
helper.shuffle(i); c_array[i] = d + static_cast(csize) * i; } const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (0 == (benchmark & 1) && 0 != check) { /* Gold result is available */ libxsmm_matdiff_info diff; libxsmm_matdiff_clear(&diff); for (libxsmm_blasint h = 0; h < s; ++h) { const OTYPE *const u = c + static_cast(csize) * h, *const v = c_array[h]; libxsmm_matdiff_info dv; result = libxsmm_matdiff(&dv, LIBXSMM_DATATYPE(OTYPE), m, n, u, v, &ldc, &ldc); if (EXIT_SUCCESS == result) libxsmm_matdiff_reduce(&diff, &dv); } fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < diff.l2_rel) { fprintf(stderr, "FAILED.\n"); result = EXIT_FAILURE; } } } #endif break; case 6: { // streaming A and B fprintf(stdout, "Streamed (A,B)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for num_threads(0 == check ? 
nthreads : 1) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { libxsmm_blasint j = 0; #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ if (0 == check) j = omp_get_thread_num() * chunksize * csize; #endif LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a + static_cast(asize) * helper.shuffle(i), &lda, b + static_cast(bsize) * helper.shuffle(i), &ldb, &beta, c + j, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - sizeof(OTYPE) * csize * 2) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION2(11, 3) <= INTEL_MKL_VERSION) case 7: { // indirect A and B fprintf(stdout, "Indirect (A,B)...\n"); #if defined(_OPENMP) # pragma omp parallel for num_threads(0 == check ? nthreads : 1) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a + static_cast(asize) * helper.shuffle(i); b_array[i] = b + static_cast(bsize) * helper.shuffle(i); #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ if (0 == check) { c_array[i] = d + static_cast(csize) * chunksize * omp_get_thread_num(); } else #endif c_array[i] = d; } #if defined(_OPENMP) omp_set_num_threads(0 == check ? 
nthreads : 1); #endif const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - sizeof(OTYPE) * csize * 2) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (0 == (benchmark & 1) && 0 != check) { /* Gold result is available */ libxsmm_matdiff_info diff; result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(OTYPE), m, n, c, d, &ldc, &ldc); fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < diff.l2_rel) { fprintf(stderr, "FAILED.\n"); result = EXIT_FAILURE; } } } #endif break; case 8: { // cached fprintf(stdout, "Cached...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for num_threads(0 == check ? 
nthreads : 1) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { libxsmm_blasint j = 0; #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ if (0 == check) j = omp_get_thread_num() * chunksize * csize; #endif LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c + j, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION2(11, 3) <= INTEL_MKL_VERSION) case 9: { // indirect cached fprintf(stdout, "Indirect cached...\n"); #if defined(_OPENMP) # pragma omp parallel for num_threads(0 == check ? nthreads : 1) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a; b_array[i] = b; #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ if (0 == check) { c_array[i] = d + static_cast(csize) * chunksize * omp_get_thread_num(); } else #endif c_array[i] = d; } #if defined(_OPENMP) omp_set_num_threads(0 == check ? 
nthreads : 1); #endif const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (0 == (benchmark & 1) && 0 != check) { /* Gold result is available */ libxsmm_matdiff_info diff; result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(OTYPE), m, n, c, d, &ldc, &ldc); fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < diff.l2_rel) { fprintf(stderr, "FAILED.\n"); result = EXIT_FAILURE; } } } #endif break; default: throw "invalid case selected!"; } /*switch*/ // finalize LIBXSMM libxsmm_finalize(); fprintf(stdout, "Finished\n"); } #endif } catch(const std::exception& e) { fprintf(stderr, "Error: %s\n", e.what()); result = EXIT_FAILURE; } catch(const char* message) { fprintf(stderr, "Error: %s\n", message); result = EXIT_FAILURE; } catch(...) { fprintf(stderr, "Error: unknown exception caught!\n"); result = EXIT_FAILURE; } return result; } libxsmm-1.17/samples/smm/blas.sh000077500000000000000000000050141415223013700166330ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi 
libxsmm-1.17/samples/smm/blas.vcxproj000066400000000000000000000560371415223013700177240ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 blas {2BB258BA-F534-404C-A5F8-F0FCA5028967} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) 
mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true true GenerateParallelCode SingleFile 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true true GenerateParallelCode SingleFile 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/smm/dispatched.cpp000066400000000000000000000304031415223013700201670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if 0 /* enable padding on a per-matrix basis */ # define PAD(TYPE, VALUE) (LIBXSMM_UP2((VALUE) * sizeof(TYPE), LIBXSMM_ALIGNMENT) / sizeof(TYPE)) #else # define PAD(TYPE, VALUE) (VALUE) #endif #if !defined(RANDOMIZED) && 0 # define RANDOMIZED #endif #if !defined(ITYPE) # define ITYPE double #endif #if !defined(OTYPE) # define OTYPE ITYPE #endif int main(int argc, char* argv[]) { int result = EXIT_SUCCESS; try { const libxsmm_blasint benchmark = (1 < argc ? std::atoi(argv[1]) : 0); const libxsmm_blasint m = (2 < argc ? std::atoi(argv[2]) : 23); const libxsmm_blasint k = (4 < argc ? std::atoi(argv[4]) : m); const libxsmm_blasint n = (3 < argc ? std::atoi(argv[3]) : k); const libxsmm_blasint q = (5 < argc ? std::atoi(argv[5]) : 0/*auto*/); const libxsmm_blasint nrepeat = (6 < argc ? std::atoi(argv[6]) : (0 >= q ? 13 : 1)); const libxsmm_blasint lda = m, ldb = k, ldc = m; const char transa = 'N', transb = 'N'; const OTYPE alpha = 1, beta = 1; const libxsmm_blasint asize = PAD(ITYPE, lda * k), bsize = PAD(ITYPE, ldb * n), csize = PAD(OTYPE, ldc * n); const libxsmm_blasint max_size = ((2ULL << 30/*2 GB*/) / ((static_cast(asize) + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))); const libxsmm_blasint s = LIBXSMM_MIN(0 < q ? 
q : max_size, max_size); const libxsmm_blasint aspace = LIBXSMM_ALIGNMENT / sizeof(ITYPE); const size_t bwsize = (static_cast(asize)/*load*/ + static_cast(bsize)/*load*/) * sizeof(ITYPE) + (sizeof(OTYPE) * static_cast(csize) * 2/*RFO*/); const double gflops = 2E-9 * s * m * n * k; #if LIBXSMM_TYPEINFO(ITYPE, FP) const char ops[] = "FLOPS"; const double scale = 1.0 / s; #else const char ops[] = "OPS"; const double scale = 1; #endif #if !defined(_DEBUG) const char *const env_check = getenv("CHECK"); const int check = (NULL == env_check ? 0 : atoi(env_check)); #else /*const*/ int check = 1; #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { #if defined(_OPENMP) const libxsmm_blasint chunksize = s / omp_get_max_threads(); #endif struct raii { // avoid std::vector (first-touch init. causes NUMA issue) ITYPE *a, *b; OTYPE *c; size_t m_size, m_shuffle; raii(libxsmm_blasint asize_, libxsmm_blasint bsize_, libxsmm_blasint csize_, libxsmm_blasint size_) : a(new ITYPE[static_cast(asize_)]), b(new ITYPE[static_cast(bsize_)]) , c(new OTYPE[static_cast(csize_)]) , m_size(static_cast(size_)), m_shuffle(libxsmm_shuffle(static_cast(size_))) {} ~raii() { delete[] a; delete[] b; delete[] c; } #if defined(RANDOMIZED) libxsmm_blasint shuffle(libxsmm_blasint i) const { return (i * m_shuffle) % m_size; } #else libxsmm_blasint shuffle(libxsmm_blasint i) const { return i; } #endif } helper(s * asize + aspace - 1, s * bsize + aspace - 1, s * csize + aspace - 1, s); ITYPE *const a = LIBXSMM_ALIGN(helper.a, LIBXSMM_ALIGNMENT); ITYPE *const b = LIBXSMM_ALIGN(helper.b, LIBXSMM_ALIGNMENT); OTYPE *const c = LIBXSMM_ALIGN(helper.c, LIBXSMM_ALIGNMENT); #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_MATINIT(ITYPE, 42 + helper.shuffle(i), a + static_cast(asize) * helper.shuffle(i), m, k, lda, scale); LIBXSMM_MATINIT(ITYPE, 24 + helper.shuffle(i), b + static_cast(bsize) * 
helper.shuffle(i), k, n, ldb, scale); LIBXSMM_MATINIT(OTYPE, 22 + i, c + static_cast(csize) * i, m, n, ldc, scale); } // initialize LIBXSMM libxsmm_init(); fprintf(stdout, "m=%lli n=%lli k=%lli size=%lli memory=%.1f MB (input=%s output=%s)\n\n", static_cast(m), static_cast(n), static_cast(k), static_cast(s), 1.0 * (s * ((static_cast(asize) + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))) / (1ULL << 20), LIBXSMM_TYPENAME(ITYPE), LIBXSMM_TYPENAME(OTYPE)); // eventually JIT-compile the requested kernel libxsmm_mmfunction(LIBXSMM_GEMM_FLAGS(transa, transb), m, n, k, lda, ldb, ldc, alpha, beta); switch (benchmark) { case 0: { // batched fprintf(stdout, "Batched (A,B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { libxsmm_gemm(&transa, &transb, m, n, k, &alpha, a + static_cast(asize) * helper.shuffle(i), &lda, b + static_cast(bsize) * helper.shuffle(i), &ldb, &beta, c + static_cast(csize) * i, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } break; case 1: { // streaming A and C fprintf(stdout, "Streamed (A,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { libxsmm_gemm(&transa, &transb, m, n, k, &alpha, a + static_cast(asize) * 
helper.shuffle(i), &lda, b, &ldb, &beta, c + static_cast(csize) * i, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } break; case 2: { // streaming B and C fprintf(stdout, "Streamed (B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { libxsmm_gemm(&transa, &transb, m, n, k, &alpha, a, &lda, b + static_cast(bsize) * helper.shuffle(i), &ldb, &beta, c + static_cast(csize) * i, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } break; case 3: { // streaming A and B fprintf(stdout, "Streamed (A,B)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ const 
libxsmm_blasint j = omp_get_thread_num() * chunksize * csize; #else const libxsmm_blasint j = 0; #endif libxsmm_gemm(&transa, &transb, m, n, k, &alpha, a + static_cast(asize) * helper.shuffle(i), &lda, b + static_cast(bsize) * helper.shuffle(i), &ldb, &beta, c + j, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - sizeof(OTYPE) * csize * 2) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } break; case 4: { // cached fprintf(stdout, "Cached...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize; #else const libxsmm_blasint j = 0; #endif libxsmm_gemm(&transa, &transb, m, n, k, &alpha, a, &lda, b, &ldb, &beta, c + j, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } break; default: throw "invalid case selected!"; } /*switch*/ if (0 != check) { libxsmm_matdiff_info diff; result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(OTYPE), m, n, c, NULL, 
&ldc, &ldc); if (EXIT_SUCCESS == result) { fprintf(stdout, "\tcheck: %f\n", diff.l1_ref); } } // finalize LIBXSMM libxsmm_finalize(); fprintf(stdout, "Finished\n"); } } catch(const std::exception& e) { fprintf(stderr, "Error: %s\n", e.what()); result = EXIT_FAILURE; } catch(const char* message) { fprintf(stderr, "Error: %s\n", message); result = EXIT_FAILURE; } catch(...) { fprintf(stderr, "Error: unknown exception caught!\n"); result = EXIT_FAILURE; } return result; } libxsmm-1.17/samples/smm/dispatched.sh000077500000000000000000000050141415223013700200220ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/smm/dispatched.vcxproj000066400000000000000000000561121415223013700211050ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 dispatched {C3928ABE-7999-4B1A-A977-257622A728DE} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true true GenerateParallelCode SingleFile 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console 
$(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true true GenerateParallelCode SingleFile 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 
$(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/smm/inlined.cpp000066400000000000000000000303411415223013700175020ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if 0 /* enable padding on a per-matrix basis */ # define PAD(TYPE, VALUE) (LIBXSMM_UP2((VALUE) * sizeof(TYPE), LIBXSMM_ALIGNMENT) / sizeof(TYPE)) #else # define PAD(TYPE, VALUE) (VALUE) #endif #if !defined(RANDOMIZED) && 0 # define RANDOMIZED #endif #if !defined(ITYPE) # define ITYPE double #endif #if !defined(OTYPE) # define OTYPE ITYPE #endif int main(int argc, char* argv[]) { int result = EXIT_SUCCESS; try { const libxsmm_blasint benchmark = (1 < argc ? std::atoi(argv[1]) : 0); const libxsmm_blasint m = (2 < argc ? std::atoi(argv[2]) : 23); const libxsmm_blasint k = (4 < argc ? std::atoi(argv[4]) : m); const libxsmm_blasint n = (3 < argc ? std::atoi(argv[3]) : k); const libxsmm_blasint q = (5 < argc ? std::atoi(argv[5]) : 0/*auto*/); const libxsmm_blasint nrepeat = (6 < argc ? std::atoi(argv[6]) : (0 >= q ? 13 : 1)); const libxsmm_blasint lda = m, ldb = k, ldc = m; const char transa = 'N', transb = 'N'; const OTYPE alpha = 1, beta = 1; const libxsmm_blasint asize = PAD(ITYPE, lda * k), bsize = PAD(ITYPE, ldb * n), csize = PAD(OTYPE, ldc * n); const libxsmm_blasint max_size = ((2ULL << 30/*2 GB*/) / ((static_cast(asize) + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))); const libxsmm_blasint s = LIBXSMM_MIN(0 < q ? 
q : max_size, max_size); const libxsmm_blasint aspace = LIBXSMM_ALIGNMENT / sizeof(ITYPE); const size_t bwsize = (static_cast(asize)/*load*/ + static_cast(bsize)/*load*/) * sizeof(ITYPE) + (sizeof(OTYPE) * static_cast(csize) * 2/*RFO*/); const double gflops = 2E-9 * s * m * n * k; #if LIBXSMM_TYPEINFO(ITYPE, FP) const char ops[] = "FLOPS"; const double scale = 1.0 / s; #else const char ops[] = "OPS"; const double scale = 1; #endif #if !defined(_DEBUG) const char *const env_check = getenv("CHECK"); const int check = (NULL == env_check ? 0 : atoi(env_check)); #else /*const*/ int check = 1; #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { #if defined(_OPENMP) const libxsmm_blasint chunksize = s / omp_get_max_threads(); #endif struct raii { // avoid std::vector (first-touch init. causes NUMA issue) ITYPE *a, *b; OTYPE *c; size_t m_size, m_shuffle; raii(libxsmm_blasint asize_, libxsmm_blasint bsize_, libxsmm_blasint csize_, libxsmm_blasint size_) : a(new ITYPE[static_cast(asize_)]), b(new ITYPE[static_cast(bsize_)]) , c(new OTYPE[static_cast(csize_)]) , m_size(static_cast(size_)), m_shuffle(libxsmm_shuffle(static_cast(size_))) {} ~raii() { delete[] a; delete[] b; delete[] c; } #if defined(RANDOMIZED) libxsmm_blasint shuffle(libxsmm_blasint i) const { return (i * m_shuffle) % m_size; } #else libxsmm_blasint shuffle(libxsmm_blasint i) const { return i; } #endif } helper(s * asize + aspace - 1, s * bsize + aspace - 1, s * csize + aspace - 1, s); ITYPE *const a = LIBXSMM_ALIGN(helper.a, LIBXSMM_ALIGNMENT); ITYPE *const b = LIBXSMM_ALIGN(helper.b, LIBXSMM_ALIGNMENT); OTYPE *const c = LIBXSMM_ALIGN(helper.c, LIBXSMM_ALIGNMENT); #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_MATINIT(ITYPE, 42 + helper.shuffle(i), a + static_cast(asize) * helper.shuffle(i), m, k, lda, scale); LIBXSMM_MATINIT(ITYPE, 24 + helper.shuffle(i), b + static_cast(bsize) * 
helper.shuffle(i), k, n, ldb, scale); LIBXSMM_MATINIT(OTYPE, 22 + i, c + static_cast(csize) * i, m, n, ldc, scale); } // initialize LIBXSMM libxsmm_init(); fprintf(stdout, "m=%lli n=%lli k=%lli size=%lli memory=%.1f MB (input=%s output=%s)\n\n", static_cast(m), static_cast(n), static_cast(k), static_cast(s), 1.0 * (s * ((static_cast(asize) + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))) / (1ULL << 20), LIBXSMM_TYPENAME(ITYPE), LIBXSMM_TYPENAME(OTYPE)); switch (benchmark) { case 0: { // batched fprintf(stdout, "Batched (A,B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_INLINE_XGEMM(ITYPE, OTYPE, &transa, &transb, &m, &n, &k, &alpha, a + static_cast(asize) * helper.shuffle(i), &lda, b + static_cast(bsize) * helper.shuffle(i), &ldb, &beta, c + static_cast(csize) * i, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } break; case 1: { // streaming A and C fprintf(stdout, "Streamed (A,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_INLINE_XGEMM(ITYPE, OTYPE, &transa, &transb, &m, &n, &k, &alpha, a + static_cast(asize) * helper.shuffle(i), &lda, b, &ldb, &beta, c + static_cast(csize) * i, &ldc); } } const unsigned long 
long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } break; case 2: { // streaming B and C fprintf(stdout, "Streamed (B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_INLINE_XGEMM(ITYPE, OTYPE, &transa, &transb, &m, &n, &k, &alpha, a, &lda, b + static_cast(bsize) * helper.shuffle(i), &ldb, &beta, c + static_cast(csize) * i, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } break; case 3: { // streaming A and B fprintf(stdout, "Streamed (A,B)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize; #else const 
libxsmm_blasint j = 0; #endif LIBXSMM_INLINE_XGEMM(ITYPE, OTYPE, &transa, &transb, &m, &n, &k, &alpha, a + static_cast(asize) * helper.shuffle(i), &lda, b + static_cast(bsize) * helper.shuffle(i), &ldb, &beta, c + j, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - sizeof(OTYPE) * csize * 2) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } break; case 4: { // cached fprintf(stdout, "Cached...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize; #else const libxsmm_blasint j = 0; #endif LIBXSMM_INLINE_XGEMM(ITYPE, OTYPE, &transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c + j, &ldc); } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } break; default: throw "invalid case selected!"; } /*switch*/ if (0 != check) { libxsmm_matdiff_info diff; result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(OTYPE), m, n, c, NULL, &ldc, &ldc); if 
(EXIT_SUCCESS == result) { fprintf(stdout, "\tcheck: %f\n", diff.l1_ref); } } // finalize LIBXSMM libxsmm_finalize(); fprintf(stdout, "Finished\n"); } } catch(const std::exception& e) { fprintf(stderr, "Error: %s\n", e.what()); result = EXIT_FAILURE; } catch(const char* message) { fprintf(stderr, "Error: %s\n", message); result = EXIT_FAILURE; } catch(...) { fprintf(stderr, "Error: unknown exception caught!\n"); result = EXIT_FAILURE; } return result; } libxsmm-1.17/samples/smm/inlined.sh000077500000000000000000000050141415223013700173340ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/smm/inlined.vcxproj000066400000000000000000000556701415223013700204270ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 inlined {AC175BA9-6C2D-4C52-9AC2-21F01A7211D1} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true true GenerateParallelCode SingleFile 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) 
libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true true GenerateParallelCode SingleFile 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) 
libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/smm/smm-perf.plt000066400000000000000000000033631415223013700176270ustar00rootroot00000000000000MPARM = 1 NPARM = 2 KPARM = 3 FLOPS = 5 HIM = -1 HIN = HIM HIK = HIM FILENAME = system("sh -c \"echo ${FILENAME}\"") if (FILENAME eq "") { FILENAME = "smm-perf.pdf" } FILECOUNT = 1 # initial file number # MULTI =-1: multiple files; no titles # MULTI = 0: multiple files with titles # MULTI = 1: single file with titles MULTI = system("sh -c \"echo ${MULTI}\"") if (MULTI eq "") { MULTI = 1 } LIMIT = system("sh -c \"echo ${LIMIT}\"") if (LIMIT eq "") { LIMIT = 128 } if (MULTI==1) { set output FILENAME } FILEEXT = system("sh -c \"echo ".FILENAME." 
| sed 's/.\\+\\.\\(.\\+\\)/\\1/'\"") set terminal FILEEXT set termoption enhanced #set termoption font ",12" save_encoding = GPVAL_ENCODING set encoding utf8 reset if (MULTI<=0) { set output "".FILECOUNT."-".FILENAME; FILECOUNT = FILECOUNT + 1 } if (MULTI>-1) { set title "Selected Problem Instances" } set style fill solid 0.4 border -1 set style data histograms set style histogram cluster #gap 2 #set boxwidth 0.5 relative set grid y2tics lc "grey" set key left #spacing 0.5 set xtics rotate by -45 scale 0; set bmargin 6 set ytics format "" set y2tics nomirror set y2label "GFLOP/s" set xrange [0:LIMIT+0.85] set yrange [0:*] set autoscale fix if (0!=system("sh -c \"if [ -e smm-inlined.dat ]; then echo 1; else echo 0; fi\"")) { plot "smm-inlined.dat" using FLOPS title "Inlined", \ "smm-blas.dat" using FLOPS title "BLAS", \ "smm-dispatched.dat" using FLOPS title "Dispatched", \ "smm-specialized.dat" using FLOPS:xtic("(".strcol(MPARM).",".strcol(NPARM).",".strcol(KPARM).")") title "Specialized" } else { plot "smm-blas.dat" using FLOPS title "BLAS", \ "smm-specialized.dat" using FLOPS:xtic("(".strcol(MPARM).",".strcol(NPARM).",".strcol(KPARM).")") title "Specialized" } libxsmm-1.17/samples/smm/smm-perf.sh000077500000000000000000000052661415223013700174510ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) CASE=0 if [ "$1" ]; then CASE=$1 shift fi RUNS="2_2_2 4_4_4 4_6_9 5_5_5 5_5_13 5_13_5 5_13_13 6_6_6 8_8_8 10_10_10 12_12_12 13_5_5 13_5_7 13_5_13 13_13_5 13_13_13 13_13_26 \ 13_26_13 13_26_26 14_14_14 16_16_16 18_18_18 20_20_20 23_23_23 24_3_36 24_24_24 26_13_13 26_13_26 26_26_13 26_26_26 32_32_32 \ 40_40_40 48_48_48 56_56_56 64_64_64 72_72_72 80_80_80 88_88_88 96_96_96 104_104_104 112_112_112 120_120_120 128_128_128" cat /dev/null > smm-blas.txt cat /dev/null > smm-dispatched.txt cat /dev/null > smm-inlined.txt cat /dev/null > smm-specialized.txt NRUN=1 NMAX=$(echo ${RUNS} | wc -w | tr -d " ") for RUN in ${RUNS} ; do MVALUE=$(echo ${RUN} | cut --output-delimiter=' ' -d_ -f1) NVALUE=$(echo ${RUN} | cut --output-delimiter=' ' -d_ -f2) KVALUE=$(echo ${RUN} | cut --output-delimiter=' ' -d_ -f3) >&2 echo "Test ${NRUN} of ${NMAX} (M=${MVALUE} N=${NVALUE} K=${KVALUE})" env LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ${HERE}/blas.sh ${CASE} ${MVALUE} ${NVALUE} ${KVALUE} >> smm-blas.txt echo >> smm-blas.txt env LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ${HERE}/specialized.sh ${CASE} ${MVALUE} ${NVALUE} ${KVALUE} >> smm-specialized.txt echo >> smm-specialized.txt env LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ${HERE}/dispatched.sh $((CASE/2)) ${MVALUE} ${NVALUE} ${KVALUE} >> smm-dispatched.txt echo >> smm-dispatched.txt env LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ${HERE}/inlined.sh $((CASE/2)) ${MVALUE} ${NVALUE} ${KVALUE} >> smm-inlined.txt echo >> smm-inlined.txt NRUN=$((NRUN + 1)) done libxsmm-1.17/samples/smm/smm-plot.sh000077500000000000000000000053701415223013700174670ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### SED=$(command -v sed) HERE=$(cd "$(dirname "$0")" && pwd -P) VARIANT=Cached LIMIT=31 if [ "$1" ]; then VARIANT=$1 shift fi if [ -f /cygdrive/c/Program\ Files/gnuplot/bin/wgnuplot ]; then WGNUPLOT=/cygdrive/c/Program\ Files/gnuplot/bin/wgnuplot GNUPLOT=/cygdrive/c/Program\ Files/gnuplot/bin/gnuplot elif [ -f /cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/wgnuplot ]; then WGNUPLOT=/cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/wgnuplot GNUPLOT=/cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/gnuplot else GNUPLOT=$(command -v gnuplot) WGNUPLOT=${GNUPLOT} fi GNUPLOT_MAJOR=0 GNUPLOT_MINOR=0 if [ -f "${GNUPLOT}" ]; then GNUPLOT_MAJOR=$("${GNUPLOT}" --version | ${SED} "s/.\+ \([0-9]\).\([0-9]\) .*/\1/") GNUPLOT_MINOR=$("${GNUPLOT}" --version | ${SED} "s/.\+ \([0-9]\).\([0-9]\) .*/\2/") fi GNUPLOT_VERSION=$((GNUPLOT_MAJOR * 10000 + GNUPLOT_MINOR * 100)) function capturedTxtToDataFile { ${SED} \ -e "/^m=/,/${VARIANT}/{//!d}" \ -e "/${VARIANT}/d" \ -e "/\.\.\./,/Finished/{//!d}" \ -e "/Finished/d" \ -e "/diff:/d" \ -e "/\.\.\./d" \ -e "/^$/d" \ "${HERE}/$1.txt" \ | ${SED} \ -e "s/m=//" -e "s/n=//" -e "s/k=//" -e "s/ (..*) / /" \ -e "s/size=//" \ -e "/duration:/d" \ | ${SED} \ -e "N;s/ memory=..*\n..*//" \ -e "N;s/\n\tperformance:\(..*\) GFLOPS\/s/\1/" \ -e "N;s/\n\tbandwidth:\(..*\) GB\/s/\1/" \ > "${HERE}/$1.dat" } if [ "40600" -le "${GNUPLOT_VERSION}" ]; then RM=$(command -v rm) if [ "" = "$1" ]; then FILENAME=smm-$(echo ${VARIANT} | tr ' ,' '-' | tr -d '()' | tr '[:upper:]' '[:lower:]').pdf else FILENAME=$1 shift fi if [ "" = "$1" ]; then MULTI=1 else MULTI=$1 shift fi ${RM} -f *.dat capturedTxtToDataFile smm-blas capturedTxtToDataFile smm-specialized #capturedTxtToDataFile smm-dispatched 
#capturedTxtToDataFile smm-inlined env \ GDFONTPATH=/cygdrive/c/Windows/Fonts \ FILENAME=${FILENAME} \ MULTI=${MULTI} \ LIMIT=${LIMIT} \ "${WGNUPLOT}" smm-perf.plt fi libxsmm-1.17/samples/smm/smm.f000066400000000000000000000251731415223013700163260ustar00rootroot00000000000000!=======================================================================! ! Copyright (c) Intel Corporation - All rights reserved. ! ! This file is part of the LIBXSMM library. ! ! ! ! For information on the license, see the LICENSE file. ! ! Further information: https://github.com/hfp/libxsmm/ ! ! SPDX-License-Identifier: BSD-3-Clause ! !=======================================================================! ! Hans Pabst (Intel Corp.), Alexander Heinecke (Intel Corp.) !=======================================================================! PROGRAM smm USE :: LIBXSMM, libxsmm_mmcall => libxsmm_dmmcall_abc !$ USE omp_lib IMPLICIT NONE INTEGER, PARAMETER :: T = KIND(0D0) REAL(T), ALLOCATABLE, TARGET :: a(:,:,:), b(:,:,:) REAL(T), ALLOCATABLE, TARGET :: c(:,:), d(:,:) REAL(T), ALLOCATABLE, TARGET, SAVE :: tmp(:,:) !DIR$ ATTRIBUTES ALIGN:64 :: a, b, c, tmp !$OMP THREADPRIVATE(tmp) TYPE(LIBXSMM_DMMFUNCTION) :: xmm INTEGER(8) :: i, r, s, size0, size1, size2, repetitions, start TYPE(LIBXSMM_MATDIFF_INFO) :: diff, max_diff INTEGER(LIBXSMM_BLASINT_KIND) :: m, n, k DOUBLE PRECISION :: duration, scale CHARACTER(32) :: argv INTEGER :: argc argc = COMMAND_ARGUMENT_COUNT() IF (1 <= argc) THEN CALL GET_COMMAND_ARGUMENT(1, argv) READ(argv, "(I32)") m ELSE m = 23 END IF IF (3 <= argc) THEN CALL GET_COMMAND_ARGUMENT(3, argv) READ(argv, "(I32)") k ELSE k = m END IF IF (2 <= argc) THEN CALL GET_COMMAND_ARGUMENT(2, argv) READ(argv, "(I32)") n ELSE n = k END IF IF (4 <= argc) THEN CALL GET_COMMAND_ARGUMENT(4, argv) READ(argv, "(I32)") size1 ELSE size1 = 0 END IF IF (5 <= argc) THEN CALL GET_COMMAND_ARGUMENT(5, argv) READ(argv, "(I32)") size2 ELSE size2 = 0 ! 1 repetition by default END IF ! 
Initialize LIBXSMM CALL libxsmm_init() ! Eventually JIT-compile the requested kernel CALL libxsmm_dispatch(xmm, m, n, k) ! workload is about 2 GByte in memory by default size0 = (m * k + k * n + m * n) * T ! size of a single stream element in Byte size1 = MERGE(2048_8, MERGE(size1, ISHFT(ABS(size0 * size1) & & + ISHFT(1, 20) - 1, -20), 0.LE.size1), 0.EQ.size1) size2 = ISHFT(MERGE(MAX(size2, size1), ISHFT(ABS(size2) * size0 & & + ISHFT(1, 20) - 1, -20), 0.LE.size2), 20) / size0 s = ISHFT(size1, 20) / size0 repetitions = size2 / s scale = 1D0 / s duration = 0 CALL libxsmm_matdiff_clear(max_diff) ALLOCATE(c(m,n)) ALLOCATE(a(m,k,s)) ALLOCATE(b(k,n,s)) ! Initialize a, b !$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(s, a, b, scale) DO i = 1, s CALL init(42, a(:,:,i), scale, i - 1) CALL init(24, b(:,:,i), scale, i - 1) END DO !$OMP END PARALLEL DO WRITE(*, "(3(A,I0),A,I0,A,I0,A,I0)") & & "m=", m, " n=", n, " k=", k, " elements=", UBOUND(a, 3), & & " size=", size1, " MB repetitions=", repetitions ! compute reference solution and warmup BLAS library ALLOCATE(d(m,n)) d(:,:) = 0 !$OMP PARALLEL REDUCTION(+:d) PRIVATE(i, r) & !$OMP DEFAULT(NONE) SHARED(m, n, k, a, b, repetitions) ALLOCATE(tmp(m,n)) tmp(:,:) = 0 DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 3), UBOUND(a, 3) ! PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=m, n=n, k=k, & & a=a(:,:,i), b=b(:,:,i), c=tmp) END DO END DO d(:,:) = d(:,:) + tmp(:UBOUND(d,1),:) ! Deallocate thread-local arrays DEALLOCATE(tmp) !$OMP END PARALLEL WRITE(*, "(A)") "Streamed (A,B)... (BLAS)" c(:,:) = 0 !$OMP PARALLEL REDUCTION(+:c) PRIVATE(i, r, start) & !$OMP DEFAULT(NONE) & !$OMP SHARED(m, n, k, a, b, duration, repetitions) ALLOCATE(tmp(m,n)) tmp(:,:) = 0 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 3), UBOUND(a, 3) ! 
PGI: cannot deduce generic procedure (libxsmm_blas_gemm) CALL libxsmm_blas_dgemm(m=m, n=n, k=k, & & a=a(:,:,i), b=b(:,:,i), c=tmp) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER c(:,:) = c(:,:) + tmp(:UBOUND(c,1),:) ! Deallocate thread-local arrays DEALLOCATE(tmp) !$OMP END PARALLEL CALL performance(duration, m, n, k, size2) WRITE(*, "(A)") "Streamed (A,B)... (auto-dispatched)" c(:,:) = 0 !$OMP PARALLEL REDUCTION(+:c) PRIVATE(i, r, start) & !$OMP DEFAULT(NONE) & !$OMP SHARED(m, n, k, a, b, duration, repetitions) ALLOCATE(tmp(m,n)) tmp(:,:) = 0 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 3), UBOUND(a, 3) ! PGI: cannot deduce generic procedure (libxsmm_gemm) CALL libxsmm_dgemm(m=m, n=n, k=k, & & a=a(:,:,i), b=b(:,:,i), c=tmp) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER c(:,:) = c(:,:) + tmp(:UBOUND(c,1),:) ! Deallocate thread-local arrays DEALLOCATE(tmp) !$OMP END PARALLEL CALL performance(duration, m, n, k, size2) CALL libxsmm_matdiff(diff, LIBXSMM_DATATYPE_F64, m, n, & & libxsmm_ptr(d), libxsmm_ptr(c)) WRITE(*, "(1A,A,F10.1)") CHAR(9), "diff: ", diff%l2_abs CALL libxsmm_matdiff_reduce(max_diff, diff) IF (libxsmm_available(xmm)) THEN c(:,:) = 0 WRITE(*, "(A)") "Streamed (A,B)... (specialized)" !$OMP PARALLEL REDUCTION(+:c) PRIVATE(i, r, start) !DEFAULT(NONE) SHARED(m, n, a, b, duration, repetitions, xmm) ALLOCATE(tmp(m,n)) tmp(:,:) = 0 !$OMP MASTER start = libxsmm_timer_tick() !$OMP END MASTER !$OMP BARRIER DO r = 1, repetitions !$OMP DO DO i = LBOUND(a, 3), UBOUND(a, 3) CALL libxsmm_mmcall(xmm, a(:,:,i), b(:,:,i), tmp) END DO END DO !$OMP BARRIER !$OMP MASTER duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !$OMP END MASTER c(:,:) = c(:,:) + tmp(:UBOUND(c,1),:) ! 
Deallocate thread-local arrays DEALLOCATE(tmp) !$OMP END PARALLEL CALL performance(duration, m, n, k, size2) CALL libxsmm_matdiff(diff, LIBXSMM_DATATYPE_F64, m, n, & & libxsmm_ptr(d), libxsmm_ptr(c)) WRITE(*, "(1A,A,F10.1)") CHAR(9), "diff: ", diff%l2_abs CALL libxsmm_matdiff_reduce(max_diff, diff) END IF ! Deallocate global arrays DEALLOCATE(a) DEALLOCATE(b) DEALLOCATE(c) DEALLOCATE(d) ! finalize LIBXSMM CALL libxsmm_finalize() IF (1.LT.(max_diff%l2_rel)) STOP 1 CONTAINS PURE SUBROUTINE init(seed, matrix, scale, n) INTEGER, INTENT(IN) :: seed REAL(T), INTENT(OUT) :: matrix(:,:) REAL(8), INTENT(IN) :: scale INTEGER(8), INTENT(IN), OPTIONAL :: n INTEGER(8) :: minval, addval, maxval INTEGER :: ld, i, j REAL(8) :: val, norm ld = UBOUND(matrix, 1) - LBOUND(matrix, 1) + 1 minval = MERGE(n, 0_8, PRESENT(n)) + seed addval = (UBOUND(matrix, 1) - LBOUND(matrix, 1)) * ld & & + (UBOUND(matrix, 2) - LBOUND(matrix, 2)) maxval = MAX(ABS(minval), addval) norm = MERGE(scale / maxval, scale, 0.NE.maxval) DO j = LBOUND(matrix, 2), UBOUND(matrix, 2) DO i = LBOUND(matrix, 1), & & LBOUND(matrix, 1) + UBOUND(matrix, 1) - 1 val = (i - LBOUND(matrix, 1)) * ld & & + (j - LBOUND(matrix, 2)) + minval matrix(i,j) = norm * (val - 0.5D0 * addval) END DO END DO END SUBROUTINE SUBROUTINE disp(matrix, ld, format) REAL(T), INTENT(IN) :: matrix(:,:) INTEGER, INTENT(IN), OPTIONAL :: ld CHARACTER(*), INTENT(IN), OPTIONAL :: format CHARACTER(32) :: fmt INTEGER :: i0, i1, i, j IF (.NOT.PRESENT(format)) THEN fmt = "(16F20.0)" ELSE WRITE(fmt, "('(16',A,')')") format END IF i0 = LBOUND(matrix, 1) i1 = MIN( & & MERGE(i0 + ld - 1, UBOUND(matrix, 1), PRESENT(ld)), & & UBOUND(matrix, 1)) DO i = i0, i1 DO j = LBOUND(matrix, 2), UBOUND(matrix, 2) WRITE(*, fmt, advance='NO') matrix(i,j) END DO WRITE(*, *) END DO END SUBROUTINE SUBROUTINE performance(duration, m, n, k, s) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k INTEGER(8), INTENT(IN) :: s REAL(T), INTENT(IN) :: duration IF (0.LT.duration) THEN 
WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "performance:", & & 2D0 * s * m * n * k * 1D-9 / duration, " GFLOPS/s" WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "bandwidth: ", & & s * (m * k + k * n) * T / (duration * ISHFT(1_8, 30)), & & " GB/s" END IF WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "duration: ", & & 1D3 * duration, " ms" END SUBROUTINE END PROGRAM libxsmm-1.17/samples/smm/smm.sh000077500000000000000000000050141415223013700165060ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/smm/specialized.cpp000066400000000000000000000604171415223013700203630ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #include #include #include #include #if defined(_OPENMP) # define USEOMP(FUNCTION) LIBXSMM_USEOMP(FUNCTION) # include #else # define USEOMP(FUNCTION) (FUNCTION) #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if 0 /* enable padding on a per-matrix basis */ # define PAD(TYPE, VALUE) (LIBXSMM_UP2((VALUE) * sizeof(TYPE), LIBXSMM_ALIGNMENT) / sizeof(TYPE)) #else # define PAD(TYPE, VALUE) (VALUE) #endif #if !defined(RANDOMIZED) && 0 # define RANDOMIZED #endif #if !defined(ITYPE) # define ITYPE double #endif #if !defined(OTYPE) # define OTYPE ITYPE #endif int main(int argc, char* argv[]) { int result = EXIT_SUCCESS; try { const libxsmm_blasint benchmark = (1 < argc ? std::atoi(argv[1]) : 0); const libxsmm_blasint m = (2 < argc ? std::atoi(argv[2]) : 23); const libxsmm_blasint k = (4 < argc ? std::atoi(argv[4]) : m); const libxsmm_blasint n = (3 < argc ? std::atoi(argv[3]) : k); const libxsmm_blasint q = (5 < argc ? std::atoi(argv[5]) : 0/*auto*/); const libxsmm_blasint nrepeat = (6 < argc ? std::atoi(argv[6]) : (0 >= q ? 13 : 1)); const libxsmm_blasint lda = m, ldb = k, ldc = m; const char transa = 'N', transb = 'N'; const OTYPE alpha = 1, beta = 1; const libxsmm_blasint asize = PAD(ITYPE, lda * k), bsize = PAD(ITYPE, ldb * n), csize = PAD(OTYPE, ldc * n); const libxsmm_blasint max_size = ((2ULL << 30/*2 GB*/) / ((static_cast(asize) + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))); const libxsmm_blasint s = LIBXSMM_MIN(0 < q ? 
q : max_size, max_size); const libxsmm_blasint aspace = LIBXSMM_ALIGNMENT / sizeof(ITYPE); const size_t bwsize = (static_cast(asize)/*load*/ + static_cast(bsize)/*load*/) * sizeof(ITYPE) + (sizeof(OTYPE) * static_cast(csize) * 2/*RFO*/); const double gflops = 2E-9 * s * m * n * k; #if LIBXSMM_TYPEINFO(ITYPE, FP) const char ops[] = "FLOPS"; const double scale = 1.0 / s; #else const char ops[] = "OPS"; const double scale = 1; #endif #if !defined(_DEBUG) const char *const env_check = getenv("CHECK"); const int check = (NULL == env_check ? 0 : atoi(env_check)); #else /*const*/ int check = 1; #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { #if defined(_OPENMP) const libxsmm_blasint chunksize = s / omp_get_max_threads(); #endif struct raii { // avoid std::vector (first-touch init. causes NUMA issue) ITYPE *a, *b; OTYPE *c, *d; size_t m_size, m_shuffle; raii(libxsmm_blasint asize_, libxsmm_blasint bsize_, libxsmm_blasint csize_, libxsmm_blasint size_) : a(new ITYPE[static_cast(asize_)]), b(new ITYPE[static_cast(bsize_)]) , c(new OTYPE[static_cast(csize_)]), d(new OTYPE[static_cast(csize_)]) , m_size(static_cast(size_)), m_shuffle(libxsmm_shuffle(static_cast(size_))) {} ~raii() { delete[] a; delete[] b; delete[] c; delete[] d; } #if defined(RANDOMIZED) libxsmm_blasint shuffle(libxsmm_blasint i) const { return (i * m_shuffle) % m_size; } #else libxsmm_blasint shuffle(libxsmm_blasint i) const { return i; } #endif } helper(s * asize + aspace - 1, s * bsize + aspace - 1, s * csize + aspace - 1, s); ITYPE *const a = LIBXSMM_ALIGN(helper.a, LIBXSMM_ALIGNMENT); ITYPE *const b = LIBXSMM_ALIGN(helper.b, LIBXSMM_ALIGNMENT); OTYPE *const c = LIBXSMM_ALIGN(helper.c, LIBXSMM_ALIGNMENT); OTYPE *const d = LIBXSMM_ALIGN(helper.d, LIBXSMM_ALIGNMENT); #if defined(_OPENMP) const int nthreads = omp_get_max_threads(); # pragma omp parallel for num_threads(nthreads) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { 
LIBXSMM_MATINIT(ITYPE, 42 + helper.shuffle(i), a + static_cast(asize) * helper.shuffle(i), m, k, lda, scale); LIBXSMM_MATINIT(ITYPE, 24 + helper.shuffle(i), b + static_cast(bsize) * helper.shuffle(i), k, n, ldb, scale); LIBXSMM_MATINIT(OTYPE, 22 + i, c + static_cast(csize) * i, m, n, ldc, scale); LIBXSMM_MATINIT(OTYPE, 22 + i, d + static_cast(csize) * i, m, n, ldc, scale); } // initialize LIBXSMM libxsmm_init(); fprintf(stdout, "m=%lli n=%lli k=%lli size=%lli memory=%.1f MB (input=%s output=%s)\n\n", static_cast(m), static_cast(n), static_cast(k), static_cast(s), 1.0 * (s * ((static_cast(asize) + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))) / (1ULL << 20), LIBXSMM_TYPENAME(ITYPE), LIBXSMM_TYPENAME(OTYPE)); const libxsmm_mmfunction xmm(LIBXSMM_GEMM_FLAGS(transa, transb), m, n, k, lda, ldb, ldc, alpha, beta, LIBXSMM_PREFETCH); if (!xmm) throw "no specialized routine found!"; // arrays needed for the batch interface (indirect) std::vector va_array(static_cast(s)), vb_array(static_cast(s)); std::vector vc_array(static_cast(s)); const ITYPE* *const a_array = &va_array[0]; const ITYPE* *const b_array = &vb_array[0]; OTYPE* *const c_array = &vc_array[0]; switch (benchmark) { case 0: { // batched fprintf(stdout, "Batched (A,B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { const ITYPE *const ai = a + static_cast(asize) * helper.shuffle(i), *const bi = b + static_cast(bsize) * helper.shuffle(i); OTYPE *const ci = c + static_cast(csize) * i; #if (0 != LIBXSMM_PREFETCH) xmm(ai, bi, ci, LIBXSMM_GEMM_PREFETCH_A(ai + asize), LIBXSMM_GEMM_PREFETCH_B(bi + bsize), LIBXSMM_GEMM_PREFETCH_C(ci + csize)); #else xmm(ai, bi, ci); #endif } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / 
nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ case 1: { // batched/indirect fprintf(stdout, "Indirect (A,B,C)...\n"); for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a + static_cast(asize) * helper.shuffle(i); b_array[i] = b + static_cast(bsize) * helper.shuffle(i); c_array[i] = d + static_cast(csize) * i; } const libxsmm_blasint ptrsize = sizeof(void*); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { USEOMP(libxsmm_gemm_batch)(LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(OTYPE), &transa, &transb, m, n, k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, s); } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (0 == (benchmark & 1) && 0 != check) { /* Gold result is available */ libxsmm_matdiff_info diff; libxsmm_matdiff_clear(&diff); for (libxsmm_blasint h = 0; h < s; ++h) { const OTYPE *const u = c + static_cast(csize) * h, *const v = c_array[h]; libxsmm_matdiff_info dv; result = libxsmm_matdiff(&dv, LIBXSMM_DATATYPE(OTYPE), m, n, u, v, &ldc, &ldc); if (EXIT_SUCCESS == result) 
libxsmm_matdiff_reduce(&diff, &dv); } fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < diff.l2_rel) { fprintf(stderr, "FAILED.\n"); result = EXIT_FAILURE; } } } break; case 2: { // streaming A and C fprintf(stdout, "Streamed (A,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { const ITYPE *const ai = a + static_cast(asize) * helper.shuffle(i); OTYPE *const ci = c + static_cast(csize) * i; #if (0 != LIBXSMM_PREFETCH) xmm(ai, b, ci, LIBXSMM_GEMM_PREFETCH_A(ai + asize), LIBXSMM_GEMM_PREFETCH_B(b), LIBXSMM_GEMM_PREFETCH_C(ci + csize)); #else xmm(ai, b, ci); #endif } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ case 3: { // indirect A and C fprintf(stdout, "Indirect (A,C)...\n"); for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a + static_cast(asize) * helper.shuffle(i); b_array[i] = b; c_array[i] = d + static_cast(csize) * i; } const libxsmm_blasint ptrsize = sizeof(void*); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { USEOMP(libxsmm_gemm_batch)(LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(OTYPE), &transa, &transb, m, n, k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, 
&ptrsize, s); } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (0 == (benchmark & 1) && 0 != check) { /* Gold result is available */ libxsmm_matdiff_info diff; libxsmm_matdiff_clear(&diff); for (libxsmm_blasint h = 0; h < s; ++h) { const OTYPE *const u = c + static_cast(csize) * h, *const v = c_array[h]; libxsmm_matdiff_info dv; result = libxsmm_matdiff(&dv, LIBXSMM_DATATYPE(OTYPE), m, n, u, v, &ldc, &ldc); if (EXIT_SUCCESS == result) libxsmm_matdiff_reduce(&diff, &dv); } fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < diff.l2_rel) { fprintf(stderr, "FAILED.\n"); result = EXIT_FAILURE; } } } break; case 4: { // streaming B and C fprintf(stdout, "Streamed (B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { const ITYPE *const bi = b + static_cast(bsize) * helper.shuffle(i); OTYPE *const ci = c + static_cast(csize) * i; #if (0 != LIBXSMM_PREFETCH) xmm(a, bi, ci, LIBXSMM_GEMM_PREFETCH_A(a), LIBXSMM_GEMM_PREFETCH_B(bi + bsize), LIBXSMM_GEMM_PREFETCH_C(ci + csize)); #else xmm(a, bi, ci); #endif } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", 
(2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ case 5: { // indirect B and C fprintf(stdout, "Indirect (B,C)...\n"); for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a; b_array[i] = b + static_cast(bsize) * helper.shuffle(i); c_array[i] = d + static_cast(csize) * i; } const libxsmm_blasint ptrsize = sizeof(void*); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { USEOMP(libxsmm_gemm_batch)(LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(OTYPE), &transa, &transb, m, n, k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, s); } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (0 == (benchmark & 1) && 0 != check) { /* Gold result is available */ libxsmm_matdiff_info diff; libxsmm_matdiff_clear(&diff); for (libxsmm_blasint h = 0; h < s; ++h) { const OTYPE *const u = c + static_cast(csize) * h, *const v = c_array[h]; libxsmm_matdiff_info dv; result = libxsmm_matdiff(&dv, LIBXSMM_DATATYPE(OTYPE), m, n, u, v, &ldc, &ldc); if (EXIT_SUCCESS == result) libxsmm_matdiff_reduce(&diff, &dv); } fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, 
diff.linf_abs); if (check < diff.l2_rel) { fprintf(stderr, "FAILED.\n"); result = EXIT_FAILURE; } } } break; case 6: { // streaming A and B fprintf(stdout, "Streamed (A,B)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for num_threads(0 == check ? nthreads : 1) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { libxsmm_blasint j = 0; #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ if (0 == check) j = omp_get_thread_num() * chunksize * csize; #endif const ITYPE *const ai = a + static_cast(asize) * helper.shuffle(i), *const bi = b + static_cast(bsize) * helper.shuffle(i); #if (0 != LIBXSMM_PREFETCH) xmm(ai, bi, c + j, LIBXSMM_GEMM_PREFETCH_A(ai + asize), LIBXSMM_GEMM_PREFETCH_B(bi + bsize), LIBXSMM_GEMM_PREFETCH_C(c + j)); #else xmm(ai, bi, c + j); #endif } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - sizeof(OTYPE) * csize * 2) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ case 7: { // indirect A and B fprintf(stdout, "Indirect (A,B)...\n"); #if defined(_OPENMP) # pragma omp parallel for num_threads(0 == check ? 
nthreads : 1) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a + static_cast(asize) * helper.shuffle(i); b_array[i] = b + static_cast(bsize) * helper.shuffle(i); #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ if (0 == check) { c_array[i] = d + static_cast(csize) * chunksize * omp_get_thread_num(); } else #endif c_array[i] = d; } const libxsmm_blasint ptrsize = sizeof(void*); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { USEOMP(libxsmm_gemm_batch)(LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(OTYPE), &transa, &transb, m, n, k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, 0 == check ? -s : s); } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - sizeof(OTYPE) * csize * 2) / (duration * (1ULL << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (0 == (benchmark & 1) && 0 != check) { /* Gold result is available */ libxsmm_matdiff_info diff; result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(OTYPE), m, n, c, d, &ldc, &ldc); fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < diff.l2_rel) { fprintf(stderr, "FAILED.\n"); result = EXIT_FAILURE; } } } break; case 8: { // cached fprintf(stdout, "Cached...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for num_threads(0 == check ? 
nthreads : 1) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { libxsmm_blasint j = 0; #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ if (0 == check) j = omp_get_thread_num() * chunksize * csize; #endif #if (0 != LIBXSMM_PREFETCH) xmm(a, b, c + j, LIBXSMM_GEMM_PREFETCH_A(a), LIBXSMM_GEMM_PREFETCH_B(b), LIBXSMM_GEMM_PREFETCH_C(c + j)); #else xmm(a, b, c + j); #endif } } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ case 9: { // indirect cached fprintf(stdout, "Indirect cached...\n"); #if defined(_OPENMP) # pragma omp parallel for num_threads(0 == check ? nthreads : 1) schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a; b_array[i] = b; #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ if (0 == check) { c_array[i] = d + static_cast(csize) * chunksize * omp_get_thread_num(); } else #endif c_array[i] = d; } const libxsmm_blasint ptrsize = sizeof(void*); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { USEOMP(libxsmm_gemm_batch)(LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(OTYPE), &transa, &transb, m, n, k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, 0 == check ? 
-s : s); } const unsigned long long ncycles = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2.0 * k - 1.0) * (static_cast(s) * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (0 == (benchmark & 1) && 0 != check) { /* Gold result is available */ libxsmm_matdiff_info diff; result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(OTYPE), m, n, c, d, &ldc, &ldc); fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < diff.l2_rel) { fprintf(stderr, "FAILED.\n"); result = EXIT_FAILURE; } } } break; default: throw "invalid case selected!"; } /*switch*/ // finalize LIBXSMM libxsmm_finalize(); fprintf(stdout, "Finished\n"); } } catch(const std::exception& e) { fprintf(stderr, "Error: %s\n", e.what()); result = EXIT_FAILURE; } catch(const char* message) { fprintf(stderr, "Error: %s\n", message); result = EXIT_FAILURE; } catch(...) { fprintf(stderr, "Error: unknown exception caught!\n"); result = EXIT_FAILURE; } return result; } libxsmm-1.17/samples/smm/specialized.sh000077500000000000000000000050141415223013700202060ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/smm/specialized.vcxproj000066400000000000000000000563521415223013700212770ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 specialized 
{65E7706E-44C2-49A3-ACCD-3FA72D522C8C} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmmext.lib;libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true true GenerateParallelCode SingleFile 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmmext-$(Configuration).lib;libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmmext.lib;libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true true GenerateParallelCode SingleFile 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
libxsmmext-$(Configuration).lib;libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmmext-$(Configuration).lib;libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true true GenerateParallelCode 10128,3948,10373,10382 HOST true /Zc:twoPhase- %(AdditionalOptions) 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmmext-$(Configuration).lib;libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/specfem/000077500000000000000000000000001415223013700162015ustar00rootroot00000000000000libxsmm-1.17/samples/specfem/Makefile000066400000000000000000000114551415223013700176470ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . 
CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) # Fortran code here does not allow for PEDANTIC=2 override PEDANTIC = 1 BLAS = 1 OMP = 1 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) # specfem example specific MODULES += my_libxsmm.mod constants.mod specfem_par.mod my_libxsmm.modmic constants.modmic specfem_par.modmic DFLAGS += 
-DFORCE_VECTORIZATION # OpenMP directives support ifneq (0,$(OMP)) DFLAGS += -DUSE_OPENMP endif # fixes library paths: substitutes path name from ../mkl/lib/intel64/.. to ../mkl/lib/mic/.. ifneq (0,$(KNC)) ifneq (0,$(MPSS)) lib_intel := mkl/lib/intel64 lib_mic := mkl/lib/mic LDFLAGS_MIC = $(subst $(lib_intel),$(lib_mic),$(LDFLAGS)) endif endif ## ## targets ## .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (,$(strip $(FC))) $(OUTDIR)/specfem: $(OUTDIR)/.make $(FTNOBJS) $(LIBDEP) ifneq (0,$(KNC)) ifneq (0,$(MPSS)) @echo "" @echo "building MIC/KNC version" @echo "" $(FLD) -o $@ -mmic $(FTNOBJS) $(FORTLIB) $(MAINLIB) $(FCMTFLAGS) $(SLDFLAGS) $(LDFLAGS_MIC) $(FLDFLAGS) $(ELDFLAGS) endif endif ifeq (0,$(KNC)) ifeq (0,$(MPSS)) @echo "" @echo "building host version" @echo "" $(FLD) -o $@ $(FTNOBJS) $(FORTLIB) $(MAINLIB) $(FCMTFLAGS) $(SLDFLAGS) $(LDFLAGS) $(FLDFLAGS) $(ELDFLAGS) endif endif else .PHONY: $(OUTDIR)/specfem endif .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif ## ## dependencies ## $(BLDDIR)/compute_forces_Dev-f90.o: $(BLDDIR)/specfem-f90.o $(BLDDIR)/compute_forces_noDev-f90.o: $(BLDDIR)/specfem-f90.o $(BLDDIR)/compute_forces_xsmm_dispatch-f90.o: $(BLDDIR)/specfem-f90.o $(BLDDIR)/compute_forces_xsmm_prefetch-f90.o: $(BLDDIR)/specfem-f90.o $(BLDDIR)/compute_forces_xsmm_static-f90.o: $(BLDDIR)/specfem-f90.o ## ## rules ## ifneq (,$(strip 
$(FC))) ifneq (0,$(KNC)) ifneq (0,$(MPSS)) $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) -mmic -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) -mmic -c $< -o $@ endif endif ifeq (0,$(KNC)) ifeq (0,$(MPSS)) $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ endif endif endif libxsmm-1.17/samples/specfem/README.md000066400000000000000000000122231415223013700174600ustar00rootroot00000000000000# SPECFEM Sample This sample contains a dummy example from a spectral-element stiffness kernel taken from [SPECFEM3D_GLOBE](https://github.com/geodynamics/specfem3d_globe). It is based on a 4th-order, spectral-element stiffness kernel for simulations of elastic wave propagation through the Earth. Matrix sizes used are (25,5), (5,25) and (5,5) determined by different cut-planes through a three dimensional (5,5,5)-element with a total of 125 GLL points. ## Usage Step-by-Step This example needs the LIBXSMM library to be built with static kernels, using MNK="5 25" (for matrix size (5,25), (25,5) and (5,5)). 
### Build LIBXSMM #### General Default Compilation In LIBXSMM root directory, compile the library with: ```bash make MNK="5 25" ALPHA=1 BETA=0 ``` #### Additional Compilation Examples Compilation using only single precision version and aggressive optimization: ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 ``` For Sandy Bridge CPUs: ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 ``` For Haswell CPUs: ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=2 ``` For Knights Corner (KNC) (and thereby creating a Sandy Bridge version): ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 \ OFFLOAD=1 KNC=1 ``` Installing libraries into a sub-directory workstation/: ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 \ OFFLOAD=1 KNC=1 \ PREFIX=workstation/ install-minimal ``` ### Build SpecFEM example code For default CPU host: ```bash cd sample/specfem make ``` For Knights Corner (KNC): ```bash cd sample/specfem make KNC=1 ``` Additionally, adding some specific Fortran compiler flags, for example: ```bash cd sample/specfem make FCFLAGS="-O3 -fopenmp" [...] ``` Note that steps 1 and 2 could be shortened by specifying a "specfem" make target in the LIBXSMM root directory: ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 specfem ``` For Knights Corner, this would need two steps: ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 OFFLOAD=1 KNC=1 make OPT=3 specfem_mic ``` ## Run the Performance Test For default CPU host: ```bash ./specfem.sh ``` For Knights Corner (KNC): ```bash ./specfem.sh -mic ``` ## Results Using Intel Compiler suite: icpc 15.0.2, icc 15.0.2, and ifort 15.0.2. 
### Sandy Bridge - Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz Library compilation by (root directory): ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 ``` Single threaded example run: ```bash cd sample/specfem make; OMP_NUM_THREADS=1 ./specfem.sh ``` Output: ```bash =============================================================== average over 15 repetitions timing with Deville loops = 0.1269 timing with unrolled loops = 0.1737 / speedup = -36.87 % timing with LIBXSMM dispatch = 0.1697 / speedup = -33.77 % timing with LIBXSMM prefetch = 0.1611 / speedup = -26.98 % timing with LIBXSMM static = 0.1392 / speedup = -9.70 % =============================================================== ``` ### Haswell - Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz Library compilation by (root directory): ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=2 ``` Single threaded example run: ```bash cd sample/specfem make; OMP_NUM_THREADS=1 ./specfem.sh ``` Output: ```bash =============================================================== average over 15 repetitions timing with Deville loops = 0.1028 timing with unrolled loops = 0.1385 / speedup = -34.73 % timing with LIBXSMM dispatch = 0.1408 / speedup = -37.02 % timing with LIBXSMM prefetch = 0.1327 / speedup = -29.07 % timing with LIBXSMM static = 0.1151 / speedup = -11.93 % =============================================================== ``` Multi-threaded example run: ```bash cd sample/specfem make OPT=3; OMP_NUM_THREADS=24 ./specfem.sh ``` Output: ```bash OpenMP information: number of threads = 24 [...] 
=============================================================== average over 15 repetitions timing with Deville loops = 0.0064 timing with unrolled loops = 0.0349 / speedup = -446.71 % timing with LIBXSMM dispatch = 0.0082 / speedup = -28.34 % timing with LIBXSMM prefetch = 0.0076 / speedup = -19.59 % timing with LIBXSMM static = 0.0068 / speedup = -5.78 % =============================================================== ``` ### Knights Corner - Intel Xeon Phi B1PRQ-5110P/5120D Library compilation by (root directory): ```bash make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 OFFLOAD=1 KNC=1 ``` Multi-threaded example run: ```bash cd sample/specfem make FCFLAGS="-O3 -fopenmp -warn" OPT=3 KNC=1; ./specfem.sh -mic ``` Output: ```bash OpenMP information: number of threads = 236 [...] =============================================================== average over 15 repetitions timing with Deville loops = 0.0164 timing with unrolled loops = 0.6982 / speedup = -4162.10 % timing with LIBXSMM dispatch = 0.0170 / speedup = -3.89 % timing with LIBXSMM static = 0.0149 / speedup = 9.22 % =============================================================== ``` libxsmm-1.17/samples/specfem/compute_forces_Dev.F90000066400000000000000000000327771415223013700223140ustar00rootroot00000000000000!===================================================================== ! ! S p e c f e m 3 D G l o b e V e r s i o n 7 . 0 ! -------------------------------------------------- ! ! Main historical authors: Dimitri Komatitsch and Jeroen Tromp ! Princeton University, USA ! and CNRS / University of Marseille, France ! (there are currently many more authors!) ! (c) Princeton University and CNRS / University of Marseille, April 2014 ! ! This program is free software; you can redistribute it and/or modify ! it under the terms of the GNU General Public License as published by ! the Free Software Foundation; either version 2 of the License, or ! (at your option) any later version. ! ! 
This program is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU General Public License for more details. ! ! You should have received a copy of the GNU General Public License along ! with this program; if not, write to the Free Software Foundation, Inc., ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ! !===================================================================== ! we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION ! and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh #include "config.fh" !------------------------------------------------------------------- ! ! compute forces routine ! !------------------------------------------------------------------- subroutine compute_forces_Dev() ! default: fortran-loops using Deville matrix routines for small matrix-matrix cut-planes in all three x/y/z-directions use specfem_par use my_libxsmm implicit none ! Deville ! manually inline the calls to the Deville et al. (2002) routines real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: & tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: & newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc real(kind=CUSTOM_REAL) :: fac1,fac2,fac3 ! for gravity real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: rho_s_H integer :: num_elements,ispec_p integer :: ispec,iglob #ifdef FORCE_VECTORIZATION integer :: ijk_spec,ip,iglob_p,ijk #else integer :: i,j,k #endif ! **************************************************** ! big loop over all spectral elements in the solid ! **************************************************** ! 
computed_elements = 0 if (iphase == 1) then ! outer elements (halo region) num_elements = nspec_outer else ! inner elements num_elements = nspec_inner endif #ifdef USE_OPENMP !$OMP PARALLEL DEFAULT(NONE) & !$OMP SHARED( & !$OMP num_elements,iphase,phase_ispec_inner, & !$OMP hprime_xxT,hprime_xx,hprimewgll_xx,hprimewgll_xxT, & !$OMP wgllwgll_xy_3D, wgllwgll_xz_3D, wgllwgll_yz_3D, & #ifdef FORCE_VECTORIZATION !$OMP ibool_inv_tbl, ibool_inv_st, num_globs, phase_iglob, & #endif !$OMP ibool, & !$OMP displ,accel, & !$OMP sum_terms ) & !$OMP PRIVATE( ispec,ispec_p,iglob, & #ifdef FORCE_VECTORIZATION !$OMP ijk_spec,ip,iglob_p, & !$OMP ijk, & #else !$OMP i,j,k, & #endif !$OMP fac1,fac2,fac3, & !$OMP tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, & !$OMP newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3, & !$OMP dummyx_loc,dummyy_loc,dummyz_loc, & !$OMP rho_s_H ) #endif ! loops over all spectral-elements #ifdef USE_OPENMP !$OMP DO SCHEDULE(GUIDED) #endif do ispec_p = 1,num_elements ! only compute elements which belong to current phase (inner or outer elements) ispec = phase_ispec_inner(ispec_p,iphase) DO_LOOP_IJK iglob = ibool(INDEX_IJK,ispec) dummyx_loc(INDEX_IJK) = displ(1,iglob) dummyy_loc(INDEX_IJK) = displ(2,iglob) dummyz_loc(INDEX_IJK) = displ(3,iglob) ENDDO_LOOP_IJK ! subroutines adapted from Deville, Fischer and Mund, High-order methods ! for incompressible fluid flow, Cambridge University Press (2002), ! pages 386 and 389 and Figure 8.3.1 ! computes 1. matrix multiplication for tempx1,.. call mxm5_3comp_singleA(hprime_xx,m1,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1,m2) ! computes 2. matrix multiplication for tempx2,.. call mxm5_3comp_3dmat_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m1,hprime_xxT,m1,tempx2,tempy2,tempz2,NGLLX) ! computes 3. matrix multiplication for tempx3,.. 
call mxm5_3comp_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m2,hprime_xxT,tempx3,tempy3,tempz3,m1) call compute_element_dummy(ispec,ibool,tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, & dummyx_loc,dummyy_loc,dummyz_loc,rho_s_H) ! subroutines adapted from Deville, Fischer and Mund, High-order methods ! for incompressible fluid flow, Cambridge University Press (2002), ! pages 386 and 389 and Figure 8.3.1 ! computes 1. matrix multiplication for newtempx1,.. call mxm5_3comp_singleA(hprimewgll_xxT,m1,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1,m2) ! computes 2. matrix multiplication for tempx2,.. call mxm5_3comp_3dmat_singleB(tempx2,tempy2,tempz2,m1,hprimewgll_xx,m1,newtempx2,newtempy2,newtempz2,NGLLX) ! computes 3. matrix multiplication for newtempx3,.. call mxm5_3comp_singleB(tempx3,tempy3,tempz3,m2,hprimewgll_xx,newtempx3,newtempy3,newtempz3,m1) ! sums contributions DO_LOOP_IJK fac1 = wgllwgll_yz_3D(INDEX_IJK) fac2 = wgllwgll_xz_3D(INDEX_IJK) fac3 = wgllwgll_xy_3D(INDEX_IJK) sum_terms(1,INDEX_IJK,ispec) = - (fac1*newtempx1(INDEX_IJK) + fac2*newtempx2(INDEX_IJK) + fac3*newtempx3(INDEX_IJK)) sum_terms(2,INDEX_IJK,ispec) = - (fac1*newtempy1(INDEX_IJK) + fac2*newtempy2(INDEX_IJK) + fac3*newtempy3(INDEX_IJK)) sum_terms(3,INDEX_IJK,ispec) = - (fac1*newtempz1(INDEX_IJK) + fac2*newtempz2(INDEX_IJK) + fac3*newtempz3(INDEX_IJK)) ENDDO_LOOP_IJK ! adds gravity terms if (GRAVITY_VAL) then #ifdef FORCE_VECTORIZATION do ijk = 1,NDIM*NGLLCUBE sum_terms(ijk,1,1,1,ispec) = sum_terms(ijk,1,1,1,ispec) + rho_s_H(ijk,1,1,1) enddo #else do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX sum_terms(1,i,j,k,ispec) = sum_terms(1,i,j,k,ispec) + rho_s_H(i,j,k,1) sum_terms(2,i,j,k,ispec) = sum_terms(2,i,j,k,ispec) + rho_s_H(i,j,k,2) sum_terms(3,i,j,k,ispec) = sum_terms(3,i,j,k,ispec) + rho_s_H(i,j,k,3) enddo enddo enddo #endif endif ! updates acceleration #ifdef FORCE_VECTORIZATION ! update will be done later at the very end.. #else ! updates for non-vectorization case ! 
note: Critical OpenMP here might degrade performance, ! especially for a larger number of threads (>8). ! Using atomic operations can partially help. #ifndef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP CRITICAL #endif #endif ! we can force vectorization using a compiler directive here because we know that there is no dependency ! inside a given spectral element, since all the global points of a local elements are different by definition ! (only common points between different elements can be the same) ! IBM, Portland PGI, and Intel and Cray syntax (Intel and Cray are the same) !IBM* ASSERT (NODEPS) !pgi$ ivdep !DIR$ IVDEP DO_LOOP_IJK iglob = ibool(INDEX_IJK,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(1,iglob) = accel(1,iglob) + sum_terms(1,INDEX_IJK,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(2,iglob) = accel(2,iglob) + sum_terms(2,INDEX_IJK,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(3,iglob) = accel(3,iglob) + sum_terms(3,INDEX_IJK,ispec) ENDDO_LOOP_IJK #ifndef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP END CRITICAL #endif #endif #endif enddo ! ispec #ifdef USE_OPENMP !$OMP enddo #endif ! updates acceleration #ifdef FORCE_VECTORIZATION ! updates for vectorized case ! loops over all global nodes in this phase (inner/outer) #ifdef USE_OPENMP !$OMP DO #endif do iglob_p = 1,num_globs(iphase) ! global node index iglob = phase_iglob(iglob_p,iphase) ! loops over valence points do ip = ibool_inv_st(iglob_p,iphase),ibool_inv_st(iglob_p+1,iphase)-1 ! local 1D index from array ibool ijk_spec = ibool_inv_tbl(ip,iphase) ! do NOT use array syntax ":" for the three statements below otherwise most compilers ! 
will not be able to vectorize the outer loop accel(1,iglob) = accel(1,iglob) + sum_terms(1,ijk_spec,1,1,1) accel(2,iglob) = accel(2,iglob) + sum_terms(2,ijk_spec,1,1,1) accel(3,iglob) = accel(3,iglob) + sum_terms(3,ijk_spec,1,1,1) enddo enddo #ifdef USE_OPENMP !$OMP enddo #endif #endif #ifdef USE_OPENMP !$OMP END PARALLEL #endif contains !-------------------------------------------------------------------------------------------- ! ! matrix-matrix multiplications ! ! subroutines adapted from Deville, Fischer and Mund, High-order methods ! for incompressible fluid flow, Cambridge University Press (2002), ! pages 386 and 389 and Figure 8.3.1 ! !-------------------------------------------------------------------------------------------- ! ! note: the matrix-matrix multiplications are used for very small matrices ( 5 x 5 x 5 elements); ! thus, calling external optimized libraries for these multiplications are in general slower ! ! please leave the routines here to help compilers inlining the code subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3) ! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays ! use constants,only: CUSTOM_REAL implicit none integer,intent(in) :: n1,n3 real(kind=CUSTOM_REAL),dimension(n1,5),intent(in) :: A real(kind=CUSTOM_REAL),dimension(5,n3),intent(in) :: B1,B2,B3 real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C1,C2,C3 ! local parameters integer :: i,j ! 
matrix-matrix multiplication do j = 1,n3 !dir$ ivdep do i = 1,n1 C1(i,j) = A(i,1) * B1(1,j) & + A(i,2) * B1(2,j) & + A(i,3) * B1(3,j) & + A(i,4) * B1(4,j) & + A(i,5) * B1(5,j) C2(i,j) = A(i,1) * B2(1,j) & + A(i,2) * B2(2,j) & + A(i,3) * B2(3,j) & + A(i,4) * B2(4,j) & + A(i,5) * B2(5,j) C3(i,j) = A(i,1) * B3(1,j) & + A(i,2) * B3(2,j) & + A(i,3) * B3(3,j) & + A(i,4) * B3(4,j) & + A(i,5) * B3(5,j) enddo enddo end subroutine mxm5_3comp_singleA !-------------------------------------------------------------------------------------------- subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3) ! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays ! use constants,only: CUSTOM_REAL implicit none integer,intent(in) :: n1,n3 real(kind=CUSTOM_REAL),dimension(n1,5),intent(in) :: A1,A2,A3 real(kind=CUSTOM_REAL),dimension(5,n3),intent(in) :: B real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C1,C2,C3 ! local parameters integer :: i,j ! matrix-matrix multiplication do j = 1,n3 !dir$ ivdep do i = 1,n1 C1(i,j) = A1(i,1) * B(1,j) & + A1(i,2) * B(2,j) & + A1(i,3) * B(3,j) & + A1(i,4) * B(4,j) & + A1(i,5) * B(5,j) C2(i,j) = A2(i,1) * B(1,j) & + A2(i,2) * B(2,j) & + A2(i,3) * B(3,j) & + A2(i,4) * B(4,j) & + A2(i,5) * B(5,j) C3(i,j) = A3(i,1) * B(1,j) & + A3(i,2) * B(2,j) & + A3(i,3) * B(3,j) & + A3(i,4) * B(4,j) & + A3(i,5) * B(5,j) enddo enddo end subroutine mxm5_3comp_singleB !-------------------------------------------------------------------------------------------- subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3) ! 3 different arrays for x/y/z-components, 3-dimensional arrays (5,5,5), same B matrix for all 3 component arrays ! use constants,only: CUSTOM_REAL implicit none integer,intent(in) :: n1,n2,n3 real(kind=CUSTOM_REAL),dimension(n1,5,n3),intent(in) :: A1,A2,A3 real(kind=CUSTOM_REAL),dimension(5,n2),intent(in) :: B real(kind=CUSTOM_REAL),dimension(n1,n2,n3),intent(out) :: C1,C2,C3 ! 
local parameters integer :: i,j,k ! matrix-matrix multiplication do k = 1,n3 do j = 1,n2 !dir$ ivdep do i = 1,n1 C1(i,j,k) = A1(i,1,k) * B(1,j) & + A1(i,2,k) * B(2,j) & + A1(i,3,k) * B(3,j) & + A1(i,4,k) * B(4,j) & + A1(i,5,k) * B(5,j) C2(i,j,k) = A2(i,1,k) * B(1,j) & + A2(i,2,k) * B(2,j) & + A2(i,3,k) * B(3,j) & + A2(i,4,k) * B(4,j) & + A2(i,5,k) * B(5,j) C3(i,j,k) = A3(i,1,k) * B(1,j) & + A3(i,2,k) * B(2,j) & + A3(i,3,k) * B(3,j) & + A3(i,4,k) * B(4,j) & + A3(i,5,k) * B(5,j) enddo enddo enddo end subroutine mxm5_3comp_3dmat_singleB end subroutine compute_forces_Dev libxsmm-1.17/samples/specfem/compute_forces_noDev.F90000066400000000000000000000351621415223013700226400ustar00rootroot00000000000000!===================================================================== ! ! S p e c f e m 3 D G l o b e V e r s i o n 7 . 0 ! -------------------------------------------------- ! ! Main historical authors: Dimitri Komatitsch and Jeroen Tromp ! Princeton University, USA ! and CNRS / University of Marseille, France ! (there are currently many more authors!) ! (c) Princeton University and CNRS / University of Marseille, April 2014 ! ! This program is free software; you can redistribute it and/or modify ! it under the terms of the GNU General Public License as published by ! the Free Software Foundation; either version 2 of the License, or ! (at your option) any later version. ! ! This program is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU General Public License for more details. ! ! You should have received a copy of the GNU General Public License along ! with this program; if not, write to the Free Software Foundation, Inc., ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ! !===================================================================== !------------------------------------------------------------------- ! ! 
compute forces routine ! !------------------------------------------------------------------- subroutine compute_forces_noDev() ! fortran-loops (without Deville routines) using unrolling of the inner-most loop (over 5) use specfem_par use my_libxsmm implicit none ! local parameters real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: & tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: & newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc real(kind=CUSTOM_REAL) :: fac1,fac2,fac3 ! for gravity real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: rho_s_H integer :: num_elements,ispec_p integer :: ispec,iglob integer :: i,j,k ! **************************************************** ! big loop over all spectral elements in the solid ! **************************************************** ! computed_elements = 0 if (iphase == 1) then ! outer elements (halo region) num_elements = nspec_outer else ! inner elements num_elements = nspec_inner endif #ifdef USE_OPENMP !$OMP PARALLEL DEFAULT(NONE) & !$OMP SHARED( & !$OMP num_elements,iphase,phase_ispec_inner, & !$OMP hprime_xxT,hprimewgll_xx, & !$OMP wgllwgll_xy_3D, wgllwgll_xz_3D, wgllwgll_yz_3D, & !$OMP ibool, & !$OMP displ,accel, & !$OMP sum_terms ) & !$OMP PRIVATE( ispec,ispec_p,iglob, & !$OMP i,j,k, & !$OMP fac1,fac2,fac3, & !$OMP tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, & !$OMP newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3, & !$OMP dummyx_loc,dummyy_loc,dummyz_loc, & !$OMP rho_s_H ) #endif ! loops over all spectral-elements #ifdef USE_OPENMP !$OMP DO SCHEDULE(GUIDED) #endif do ispec_p = 1,num_elements ! 
only compute elements which belong to current phase (inner or outer elements) ispec = phase_ispec_inner(ispec_p,iphase) do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX iglob = ibool(i,j,k,ispec) dummyx_loc(i,j,k) = displ(1,iglob) dummyy_loc(i,j,k) = displ(2,iglob) dummyz_loc(i,j,k) = displ(3,iglob) enddo enddo enddo ! uses loop unrolling for NGLLX == NGLLY == NGLLZ == 5 do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX ! general NGLLX == NGLLY == NGLLZ !tempx1l = 0._CUSTOM_REAL !tempx2l = 0._CUSTOM_REAL !tempx3l = 0._CUSTOM_REAL !tempy1l = 0._CUSTOM_REAL !tempy2l = 0._CUSTOM_REAL !tempy3l = 0._CUSTOM_REAL !tempz1l = 0._CUSTOM_REAL !tempz2l = 0._CUSTOM_REAL !tempz3l = 0._CUSTOM_REAL !do l = 1,NGLLX ! tempx1l = tempx1l + dummyx_loc(l,j,k)*hprime_xx(i,l) ! tempy1l = tempy1l + dummyy_loc(l,j,k)*hprime_xx(i,l) ! tempz1l = tempz1l + dummyz_loc(l,j,k)*hprime_xx(i,l) ! !!! can merge these loops because NGLLX = NGLLY = NGLLZ ! tempx2l = tempx2l + dummyx_loc(i,l,k)*hprime_yy(j,l) ! tempy2l = tempy2l + dummyy_loc(i,l,k)*hprime_yy(j,l) ! tempz2l = tempz2l + dummyz_loc(i,l,k)*hprime_yy(j,l) ! !!! can merge these loops because NGLLX = NGLLY = NGLLZ ! tempx3l = tempx3l + dummyx_loc(i,j,l)*hprime_zz(k,l) ! tempy3l = tempy3l + dummyy_loc(i,j,l)*hprime_zz(k,l) ! tempz3l = tempz3l + dummyz_loc(i,j,l)*hprime_zz(k,l) !enddo ! unrolled tempx1(i,j,k) = dummyx_loc(1,j,k)*hprime_xxT(1,i) & + dummyx_loc(2,j,k)*hprime_xxT(2,i) & + dummyx_loc(3,j,k)*hprime_xxT(3,i) & + dummyx_loc(4,j,k)*hprime_xxT(4,i) & + dummyx_loc(5,j,k)*hprime_xxT(5,i) tempy1(i,j,k) = dummyy_loc(1,j,k)*hprime_xxT(1,i) & + dummyy_loc(2,j,k)*hprime_xxT(2,i) & + dummyy_loc(3,j,k)*hprime_xxT(3,i) & + dummyy_loc(4,j,k)*hprime_xxT(4,i) & + dummyy_loc(5,j,k)*hprime_xxT(5,i) tempz1(i,j,k) = dummyz_loc(1,j,k)*hprime_xxT(1,i) & + dummyz_loc(2,j,k)*hprime_xxT(2,i) & + dummyz_loc(3,j,k)*hprime_xxT(3,i) & + dummyz_loc(4,j,k)*hprime_xxT(4,i) & + dummyz_loc(5,j,k)*hprime_xxT(5,i) !!! 
can merge these loops because NGLLX = NGLLY = NGLLZ tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) & + dummyx_loc(i,2,k)*hprime_xxT(2,j) & + dummyx_loc(i,3,k)*hprime_xxT(3,j) & + dummyx_loc(i,4,k)*hprime_xxT(4,j) & + dummyx_loc(i,5,k)*hprime_xxT(5,j) tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) & + dummyy_loc(i,2,k)*hprime_xxT(2,j) & + dummyy_loc(i,3,k)*hprime_xxT(3,j) & + dummyy_loc(i,4,k)*hprime_xxT(4,j) & + dummyy_loc(i,5,k)*hprime_xxT(5,j) tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) & + dummyz_loc(i,2,k)*hprime_xxT(2,j) & + dummyz_loc(i,3,k)*hprime_xxT(3,j) & + dummyz_loc(i,4,k)*hprime_xxT(4,j) & + dummyz_loc(i,5,k)*hprime_xxT(5,j) !!! can merge these loops because NGLLX = NGLLY = NGLLZ tempx3(i,j,k) = dummyx_loc(i,j,1)*hprime_xxT(1,k) & + dummyx_loc(i,j,2)*hprime_xxT(2,k) & + dummyx_loc(i,j,3)*hprime_xxT(3,k) & + dummyx_loc(i,j,4)*hprime_xxT(4,k) & + dummyx_loc(i,j,5)*hprime_xxT(5,k) tempy3(i,j,k) = dummyy_loc(i,j,1)*hprime_xxT(1,k) & + dummyy_loc(i,j,2)*hprime_xxT(2,k) & + dummyy_loc(i,j,3)*hprime_xxT(3,k) & + dummyy_loc(i,j,4)*hprime_xxT(4,k) & + dummyy_loc(i,j,5)*hprime_xxT(5,k) tempz3(i,j,k) = dummyz_loc(i,j,1)*hprime_xxT(1,k) & + dummyz_loc(i,j,2)*hprime_xxT(2,k) & + dummyz_loc(i,j,3)*hprime_xxT(3,k) & + dummyz_loc(i,j,4)*hprime_xxT(4,k) & + dummyz_loc(i,j,5)*hprime_xxT(5,k) enddo enddo enddo call compute_element_dummy(ispec,ibool,tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, & dummyx_loc,dummyy_loc,dummyz_loc,rho_s_H) ! uses loop unrolling for NGLLX == NGLLY == NGLLZ == 5 do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX ! general NGLLX == NGLLY == NGLLZ !tempx1l = 0._CUSTOM_REAL !tempx2l = 0._CUSTOM_REAL !tempx3l = 0._CUSTOM_REAL !tempy1l = 0._CUSTOM_REAL !tempy2l = 0._CUSTOM_REAL !tempy3l = 0._CUSTOM_REAL !tempz1l = 0._CUSTOM_REAL !tempz2l = 0._CUSTOM_REAL !tempz3l = 0._CUSTOM_REAL !do l = 1,NGLLX ! fac1 = hprimewgll_xx(l,i) ! tempx1l = tempx1l + tempx1(l,j,k)*fac1 ! tempy1l = tempy1l + tempy1(l,j,k)*fac1 ! 
tempz1l = tempz1l + tempz1(l,j,k)*fac1 ! !!! can merge these loops because NGLLX = NGLLY = NGLLZ ! fac2 = hprimewgll_yy(l,j) ! tempx2l = tempx2l + tempx2(i,l,k)*fac2 ! tempy2l = tempy2l + tempy2(i,l,k)*fac2 ! tempz2l = tempz2l + tempz2(i,l,k)*fac2 ! !!! can merge these loops because NGLLX = NGLLY = NGLLZ ! fac3 = hprimewgll_zz(l,k) ! tempx3l = tempx3l + tempx3(i,j,l)*fac3 ! tempy3l = tempy3l + tempy3(i,j,l)*fac3 ! tempz3l = tempz3l + tempz3(i,j,l)*fac3 !enddo ! unrolled newtempx1(i,j,k) = tempx1(1,j,k)*hprimewgll_xx(1,i) & + tempx1(2,j,k)*hprimewgll_xx(2,i) & + tempx1(3,j,k)*hprimewgll_xx(3,i) & + tempx1(4,j,k)*hprimewgll_xx(4,i) & + tempx1(5,j,k)*hprimewgll_xx(5,i) newtempy1(i,j,k) = tempy1(1,j,k)*hprimewgll_xx(1,i) & + tempy1(2,j,k)*hprimewgll_xx(2,i) & + tempy1(3,j,k)*hprimewgll_xx(3,i) & + tempy1(4,j,k)*hprimewgll_xx(4,i) & + tempy1(5,j,k)*hprimewgll_xx(5,i) newtempz1(i,j,k) = tempz1(1,j,k)*hprimewgll_xx(1,i) & + tempz1(2,j,k)*hprimewgll_xx(2,i) & + tempz1(3,j,k)*hprimewgll_xx(3,i) & + tempz1(4,j,k)*hprimewgll_xx(4,i) & + tempz1(5,j,k)*hprimewgll_xx(5,i) !!! can merge these loops because NGLLX = NGLLY = NGLLZ newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) & + tempx2(i,2,k)*hprimewgll_xx(2,j) & + tempx2(i,3,k)*hprimewgll_xx(3,j) & + tempx2(i,4,k)*hprimewgll_xx(4,j) & + tempx2(i,5,k)*hprimewgll_xx(5,j) newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) & + tempy2(i,2,k)*hprimewgll_xx(2,j) & + tempy2(i,3,k)*hprimewgll_xx(3,j) & + tempy2(i,4,k)*hprimewgll_xx(4,j) & + tempy2(i,5,k)*hprimewgll_xx(5,j) newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) & + tempz2(i,2,k)*hprimewgll_xx(2,j) & + tempz2(i,3,k)*hprimewgll_xx(3,j) & + tempz2(i,4,k)*hprimewgll_xx(4,j) & + tempz2(i,5,k)*hprimewgll_xx(5,j) !!! 
can merge these loops because NGLLX = NGLLY = NGLLZ newtempx3(i,j,k) = tempx3(i,j,1)*hprimewgll_xx(1,k) & + tempx3(i,j,2)*hprimewgll_xx(2,k) & + tempx3(i,j,3)*hprimewgll_xx(3,k) & + tempx3(i,j,4)*hprimewgll_xx(4,k) & + tempx3(i,j,5)*hprimewgll_xx(5,k) newtempy3(i,j,k) = tempy3(i,j,1)*hprimewgll_xx(1,k) & + tempy3(i,j,2)*hprimewgll_xx(2,k) & + tempy3(i,j,3)*hprimewgll_xx(3,k) & + tempy3(i,j,4)*hprimewgll_xx(4,k) & + tempy3(i,j,5)*hprimewgll_xx(5,k) newtempz3(i,j,k) = tempz3(i,j,1)*hprimewgll_xx(1,k) & + tempz3(i,j,2)*hprimewgll_xx(2,k) & + tempz3(i,j,3)*hprimewgll_xx(3,k) & + tempz3(i,j,4)*hprimewgll_xx(4,k) & + tempz3(i,j,5)*hprimewgll_xx(5,k) enddo enddo enddo ! sums contributions do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX fac1 = wgllwgll_yz_3D(i,j,k) fac2 = wgllwgll_xz_3D(i,j,k) fac3 = wgllwgll_xy_3D(i,j,k) sum_terms(1,i,j,k,ispec) = - (fac1*newtempx1(i,j,k) + fac2*newtempx2(i,j,k) + fac3*newtempx3(i,j,k)) sum_terms(2,i,j,k,ispec) = - (fac1*newtempy1(i,j,k) + fac2*newtempy2(i,j,k) + fac3*newtempy3(i,j,k)) sum_terms(3,i,j,k,ispec) = - (fac1*newtempz1(i,j,k) + fac2*newtempz2(i,j,k) + fac3*newtempz3(i,j,k)) enddo enddo enddo ! adds gravity terms if (GRAVITY_VAL) then do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX sum_terms(1,i,j,k,ispec) = sum_terms(1,i,j,k,ispec) + rho_s_H(i,j,k,1) sum_terms(2,i,j,k,ispec) = sum_terms(2,i,j,k,ispec) + rho_s_H(i,j,k,2) sum_terms(3,i,j,k,ispec) = sum_terms(3,i,j,k,ispec) + rho_s_H(i,j,k,3) enddo enddo enddo endif ! updates acceleration ! updates for non-vectorization case ! note: Critical OpenMP here might degrade performance, ! especially for a larger number of threads (>8). ! Using atomic operations can partially help. #ifndef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP CRITICAL #endif #endif ! we can force vectorization using a compiler directive here because we know that there is no dependency ! inside a given spectral element, since all the global points of a local elements are different by definition ! 
(only common points between different elements can be the same) ! IBM, Portland PGI, and Intel and Cray syntax (Intel and Cray are the same) !IBM* ASSERT (NODEPS) !pgi$ ivdep !DIR$ IVDEP do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX iglob = ibool(i,j,k,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(1,iglob) = accel(1,iglob) + sum_terms(1,i,j,k,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(2,iglob) = accel(2,iglob) + sum_terms(2,i,j,k,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(3,iglob) = accel(3,iglob) + sum_terms(3,i,j,k,ispec) enddo enddo enddo #ifndef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP END CRITICAL #endif #endif enddo ! ispec #ifdef USE_OPENMP !$OMP enddo !$OMP END PARALLEL #endif end subroutine compute_forces_noDev libxsmm-1.17/samples/specfem/compute_forces_xsmm_dispatch.F90000066400000000000000000000366141415223013700244330ustar00rootroot00000000000000!===================================================================== ! ! S p e c f e m 3 D G l o b e V e r s i o n 7 . 0 ! -------------------------------------------------- ! ! Main historical authors: Dimitri Komatitsch and Jeroen Tromp ! Princeton University, USA ! and CNRS / University of Marseille, France ! (there are currently many more authors!) ! (c) Princeton University and CNRS / University of Marseille, April 2014 ! ! This program is free software; you can redistribute it and/or modify ! it under the terms of the GNU General Public License as published by ! the Free Software Foundation; either version 2 of the License, or ! (at your option) any later version. ! ! This program is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU General Public License for more details. ! ! 
You should have received a copy of the GNU General Public License along ! with this program; if not, write to the Free Software Foundation, Inc., ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ! !===================================================================== ! we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION ! and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh #include "config.fh" !------------------------------------------------------------------- ! ! compute forces routine ! !------------------------------------------------------------------- subroutine compute_forces_with_xsmm() ! uses LIBXSMM dispatched functions ! (based on Deville version compute_forces_Dev.F90) use specfem_par use my_libxsmm implicit none ! Deville ! manually inline the calls to the Deville et al. (2002) routines real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: & tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: & newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc real(kind=CUSTOM_REAL) :: fac1,fac2,fac3 ! for gravity real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: rho_s_H integer :: num_elements,ispec_p integer :: ispec,iglob #ifdef FORCE_VECTORIZATION integer :: ijk_spec,ip,iglob_p,ijk #else integer :: i,j,k #endif ! **************************************************** ! big loop over all spectral elements in the solid ! **************************************************** ! computed_elements = 0 if (iphase == 1) then ! outer elements (halo region) num_elements = nspec_outer else ! 
inner elements num_elements = nspec_inner endif #ifdef USE_OPENMP !$OMP PARALLEL DEFAULT(NONE) & !$OMP SHARED( & !$OMP num_elements,iphase,phase_ispec_inner, & !$OMP hprime_xxT,hprime_xx,hprimewgll_xx,hprimewgll_xxT, & !$OMP wgllwgll_xy_3D, wgllwgll_xz_3D, wgllwgll_yz_3D, & #ifdef FORCE_VECTORIZATION !$OMP ibool_inv_tbl, ibool_inv_st, num_globs, phase_iglob, & #endif !$OMP ibool, & !$OMP displ,accel, & !$OMP sum_terms ) & !$OMP PRIVATE( ispec,ispec_p,iglob, & #ifdef FORCE_VECTORIZATION !$OMP ijk_spec,ip,iglob_p, & !$OMP ijk, & #else !$OMP i,j,k, & #endif !$OMP fac1,fac2,fac3, & !$OMP tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, & !$OMP newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3, & !$OMP dummyx_loc,dummyy_loc,dummyz_loc, & !$OMP rho_s_H ) #endif ! loops over all spectral-elements #ifdef USE_OPENMP !$OMP DO SCHEDULE(GUIDED) #endif do ispec_p = 1,num_elements ! only compute elements which belong to current phase (inner or outer elements) ispec = phase_ispec_inner(ispec_p,iphase) DO_LOOP_IJK iglob = ibool(INDEX_IJK,ispec) dummyx_loc(INDEX_IJK) = displ(1,iglob) dummyy_loc(INDEX_IJK) = displ(2,iglob) dummyz_loc(INDEX_IJK) = displ(3,iglob) ENDDO_LOOP_IJK ! subroutines adapted from Deville, Fischer and Mund, High-order methods ! for incompressible fluid flow, Cambridge University Press (2002), ! pages 386 and 389 and Figure 8.3.1 ! computes 1. matrix multiplication for tempx1,.. call mxm5_3comp_singleA(hprime_xx,m1,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1,m2) ! computes 2. matrix multiplication for tempx2,.. call mxm5_3comp_3dmat_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m1,hprime_xxT,m1,tempx2,tempy2,tempz2,NGLLX) ! computes 3. matrix multiplication for tempx3,.. 
call mxm5_3comp_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m2,hprime_xxT,tempx3,tempy3,tempz3,m1) call compute_element_dummy(ispec,ibool,tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, & dummyx_loc,dummyy_loc,dummyz_loc,rho_s_H) ! subroutines adapted from Deville, Fischer and Mund, High-order methods ! for incompressible fluid flow, Cambridge University Press (2002), ! pages 386 and 389 and Figure 8.3.1 ! computes 1. matrix multiplication for newtempx1,.. call mxm5_3comp_singleA(hprimewgll_xxT,m1,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1,m2) ! computes 2. matrix multiplication for tempx2,.. call mxm5_3comp_3dmat_singleB(tempx2,tempy2,tempz2,m1,hprimewgll_xx,m1,newtempx2,newtempy2,newtempz2,NGLLX) ! computes 3. matrix multiplication for newtempx3,.. call mxm5_3comp_singleB(tempx3,tempy3,tempz3,m2,hprimewgll_xx,newtempx3,newtempy3,newtempz3,m1) ! sums contributions DO_LOOP_IJK fac1 = wgllwgll_yz_3D(INDEX_IJK) fac2 = wgllwgll_xz_3D(INDEX_IJK) fac3 = wgllwgll_xy_3D(INDEX_IJK) sum_terms(1,INDEX_IJK,ispec) = - (fac1*newtempx1(INDEX_IJK) + fac2*newtempx2(INDEX_IJK) + fac3*newtempx3(INDEX_IJK)) sum_terms(2,INDEX_IJK,ispec) = - (fac1*newtempy1(INDEX_IJK) + fac2*newtempy2(INDEX_IJK) + fac3*newtempy3(INDEX_IJK)) sum_terms(3,INDEX_IJK,ispec) = - (fac1*newtempz1(INDEX_IJK) + fac2*newtempz2(INDEX_IJK) + fac3*newtempz3(INDEX_IJK)) ENDDO_LOOP_IJK ! adds gravity terms if (GRAVITY_VAL) then #ifdef FORCE_VECTORIZATION do ijk = 1,NDIM*NGLLCUBE sum_terms(ijk,1,1,1,ispec) = sum_terms(ijk,1,1,1,ispec) + rho_s_H(ijk,1,1,1) enddo #else do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX sum_terms(1,i,j,k,ispec) = sum_terms(1,i,j,k,ispec) + rho_s_H(i,j,k,1) sum_terms(2,i,j,k,ispec) = sum_terms(2,i,j,k,ispec) + rho_s_H(i,j,k,2) sum_terms(3,i,j,k,ispec) = sum_terms(3,i,j,k,ispec) + rho_s_H(i,j,k,3) enddo enddo enddo #endif endif ! updates acceleration #ifdef FORCE_VECTORIZATION ! update will be done later at the very end.. #else ! updates for non-vectorization case ! 
note: Critical OpenMP here might degrade performance, ! especially for a larger number of threads (>8). ! Using atomic operations can partially help. #ifndef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP CRITICAL #endif #endif ! we can force vectorization using a compiler directive here because we know that there is no dependency ! inside a given spectral element, since all the global points of a local elements are different by definition ! (only common points between different elements can be the same) ! IBM, Portland PGI, and Intel and Cray syntax (Intel and Cray are the same) !IBM* ASSERT (NODEPS) !pgi$ ivdep !DIR$ IVDEP DO_LOOP_IJK iglob = ibool(INDEX_IJK,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(1,iglob) = accel(1,iglob) + sum_terms(1,INDEX_IJK,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(2,iglob) = accel(2,iglob) + sum_terms(2,INDEX_IJK,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(3,iglob) = accel(3,iglob) + sum_terms(3,INDEX_IJK,ispec) ENDDO_LOOP_IJK #ifndef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP END CRITICAL #endif #endif #endif enddo ! ispec #ifdef USE_OPENMP !$OMP enddo #endif ! updates acceleration #ifdef FORCE_VECTORIZATION ! updates for vectorized case ! loops over all global nodes in this phase (inner/outer) #ifdef USE_OPENMP !$OMP DO #endif do iglob_p = 1,num_globs(iphase) ! global node index iglob = phase_iglob(iglob_p,iphase) ! loops over valence points do ip = ibool_inv_st(iglob_p,iphase),ibool_inv_st(iglob_p+1,iphase)-1 ! local 1D index from array ibool ijk_spec = ibool_inv_tbl(ip,iphase) ! do NOT use array syntax ":" for the three statements below otherwise most compilers ! 
will not be able to vectorize the outer loop accel(1,iglob) = accel(1,iglob) + sum_terms(1,ijk_spec,1,1,1) accel(2,iglob) = accel(2,iglob) + sum_terms(2,ijk_spec,1,1,1) accel(3,iglob) = accel(3,iglob) + sum_terms(3,ijk_spec,1,1,1) enddo enddo #ifdef USE_OPENMP !$OMP enddo #endif #endif #ifdef USE_OPENMP !$OMP END PARALLEL #endif contains !-------------------------------------------------------------------------------------------- ! ! matrix-matrix multiplications ! ! subroutines adapted from Deville, Fischer and Mund, High-order methods ! for incompressible fluid flow, Cambridge University Press (2002), ! pages 386 and 389 and Figure 8.3.1 ! !-------------------------------------------------------------------------------------------- ! ! note: the matrix-matrix multiplications are used for very small matrices ( 5 x 5 x 5 elements); ! thus, calling external optimized libraries for these multiplications are in general slower ! ! please leave the routines here to help compilers inlining the code subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3) ! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays #ifdef XSMM use my_libxsmm,only: USE_XSMM_FUNCTION, xmm1, & libxsmm_mmcall_abc => libxsmm_smmcall_abc ! debug timing !use my_libxsmm,only: libxsmm_timer_tick,libxsmm_timer_duration #endif implicit none integer,intent(in) :: n1,n3 real(kind=CUSTOM_REAL),dimension(n1,5),intent(in),target :: A real(kind=CUSTOM_REAL),dimension(5,n3),intent(in),target :: B1,B2,B3 real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out),target :: C1,C2,C3 ! local parameters integer :: i,j #ifdef XSMM ! debug timing !double precision :: duration !integer(kind=8) :: start ! debug timing !start = libxsmm_timer_tick() ! matrix-matrix multiplication C = alpha A * B + beta C ! 
with A(n1,n2) 5x5-matrix, B(n2,n3) 5x25-matrix and C(n1,n3) 5x25-matrix if (USE_XSMM_FUNCTION) then call libxsmm_mmcall_abc(xmm1, A, B1, C1) call libxsmm_mmcall_abc(xmm1, A, B2, C2) call libxsmm_mmcall_abc(xmm1, A, B3, C3) ! debug timing !duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !print *,'duration: ',duration ! debug !do j = 1,n3 ! do i = 1,n1 ! print *,i,j,'debug xsmm',C1(i,j),C2(i,j),C1(i,j) - C2(i,j) ! enddo !enddo !stop 'test stop' return endif #endif ! matrix-matrix multiplication do j = 1,n3 !dir$ ivdep do i = 1,n1 C1(i,j) = A(i,1) * B1(1,j) & + A(i,2) * B1(2,j) & + A(i,3) * B1(3,j) & + A(i,4) * B1(4,j) & + A(i,5) * B1(5,j) C2(i,j) = A(i,1) * B2(1,j) & + A(i,2) * B2(2,j) & + A(i,3) * B2(3,j) & + A(i,4) * B2(4,j) & + A(i,5) * B2(5,j) C3(i,j) = A(i,1) * B3(1,j) & + A(i,2) * B3(2,j) & + A(i,3) * B3(3,j) & + A(i,4) * B3(4,j) & + A(i,5) * B3(5,j) enddo enddo end subroutine mxm5_3comp_singleA !-------------------------------------------------------------------------------------------- subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3) ! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays #ifdef XSMM use my_libxsmm,only: USE_XSMM_FUNCTION, xmm2, & libxsmm_mmcall_abc => libxsmm_smmcall_abc #endif implicit none integer,intent(in) :: n1,n3 real(kind=CUSTOM_REAL),dimension(n1,5),intent(in),target :: A1,A2,A3 real(kind=CUSTOM_REAL),dimension(5,n3),intent(in),target :: B real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out),target :: C1,C2,C3 ! local parameters integer :: i,j #ifdef XSMM ! matrix-matrix multiplication C = alpha A * B + beta C ! with A(n1,n2) 25x5-matrix, B(n2,n3) 5x5-matrix and C(n1,n3) 25x5-matrix if (USE_XSMM_FUNCTION) then call libxsmm_mmcall_abc(xmm2, A1, B, C1) call libxsmm_mmcall_abc(xmm2, A2, B, C2) call libxsmm_mmcall_abc(xmm2, A3, B, C3) return endif #endif ! 
matrix-matrix multiplication do j = 1,n3 !dir$ ivdep do i = 1,n1 C1(i,j) = A1(i,1) * B(1,j) & + A1(i,2) * B(2,j) & + A1(i,3) * B(3,j) & + A1(i,4) * B(4,j) & + A1(i,5) * B(5,j) C2(i,j) = A2(i,1) * B(1,j) & + A2(i,2) * B(2,j) & + A2(i,3) * B(3,j) & + A2(i,4) * B(4,j) & + A2(i,5) * B(5,j) C3(i,j) = A3(i,1) * B(1,j) & + A3(i,2) * B(2,j) & + A3(i,3) * B(3,j) & + A3(i,4) * B(4,j) & + A3(i,5) * B(5,j) enddo enddo end subroutine mxm5_3comp_singleB !-------------------------------------------------------------------------------------------- subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3) ! 3 different arrays for x/y/z-components, 3-dimensional arrays (5,5,5), same B matrix for all 3 component arrays #ifdef XSMM use my_libxsmm,only: USE_XSMM_FUNCTION, xmm3, & libxsmm_mmcall_abc => libxsmm_smmcall_abc #endif implicit none integer,intent(in) :: n1,n2,n3 real(kind=CUSTOM_REAL),dimension(n1,5,n3),intent(in),target :: A1,A2,A3 real(kind=CUSTOM_REAL),dimension(5,n2),intent(in),target :: B real(kind=CUSTOM_REAL),dimension(n1,n2,n3),intent(out),target :: C1,C2,C3 ! local parameters integer :: i,j,k #ifdef XSMM ! matrix-matrix multiplication C = alpha A * B + beta C ! with A(n1,n2,n4) 5x5x5-matrix, B(n2,n3) 5x5-matrix and C(n1,n3,n4) 5x5x5-matrix if (USE_XSMM_FUNCTION) then do k = 1,5 call libxsmm_mmcall_abc(xmm3, A1(1,1,k), B, C1(1,1,k)) call libxsmm_mmcall_abc(xmm3, A2(1,1,k), B, C2(1,1,k)) call libxsmm_mmcall_abc(xmm3, A3(1,1,k), B, C3(1,1,k)) enddo return endif #endif ! 
matrix-matrix multiplication do k = 1,n3 do j = 1,n2 !dir$ ivdep do i = 1,n1 C1(i,j,k) = A1(i,1,k) * B(1,j) & + A1(i,2,k) * B(2,j) & + A1(i,3,k) * B(3,j) & + A1(i,4,k) * B(4,j) & + A1(i,5,k) * B(5,j) C2(i,j,k) = A2(i,1,k) * B(1,j) & + A2(i,2,k) * B(2,j) & + A2(i,3,k) * B(3,j) & + A2(i,4,k) * B(4,j) & + A2(i,5,k) * B(5,j) C3(i,j,k) = A3(i,1,k) * B(1,j) & + A3(i,2,k) * B(2,j) & + A3(i,3,k) * B(3,j) & + A3(i,4,k) * B(4,j) & + A3(i,5,k) * B(5,j) enddo enddo enddo end subroutine mxm5_3comp_3dmat_singleB end subroutine compute_forces_with_xsmm libxsmm-1.17/samples/specfem/compute_forces_xsmm_prefetch.F90000066400000000000000000000413201415223013700244220ustar00rootroot00000000000000!===================================================================== ! ! S p e c f e m 3 D G l o b e V e r s i o n 7 . 0 ! -------------------------------------------------- ! ! Main historical authors: Dimitri Komatitsch and Jeroen Tromp ! Princeton University, USA ! and CNRS / University of Marseille, France ! (there are currently many more authors!) ! (c) Princeton University and CNRS / University of Marseille, April 2014 ! ! This program is free software; you can redistribute it and/or modify ! it under the terms of the GNU General Public License as published by ! the Free Software Foundation; either version 2 of the License, or ! (at your option) any later version. ! ! This program is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU General Public License for more details. ! ! You should have received a copy of the GNU General Public License along ! with this program; if not, write to the Free Software Foundation, Inc., ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ! !===================================================================== ! 
we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION ! and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh #include "config.fh" !------------------------------------------------------------------- ! ! compute forces routine ! !------------------------------------------------------------------- subroutine compute_forces_with_xsmm_prefetch() ! uses LIBXSMM dispatched functions with prefetch versions ! (based on Deville version compute_forces_Dev.F90) use specfem_par use my_libxsmm implicit none ! Deville ! manually inline the calls to the Deville et al. (2002) routines real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: & tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: & newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc real(kind=CUSTOM_REAL) :: fac1,fac2,fac3 ! for gravity real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: rho_s_H integer :: num_elements,ispec_p integer :: ispec,iglob #ifdef FORCE_VECTORIZATION integer :: ijk_spec,ip,iglob_p,ijk #else integer :: i,j,k #endif ! **************************************************** ! big loop over all spectral elements in the solid ! **************************************************** ! computed_elements = 0 if (iphase == 1) then ! outer elements (halo region) num_elements = nspec_outer else ! 
inner elements num_elements = nspec_inner endif #ifdef USE_OPENMP !$OMP PARALLEL DEFAULT(NONE) & !$OMP SHARED( & !$OMP num_elements,iphase,phase_ispec_inner, & !$OMP hprime_xxT,hprime_xx,hprimewgll_xx,hprimewgll_xxT, & !$OMP wgllwgll_xy_3D, wgllwgll_xz_3D, wgllwgll_yz_3D, & #ifdef FORCE_VECTORIZATION !$OMP ibool_inv_tbl, ibool_inv_st, num_globs, phase_iglob, & #endif !$OMP ibool, & !$OMP displ,accel, & !$OMP sum_terms ) & !$OMP PRIVATE( ispec,ispec_p,iglob, & #ifdef FORCE_VECTORIZATION !$OMP ijk_spec,ip,iglob_p, & !$OMP ijk, & #else !$OMP i,j,k, & #endif !$OMP fac1,fac2,fac3, & !$OMP tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, & !$OMP newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3, & !$OMP dummyx_loc,dummyy_loc,dummyz_loc, & !$OMP rho_s_H ) #endif ! loops over all spectral-elements #ifdef USE_OPENMP !$OMP DO SCHEDULE(GUIDED) #endif do ispec_p = 1,num_elements ! only compute elements which belong to current phase (inner or outer elements) ispec = phase_ispec_inner(ispec_p,iphase) DO_LOOP_IJK iglob = ibool(INDEX_IJK,ispec) dummyx_loc(INDEX_IJK) = displ(1,iglob) dummyy_loc(INDEX_IJK) = displ(2,iglob) dummyz_loc(INDEX_IJK) = displ(3,iglob) ENDDO_LOOP_IJK ! subroutines adapted from Deville, Fischer and Mund, High-order methods ! for incompressible fluid flow, Cambridge University Press (2002), ! pages 386 and 389 and Figure 8.3.1 ! computes 1. matrix multiplication for tempx1,.. call mxm5_3comp_singleA(hprime_xx,m1,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1,m2) ! computes 2. matrix multiplication for tempx2,.. call mxm5_3comp_3dmat_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m1,hprime_xxT,m1,tempx2,tempy2,tempz2,NGLLX) ! computes 3. matrix multiplication for tempx3,.. 
call mxm5_3comp_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m2,hprime_xxT,tempx3,tempy3,tempz3,m1) call compute_element_dummy(ispec,ibool,tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, & dummyx_loc,dummyy_loc,dummyz_loc,rho_s_H) ! subroutines adapted from Deville, Fischer and Mund, High-order methods ! for incompressible fluid flow, Cambridge University Press (2002), ! pages 386 and 389 and Figure 8.3.1 ! computes 1. matrix multiplication for newtempx1,.. call mxm5_3comp_singleA(hprimewgll_xxT,m1,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1,m2) ! computes 2. matrix multiplication for tempx2,.. call mxm5_3comp_3dmat_singleB(tempx2,tempy2,tempz2,m1,hprimewgll_xx,m1,newtempx2,newtempy2,newtempz2,NGLLX) ! computes 3. matrix multiplication for newtempx3,.. call mxm5_3comp_singleB(tempx3,tempy3,tempz3,m2,hprimewgll_xx,newtempx3,newtempy3,newtempz3,m1) ! sums contributions DO_LOOP_IJK fac1 = wgllwgll_yz_3D(INDEX_IJK) fac2 = wgllwgll_xz_3D(INDEX_IJK) fac3 = wgllwgll_xy_3D(INDEX_IJK) sum_terms(1,INDEX_IJK,ispec) = - (fac1*newtempx1(INDEX_IJK) + fac2*newtempx2(INDEX_IJK) + fac3*newtempx3(INDEX_IJK)) sum_terms(2,INDEX_IJK,ispec) = - (fac1*newtempy1(INDEX_IJK) + fac2*newtempy2(INDEX_IJK) + fac3*newtempy3(INDEX_IJK)) sum_terms(3,INDEX_IJK,ispec) = - (fac1*newtempz1(INDEX_IJK) + fac2*newtempz2(INDEX_IJK) + fac3*newtempz3(INDEX_IJK)) ENDDO_LOOP_IJK ! adds gravity terms if (GRAVITY_VAL) then #ifdef FORCE_VECTORIZATION do ijk = 1,NDIM*NGLLCUBE sum_terms(ijk,1,1,1,ispec) = sum_terms(ijk,1,1,1,ispec) + rho_s_H(ijk,1,1,1) enddo #else do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX sum_terms(1,i,j,k,ispec) = sum_terms(1,i,j,k,ispec) + rho_s_H(i,j,k,1) sum_terms(2,i,j,k,ispec) = sum_terms(2,i,j,k,ispec) + rho_s_H(i,j,k,2) sum_terms(3,i,j,k,ispec) = sum_terms(3,i,j,k,ispec) + rho_s_H(i,j,k,3) enddo enddo enddo #endif endif ! updates acceleration #ifdef FORCE_VECTORIZATION ! update will be done later at the very end.. #else ! updates for non-vectorization case ! 
note: Critical OpenMP here might degrade performance, ! especially for a larger number of threads (>8). ! Using atomic operations can partially help. #ifndef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP CRITICAL #endif #endif ! we can force vectorization using a compiler directive here because we know that there is no dependency ! inside a given spectral element, since all the global points of a local elements are different by definition ! (only common points between different elements can be the same) ! IBM, Portland PGI, and Intel and Cray syntax (Intel and Cray are the same) !IBM* ASSERT (NODEPS) !pgi$ ivdep !DIR$ IVDEP DO_LOOP_IJK iglob = ibool(INDEX_IJK,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(1,iglob) = accel(1,iglob) + sum_terms(1,INDEX_IJK,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(2,iglob) = accel(2,iglob) + sum_terms(2,INDEX_IJK,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(3,iglob) = accel(3,iglob) + sum_terms(3,INDEX_IJK,ispec) ENDDO_LOOP_IJK #ifndef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP END CRITICAL #endif #endif #endif enddo ! ispec #ifdef USE_OPENMP !$OMP enddo #endif ! updates acceleration #ifdef FORCE_VECTORIZATION ! updates for vectorized case ! loops over all global nodes in this phase (inner/outer) #ifdef USE_OPENMP !$OMP DO #endif do iglob_p = 1,num_globs(iphase) ! global node index iglob = phase_iglob(iglob_p,iphase) ! loops over valence points do ip = ibool_inv_st(iglob_p,iphase),ibool_inv_st(iglob_p+1,iphase)-1 ! local 1D index from array ibool ijk_spec = ibool_inv_tbl(ip,iphase) ! do NOT use array syntax ":" for the three statements below otherwise most compilers ! 
will not be able to vectorize the outer loop accel(1,iglob) = accel(1,iglob) + sum_terms(1,ijk_spec,1,1,1) accel(2,iglob) = accel(2,iglob) + sum_terms(2,ijk_spec,1,1,1) accel(3,iglob) = accel(3,iglob) + sum_terms(3,ijk_spec,1,1,1) enddo enddo #ifdef USE_OPENMP !$OMP enddo #endif #endif #ifdef USE_OPENMP !$OMP END PARALLEL #endif contains !-------------------------------------------------------------------------------------------- ! ! matrix-matrix multiplications ! ! subroutines adapted from Deville, Fischer and Mund, High-order methods ! for incompressible fluid flow, Cambridge University Press (2002), ! pages 386 and 389 and Figure 8.3.1 ! !-------------------------------------------------------------------------------------------- ! ! note: the matrix-matrix multiplications are used for very small matrices ( 5 x 5 x 5 elements); ! thus, calling external optimized libraries for these multiplications are in general slower ! ! please leave the routines here to help compilers inlining the code subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3) ! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays #ifdef XSMM use my_libxsmm,only: USE_XSMM_FUNCTION_PREFETCH, xmm1, xmm1p, & libxsmm_mmcall_abc => libxsmm_smmcall_abc, & libxsmm_mmcall_prf => libxsmm_smmcall_prf ! debug timing !use my_libxsmm,only: libxsmm_timer_tick,libxsmm_timer_duration #endif implicit none integer,intent(in) :: n1,n3 real(kind=CUSTOM_REAL),dimension(n1,5),intent(in),target :: A real(kind=CUSTOM_REAL),dimension(5,n3),intent(in),target :: B1,B2,B3 real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out),target :: C1,C2,C3 ! local parameters integer :: i,j #ifdef XSMM ! debug timing !double precision :: duration !integer(kind=8) :: start ! debug timing !start = libxsmm_timer_tick() ! matrix-matrix multiplication C = alpha A * B + beta C ! 
with A(n1,n2) 5x5-matrix, B(n2,n3) 5x25-matrix and C(n1,n3) 5x25-matrix if (USE_XSMM_FUNCTION_PREFETCH) then ! prefetch version call libxsmm_mmcall_prf(xmm1p, a=A, b=B1, c=C1, & pa=A, pb=B2, pc=C2) ! with prefetch call libxsmm_mmcall_prf(xmm1p, a=A, b=B2, c=C2, & pa=A, pb=B3, pc=C3) ! with prefetch call libxsmm_mmcall_abc(xmm1, a=A, b=B3, c=C3) !call libxsmm_mmcall_prf(xmm1p, a=A, b=B3, c=C3, & !pa=A, pb=B1, pc=C1) ! with dummy prefetch ! debug timing !duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) !print *,'duration: ',duration ! debug !do j = 1,n3 ! do i = 1,n1 ! print *,i,j,'debug xsmm',C1(i,j),C2(i,j),C1(i,j) - C2(i,j) ! enddo !enddo !stop 'test stop' return endif #endif ! matrix-matrix multiplication do j = 1,n3 !dir$ ivdep do i = 1,n1 C1(i,j) = A(i,1) * B1(1,j) & + A(i,2) * B1(2,j) & + A(i,3) * B1(3,j) & + A(i,4) * B1(4,j) & + A(i,5) * B1(5,j) C2(i,j) = A(i,1) * B2(1,j) & + A(i,2) * B2(2,j) & + A(i,3) * B2(3,j) & + A(i,4) * B2(4,j) & + A(i,5) * B2(5,j) C3(i,j) = A(i,1) * B3(1,j) & + A(i,2) * B3(2,j) & + A(i,3) * B3(3,j) & + A(i,4) * B3(4,j) & + A(i,5) * B3(5,j) enddo enddo end subroutine mxm5_3comp_singleA !-------------------------------------------------------------------------------------------- subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3) ! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays #ifdef XSMM use my_libxsmm,only: USE_XSMM_FUNCTION_PREFETCH, xmm2, xmm2p, & libxsmm_mmcall_abc => libxsmm_smmcall_abc, & libxsmm_mmcall_prf => libxsmm_smmcall_prf #endif implicit none integer,intent(in) :: n1,n3 real(kind=CUSTOM_REAL),dimension(n1,5),intent(in),target :: A1,A2,A3 real(kind=CUSTOM_REAL),dimension(5,n3),intent(in),target :: B real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out),target :: C1,C2,C3 ! local parameters integer :: i,j #ifdef XSMM ! matrix-matrix multiplication C = alpha A * B + beta C ! 
with A(n1,n2) 25x5-matrix, B(n2,n3) 5x5-matrix and C(n1,n3) 25x5-matrix if (USE_XSMM_FUNCTION_PREFETCH) then ! prefetch version call libxsmm_mmcall_prf(xmm2p, a=A1, b=B, c=C1, & pa=A2, pb=B, pc=C2) ! with prefetch call libxsmm_mmcall_prf(xmm2p, a=A2, b=B, c=C2, & pa=A3, pb=B, pc=C3) ! with prefetch call libxsmm_mmcall_abc(xmm2, a=A3, b=B, c=C3) !call libxsmm_mmcall_prf(xmm2p, a=A3, b=B, c=C3, & !pa=A1, pb=B, pc=C1) ! with dummy prefetch return endif #endif ! matrix-matrix multiplication do j = 1,n3 !dir$ ivdep do i = 1,n1 C1(i,j) = A1(i,1) * B(1,j) & + A1(i,2) * B(2,j) & + A1(i,3) * B(3,j) & + A1(i,4) * B(4,j) & + A1(i,5) * B(5,j) C2(i,j) = A2(i,1) * B(1,j) & + A2(i,2) * B(2,j) & + A2(i,3) * B(3,j) & + A2(i,4) * B(4,j) & + A2(i,5) * B(5,j) C3(i,j) = A3(i,1) * B(1,j) & + A3(i,2) * B(2,j) & + A3(i,3) * B(3,j) & + A3(i,4) * B(4,j) & + A3(i,5) * B(5,j) enddo enddo end subroutine mxm5_3comp_singleB !-------------------------------------------------------------------------------------------- subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3) ! 3 different arrays for x/y/z-components, 3-dimensional arrays (5,5,5), same B matrix for all 3 component arrays #ifdef XSMM use my_libxsmm,only: USE_XSMM_FUNCTION_PREFETCH, xmm3, xmm3p, & libxsmm_mmcall_abc => libxsmm_smmcall_abc, & libxsmm_mmcall_prf => libxsmm_smmcall_prf #endif implicit none integer,intent(in) :: n1,n2,n3 real(kind=CUSTOM_REAL),dimension(n1,5,n3),intent(in),target :: A1,A2,A3 real(kind=CUSTOM_REAL),dimension(5,n2),intent(in),target :: B real(kind=CUSTOM_REAL),dimension(n1,n2,n3),intent(out),target :: C1,C2,C3 ! local parameters integer :: i,j,k #ifdef XSMM ! matrix-matrix multiplication C = alpha A * B + beta C ! with A(n1,n2,n4) 5x5x5-matrix, B(n2,n3) 5x5-matrix and C(n1,n3,n4) 5x5x5-matrix if (USE_XSMM_FUNCTION_PREFETCH) then do k = 1,5 ! prefetch version call libxsmm_mmcall_prf(xmm3p, a=A1(1,1,k), b=B, c=C1(1,1,k), & pa=A2(1,1,k), pb=B, pc=C2(1,1,k)) ! 
with prefetch call libxsmm_mmcall_prf(xmm3p, a=A2(1,1,k), b=B, c=C2(1,1,k), & pa=A3(1,1,k), pb=B, pc=C3(1,1,k)) ! with prefetch !if (k == 5) then call libxsmm_mmcall_abc(xmm3, a=A3(1,1,k), b=B, c=C3(1,1,k)) !else ! call libxsmm_mmcall_prf(xmm3p, a=A3(1,1,k), b=B, c=C3(1,1,k), & ! pa=A1(1,1,k+1), pb=B, pc=C1(1,1,k+1)) ! with dummy prefetch !endif enddo return endif #endif ! matrix-matrix multiplication do k = 1,n3 do j = 1,n2 !dir$ ivdep do i = 1,n1 C1(i,j,k) = A1(i,1,k) * B(1,j) & + A1(i,2,k) * B(2,j) & + A1(i,3,k) * B(3,j) & + A1(i,4,k) * B(4,j) & + A1(i,5,k) * B(5,j) C2(i,j,k) = A2(i,1,k) * B(1,j) & + A2(i,2,k) * B(2,j) & + A2(i,3,k) * B(3,j) & + A2(i,4,k) * B(4,j) & + A2(i,5,k) * B(5,j) C3(i,j,k) = A3(i,1,k) * B(1,j) & + A3(i,2,k) * B(2,j) & + A3(i,3,k) * B(3,j) & + A3(i,4,k) * B(4,j) & + A3(i,5,k) * B(5,j) enddo enddo enddo end subroutine mxm5_3comp_3dmat_singleB end subroutine compute_forces_with_xsmm_prefetch libxsmm-1.17/samples/specfem/compute_forces_xsmm_static.F90000066400000000000000000000412161415223013700241150ustar00rootroot00000000000000!===================================================================== ! ! S p e c f e m 3 D G l o b e V e r s i o n 7 . 0 ! -------------------------------------------------- ! ! Main historical authors: Dimitri Komatitsch and Jeroen Tromp ! Princeton University, USA ! and CNRS / University of Marseille, France ! (there are currently many more authors!) ! (c) Princeton University and CNRS / University of Marseille, April 2014 ! ! This program is free software; you can redistribute it and/or modify ! it under the terms of the GNU General Public License as published by ! the Free Software Foundation; either version 2 of the License, or ! (at your option) any later version. ! ! This program is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU General Public License for more details. ! ! 
You should have received a copy of the GNU General Public License along ! with this program; if not, write to the Free Software Foundation, Inc., ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ! !===================================================================== ! we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION ! and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh #include "config.fh" !------------------------------------------------------------------- ! ! compute forces routine ! !------------------------------------------------------------------- subroutine compute_forces_with_xsmm_static() ! uses LIBXSMM static function calls (no dispatching, LIBXSMM compiled with: make MNK="5 25" ..) ! (based on Deville version compute_forces_Dev.F90) use specfem_par use my_libxsmm implicit none ! Deville ! manually inline the calls to the Deville et al. (2002) routines real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: & tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: & newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc real(kind=CUSTOM_REAL) :: fac1,fac2,fac3 ! for gravity real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: rho_s_H integer :: num_elements,ispec_p integer :: ispec,iglob #ifdef FORCE_VECTORIZATION integer :: ijk_spec,ip,iglob_p,ijk #else integer :: i,j,k #endif ! **************************************************** ! big loop over all spectral elements in the solid ! **************************************************** ! computed_elements = 0 if (iphase == 1) then ! outer elements (halo region) num_elements = nspec_outer else ! 
inner elements num_elements = nspec_inner endif #ifdef USE_OPENMP !$OMP PARALLEL DEFAULT(NONE) & !$OMP SHARED( & !$OMP num_elements,iphase,phase_ispec_inner, & !$OMP hprime_xxT,hprime_xx,hprimewgll_xx,hprimewgll_xxT, & !$OMP wgllwgll_xy_3D, wgllwgll_xz_3D, wgllwgll_yz_3D, & #ifdef FORCE_VECTORIZATION !$OMP ibool_inv_tbl, ibool_inv_st, num_globs, phase_iglob, & #endif !$OMP ibool, & !$OMP displ,accel, & !$OMP sum_terms ) & !$OMP PRIVATE( ispec,ispec_p,iglob, & #ifdef FORCE_VECTORIZATION !$OMP ijk_spec,ip,iglob_p, & !$OMP ijk, & #else !$OMP i,j,k, & #endif !$OMP fac1,fac2,fac3, & !$OMP tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, & !$OMP newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3, & !$OMP dummyx_loc,dummyy_loc,dummyz_loc, & !$OMP rho_s_H ) #endif ! loops over all spectral-elements #ifdef USE_OPENMP !$OMP DO SCHEDULE(GUIDED) #endif do ispec_p = 1,num_elements ! only compute elements which belong to current phase (inner or outer elements) ispec = phase_ispec_inner(ispec_p,iphase) DO_LOOP_IJK iglob = ibool(INDEX_IJK,ispec) dummyx_loc(INDEX_IJK) = displ(1,iglob) dummyy_loc(INDEX_IJK) = displ(2,iglob) dummyz_loc(INDEX_IJK) = displ(3,iglob) ENDDO_LOOP_IJK ! subroutines adapted from Deville, Fischer and Mund, High-order methods ! for incompressible fluid flow, Cambridge University Press (2002), ! pages 386 and 389 and Figure 8.3.1 ! computes 1. matrix multiplication for tempx1,.. call mxm5_3comp_singleA(hprime_xx,m1,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1,m2) ! computes 2. matrix multiplication for tempx2,.. call mxm5_3comp_3dmat_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m1,hprime_xxT,m1,tempx2,tempy2,tempz2,NGLLX) ! computes 3. matrix multiplication for tempx3,.. 
call mxm5_3comp_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m2,hprime_xxT,tempx3,tempy3,tempz3,m1) call compute_element_dummy(ispec,ibool,tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, & dummyx_loc,dummyy_loc,dummyz_loc,rho_s_H) ! subroutines adapted from Deville, Fischer and Mund, High-order methods ! for incompressible fluid flow, Cambridge University Press (2002), ! pages 386 and 389 and Figure 8.3.1 ! computes 1. matrix multiplication for newtempx1,.. call mxm5_3comp_singleA(hprimewgll_xxT,m1,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1,m2) ! computes 2. matrix multiplication for tempx2,.. call mxm5_3comp_3dmat_singleB(tempx2,tempy2,tempz2,m1,hprimewgll_xx,m1,newtempx2,newtempy2,newtempz2,NGLLX) ! computes 3. matrix multiplication for newtempx3,.. call mxm5_3comp_singleB(tempx3,tempy3,tempz3,m2,hprimewgll_xx,newtempx3,newtempy3,newtempz3,m1) ! sums contributions DO_LOOP_IJK fac1 = wgllwgll_yz_3D(INDEX_IJK) fac2 = wgllwgll_xz_3D(INDEX_IJK) fac3 = wgllwgll_xy_3D(INDEX_IJK) sum_terms(1,INDEX_IJK,ispec) = - (fac1*newtempx1(INDEX_IJK) + fac2*newtempx2(INDEX_IJK) + fac3*newtempx3(INDEX_IJK)) sum_terms(2,INDEX_IJK,ispec) = - (fac1*newtempy1(INDEX_IJK) + fac2*newtempy2(INDEX_IJK) + fac3*newtempy3(INDEX_IJK)) sum_terms(3,INDEX_IJK,ispec) = - (fac1*newtempz1(INDEX_IJK) + fac2*newtempz2(INDEX_IJK) + fac3*newtempz3(INDEX_IJK)) ENDDO_LOOP_IJK ! adds gravity terms if (GRAVITY_VAL) then #ifdef FORCE_VECTORIZATION do ijk = 1,NDIM*NGLLCUBE sum_terms(ijk,1,1,1,ispec) = sum_terms(ijk,1,1,1,ispec) + rho_s_H(ijk,1,1,1) enddo #else do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX sum_terms(1,i,j,k,ispec) = sum_terms(1,i,j,k,ispec) + rho_s_H(i,j,k,1) sum_terms(2,i,j,k,ispec) = sum_terms(2,i,j,k,ispec) + rho_s_H(i,j,k,2) sum_terms(3,i,j,k,ispec) = sum_terms(3,i,j,k,ispec) + rho_s_H(i,j,k,3) enddo enddo enddo #endif endif ! updates acceleration #ifdef FORCE_VECTORIZATION ! update will be done later at the very end.. #else ! updates for non-vectorization case ! 
note: Critical OpenMP here might degrade performance, ! especially for a larger number of threads (>8). ! Using atomic operations can partially help. #ifndef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP CRITICAL #endif #endif ! we can force vectorization using a compiler directive here because we know that there is no dependency ! inside a given spectral element, since all the global points of a local elements are different by definition ! (only common points between different elements can be the same) ! IBM, Portland PGI, and Intel and Cray syntax (Intel and Cray are the same) !IBM* ASSERT (NODEPS) !pgi$ ivdep !DIR$ IVDEP DO_LOOP_IJK iglob = ibool(INDEX_IJK,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(1,iglob) = accel(1,iglob) + sum_terms(1,INDEX_IJK,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(2,iglob) = accel(2,iglob) + sum_terms(2,INDEX_IJK,ispec) #ifdef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP ATOMIC #endif #endif accel(3,iglob) = accel(3,iglob) + sum_terms(3,INDEX_IJK,ispec) ENDDO_LOOP_IJK #ifndef USE_OPENMP_ATOMIC_INSTEAD_OF_CRITICAL #ifdef USE_OPENMP !$OMP END CRITICAL #endif #endif #endif enddo ! ispec #ifdef USE_OPENMP !$OMP enddo #endif ! updates acceleration #ifdef FORCE_VECTORIZATION ! updates for vectorized case ! loops over all global nodes in this phase (inner/outer) #ifdef USE_OPENMP !$OMP DO #endif do iglob_p = 1,num_globs(iphase) ! global node index iglob = phase_iglob(iglob_p,iphase) ! loops over valence points do ip = ibool_inv_st(iglob_p,iphase),ibool_inv_st(iglob_p+1,iphase)-1 ! local 1D index from array ibool ijk_spec = ibool_inv_tbl(ip,iphase) ! do NOT use array syntax ":" for the three statements below otherwise most compilers ! 
will not be able to vectorize the outer loop accel(1,iglob) = accel(1,iglob) + sum_terms(1,ijk_spec,1,1,1) accel(2,iglob) = accel(2,iglob) + sum_terms(2,ijk_spec,1,1,1) accel(3,iglob) = accel(3,iglob) + sum_terms(3,ijk_spec,1,1,1) enddo enddo #ifdef USE_OPENMP !$OMP enddo #endif #endif #ifdef USE_OPENMP !$OMP END PARALLEL #endif contains !-------------------------------------------------------------------------------------------- ! ! matrix-matrix multiplications ! ! subroutines adapted from Deville, Fischer and Mund, High-order methods ! for incompressible fluid flow, Cambridge University Press (2002), ! pages 386 and 389 and Figure 8.3.1 ! !-------------------------------------------------------------------------------------------- ! ! note: the matrix-matrix multiplications are used for very small matrices ( 5 x 5 x 5 elements); ! thus, calling external optimized libraries for these multiplications are in general slower ! ! please leave the routines here to help compilers inlining the code subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3) ! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays #if defined(XSMM) && defined(LIBXSMM_SMM_5_25_5) use my_libxsmm,only: libxsmm_smm_5_25_5 #endif implicit none integer,intent(in) :: n1,n3 real(kind=CUSTOM_REAL),dimension(n1,5),intent(in) :: A real(kind=CUSTOM_REAL),dimension(5,n3),intent(in) :: B1,B2,B3 real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C1,C2,C3 ! local parameters integer :: i,j #if defined(XSMM) && defined(LIBXSMM_SMM_5_25_5) ! matrix-matrix multiplication C = alpha A * B + beta C ! with A(n1,n2) 5x5-matrix, B(n2,n3) 5x25-matrix and C(n1,n3) 5x25-matrix ! static version using MNK="5 25, 5" ALPHA=1 BETA=0 call libxsmm_smm_5_25_5(a=A, b=B1, c=C1, pa=A, pb=B2, pc=C2) call libxsmm_smm_5_25_5(a=A, b=B2, c=C2, pa=A, pb=B3, pc=C3) call libxsmm_smm_5_25_5(a=A, b=B3, c=C3, pa=A, pb=B1, pc=C1) ! with dummy prefetch return #endif ! 
matrix-matrix multiplication do j = 1,n3 !dir$ ivdep do i = 1,n1 C1(i,j) = A(i,1) * B1(1,j) & + A(i,2) * B1(2,j) & + A(i,3) * B1(3,j) & + A(i,4) * B1(4,j) & + A(i,5) * B1(5,j) C2(i,j) = A(i,1) * B2(1,j) & + A(i,2) * B2(2,j) & + A(i,3) * B2(3,j) & + A(i,4) * B2(4,j) & + A(i,5) * B2(5,j) C3(i,j) = A(i,1) * B3(1,j) & + A(i,2) * B3(2,j) & + A(i,3) * B3(3,j) & + A(i,4) * B3(4,j) & + A(i,5) * B3(5,j) enddo enddo end subroutine mxm5_3comp_singleA !-------------------------------------------------------------------------------------------- subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3) ! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays #if defined(XSMM) && defined(LIBXSMM_SMM_25_5_5) use my_libxsmm,only: libxsmm_smm_25_5_5 #endif implicit none integer,intent(in) :: n1,n3 real(kind=CUSTOM_REAL),dimension(n1,5),intent(in) :: A1,A2,A3 real(kind=CUSTOM_REAL),dimension(5,n3),intent(in) :: B real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C1,C2,C3 ! local parameters integer :: i,j #if defined(XSMM) && defined(LIBXSMM_SMM_25_5_5) ! matrix-matrix multiplication C = alpha A * B + beta C ! with A(n1,n2) 25x5-matrix, B(n2,n3) 5x5-matrix and C(n1,n3) 25x5-matrix ! static version call libxsmm_smm_25_5_5(a=A1, b=B, c=C1, pa=A2, pb=B, pc=C2) call libxsmm_smm_25_5_5(a=A2, b=B, c=C2, pa=A3, pb=B, pc=C3) call libxsmm_smm_25_5_5(a=A3, b=B, c=C3, pa=A1, pb=B, pc=C1) return #endif ! 
matrix-matrix multiplication do j = 1,n3 !dir$ ivdep do i = 1,n1 C1(i,j) = A1(i,1) * B(1,j) & + A1(i,2) * B(2,j) & + A1(i,3) * B(3,j) & + A1(i,4) * B(4,j) & + A1(i,5) * B(5,j) C2(i,j) = A2(i,1) * B(1,j) & + A2(i,2) * B(2,j) & + A2(i,3) * B(3,j) & + A2(i,4) * B(4,j) & + A2(i,5) * B(5,j) C3(i,j) = A3(i,1) * B(1,j) & + A3(i,2) * B(2,j) & + A3(i,3) * B(3,j) & + A3(i,4) * B(4,j) & + A3(i,5) * B(5,j) enddo enddo end subroutine mxm5_3comp_singleB !-------------------------------------------------------------------------------------------- subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3) ! 3 different arrays for x/y/z-components, 3-dimensional arrays (5,5,5), same B matrix for all 3 component arrays ! note: on CPUs like Haswell or Sandy Bridge, the following will slow down computations ! however, on Intel Phi (KNC) it is still helpful (speedup +3%) #if defined(XSMM_FORCE_EVEN_IF_SLOWER) || ( defined(XSMM) && defined(LIBXSMM_SMM_25_5_5) && defined(__MIC__) ) use my_libxsmm,only: libxsmm_smm_5_5_5 #endif implicit none integer,intent(in) :: n1,n2,n3 real(kind=CUSTOM_REAL),dimension(n1,5,n3),intent(in) :: A1,A2,A3 real(kind=CUSTOM_REAL),dimension(5,n2),intent(in) :: B real(kind=CUSTOM_REAL),dimension(n1,n2,n3),intent(out) :: C1,C2,C3 ! local parameters integer :: i,j,k #if defined(XSMM_FORCE_EVEN_IF_SLOWER) || ( defined(XSMM) && defined(LIBXSMM_SMM_25_5_5) && defined(__MIC__) ) ! matrix-matrix multiplication C = alpha A * B + beta C ! with A(n1,n2,n4) 5x5x5-matrix, B(n2,n3) 5x5-matrix and C(n1,n3,n4) 5x5x5-matrix ! static version !do k = 1,5 ! call libxsmm_mmcall(xmm3, A1(:,:,k), B, C1(:,:,k)) ! call libxsmm_mmcall(xmm3, A2(:,:,k), B, C2(:,:,k)) ! call libxsmm_mmcall(xmm3, A3(:,:,k), B, C3(:,:,k)) !enddo ! 
unrolled call libxsmm_smm_5_5_5(a=A1(1,1,1), b=B, c=C1(1,1,1),pa=A1(1,1,1+1), pb=B, pc=C1(1,1,1+1)) call libxsmm_smm_5_5_5(a=A1(1,1,2), b=B, c=C1(1,1,2),pa=A1(1,1,2+1), pb=B, pc=C1(1,1,2+1)) call libxsmm_smm_5_5_5(a=A1(1,1,3), b=B, c=C1(1,1,3),pa=A1(1,1,3+1), pb=B, pc=C1(1,1,3+1)) call libxsmm_smm_5_5_5(a=A1(1,1,4), b=B, c=C1(1,1,4),pa=A1(1,1,4+1), pb=B, pc=C1(1,1,4+1)) call libxsmm_smm_5_5_5(a=A1(1,1,5), b=B, c=C1(1,1,5),pa=A2(1,1,1), pb=B, pc=C2(1,1,1)) call libxsmm_smm_5_5_5(a=A2(1,1,1), b=B, c=C2(1,1,1),pa=A2(1,1,1+1), pb=B, pc=C2(1,1,1+1)) call libxsmm_smm_5_5_5(a=A2(1,1,2), b=B, c=C2(1,1,2),pa=A2(1,1,2+1), pb=B, pc=C2(1,1,2+1)) call libxsmm_smm_5_5_5(a=A2(1,1,3), b=B, c=C2(1,1,3),pa=A2(1,1,3+1), pb=B, pc=C2(1,1,3+1)) call libxsmm_smm_5_5_5(a=A2(1,1,4), b=B, c=C2(1,1,4),pa=A2(1,1,4+1), pb=B, pc=C2(1,1,4+1)) call libxsmm_smm_5_5_5(a=A2(1,1,5), b=B, c=C2(1,1,5),pa=A3(1,1,1), pb=B, pc=C3(1,1,1)) call libxsmm_smm_5_5_5(a=A3(1,1,1), b=B, c=C3(1,1,1),pa=A3(1,1,1+1), pb=B, pc=C3(1,1,1+1)) call libxsmm_smm_5_5_5(a=A3(1,1,2), b=B, c=C3(1,1,2),pa=A3(1,1,2+1), pb=B, pc=C3(1,1,2+1)) call libxsmm_smm_5_5_5(a=A3(1,1,3), b=B, c=C3(1,1,3),pa=A3(1,1,3+1), pb=B, pc=C3(1,1,3+1)) call libxsmm_smm_5_5_5(a=A3(1,1,4), b=B, c=C3(1,1,4),pa=A3(1,1,4+1), pb=B, pc=C3(1,1,4+1)) call libxsmm_smm_5_5_5(a=A3(1,1,5), b=B, c=C3(1,1,5),pa=A3(1,1,5), pb=B, pc=C3(1,1,5)) return #endif ! 
matrix-matrix multiplication do k = 1,n3 do j = 1,n2 !dir$ ivdep do i = 1,n1 C1(i,j,k) = A1(i,1,k) * B(1,j) & + A1(i,2,k) * B(2,j) & + A1(i,3,k) * B(3,j) & + A1(i,4,k) * B(4,j) & + A1(i,5,k) * B(5,j) C2(i,j,k) = A2(i,1,k) * B(1,j) & + A2(i,2,k) * B(2,j) & + A2(i,3,k) * B(3,j) & + A2(i,4,k) * B(4,j) & + A2(i,5,k) * B(5,j) C3(i,j,k) = A3(i,1,k) * B(1,j) & + A3(i,2,k) * B(2,j) & + A3(i,3,k) * B(3,j) & + A3(i,4,k) * B(4,j) & + A3(i,5,k) * B(5,j) enddo enddo enddo end subroutine mxm5_3comp_3dmat_singleB end subroutine compute_forces_with_xsmm_static libxsmm-1.17/samples/specfem/config.fh000066400000000000000000000016441415223013700177720ustar00rootroot00000000000000!------------------------------------------------------------------- ! ! macros ! !------------------------------------------------------------------- ! predefined symbols (preprocessor) #include "libxsmm_config.h" ! LIBXSMM for this test #define XSMM ! macros for vectorization ! switches indexing between: array( i,j,k .. ) <-> array( ijk,1,1 .. ) #ifdef FORCE_VECTORIZATION # define INDEX_IJK ijk,1,1 #else # define INDEX_IJK i,j,k #endif ! switches do-loops between: do k=1,NGLLZ; do j=1,NGLLY; do i=1,NGLLX <-> do ijk=1,NGLLCUBE #ifdef FORCE_VECTORIZATION # define DO_LOOP_IJK do ijk=1,NGLLCUBE #else # define DO_LOOP_IJK do k=1,NGLLZ; do j=1,NGLLY; do i=1,NGLLX #endif ! switches enddo-loops between: enddo; enddo; enddo ! NGLLZ,NGLLY,NGLLX <-> enddo ! NGLLCUBE #ifdef FORCE_VECTORIZATION # define ENDDO_LOOP_IJK enddo ! NGLLCUBE #else # define ENDDO_LOOP_IJK enddo; enddo; enddo ! NGLLZ,NGLLY,NGLLX #endif libxsmm-1.17/samples/specfem/specfem.F90000066400000000000000000001074651415223013700201200ustar00rootroot00000000000000! ! test program for LIBXSMM function calls ! ! uses SPECFEM3D_GLOBE routine compute_forces_crust_mantle_Dev() with dummy example ! ! we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION ! 
and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh #include "config.fh" !------------------------------------------------------------------- ! ! modules ! !------------------------------------------------------------------- module my_libxsmm use libxsmm !,only: LIBXSMM_SMMFUNCTION,libxsmm_dispatch,libxsmm_mmcall,libxsmm_init,libxsmm_finalize implicit none ! function pointers ! (note: defined for single precision, thus needs CUSTOM_REAL to be SIZE_REAL) type(LIBXSMM_SMMFUNCTION) :: xmm1, xmm2, xmm3 ! prefetch versions type(LIBXSMM_SMMFUNCTION) :: xmm1p, xmm2p, xmm3p logical :: USE_XSMM_FUNCTION,USE_XSMM_FUNCTION_PREFETCH end module my_libxsmm ! !------------------------------------------------------------------- ! module constants implicit none integer, parameter :: SIZE_REAL = 4, SIZE_DOUBLE = 8 integer, parameter :: CUSTOM_REAL = SIZE_REAL integer, parameter :: ISTANDARD_OUTPUT = 6 integer, parameter :: IMAIN = ISTANDARD_OUTPUT ! number of GLL points in each direction of an element (degree plus one) integer, parameter :: NGLLX = 5 integer, parameter :: NGLLY = NGLLX integer, parameter :: NGLLZ = NGLLX integer, parameter :: NGLLCUBE = NGLLX * NGLLY * NGLLZ ! Deville routines optimized for NGLLX = NGLLY = NGLLZ = 5 integer, parameter :: m1 = NGLLX, m2 = NGLLX * NGLLY ! 3-D simulation integer, parameter :: NDIM = 3 ! some useful constants double precision, parameter :: PI = 3.141592653589793d0 integer, parameter :: IFLAG_IN_FICTITIOUS_CUBE = 11 end module constants ! !------------------------------------------------------------------- ! module specfem_par ! main parameter module for specfem simulations use constants use libxsmm,only: LIBXSMM_ALIGNMENT implicit none !------------------------------------------------ ! number of spectral elements in x/y/z-directions integer,parameter :: NEX = 40 integer,parameter :: NEY = 40 integer,parameter :: NEZ = 25 !------------------------------------------------ ! 
MPI rank (dummy, no MPI for this test needed) integer :: myrank ! array with derivatives of Lagrange polynomials and precalculated products real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xxT,hprimewgll_xxT real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz ! arrays for Deville and force_vectorization real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D ! mesh parameters ! number of spectral elements integer :: NSPEC ! number of global nodes integer :: NGLOB ! local to global indexing integer, dimension(:,:,:,:),allocatable :: ibool ! displacement, velocity, acceleration real(kind=CUSTOM_REAL), dimension(:,:),allocatable :: displ,accel ! for verification real(kind=CUSTOM_REAL), dimension(:,:),allocatable :: accel_default !slow-down: please don't use unless you're sure... !dir$ ATTRIBUTES align:LIBXSMM_ALIGNMENT :: displ,accel,ibool,accel_default ! gravity logical,parameter :: GRAVITY_VAL = .true. ! optimized arrays integer, dimension(:,:),allocatable :: ibool_inv_tbl integer, dimension(:,:),allocatable :: ibool_inv_st integer, dimension(:,:),allocatable :: phase_iglob integer, dimension(2) :: num_globs ! work array with contributions real(kind=CUSTOM_REAL), dimension(:,:,:,:,:),allocatable :: sum_terms ! inner / outer elements crust/mantle region integer :: num_phase_ispec integer :: nspec_inner,nspec_outer integer, dimension(:,:), allocatable :: phase_ispec_inner integer :: iphase end module specfem_par !------------------------------------------------------------------- ! ! main program ! !------------------------------------------------------------------- program test use specfem_par use my_libxsmm implicit none ! timing double precision :: duration,duration_default integer(kind=8) :: start ! 
verification real :: diff integer, dimension(2) :: iloc_max logical, parameter :: DEBUG_VERIFICATION = .false. ! repetitions (time steps) integer :: it integer,parameter :: NSTEP = 20 ! should be > NSTEP_JITTER because of average timing integer,parameter :: NSTEP_JITTER = 5 ! skip first few steps when timing the kernels (the first steps exhibit runtime jitter) ! different versions integer,parameter :: num_versions = 5 character(len=20) :: str_version(num_versions) = (/character(len=20) :: & "Deville loops", & "unrolled loops", & "LIBXSMM dispatch" , & "LIBXSMM prefetch", & "LIBXSMM static" & /) double precision :: avg_time(num_versions) integer :: iversion print *,'--------------------------------------' print *,'specfem example' print *,'--------------------------------------' print * ! creates test mesh call setup_mesh() ! prepares arrays for time iteration loop call prepare_timerun() ! OpenMP output info call prepare_openmp() ! prepares libxsmm functions call prepare_xsmm() ! timing averages avg_time(:) = 0.d0 iphase = 2 do it = 1,NSTEP print * print *,'step ',it do iversion = 1,num_versions ! initializes accel(:,:) = 0._CUSTOM_REAL ! timing start = libxsmm_timer_tick() ! computes forces select case (iversion) case (1) ! Deville loops call compute_forces_Dev() case (2) ! unrolled loops call compute_forces_noDev() case (3) ! LIBXSMM with dispatch functions if (USE_XSMM_FUNCTION) then call compute_forces_with_xsmm() else cycle endif case (4) ! LIBXSMM with prefetch function calls if (USE_XSMM_FUNCTION_PREFETCH) then call compute_forces_with_xsmm_prefetch() else cycle endif case (5) ! LIBXSMM with static function calls call compute_forces_with_xsmm_static() end select ! timing duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) if (iversion == 1) duration_default = duration ! average time if (it > NSTEP_JITTER) avg_time(iversion) = avg_time(iversion) + duration ! 
for verification if (iversion == 1) then accel_default(:,:) = accel(:,:) endif diff = maxval(abs(accel(:,:) - accel_default(:,:))) ! user output if (iversion == 1) then write(*,'(a30,a,f8.4,a)') 'duration with '//str_version(iversion),' = ',sngl(duration),' (s)' else write(*,'(a30,a,f8.4,a,f8.2,a,e12.4)') 'duration with '//str_version(iversion),' = ', & sngl(duration),' (s) / speedup = ', & sngl(100.0 * (duration_default-duration)/duration_default),' % / maximum diff = ',diff endif ! check if (DEBUG_VERIFICATION) then iloc_max = maxloc(abs(accel(:,:) - accel_default(:,:))) print *,'verification: max diff = ',diff print *,' iglob loc = ',iloc_max(1),iloc_max(2) print *,'maximum difference: #current vs. #default value' print *,' ',accel(1,iloc_max(2)),accel_default(1,iloc_max(2)) print *,' ',accel(2,iloc_max(2)),accel_default(2,iloc_max(2)) print *,' ',accel(3,iloc_max(2)),accel_default(3,iloc_max(2)) print *,'min/max accel values = ',minval(accel(:,:)),maxval(accel(:,:)) print * endif enddo ! iversion enddo ! it ! average timing (avoiding the first 5 steps which fluctuate quite a bit...) avg_time(:) = avg_time(:) / dble(NSTEP - NSTEP_JITTER) print * print *,'==============================================' print *,'average over ',NSTEP - NSTEP_JITTER,'repetitions' write(*,'(a30,a,f8.4)') ' timing with '//str_version(1),' = ',avg_time(1) do iversion = 2,num_versions ! skip unused tests if (iversion == 3 .and. .not. USE_XSMM_FUNCTION) cycle if (iversion == 4 .and. .not. USE_XSMM_FUNCTION_PREFETCH) cycle write(*,'(a30,a,f8.4,a,f8.2,a)') ' timing with '//str_version(iversion),' = ', & avg_time(iversion),' / speedup = ', & sngl(100.0 * (avg_time(1)-avg_time(iversion))/avg_time(1)),' %' enddo print *,'==============================================' print * ! frees memory deallocate(displ,accel) deallocate(accel_default,ibool) deallocate(sum_terms) deallocate(ibool_inv_st,ibool_inv_tbl,phase_iglob) deallocate(phase_ispec_inner) ! 
finalizes LIBXSMM call libxsmm_finalize() end program test ! !------------------------------------------------------------------- ! subroutine setup_mesh() use constants use specfem_par implicit none integer :: i1,i2 integer :: ix,iy,iz integer :: i,j,k integer :: iglob,ispec,inumber integer, dimension(:), allocatable :: mask_ibool integer, dimension(:,:,:,:), allocatable :: copy_ibool_ori logical, dimension(:), allocatable :: mask_ibool_flag ! total number of elements NSPEC = NEX * NEY * NEZ ! set up local to global numbering allocate(ibool(NGLLX,NGLLY,NGLLZ,NSPEC)) ibool(:,:,:,:) = 0 ispec = 0 iglob = 0 ! arranges a three-dimensional block, elements are collated side-by-side. mimicks a very simple unstructured grid. do iz = 1,NEZ do iy = 1,NEY do ix = 1,NEX ispec = ispec + 1 ! GLL point indexing do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX ! set up local to global numbering if ((i == 1) .and. (ix > 1)) then ! previous element along x-direction ibool(i,j,k,ispec) = ibool(NGLLX,j,k,ispec - 1) else if ((j == 1) .and. (iy > 1)) then ! previous element along y-direction ibool(i,j,k,ispec) = ibool(i,NGLLY,k,ispec - NEX) else if ((k == 1) .and. (iz > 1)) then ! previous element along z-direction ibool(i,j,k,ispec) = ibool(i,j,NGLLZ,ispec - NEX * NEY) else ! new point iglob = iglob + 1 ibool(i,j,k,ispec) = iglob endif enddo enddo enddo enddo ! NEX enddo ! NEY enddo ! NEZ ! sets total numbers of nodes NGLOB = iglob print *,'mesh:' print *,' total number of elements = ',NSPEC print *,' total number of global nodes = ',NGLOB !print *,' ibool min/max = ',minval(ibool),maxval(ibool) ! checks if (ispec /= NSPEC) stop 'Invalid ispec count' if (minval(ibool(:,:,:,:)) < 1) stop 'Invalid ibool minimum value' if (maxval(ibool(:,:,:,:)) > NSPEC * NGLLX * NGLLY * NGLLZ) stop 'Invalid ibool maximum value' ! 
we can create a new indirect addressing to reduce cache misses allocate(copy_ibool_ori(NGLLX,NGLLY,NGLLZ,NSPEC),mask_ibool(nglob)) mask_ibool(:) = -1 copy_ibool_ori(:,:,:,:) = ibool(:,:,:,:) inumber = 0 do ispec = 1,NSPEC do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX if (mask_ibool(copy_ibool_ori(i,j,k,ispec)) == -1) then ! creates a new point inumber = inumber + 1 ibool(i,j,k,ispec) = inumber mask_ibool(copy_ibool_ori(i,j,k,ispec)) = inumber else ! uses an existing point created previously ibool(i,j,k,ispec) = mask_ibool(copy_ibool_ori(i,j,k,ispec)) endif enddo enddo enddo enddo if (inumber /= NGLOB) stop 'Invalid inumber count' deallocate(copy_ibool_ori,mask_ibool) ! define polynomial derivatives & weights ! (dummy values) do i1 = 1,NGLLX do i2 = 1,NGLLX hprime_xx(i2,i1) = i1 * 0.1 + i2 * 0.2 ! original: real(lagrange_deriv_GLL(i1-1,i2-1,xigll,NGLLX), kind=CUSTOM_REAL) hprimewgll_xx(i2,i1) = hprime_xx(i2,i1) * (i2 * 1.0/NGLLX) ! real(lagrange_deriv_GLL(i1-1,i2-1,xigll,NGLLX)*wxgll(i2), kind=CUSTOM_REAL) enddo enddo do i = 1,NGLLX do j = 1,NGLLY wgllwgll_xy(i,j) = (i * 1.0/NGLLX) * (j * 1.0/NGLLY) ! original: real(wxgll(i)*wygll(j), kind=CUSTOM_REAL) enddo enddo do i = 1,NGLLX do k = 1,NGLLZ wgllwgll_xz(i,k) = (i * 1.0/NGLLX) * (k * 1.0/NGLLZ) ! original: real(wxgll(i)*wzgll(k), kind=CUSTOM_REAL) enddo enddo do j = 1,NGLLY do k = 1,NGLLZ wgllwgll_yz(j,k) = (j * 1.0/NGLLY) * (k * 1.0/NGLLZ) ! original: real(wygll(j)*wzgll(k), kind=CUSTOM_REAL) enddo enddo ! define a 3D extension in order to be able to force vectorization in the compute_forces_**_Dev routines do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX wgllwgll_yz_3D(i,j,k) = wgllwgll_yz(j,k) wgllwgll_xz_3D(i,j,k) = wgllwgll_xz(i,k) wgllwgll_xy_3D(i,j,k) = wgllwgll_xy(i,j) enddo enddo enddo ! check that optimized routines from Deville et al. (2002) can be used if (NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5) & stop 'Deville et al. (2002) routines can only be used if NGLLX = NGLLY = NGLLZ = 5' ! 
define transpose of derivation matrix do j = 1,NGLLX do i = 1,NGLLX hprime_xxT(j,i) = hprime_xx(i,j) hprimewgll_xxT(j,i) = hprimewgll_xx(i,j) enddo enddo ! displacement and acceleration (dummy fields) allocate(displ(NDIM,NGLOB),accel(NDIM,NGLOB)) accel(:,:) = 0._CUSTOM_REAL ! sets initial dummy values (to avoid getting only zero multiplications later on) if (.true.) then ! arbitrary linear function do iglob = 1,NGLOB displ(:,iglob) = dble(iglob - 1) / dble(NGLOB - 1) enddo else ! arbitrary sine function allocate(mask_ibool_flag(NGLOB)) mask_ibool_flag(:) = .false. ispec = 0 do iz = 1,NEZ do iy = 1,NEY do ix = 1,NEX ispec = ispec + 1 ! GLL point indexing do k = 1,NGLLZ do j = 1,NGLLY do i = 1,NGLLX iglob = ibool(i,j,k,ispec) if (.not. mask_ibool_flag(iglob)) then ! only assigns global value once mask_ibool_flag(iglob) = .true. displ(:,iglob) = sin(PI * dble(ix - 1) / dble(NEX - 1)) & * sin(PI * dble(iy - 1) / dble(NEY - 1)) & * sin(PI * dble(iz - 1) / dble(NEZ - 1)) endif enddo enddo enddo enddo enddo enddo deallocate(mask_ibool_flag) endif ! for verification allocate(accel_default(NDIM,NGLOB)) accel_default(:,:) = 0._CUSTOM_REAL end subroutine setup_mesh ! !------------------------------------------------------------------- ! subroutine prepare_timerun() use specfem_par implicit none ! local parameters integer :: num_elements,ispec,ier ! setup inner/outer elements (single slice only, no outer elements for halo) myrank = 0 ! no MPI over-lapping communication in this example nspec_inner = NSPEC nspec_outer = 0 num_phase_ispec = NSPEC allocate(phase_ispec_inner(num_phase_ispec,2),stat=ier) if (ier /= 0 ) call exit_mpi(myrank,'Error allocating array phase_ispec_inner_crust_mantle') phase_ispec_inner(:,:) = 0 do ispec = 1,NSPEC phase_ispec_inner(ispec,2) = ispec enddo ! from original routine prepare_timerun_ibool_inv_tbl() ! note: we use allocate for sum_terms arrays rather than defining within subroutine compute_forces_**_Dev() itself ! 
as it will crash when using OpenMP and operating systems with small stack sizes ! e.g. see http://stackoverflow.com/questions/22649827/illegal-instruction-error-when-running-openmp-in-gfortran-mac allocate(sum_terms(NDIM,NGLLX,NGLLY,NGLLZ,NSPEC),stat=ier) if (ier /= 0) stop 'Error allocating sum_terms arrays' sum_terms(:,:,:,:,:) = 0._CUSTOM_REAL ! inverse table ! this helps to speedup the assembly, especially with OpenMP (or on MIC) threading ! allocating arrays allocate(ibool_inv_tbl(NGLLX*NGLLY*NGLLZ*NSPEC,2),stat=ier) if (ier /= 0) stop 'Error allocating ibool_inv_tbl arrays' allocate(ibool_inv_st(NGLOB+1,2),stat=ier) if (ier /= 0) stop 'Error allocating ibool_inv_st arrays' allocate(phase_iglob(NGLOB,2),stat=ier) if (ier /= 0) stop 'Error allocating phase_iglob arrays' ! initializing num_globs(:) = 0 ibool_inv_tbl(:,:) = 0 ibool_inv_st(:,:) = 0 phase_iglob(:,:) = 0 !---- make inv. table ---------------------- ! loops over phases ! (1 == outer elements / 2 == inner elements) do iphase = 1,2 ! crust mantle if (iphase == 1) then ! outer elements (iphase=1) num_elements = nspec_outer else ! inner elements (iphase=2) num_elements = nspec_inner endif call make_inv_table(iphase,NGLOB,NSPEC, & num_elements,phase_ispec_inner, & ibool,phase_iglob, & ibool_inv_tbl, ibool_inv_st, & num_globs) enddo ! user output if (myrank == 0) then write(IMAIN,*) " inverse table of ibool done" call flush_IMAIN() endif ! synchronizes processes call synchronize_all() contains subroutine make_inv_table(iphase,nglob,nspec, & phase_nspec,phase_ispec,ibool,phase_iglob, & ibool_inv_tbl,ibool_inv_st,num_globs,idoubling) implicit none ! 
arguments integer,intent(in) :: iphase integer,intent(in) :: nglob integer,intent(in) :: nspec integer,intent(in) :: phase_nspec integer, dimension(:,:),intent(in) :: phase_ispec integer, dimension(:,:,:,:),intent(in) :: ibool integer, dimension(:,:),intent(inout) :: phase_iglob integer, dimension(:,:),intent(inout) :: ibool_inv_tbl integer, dimension(:,:),intent(inout) :: ibool_inv_st integer, dimension(:),intent(inout) :: num_globs integer,dimension(:),optional :: idoubling ! local parameters integer, dimension(:), allocatable :: ibool_inv_num integer, dimension(:,:), allocatable :: ibool_inv_tbl_tmp integer :: num_alloc_ibool_inv_tbl,num_alloc_ibool_inv_tbl_theor integer :: num_used_ibool_inv_tbl integer :: ip, iglob, ispec_p, ispec, iglob_p, ier integer :: inum #ifdef FORCE_VECTORIZATION integer :: ijk #else integer :: i,j,k #endif logical :: is_inner_core ! tolerance number of shared degrees per node integer, parameter :: N_TOL = 20 ! checks if anything to do (e.g., no outer elements for single process simulations) if (phase_nspec == 0) return ! checks if inner core region if (present(idoubling)) then is_inner_core = .true. else is_inner_core = .false. endif ! allocates temporary arrays allocate(ibool_inv_num(nglob),stat=ier) if (ier /= 0) stop 'Error allocating ibool_inv_num array' ! gets valence of global degrees of freedom for current phase (inner/outer) elements ibool_inv_num(:) = 0 do ispec_p = 1,phase_nspec ispec = phase_ispec(ispec_p,iphase) ! exclude fictitious elements in central cube if (is_inner_core) then if (idoubling(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle endif DO_LOOP_IJK iglob = ibool(INDEX_IJK,ispec) ! increases valence counter ibool_inv_num(iglob) = ibool_inv_num(iglob) + 1 ENDDO_LOOP_IJK enddo ! gets maximum valence value num_alloc_ibool_inv_tbl = maxval(ibool_inv_num(:)) ! theoretical number of maximum shared degrees per node num_alloc_ibool_inv_tbl_theor = N_TOL*(NGLLX*NGLLY*NGLLZ*nspec/nglob+1) ! 
checks valence if (num_alloc_ibool_inv_tbl < 1 .or. num_alloc_ibool_inv_tbl > num_alloc_ibool_inv_tbl_theor) then print *,'Error invalid maximum valence:' print *,'valence value = ',num_alloc_ibool_inv_tbl,' - theoretical maximum = ',num_alloc_ibool_inv_tbl_theor stop 'Error invalid maximum valence value' endif ! debug !print *,myrank,'maximum shared degrees theoretical = ',num_alloc_ibool_inv_tbl_theor ! regional_Greece_small example: 40 !print *,myrank,'maximum shared degrees from array = ',maxval(ibool_inv_num(:)) ! regional_Greece_small example: 8 and 16 allocate(ibool_inv_tbl_tmp(num_alloc_ibool_inv_tbl,nglob),stat=ier) if (ier /= 0) stop 'Error allocating ibool_inv_tbl_tmp array' !---- make temporary array of inv. table : ibool_inv_tbl_tmp ibool_inv_tbl_tmp(:,:) = 0 ibool_inv_num(:) = 0 do ispec_p = 1,phase_nspec ispec = phase_ispec(ispec_p,iphase) ! exclude fictitious elements in central cube if (is_inner_core) then if (idoubling(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle endif DO_LOOP_IJK iglob = ibool(INDEX_IJK,ispec) ! increases counter ibool_inv_num(iglob) = ibool_inv_num(iglob) + 1 ! inverse table ! sets 1D index of local GLL point (between 1 and NGLLCUBE) #ifdef FORCE_VECTORIZATION inum = ijk #else inum = i + (j-1)*NGLLY + (k-1)*NGLLY*NGLLZ #endif ! sets 1D index in local ibool array ibool_inv_tbl_tmp(ibool_inv_num(iglob),iglob) = inum + NGLLX*NGLLY*NGLLZ*(ispec-1) ENDDO_LOOP_IJK enddo !---- packing : ibool_inv_tbl_tmp -> ibool_inv_tbl ip = 0 iglob_p = 0 num_used_ibool_inv_tbl = 0 do iglob = 1, nglob if (ibool_inv_num(iglob) /= 0) then iglob_p = iglob_p + 1 phase_iglob(iglob_p,iphase) = iglob ! sets start index of table entry for this global node ibool_inv_st(iglob_p,iphase) = ip + 1 ! sets maximum of used valence if (ibool_inv_num(iglob) > num_used_ibool_inv_tbl) num_used_ibool_inv_tbl = ibool_inv_num(iglob) ! loops over valence do inum = 1, ibool_inv_num(iglob) ! increases total counter ip = ip + 1 ! 
maps local 1D index in ibool array ibool_inv_tbl(ip,iphase) = ibool_inv_tbl_tmp(inum,iglob) enddo endif enddo ! sets last entry in start index table ibool_inv_st(iglob_p+1,iphase) = ip + 1 ! total number global nodes in this phase (inner/outer) num_globs(iphase) = iglob_p ! checks if (num_used_ibool_inv_tbl > num_alloc_ibool_inv_tbl) then print *,"Error invalid inverse table setting:" print *," num_alloc_ibool_inv_tbl = ",num_alloc_ibool_inv_tbl print *," num_used_ibool_inv_tbl = ",num_used_ibool_inv_tbl print *,"invalid value encountered: num_used_ibool_inv_tbl > num_alloc_ibool_inv_tbl" print *,"#### Program exits... ##########" call exit_MPI(myrank,'Error making inverse table for optimized arrays') endif ! debug !if (myrank == 0) then ! print *,'ibool_inv_tbl: ' ! do iglob_p = 1,200 ! print *,' ',iglob_p,'table = ',(ibool_inv_tbl(ip,iphase), & ! ip = ibool_inv_st(iglob_p,iphase),ibool_inv_st(iglob_p+1,iphase)-1) ! enddo !endif ! frees memory deallocate(ibool_inv_num) deallocate(ibool_inv_tbl_tmp) end subroutine make_inv_table end subroutine prepare_timerun ! !------------------------------------------------------------------- ! subroutine prepare_openmp() ! outputs OpenMP support info #ifdef USE_OPENMP use specfem_par,only: myrank,IMAIN #endif implicit none #ifdef USE_OPENMP ! local parameters integer :: thread_id,num_threads integer :: num_procs,max_threads logical :: is_dynamic,is_nested ! OpenMP functions integer,external :: OMP_GET_NUM_THREADS,OMP_GET_THREAD_NUM integer,external :: OMP_GET_NUM_PROCS,OMP_GET_MAX_THREADS logical,external :: OMP_GET_DYNAMIC,OMP_GET_NESTED ! OpenMP only supported for Deville routine !$OMP PARALLEL DEFAULT(NONE) & !$OMP SHARED(myrank) & !$OMP PRIVATE(thread_id,num_threads,num_procs,max_threads,is_dynamic,is_nested) ! gets thread number thread_id = OMP_GET_THREAD_NUM() ! gets total number of threads for this MPI process num_threads = OMP_GET_NUM_THREADS() ! OpenMP master thread only if (thread_id == 0) then ! 
gets additional environment info num_procs = OMP_GET_NUM_PROCS() max_threads = OMP_GET_MAX_THREADS() is_dynamic = OMP_GET_DYNAMIC() is_nested = OMP_GET_NESTED() ! user output if (myrank == 0) then write(IMAIN,*) '' write(IMAIN,*) 'OpenMP information:' write(IMAIN,*) ' number of threads = ', num_threads write(IMAIN,*) '' write(IMAIN,*) ' number of processors available = ', num_procs write(IMAIN,*) ' maximum number of threads available = ', num_procs write(IMAIN,*) ' dynamic thread adjustement = ', is_dynamic write(IMAIN,*) ' nested parallelism = ', is_nested write(IMAIN,*) '' call flush_IMAIN() endif endif !$OMP END PARALLEL #else ! nothing to do.. return #endif end subroutine prepare_openmp ! !------------------------------------------------------------------- ! subroutine prepare_xsmm() use constants,only: CUSTOM_REAL,SIZE_DOUBLE,m1,m2,IMAIN use specfem_par,only: myrank use my_libxsmm,only: libxsmm_init,libxsmm_dispatch,libxsmm_available,xmm1,xmm2,xmm3,USE_XSMM_FUNCTION ! prefetch versions use my_libxsmm,only: xmm1p,xmm2p,xmm3p,LIBXSMM_PREFETCH,USE_XSMM_FUNCTION_PREFETCH implicit none ! quick check if (m1 /= 5) stop 'LibXSMM with invalid m1 constant (must have m1 == 5)' if (m2 /= 5*5) stop 'LibXSMM with invalid m2 constant (must have m2 == 5*5)' if (CUSTOM_REAL == SIZE_DOUBLE) stop 'LibXSMM optimization only for single precision functions' ! initializes LIBXSMM call libxsmm_init() ! dispatch functions for matrix multiplications ! (see in compute_forces_**Dev.F90 routines for actual function call) ! example: a(n1,n2),b(n2,n3),c(n1,n3) -> c = a * b then libxsmm_dispatch(xmm,m=n1,n=n3,k=n2,alpha=1,beta=0) ! with A(n1,n2) 5x5-matrix, B(n2,n3) 5x25-matrix and C(n1,n3) 5x25-matrix call libxsmm_dispatch(xmm1, m=5, n=25, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL) ! with A(n1,n2) 25x5-matrix, B(n2,n3) 5x5-matrix and C(n1,n3) 25x5-matrix call libxsmm_dispatch(xmm2, m=25, n=5, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL) ! 
with A(n1,n2,n4) 5x5x5-matrix, B(n2,n3) 5x5-matrix and C(n1,n3,n4) 5x5x5-matrix call libxsmm_dispatch(xmm3, m=5, n=5, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL) !directly: call libxsmm_smm_5_5_5(A,B,C) if (libxsmm_available(xmm1) .and. libxsmm_available(xmm2) .and. libxsmm_available(xmm3)) then USE_XSMM_FUNCTION = .true. ! user output if (myrank == 0) then write(IMAIN,*) write(IMAIN,*) "LIBXSMM dispatch functions ready for small matrix-matrix multiplications" call flush_IMAIN() endif else USE_XSMM_FUNCTION = .false. print *,'LIBXSMM invalid dispatch function pointers:', & libxsmm_available(xmm1),libxsmm_available(xmm2),libxsmm_available(xmm3) ! hard stop !call exit_MPI(myrank,'LIBXSMM functions not ready, please check configuration & compilation') endif ! synchronizes processes call synchronize_all() ! prefetch versions call libxsmm_dispatch(xmm1p, m=5, n=25, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL,prefetch=LIBXSMM_PREFETCH) call libxsmm_dispatch(xmm2p, m=25, n=5, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL,prefetch=LIBXSMM_PREFETCH) call libxsmm_dispatch(xmm3p, m=5, n=5, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL,prefetch=LIBXSMM_PREFETCH) if (libxsmm_available(xmm1p) .and. libxsmm_available(xmm2p) .and. libxsmm_available(xmm3p)) then USE_XSMM_FUNCTION_PREFETCH = .true. ! user output if (myrank == 0) then write(IMAIN,*) "LIBXSMM prefetch functions ready for small matrix-matrix multiplications" write(IMAIN,*) call flush_IMAIN() endif else USE_XSMM_FUNCTION_PREFETCH = .false. print *,'LIBXSMM invalid prefetch function pointers:', & libxsmm_available(xmm1p),libxsmm_available(xmm2p),libxsmm_available(xmm3p) ! hard stop !call exit_MPI(myrank,'LIBXSMM prefetch functions not ready, please check configuration & compilation') endif ! force no dispatch !USE_XSMM_FUNCTION = .false. !USE_XSMM_FUNCTION_PREFETCH = .false. end subroutine prepare_xsmm !------------------------------------------------------------------- ! ! dummy routines ! 
!------------------------------------------------------------------- subroutine compute_element_dummy(ispec,ibool,tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, & dummyx_loc,dummyy_loc,dummyz_loc,rho_s_H) ! dummy example (original: isotropic element in crust/mantle region) ! ! it is mostly used to avoid over-simplification of the compute_forces routine: if we omit it, then compilers can do ! much more aggressive optimizations and the timing results would be misleading. the original routines for computing ! stresses on elements are more expensive and complicated. the dummy here will be much faster to compute, but should ! give similar relative performance results use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM #ifdef FORCE_VECTORIZATION use constants,only: NGLLCUBE #endif use specfem_par,only: NSPEC,GRAVITY_VAL implicit none ! element id integer,intent(in) :: ispec ! arrays with mesh parameters per slice integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC),intent(in) :: ibool ! element info real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ),intent(inout) :: & tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ),intent(in) :: dummyx_loc,dummyy_loc,dummyz_loc real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM),intent(out) :: rho_s_H ! local parameters real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: sigma_xx,sigma_yy,sigma_zz real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl real(kind=CUSTOM_REAL) :: fac,factor integer :: idummy #ifdef FORCE_VECTORIZATION ! in this vectorized version we have to assume that N_SLS == 3 in order to be able to unroll and thus suppress ! an inner loop that would otherwise prevent vectorization; this is safe in practice in all cases because N_SLS == 3 ! 
in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop integer :: ijk #else integer :: i,j,k #endif ! note: profiling shows that this routine takes about 60% of the total time, another 30% is spend in the tiso routine below.. DO_LOOP_IJK ! compute stress sigma ! (dummy values) sigma_xx(INDEX_IJK) = 0.1 * dummyx_loc(INDEX_IJK) sigma_yy(INDEX_IJK) = 0.1 * dummyy_loc(INDEX_IJK) sigma_zz(INDEX_IJK) = 0.1 * dummyz_loc(INDEX_IJK) sigma_xy(INDEX_IJK) = 0.3 * sigma_xx(INDEX_IJK) sigma_xz(INDEX_IJK) = 0.3 * sigma_yy(INDEX_IJK) sigma_yz(INDEX_IJK) = 0.3 * sigma_zz(INDEX_IJK) ENDDO_LOOP_IJK ! define symmetric components of sigma (to be general in case of gravity) DO_LOOP_IJK sigma_yx(INDEX_IJK) = sigma_xy(INDEX_IJK) sigma_zx(INDEX_IJK) = sigma_xz(INDEX_IJK) sigma_zy(INDEX_IJK) = sigma_yz(INDEX_IJK) ENDDO_LOOP_IJK ! compute non-symmetric terms for gravity if (GRAVITY_VAL) then ! dummy example, originally calls more complicated subroutine compute_element_gravity(..) DO_LOOP_IJK ! compute G tensor from s . g and add to sigma (not symmetric) ! (dummy values) sigma_xx(INDEX_IJK) = sigma_xx(INDEX_IJK) + 1.1 ! real(sy_l*gyl + sz_l*gzl, kind=CUSTOM_REAL) sigma_yy(INDEX_IJK) = sigma_yy(INDEX_IJK) + 1.1 ! real(sx_l*gxl + sz_l*gzl, kind=CUSTOM_REAL) sigma_zz(INDEX_IJK) = sigma_zz(INDEX_IJK) + 1.1 ! real(sx_l*gxl + sy_l*gyl, kind=CUSTOM_REAL) sigma_xy(INDEX_IJK) = sigma_xy(INDEX_IJK) - 0.3 ! real(sx_l * gyl, kind=CUSTOM_REAL) sigma_yx(INDEX_IJK) = sigma_yx(INDEX_IJK) - 0.3 ! real(sy_l * gxl, kind=CUSTOM_REAL) sigma_xz(INDEX_IJK) = sigma_xz(INDEX_IJK) - 0.5 ! real(sx_l * gzl, kind=CUSTOM_REAL) sigma_zx(INDEX_IJK) = sigma_zx(INDEX_IJK) - 0.5 ! real(sz_l * gxl, kind=CUSTOM_REAL) sigma_yz(INDEX_IJK) = sigma_yz(INDEX_IJK) - 0.7 ! real(sy_l * gzl, kind=CUSTOM_REAL) sigma_zy(INDEX_IJK) = sigma_zy(INDEX_IJK) - 0.7 ! real(sz_l * gyl, kind=CUSTOM_REAL) ! precompute vector factor = 0.5 ! 0.5 * dummyz_loc(INDEX_IJK) ! 
dble(jacobianl(INDEX_IJK)) * wgll_cube(INDEX_IJK) rho_s_H(INDEX_IJK,1) = factor * 1.5 ! real(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl), kind=CUSTOM_REAL) rho_s_H(INDEX_IJK,2) = factor * 1.5 ! real(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl), kind=CUSTOM_REAL) rho_s_H(INDEX_IJK,3) = factor * 1.5 ! real(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl), kind=CUSTOM_REAL) ENDDO_LOOP_IJK endif ! dot product of stress tensor with test vector, non-symmetric form DO_LOOP_IJK ! reloads derivatives of ux, uy and uz with respect to x, y and z ! (dummy) xixl = 1.1 xiyl = 1.2 xizl = 1.3 etaxl = 1.4 etayl = 1.5 etazl = 1.6 gammaxl = 1.7 gammayl = 1.8 gammazl = 1.9 ! common factor (dummy) fac = 0.5 ! form dot product with test vector, non-symmetric form ! this goes to accel_x tempx1(INDEX_IJK) = fac * (sigma_xx(INDEX_IJK)*xixl + sigma_yx(INDEX_IJK)*xiyl + sigma_zx(INDEX_IJK)*xizl) ! this goes to accel_y tempy1(INDEX_IJK) = fac * (sigma_xy(INDEX_IJK)*xixl + sigma_yy(INDEX_IJK)*xiyl + sigma_zy(INDEX_IJK)*xizl) ! this goes to accel_z tempz1(INDEX_IJK) = fac * (sigma_xz(INDEX_IJK)*xixl + sigma_yz(INDEX_IJK)*xiyl + sigma_zz(INDEX_IJK)*xizl) ! this goes to accel_x tempx2(INDEX_IJK) = fac * (sigma_xx(INDEX_IJK)*etaxl + sigma_yx(INDEX_IJK)*etayl + sigma_zx(INDEX_IJK)*etazl) ! this goes to accel_y tempy2(INDEX_IJK) = fac * (sigma_xy(INDEX_IJK)*etaxl + sigma_yy(INDEX_IJK)*etayl + sigma_zy(INDEX_IJK)*etazl) ! this goes to accel_z tempz2(INDEX_IJK) = fac * (sigma_xz(INDEX_IJK)*etaxl + sigma_yz(INDEX_IJK)*etayl + sigma_zz(INDEX_IJK)*etazl) ! this goes to accel_x tempx3(INDEX_IJK) = fac * (sigma_xx(INDEX_IJK)*gammaxl + sigma_yx(INDEX_IJK)*gammayl + sigma_zx(INDEX_IJK)*gammazl) ! this goes to accel_y tempy3(INDEX_IJK) = fac * (sigma_xy(INDEX_IJK)*gammaxl + sigma_yy(INDEX_IJK)*gammayl + sigma_zy(INDEX_IJK)*gammazl) ! this goes to accel_z tempz3(INDEX_IJK) = fac * (sigma_xz(INDEX_IJK)*gammaxl + sigma_yz(INDEX_IJK)*gammayl + sigma_zz(INDEX_IJK)*gammazl) ENDDO_LOOP_IJK ! 
avoid compiler warning idummy = ispec idummy = ibool(1,1,1,1) end subroutine compute_element_dummy ! !------------------------------------------------------------------- ! subroutine synchronize_all() ! dummy routine to make it easier for copy-paste from the original code implicit none continue end subroutine synchronize_all ! !------------------------------------------------------------------- ! subroutine exit_MPI(myrank,error_msg) ! dummy routine to make it easier for copy-paste from the original code use constants implicit none ! identifier for error message file integer, parameter :: IERROR = 30 integer :: myrank character(len=*) :: error_msg ! write error message to screen write(*,*) error_msg(1:len(error_msg)) write(*,*) 'Error detected, aborting MPI... proc ',myrank ! or just exit with message: stop 'Error, program ended in exit_MPI' end subroutine exit_MPI ! !------------------------------------------------------------------- ! subroutine flush_IMAIN() ! dummy routine to make it easier for copy-paste from the original code implicit none continue end subroutine flush_IMAIN libxsmm-1.17/samples/specfem/specfem.sh000077500000000000000000000050141415223013700201620ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/spmdm/000077500000000000000000000000001415223013700156775ustar00rootroot00000000000000libxsmm-1.17/samples/spmdm/Makefile000066400000000000000000000074101415223013700173410ustar00rootroot00000000000000ROOTDIR = $(abspath 
$(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 0 OMP = 1 SYM = 1 BFLOAT = 0 ifeq (1,$(BFLOAT)) CFLAGS += -DUSE_BFLOAT endif # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) 
$(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(OBJECTS) $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(OBJECTS) $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/spmdm/spmdm.c000066400000000000000000000333111415223013700171640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel 
Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Nadathur Satish (Intel Corp.) ******************************************************************************/ /* NOTE: This code currently ignores alpha input to the matrix multiply */ #include #include #include #include #include #if defined(_OPENMP) # include #endif #if !defined(USE_BFLOAT) && 0 # define USE_BFLOAT typedef libxsmm_bfloat16 REAL_TYPE; #else typedef float REAL_TYPE; #endif LIBXSMM_INLINE void spmdm_check_c(const libxsmm_spmdm_handle* handle, float* test, float* gold) { double max_error = 0.0; double src_norm = 0.0; double dst_norm = 0.0; size_t l; for (l = 0; l < (size_t)handle->m * (size_t)handle->n; ++l) { const double dstval = (double)test[l]; const double srcval = (double)gold[l]; const double local_error = fabs(dstval - srcval); if (local_error > max_error) { max_error = local_error; } /*if (local_error > 1e-3) printf("(%d,%d) : gold: %f, computed: %f\n", l / handle->n, l % handle->n, srcval, dstval);*/ src_norm += srcval; dst_norm += dstval; } printf(" max error: %f, sum BLAS: %f, sum LIBXSMM: %f \n", max_error, src_norm, dst_norm ); } LIBXSMM_INLINE void spmdm_exec_fp32( const libxsmm_spmdm_handle* handle, const char transA, const char transB, const float* alpha, const float* A, const float* B, const char transC, const float* beta, float* C, libxsmm_CSR_sparseslice* A_sparse) { int num_createSparseSlice_blocks = libxsmm_spmdm_get_num_createSparseSlice_blocks(handle); int num_compute_blocks = libxsmm_spmdm_get_num_compute_blocks(handle); int i; # if defined(_OPENMP) # pragma omp parallel private(i) # endif { # if defined(_OPENMP) const int nthreads = omp_get_num_threads(); const int tid = omp_get_thread_num(); # else const 
int nthreads = 1; const int tid = 0; # endif # if defined(_OPENMP) # pragma omp for # endif for (i = 0; i < num_createSparseSlice_blocks; ++i) { libxsmm_spmdm_createSparseSlice_fp32_thread(handle, transA, A, A_sparse, i, tid, nthreads); } # if defined(_OPENMP) # pragma omp for # endif for (i = 0; i < num_compute_blocks; ++i) { libxsmm_spmdm_compute_fp32_thread(handle, transA, transB, alpha, A_sparse, B, transC, beta, C, i, tid, nthreads); } } } LIBXSMM_INLINE void spmdm_exec_bfloat16( const libxsmm_spmdm_handle* handle, const char transA, const char transB, const libxsmm_bfloat16* alpha, const libxsmm_bfloat16* A, const libxsmm_bfloat16* B, const char transC, const libxsmm_bfloat16* beta, float* C, libxsmm_CSR_sparseslice* A_sparse) { int num_createSparseSlice_blocks = libxsmm_spmdm_get_num_createSparseSlice_blocks(handle); int num_compute_blocks = libxsmm_spmdm_get_num_compute_blocks(handle); int i; # if defined(_OPENMP) # pragma omp parallel private(i) # endif { # if defined(_OPENMP) const int nthreads = omp_get_num_threads(); const int tid = omp_get_thread_num(); # else const int nthreads = 1; const int tid = 0; # endif # if defined(_OPENMP) # pragma omp for # endif for (i = 0; i < num_createSparseSlice_blocks; ++i) { libxsmm_spmdm_createSparseSlice_bfloat16_thread(handle, transA, A, A_sparse, i, tid, nthreads); } # if defined(_OPENMP) # pragma omp for # endif for (i = 0; i < num_compute_blocks; ++i) { libxsmm_spmdm_compute_bfloat16_thread(handle, transA, transB, alpha, A_sparse, B, transC, beta, C, i, tid, nthreads); } } } int main(int argc, char *argv[]) { REAL_TYPE *A_gold, *B_gold, *A_gold2, *B_gold2; float *C_gold, *C0_gold, *C, *C2; int M, N, K; REAL_TYPE alpha, beta; int reps; libxsmm_spmdm_handle handle, handle2; libxsmm_CSR_sparseslice *A_sparse, *A_sparse2; int max_threads; /* Step 1: Read in args */ libxsmm_timer_tickint start, end; double flops, duration; char transA, transB, transC; int i, j, k; size_t l; /* Step 1: Initialize handle */ M = 0; N = 
0; K = 0; alpha = (REAL_TYPE)1.0; beta = (REAL_TYPE)0.0; reps = 0; transA = 'N'; transB = 'N'; if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: %s [M] [N] [K] [transA] [transB] [reps]\n\n", argv[0]); return EXIT_SUCCESS; } /* defaults */ M = 2048; N = 2048; K = 2048; transA = 'N'; transB = 'N'; transC = 'N'; reps = 100; /* reading new values from cli */ i = 1; if (argc > i) M = atoi(argv[i++]); if (argc > i) N = atoi(argv[i++]); if (argc > i) K = atoi(argv[i++]); if (argc > i) { transA = argv[i][0]; i++; } if (argc > i) { transB = argv[i][0]; i++; } if (argc > i) { transC = argv[i][0]; i++; } if (argc > i) reps = atoi(argv[i++]); /* Step 2: allocate data */ A_gold = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 ); B_gold = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 ); C_gold = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); C0_gold = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); C = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); /* Step 3: init data */ libxsmm_rng_set_seed(1); for (l = 0; l < (size_t)M * (size_t)K; ++l) { const double r64 = libxsmm_rng_f64(); const float r32 = (float)r64; #ifdef USE_BFLOAT const int r = *(const int*)(&r32); const libxsmm_bfloat16 val = (r >> 16); #else const float val = r32; #endif if (r64 > 0.85) A_gold[l] = val; else A_gold[l] = (REAL_TYPE)0.0; } for (l = 0; l < (size_t)K * (size_t)N; ++l) { const double r64 = libxsmm_rng_f64(); const float r32 = (float)r64; #ifdef USE_BFLOAT const int r = *(const int*)(&r32); const libxsmm_bfloat16 val = (r >> 16); #else const float val = r32; #endif B_gold[l] = val; } for (l = 0; l < (size_t)M * (size_t)N; ++l) { C0_gold[l] = (float)libxsmm_rng_f64(); C_gold[l] = C0_gold[l]; } for (l = 0; l < (size_t)M * (size_t)N; ++l) { C[l] = (float)C0_gold[l]; } flops = (double)M * (double)N * (double)K * 2.0; 
/*----------------------------------------------------------------------------------------------------------------------*/ /* Step 4: Initialize LIBXSMM for these sizes - allocates handle and temporary space for the sparse data structure for A */ # if defined(_OPENMP) max_threads = omp_get_max_threads(); # else max_threads = 1; # endif start = libxsmm_timer_tick(); libxsmm_spmdm_init(M, N, K, max_threads, &handle, &A_sparse); end = libxsmm_timer_tick(); printf("Time for handle init = %f\n", libxsmm_timer_duration(start, end)); printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i -- forward pass\n", M, N, K, handle.bm, handle.bn, handle.bk, handle.mb, handle.nb, handle.kb, reps ); /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */ /* Currently ignores alpha */ /* TODO: fix alpha input */ # ifdef USE_BFLOAT spmdm_exec_bfloat16(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # else spmdm_exec_fp32(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # endif /* Checks */ /* Compute a "gold" answer sequentially */ #if defined(_OPENMP) LIBXSMM_OMP_VAR(k); # pragma omp parallel for private(i, j, k) LIBXSMM_OPENMP_COLLAPSE(2) #endif for (i = 0; i < M; ++i) { for (j = 0; j < N; ++j) { float sum = 0.0; float Cval; for (k = 0; k < K; ++k) { # ifdef USE_BFLOAT libxsmm_bfloat16 Atmp = A_gold[i*K+k]; int Atmp_int = Atmp; Atmp_int <<= 16; float Aval = *(float *)&Atmp_int; libxsmm_bfloat16 Btmp = B_gold[k*N+j]; int Btmp_int = Btmp; Btmp_int <<= 16; float Bval = *(float *)&Btmp_int; # else float Aval = A_gold[i*K + k]; float Bval = B_gold[k*N + j]; # endif sum += Aval * Bval; } Cval = sum; C_gold[i*N + j] = Cval + beta*C_gold[i*N + j]; } } /* LIBXSMM_FSYMBOL(sgemm)(&trans, &trans, &N, &M, &K, &alpha, B_gold, &N, A_gold, &K, &beta, C_gold, &N); */ /* Compute the max difference between gold and 
computed results. */ spmdm_check_c( &handle, C, C_gold ); /* Timing loop starts */ start = libxsmm_timer_tick(); for (i = 0; i < reps; ++i) { # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # else spmdm_exec_fp32( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # endif } end = libxsmm_timer_tick(); duration = libxsmm_timer_duration(start, end); printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps); libxsmm_spmdm_destroy(&handle); /*----------------------------------------------------------------------------------------------------------------------*/ /* Step 5: Initialize libxsmm for transpose A - allocates handle and temporary space for the sparse data structure for A */ transA = 'T'; transB = 'N'; transC = 'T'; libxsmm_spmdm_init(M, N, K, max_threads, &handle2, &A_sparse2); printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transA = Y, transC = Y -- weight update\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps ); A_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 ); C2 = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); for (i = 0; i < M; ++i) { for (j = 0; j < K; ++j) { A_gold2[j*M + i] = A_gold[i*K + j]; } } for (i = 0; i < M; ++i) { for (j = 0; j < N; ++j) { C[j*M + i] = (float)C0_gold[i*N + j]; } } /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */ /* Currently ignores alpha */ /* TODO: fix alpha inputs */ # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # endif for (i = 0; i < M; ++i) { for (j 
= 0; j < N; ++j) { C2[i*N + j] = C[j*M + i]; } } /* Checks */ spmdm_check_c( &handle2, C2, C_gold); /* Timing loop starts */ start = libxsmm_timer_tick(); for (i = 0; i < reps; ++i) { # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # endif } end = libxsmm_timer_tick(); duration = libxsmm_timer_duration(start, end); printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps); /*----------------------------------------------------------------------------------------------------------------------*/ /* Step 6: Test transpose B */ transA = 'N'; transB = 'T'; transC = 'N'; printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transB = Y -- backprop\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps ); B_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 ); for (i = 0; i < K; ++i) { for (j = 0; j < N; ++j) { B_gold2[j*K + i] = B_gold[i*N + j]; } } for (l = 0; l < (size_t)M * (size_t)N; ++l) { C[l] = (float)C0_gold[l]; } /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */ /* Currently ignores alpha */ /* TODO: fix alpha inputs */ # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # endif /* Checks */ spmdm_check_c( &handle2, C, C_gold); /* Timing loop starts */ start = libxsmm_timer_tick(); for (i = 0; i < reps; ++i) { # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # else 
spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # endif } end = libxsmm_timer_tick(); duration = libxsmm_timer_duration(start, end); printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps); libxsmm_spmdm_destroy(&handle2); libxsmm_free(A_gold); libxsmm_free(B_gold); libxsmm_free(C_gold); libxsmm_free(C); libxsmm_free(C2); libxsmm_free(C0_gold); libxsmm_free(B_gold2); libxsmm_free(A_gold2); return EXIT_SUCCESS; } libxsmm-1.17/samples/spmdm/spmdm.sh000077500000000000000000000050141415223013700173560ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/spmdm/spmdm.vcxproj000066400000000000000000000536451415223013700204510ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 spmdm {61BF5F50-3B4C-488B-9D0F-9AAB008B3184} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 ProgramDatabase None false 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 ProgramDatabase None false 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/transpose/000077500000000000000000000000001415223013700165755ustar00rootroot00000000000000libxsmm-1.17/samples/transpose/Makefile000066400000000000000000000104511415223013700202360ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 2 OMP = 1 SYM = 1 # explore AVX/ARCH=native SSE = 0 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard 
$(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) $(OUTDIR)/$(OUTNAME)f $(OUTDIR)/kernel .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (,$(strip $(FC))) ifneq (0,$(shell echo "$$((3>$(XSMM_GCC) || 40600<=$(FC_VERSION_NUM)))")) $(OUTDIR)/$(OUTNAME)f: $(OUTDIR)/.make $(FTNOBJS) $(FORTDEP) $(LIBDEP) $(EXTDEP) $(FLD) -o $@ $(FTNOBJS) $(FORTLIB) $(EXTLIB) $(MAINLIB) $(FCMTFLAGS) $(SLDFLAGS) $(LDFLAGS) $(FLDFLAGS) $(ELDFLAGS) else .PHONY: $(OUTDIR)/$(OUTNAME)f endif else .PHONY: $(OUTDIR)/$(OUTNAME)f endif $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)-c.o $(DEPDIR)/include/libxsmm_source.h $(LD) -o $@ $(BLDDIR)/$(OUTNAME)-c.o $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/kernel: $(OUTDIR)/.make $(BLDDIR)/kernel-c.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/kernel-c.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: 
$(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/transpose/README.md000066400000000000000000000103001415223013700200460ustar00rootroot00000000000000# Matrix Transpose (TCOPY) ## Overview This code sample aims to benchmark the performance of matrix transposes. The C/C++ and [FORTRAN sample code](https://github.com/hfp/libxsmm/blob/master/samples/transpose/transpose.f) differ slightly with the C/C++ code sample offering a richer set of command line options as well as build settings available inside of the [translation unit](https://github.com/hfp/libxsmm/blob/master/samples/transpose/transpose.c). The available command line options of the sample code may be reviewed by looking into the source code. 
Generally, the idea is to support the following: > transpose [<kind> [<m> [<n> [<ldi> [<ldo>]]]]] transposef [<m> [<n> [<ldi> [<ldo>]]]] Above, `m` and `n` specify the matrix shape, and `ldi` the leading dimension of the matrix. The argument `ldo` allows to specify an output dimension, which may differ from `ldi`. The transpose kind shall be either out-of-place (`o`) or in-place (`i`). Running the C sample code may look like: ```bash $ ./transpose.sh o 20000 m=20000 n=20000 ldi=20000 ldo=20000 size=3052MB (double, out-of-place) bandwidth: 18.8 GB/s duration: 159 ms ``` Instead of executing a wrapper script, one may affinitize the multi-threaded execution manually (OpenMP runtime). In case of an executable built using the Intel Compiler this may look like: ```bash LIBXSMM_VERBOSE=2 KMP_AFFINITY=balanced,granularity=fine,1 \ ./transpose o 20000 m=20000 n=20000 ldi=20000 ldo=20000 size=3052MB (double, out-of-place) bandwidth: 21.1 GB/s duration: 141 ms Registry: 20 MB (gemm=0 mcopy=0 tcopy=1) ``` In the above case one can see from the verbose output (`LIBXSMM_VERBOSE=2`) that one kernel (tcopy) served transposing the entire matrix. To avoid duplicating JIT-kernels under contention (code registry), one may also consider `LIBXSMM_TRYLOCK=1`, which is available per API-call as well. ## OpenTuner To tune the tile sizes ("block sizes") internal to LIBXSMM's transpose routine, the [OpenTuner](http://opentuner.org/) extensible framework for program autotuning can be used. In case of issues during the tuning phase ("no value has been set for this column"), please install the latest 1.2.x revision of SQLAlchemy (`pip install sqlalchemy==1.2.19`). A tuning script (`transpose_opentuner.py`) is provided, which accepts a range of matrix sizes as command line arguments. > transpose_opentuner.py <begin> <end> [*nexperiments-per-epoch*] [*tile-size-m*] [*tile-size-n*] To start a tuning experiment for a new set of arguments, it is highly recommended to start from scratch. 
Otherwise the population of previously generated tuning results is fetched from a database and used to tune an eventually unrelated range of matrix shapes. To get reliable timings, the total time for all experiments per epoch is minimized (hence a different number of experiments per epoch also asks for an own database). Optionally, the initial block size can be seeded (`tile-size-m` and `tile-size-n`). ```bash rm -rf opentuner.db ``` The script tunes matrices with randomized shape according to the specified range. The leading dimension is chosen tightly for the experiments. The optimizer not only maximizes the performance but also minimizes the value of *M \* N* (which also helps to prune duplicated results due to an additional preference). ```bash rm -rf opentuner.db ./transpose_opentuner.py --no-dups 1 1024 1000 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 1024 2048 100 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 2048 3072 20 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 3072 4096 20 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 4096 5120 16 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 5120 6144 12 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 6144 7168 8 rm -rf opentuner.db ./transpose_opentuner.py --no-dups 7168 8192 6 ``` The tuning script uses the environment variables `LIBXSMM_TCOPY_M` and `LIBXSMM_TCOPY_N`, which are internal to LIBXSMM. These variables are used to adjust certain thresholds in `libxsmm_otrans` or to request a specific tiling-scheme inside of the `libxsmm_otrans_omp` routine. libxsmm-1.17/samples/transpose/kernel.c000066400000000000000000000255621415223013700202330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry, Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include #include #include #include LIBXSMM_INLINE void dfill_matrix ( double *matrix, unsigned int ld, unsigned int m, unsigned int n ) { unsigned int i, j; double dtmp; if ( ld < m ) { fprintf(stderr,"Error is dfill_matrix: ld=%u m=%u mismatched!\n",ld,m); exit(EXIT_FAILURE); } for ( j = 1; j <= n; j++ ) { /* Fill through the leading dimension */ for ( i = 1; i <= ld; i++ ) { dtmp = 1.0 - 2.0*libxsmm_rng_f64(); matrix [ (j-1)*ld + (i-1) ] = dtmp; } } } LIBXSMM_INLINE void sfill_matrix ( float *matrix, unsigned int ld, unsigned int m, unsigned int n ) { unsigned int i, j; double dtmp; if ( ld < m ) { fprintf(stderr,"Error is sfill_matrix: ld=%u m=%u mismatched!\n",ld,m); exit(EXIT_FAILURE); } for ( j = 1; j <= n; j++ ) { /* Fill through the leading dimension */ for ( i = 1; i <= ld; i++ ) { dtmp = 1.0 - 2.0*libxsmm_rng_f64(); matrix [ (j-1)*ld + (i-1) ] = (float) dtmp; } } } LIBXSMM_INLINE double residual_stranspose ( float *A, unsigned int lda, unsigned int m, unsigned int n, float *out, unsigned int ld_out, unsigned int *nerrs ) { unsigned int i, j; double dtmp, derror; *nerrs = 0; derror = 0.0; for ( j = 1; j <= n; j++ ) { for ( i = 1; i <= m; i++ ) { dtmp = A[ (j-1)*lda + (i-1) ] - out [ (i-1)*ld_out + (j-1) ]; if ( dtmp < 0.0 ) dtmp = -dtmp; if ( dtmp > 0.0 ) { *nerrs = *nerrs + 1; if ( *nerrs < 10 ) printf("Err #%u: A(%u,%u)=%g B(%u,%u)=%g Diff=%g\n",*nerrs,i,j,A[(j-1)*lda+(i-1)],j,i,out[(i-1)*ld_out+(j-1)],dtmp); } derror += (double) dtmp; } } return ( derror ); } LIBXSMM_INLINE double residual_dtranspose ( double *A, unsigned int lda, unsigned int m, unsigned int n, double *out, unsigned int ld_out, unsigned int *nerrs ) { unsigned 
int i, j; double dtmp, derror; static int ntimes = 0; *nerrs = 0; derror = 0.0; for ( j = 1; j <= n; j++ ) { for ( i = 1; i <= m; i++ ) { dtmp = A[ (j-1)*lda + (i-1) ] - out [ (i-1)*ld_out + (j-1) ]; if ( dtmp < 0.0 ) dtmp = -dtmp; if ( dtmp > 0.0 ) { if ( ++ntimes < 5 ) printf("FP64 Position (%u,%u) is %g and not %g\n",i,j,out [ (i-1)*ld_out + (j-1) ], A[ (j-1)*lda + (i-1) ]); *nerrs = *nerrs + 1; } derror += dtmp; } } return ( derror ); } /* Comment 1 of the following lines to compare to an ass. code byte-for-byte */ /* #define COMPARE_TO_A_FP32_ASSEMBLY_CODE */ /* #define COMPARE_TO_A_FP64_ASSEMBLY_CODE */ #if defined(COMPARE_TO_A_FP32_ASSEMBLY_CODE) || defined(COMPARE_TO_A_FP64_ASSEMBLY_CODE) # ifndef COMPARE_TO_AN_ASSEMBLY_CODE # define COMPARE_TO_AN_ASSEMBLY_CODE # endif #endif #if defined(COMPARE_TO_A_FP32_ASSEMBLY_CODE) && defined(COMPARE_TO_A_FP64_ASSEMBLY_CODE) # error Define a comparison to either FP32 or FP64 code, not both at once #endif /* Use these lines to dump the real*4 or real*8 assembly files for the kernel */ /* #define DUMP_FP32_ASSEMBLY_FILE #define DUMP_FP64_ASSEMBLY_FILE */ int main(int argc, char* argv[]) { unsigned int m = 16, n = 16, ld_in = 16, ld_out = 16, nerrs; const unsigned char* cptr; double *dinp, *dout, dtmp; float *sinp, *sout; #if defined(DUMP_FP32_ASSEMBLY_FILE) || defined(DUMP_FP64_ASSEMBLY_FILE) FILE *fp; char buffer[80]; int stop_dumping = 0; unsigned int i; #endif #ifdef COMPARE_TO_AN_ASSEMBLY_CODE unsigned int nbest, istop; unsigned char *cptr2; extern void myro_(); #endif union { libxsmm_xtransfunction f; const void* p; } skernel, dkernel; const libxsmm_trans_descriptor* desc = 0; libxsmm_descriptor_blob blob; if ( argc <= 3 ) { printf("\nUSAGE: %s m n ld_in ld_out\n",argv[0]); printf("Out-of-place transpose a mxn matrix of leading dimension ld_in\n"); printf("Defaults: m=n=ld_in=ld_out=16\n"); printf("Note: ld_in is NOT needed for dispatching. 
Code works for any valid (>=m) ld_in\n"); printf("Note: ld_out is now needed for dispatching. Code will only work for a fixed value, like m and n.\n"); } if ( argc > 1 ) m = atoi(argv[1]); if ( argc > 2 ) n = atoi(argv[2]); if ( argc > 3 ) ld_in = atoi(argv[3]); if ( argc > 4 ) ld_out = atoi(argv[4]); m = LIBXSMM_MAX(m,1); n = LIBXSMM_MAX(n,1); ld_in = LIBXSMM_MAX(ld_in,m); ld_out = LIBXSMM_MAX(ld_out,n); printf("This is a tester for JIT transpose kernels! (m=%u n=%u ld_in=%u ld_out=%u)\n",m,n,ld_in,ld_out); /* test dispatch call */ desc = libxsmm_trans_descriptor_init(&blob, sizeof(float), m, n, ld_out); skernel.f = libxsmm_dispatch_trans(desc); desc = libxsmm_trans_descriptor_init(&blob, sizeof(double), m, n, ld_out); dkernel.f = libxsmm_dispatch_trans(desc); printf("address of FP32 kernel: %p\n", skernel.p); printf("address of FP64 kernel: %p\n", dkernel.p); #ifndef DUMP_FP64_ASSEMBLY_FILE cptr = (const unsigned char*)dkernel.p; printf("First few bytes/opcodes: 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n",cptr[0],cptr[1],cptr[2],cptr[3],cptr[4],cptr[5]); #else printf("Dumping FP64 assembly file\n"); cptr = (const unsigned char*)dkernel.p; fp = fopen("foo.s","w"); fputs("\t.text\n",fp); fputs("\t.align 256\n",fp); fputs("\t.globl trans_\n",fp); fputs("trans_:\n",fp); i = 0; stop_dumping = 0; while ( (i < 7000) && (stop_dumping == 0) ) { if ( (i >= 0) && (cptr[i ]==0x5c) && (cptr[i+1]==0x5d) && (cptr[i+2]==0x5b) && (cptr[i+3]==0xc3) ) stop_dumping = 1; if ( (i >= 1) && (cptr[i-1]==0x5c) && (cptr[i ]==0x5d) && (cptr[i+1]==0x5b) && (cptr[i+2]==0xc3) ) stop_dumping = 1; if ( (i >= 2) && (cptr[i-2]==0x5c) && (cptr[i-1]==0x5d) && (cptr[i ]==0x5b) && (cptr[i+1]==0xc3) ) stop_dumping = 1; if ( (i >= 3) && (cptr[i-3]==0x5c) && (cptr[i-2]==0x5d) && (cptr[i-1]==0x5b) && (cptr[i ]==0xc3) ) stop_dumping = 1; sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]); fputs(buffer,fp); i += 4; } fputs("\tretq\n",fp); fputs("\t.type 
trans_,@function\n",fp); fputs("\t.size trans_,.-trans_\n",fp); fclose(fp); printf("Dumped FP64 %u bytes\n",i); #endif #ifdef DUMP_FP32_ASSEMBLY_FILE printf("Dumping FP32 assembly file\n"); cptr = (const unsigned char*)skernel.p; fp = fopen("soo.s","w"); fputs("\t.text\n",fp); fputs("\t.align 256\n",fp); fputs("\t.globl strans_\n",fp); fputs("strans_:\n",fp); i = 0; stop_dumping = 0; while ( (i < 7000) && (stop_dumping == 0) ) { if ( (i >= 0) && (cptr[i ]==0x5c) && (cptr[i+1]==0x5d) && (cptr[i+2]==0x5b) && (cptr[i+3]==0xc3) ) stop_dumping = 1; if ( (i >= 1) && (cptr[i-1]==0x5c) && (cptr[i ]==0x5d) && (cptr[i+1]==0x5b) && (cptr[i+2]==0xc3) ) stop_dumping = 1; if ( (i >= 2) && (cptr[i-2]==0x5c) && (cptr[i-1]==0x5d) && (cptr[i ]==0x5b) && (cptr[i+1]==0xc3) ) stop_dumping = 1; if ( (i >= 3) && (cptr[i-3]==0x5c) && (cptr[i-2]==0x5d) && (cptr[i-1]==0x5b) && (cptr[i ]==0xc3) ) stop_dumping = 1; sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]); fputs(buffer,fp); i += 4; } fputs("\tretq\n",fp); fputs("\t.type strans_,@function\n",fp); fputs("\t.size strans_,.-strans_\n",fp); fclose(fp); printf("Dumped FP32 %u bytes\n",i); #endif #ifdef COMPARE_TO_AN_ASSEMBLY_CODE #ifdef COMPARE_TO_A_FP64_ASSEMBLY_CODE cptr = (const unsigned char*)dkernel.p; #else cptr = (const unsigned char*)skernel.p; #endif cptr2 = (unsigned char *) &myro_; i = 0; nbest = 0; istop = 0; while ( istop == 0 ) { if ( cptr[i] != cptr2[i] ) { printf("Byte: %u=0x%x differs. We generated: 0x%02x. 
Should be: 0x%02x\n", i,i,cptr[i],cptr2[i]); } else { ++nbest; } if ( i >= 208 ) istop = 1; if ( i >= 2 ) { if ( (cptr2[i]==0xc3) && (cptr2[i-1]==0x5b) && (cptr2[i-2]==0x5d) ) istop = 1; #if 0 if ( i == 114 ) printf("cptr2=0x%02x 0x%02x 0x%02x istop=%u\n",cptr2[i],cptr2[i-1],cptr2[i-2],istop); #endif } ++i; } printf("Bytes agree: %u\n",nbest); #endif sinp = (float *) malloc ( ld_in*n*sizeof(float) ); dinp = (double *) malloc ( ld_in*n*sizeof(double) ); sout = (float *) malloc ( ld_out*m*sizeof(float) ); dout = (double *) malloc ( ld_out*m*sizeof(double) ); /* Fill matrices with random data: */ sfill_matrix ( sinp, ld_in, m, n ); dfill_matrix ( dinp, ld_in, m, n ); sfill_matrix ( sout, ld_out, n, m ); dfill_matrix ( dout, ld_out, n, m ); /* if ( ld_out != n ) { fprintf(stderr,"Final warning: This code only works for ld_out=n (n=%u,ld_out=%u)\n",n,ld_out); exit(EXIT_FAILURE); } */ #ifdef COMPARE_TO_A_FP64_ASSEMBLY_CODE printf("Calling myro_: \n"); myro_ ( dinp, &ld_in, dout, &ld_out ); dtmp = residual_dtranspose ( dinp, ld_in, m, n, dout, ld_out, &nerrs ); printf("Myro_ FP64 error: %g number of errors: %u\n",dtmp,nerrs); dfill_matrix ( dout, ld_out, n, m ); #endif #ifdef COMPARE_TO_A_FP32_ASSEMBLY_CODE printf("Calling myro_: \n"); myro_ ( sinp, &ld_in, sout, &ld_out ); dtmp = residual_stranspose ( sinp, ld_in, m, n, sout, ld_out, &nerrs ); printf("Myro_ FP32 error: %g number of errors: %u\n",dtmp,nerrs); sfill_matrix ( sout, ld_out, n, m ); #endif /* let's call */ #if 1 printf("calling skernel\n"); skernel.f( sinp, &ld_in, sout, &ld_out ); printf("calling dkernel\n"); dkernel.f( dinp, &ld_in, dout, &ld_out ); #endif /* Did it transpose correctly? 
*/ dtmp = residual_stranspose ( sinp, ld_in, m, n, sout, ld_out, &nerrs ); printf("Single precision m=%u n=%u ld_in=%u ld_out=%u error: %g number of errors: %u",m,n,ld_in,ld_out,dtmp,nerrs); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n); printf("\n"); dtmp = residual_dtranspose ( dinp, ld_in, m, n, dout, ld_out, &nerrs ); printf("Double precision m=%u n=%u ld_in=%u ld_out=%u error: %g number of errors: %u\n",m,n,ld_in,ld_out,dtmp,nerrs); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 case",m,n); printf("\n"); free(dout); free(sout); free(dinp); free(sinp); return EXIT_SUCCESS; } libxsmm-1.17/samples/transpose/kernel.vcxproj000066400000000000000000000540621415223013700215010ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 kernel {6B98D143-0300-4965-9580-F7F87FDA31E3} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/transpose/transpose.c000066400000000000000000000305041415223013700207610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel 
Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #if defined(_OPENMP) # include #endif #if !defined(ELEM_TYPE) # define ELEM_TYPE double #endif #if !defined(RAND_SEED) # define RAND_SEED 25071975 #endif #if (defined(_OPENMP) || (defined(__BLAS) && 1 < (__BLAS))) # if !defined(OTRANS_THREAD) && defined(_OPENMP) && 0 # define OTRANS_THREAD libxsmm_otrans_thread # endif # define OTRANS libxsmm_otrans_omp #else # define OTRANS libxsmm_otrans #endif #define ITRANS libxsmm_itrans #if defined(__BLAS) && (0 != __BLAS) && \ (LIBXSMM_EQUAL(ELEM_TYPE, float) || LIBXSMM_EQUAL(ELEM_TYPE, double)) # if defined(__MKL) # include # define OTRANS_GOLD(M, N, A, LDI, B, LDO) \ LIBXSMM_CONCATENATE(mkl_, LIBXSMM_TPREFIX(ELEM_TYPE, omatcopy))('C', 'T', \ (size_t)(*(M)), (size_t)(*(N)), (ELEM_TYPE)1, A, (size_t)(*(LDI)), B, (size_t)(*(LDO))) # define ITRANS_GOLD(M, N, A, LDI, LDO) \ LIBXSMM_CONCATENATE(mkl_, LIBXSMM_TPREFIX(ELEM_TYPE, imatcopy))('C', 'T', \ (size_t)(*(M)), (size_t)(*(N)), (ELEM_TYPE)1, A, (size_t)(*(LDI)), (size_t)(*(LDO))) # if !defined(USE_REFERENCE) # define USE_REFERENCE # endif # elif defined(__OPENBLAS77) && 0/* issue #390 */ # include # define OTRANS_GOLD(M, N, A, LDI, B, LDO) { \ /*const*/char otrans_gold_tc_ = 'C', otrans_gold_tt_ = 'T'; \ /*const*/ELEM_TYPE otrans_gold_alpha_ = 1; \ LIBXSMM_FSYMBOL(LIBXSMM_TPREFIX(ELEM_TYPE, omatcopy))(&otrans_gold_tc_, &otrans_gold_tt_, \ (libxsmm_blasint*)(M), (libxsmm_blasint*)(N), 
&otrans_gold_alpha_, A, \ (libxsmm_blasint*)(LDI), B, (libxsmm_blasint*)(LDO)); \ } # define ITRANS_GOLD(M, N, A, LDI, LDO) { \ /*const*/char itrans_gold_tc_ = 'C', itrans_gold_tt_ = 'T'; \ /*const*/ELEM_TYPE itrans_gold_alpha_ = 1; \ LIBXSMM_FSYMBOL(LIBXSMM_TPREFIX(ELEM_TYPE, imatcopy))(&itrans_gold_tc_, &itrans_gold_tt_, \ (libxsmm_blasint*)(M), (libxsmm_blasint*)(N), &itrans_gold_alpha_, A, \ (libxsmm_blasint*)(LDI), (libxsmm_blasint*)(LDO)); \ } # if !defined(USE_REFERENCE) # define USE_REFERENCE # endif # endif #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif LIBXSMM_INLINE LIBXSMM_RETARGETABLE ELEM_TYPE initial_value(libxsmm_blasint i, libxsmm_blasint j, libxsmm_blasint ld) { return (ELEM_TYPE)i * ld + j; } LIBXSMM_INLINE LIBXSMM_RETARGETABLE libxsmm_blasint randstart(libxsmm_blasint start, libxsmm_blasint value) { const libxsmm_blasint s = (start < value ? start : 0), r = LIBXSMM_MIN(s + (rand() % (value - s)) + 1, value); assert(0 < r && s <= r && r <= value); return r; } #if !defined(USE_REFERENCE) LIBXSMM_INLINE void matrix_transpose(ELEM_TYPE *LIBXSMM_RESTRICT dst, const ELEM_TYPE *LIBXSMM_RESTRICT src, libxsmm_blasint rows, libxsmm_blasint cols) { libxsmm_blasint i, j; LIBXSMM_VLA_DECL(2, const ELEM_TYPE, src_2d, src, cols); LIBXSMM_VLA_DECL(2, ELEM_TYPE, dst_2d, dst, rows); #if defined(_OPENMP) LIBXSMM_OMP_VAR(i); LIBXSMM_OMP_VAR(j); # pragma omp parallel for private(i, j) #endif for (i = 0; i < rows; ++i) { for (j = 0; j < cols; ++j) { LIBXSMM_VLA_ACCESS(2, dst_2d, j, i, rows) = LIBXSMM_VLA_ACCESS(2, src_2d, i, j, cols); } } } #endif int main(int argc, char* argv[]) { const char t = (char)(1 < argc ? *argv[1] : 'o'); const libxsmm_blasint m = (2 < argc ? atoi(argv[2]) : 4096); #if 0 /* TODO: enable when in-place transpose is fully supported */ const libxsmm_blasint n = (3 < argc ? atoi(argv[3]) : m); #else const libxsmm_blasint n = (3 < argc ? (('o' == t || 'O' == t) ? 
atoi(argv[3]) : m) : m); #endif const libxsmm_blasint ldi = LIBXSMM_MAX/*sanitize ld*/(4 < argc ? atoi(argv[4]) : 0, m); const libxsmm_blasint ldo = LIBXSMM_MAX/*sanitize ld*/(5 < argc ? atoi(argv[5]) : 0, n); const int r = (6 < argc ? atoi(argv[6]) : 0), s = LIBXSMM_ABS(r); const libxsmm_blasint lower = (7 < argc ? atoi(argv[7]) : 0); libxsmm_blasint km = m, kn = n, kldi = ldi, kldo = (('o' == t || 'O' == t) ? ldo : ldi); int result = EXIT_SUCCESS, k; if (0 == strchr("oOiI", t)) { fprintf(stderr, "%s [] [] [] [] [] [random:0|nruns] [lbound]\n", argv[0]); exit(EXIT_FAILURE); } #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { const char *const env_tasks = getenv("TASKS"), *const env_check = getenv("CHECK"); const int tasks = (NULL == env_tasks || 0 == *env_tasks) ? 0/*default*/ : atoi(env_tasks); const int check = (NULL == env_check || 0 == *env_check) ? 1/*default*/ : atoi(env_check); ELEM_TYPE *const a = (ELEM_TYPE*)libxsmm_malloc((size_t)ldi * (size_t)(('o' == t || 'O' == t) ? n : ldo) * sizeof(ELEM_TYPE)); ELEM_TYPE *const b = (ELEM_TYPE*)libxsmm_malloc((size_t)ldo * (size_t)(('o' == t || 'O' == t) ? m : ldi) * sizeof(ELEM_TYPE)); libxsmm_timer_tickint start, duration = 0, duration2 = 0; libxsmm_blasint i; size_t size = 0; fprintf(stdout, "m=%lli n=%lli ldi=%lli ldo=%lli size=%.fMB (%s, %s)\n", (long long)m, (long long)n, (long long)ldi, (long long)ldo, 1.0 * (sizeof(ELEM_TYPE) * m * n) / (1ULL << 20), LIBXSMM_STRINGIFY(ELEM_TYPE), ('o' == t || 'O' == t) ? "out-of-place" : "in-place"); #if defined(_OPENMP) LIBXSMM_OMP_VAR(i); # pragma omp parallel for private(i) #endif for (i = 0; i < n; ++i) { libxsmm_blasint j; for (j = 0; j < m; ++j) { a[i*ldi+j] = initial_value(i, j, m); } } if (0 != check) { /* repeatable (reference) */ srand(RAND_SEED); } else { /* randomized selection */ srand(libxsmm_timer_tick() % ((unsigned int)-1)); } for (k = (0 == r ? 
-1 : 0); k < s && EXIT_SUCCESS == result; ++k) { if (0 < r) { const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0; km = randstart(LIBXSMM_ABS(lower), m); kldi = LIBXSMM_MAX(rldi, km); if ('o' == t || 'O' == t) { const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0; kn = randstart(LIBXSMM_ABS(lower), n); kldo = LIBXSMM_MAX(rldo, kn); /* trigger JIT-generated code */ OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); } else { #if 0 /* TODO: enable when in-place transpose is fully supported */ kn = randstart(LIBXSMM_ABS(lower), n); #else kn = km; #endif kldo = kldi; /* trigger JIT-generated code */ ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); } } size += (size_t)(sizeof(ELEM_TYPE) * km * kn); if ('o' == t || 'O' == t) { #if !defined(USE_REFERENCE) kldi = km; kldo = kn; #endif if (0 == tasks) { /* library-internal parallelization */ start = libxsmm_timer_tick(); #if defined(OTRANS_THREAD) # pragma omp parallel OTRANS_THREAD(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo, omp_get_thread_num(), omp_get_num_threads()); #else OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); #endif duration += libxsmm_timer_ncycles(start, libxsmm_timer_tick()); } else { /* external parallelization */ start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel # pragma omp single nowait #endif OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); duration += libxsmm_timer_ncycles(start, libxsmm_timer_tick()); } } else { assert(('i' == t || 'I' == t) && kldo == kldi); memcpy(b, a, (size_t)(sizeof(ELEM_TYPE) * kldi * kn)); if (2 > tasks) { /* library-internal parallelization */ start = libxsmm_timer_tick(); ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); duration += libxsmm_timer_ncycles(start, libxsmm_timer_tick()); } else { /* external parallelization */ start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel # pragma omp single #endif ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); duration += libxsmm_timer_ncycles(start, 
libxsmm_timer_tick()); } } if (0 != check) { /* check */ for (i = 0; i < km; ++i) { libxsmm_blasint j; for (j = 0; j < kn; ++j) { const ELEM_TYPE u = b[i*kldo+j]; const ELEM_TYPE v = a[j*kldi+i]; if (LIBXSMM_NEQ(u, v)) { i += km; /* leave outer loop as well */ result = EXIT_FAILURE; break; } } } } } if (0 < check) { /* check shall imply reference (performance-)test */ srand(RAND_SEED); /* reproduce the same sequence as above */ for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) { if (0 < r) { const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0; km = randstart(LIBXSMM_ABS(lower), m); kldi = LIBXSMM_MAX(rldi, km); if ('o' == t || 'O' == t) { const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0; kn = randstart(LIBXSMM_ABS(lower), n); kldo = LIBXSMM_MAX(rldo, kn); } else { #if 0 /* TODO: enable when in-place transpose is fully supported */ kn = randstart(LIBXSMM_ABS(lower), n); #else kn = km; #endif kldo = kldi; } } if ('o' == t || 'O' == t) { #if defined(USE_REFERENCE) start = libxsmm_timer_tick(); OTRANS_GOLD(&km, &kn, a, &kldi, b, &kldo); #else kldi = km; kldo = kn; start = libxsmm_timer_tick(); matrix_transpose(b, a, km, kn); #endif duration2 += libxsmm_timer_ncycles(start, libxsmm_timer_tick()); } else { assert(('i' == t || 'I' == t) && kldo == kldi); #if defined(USE_REFERENCE) memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE))); start = libxsmm_timer_tick(); ITRANS_GOLD(&km, &kn, b, &kldi, &kldo); duration2 += libxsmm_timer_ncycles(start, libxsmm_timer_tick()); #else fprintf(stderr, "Error: no validation routine available!\n"); result = EXIT_FAILURE; #endif } if (1 < check || 0 > check) { /* check */ for (i = 0; i < km; ++i) { libxsmm_blasint j; for (j = 0; j < kn; ++j) { const ELEM_TYPE u = b[i*kldo+j]; const ELEM_TYPE v = a[j*kldi+i]; if (LIBXSMM_NEQ(u, v)) { i += km; /* leave outer loop as well */ result = EXIT_FAILURE; break; } } } } } } if (EXIT_SUCCESS == result) { const double d = libxsmm_timer_duration(0, 
duration); if (0 < duration) { /* out-of-place transpose bandwidth assumes RFO */ fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size * ((('o' == t || 'O' == t)) ? 3 : 2) / (d * (1U << 30))); } if (0 == lower) { fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * (d / (0 == r ? (s + 1) : s))); } else { fprintf(stdout, "\tduration: %f ms\n", 1000.0 * d); } if (0 < duration2) { fprintf(stdout, "\treference: %.1fx\n", (1.0 * duration) / duration2); } } else if (0 != check) { /* check */ fprintf(stderr, "Error: validation failed for m=%lli, n=%lli, ldi=%lli, and ldo=%lli!\n", (long long)km, (long long)kn, (long long)kldi, (long long)kldo); } libxsmm_free(a); libxsmm_free(b); } return result; } libxsmm-1.17/samples/transpose/transpose.f000066400000000000000000000127611415223013700207710ustar00rootroot00000000000000!=======================================================================! ! Copyright (c) Intel Corporation - All rights reserved. ! ! This file is part of the LIBXSMM library. ! ! ! ! For information on the license, see the LICENSE file. ! ! Further information: https://github.com/hfp/libxsmm/ ! ! SPDX-License-Identifier: BSD-3-Clause ! !=======================================================================! ! Hans Pabst (Intel Corp.) !=======================================================================! 
PROGRAM transpose USE :: LIBXSMM, ONLY: LIBXSMM_BLASINT_KIND, & & libxsmm_timer_duration, & & libxsmm_timer_tick, & & libxsmm_otrans_omp, & & libxsmm_otrans, & & libxsmm_itrans, & & ptr => libxsmm_ptr IMPLICIT NONE INTEGER, PARAMETER :: T = KIND(0D0) INTEGER, PARAMETER :: S = 8 REAL(T), ALLOCATABLE, TARGET :: a1(:), b1(:) !DIR$ ATTRIBUTES ALIGN:64 :: a1, b1 INTEGER(LIBXSMM_BLASINT_KIND) :: m, n, ldi, ldo, i, j, k REAL(T), POINTER :: an(:,:), bn(:,:), bt(:,:) DOUBLE PRECISION :: duration INTEGER(8) :: nbytes, start INTEGER :: nrepeat REAL(T) :: diff CHARACTER(32) :: argv CHARACTER :: trans INTEGER :: argc argc = COMMAND_ARGUMENT_COUNT() IF (1 <= argc) THEN CALL GET_COMMAND_ARGUMENT(1, trans) ELSE trans = 'o' END IF IF (2 <= argc) THEN CALL GET_COMMAND_ARGUMENT(2, argv) READ(argv, "(I32)") m ELSE m = 4096 END IF IF (3 <= argc) THEN CALL GET_COMMAND_ARGUMENT(3, argv) READ(argv, "(I32)") n ELSE n = m END IF IF (4 <= argc) THEN CALL GET_COMMAND_ARGUMENT(4, argv) READ(argv, "(I32)") ldi ELSE ldi = m END IF IF (5 <= argc) THEN CALL GET_COMMAND_ARGUMENT(5, argv) READ(argv, "(I32)") ldo ELSE ldo = ldi END IF IF (6 <= argc) THEN CALL GET_COMMAND_ARGUMENT(6, argv) READ(argv, "(I32)") nrepeat ELSE nrepeat = 3 END IF nbytes = INT(m * n, 8) * T ! 
size in Byte WRITE(*, "(2(A,I0),2(A,I0),A,I0,A)") & & "m=", m, " n=", n, " ldi=", ldi, " ldo=", ldo, & & " size=", (nbytes / ISHFT(1, 20)), "MB" ALLOCATE(b1(ldo*MAX(m,n))) bn(1:ldo,1:n) => b1 bt(1:ldo,1:m) => b1 IF (('o'.EQ.trans).OR.('O'.EQ.trans)) THEN ALLOCATE(a1(ldi*n)) an(1:ldi,1:n) => a1 !$OMP PARALLEL DO PRIVATE(i, j) DEFAULT(NONE) SHARED(m, n, an) DO j = 1, n DO i = 1, m an(i,j) = initial_value(i - 1, j - 1, m) END DO END DO !$OMP END PARALLEL DO start = libxsmm_timer_tick() DO k = 1, nrepeat !CALL libxsmm_otrans_omp(ptr(b1), ptr(a1), S, m, n, ldi, ldo) !CALL libxsmm_otrans(ptr(b1), ptr(a1), S, m, n, ldi, ldo) !CALL libxsmm_otrans(bn, an, m, n, ldi, ldo) CALL libxsmm_otrans(b1, a1, m, n, ldi, ldo) END DO duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) DEALLOCATE(a1) ELSE ! in-place !$OMP PARALLEL DO PRIVATE(i, j) DEFAULT(NONE) SHARED(m, n, bn) DO j = 1, n DO i = 1, m bn(i,j) = initial_value(i - 1, j - 1, m) END DO END DO !$OMP END PARALLEL DO start = libxsmm_timer_tick() DO k = 1, nrepeat !CALL libxsmm_itrans(ptr(b1), S, m, n, ldi, ldo) !CALL libxsmm_itrans(bn, m, n, ldi) CALL libxsmm_itrans(b1, m, n, ldi) END DO duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) END IF diff = REAL(0, T) DO j = 1, n DO i = 1, m diff = MAX(diff, & & ABS(bt(j,i) - initial_value(i - 1, j - 1, m))) END DO END DO DEALLOCATE(b1) IF (0.GE.diff) THEN IF ((0.LT.duration).AND.(0.LT.nrepeat)) THEN ! out-of-place transpose bandwidth assumes RFO WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "bandwidth: ", & & REAL(nbytes, T) & & * MERGE(3D0, 2D0, ('o'.EQ.trans).OR.('O'.EQ.trans)) & & * REAL(nrepeat, T) / (duration * REAL(ISHFT(1_8, 30), T)),& & " GB/s" WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "duration: ", & & 1D3 * duration / REAL(nrepeat, T), & & " ms" END IF ELSE WRITE(*,*) "Validation failed!" 
STOP 1 END IF CONTAINS PURE REAL(T) FUNCTION initial_value(i, j, m) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: i, j, m initial_value = REAL(j * m + i, T) END FUNCTION END PROGRAM libxsmm-1.17/samples/transpose/transpose.sh000077500000000000000000000050141415223013700211520ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/transpose/transpose.vcxproj000066400000000000000000000544251415223013700222420ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 transpose {B65838AD-C64D-4F47-BBCD-A011223B9585} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true GenerateParallelCode 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true GenerateParallelCode 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) 
libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true GenerateParallelCode 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true GenerateParallelCode 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true GenerateParallelCode 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true GenerateParallelCode 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/transpose/transpose_opentuner.py000077500000000000000000000123111415223013700232650ustar00rootroot00000000000000#!/usr/bin/env python3 ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### # # This script is based on OpenTuner's tutorial: # "Optimizing Block Matrix Multiplication". 
# import opentuner from opentuner import ConfigurationManipulator from opentuner import IntegerParameter from opentuner import MeasurementInterface from opentuner import Result import json import time import sys import re class TransposeTune(MeasurementInterface): def manipulator(self): """ Define the search space by creating a ConfigurationManipulator """ self.mintilesize = 2 self.granularity = 1 assert(0 < self.granularity) minsize = max(self.mintilesize / self.granularity, 1) maxsize = minsize + self.granularity m_max = max(min(self.args.maxm, self.args.end), maxsize) n_max = max(min(self.args.maxn, self.args.end), maxsize) m_max = (m_max + self.granularity - 1) / self.granularity n_max = (n_max + self.granularity - 1) / self.granularity m_param = IntegerParameter("M", minsize, m_max) n_param = IntegerParameter("N", minsize, n_max) manipulator = ConfigurationManipulator() manipulator.add_parameter(m_param) manipulator.add_parameter(n_param) return manipulator def seed_configurations(self): m_seed = [self.args.n, self.args.m][0 != self.args.m] n_seed = [self.args.m, self.args.n][0 != self.args.n] if 0 == m_seed or 0 == n_seed: return [] else: return [{"M": max(m_seed, self.mintilesize), "N": max(n_seed, self.mintilesize)}] def objective(self): return opentuner.search.objective.MaximizeAccuracyMinimizeSize() def run(self, desired_result, input, limit): """ Compile and run a given configuration then return performance """ cfg = desired_result.configuration.data nruns = max(self.args.nruns, 1) begin = max(self.args.begin, self.mintilesize) end = max(self.args.end, self.mintilesize) run_cmd = ( "CHECK=-1" # repeatable runs " LIBXSMM_TCOPY_M=" + str(self.granularity * cfg["M"]) + " LIBXSMM_TCOPY_N=" + str(self.granularity * cfg["N"]) + " ./transpose.sh o" + " " + str(end) + " " + str(end) + " " + str(end) + " " + str(end) + " " + str(nruns) + " -" + str(begin)) run_result = self.call_program(run_cmd) if (0 == run_result["returncode"]): match = re.search( 
"\\s*duration:\\s+([0-9]+(\\.[0-9]*)*)", str(run_result["stdout"])) assert(match is not None) mseconds = float(match.group(1)) / nruns assert(0 < mseconds) frequency = 1000.0 / mseconds kernelsize = (self.granularity**2) * cfg["M"] * cfg["N"] return Result(time=mseconds, accuracy=frequency, size=kernelsize) else: sys.tracebacklimit = 0 raise RuntimeError("Execution failed for \"" + run_cmd + "\"!") def save_final_config(self, configuration): """ called at the end of tuning """ filename = ( "transpose-" + str(max(self.args.begin, 1)) + "_" + str(max(self.args.end, 1)) + "_" + str(max(self.args.nruns, 1)) + time.strftime("-%Y%m%d-%H%M%S") + ".json") print("Optimal block size written to " + filename + ": ", configuration.data) # self.manipulator().save_to_file(configuration.data, filename) with open(filename, 'w') as fd: json.dump(configuration.data, fd) if __name__ == "__main__": argparser = opentuner.default_argparser() argparser.add_argument( "begin", type=int, help="Begin of the range (min. M and N)") argparser.add_argument( "end", type=int, help="End of the range (max. M and N)") argparser.add_argument( "nruns", type=int, default=100, nargs='?', help="Number of experiments per epoch") argparser.add_argument( "m", type=int, default=0, nargs='?', help="Initial tile size (M)") argparser.add_argument( "n", type=int, default=0, nargs='?', help="Initial tile size (N)") argparser.add_argument( "maxm", type=int, default=160, nargs='?', help="Max. tile size (M)") argparser.add_argument( "maxn", type=int, default=160, nargs='?', help="Max. 
tile size (N)") TransposeTune.main(argparser.parse_args()) libxsmm-1.17/samples/utilities/000077500000000000000000000000001415223013700165725ustar00rootroot00000000000000libxsmm-1.17/samples/utilities/diff/000077500000000000000000000000001415223013700175025ustar00rootroot00000000000000libxsmm-1.17/samples/utilities/diff/Makefile000066400000000000000000000100741415223013700211440ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 0 OMP = 0 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) 
$(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) $(OUTDIR)/$(OUTNAME)f .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (,$(strip $(FC))) ifneq (0,$(shell echo "$$((3>$(XSMM_GCC) || 40600<=$(FC_VERSION_NUM)))")) $(OUTDIR)/$(OUTNAME)f: $(OUTDIR)/.make $(FTNOBJS) $(FORTDEP) $(LIBDEP) $(EXTDEP) $(FLD) -o $@ $(FTNOBJS) $(FORTLIB) $(EXTLIB) $(MAINLIB) $(FCMTFLAGS) $(SLDFLAGS) $(LDFLAGS) $(FLDFLAGS) $(ELDFLAGS) else .PHONY: $(OUTDIR)/$(OUTNAME)f endif else .PHONY: $(OUTDIR)/$(OUTNAME)f endif $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(OBJECTS) $(LIBDEP) $(LD) -o $@ $(OBJECTS) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf 
$(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/utilities/diff/diff.c000066400000000000000000000157571415223013700205750ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include #include #include int main(int argc, char* argv[]) { const int insize = (1 < argc ? atoi(argv[1]) : 0); const int incrmt = (2 < argc ? atoi(argv[2]) : 0); const int nelems = (3 < argc ? atoi(argv[3]) : 0); const int niters = (4 < argc ? atoi(argv[4]) : 7); const int elsize = (0 >= insize ? LIBXSMM_DESCRIPTOR_SIGSIZE : insize); const int stride = (0 >= incrmt ? LIBXSMM_MAX(LIBXSMM_DESCRIPTOR_MAXSIZE, elsize) : LIBXSMM_MAX(incrmt, elsize)); const size_t n = (0 >= nelems ? (((size_t)2 << 30/*2 GB*/) / stride) : ((size_t)nelems)); const char *const env_strided = getenv("STRIDED"), *const env_check = getenv("CHECK"); const int strided = (NULL == env_strided || 0 == *env_strided) ? 0/*default*/ : atoi(env_strided); const int check = (NULL == env_check || 0 == *env_check) ? 
0/*default*/ : atoi(env_check); double d0, d1 = 0, d2 = 0, d3 = 0; size_t nbytes, size, nrpt, i; int result = EXIT_SUCCESS; unsigned char *a, *b; LIBXSMM_ASSERT(elsize <= stride); if (0 < niters) { size = n; nrpt = niters; } else { size = LIBXSMM_MAX(LIBXSMM_ABS(niters), 1); nrpt = n; } nbytes = size * stride; libxsmm_init(); a = (unsigned char*)(0 != nbytes ? malloc(nbytes) : NULL); b = (unsigned char*)(0 != nbytes ? malloc(nbytes) : NULL); if (NULL != a && NULL != b) { size_t diff = 0, j; for (i = 0; i < nrpt; ++i) { printf("-------------------------------------------------\n"); /* initialize the data */ libxsmm_rng_seq(a, (libxsmm_blasint)nbytes); memcpy(b, a, nbytes); /* same content */ /* benchmark libxsmm_diff (always strided) */ if (elsize < 256) { const libxsmm_timer_tickint start = libxsmm_timer_tick(); for (j = 0; j < nbytes; j += stride) { const void *const aj = a + j, *const bj = b + j; diff += libxsmm_diff(aj, bj, (unsigned char)elsize); } d0 = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < d0) printf("libxsmm_diff:\t\t%.8f s (%i MB/s)\n", d0, (int)LIBXSMM_ROUND((2.0 * nbytes) / ((1024.0 * 1024.0) * d0))); result += (int)diff * ((int)stride / ((int)stride + 1)); /* ignore result */ d1 += d0; } { /* benchmark libxsmm_memcmp */ libxsmm_timer_tickint start; /* reinitialize the data (flush caches) */ libxsmm_rng_seq(a, (libxsmm_blasint)nbytes); memcpy(b, a, nbytes); /* same content */ start = libxsmm_timer_tick(); if (stride == elsize && 0 == strided) { diff += libxsmm_memcmp(a, b, nbytes); } else { for (j = 0; j < nbytes; j += stride) { const void *const aj = a + j, *const bj = b + j; diff += libxsmm_memcmp(aj, bj, elsize); } } d0 = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < d0) printf("libxsmm_memcmp:\t\t%.8f s (%i MB/s)\n", d0, (int)LIBXSMM_ROUND((2.0 * nbytes) / ((1024.0 * 1024.0) * d0))); result += (int)diff * ((int)stride / ((int)stride + 1)); /* ignore result */ d2 += d0; } { /* benchmark stdlib's memcmp */ 
libxsmm_timer_tickint start; /* reinitialize the data (flush caches) */ libxsmm_rng_seq(a, (libxsmm_blasint)nbytes); memcpy(b, a, nbytes); /* same content */ start = libxsmm_timer_tick(); if (stride == elsize && 0 == strided) { diff += (0 != memcmp(a, b, nbytes)); } else { for (j = 0; j < nbytes; j += stride) { const void *const aj = a + j, *const bj = b + j; #if defined(_MSC_VER) # pragma warning(push) # pragma warning(disable: 6385) #endif diff += (0 != memcmp(aj, bj, elsize)); #if defined(_MSC_VER) # pragma warning(pop) #endif } } d0 = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < d0) printf("stdlib memcmp:\t\t%.8f s (%i MB/s)\n", d0, (int)LIBXSMM_ROUND((2.0 * nbytes) / ((1024.0 * 1024.0) * d0))); result += (int)diff * ((int)stride / ((int)stride + 1)); /* ignore result */ d3 += d0; } } if (1 < nrpt) { printf("-------------------------------------------------\n"); printf("Arithmetic average of %llu iterations\n", (unsigned long long)nrpt); printf("-------------------------------------------------\n"); d1 /= nrpt; d2 /= nrpt; d3 /= nrpt; if (0 < d1) printf("libxsmm_diff:\t\t%.8f s (%i MB/s)\n", d1, (int)LIBXSMM_ROUND((2.0 * nbytes) / ((1024.0 * 1024.0) * d1))); if (0 < d2) printf("libxsmm_memcmp:\t\t%.8f s (%i MB/s)\n", d2, (int)LIBXSMM_ROUND((2.0 * nbytes) / ((1024.0 * 1024.0) * d2))); if (0 < d3) printf("stdlib memcmp:\t\t%.8f s (%i MB/s)\n", d3, (int)LIBXSMM_ROUND((2.0 * nbytes) / ((1024.0 * 1024.0) * d3))); } if (0 < nrpt) { printf("-------------------------------------------------\n"); } if (0 != check) { /* validation */ size_t k; for (i = 0; i < nrpt; ++i) { for (j = 0; j < nbytes; j += stride) { unsigned char *const aj = a + j, *const bj = b + j; for (k = 0; k < 2; ++k) { const int r = rand() % elsize; #if defined(_MSC_VER) # pragma warning(push) # pragma warning(disable: 6385) #endif if (0 != memcmp(aj, bj, elsize)) { if (elsize < 256 && 0 == libxsmm_diff(aj, bj, (unsigned char)elsize)) ++diff; if (0 == libxsmm_memcmp(aj, bj, elsize)) 
++diff; } else { if (elsize < 256 && 0 != libxsmm_diff(aj, bj, (unsigned char)elsize)) ++diff; if (0 != libxsmm_memcmp(aj, bj, elsize)) ++diff; } #if defined(_MSC_VER) # pragma warning(pop) #endif /* inject difference into a or b */ if (0 != (rand() & 1)) { aj[r] = (unsigned char)(rand() % 256); } else { bj[r] = (unsigned char)(rand() % 256); } } } } if (0 != diff) { fprintf(stderr, "ERROR: errors=%i - validation failed!\n", (int)diff); result = EXIT_FAILURE; } } } else { result = EXIT_FAILURE; } free(a); free(b); return result; } libxsmm-1.17/samples/utilities/diff/diff.f000066400000000000000000000103431415223013700205620ustar00rootroot00000000000000!=======================================================================! ! Copyright (c) Intel Corporation - All rights reserved. ! ! This file is part of the LIBXSMM library. ! ! ! ! For information on the license, see the LICENSE file. ! ! Further information: https://github.com/hfp/libxsmm/ ! ! SPDX-License-Identifier: BSD-3-Clause ! !=======================================================================! ! Hans Pabst (Intel Corp.) !=======================================================================! PROGRAM diff USE :: LIBXSMM, ONLY: LIBXSMM_TICKINT_KIND, & & libxsmm_timer_duration, & & libxsmm_timer_tick, & & libxsmm_init, & & libxsmm_diff IMPLICIT NONE INTEGER, PARAMETER :: W = 34 INTEGER, PARAMETER :: T = 4 INTEGER(T), ALLOCATABLE, TARGET :: a(:), b(:) !DIR$ ATTRIBUTES ALIGN:64 :: a, b INTEGER(LIBXSMM_TICKINT_KIND) :: start DOUBLE PRECISION :: duration(3), d INTEGER :: i, n, nrepeat INTEGER(8) :: nbytes CHARACTER(32) :: argv INTEGER :: argc argc = COMMAND_ARGUMENT_COUNT() IF (1 <= argc) THEN CALL GET_COMMAND_ARGUMENT(1, argv) READ(argv, "(I32)") n ELSE n = 0 END IF IF (2 <= argc) THEN CALL GET_COMMAND_ARGUMENT(2, argv) READ(argv, "(I32)") nrepeat ELSE nrepeat = 5 END IF duration = 0D0 n = MERGE(n, ISHFT(ISHFT(2, 20) / T, 10), 0 < n) ! 
2 GB by default nbytes = INT(n, 8) * T WRITE(*, "(A,I0,A,I0,A,A,I0,A)") & & "nelements=", n, " typesize=", T, "Byte", & & " size=", nbytes / ISHFT(1, 20), "MB" CALL libxsmm_init() ALLOCATE(a(n), b(n)) DO i = 1, n a(i) = i - 1 b(i) = i - 1 END DO WRITE(*, "(A)") REPEAT("-", W) DO i = 1, nrepeat start = libxsmm_timer_tick() IF (.NOT. libxsmm_diff(a, b)) THEN d = libxsmm_timer_duration(start, libxsmm_timer_tick()) duration(1) = duration(1) + d WRITE(*, "(A,F10.1,A)") "DIFF (LIBXSMM):", 1D3 * d, " ms" ELSE WRITE(*, "(A)") "Validation failed!" END IF start = libxsmm_timer_tick() IF (ALL(a .EQ. b)) THEN d = libxsmm_timer_duration(start, libxsmm_timer_tick()) duration(2) = duration(2) + d WRITE(*, "(A,F10.1,A)") "ALL (Fortran):", 1D3 * d, " ms" ELSE WRITE(*, "(A)") "Validation failed!" END IF start = libxsmm_timer_tick() IF (.NOT. ANY(a .NE. b)) THEN d = libxsmm_timer_duration(start, libxsmm_timer_tick()) duration(3) = duration(3) + d WRITE(*, "(A,F10.1,A)") "ANY (Fortran):", 1D3 * d, " ms" ELSE WRITE(*, "(A)") "Validation failed!" END IF WRITE(*, "(A)") REPEAT("-", W) END DO IF (ALL(0 .LT. duration)) THEN WRITE(*, "(A,I0,A)") "Arithmetic average of ", & & nrepeat, " iterations" WRITE(*, "(A)") REPEAT("-", W) WRITE(*, "(A,F10.1,A)") "DIFF (LIBXSMM):", & & REAL(nbytes, 8) * REAL(nrepeat, 8) / & & (duration(1) * REAL(ISHFT(1, 20), 8)), " MB/s" WRITE(*, "(A,F10.1,A)") "ALL (Fortran):", & & REAL(nbytes, 8) * REAL(nrepeat, 8) / & & (duration(2) * REAL(ISHFT(1, 20), 8)), " MB/s" WRITE(*, "(A,F10.1,A)") "ANY (Fortran):", & & REAL(nbytes, 8) * REAL(nrepeat, 8) / & & (duration(3) * REAL(ISHFT(1, 20), 8)), " MB/s" WRITE(*, "(A)") REPEAT("-", W) END IF DEALLOCATE(a, b) END PROGRAM libxsmm-1.17/samples/utilities/diff/diff.sh000077500000000000000000000050141415223013700207510ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. 
# # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e 
"KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/diff/diff.vcxproj000066400000000000000000000542341415223013700220370ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 diff {7E730036-A678-4529-83F3-DFFB68CA278E} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) 
libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 ProgramDatabase None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 ProgramDatabase None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/utilities/dispatch/000077500000000000000000000000001415223013700203715ustar00rootroot00000000000000libxsmm-1.17/samples/utilities/dispatch/Makefile000066400000000000000000000104151415223013700220320ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . 
CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 0 SYM = 1 OMP = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) $(OUTDIR)/$(OUTNAME)f $(OUTDIR)/$(OUTNAME)f_udt .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (,$(strip $(FC))) $(OUTDIR)/$(OUTNAME)f: $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)-f.o $(FORTDEP) $(LIBDEP) 
$(EXTDEP) $(FLD) -o $@ $(BLDDIR)/$(OUTNAME)-f.o $(FORTLIB) $(MAINLIB) \ $(FCMTFLAGS) $(SLDFLAGS) $(LDFLAGS) $(FLDFLAGS) $(ELDFLAGS) $(OUTDIR)/$(OUTNAME)f_udt: $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)_udt-f.o $(FORTDEP) $(LIBDEP) $(EXTDEP) $(FLD) -o $@ $(BLDDIR)/$(OUTNAME)_udt-f.o $(FORTLIB) $(MAINLIB) \ $(FCMTFLAGS) $(SLDFLAGS) $(LDFLAGS) $(FLDFLAGS) $(ELDFLAGS) else .PHONY: $(OUTDIR)/$(OUTNAME)f .PHONY: $(OUTDIR)/$(OUTNAME)f_udt endif $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(OBJECTS) $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(OBJECTS) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call 
qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/utilities/dispatch/README.md000066400000000000000000000041231415223013700216500ustar00rootroot00000000000000# Dispatch ## Microbenchmark This code sample benchmarks the performance of (1) the dispatch mechanism, and (2) the time needed to JIT-generate code for the first time. Both mechanisms are relevant when replacing GEMM calls (see [Call Wrapper](https://libxsmm.readthedocs.io/libxsmm_mm/#call-wrapper) section of the reference documentation), or in any case of calling LIBXSMM's native [GEMM functionality](https://libxsmm.readthedocs.io/libxsmm_mm/). **Command Line Interface (CLI)** * Optionally takes the number of dispatches/code-generations (default: 10000). * Optionally takes the number of threads (default: 1). **Measurements (Benchmark)** * Duration of an empty function call (serves as a reference timing). * Duration to find an already generated kernel (cached/non-cached). * Duration to JIT-generate a GEMM kernel. In case of a multi-threaded benchmark, the timings represent a highly contended request (worst case). For thread-scaling, it can be observed that read-only accesses (code dispatch) stay roughly with a constant duration whereas write-accesses (code generation) are serialized and hence the duration scales linearly with the number of threads. The [Fortran example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/dispatch/dispatch.f) (`dispatch.f`) could use `libxsmm_dmmdispatch` (or similar) like the C code (`dispatch.c`) but intentionally shows the lower-level dispatch interface `libxsmm_xmmdispatch` and also omits using the LIBXSMM module. Not using the module confirms: the same task can be achieved by relying only on FORTRAN 77 language level. 
## User-Data Dispatch Further, another [Fortran example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/dispatch/dispatch_udt.f) about [user-data dispatch](https://libxsmm.readthedocs.io/libxsmm_aux/#user-data-dispatch) is not exactly a benchmark. Dispatching user-data containing multiple kernels can obviously save multiple singular dispatches. The C interface for dispatching user-data is designed to follow the same flow as the Fortran interface. libxsmm-1.17/samples/utilities/dispatch/dispatch.c000066400000000000000000000341121415223013700223350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #include #if defined(_OPENMP) # include #endif #if defined(__MKL) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if !defined(MKLJIT) && defined(mkl_jit_create_dgemm) && \ !defined(_WIN32) /* check this manually under Windows */ # define MKLJIT #endif #if (!defined(INTEL_MKL_VERSION) || (20190003 <= INTEL_MKL_VERSION)) && \ !defined(_WIN32) /* TODO: Windows calling convention */ # define CHECK #endif #if !defined(MAXSIZE) # define MAXSIZE LIBXSMM_MAX_M #endif typedef struct triplet { libxsmm_blasint m, n, k; } triplet; LIBXSMM_INLINE void unique(triplet* mnk, int* size) { if (NULL != mnk && NULL != size && 0 < *size) { triplet *const first = mnk, *last = mnk + ((size_t)*size - 1), *i; for (i = mnk + 1; mnk < last; ++mnk, i = mnk + 1) { while (i <= last) { if (i->m != mnk->m || i->n != mnk->n || i->k != mnk->k) { i++; /* skip */ } else { /* copy */ *i = *last--; } } } *size = (int)(last - first + 1); } } /** * This (micro-)benchmark measures the duration needed to dispatch a kernel. * Various durations are measured: time to generate the code, to dispatch * from cache, and to dispatch from the entire database. The large total * number of kernels may also stress the in-memory database. * When building with "make MKL=1", the benchmark exercises JIT capability of * Intel MKL. However, the measured "dispatch" durations cannot be compared * with LIBXSMM because MKL's JIT-interface does not provide a function to * query a kernel for a set of GEMM-arguments. The implicit JIT-dispatch * on the other hand does not expose the time to query the kernel. 
*/ int main(int argc, char* argv[]) { #if defined(_OPENMP) const int max_nthreads = omp_get_max_threads(); #else const int max_nthreads = 1; #endif const int default_minsize = 4; #if !defined(INTEL_MKL_VERSION) || (20190003 <= INTEL_MKL_VERSION) const int default_maxsize = MAXSIZE; #else const int default_maxsize = 16; #endif const int default_multiple = 1; int size_total = LIBXSMM_MAX((1 < argc && 0 < atoi(argv[1])) ? atoi(argv[1]) : 10000/*default*/, 2); const int size_local = LIBXSMM_CLMP((2 < argc && 0 < atoi(argv[2])) ? atoi(argv[2]) : 4/*default*/, 1, size_total); const int nthreads = LIBXSMM_CLMP(3 < argc ? atoi(argv[3]) : 1/*default*/, 1, max_nthreads); const int nrepeat = LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : 1/*default*/, 1); const libxsmm_blasint multiple = LIBXSMM_MAX((5 < argc && 0 < atoi(argv[5])) ? atoi(argv[5]) : default_multiple, 1); const libxsmm_blasint maxsize = LIBXSMM_CLMP((6 < argc && 0 < atoi(argv[6])) ? atoi(argv[6]) : default_maxsize, 1, MAXSIZE); const libxsmm_blasint minsize = LIBXSMM_CLMP((7 < argc && 0 < atoi(argv[7])) ? atoi(argv[7]) : default_minsize, 1, maxsize); const libxsmm_blasint range = maxsize - minsize + 1; libxsmm_timer_tickint start, tcall, tcgen, tdsp0 = 0, tdsp1 = 0; int result = EXIT_SUCCESS; #if 0 != LIBXSMM_JIT if (LIBXSMM_X86_SSE3 > libxsmm_get_target_archid()) { fprintf(stderr, "\n\tWarning: JIT support is not available at runtime!\n"); } #else fprintf(stderr, "\n\tWarning: JIT support has been disabled at build time!\n"); #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { triplet *const rnd = (triplet*)(0 < size_total ? 
malloc(sizeof(triplet) * size_total) : NULL); const size_t shuffle = libxsmm_shuffle(size_total); const double alpha = 1, beta = 1; int i, n; #if defined(MKLJIT) void* *const jitter = malloc(size_total * sizeof(void*)); if (NULL == jitter) exit(EXIT_FAILURE); #else const int prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const int flags = LIBXSMM_GEMM_FLAG_NONE; #endif if (NULL == rnd) exit(EXIT_FAILURE); /* generate set of random numbers outside of any parallel region */ for (i = 0; i < size_total; ++i) { const int r1 = rand(), r2 = rand(), r3 = rand(); rnd[i].m = (1 < range ? (LIBXSMM_MOD(r1, range) + minsize) : minsize); rnd[i].n = (1 < range ? (LIBXSMM_MOD(r2, range) + minsize) : minsize); rnd[i].k = (1 < range ? (LIBXSMM_MOD(r3, range) + minsize) : minsize); if (1 != multiple) { rnd[i].m = LIBXSMM_MAX((rnd[i].m / multiple) * multiple, minsize); rnd[i].n = LIBXSMM_MAX((rnd[i].n / multiple) * multiple, minsize); rnd[i].k = LIBXSMM_MAX((rnd[i].k / multiple) * multiple, minsize); } #if defined(MKLJIT) jitter[i] = NULL; #endif } unique(rnd, &size_total); printf("Dispatching total=%i and local=%i kernels using %i thread%s...", size_total, size_local, 1 >= nthreads ? 1 : nthreads, 1 >= nthreads ? 
"" : "s"); /* first invocation may initialize some internals */ libxsmm_init(); /* subsequent calls are not doing any work */ start = libxsmm_timer_tick(); for (n = 0; n < nrepeat; ++n) { for (i = 0; i < size_total; ++i) { /* measure call overhead of an "empty" function (not inlined) */ libxsmm_init(); } } tcall = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); /* trigger code generation to subsequently measure only dispatch time */ start = libxsmm_timer_tick(); for (i = 0; i < size_local; ++i) { #if defined(MKLJIT) LIBXSMM_EXPECT(MKL_JIT_SUCCESS, mkl_cblas_jit_create_dgemm(jitter + i, MKL_COL_MAJOR, MKL_NOTRANS/*transa*/, MKL_NOTRANS/*transb*/, rnd[i].m, rnd[i].n, rnd[i].k, alpha, rnd[i].m, rnd[i].k, beta, rnd[i].m)); mkl_jit_get_dgemm_ptr(jitter[i]); /* to include lookup time */ #else libxsmm_dmmdispatch(rnd[i].m, rnd[i].n, rnd[i].k, &rnd[i].m, &rnd[i].k, &rnd[i].m, &alpha, &beta, &flags, &prefetch); #endif } tcgen = libxsmm_timer_ncycles(start, libxsmm_timer_tick()); /* measure duration for dispatching (cached) kernel; MKL: no "dispatch" just unwrapping the jitter */ #if defined(_OPENMP) if (1 < nthreads) { for (n = 0; n < nrepeat; ++n) { # pragma omp parallel num_threads(nthreads) private(i) { # pragma omp master start = libxsmm_timer_tick(); # pragma omp for for (i = 0; i < size_total; ++i) { const int j = LIBXSMM_MOD(i, size_local); #if defined(MKLJIT) mkl_jit_get_dgemm_ptr(jitter[j]); #else libxsmm_dmmdispatch(rnd[j].m, rnd[j].n, rnd[j].k, &rnd[j].m, &rnd[j].k, &rnd[j].m, &alpha, &beta, &flags, &prefetch); #endif } # pragma omp master tdsp1 += libxsmm_timer_ncycles(start, libxsmm_timer_tick()); } } } else #endif { for (n = 0; n < nrepeat; ++n) { start = libxsmm_timer_tick(); for (i = 0; i < size_total; ++i) { const int j = LIBXSMM_MOD(i, size_local); #if defined(MKLJIT) mkl_jit_get_dgemm_ptr(jitter[j]); #else libxsmm_dmmdispatch(rnd[j].m, rnd[j].n, rnd[j].k, &rnd[j].m, &rnd[j].k, &rnd[j].m, &alpha, &beta, &flags, &prefetch); #endif } tdsp1 += 
libxsmm_timer_ncycles(start, libxsmm_timer_tick()); } } /* measure duration for code-generation */ #if defined(_OPENMP) if (1 < nthreads) { # pragma omp parallel num_threads(nthreads) private(i) { # pragma omp master start = libxsmm_timer_tick(); # pragma omp for for (i = size_local; i < size_total; ++i) { #if defined(MKLJIT) LIBXSMM_EXPECT(MKL_JIT_SUCCESS, mkl_cblas_jit_create_dgemm(jitter + i, MKL_COL_MAJOR, MKL_NOTRANS/*transa*/, MKL_NOTRANS/*transb*/, rnd[i].m, rnd[i].n, rnd[i].k, alpha, rnd[i].m, rnd[i].k, beta, rnd[i].m)); mkl_jit_get_dgemm_ptr(jitter[i]); #else libxsmm_dmmdispatch(rnd[i].m, rnd[i].n, rnd[i].k, &rnd[i].m, &rnd[i].k, &rnd[i].m, &alpha, &beta, &flags, &prefetch); #endif } # pragma omp master tcgen += libxsmm_timer_ncycles(start, libxsmm_timer_tick()); } } else #endif { start = libxsmm_timer_tick(); for (i = size_local; i < size_total; ++i) { #if defined(MKLJIT) LIBXSMM_EXPECT(MKL_JIT_SUCCESS, mkl_cblas_jit_create_dgemm(jitter + i, MKL_COL_MAJOR, MKL_NOTRANS/*transa*/, MKL_NOTRANS/*transb*/, rnd[i].m, rnd[i].n, rnd[i].k, alpha, rnd[i].m, rnd[i].k, beta, rnd[i].m)); mkl_jit_get_dgemm_ptr(jitter[i]); #else libxsmm_dmmdispatch(rnd[i].m, rnd[i].n, rnd[i].k, &rnd[i].m, &rnd[i].k, &rnd[i].m, &alpha, &beta, &flags, &prefetch); #endif } tcgen += libxsmm_timer_ncycles(start, libxsmm_timer_tick()); } /* measure dispatching previously generated kernel (likely non-cached) */ #if defined(_OPENMP) if (1 < nthreads) { for (n = 0; n < nrepeat; ++n) { # pragma omp parallel num_threads(nthreads) private(i) { # pragma omp master start = libxsmm_timer_tick(); # pragma omp for for (i = 0; i < size_total; ++i) { const int j = (int)LIBXSMM_MOD(shuffle * i, size_total); #if defined(MKLJIT) mkl_jit_get_dgemm_ptr(jitter[j]); #else libxsmm_dmmdispatch(rnd[j].m, rnd[j].n, rnd[j].k, &rnd[j].m, &rnd[j].k, &rnd[j].m, &alpha, &beta, &flags, &prefetch); #endif } # pragma omp master tdsp0 += libxsmm_timer_ncycles(start, libxsmm_timer_tick()); } } } else #endif { for (n = 0; n < 
nrepeat; ++n) { start = libxsmm_timer_tick(); for (i = 0; i < size_total; ++i) { const int j = (int)LIBXSMM_MOD(shuffle * i, size_total); #if defined(MKLJIT) mkl_jit_get_dgemm_ptr(jitter[j]); #else libxsmm_dmmdispatch(rnd[j].m, rnd[j].n, rnd[j].k, &rnd[j].m, &rnd[j].k, &rnd[j].m, &alpha, &beta, &flags, &prefetch); #endif } tdsp0 += libxsmm_timer_ncycles(start, libxsmm_timer_tick()); } } #if defined(CHECK) { /* calculate l1-norm for manual validation */ double a[LIBXSMM_MAX_M*LIBXSMM_MAX_M]; double b[LIBXSMM_MAX_M*LIBXSMM_MAX_M]; double c[LIBXSMM_MAX_M*LIBXSMM_MAX_M]; libxsmm_matdiff_info check; libxsmm_matdiff_clear(&check); LIBXSMM_MATINIT(double, 0, a, maxsize, maxsize, maxsize, 1.0); LIBXSMM_MATINIT(double, 0, b, maxsize, maxsize, maxsize, 1.0); LIBXSMM_MATINIT(double, 0, c, maxsize, maxsize, maxsize, 1.0); for (i = 0; i < size_total; ++i) { const int j = (int)LIBXSMM_MOD(shuffle * i, size_total); libxsmm_matdiff_info diff; # if defined(MKLJIT) const dgemm_jit_kernel_t kernel = mkl_jit_get_dgemm_ptr(jitter[j]); # else const libxsmm_dmmfunction kernel = libxsmm_dmmdispatch(rnd[j].m, rnd[j].n, rnd[j].k, &rnd[j].m, &rnd[j].k, &rnd[j].m, &alpha, &beta, &flags, &prefetch); # endif if (NULL != kernel) { # if defined(MKLJIT) kernel(jitter[j], a, b, c); # else if (LIBXSMM_GEMM_PREFETCH_NONE == prefetch) kernel(a, b, c); else kernel(a, b, c, a, b, c); # endif result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(double), rnd[j].m, rnd[j].n, NULL, c, &rnd[j].m, &rnd[j].m); } else { result = EXIT_FAILURE; } if (EXIT_SUCCESS == result) { libxsmm_matdiff_reduce(&check, &diff); } else { printf(" m=%u n=%u k=%u kernel=%" PRIuPTR, (unsigned int)rnd[j].m, (unsigned int)rnd[j].n, (unsigned int)rnd[j].k, (uintptr_t)kernel); i = size_total + 1; /* break */ } } if (i <= size_total) { printf(" check=%f\n", check.l1_tst); } else { printf(" <- ERROR!\n"); } } #else printf("\n"); #endif /*defined(CHECK)*/ free(rnd); /* release random numbers */ #if defined(MKLJIT) /* release dispatched code 
*/ for (i = 0; i < size_total; ++i) mkl_jit_destroy(jitter[i]); free(jitter); /* release array used to store dispatched code */ #endif } tcall = (tcall + (size_t)size_total * nrepeat - 1) / ((size_t)size_total * nrepeat); tdsp0 = (tdsp0 + (size_t)size_total * nrepeat - 1) / ((size_t)size_total * nrepeat); tdsp1 = (tdsp1 + (size_t)size_total * nrepeat - 1) / ((size_t)size_total * nrepeat); tcgen = LIBXSMM_UPDIV(tcgen, size_total); if (0 < tcall && 0 < tdsp0 && 0 < tdsp1 && 0 < tcgen) { const double tcall_ns = 1E9 * libxsmm_timer_duration(0, tcall), tcgen_ns = 1E9 * libxsmm_timer_duration(0, tcgen); const double tdsp0_ns = 1E9 * libxsmm_timer_duration(0, tdsp0), tdsp1_ns = 1E9 * libxsmm_timer_duration(0, tdsp1); printf("\tfunction-call (false): %.0f ns (call/s %.0f MHz, %" PRIuPTR " cycles)\n", tcall_ns, 1E3 / tcall_ns, (uintptr_t)libxsmm_timer_ncycles(0, tcall)); printf("\tdispatch (ro/cached): %.0f ns (call/s %.0f MHz, %" PRIuPTR " cycles)\n", tdsp1_ns, 1E3 / tdsp1_ns, (uintptr_t)libxsmm_timer_ncycles(0, tdsp1)); printf("\tdispatch (ro): %.0f ns (call/s %.0f MHz, %" PRIuPTR " cycles)\n", tdsp0_ns, 1E3 / tdsp0_ns, (uintptr_t)libxsmm_timer_ncycles(0, tdsp0)); if (1E6 < tcgen_ns) { printf("\tcode-gen (rw): %.0f ms (call/s %.0f Hz)\n", 1E-6 * tcgen_ns, 1E9 / tcgen_ns); } else if (1E3 < tcgen_ns) { printf("\tcode-gen (rw): %.0f us (call/s %.0f kHz)\n", 1E-3 * tcgen_ns, 1E6 / tcgen_ns); } else { printf("\tcode-gen (rw): %.0f ns (call/s %.0f MHz)\n", tcgen_ns, 1E3 / tcgen_ns); } } printf("Finished\n"); return result; } libxsmm-1.17/samples/utilities/dispatch/dispatch.f000066400000000000000000000066371415223013700223530ustar00rootroot00000000000000!=======================================================================! ! Copyright (c) Intel Corporation - All rights reserved. ! ! This file is part of the LIBXSMM library. ! ! ! ! For information on the license, see the LICENSE file. ! ! Further information: https://github.com/hfp/libxsmm/ ! ! 
SPDX-License-Identifier: BSD-3-Clause ! !=======================================================================! ! Hans Pabst (Intel Corp.) !=======================================================================! ! This (micro-)benchmark is a simplified variant of the C implementation; ! the main point of dispatch.f is to show compatibility with FORTRAN 77. ! NOTE: CPU_TIME is a Fortran 96 intrinsic, libxsmm_xmmdispatch must be ! called with all arguments when relying on FORTRAN 77. ! ! IMPORTANT: please use the type-safe F2003 interface (libxsmm.f or module) ! unless FORTRAN 77 compatibility is really needed! ! PROGRAM dispatch !USE :: LIBXSMM !IMPLICIT NONE INTEGER, PARAMETER :: PRECISION = 0 ! LIBXSMM_GEMM_PRECISION_F64 INTEGER, PARAMETER :: M = 23, N = 23, K = 23 INTEGER, PARAMETER :: LDA = M, LDB = K, LDC = M DOUBLE PRECISION, PARAMETER :: Alpha = 1D0 DOUBLE PRECISION, PARAMETER :: Beta = 1D0 INTEGER, PARAMETER :: Flags = 0 INTEGER, PARAMETER :: Prefetch = 0 DOUBLE PRECISION :: start, dcall, ddisp INTEGER :: i, size = 10000000 ! Can be called using: ! - libxsmm_xmmcall_abc(function, a, b, c) ! - libxsmm_xmmcall[_prf](function, a, b, c, pa, pb, pc) INTEGER(8) :: function WRITE(*, "(A,I0,A)") "Dispatching ", size," calls..." ! run non-inline function to measure call overhead of an "empty" function ! subsequent calls (see above) of libxsmm_init are not doing any work ! CALL CPU_TIME(start) DO i = 1, size CALL libxsmm_init() END DO CALL CPU_TIME(dcall) dcall = dcall - start ! first invocation may initialize some internals (libxsmm_init), ! or actually generate code (code gen. time is out of scope) ! NOTE: libxsmm_xmmdispatch must be called with all arguments ! when relying on FORTRAN 77. ! CALL libxsmm_xmmdispatch(function, PRECISION, M, N, K, & & LDA, LDB, LDC, Alpha, Beta, Flags, Prefetch) CALL CPU_TIME(start) DO i = 1, size ! NOTE: libxsmm_xmmdispatch must be called with all arguments ! when relying on FORTRAN 77. 
CALL libxsmm_xmmdispatch(function, PRECISION, M, N, K, & & LDA, LDB, LDC, Alpha, Beta, Flags, Prefetch) END DO CALL CPU_TIME(ddisp) ddisp = ddisp - start IF ((0.LT.dcall).AND.(0.LT.ddisp)) THEN WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "dispatch calls/s: ", & & (1D-6 * REAL(size, 8) / ddisp), " MHz" WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "empty calls/s: ", & & (1D-6 * REAL(size, 8) / dcall), " MHz" WRITE(*, "(1A,A,F10.1,A)") CHAR(9), "overhead: ", & & (ddisp / dcall), "x" END IF WRITE(*, "(A)") "Finished" END PROGRAM libxsmm-1.17/samples/utilities/dispatch/dispatch.sh000077500000000000000000000050141415223013700225270ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/dispatch/dispatch.vcxproj000066400000000000000000000541321415223013700236120ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 dispatch {26F3F0AA-6011-439F-829E-CBEF072B8A4E} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true 
Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/utilities/dispatch/dispatch_udt.f000066400000000000000000000072121415223013700232150ustar00rootroot00000000000000!=======================================================================! ! Copyright (c) Intel Corporation - All rights reserved. ! ! This file is part of the LIBXSMM library. ! ! ! ! For information on the license, see the LICENSE file. ! ! Further information: https://github.com/hfp/libxsmm/ ! ! SPDX-License-Identifier: BSD-3-Clause ! !=======================================================================! ! Hans Pabst (Intel Corp.) !=======================================================================! PROGRAM dispatch_udt USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_LOC, & & C_ASSOCIATED, & & C_F_POINTER USE :: LIBXSMM, ONLY: LIBXSMM_BLASINT_KIND, & & LIBXSMM_MMFUNCTION => LIBXSMM_DMMFUNCTION,& & libxsmm_mmdispatch => libxsmm_dmmdispatch,& & libxsmm_mmcall => libxsmm_dmmcall, & & libxsmm_xregister, libxsmm_xdispatch IMPLICIT NONE INTEGER, PARAMETER :: T = KIND(0D0) INTEGER :: batchsize = 1000, i INTEGER(LIBXSMM_BLASINT_KIND) :: j, ki, nrepeat = 100 INTEGER(LIBXSMM_BLASINT_KIND) :: m = 13, n = 5, k = 7 REAL(T), ALLOCATABLE :: a(:,:,:), b(:,:,:), c(:,:) TYPE(LIBXSMM_MMFUNCTION), TARGET :: xmm(2) ! array of kernels TYPE(LIBXSMM_MMFUNCTION), POINTER :: udt(:) INTEGER(LIBXSMM_BLASINT_KIND), TARGET :: key(3) TYPE(C_PTR) :: ptr ALLOCATE(a(m,k,batchsize), b(k,n,batchsize), c(m,n)) ! 
initialize input DO i = 1, batchsize DO ki = 1, k DO j = 1, m a(j,ki,i) = REAL(1, T) / REAL(MOD(i+j+ki, 25), T) END DO DO j = 1, n b(ki,j,i) = REAL(7, T) / REAL(MOD(i+j+ki, 75), T) END DO END DO END DO c(:,:) = REAL(0, T) ! repeat inner part to exercise libxsmm_xdispatch DO j = 1, nrepeat key = (/m, n, k/) ! setup key ! query associated value using key ptr = libxsmm_xdispatch( & & C_LOC(key), SIZE(key) * LIBXSMM_BLASINT_KIND) IF (C_ASSOCIATED(ptr)) THEN ! value was already registered ! convert C-ptr to Fortran POINTER CALL C_F_POINTER(ptr, udt, (/SIZE(xmm)/)) ELSE ! no value registered yet ! generate and dispatch a series of kernels CALL libxsmm_mmdispatch(xmm(1), m, n, k, & & alpha=REAL(1, T), beta=REAL(1, T)) CALL libxsmm_mmdispatch(xmm(2), m, n, k + 2, & & alpha=REAL(1, T), beta=REAL(1, T)) ! register an entry that contains all kernels from above ptr = libxsmm_xregister( & & C_LOC(key), SIZE(key) * LIBXSMM_BLASINT_KIND, & & SIZE(xmm) * 8, C_LOC(xmm)) ! point udt to xmm (below code uses udt to refer to kernels udt => xmm ! alternatively, use C_F_POINTER END IF ! here we executed libxsmm_xdispatch one time (for this round) ! all kernels have been dispatched at once (udt) DO i = 1, batchsize CALL libxsmm_mmcall(udt(1), a(:,:,i), b(:,:,i), c) END DO END DO DEALLOCATE(a, b, c) END PROGRAM libxsmm-1.17/samples/utilities/dispatch/dispatchf.sh000077500000000000000000000050141415223013700226750ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi 
libxsmm-1.17/samples/utilities/math/000077500000000000000000000000001415223013700175235ustar00rootroot00000000000000libxsmm-1.17/samples/utilities/math/Makefile000066400000000000000000000060351415223013700211670ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 0 OMP = 0 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename 
$(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(OBJECTS) $(LIBDEP) $(LD) -o $@ $(OBJECTS) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/utilities/math/math.c000066400000000000000000000140071415223013700206220ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #include #include int main(int argc, char* argv[]) { const int insize = (1 < argc ? atoi(argv[1]) : 0); const int niters = (2 < argc ? atoi(argv[2]) : 1); const size_t n = (0 >= insize ? (((size_t)2 << 30/*2 GB*/) / sizeof(float)) : ((size_t)insize)); float *inp, *out, *gold; size_t size, nrpt; int result; if (0 < niters) { nrpt = niters; size = n; } else { nrpt = n; size = LIBXSMM_MAX(LIBXSMM_ABS(niters), 1); } gold = (float*)(malloc(sizeof(float) * size)); out = (float*)(malloc(sizeof(float) * size)); inp = (float*)(malloc(sizeof(float) * size)); if (NULL != gold && NULL != out && NULL != inp) { libxsmm_timer_tickint start; libxsmm_matdiff_info diff; size_t i, j; /* initialize the input data */ libxsmm_rng_set_seed(25071975); libxsmm_rng_f32_seq(inp, (libxsmm_blasint)size); /* collect gold data for exp2 function */ { start = libxsmm_timer_tick(); for (j = 0; j < nrpt; ++j) { for (i = 0; i < size; ++i) { gold[i] = (float)LIBXSMM_EXP2(inp[i]); } } printf("standard exp2:\t%.3f s\t\tgold\n", libxsmm_timer_duration(start, libxsmm_timer_tick())); } { start = libxsmm_timer_tick(); for (j = 0; j < nrpt; ++j) { for (i = 0; i < size; ++i) { out[i] = LIBXSMM_EXP2F(inp[i]); } } printf("standard exp2f:\t%.3f s", libxsmm_timer_duration(start, libxsmm_timer_tick())); if (EXIT_SUCCESS == libxsmm_matdiff(&diff, LIBXSMM_DATATYPE_F32, 1/*m*/, (libxsmm_blasint)size, gold, out, NULL/*ldref*/, NULL/*ldtst*/)) { printf("\t\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); } else printf("\n"); } { start = libxsmm_timer_tick(); for (j = 0; j < nrpt; ++j) { for (i = 0; i < size; ++i) { out[i] = libxsmm_sexp2(inp[i]); } } printf("libxsmm_sexp2:\t%.3f s", libxsmm_timer_duration(start, libxsmm_timer_tick())); if (EXIT_SUCCESS == libxsmm_matdiff(&diff, LIBXSMM_DATATYPE_F32, 1/*m*/, (libxsmm_blasint)size, gold, out, NULL/*ldref*/, NULL/*ldtst*/)) { printf("\t\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, 
diff.linf_abs); } else printf("\n"); } /* collect gold data for limited-range exp2 function */ { start = libxsmm_timer_tick(); for (j = 0; j < nrpt; ++j) { for (i = 0; i < size; ++i) { const unsigned char input = (unsigned char)(255.f * inp[i]); gold[i] = (float)LIBXSMM_EXP2(input); } } printf("low-range exp2:\t%.3f s\t\tgold\n", libxsmm_timer_duration(start, libxsmm_timer_tick())); } { start = libxsmm_timer_tick(); for (j = 0; j < nrpt; ++j) { for (i = 0; i < size; ++i) { const unsigned char input = (unsigned char)(255.f * inp[i]); out[i] = libxsmm_sexp2_u8(input); } } printf("libxsmm_sexp2:\t%.3f s", libxsmm_timer_duration(start, libxsmm_timer_tick())); if (EXIT_SUCCESS == libxsmm_matdiff(&diff, LIBXSMM_DATATYPE_F32, 1/*m*/, (libxsmm_blasint)size, gold, out, NULL/*ldref*/, NULL/*ldtst*/)) { printf("\t\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); } else printf("\n"); } /* collect gold data for sqrt function */ { start = libxsmm_timer_tick(); for (j = 0; j < nrpt; ++j) { for (i = 0; i < size; ++i) { gold[i] = (float)sqrt(inp[i]); } } printf("standard sqrt:\t%.3f s\t\tgold\n", libxsmm_timer_duration(start, libxsmm_timer_tick())); } { start = libxsmm_timer_tick(); for (j = 0; j < nrpt; ++j) { for (i = 0; i < size; ++i) { out[i] = (float)libxsmm_dsqrt(inp[i]); } } printf("libxsmm_dsqrt:\t%.3f s", libxsmm_timer_duration(start, libxsmm_timer_tick())); if (EXIT_SUCCESS == libxsmm_matdiff(&diff, LIBXSMM_DATATYPE_F32, 1/*m*/, (libxsmm_blasint)size, gold, out, NULL/*ldref*/, NULL/*ldtst*/)) { printf("\t\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); } else printf("\n"); } { start = libxsmm_timer_tick(); for (j = 0; j < nrpt; ++j) { for (i = 0; i < size; ++i) { out[i] = LIBXSMM_SQRTF(inp[i]); } } printf("standard sqrtf:\t%.3f s", libxsmm_timer_duration(start, libxsmm_timer_tick())); if (EXIT_SUCCESS == libxsmm_matdiff(&diff, LIBXSMM_DATATYPE_F32, 1/*m*/, (libxsmm_blasint)size, gold, out, NULL/*ldref*/, NULL/*ldtst*/)) { printf("\t\tdiff: L2abs=%f 
Linf=%f\n", diff.l2_abs, diff.linf_abs); } else printf("\n"); } { start = libxsmm_timer_tick(); for (j = 0; j < nrpt; ++j) { for (i = 0; i < size; ++i) { out[i] = libxsmm_ssqrt(inp[i]); } } printf("libxsmm_ssqrt:\t%.3f s", libxsmm_timer_duration(start, libxsmm_timer_tick())); if (EXIT_SUCCESS == libxsmm_matdiff(&diff, LIBXSMM_DATATYPE_F32, 1/*m*/, (libxsmm_blasint)size, gold, out, NULL/*ldref*/, NULL/*ldtst*/)) { printf("\t\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); } else printf("\n"); } result = EXIT_SUCCESS; } else { result = EXIT_FAILURE; } free(gold); free(out); free(inp); return result; } libxsmm-1.17/samples/utilities/math/math.sh000077500000000000000000000050141415223013700210130ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/math/math.vcxproj000066400000000000000000000542341415223013700221010ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 math {87E8DB3A-EDDF-441F-81A5-27052DE316BB} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true 
Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 ProgramDatabase None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 ProgramDatabase None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/utilities/mhd/000077500000000000000000000000001415223013700173425ustar00rootroot00000000000000libxsmm-1.17/samples/utilities/mhd/Makefile000066400000000000000000000074311415223013700210070ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) # Fortran code here does not allow for PEDANTIC=2 #override PEDANTIC = 1 BLAS = 0 OMP = 0 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := 
$(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(OBJECTS) #$(LIBDEP) $(EXTDEP) $(LD) -o $@ $(OBJECTS) $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call 
qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/utilities/mhd/README.md000066400000000000000000000026431415223013700206260ustar00rootroot00000000000000# MHD Image I/O This code sample aims to provide a simple piece of code, which takes an image and produces a visual result using LIBXSMM's MHD image file I/O. Performing a single convolution is *not* a showcase of LIBXSMM's Deeplearning as the code only runs over a single image with one channel. LIBXSMM's CNNs are vectorized over image channels (multiple images) according to the native vector-width of the processor and otherwise fall back to a high-level implementation. **Note**: For high-performance deep learning, please refer to the collection of [CNN layer samples](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/cnnlayer). The executable can run with the following arguments (all arguments are optional): > mhd [<filename-in> [<nrepeat> [<kw> [<kh>] [<filename-out>]]]] For stable timing (benchmark), the key operation (convolution) may be repeated (`nrepeat`). Further, `kw` and `kh` can specify the kernel-size of the convolution. The `filename-in` and `filename-out` name MHD-files used as input and output respectively. The `filename-in` may be a pseudo-file (that does not exist) but specify the image resolution of generated input (`w`[x`h`] where the file `wxh.mhd` stores the generated image data). 
To load an image from a familiar format (JPG, PNG, etc.), please have a look at [Meta Image File I/O](https://libxsmm.readthedocs.io/libxsmm_aux/#meta-image-file-io). libxsmm-1.17/samples/utilities/mhd/mhd.c000066400000000000000000000420221415223013700202560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if defined(_WIN32) # include # if !defined(F_OK) # define F_OK 0 # endif # define FEXIST(FILENAME) _access(FILENAME, F_OK) #else # include # define FEXIST(FILENAME) access(FILENAME, F_OK) #endif #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if 1 # define MALLOC(SIZE) libxsmm_aligned_malloc(SIZE, 0/*auto*/) # define FREE(POINTER) libxsmm_free(POINTER) # define SCRATCH_MALLOC(SIZE) libxsmm_aligned_scratch(SIZE, 0/*auto*/) # define SCRATCH_FREE(POINTER) libxsmm_free(POINTER) #else # define MALLOC(SIZE) malloc(SIZE) # define FREE(POINTER) free(POINTER) # define SCRATCH_MALLOC(SIZE) MALLOC(SIZE) # define SCRATCH_FREE(POINTER) FREE(POINTER) #endif #if !defined(USE_OUTPUT_PADDING) && 0 # define USE_OUTPUT_PADDING #endif #if !defined(USE_OVERWRITE) # define USE_OVERWRITE #endif /** * This code mainly demonstrates MHD image I/O but happens to also perform an image convolution. 
* The latter is *not* a showcase of LIBXSMM's Deeplearning as the code only runs over a single image. */ int main(int argc, char* argv[]) { const char* filename_in = (1 < argc ? argv[1] : "mhd_in.mhd"); const size_t nrepeat = (size_t)LIBXSMM_MAX(2 < argc ? strtoul(argv[2], 0, 10) : 1, 1); const int kw = LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 32, 1); const int kh = LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : kw, 1); const char *const filename_out = (5 < argc ? argv[5] : "mhd_out.mhd"); int result = (0 != strcmp(filename_in, filename_out) ? EXIT_SUCCESS : EXIT_FAILURE); size_t ndims = 2, size_in[] = { 0, 0 }, size_out[] = { 0, 0 }, pitch[2], offset[2], ncomponents = 1, header_size = 0, extension_size; void *conv_input_buffer = 0, *conv_filter_buffer = 0, *conv_output_buffer = 0; libxsmm_dnn_tensor *conv_input = 0, *conv_output = 0, *conv_filter = 0; libxsmm_mhd_elemtype type_in = LIBXSMM_MHD_ELEMTYPE_UNKNOWN; libxsmm_dnn_datatype type_dnn = LIBXSMM_DNN_DATATYPE_F32; libxsmm_dnn_conv_desc descriptor; libxsmm_dnn_layer* handle = 0; libxsmm_dnn_err_t status; size_t size1 = 0, typesize_dnn = 0; size_t conv_output_size1 = 0, i, j; unsigned long long start; char filename[1024]; double duration = 0; void *scratch = 0; void *filter = 0; void *image = 0; #if !defined(NDEBUG) static int error_once = 0; #endif const char *const env_mult = getenv("MULT"), *const env_orig = getenv("ORIG"); /* extents of result image become multiples of block-size */ const int mult = ((NULL == env_mult || 0 == *env_mult) ? 64/*default*/ : LIBXSMM_MAX(atoi(env_mult), 0)); /* save result with original pixel-type of input (type_in), otherwise save with compute-type (type_dnn) */ const int orig = ((NULL == env_orig || 0 == *env_orig) ? 1/*enabled*/ : atoi(env_orig)); /* Generate an input file if a pseudo filename (resolution) is given. 
*/ if (0 != FEXIST(filename_in) && 0 < atoi(filename_in)) { const char* split = strchr(filename_in, 'x'); if (0 == split) split = strchr(filename_in, 'X'); size_in[0] = atoi(filename_in); size_in[1] = (0 != split ? atoi(split + 1) : 0); if (0 == size_in[1]) size_in[1] = size_in[0]; image = MALLOC(size_in[0] * size_in[1]); if (0 < sprintf(filename, "%s.mhd", filename_in) && 0 != image) { const int c0 = 0, c1 = 255, r = LIBXSMM_MAX(kw, kh); for (i = 0; i < size_in[1]; ++i) { for (j = 0; j < size_in[0]; ++j) { ((unsigned char*)image)[i*size_in[0]+j] = (unsigned char)(0 == (i + j) % r ? c1 : c0); } } result = libxsmm_mhd_write(filename, NULL/*offset*/, size_in, size_in, 2/*ndims*/, 1/*ncomponents*/, LIBXSMM_MHD_ELEMTYPE_U8, NULL/*conversion*/, image, 0/*header_size*/, NULL/*extension_header*/, NULL/*extension*/, 0/*extension_size*/); if (EXIT_SUCCESS == result) filename_in = filename; } else { result = EXIT_FAILURE; } FREE(image); } /* Read MHD-header information; function includes various sanity checks. */ if (EXIT_SUCCESS == result) { result = libxsmm_mhd_read_header(filename_in, sizeof(filename), filename, &ndims, size_in, &ncomponents, &type_in, &header_size, &extension_size); } /* Only accept 2d-images (maybe a slice of a 3d-image). 
*/ if (2 == ndims) { const int m = LIBXSMM_MAX(mult, 1); offset[0] = ((size_in[0] + LIBXSMM_MAX(kw, m) - 1) / m * m - size_in[0] + kw) / 2; offset[1] = ((size_in[1] + LIBXSMM_MAX(kh, m) - 1) / m * m - size_in[1] + kh) / 2; /* center image inside of (pitched) buffer */ size_out[0] = size_in[0] + 2 * offset[0]; size_out[1] = size_in[1] + 2 * offset[1]; #if defined(USE_OUTPUT_PADDING) size_out[0] -= (kw / 2) * 2; size_out[1] -= (kh / 2) * 2; pitch[0] = size_out[0]; pitch[1] = size_out[1]; #else pitch[0] = size_out[0]; pitch[1] = size_out[1]; size_out[0] -= (kw / 2) * 2; size_out[1] -= (kh / 2) * 2; #endif size1 = pitch[0] * pitch[1] * ncomponents; } else { result = EXIT_FAILURE; } /* Allocate image data according to the MHD-header information. */ if (EXIT_SUCCESS == result) { const char* ctypename; /* DNN type: assume that MHD I/O provides a super-set of types */ if (0 != libxsmm_mhd_typename((libxsmm_mhd_elemtype)type_dnn, &typesize_dnn, &ctypename)) { const size_t filter_size = ncomponents * kh * kw; /* print some information about the workload */ fprintf(stdout, "filename=%s resolution=%ux%u kernel=%ix%i size_in=%.fMB nrepeat=%u (%s)\n", filename, (unsigned int)size_in[0], (unsigned int)size_in[1], kw, kh, 1.0 * (size1 * typesize_dnn) / (1 << 20), (unsigned int)nrepeat, ctypename); image = MALLOC(size1 * typesize_dnn); filter = MALLOC(filter_size * typesize_dnn); if (0 != image && 0 != filter) { FILE *const file = fopen("mhd_in.txt", "r"); /* convolution-matrix (kh x kw) */ double weight; switch (type_dnn) { case LIBXSMM_DNN_DATATYPE_F64: { for (i = 0; i < filter_size; ++i) { ((double*)filter)[i] = (double)((0 == file || 1 > fscanf(file, "%lf", &weight)) ? (0.05 - ((double)rand() / RAND_MAX) * 0.1) : weight); } } break; case LIBXSMM_DNN_DATATYPE_F32: { for (i = 0; i < filter_size; ++i) { ((float*)filter)[i] = (float)((0 == file || 1 > fscanf(file, "%lf", &weight)) ? 
(0.05 - ((double)rand() / RAND_MAX) * 0.1) : weight); } } break; case LIBXSMM_DNN_DATATYPE_I32: { for (i = 0; i < filter_size; ++i) { ((int*)filter)[i] = (int)((0 == file || 1 > fscanf(file, "%lf", &weight)) ? (255 * (0.05 - ((double)rand() / RAND_MAX) * 0.1)) : weight); } } break; case LIBXSMM_DNN_DATATYPE_I16: { for (i = 0; i < filter_size; ++i) { ((short*)filter)[i] = (short)((0 == file || 1 > fscanf(file, "%lf", &weight)) ? (255 * (0.05 - ((double)rand() / RAND_MAX) * 0.1)) : weight); } } break; case LIBXSMM_DNN_DATATYPE_I8: { for (i = 0; i < filter_size; ++i) { ((unsigned char*)filter)[i] = (unsigned char)((0 == file || 1 > fscanf(file, "%lf", &weight)) ? (255 * (0.05 - ((double)rand() / RAND_MAX) * 0.1)) : weight); } } break; default: result = EXIT_FAILURE; } if (0 != file && 0 != fclose(file)) result = EXIT_FAILURE; } else { result = EXIT_FAILURE; } } else { result = EXIT_FAILURE; } } /* Read the image data according to the header into the allocated buffer. */ if (EXIT_SUCCESS == result) { const void *const pv = &type_dnn; result = libxsmm_mhd_read(filename, offset, size_in, pitch, ndims, ncomponents, header_size, type_in, /* eventually perform a type-conversion (type_in != type_dnn) */ (const libxsmm_mhd_elemtype*)pv, image, NULL/*handle_element*/, NULL/*extension*/, 0/*extension_size*/); } /* Setup convolution descriptor. 
*/ memset(&descriptor, 0, sizeof(descriptor)); if (EXIT_SUCCESS == result) { #if defined(_OPENMP) descriptor.threads = omp_get_max_threads(); #else descriptor.threads = 1; #endif descriptor.N = 1; /* number of images */ descriptor.R = kh; /* kernel height */ descriptor.S = kw; /* kernel width */ descriptor.C = (int)ncomponents; /* in */ descriptor.K = descriptor.C; /* no reduction */ descriptor.u = 1; /* H-stride */ descriptor.v = 1; /* W-stride */ descriptor.H = (int)pitch[1]; descriptor.W = (int)pitch[0]; descriptor.pad_h = 0; descriptor.pad_w = 0; descriptor.pad_h_in = 0; descriptor.pad_w_in = 0; #if defined(USE_OUTPUT_PADDING) descriptor.pad_h_out = ((descriptor.u - 1) * descriptor.H + descriptor.R - descriptor.u) / 2; descriptor.pad_w_out = ((descriptor.v - 1) * descriptor.W + descriptor.S - descriptor.v) / 2; #else descriptor.pad_h_out = 0; descriptor.pad_w_out = 0; #endif descriptor.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT/*LIBXSMM_DNN_CONV_ALGO_AUTO*/; descriptor.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; descriptor.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; descriptor.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; #if defined(USE_OVERWRITE) descriptor.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE; #else descriptor.options = LIBXSMM_DNN_CONV_OPTION_NONE; #endif descriptor.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; descriptor.datatype_in = LIBXSMM_DNN_DATATYPE_F32; descriptor.datatype_out = LIBXSMM_DNN_DATATYPE_F32; handle = libxsmm_dnn_create_conv_layer(descriptor, &status); if (LIBXSMM_DNN_SUCCESS != status) { const char *const error_message = libxsmm_dnn_get_error(status); fprintf(stderr, "%s\n", error_message); if (LIBXSMM_DNN_WARN_FALLBACK != status) result = EXIT_FAILURE; } } /* Link buffers and convert NCHW-image and KCRS-filter to internal format. 
*/ if (EXIT_SUCCESS == result) { libxsmm_dnn_tensor_datalayout* layout; size_t scratch_size; /* Input buffer */ conv_input_buffer = MALLOC(descriptor.N * descriptor.C * (descriptor.H + 2 * descriptor.pad_h_in) * (descriptor.W + 2 * descriptor.pad_w_in) * typesize_dnn); if (0 == conv_input_buffer) result = EXIT_FAILURE; layout = libxsmm_dnn_create_tensor_datalayout(handle, LIBXSMM_DNN_INPUT, &status); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; conv_input = libxsmm_dnn_link_tensor(layout, conv_input_buffer, &status); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; status = libxsmm_dnn_destroy_tensor_datalayout(layout); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; status = libxsmm_dnn_bind_tensor(handle, conv_input, LIBXSMM_DNN_REGULAR_INPUT); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; status = libxsmm_dnn_copyin_tensor(conv_input, image, LIBXSMM_DNN_TENSOR_FORMAT_NCHW); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; /* Filter buffer */ conv_filter_buffer = MALLOC(descriptor.K * descriptor.C * descriptor.R * descriptor.S * typesize_dnn); if (0 == conv_filter_buffer) result = EXIT_FAILURE; layout = libxsmm_dnn_create_tensor_datalayout(handle, LIBXSMM_DNN_FILTER, &status); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; conv_filter = libxsmm_dnn_link_tensor(layout, conv_filter_buffer, &status); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; status = libxsmm_dnn_destroy_tensor_datalayout(layout); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; status = libxsmm_dnn_bind_tensor(handle, conv_filter, LIBXSMM_DNN_REGULAR_FILTER); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; status = libxsmm_dnn_copyin_tensor(conv_filter, filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; /* Output buffer */ conv_output_size1 = descriptor.N * descriptor.K * (descriptor.H + 2 * descriptor.pad_h_out) * (descriptor.W + 2 * 
descriptor.pad_w_out); conv_output_buffer = MALLOC(conv_output_size1 * typesize_dnn); if (0 == conv_output_buffer) result = EXIT_FAILURE; layout = libxsmm_dnn_create_tensor_datalayout(handle, LIBXSMM_DNN_OUTPUT, &status); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; conv_output = libxsmm_dnn_link_tensor(layout, conv_output_buffer, &status); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; status = libxsmm_dnn_destroy_tensor_datalayout(layout); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; status = libxsmm_dnn_bind_tensor(handle, conv_output, LIBXSMM_DNN_REGULAR_OUTPUT); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; /* allocate and bind scratch memory */ scratch_size = libxsmm_dnn_get_scratch_size(handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; scratch = SCRATCH_MALLOC(scratch_size); if (0 == scratch) result = EXIT_FAILURE; status = libxsmm_dnn_bind_scratch(handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; } /* Attempt to run the convolution. */ start = libxsmm_timer_tick(); for (i = 0; i < nrepeat && EXIT_SUCCESS == result; ++i) { #if defined(_OPENMP) # pragma omp parallel #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif #if !defined(USE_OVERWRITE) memset(conv_output_buffer, 0, conv_output_size1 * typesize_dnn); #endif { #if !defined(NDEBUG) const libxsmm_dnn_err_t r = #endif libxsmm_dnn_execute_st(handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid); #if !defined(NDEBUG) if (LIBXSMM_DNN_SUCCESS != r && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { const char *const error_message = libxsmm_dnn_get_error(r); fprintf(stderr, "%s\n", error_message); result = EXIT_FAILURE; } #endif } } } duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); /* Copy-out image into original format. 
*/ if (EXIT_SUCCESS == result) { status = libxsmm_dnn_copyout_tensor(conv_output, image, LIBXSMM_DNN_TENSOR_FORMAT_NCHW); if (LIBXSMM_DNN_SUCCESS != status) result = EXIT_FAILURE; } /* Write the image into a different file. */ if (EXIT_SUCCESS == result) { if (0 == mult) { offset[0] = (size_out[0] - size_in[0]) / 2; offset[1] = (size_out[1] - size_in[1]) / 2; } else { offset[0] = offset[1] = 0; size_in[0] = size_out[0]; size_in[1] = size_out[1]; } /* write result image without offset/padding. */ result = libxsmm_mhd_write(filename_out, NULL/*offset*/, size_in, size_in, 2/*ndims*/, ncomponents, (libxsmm_mhd_elemtype)type_dnn/* assume super-set of DNN types */, 0 != orig ? &type_in : NULL, image, 0/*header_size*/, NULL/*extension_header*/, NULL/*extension*/, 0/*extension_size*/); } /* Release resources. */ if (LIBXSMM_DNN_SUCCESS != libxsmm_dnn_release_tensor(handle, LIBXSMM_DNN_REGULAR_INPUT)) result = EXIT_FAILURE; if (LIBXSMM_DNN_SUCCESS != libxsmm_dnn_release_tensor(handle, LIBXSMM_DNN_REGULAR_FILTER)) result = EXIT_FAILURE; if (LIBXSMM_DNN_SUCCESS != libxsmm_dnn_release_tensor(handle, LIBXSMM_DNN_REGULAR_OUTPUT)) result = EXIT_FAILURE; if (LIBXSMM_DNN_SUCCESS != libxsmm_dnn_destroy_tensor(conv_filter)) result = EXIT_FAILURE; if (LIBXSMM_DNN_SUCCESS != libxsmm_dnn_destroy_tensor(conv_output)) result = EXIT_FAILURE; if (LIBXSMM_DNN_SUCCESS != libxsmm_dnn_destroy_tensor(conv_input)) result = EXIT_FAILURE; if (LIBXSMM_DNN_SUCCESS != libxsmm_dnn_destroy_conv_layer(handle)) result = EXIT_FAILURE; SCRATCH_FREE(scratch); FREE(conv_output_buffer); FREE(conv_filter_buffer); FREE(conv_input_buffer); FREE(filter); FREE(image); if (EXIT_SUCCESS == result) { if (0 < duration) { fprintf(stdout, "\tfrequency: %.0f Hz\n", nrepeat / duration); } assert(0 != nrepeat); fprintf(stdout, "\tduration: %.0f ms\n", 1E3 * duration / nrepeat); fprintf(stdout, "Finished.\n"); } else { fprintf(stdout, "FAILED.\n"); } assert(EXIT_SUCCESS == result); return result; } 
libxsmm-1.17/samples/utilities/mhd/mhd.sh000077500000000000000000000050141415223013700204510ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/mhd/mhd.vcxproj000066400000000000000000000535541415223013700215430ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 mhd {54F8CC52-A93E-41BA-9AED-AFC609542A10} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 
Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/utilities/mhd/mhd_in.mhd000066400000000000000000000651311415223013700213000ustar00rootroot00000000000000NDims = 2 ElementNumberOfChannels = 1 ElementByteOrderMSB = False DimSize = 202 134 ElementSpacing = 1.0 1.0 ElementType = MET_UCHAR ElementDataFile = LOCAL "-6@HNS[[[[[[[[[UOH@6,!'Ca}ȻhI) 1Qpġ|X3 9fٳz? 6h|<=nڔJ .sӘB Rżu)"fؽmU@5,# &/:Ke8*nǧmL0%GlE#aյZ2 7n= Vݫ}S/ ?t&5Ǟd-is[ђX0 -q6*|ץl*Iq7͆A4Jv441Oj( AIXo0 lZNΈ3,_CJ ~Q2u R?G;%e߇-(n, Z 2B NFſKnr 3"g"VUUUUUUUUOM\; o j1S P ['f/=|m =j.f/CE'Qei-f/ÀZi-f/ߌiy)i-H{{{{{{{Xf/dH)=M`i-f/ I~ Ni-f/(p Qi-f/$ Pj-f/b&Pe+f/?1Q'%%%%%%%&"f/$G (f/YFf/cef/re %, f/rD'a________XGa___________________^O;!c^____a7E•Qf/g 0k. 
ʋ62%f/\ oi- r f/7J|i- 0Lf/V6A5i- "of/}(i- pf/i- TZf/5qi- -Pf/;i- -֪f/fUli- %%%%%%%%%?`*%%%%%%QA $hf/ .i- khW f/J`i- @do>f/&i- f/+DYi- ?ff/ Wi- /0)f/34i- eNf/*i- kf/UTi- Ԩf/FPi- f/ i- f/f@i- f/?0Ui- f/*)k,i- f/t i-  f/i- [[[[[[[[[[[[[[[[[[[[[[[[[:f//%i- f/.d?i- |f/IVji- Rf/o p[i- Pf/Li-  'g/Y-Mi- w"1a/7Mi- `.}M8O/8P_- dy=,*+,ME<:/-`B- 13ӫC#/t'- L/- c8[/h]- -/4- 89 2/y } *- Q(q;/0dN3- ; {6"/IK- g(M0-I5 j.ybds4e_Yԩt.5 b7'-EOOOJ3!r ^G) ?KY"$  ne W`+nL`IL`;9V`,l`.:.`L O`q1y`C\`yLP` "C`uP`/h*Za:u ;Z:> 7ec #include #include int main(int argc, char* argv[]) { double rng_stddev = 0; float* rngs; float vrng[16]; libxsmm_timer_tickint start; libxsmm_matdiff_info info; libxsmm_blasint num_rngs; libxsmm_blasint i; if (2 < argc) { fprintf(stderr, "Usage:\n %s number_rngs\n", argv[0]); return EXIT_SUCCESS; } /* parse the command line and set up the test parameters */ num_rngs = (1 < argc ? atoi(argv[1]) : 1000); /* avoid scalar remainder in timing loop */ num_rngs = LIBXSMM_UP2(num_rngs, 16); assert(num_rngs >= 1); rngs = (float*)malloc((size_t)(sizeof(float) * num_rngs)); if (NULL == rngs) num_rngs = 0; libxsmm_rng_set_seed( (unsigned int)(time(0)) ); /* fill array with random floats */ libxsmm_rng_f32_seq( rngs, num_rngs ); /* some quality measure; variance is based on discovered average rather than expected value */ if (EXIT_SUCCESS == libxsmm_matdiff(&info, LIBXSMM_DATATYPE_F32, 1/*m*/, num_rngs, NULL/*ref*/, rngs/*tst*/, NULL/*ldref*/, NULL/*ldtst*/)) { rng_stddev = libxsmm_dsqrt( info.var_tst ); } start = libxsmm_timer_tick(); for (i = 0; i < num_rngs; ++i) { libxsmm_rng_f32_seq( rngs, 1 ); } printf("\nlibxsmm_rng_float: %llu cycles per random number (scalar)\n", libxsmm_timer_ncycles(start, libxsmm_timer_tick()) / num_rngs); start = libxsmm_timer_tick(); for (i = 0; i < num_rngs; ++i) { libxsmm_rng_f32_seq( vrng, 16 ); } printf("\nlibxsmm_rng_float: %llu cycles per random number (vlen=16)\n", libxsmm_timer_ncycles(start, libxsmm_timer_tick()) / ((size_t)num_rngs*16)); /* 
let's compute some values of the random numbers */ printf("\n%lli random numbers generated, which are uniformly distributed in [0,1(\n", (long long)num_rngs); printf("Expected properties: avg=0.5, var=0.083333, stddev=0.288675\n\n"); printf("minimum random number: %f\n", info.min_tst); printf("maximum random number: %f\n", info.max_tst); printf("sum of random numbers: %f\n", info.l1_tst); printf("avg of random numbers: %f\n", info.avg_tst); printf("var of random numbers: %f\n", info.var_tst); printf("dev of random numbers: %f\n\n", rng_stddev); free( rngs ); return EXIT_SUCCESS; } libxsmm-1.17/samples/utilities/rng/rng.sh000077500000000000000000000050141415223013700205050ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/rng/rng.vcxproj000066400000000000000000000541131415223013700215670ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 rng {68BC613E-743B-45DE-B6AA-B9D04A3B7CAD} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true 
Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/utilities/rng/rng_avx512.c000066400000000000000000000061431415223013700214240ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include int main(int argc, char* argv[]) { double rng_stddev = 0; float* rngs; float vrng[16]; unsigned int* state = NULL; libxsmm_timer_tickint start; libxsmm_matdiff_info info; libxsmm_blasint num_rngs; libxsmm_blasint i; if (2 < argc) { fprintf(stderr, "Usage:\n %s number_rngs\n", argv[0]); return EXIT_SUCCESS; } /* parse the command line and set up the test parameters */ num_rngs = (1 < argc ? 
atoi(argv[1]) : 1024); /* avoid scalar remainder in timing loop */ num_rngs = LIBXSMM_UP2(num_rngs, 16); assert(num_rngs >= 1); rngs = (float*)malloc((size_t)(sizeof(float) * num_rngs)); if (NULL == rngs) num_rngs = 0; /* create thread-safe state */ state = libxsmm_rng_create_avx512_extstate( (unsigned int)(time(0)) ); /* fill array with random floats */ for (i = 0; i < num_rngs; i+=16) { #ifdef __AVX512F__ _mm512_storeu_ps( rngs+i, LIBXSMM_INTRINSICS_MM512_RNG_EXTSTATE_PS( state ) ); #endif } /* some quality measure; variance is based on discovered average rather than expected value */ if (EXIT_SUCCESS == libxsmm_matdiff(&info, LIBXSMM_DATATYPE_F32, 1/*m*/, num_rngs, NULL/*ref*/, rngs/*tst*/, NULL/*ldref*/, NULL/*ldtst*/)) { rng_stddev = libxsmm_dsqrt( info.var_tst ); } start = libxsmm_timer_tick(); for (i = 0; i < num_rngs; ++i) { #ifdef __AVX512F__ _mm512_storeu_ps( vrng, _mm512_add_ps( _mm512_load_ps(vrng), LIBXSMM_INTRINSICS_MM512_RNG_EXTSTATE_PS( state ) ) ); #endif } printf("\nlibxsmm_rng_float: %llu cycles per random number (vlen=16)\n", libxsmm_timer_ncycles(start, libxsmm_timer_tick()) / ((size_t)num_rngs*16)); /* free the state */ libxsmm_rng_destroy_avx512_extstate( state ); /* let's compute some values of the random numbers */ printf("\n%lli random numbers generated, which are uniformly distributed in [0,1(\n", (long long)num_rngs); printf("Expected properties: avg=0.5, var=0.083333, stddev=0.288675\n\n"); printf("minimum random number: %f\n", info.min_tst); printf("maximum random number: %f\n", info.max_tst); printf("sum of random numbers: %f\n", info.l1_tst); printf("avg of random numbers: %f\n", info.avg_tst); printf("var of random numbers: %f\n", info.var_tst); printf("dev of random numbers: %f\n\n", rng_stddev); free( rngs ); return EXIT_SUCCESS; } 
libxsmm-1.17/samples/utilities/scratch/000077500000000000000000000000001415223013700202215ustar00rootroot00000000000000libxsmm-1.17/samples/utilities/scratch/Makefile000066400000000000000000000076341415223013700216730ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 0 OMP = 1 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename 
$(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (0,$(shell echo "$$((0!=$(INTEL) && 190001<=$(CC_VERSION_NUM)))")) QKMALLOC ?= 1 else QKMALLOC ?= 0 endif $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(OBJECTS) $(LIBDEP) $(EXTDEP) ifneq (0,$(QKMALLOC)) $(LD) -o $@ $(OBJECTS) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) -lqkmalloc else $(LD) -o $@ $(OBJECTS) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) endif $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call 
qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/utilities/scratch/README.md000066400000000000000000000007411415223013700215020ustar00rootroot00000000000000# Scratch Memory Allocation (Microbenchmark) This code sample aims to benchmark the performance of the scratch memory allocation. This facility is a viable option to satisfy the need for temporary memory when using the DNN domain of LIBXSMM (small convolutions). Although any kind of readable/writable buffer can be bound to a convolution handle, LIBXSMM's `libxsmm_aligned_scratch` features a thread-safe linear allocator mechanism which can help to lower allocation overhead. libxsmm-1.17/samples/utilities/scratch/scratch.c000066400000000000000000000150221415223013700220140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #include #include #if defined(_OPENMP) # include #endif #if defined(__TBB) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if defined(__TBB) # define MALLOC scalable_malloc # define FREE scalable_free #elif defined(_OPENMP) && defined(LIBXSMM_INTEL_COMPILER) && (1901 > LIBXSMM_INTEL_COMPILER) # define MALLOC kmp_malloc # define FREE kmp_free #elif 1 # define MALLOC malloc # define FREE free #endif #if !defined(MAX_MALLOC_MB) # define MAX_MALLOC_MB 100 #endif #if !defined(MAX_MALLOC_N) # define MAX_MALLOC_N 24 #endif void* malloc_offsite(size_t size); int main(int argc, char* argv[]) { #if defined(_OPENMP) const int max_nthreads = omp_get_max_threads(); #else const int max_nthreads = 1; #endif const int ncycles = LIBXSMM_MAX(1 < argc ? atoi(argv[1]) : 100, 1); const int max_nallocs = LIBXSMM_CLMP(2 < argc ? atoi(argv[2]) : 4, 1, MAX_MALLOC_N); const int nthreads = LIBXSMM_CLMP(3 < argc ? atoi(argv[3]) : 1, 1, max_nthreads); const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(NULL == env_check ? 0 : atof(env_check)); unsigned int nallocs = 0, nerrors0 = 0, nerrors1 = 0; int r[MAX_MALLOC_N], i; int max_size = 0; /* generate set of random numbers for parallel region */ for (i = 0; i < (MAX_MALLOC_N); ++i) r[i] = rand(); /* count number of calls according to randomized scheme */ for (i = 0; i < ncycles; ++i) { const int count = r[i%(MAX_MALLOC_N)] % max_nallocs + 1; int mbytes = 0, j; for (j = 0; j < count; ++j) { const int k = (i * count + j) % (MAX_MALLOC_N); mbytes += (r[k] % (MAX_MALLOC_MB) + 1); } if (max_size < mbytes) max_size = mbytes; nallocs += count; } assert(0 != nallocs); fprintf(stdout, "Running %i cycles with max. 
%i malloc+free (%u calls) using %i thread%s...\n", ncycles, max_nallocs, nallocs, 1 >= nthreads ? 1 : nthreads, 1 >= nthreads ? "" : "s"); #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { const char *const longlife_env = getenv("LONGLIFE"); const int enable_longlife = ((NULL == longlife_env || 0 == *longlife_env) ? 0 : atoi(longlife_env)); void* longlife = (0 == enable_longlife ? NULL : malloc_offsite((MAX_MALLOC_MB) << 20)); libxsmm_timer_tickint d0 = 0, d1 = 0; libxsmm_scratch_info info; int scratch = 0; libxsmm_init(); #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) private(i) reduction(+:d1,nerrors1) #endif for (i = 0; i < ncycles; ++i) { const int count = r[i%(MAX_MALLOC_N)] % max_nallocs + 1; void* p[MAX_MALLOC_N]; int j; assert(count <= MAX_MALLOC_N); for (j = 0; j < count; ++j) { const int k = (i * count + j) % (MAX_MALLOC_N); const size_t nbytes = ((size_t)r[k] % (MAX_MALLOC_MB) + 1) << 20; const libxsmm_timer_tickint t1 = libxsmm_timer_tick(); p[j] = libxsmm_aligned_scratch(nbytes, 0/*auto*/); d1 += libxsmm_timer_ncycles(t1, libxsmm_timer_tick()); if (NULL == p[j]) { ++nerrors1; } else if (0 != check) { memset(p[j], j, nbytes); } } for (j = 0; j < count; ++j) { libxsmm_free(p[j]); } } libxsmm_free(longlife); if (EXIT_SUCCESS == libxsmm_get_scratch_info(&info) && 0 < info.size) { scratch = (int)(1.0 * LIBXSMM_MAX(info.size, info.local) / (1ULL << 20) + 0.5); fprintf(stdout, "\nScratch: %i MB (mallocs=%lu, pools=%u)\n", scratch, (unsigned long int)info.nmallocs, info.npools); libxsmm_release_scratch(); /* suppress LIBXSMM's termination message about scratch */ } #if (defined(MALLOC) && defined(FREE)) longlife = (0 == enable_longlife ? 
NULL : MALLOC((MAX_MALLOC_MB) << 20)); if (NULL == longlife) max_size += MAX_MALLOC_MB; #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) private(i) reduction(+:d0,nerrors0) #endif for (i = 0; i < ncycles; ++i) { const int count = r[i % (MAX_MALLOC_N)] % max_nallocs + 1; void* p[MAX_MALLOC_N]; int j; assert(count <= MAX_MALLOC_N); for (j = 0; j < count; ++j) { const int k = (i * count + j) % (MAX_MALLOC_N); const size_t nbytes = ((size_t)r[k] % (MAX_MALLOC_MB) + 1) << 20; const libxsmm_timer_tickint t1 = libxsmm_timer_tick(); p[j] = MALLOC(nbytes); d0 += libxsmm_timer_ncycles(t1, libxsmm_timer_tick()); if (NULL == p[j]) { ++nerrors0; } else if (0 != check) { memset(p[j], j, nbytes); } } for (j = 0; j < count; ++j) FREE(p[j]); } FREE(longlife); #endif /*(defined(MALLOC) && defined(FREE))*/ if (0 != d0 && 0 != d1 && 0 < nallocs) { const double dcalls = libxsmm_timer_duration(0, d0); const double dalloc = libxsmm_timer_duration(0, d1); const double scratch_freq = 1E-3 * nallocs / dalloc; const double malloc_freq = 1E-3 * nallocs / dcalls; const double speedup = scratch_freq / malloc_freq; fprintf(stdout, "\tlibxsmm scratch calls/s: %.1f kHz\n", scratch_freq); fprintf(stdout, "Malloc: %i MB\n", max_size); fprintf(stdout, "\tstd.malloc+free calls/s: %.1f kHz\n", malloc_freq); fprintf(stdout, "Fair (size vs. speed): %.1fx\n", max_size * speedup / scratch); fprintf(stdout, "Scratch Speedup: %.1fx\n", speedup); } } if (0 != nerrors0 || 0 != nerrors1) { fprintf(stdout, "FAILED (errors: malloc=%u libxsmm=%u)\n", nerrors0, nerrors1); return EXIT_FAILURE; } return EXIT_SUCCESS; } void* malloc_offsite(size_t size) { return libxsmm_aligned_scratch(size, 0/*auto*/); } libxsmm-1.17/samples/utilities/scratch/scratch.sh000077500000000000000000000050141415223013700222070ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. 
# # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e 
"KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/scratch/scratch.vcxproj000066400000000000000000000550251415223013700232740ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 scratch {5A1EC2CD-47DE-405D-B47D-D8C68F56E7B3} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST GenerateParallelCode true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console 
$(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST GenerateParallelCode true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST GenerateParallelCode true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST GenerateParallelCode true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true 
Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST GenerateParallelCode true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST GenerateParallelCode true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/utilities/sync/000077500000000000000000000000001415223013700175465ustar00rootroot00000000000000libxsmm-1.17/samples/utilities/sync/Makefile000066400000000000000000000077021415223013700212140ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . 
CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 0 SYM = 1 OMP = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME)_barrier $(OUTDIR)/$(OUTNAME)_lock .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) $(OUTDIR)/$(OUTNAME)_barrier: $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)_barrier-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ 
$(BLDDIR)/$(OUTNAME)_barrier-c.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/$(OUTNAME)_lock: $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)_lock-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(BLDDIR)/$(OUTNAME)_lock-c.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif 
libxsmm-1.17/samples/utilities/sync/sync_barrier.c000066400000000000000000000050001415223013700223670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.), Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include #include #if defined(_OPENMP) # include #endif int main(int argc, char* argv[]) { int num_cores, threads_per_core, num_threads, num_iterations; libxsmm_timer_tickint start; libxsmm_barrier* barrier; if (4 < argc) { fprintf(stderr, "Usage:\n %s []\n", argv[0]); return EXIT_SUCCESS; } /* parse the command line and set up the test parameters */ #if defined(_OPENMP) num_cores = (1 < argc ? atoi(argv[1]) : 2); assert(num_cores >= 1); threads_per_core = (2 < argc ? atoi(argv[2]) : 2); assert(threads_per_core >= 1); #else threads_per_core = 1; num_cores = 1; #endif num_iterations = (1 < argc ? 
atoi(argv[3]) : 50000); assert(num_iterations > 0); /* create a new barrier */ barrier = libxsmm_barrier_create(num_cores, threads_per_core); assert(NULL != barrier); /* each thread must initialize with the barrier */ num_threads = num_cores * threads_per_core; #if defined(_OPENMP) # pragma omp parallel num_threads(num_threads) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif libxsmm_barrier_init(barrier, tid); } start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel num_threads(num_threads) #endif { #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif int i; for (i = 0; i < num_iterations; ++i) { libxsmm_barrier_wait(barrier, tid); } } printf("libxsmm_barrier_wait(): %llu cycles (%d threads)\n", libxsmm_timer_ncycles(start, libxsmm_timer_tick()) / num_iterations, num_threads); libxsmm_barrier_destroy(barrier); return EXIT_SUCCESS; } libxsmm-1.17/samples/utilities/sync/sync_barrier.sh000077500000000000000000000050141415223013700225670ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/sync/sync_barrier.vcxproj000066400000000000000000000540341415223013700236530ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 sync_barrier {E05E66FF-8B93-4766-932F-891C1C9B615E} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true 
Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/utilities/sync/sync_lock.c000066400000000000000000000164311415223013700217030ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #include #if defined(_OPENMP) # include #endif /* measure non-contended latency of RO-lock */ #define MEASURE_LATENCY_RO(LOCK_KIND, LOCKPTR, NREPEAT, NR) do { \ libxsmm_timer_tickint latency = 0; \ double duration; \ int i; \ for (i = 0; i < (NREPEAT) / 4; ++i) { \ const libxsmm_timer_tickint tick = libxsmm_timer_tick(); \ LIBXSMM_LOCK_ACQREAD(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_RELREAD(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_ACQREAD(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_RELREAD(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_ACQREAD(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_RELREAD(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_ACQREAD(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_RELREAD(LOCK_KIND, LOCKPTR); \ latency += libxsmm_timer_ncycles(tick, libxsmm_timer_tick()); \ } \ duration = libxsmm_timer_duration(0, latency); \ if (0 < duration) { \ printf("\tro-latency: %.0f ns (call/s %.0f MHz, %.0f cycles)\n", \ duration * (NR) * 1e9, (NREPEAT) / (1e6 * duration), latency * (NR)); \ } \ } while(0) /* measure non-contended latency of RW-lock */ #define MEASURE_LATENCY_RW(LOCK_KIND, LOCKPTR, NREPEAT, NR) do { \ libxsmm_timer_tickint latency = 0; \ double duration; \ int i; \ for (i = 0; i < (NREPEAT) / 4; ++i) { \ const libxsmm_timer_tickint tick = libxsmm_timer_tick(); \ LIBXSMM_LOCK_ACQUIRE(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_RELEASE(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_ACQUIRE(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_RELEASE(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_ACQUIRE(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_RELEASE(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_ACQUIRE(LOCK_KIND, LOCKPTR); \ LIBXSMM_LOCK_RELEASE(LOCK_KIND, LOCKPTR); \ latency += libxsmm_timer_ncycles(tick, libxsmm_timer_tick()); \ } \ duration = libxsmm_timer_duration(0, latency); \ if (0 < duration) { \ printf("\trw-latency: %.0f ns (call/s %.0f MHz, %.0f cycles)\n", \ duration * (NR) * 1e9, (NREPEAT) / (1e6 * duration), latency * (NR)); \ } \ } while(0) #if 
defined(_OPENMP) # define MEASURE_THROUGHPUT_PARALLEL(NTHREADS) LIBXSMM_PRAGMA(omp parallel num_threads(NTHREADS)) # define MEASURE_THROUGHPUT_ATOMIC LIBXSMM_PRAGMA(omp atomic) #else # define MEASURE_THROUGHPUT_PARALLEL(NTHREADS) # define MEASURE_THROUGHPUT_ATOMIC #endif #define MEASURE_THROUGHPUT(LOCK_KIND, LOCKPTR, NREPEAT, NTHREADS, WORK_R, WORK_W, NW, NT) do { \ libxsmm_timer_tickint throughput = 0; \ double duration; \ MEASURE_THROUGHPUT_PARALLEL(NTHREADS) \ { \ int n, nn; \ libxsmm_timer_tickint t1, t2, d = 0; \ const libxsmm_timer_tickint t0 = libxsmm_timer_tick(); \ for (n = 0; n < (NREPEAT); n = nn) { \ nn = n + 1; \ if (0 != (nn % (NW))) { /* read */ \ LIBXSMM_LOCK_ACQREAD(LOCK_KIND, LOCKPTR); \ t1 = libxsmm_timer_tick(); \ t2 = work(t1, WORK_R); \ LIBXSMM_LOCK_RELREAD(LOCK_KIND, LOCKPTR); \ d += libxsmm_timer_ncycles(t1, t2); \ } \ else { /* write */ \ LIBXSMM_LOCK_ACQUIRE(LOCK_KIND, LOCKPTR); \ t1 = libxsmm_timer_tick(); \ t2 = work(t1, WORK_W); \ LIBXSMM_LOCK_RELEASE(LOCK_KIND, LOCKPTR); \ d += libxsmm_timer_ncycles(t1, t2); \ } \ } \ t1 = libxsmm_timer_ncycles(t0, libxsmm_timer_tick()); \ MEASURE_THROUGHPUT_ATOMIC \ throughput += t1 - d; \ } \ duration = libxsmm_timer_duration(0, throughput); \ if (0 < duration) { \ const double r = 1.0 / (NT); \ printf("\tthroughput: %.0f us (call/s %.0f kHz, %.0f cycles)\n", \ duration * r * 1e6, (NT) / (1e3 * duration), throughput * r); \ } \ } while(0) #define BENCHMARK(LOCK_KIND, IMPL, NTHREADS, WORK_R, WORK_W, WRATIOPERC, NREPEAT_LAT, NREPEAT_TPT) do { \ const int nw = 0 < (WRATIOPERC) ? 
(100 / (WRATIOPERC)) : ((NREPEAT_TPT) + 1); \ const int nt = (NREPEAT_TPT) * (NTHREADS); \ const double nr = 1.0 / (NREPEAT_LAT); \ LIBXSMM_LOCK_ATTR_TYPE(LOCK_KIND) attr; \ LIBXSMM_LOCK_TYPE(LOCK_KIND) lock; \ LIBXSMM_ASSERT(0 < nt); \ printf("Latency and throughput of \"%s\" (%s) for nthreads=%i wratio=%i%% work_r=%i work_w=%i nlat=%i ntpt=%i\n", \ LIBXSMM_STRINGIFY(LOCK_KIND), IMPL, NTHREADS, WRATIOPERC, WORK_R, WORK_W, NREPEAT_LAT, NREPEAT_TPT); \ LIBXSMM_LOCK_ATTR_INIT(LOCK_KIND, &attr); \ LIBXSMM_LOCK_INIT(LOCK_KIND, &lock, &attr); \ LIBXSMM_LOCK_ATTR_DESTROY(LOCK_KIND, &attr); \ MEASURE_LATENCY_RO(LOCK_KIND, &lock, NREPEAT_LAT, nr); \ MEASURE_LATENCY_RW(LOCK_KIND, &lock, NREPEAT_LAT, nr); \ MEASURE_THROUGHPUT(LOCK_KIND, &lock, NREPEAT_TPT, NTHREADS, WORK_R, WORK_W, nw, nt); \ LIBXSMM_LOCK_DESTROY(LOCK_KIND, &lock); \ } while(0) libxsmm_timer_tickint work(libxsmm_timer_tickint start, libxsmm_timer_tickint duration); libxsmm_timer_tickint work(libxsmm_timer_tickint start, libxsmm_timer_tickint duration) { const libxsmm_timer_tickint end = start + duration; libxsmm_timer_tickint tick = start; do { libxsmm_timer_tickint i, s = 0; for (i = 0; i < ((end - tick) / 4); ++i) s += i; tick = libxsmm_timer_tick(); } while(tick < end); return tick; } int main(int argc, char* argv[]) { #if defined(_OPENMP) const int max_nthreads = omp_get_max_threads(); #else const int max_nthreads = 1; #endif const int nthreads = LIBXSMM_MAX(1 < argc ? atoi(argv[1]) : max_nthreads, 1); const int wratioperc = LIBXSMM_CLMP(2 < argc ? atoi(argv[2]) : 5, 0, 100); const int work_r = LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 100, 1); const int work_w = LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : (10 * work_r), 1); const int nlat = LIBXSMM_MAX(5 < argc ? atoi(argv[5]) : 2000000, 1); const int ntpt = LIBXSMM_MAX(6 < argc ? 
atoi(argv[6]) : 10000, 1); libxsmm_init(); printf("LIBXSMM: default lock-kind \"%s\" (%s)\n\n", LIBXSMM_STRINGIFY(LIBXSMM_LOCK_DEFAULT), #if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) "OS-native"); #else "Other"); #endif #if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) BENCHMARK(LIBXSMM_LOCK_SPINLOCK, "OS-native", nthreads, work_r, work_w, wratioperc, nlat, ntpt); #else BENCHMARK(LIBXSMM_LOCK_SPINLOCK, "Other", nthreads, work_r, work_w, wratioperc, nlat, ntpt); #endif #if defined(LIBXSMM_LOCK_SYSTEM_MUTEX) BENCHMARK(LIBXSMM_LOCK_MUTEX, "OS-native", nthreads, work_r, work_w, wratioperc, nlat, ntpt); #else BENCHMARK(LIBXSMM_LOCK_MUTEX, "Other", nthreads, work_r, work_w, wratioperc, nlat, ntpt); #endif #if defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) BENCHMARK(LIBXSMM_LOCK_RWLOCK, "OS-native", nthreads, work_r, work_w, wratioperc, nlat, ntpt); #else BENCHMARK(LIBXSMM_LOCK_RWLOCK, "Other", nthreads, work_r, work_w, wratioperc, nlat, ntpt); #endif return EXIT_SUCCESS; } libxsmm-1.17/samples/utilities/sync/sync_lock.sh000077500000000000000000000050141415223013700220710ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/sync/sync_lock.vcxproj000066400000000000000000000540261415223013700231560ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 sync_lock {E27A099A-AE91-4F6D-956A-0419F6ED13FF} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true 
Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/utilities/wrap/000077500000000000000000000000001415223013700175435ustar00rootroot00000000000000libxsmm-1.17/samples/utilities/wrap/Makefile000066400000000000000000000153721415223013700212130ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = $(NULL) BLAS = 2 OMP = 1 SYM = 1 BLAS_STATIC = 0 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) ifneq (,$(strip $(wildcard $(LIBNAME).$(SLIBEXT)))) DEPSTATIC = 1 else DEPSTATIC = 0 endif XWRAP ?= 0 ifneq (Darwin,$(UNAME)) ifneq (0,$(DEPSTATIC)) XWRAP = $(GEMM) endif endif OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir 
$(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/autobatch \ $(OUTDIR)/dgemm-blas $(OUTDIR)/dgemm-wrap \ $(OUTDIR)/dgemv-blas $(OUTDIR)/dgemv-wrap .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) ifneq (0,$(XWRAP)) ifneq (2,$(XWRAP)) WRAP_GEMM = -Wl,--wrap=dgemm_,--wrap=sgemm_ WRAP_GEMV = -Wl,--wrap=dgemv_,--wrap=sgemv_ else WRAP_GEMM = -Wl,--wrap=dgemm_ WRAP_GEMV = -Wl,--wrap=dgemv_ endif DFLAGS += -DWRAP endif $(OUTDIR)/autobatch: $(BLDDIR)/autobatch-c.o $(OUTDIR)/.make $(LIBDEP) $(EXTDEP) $(LD) -o $@ $< $(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(WRAP_GEMM) ifeq (0,$(NOBLAS)) $(OUTDIR)/dgemm-blas: $(BLDDIR)/dgemm-c.o $(OUTDIR)/.make $(LD) -o $@ $< $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) ifeq (Darwin,$(UNAME)) ifneq (0,$(XWRAP)) $(info ================================================================================) $(info The static link-time wrapper mechanism is not supported under OS X!) 
$(info ================================================================================) endif endif else .PHONY: $(OUTDIR)/dgemm-blas endif ifeq (0,$(NOBLAS)) $(OUTDIR)/dgemv-blas: $(BLDDIR)/dgemv-c.o $(OUTDIR)/.make $(LD) -o $@ $< $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) ifeq (Darwin,$(UNAME)) ifneq (0,$(XWRAP)) $(info ================================================================================) $(info The static link-time wrapper mechanism is not supported under OS X!) $(info ================================================================================) endif endif else .PHONY: $(OUTDIR)/dgemv-blas endif ifneq (0,$(XWRAP)) $(OUTDIR)/dgemm-wrap: $(BLDDIR)/dgemm-c.o $(OUTDIR)/.make $(LIBDEP) $(EXTDEP) ifneq (0,$(OMP)) $(LD) -o $@ $< $(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(WRAP_GEMM) else ifneq (,$(strip $(OMPLIB))) $(LD) -o $@ $< $(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(WRAP_GEMM) \ $(XLIB_BEGIN) $(OMPLIB) $(XLIB_END) else # should not happen $(LD) -o $@ $< $(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(WRAP_GEMM) endif else .PHONY: $(OUTDIR)/dgemm-wrap endif ifneq (0,$(XWRAP)) $(OUTDIR)/dgemv-wrap: $(BLDDIR)/dgemv-c.o $(OUTDIR)/.make $(LIBDEP) $(EXTDEP) ifneq (0,$(OMP)) $(LD) -o $@ $< $(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(WRAP_GEMV) else ifneq (,$(strip $(OMPLIB))) $(LD) -o $@ $< $(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(WRAP_GEMV) \ $(XLIB_BEGIN) $(OMPLIB) $(XLIB_END) else # should not happen $(LD) -o $@ $< $(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(WRAP_GEMV) endif else .PHONY: $(OUTDIR)/dgemv-wrap endif .PHONY: test test: $(OUTDIR)/.make $(OUTDIR)/wrap-test.sh $(XFILES) @bash $(OUTDIR)/wrap-test.sh dgemm $(shell echo $$(($(TESTSIZE) * 1000))) @bash $(OUTDIR)/wrap-test.sh dgemm 350 16 20 350 35 350 1 0.0 @bash $(OUTDIR)/wrap-test.sh dgemm 200 200 200 256 256 256 1 0.0 @bash $(OUTDIR)/wrap-test.sh dgemm 24 23 21 32 32 32 -1 0.5 @bash $(OUTDIR)/wrap-test.sh dgemv 
$(shell echo $$(($(TESTSIZE) * 1000))) @bash $(OUTDIR)/wrap-test.sh dgemv 350 20 350 1 1 1 0 @bash $(OUTDIR)/wrap-test.sh dgemv 200 200 256 1 1 1 0 @bash $(OUTDIR)/wrap-test.sh dgemv 24 21 32 2 2 1 1 $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/utilities/wrap/README.md000066400000000000000000000017561415223013700210330ustar00rootroot00000000000000# Wrapped DGEMM This code sample is calling DGEMM and there 
is no dependency on the LIBXSMM API as it only relies on LAPACK/BLAS interface. Two variants are linked when building the source code: (1) code which is dynamically linked against LAPACK/BLAS, (2) code which is linked using `--wrap=`*symbol* as possible when using a GNU GCC compatible tool chain. For more information, see the [Call Wrapper](https://libxsmm.readthedocs.io/libxsmm_mm/#call-wrapper) section of the reference documentation. The same (source-)code will execute in three flavors when running `dgemm-test.sh`: (1) code variant which is dynamically linked against the originally supplied LAPACK/BLAS library, (2) code variant which is linked using the wrapper mechanism of the GNU GCC tool chain, and (3) the first code but using the LD_PRELOAD mechanism (available under Linux). **Command Line Interface (CLI)** * Optionally takes the number of repeated DGEMM calls * Shows the performance of the workload (wall time) libxsmm-1.17/samples/utilities/wrap/autobatch.c000066400000000000000000000076241415223013700216720ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #include #include #include #if !defined(ITYPE) # define ITYPE double #endif #if !defined(GEMM) # if defined(WRAP) # define GEMM LIBXSMM_BLAS_SYMBOL(ITYPE, gemm) # else /* prototype for LIBXSMM's wrapped GEMM; this way auto-batch can be tested as if GEMM calls are intercepted */ # define GEMM LIBXSMM_FSYMBOL(LIBXSMM_CONCATENATE(__wrap_, LIBXSMM_TPREFIX(ITYPE, gemm))) # endif #endif #if !defined(CALL_BEGIN_END) # define CALL_BEGIN_END #endif void GEMM(const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const ITYPE*, const ITYPE*, const libxsmm_blasint*, const ITYPE*, const libxsmm_blasint*, const ITYPE*, ITYPE*, const libxsmm_blasint*); int main(int argc, char* argv[]) { const libxsmm_blasint maxn = 1 < argc ? atoi(argv[1]) : 23; const libxsmm_blasint maxv = LIBXSMM_MIN(2 < argc ? atoi(argv[2]) : 2, maxn); const libxsmm_blasint size = 3 < argc ? atoi(argv[3]) : 1000; const libxsmm_blasint m = ((rand() % maxv) + 1) * maxn / maxv; const libxsmm_blasint n = ((rand() % maxv) + 1) * maxn / maxv; const libxsmm_blasint k = ((rand() % maxv) + 1) * maxn / maxv; const libxsmm_blasint lda = m, ldb = k, ldc = m; const ITYPE alpha = 1.0, beta = 0.0; const char transa = 'N', transb = 'N'; #if defined(CALL_BEGIN_END) const int flags = LIBXSMM_GEMM_FLAGS(transa, transb) # if 0 | LIBXSMM_MMBATCH_FLAG_SEQUENTIAL # endif # if 1 | LIBXSMM_MMBATCH_FLAG_STATISTIC # endif ; #endif ITYPE *a = 0, *b = 0, *c = 0; int result = EXIT_SUCCESS, i; libxsmm_init(); a = (ITYPE*)malloc((size_t)maxn * (size_t)maxn * sizeof(ITYPE)); b = (ITYPE*)malloc((size_t)maxn * (size_t)maxn * sizeof(ITYPE)); c = (ITYPE*)malloc((size_t)maxn * (size_t)maxn * sizeof(ITYPE)); if (0 == a || 0 == b || 0 == c) result = EXIT_FAILURE; if (EXIT_SUCCESS == result) { LIBXSMM_MATINIT_OMP(ITYPE, 42, a, maxn, maxn, maxn, 1.0); LIBXSMM_MATINIT_OMP(ITYPE, 24, b, maxn, maxn, maxn, 1.0); 
LIBXSMM_MATINIT_OMP(ITYPE, 0, c, maxn, maxn, maxn, 1.0); #if defined(_OPENMP) # pragma omp parallel private(i) #endif { #if defined(CALL_BEGIN_END) # if defined(_OPENMP) # pragma omp single nowait # endif /* enable batch-recording of the specified matrix multiplication */ libxsmm_mmbatch_begin(LIBXSMM_GEMM_PRECISION(ITYPE), &flags, &m, &n, &k, &lda, &ldb, &ldc, &alpha, &beta); #endif #if defined(_OPENMP) # pragma omp for #endif for (i = 0; i < size; ++i) { const libxsmm_blasint mi = ((rand() % maxv) + 1) * maxn / maxv; const libxsmm_blasint ni = ((rand() % maxv) + 1) * maxn / maxv; const libxsmm_blasint ki = ((rand() % maxv) + 1) * maxn / maxv; const libxsmm_blasint ilda = mi, ildb = ki, ildc = mi; assert(0 < mi && 0 < ni && 0 < ki && mi <= ilda && ki <= ildb && mi <= ildc); GEMM(&transa, &transb, &mi, &ni, &ki, &alpha, a, &ilda, b, &ildb, &beta, c, &ildc); } #if defined(CALL_BEGIN_END) # if defined(_OPENMP) # pragma omp single nowait # endif /* disable/flush multiplication batch */ libxsmm_mmbatch_end(); #endif } } libxsmm_finalize(); free(a); free(b); free(c); return result; } libxsmm-1.17/samples/utilities/wrap/autobatch.sh000077500000000000000000000050141415223013700220540ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/wrap/autobatch.vcxproj000066400000000000000000000547561415223013700231530ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 autobatch 
{7C20EC72-D155-4C70-87EB-855CA95733A8} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/utilities/wrap/dgemm-blas.sh000077500000000000000000000050141415223013700221120ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/wrap/dgemm-wrap.sh000077500000000000000000000050141415223013700221420ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) 
Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ 
${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/wrap/dgemm.c000066400000000000000000000064551415223013700210120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include #include #if !defined(BLASINT_TYPE) # define BLASINT_TYPE int #endif /** Function prototype for DGEMM; this way any kind of LAPACK/BLAS library is sufficient at link-time. */ void dgemm_(const char*, const char*, const BLASINT_TYPE*, const BLASINT_TYPE*, const BLASINT_TYPE*, const double*, const double*, const BLASINT_TYPE*, const double*, const BLASINT_TYPE*, const double*, double*, const BLASINT_TYPE*); void init(int seed, double* dst, BLASINT_TYPE nrows, BLASINT_TYPE ncols, BLASINT_TYPE ld, double scale); void init(int seed, double* dst, BLASINT_TYPE nrows, BLASINT_TYPE ncols, BLASINT_TYPE ld, double scale) { const double seed1 = scale * (seed + 1); BLASINT_TYPE i = 0; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < ncols; ++i) { BLASINT_TYPE j = 0; for (; j < nrows; ++j) { const BLASINT_TYPE k = i * ld + j; dst[k] = (double)(seed1 / (k + 1)); } for (; j < ld; ++j) { const BLASINT_TYPE k = i * ld + j; dst[k] = (double)seed; } } } int main(int argc, char* argv[]) { int size = 2 == argc ? atoi(argv[1]) : 500; const BLASINT_TYPE m = 2 < argc ? atoi(argv[1]) : 23; const BLASINT_TYPE k = 3 < argc ? 
atoi(argv[3]) : m; const BLASINT_TYPE n = 2 < argc ? atoi(argv[2]) : k; const BLASINT_TYPE lda = 4 < argc ? atoi(argv[4]) : m; const BLASINT_TYPE ldb = 5 < argc ? atoi(argv[5]) : k; const BLASINT_TYPE ldc = 6 < argc ? atoi(argv[6]) : m; const double alpha = 7 < argc ? atof(argv[7]) : 1.0; const double beta = 8 < argc ? atof(argv[8]) : 1.0; const char transa = 'N', transb = 'N'; double *a = 0, *b = 0, *c = 0; int i; if (9 < argc) size = atoi(argv[9]); a = (double*)malloc(lda * k * sizeof(double)); b = (double*)malloc(ldb * n * sizeof(double)); c = (double*)malloc(ldc * n * sizeof(double)); printf("dgemm('%c', '%c', %i/*m*/, %i/*n*/, %i/*k*/,\n" " %g/*alpha*/, %p/*a*/, %i/*lda*/,\n" " %p/*b*/, %i/*ldb*/,\n" " %g/*beta*/, %p/*c*/, %i/*ldc*/)\n", transa, transb, m, n, k, alpha, (const void*)a, lda, (const void*)b, ldb, beta, (const void*)c, ldc); assert(0 != a && 0 != b && 0 != c); init(42, a, m, k, lda, 1.0); init(24, b, k, n, ldb, 1.0); init( 0, c, m, n, ldc, 1.0); for (i = 0; i < size; ++i) { dgemm_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } printf("Called %i times.\n", size); free(a); free(b); free(c); return EXIT_SUCCESS; } libxsmm-1.17/samples/utilities/wrap/dgemv-blas.sh000077500000000000000000000050141415223013700221230ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/wrap/dgemv-wrap.sh000077500000000000000000000050141415223013700221530ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) 
Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ 
${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=scatter,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/utilities/wrap/dgemv.c000066400000000000000000000062201415223013700210110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include #include #if !defined(BLASINT_TYPE) # define BLASINT_TYPE int #endif /** Function prototype for DGEMM; this way any kind of LAPACK/BLAS library is sufficient at link-time. */ void dgemv_(const char*, const BLASINT_TYPE*, const BLASINT_TYPE*, const double*, const double*, const BLASINT_TYPE*, const double*, const BLASINT_TYPE*, const double*, double*, const BLASINT_TYPE*); void init(int seed, double* dst, BLASINT_TYPE nrows, BLASINT_TYPE ncols, BLASINT_TYPE ld, double scale); void init(int seed, double* dst, BLASINT_TYPE nrows, BLASINT_TYPE ncols, BLASINT_TYPE ld, double scale) { const double seed1 = scale * (seed + 1); BLASINT_TYPE i = 0; #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < ncols; ++i) { BLASINT_TYPE j = 0; for (; j < nrows; ++j) { const BLASINT_TYPE k = i * ld + j; dst[k] = (double)(seed1 / (k + 1)); } for (; j < ld; ++j) { const BLASINT_TYPE k = i * ld + j; dst[k] = (double)seed; } } } int main(int argc, char* argv[]) { int size = 2 == argc ? atoi(argv[1]) : 500; const BLASINT_TYPE m = 2 < argc ? atoi(argv[1]) : 23; const BLASINT_TYPE n = 2 < argc ? atoi(argv[2]) : m; const BLASINT_TYPE lda = 3 < argc ? 
atoi(argv[3]) : m; const BLASINT_TYPE incx = 4 < argc ? atoi(argv[4]) : 1; const BLASINT_TYPE incy = 5 < argc ? atoi(argv[5]) : 1; const double alpha = 6 < argc ? atof(argv[6]) : 1.0; const double beta = 7 < argc ? atof(argv[7]) : 1.0; const char trans = 'N'; double *a = 0, *x = 0, *y = 0; int i; if (8 < argc) size = atoi(argv[8]); a = (double*)malloc(lda * n * sizeof(double)); x = (double*)malloc(incx * n * sizeof(double)); y = (double*)malloc(incy * m * sizeof(double)); printf("dgemv('%c', %i/*m*/, %i/*n*/,\n" " %g/*alpha*/, %p/*a*/, %i/*lda*/,\n" " %p/*x*/, %i/*incx*/,\n" " %g/*beta*/, %p/*y*/, %i/*incy*/)\n", trans, m, n, alpha, (const void*)a, lda, (const void*)x, incx, beta, (const void*)y, incy); assert(0 != a && 0 != x && 0 != y); init(42, a, m, n, lda, 1.0); init(24, x, n, 1, incx, 1.0); init( 0, y, m, 1, incy, 1.0); for (i = 0; i < size; ++i) { dgemv_(&trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); } printf("Called %i times.\n", size); free(a); free(x); free(y); return EXIT_SUCCESS; } libxsmm-1.17/samples/utilities/wrap/wrap-test.sh000077500000000000000000000054011415223013700220300ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) DEPDIR=${HERE}/../../.. 
TMPF=$(${DEPDIR}/.mktmp.sh /tmp/.libxsmm_XXXXXX.out) UNAME=$(command -v uname) GREP=$(command -v grep) SORT=$(command -v sort) RM=$(command -v rm) TR=$(command -v tr) if [ "Darwin" != "$(${UNAME})" ]; then LIBEXT=so else LIBEXT=dylib fi if [ "$1" ]; then TEST=$1 shift else TEST=dgemm fi if [ -e ${HERE}/${TEST}-blas ]; then NAME=$(echo ${TEST} | ${TR} [:lower:] [:upper:]) echo "=============================" echo "Running ${NAME} (ORIGINAL BLAS)" echo "=============================" { time ${HERE}/${TEST}-blas.sh "$@" 2>${TMPF}; } 2>&1 | ${GREP} real RESULT=$? if [ 0 != ${RESULT} ]; then echo -n "FAILED(${RESULT}) "; ${SORT} -u ${TMPF} ${RM} -f ${TMPF} exit ${RESULT} else echo -n "OK "; ${SORT} -u ${TMPF} fi echo if [ -e ${DEPDIR}/lib/libxsmmext.${LIBEXT} ]; then echo echo "=============================" echo "Running ${NAME} (LD_PRELOAD)" echo "=============================" { time \ LD_LIBRARY_PATH=${DEPDIR}/lib:${LD_LIBRARY_PATH} LD_PRELOAD=${DEPDIR}/lib/libxsmmext.${LIBEXT} \ DYLD_LIBRARY_PATH=${DEPDIR}/lib:${DYLD_LIBRARY_PATH} DYLD_INSERT_LIBRARIES=${DEPDIR}/lib/libxsmmext.${LIBEXT} \ ${HERE}/${TEST}-blas.sh "$@" 2>${TMPF}; } 2>&1 | ${GREP} real RESULT=$? if [ 0 != ${RESULT} ]; then echo -n "FAILED(${RESULT}) "; ${SORT} -u ${TMPF} ${RM} -f ${TMPF} exit ${RESULT} else echo -n "OK "; ${SORT} -u ${TMPF} fi echo fi fi if [ -e ${HERE}/${TEST}-wrap ] && [ -e .state ] && \ [ "" = "$(${GREP} 'BLAS=0' .state)" ]; then echo echo "=============================" echo "Running ${NAME} (STATIC WRAP)" echo "=============================" { time ${HERE}/${TEST}-wrap.sh "$@" 2>${TMPF}; } 2>&1 | ${GREP} real RESULT=$? 
if [ 0 != ${RESULT} ]; then echo -n "FAILED(${RESULT}) "; ${SORT} -u ${TMPF} ${RM} -f ${TMPF} exit ${RESULT} else echo -n "OK "; ${SORT} -u ${TMPF} fi echo fi ${RM} -f ${TMPF} libxsmm-1.17/samples/xgemm/000077500000000000000000000000001415223013700156745ustar00rootroot00000000000000libxsmm-1.17/samples/xgemm/Makefile000066400000000000000000000101071415223013700173330ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = ../.. SRCDIR = . INCDIR = . BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = -DLIBXSMM_BLAS_CONST BLAS = 2 OMP = 1 SYM = 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(wildcard $(SRCDIR)/*.cpp) CXXSRCS := $(wildcard $(SRCDIR)/*.cxx) CCXSRCS := $(wildcard $(SRCDIR)/*.cc) CSOURCS := $(wildcard $(SRCDIR)/*.c) CPPOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) CXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) CCXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) COBJCTS := $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) ifneq (,$(strip $(FC))) FXXSRCS := $(wildcard $(SRCDIR)/*.f) F77SRCS := $(wildcard $(SRCDIR)/*.F) F90SRCS := $(wildcard $(SRCDIR)/*.f90) $(wildcard $(SRCDIR)/*.F90) FXXOBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(FXXSRCS:.f=-f.o))) F77OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) F90OBJS := $(patsubst %,$(BLDDIR)/%,$(notdir $(F90OBJS:.F90=-f90.o))) endif SOURCES := $(CPPSRCS) 
$(CXXSRCS) $(CCXSRCS) $(CSOURCS) OBJECTS := $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) FTNSRCS := $(FXXSRCS) $(F77SRCS) $(F90SRCS) MODULES := $(addsuffix .mod,$(basename $(FTNSRCS))) $(addsuffix .modmic,$(basename $(FTNSRCS))) FTNOBJS := $(FXXOBJS) $(F77OBJS) $(F90OBJS) XFILES := $(OUTDIR)/$(OUTNAME) $(OUTDIR)/kernel $(OUTDIR)/bf16sgemm .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) $(OUTDIR)/$(OUTNAME): $(OUTDIR)/.make $(BLDDIR)/$(OUTNAME)-c.o $(LIBDEP) $(EXTDEP) $(LD) -o $@ $(BLDDIR)/$(OUTNAME)-c.o $(call cleanld,$(EXTLIB) $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS)) $(OUTDIR)/kernel: $(OUTDIR)/.make $(BLDDIR)/kernel-c.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/kernel-c.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(OUTDIR)/bf16sgemm: $(OUTDIR)/.make $(BLDDIR)/bf16sgemm-c.o $(LIBDEP) $(LD) -o $@ $(BLDDIR)/bf16sgemm-c.o $(MAINLIB) $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf 
$(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/samples/xgemm/README.md000066400000000000000000000070771415223013700171660ustar00rootroot00000000000000# XGEMM: Tiled GEMM Routines ## Overview This sample code calls the `libxsmm_?gemm_omp` routines provided by the LIBXSMM extension library (`libxsmmext`). These routines are meant for big(ger) xGEMM routines, and thereby provide an OpenMP-based parallelization. The driver program (`xgemm.c`) currently accepts all typical GEMM arguments (except for the transposition specifier): `m`, `n`, `k`, `lda`, `ldb`, `ldc`, `alpha`, and `beta`. All arguments are optional (or will inherit defaults from previously specified arguments). Matrix transposition as part of the `libxsmm_?gemm_omp` routines will become available in an upcoming release of LIBXSMM. Please also note that unsupported Alpha or Beta values will cause a fall back to the related BLAS routine. The single-precision matrix multiplications require to change the `ITYPE` in `xgemm.c`. ```bash ./xgemm.sh 2000 ``` ## OpenTuner To tune the tile sizes ("block sizes") internal to LIBXSMM, the [OpenTuner](http://opentuner.org/) extensible framework for program autotuning can be used. In case of issues during the tuning phase ("no value has been set for this column"), please install the latest 1.2.x revision of SQLAlchemy (`pip install sqlalchemy==1.2.19`). A tuning script (`xgemm_opentuner.py`) is provided, which optionally accepts a list of grouped parameters as command line arguments. 
The syntax of the arguments is per LIBXSMM's `MNK` build-option, and expands to "triplets" specifying the matrix shapes. For instance, four matrix multiplications of square-matrices can be benchmarked and tuned using the following command. ```bash ./xgemm_opentuner.py 1024,1280,1536,1792 ``` To start a tuning experiment for a new set of arguments, it is highly recommended to start from scratch. Otherwise the population of previously generated tuning results is fetched from a database and used to tune an unrelated range of matrix shapes. Optionally, the initial block size can be seeded (`tile-size-m`, `tile-size-n`, and `tile-size-k`). ```bash rm -rf opentuner.db ``` The script tunes the geometric mean of the performance for each of the requested triplets. However, the optimizer not only maximizes the performance but also minimizes the value of *M \* N \* K* (which also helps to prune duplicated results due to an additional preference). As a limitation of the current implementation, the multiplication kernels are not accompanied by copy-kernels (and not accompanied by transpose kernels). This negatively impacts performance on power-of-two matrix shapes (POT) due to trashing the LLC. However, it has been found, that tuning for POT shapes likely achieves superior performance when compared to tuning for non-POT shapes of the same range. 
```bash rm -rf opentuner.db ./xgemm_opentuner.py --no-dups 192,256,320,512,768 rm -rf opentuner.db ./xgemm_opentuner.py --no-dups 1024,1280,1536,1792 rm -rf opentuner.db ./xgemm_opentuner.py --no-dups 2048,2304,2560,2816 rm -rf opentuner.db ./xgemm_opentuner.py --no-dups 3072,3328,3584,3840 rm -rf opentuner.db ./xgemm_opentuner.py --no-dups 4096,4416,4736 rm -rf opentuner.db ./xgemm_opentuner.py --no-dups 5120,5440,5760 rm -rf opentuner.db ./xgemm_opentuner.py --no-dups 6144,6464,6784 rm -rf opentuner.db ./xgemm_opentuner.py --no-dups 7168,7488,7808 ``` Above, the series of matrix multiplications from 192-8K is separately tuned in eight ranges. The tuning script uses the environment variables `LIBXSMM_TGEMM_M`, `LIBXSMM_TGEMM_N`, and `LIBXSMM_TGEMM_K` which are internal to LIBXSMM. These variables are used to request a specific tiling-scheme within LIBXSMM's `libxsmm_?gemm_omp` routines. libxsmm-1.17/samples/xgemm/bf16sgemm.c000066400000000000000000000542141415223013700176350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include #include #include #if defined(_OPENMP) # include #endif #if defined(__MKL) # include #else LIBXSMM_BLAS_SYMBOL_DECL(float, gemm) #endif #if 0 #define __USE_NATIVE_CPX__ #endif #ifdef __AVX512BW__ #ifndef __USE_NATIVE_CPX__ #define _mm512_bf16cvt(A) _mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16((A)),16)) #define _mm512_bf16store(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16((B)),16))) #endif size_t sgemm_trup_get_scratch( const int* m, const int* n, const int* k ) { size_t memam = 0; assert( ((*m) % 64) == 0 ); assert( (((*n)-1) % 64) == 0 ); assert( ((*k) % 64) == 0 ); memam += (*m)*(*k)*sizeof(float); memam += (*n)*(*k)*sizeof(float); memam += ((*m)*(*k)*sizeof(size_t))/(64*64); memam += ((*n-1)*(*k)*sizeof(size_t))/(64*64); memam += ((*n-1)*(*m)*sizeof(size_t))/(64*64); return memam; } void sgemm_trup( const char* transa, const char* transb, const int* m, const int* n, const int* k, const float* alpha, const float* a, const int* lda, const float* b, const int* ldb, const float* beta, float* c, const int* ldc, void* scratch ) { /* tmpa and tmpb pointers for block storage */ float* tmpa = (float*)scratch; float* tmpb = tmpa+((*k)*(*lda)); /* (fixed) blocking factors */ int bm = 64; int bn = 64; int bk = 64; int bn2 = 65; int Bm = (*m)/bm; int Bn = ((*n)-1)/bn; int Bk = (*k)/bk; unsigned long long Bkbr = (unsigned long long)Bk; int BnB = 8; int BmB = Bm; /* helper arrays for mixed shaped tile tensors */ size_t* poa = (size_t*)(tmpb + (*n)*(*ldb)); size_t* pob = poa + (Bm*Bk); size_t* poc = pob + (Bn*Bk); /* mult-dim array definitions for readable code */ LIBXSMM_VLA_DECL( 2, const float, origa, a, (*lda) ); LIBXSMM_VLA_DECL( 2, const float, origb, b, (*ldb) ); /* organization of tile offsets */ LIBXSMM_VLA_DECL( 2, size_t, offa, poa, Bk); LIBXSMM_VLA_DECL( 2, size_t, offb, pob, 
Bk); LIBXSMM_VLA_DECL( 2, size_t, offc, poc, Bm); /* jitted libxsmm batch reduce kernel for compute */ libxsmm_smmfunction_reducebatch_strd fluxcapacitor = libxsmm_smmdispatch_reducebatch_strd( bm, bn, bk, bk*bm*sizeof(float), bk*bn*sizeof(float), &bm, &bk, ldc, NULL, NULL, NULL, NULL); libxsmm_smmfunction_reducebatch_strd fluxcapacitor2 = libxsmm_smmdispatch_reducebatch_strd( bm, bn2, bk, bk*bm*sizeof(float), bk*bn2*sizeof(float), &bm, &bk, ldc, NULL, NULL, NULL, NULL); /* tmp counters */ int lm1, ln1, lk1, lm2, ln2, lk2, lno, Bne, lmo, Bme; /* some checks */ assert( ((*m) % 64) == 0 ); assert( (((*n)-1) % 64) == 0 ); assert( ((*k) % 64) == 0 ); assert( ((*lda) % 64) == 0 ); assert( ((*ldb) % 64) == 0 ); assert( ((*ldc) % 64) == 0 ); assert( *alpha == -1.0f ); assert( *beta == 1.0f ); assert( *transa == 'N' ); assert( *transb == 'N' ); for ( lm1 = 0; lm1 < Bm; ++lm1 ) { for ( lk1 = 0; lk1 < Bk; ++lk1 ) { LIBXSMM_VLA_ACCESS( 2, offa, lm1, lk1, Bk ) = ((size_t)bm*bk*lk1) + ((size_t)lm1*bm*(*k)); } } for ( ln1 = 0; ln1 < Bn; ++ln1 ) { for ( lk1 = 0; lk1 < Bk; ++lk1 ) { if ( ln1 == Bn-1 ) { LIBXSMM_VLA_ACCESS( 2, offb, ln1, lk1, Bk ) = ((size_t)bn2*bk*lk1) + ((size_t)ln1*bn*(*k)); } else { LIBXSMM_VLA_ACCESS( 2, offb, ln1, lk1, Bk ) = ((size_t)bn*bk*lk1) + ((size_t)ln1*bn*(*k)); } } } for ( ln1 = 0; ln1 < Bn; ++ln1 ) { for ( lm1 = 0; lm1 < Bm; ++lm1 ) { LIBXSMM_VLA_ACCESS( 2, offc, ln1, lm1, Bm ) = ((size_t)bm*lm1) + ((size_t)ln1*bn*(*m)); } } #if defined(_OPENMP) # pragma omp parallel private(lm1, lm2, ln1, ln2, lk1, lk2, lno, Bne, lmo, Bme) #endif { for ( lmo = 0; lmo < Bm; lmo += BmB ) { Bme = (lmo+BmB > Bm) ? Bm : lmo+BmB; for ( lno = 0; lno < Bn; lno += BnB ) { Bne = (lno+BnB > Bn) ? Bn : lno+BnB; #if defined(_OPENMP) # pragma omp for private(ln1, ln2, lk1, lk2) LIBXSMM_OPENMP_COLLAPSE(2) #endif for ( ln1 = lno; ln1 < Bne; ++ln1 ) { for ( lk1 = 0; lk1 < Bk; ++lk1 ) { int mybn = ( ln1 == Bn-1 ) ? 
bn2 : bn; for ( ln2 = 0; ln2 < mybn; ++ln2 ) { float* tmpaddr1 = tmpb + LIBXSMM_VLA_ACCESS( 2, offb, ln1, lk1, Bk ) + ln2*bk; const float* tmpaddr2 = &LIBXSMM_VLA_ACCESS( 2, origb, (ln1*bn)+ln2, (lk1*bk), (*ldb) ); _mm512_storeu_ps( tmpaddr1, _mm512_loadu_ps( tmpaddr2 ) ); _mm512_storeu_ps( tmpaddr1+16, _mm512_loadu_ps( tmpaddr2+16 ) ); _mm512_storeu_ps( tmpaddr1+32, _mm512_loadu_ps( tmpaddr2+32 ) ); _mm512_storeu_ps( tmpaddr1+48, _mm512_loadu_ps( tmpaddr2+48 ) ); } } } #if defined(_OPENMP) # pragma omp for private(lm1, ln1, lk1, lk2) LIBXSMM_OPENMP_COLLAPSE(2) #endif for ( lm1 = lmo; lm1 < Bme; ++lm1 ) { /* we prepare a bm*K tile of A in L1/L2 cache */ for ( lk1 = 0; lk1 < Bk; ++lk1 ) { __m512 vmone = _mm512_set1_ps( -1.0f ); for ( lk2 = 0; lk2 < bk; ++lk2 ) { float* tmpaddr1 = tmpa + LIBXSMM_VLA_ACCESS( 2, offa, lm1, lk1, Bk ) + lk2*bm; const float* tmpaddr2 = &LIBXSMM_VLA_ACCESS( 2, origa, (lk1*bk)+lk2, (lm1*bm), (*lda) ); _mm512_storeu_ps( tmpaddr1, _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2 ) ) ); _mm512_storeu_ps( tmpaddr1+16, _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2+16 ) ) ); _mm512_storeu_ps( tmpaddr1+32, _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2+32 ) ) ); _mm512_storeu_ps( tmpaddr1+48, _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2+48 ) ) ); } } } #if defined(_OPENMP) # pragma omp for private(lm1, ln1) LIBXSMM_OPENMP_COLLAPSE(2) #endif for ( lm1 = lmo; lm1 < Bme; ++lm1 ) { for ( ln1 = lno; ln1 < Bne; ++ln1 ) { if ( ln1 == (Bn - 1) ) { fluxcapacitor2( tmpa + LIBXSMM_VLA_ACCESS( 2, offa, lm1, 0, Bk ), tmpb + LIBXSMM_VLA_ACCESS( 2, offb, ln1, 0, Bk ), c + LIBXSMM_VLA_ACCESS( 2, offc, ln1, lm1, (*ldc) ), &Bkbr ); } else { fluxcapacitor( tmpa + LIBXSMM_VLA_ACCESS( 2, offa, lm1, 0, Bk ), tmpb + LIBXSMM_VLA_ACCESS( 2, offb, ln1, 0, Bk ), c + LIBXSMM_VLA_ACCESS( 2, offc, ln1, lm1, (*ldc) ), &Bkbr ); } } } } } } } size_t bf16sgemm_trup_get_scratch( const int* m, const int* n, const int* k ) { size_t memam = 0; assert( ((*m) % 64) == 0 ); assert( 
(((*n)-1) % 64) == 0 ); assert( ((*k) % 64) == 0 ); memam += (*m)*(*k)*sizeof(libxsmm_bfloat16); memam += (*n)*(*k)*sizeof(libxsmm_bfloat16); memam += ((*m)*(*k)*sizeof(size_t))/(64*64); memam += ((*n-1)*(*k)*sizeof(size_t))/(64*64); memam += ((*n-1)*(*m)*sizeof(size_t))/(64*64); return memam; } void bf16sgemm_trup( const char* transa, const char* transb, const int* m, const int* n, const int* k, const float* alpha, const float* a, const int* lda, const float* b, const int* ldb, const float* beta, float* c, const int* ldc, void* scratch ) { /* tmpa and tmpb pointers for block storage */ libxsmm_bfloat16* tmpa = (libxsmm_bfloat16*)scratch; libxsmm_bfloat16* tmpb = tmpa+((*k)*(*lda)); /* (fixed) blocking factors */ int bm = 64; int bn = 64; int bk = 64; int bn2 = 65; int Bm = (*m)/bm; int Bn = ((*n)-1)/bn; int Bk = (*k)/bk; unsigned long long Bkbr = (unsigned long long)Bk; int BnB = 8; int BmB = Bm; /* helper arrays for mixed shaped tile tensors */ size_t* poa = (size_t*)(tmpb + (*n)*(*ldb)); size_t* pob = poa + (Bm*Bk); size_t* poc = pob + (Bn*Bk); /* mult-dim array definitions for readable code */ LIBXSMM_VLA_DECL( 2, const float, origa, a, (*lda) ); LIBXSMM_VLA_DECL( 2, const float, origb, b, (*ldb) ); /* organization of tile offsets */ LIBXSMM_VLA_DECL( 2, size_t, offa, poa, Bk); LIBXSMM_VLA_DECL( 2, size_t, offb, pob, Bk); LIBXSMM_VLA_DECL( 2, size_t, offc, poc, Bm); /* jitted libxsmm batch reduce kernel for compute */ libxsmm_bsmmfunction_reducebatch_strd fluxcapacitor = libxsmm_bsmmdispatch_reducebatch_strd( bm, bn, bk, bk*bm*sizeof(libxsmm_bfloat16), bk*bn*sizeof(libxsmm_bfloat16), &bm, &bk, ldc, NULL, NULL, NULL, NULL); libxsmm_bsmmfunction_reducebatch_strd fluxcapacitor2 = libxsmm_bsmmdispatch_reducebatch_strd( bm, bn2, bk, bk*bm*sizeof(libxsmm_bfloat16), bk*bn2*sizeof(libxsmm_bfloat16), &bm, &bk, ldc, NULL, NULL, NULL, NULL); /* tmp counters */ int lm1, ln1, lk1, lm2, ln2, lk2, lno, Bne, lmo, Bme; /* some checks */ assert( ((*m) % 64) == 0 ); assert( 
(((*n)-1) % 64) == 0 ); assert( ((*k) % 64) == 0 ); assert( ((*lda) % 64) == 0 ); assert( ((*ldb) % 64) == 0 ); assert( ((*ldc) % 64) == 0 ); assert( *alpha == -1.0f ); assert( *beta == 1.0f ); assert( *transa == 'N' ); assert( *transb == 'N' ); for ( lm1 = 0; lm1 < Bm; ++lm1 ) { for ( lk1 = 0; lk1 < Bk; ++lk1 ) { LIBXSMM_VLA_ACCESS( 2, offa, lm1, lk1, Bk ) = ((size_t)bm*bk*lk1) + ((size_t)lm1*bm*(*k)); } } for ( ln1 = 0; ln1 < Bn; ++ln1 ) { for ( lk1 = 0; lk1 < Bk; ++lk1 ) { if ( ln1 == Bn-1 ) { LIBXSMM_VLA_ACCESS( 2, offb, ln1, lk1, Bk ) = ((size_t)bn2*bk*lk1) + ((size_t)ln1*bn*(*k)); } else { LIBXSMM_VLA_ACCESS( 2, offb, ln1, lk1, Bk ) = ((size_t)bn*bk*lk1) + ((size_t)ln1*bn*(*k)); } } } for ( ln1 = 0; ln1 < Bn; ++ln1 ) { for ( lm1 = 0; lm1 < Bm; ++lm1 ) { LIBXSMM_VLA_ACCESS( 2, offc, ln1, lm1, Bm ) = ((size_t)bm*lm1) + ((size_t)ln1*bn*(*m)); } } #if defined(_OPENMP) # pragma omp parallel private(lm1, lm2, ln1, ln2, lk1, lk2, lno, Bne, lmo, Bme) #endif { for ( lmo = 0; lmo < Bm; lmo += BmB ) { Bme = (lmo+BmB > Bm) ? Bm : lmo+BmB; for ( lno = 0; lno < Bn; lno += BnB ) { Bne = (lno+BnB > Bn) ? Bn : lno+BnB; #if defined(_OPENMP) # pragma omp for private(ln1, ln2, lk1, lk2) LIBXSMM_OPENMP_COLLAPSE(2) #endif for ( ln1 = lno; ln1 < Bne; ++ln1 ) { for ( lk1 = 0; lk1 < Bk; ++lk1 ) { int mybn = ( ln1 == Bn-1 ) ? 
bn2 : bn; for ( ln2 = 0; ln2 < mybn; ++ln2 ) { libxsmm_bfloat16* tmpaddr1 = tmpb + LIBXSMM_VLA_ACCESS( 2, offb, ln1, lk1, Bk ) + ln2*bk; const float* tmpaddr2 = &LIBXSMM_VLA_ACCESS( 2, origb, (ln1*bn)+ln2, (lk1*bk), (*ldb) ); #ifdef __USE_NATIVE_CPX__ __m512i v0 = _mm512_cvtne2ps_pbh( _mm512_loadu_ps( tmpaddr2+16 ), _mm512_loadu_ps( tmpaddr2 ) ); __m512i v1 = _mm512_cvtne2ps_pbh( _mm512_loadu_ps( tmpaddr2+48 ), _mm512_loadu_ps( tmpaddr2+32 ) ); _mm512_storeu_si512( tmpaddr1, v0 ); _mm512_storeu_si512( tmpaddr1+32, v1 ); #else _mm512_bf16store( tmpaddr1, _mm512_loadu_ps( tmpaddr2 ) ); _mm512_bf16store( tmpaddr1+16, _mm512_loadu_ps( tmpaddr2+16 ) ); _mm512_bf16store( tmpaddr1+32, _mm512_loadu_ps( tmpaddr2+32 ) ); _mm512_bf16store( tmpaddr1+48, _mm512_loadu_ps( tmpaddr2+48 ) ); #endif } } } #if defined(_OPENMP) # pragma omp for private(lm1, ln1, lk1, lk2) LIBXSMM_OPENMP_COLLAPSE(2) #endif for ( lm1 = lmo; lm1 < Bme; ++lm1 ) { /* we prepare a bm*K tile of A in L1/L2 cache */ for ( lk1 = 0; lk1 < Bk; ++lk1 ) { __m512 vmone = _mm512_set1_ps( -1.0f ); const __m512i perm_index = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); for ( lk2 = 0; lk2 < bk; lk2+=2 ) { libxsmm_bfloat16* tmpaddr1 = tmpa + LIBXSMM_VLA_ACCESS( 2, offa, lm1, lk1, Bk ) + lk2*bm; const float* tmpaddr2a = &LIBXSMM_VLA_ACCESS( 2, origa, (lk1*bk)+lk2, (lm1*bm), (*lda) ); const float* tmpaddr2b = &LIBXSMM_VLA_ACCESS( 2, origa, (lk1*bk)+lk2+1, (lm1*bm), (*lda) ); #ifdef __USE_NATIVE_CPX__ __m512i vba_0 = _mm512_cvtne2ps_pbh( _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2b ) ), _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2a ) ) ); __m512i vba_1 = _mm512_cvtne2ps_pbh( _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2b+16 ) ), _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2a+16 ) ) ); __m512i vba_2 = _mm512_cvtne2ps_pbh( _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2b+32 ) ), _mm512_mul_ps( vmone, 
_mm512_loadu_ps( tmpaddr2a+32 ) ) ); __m512i vba_3 = _mm512_cvtne2ps_pbh( _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2b+48 ) ), _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2a+48 ) ) ); #else __m256i a_0 = _mm512_bf16cvt( _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2a ) ) ); __m256i a_1 = _mm512_bf16cvt( _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2a+16 ) ) ); __m256i a_2 = _mm512_bf16cvt( _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2a+32 ) ) ); __m256i a_3 = _mm512_bf16cvt( _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2a+48 ) ) ); __m256i b_0 = _mm512_bf16cvt( _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2b ) ) ); __m256i b_1 = _mm512_bf16cvt( _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2b+16 ) ) ); __m256i b_2 = _mm512_bf16cvt( _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2b+32 ) ) ); __m256i b_3 = _mm512_bf16cvt( _mm512_mul_ps( vmone, _mm512_loadu_ps( tmpaddr2b+48 ) ) ); __m512i vba_0 = _mm512_inserti64x4( _mm512_castsi256_si512(a_0), b_0, 1); __m512i vba_1 = _mm512_inserti64x4( _mm512_castsi256_si512(a_1), b_1, 1); __m512i vba_2 = _mm512_inserti64x4( _mm512_castsi256_si512(a_2), b_2, 1); __m512i vba_3 = _mm512_inserti64x4( _mm512_castsi256_si512(a_3), b_3, 1); #endif _mm512_storeu_si512( tmpaddr1, _mm512_permutexvar_epi16(perm_index, vba_0 ) ); _mm512_storeu_si512( tmpaddr1+32, _mm512_permutexvar_epi16(perm_index, vba_1 ) ); _mm512_storeu_si512( tmpaddr1+64, _mm512_permutexvar_epi16(perm_index, vba_2 ) ); _mm512_storeu_si512( tmpaddr1+96, _mm512_permutexvar_epi16(perm_index, vba_3 ) ); } } } #if defined(_OPENMP) # pragma omp for private(lm1, ln1) LIBXSMM_OPENMP_COLLAPSE(2) #endif for ( lm1 = lmo; lm1 < Bme; ++lm1 ) { for ( ln1 = lno; ln1 < Bne; ++ln1 ) { if ( ln1 == (Bn - 1) ) { fluxcapacitor2( tmpa + LIBXSMM_VLA_ACCESS( 2, offa, lm1, 0, Bk ), tmpb + LIBXSMM_VLA_ACCESS( 2, offb, ln1, 0, Bk ), c + LIBXSMM_VLA_ACCESS( 2, offc, ln1, lm1, (*ldc) ), &Bkbr ); } else { fluxcapacitor( tmpa + LIBXSMM_VLA_ACCESS( 2, offa, lm1, 0, Bk ), tmpb + 
LIBXSMM_VLA_ACCESS( 2, offb, ln1, 0, Bk ), c + LIBXSMM_VLA_ACCESS( 2, offc, ln1, lm1, (*ldc) ), &Bkbr ); } } } } } } } #else size_t sgemm_trup_get_scratch( const int* m, const int* n, const int* k ) { size_t memam = 1; return memam; } void sgemm_trup( const char* transa, const char* transb, const int* m, const int* n, const int* k, const float* alpha, const float* a, const int* lda, const float* b, const int* ldb, const float* beta, float* c, const int* ldc, void* scratch ) { return; } size_t bf16sgemm_trup_get_scratch( const int* m, const int* n, const int* k ) { size_t memam = 1; return memam; } void bf16sgemm_trup( const char* transa, const char* transb, const int* m, const int* n, const int* k, const float* alpha, const float* a, const int* lda, const float* b, const int* ldb, const float* beta, float* c, const int* ldc, void* scratch ) { return; } #endif int main(int argc, char* argv []) { int M, N, K, LDA, LDB, LDC, iters; float alpha = -1.0f, beta = 1.0f; char transa = 'N', transb = 'N'; float *A, *B, *C, *Cgold, *Cbf16, *scratch, *scratch2; size_t i; double max_error; libxsmm_timer_tickint l_start; double l_runtime; double l_gflops; #ifndef __AVX512F__ printf("\nthe binary was built without AVX512 support, tests will fail and not run!!\n\n"); return EXIT_SUCCESS; #else if ( argc != 4 ) { printf("wrong arguments, required: ./%s N K iters\n", argv[0]); return EXIT_FAILURE; } M = atoi(argv[1]); N = M+1; K = atoi(argv[2]); iters = atoi(argv[3]); LDA = M; LDB = K; LDC = M; A = (float*)libxsmm_aligned_malloc( (size_t)M * (size_t)K * sizeof(float), 2097152 ); B = (float*)libxsmm_aligned_malloc( (size_t)N * (size_t)K * sizeof(float), 2097152 ); C = (float*)libxsmm_aligned_malloc( (size_t)M * (size_t)N * sizeof(float), 2097152 ); Cbf16 = (float*)libxsmm_aligned_malloc( (size_t)M * (size_t)N * sizeof(float), 2097152 ); Cgold = (float*)libxsmm_aligned_malloc( (size_t)M * (size_t)N * sizeof(float), 2097152 ); scratch = (void*)libxsmm_aligned_malloc( 
sgemm_trup_get_scratch( &M, &N, &K ) * sizeof(char), 2097152 ); scratch2 = (void*)libxsmm_aligned_malloc( bf16sgemm_trup_get_scratch( &M, &N, &K ) * sizeof(char), 2097152 ); l_gflops = ((double)M*(double)N*(double)K*2.0)/(double)1e9; /* init data */ for (i = 0; i < (size_t)M*(size_t)K; i++) { A[i] = (float)libxsmm_rng_f64(); } for (i = 0; i < (size_t)N*(size_t)K; i++) { B[i] = (float)libxsmm_rng_f64(); } for (i = 0; i < (size_t)N*(size_t)M; i++) { Cgold[i] = (float)libxsmm_rng_f64(); } for (i = 0; i < (size_t)N*(size_t)M; i++) { C[i] = Cgold[i]; } for (i = 0; i < (size_t)N*(size_t)M; i++) { Cbf16[i] = Cgold[i]; } /* call MKL and custom trup for correctness */ LIBXSMM_GEMM_SYMBOL(float)( &transa, &transb, &M, &N, &K, &alpha, A, &LDA, B, &LDB, &beta, Cgold, &LDC ); sgemm_trup( &transa, &transb, &M, &N, &K, &alpha, A, &LDA, B, &LDB, &beta, C, &LDC, scratch ); bf16sgemm_trup( &transa, &transb, &M, &N, &K, &alpha, A, &LDA, B, &LDB, &beta, Cbf16, &LDC, scratch2 ); /* check max error */ max_error = 0.0; for (i = 0; i < (size_t)N*(size_t)M; i++) { if ( fabs( Cgold[i] - C[i] ) > max_error ) { max_error = fabs( Cgold[i] - C[i] ); } } /* Print total max error */ printf("\n\n Total Max Error fp32-custom: %f\n", max_error ); /* check max error */ max_error = 0.0; for (i = 0; i < (size_t)N*(size_t)M; i++) { if ( fabs( Cgold[i] - Cbf16[i] ) > max_error ) { max_error = fabs( Cgold[i] - Cbf16[i] ); } } /* Print total max error */ printf(" Total Max Error bf16-custom: %f\n\n", max_error ); /* benchmark */ l_start = libxsmm_timer_tick(); for( i = 0; i < (size_t)iters; ++i ) { LIBXSMM_GEMM_SYMBOL(float)( &transa, &transb, &M, &N, &K, &alpha, A, &LDA, B, &LDB, &beta, Cgold, &LDC ); } l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); l_runtime = l_runtime / (double)iters; printf(" Performance SGEMM: %f GFLOPS %f s \n", l_gflops/l_runtime, l_runtime ); l_start = libxsmm_timer_tick(); for( i = 0; i < (size_t)iters; ++i ) { sgemm_trup( &transa, &transb, &M, &N, &K, &alpha, 
A, &LDA, B, &LDB, &beta, Cgold, &LDC, scratch ); } l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); l_runtime = l_runtime / (double)iters; printf(" Performance fp32-custom: %f GFLOPS %f s \n", l_gflops/l_runtime, l_runtime ); l_start = libxsmm_timer_tick(); for( i = 0; i < (size_t)iters; ++i ) { bf16sgemm_trup( &transa, &transb, &M, &N, &K, &alpha, A, &LDA, B, &LDB, &beta, Cgold, &LDC, scratch2 ); } l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); l_runtime = l_runtime / (double)iters; printf(" Performance bf16-custom: %f GFLOPS %f s \n\n", l_gflops/l_runtime, l_runtime ); libxsmm_free( A ); libxsmm_free( B ); libxsmm_free( C ); libxsmm_free( Cgold ); libxsmm_free( Cbf16 ); libxsmm_free( scratch ); libxsmm_free( scratch2 ); return EXIT_SUCCESS; #endif } libxsmm-1.17/samples/xgemm/bf16sgemm.vcxproj000066400000000000000000000552011415223013700211030ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 bf16sgemm 10.0 {0832FF15-970D-4241-A991-16F4166A4B09} Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
$(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false SingleFile 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false 
3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false SingleFile 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false 
3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/xgemm/kernel.c000066400000000000000000003560321415223013700173310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include #include #include #include typedef struct gemm_def { libxsmm_blasint m; libxsmm_blasint n; libxsmm_blasint k; libxsmm_blasint lda; libxsmm_blasint ldb; libxsmm_blasint ldc; double alpha; double beta; int trans_a; int trans_b; int aligned_a; int aligned_c; int prefetch; int br_type; libxsmm_blasint br_count; int br_unroll; } gemm_def; int g_reps = 0; LIBXSMM_INLINE void print_help(void) { printf("\n\n"); printf("1. 
Usage (dense*dense=dense, correctness and performance):\n"); printf(" M\n"); printf(" N\n"); printf(" K\n"); printf(" LDA\n"); printf(" LDB\n"); printf(" LDC\n"); printf(" alpha: 1\n"); printf(" beta: 0 or 1\n"); printf(" 0: unaligned A, otherwise aligned\n"); printf(" 0: unaligned C, otherwise aligned\n"); printf(" 0: A normal, 1: A trans\n"); printf(" 0: B normal, 1: B trans\n"); printf(" PREFETCH: nopf (none), pfsigonly, BL2viaC, AL2, curAL2, AL2_BL2viaC, curAL2_BL2viaC\n"); printf(" PRECISION: SP, DP, I16I32, USI8I32, SUI8I32, SUI8UI8, BF16F32, BF16\n"); printf(" BRGEMM: nobr, addrbr, offsbr, strdbr\n"); printf(" BRsize: 1 - N\n"); printf(" BRunroll: 0/1\n"); printf(" #repetitions\n"); printf("\n\n"); printf("2. Usage (dense*dense=dense, performance only option available):\n"); printf(" filename with space-sperated sizes (M N K LDA LDB LDC)\n"); printf(" alpha: 1\n"); printf(" beta: 0 or 1\n"); printf(" 0: unaligned A, otherwise aligned\n"); printf(" 0: unaligned C, otherwise aligned\n"); printf(" 0: A normal, 1: A trans\n"); printf(" 0: B normal, 1: B trans\n"); printf(" PRECISION: SP, DP, I16I32, USI8I32, SUI8I32, SUI8UI8, BF16F32, BF16\n"); printf(" BRGEMM: nobr, addrbr, offsbr, strdbr\n"); printf(" BRsize: 1 - N\n"); printf(" BRunroll: 0/1\n"); printf(" #repetitions\n"); printf(" 0: no check, otherwise: run check\n"); printf("\n\n"); } LIBXSMM_INLINE double run_jit_double( const gemm_def* i_gemm_def, const double* i_a, const double* i_b, double* o_c, const unsigned int i_print_jit_info) { /* define function pointer */ libxsmm_xmmfunction l_test_jit = { NULL }; libxsmm_timer_tickint l_start; libxsmm_mmkernel_info l_info; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); double l_jittime, l_runtime; size_t l_t, l_r; const double** l_a_addr = (const double**)malloc(i_gemm_def->br_count*sizeof(double*)); const double** l_b_addr = (const double**)malloc(i_gemm_def->br_count*sizeof(double*)); unsigned long long* l_a_offs = (unsigned long 
long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); unsigned long long* l_b_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); double l_alpha = i_gemm_def->alpha; double l_beta = i_gemm_def->beta; unsigned long long l_br = (unsigned long long)i_gemm_def->br_count; if (0 == i_gemm_def) { fprintf(stderr, "JIT: unsupported descriptor arguments or data type!\n"); return EXIT_FAILURE; } /* setup brgemm offsets */ if ( i_gemm_def->br_type == 2 ) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_offs[l_r] = l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k * sizeof(double); if (i_gemm_def->trans_b == 0) { l_b_offs[l_r] = l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n * sizeof(double); } else { l_b_offs[l_r] = l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->k * sizeof(double); } } } /* set up the flags */ if ( i_gemm_def->trans_b != 0 ) { l_flags |= LIBXSMM_GEMM_FLAG_TRANS_B; } if ( i_gemm_def->trans_a != 0 ) { fprintf(stderr, "trans_a needs to be 0\n"); return EXIT_FAILURE; } l_flags |= (0 != i_gemm_def->aligned_a ? LIBXSMM_GEMM_FLAG_ALIGN_A : 0); l_flags |= (0 != i_gemm_def->aligned_c ? 
LIBXSMM_GEMM_FLAG_ALIGN_C : 0); l_start = libxsmm_timer_tick(); if (i_gemm_def->br_type == 0) { l_test_jit.dmm = libxsmm_dmmdispatch(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else if (i_gemm_def->br_type == 1) { if (i_gemm_def->br_unroll == 0) { l_test_jit.dmra = libxsmm_dmmdispatch_reducebatch_addr(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.dmra = libxsmm_dmmdispatch_reducebatch_addr_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 2) { if (i_gemm_def->br_unroll == 0) { l_test_jit.dmro = libxsmm_dmmdispatch_reducebatch_offs(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.dmro = libxsmm_dmmdispatch_reducebatch_offs_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 3) { if (i_gemm_def->br_unroll == 0) { if (i_gemm_def->trans_b == 0) { l_test_jit.dmrs = libxsmm_dmmdispatch_reducebatch_strd(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(double), i_gemm_def->ldb*i_gemm_def->n*sizeof(double), &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.dmrs = libxsmm_dmmdispatch_reducebatch_strd(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(double), i_gemm_def->ldb*i_gemm_def->k*sizeof(double), 
&(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else { if (i_gemm_def->trans_b == 0) { l_test_jit.dmrs = libxsmm_dmmdispatch_reducebatch_strd_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(double), i_gemm_def->ldb*i_gemm_def->n*sizeof(double), i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.dmrs = libxsmm_dmmdispatch_reducebatch_strd_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(double), i_gemm_def->ldb*i_gemm_def->k*sizeof(double), i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } } else { /* nothing */ } l_jittime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if (l_test_jit.xmm == 0) { printf("JIT failed, please run with LIBXSMM_VERBOSE=-1 and/or with debug mode LIBXSMM library!\n"); exit(EXIT_FAILURE); } /* receive kernel information */ libxsmm_get_mmkernel_info(l_test_jit, &l_info); l_start = libxsmm_timer_tick(); if ( l_info.prefetch == LIBXSMM_GEMM_PREFETCH_NONE ) { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.dmm(i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (const double*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); if (i_gemm_def->trans_b == 0) { l_b_addr[l_r] = (const double*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } else { l_b_addr[l_r] = (const double*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->k); } } l_test_jit.dmra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.dmro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } 
} else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.dmrs(i_a, i_b, o_c, &l_br); } } } else { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.dmm(i_a, i_b, o_c, i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (const double*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); if (i_gemm_def->trans_b == 0) { l_b_addr[l_r] = (const double*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } else { l_b_addr[l_r] = (const double*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->k); } } l_test_jit.dmra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.dmro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.dmrs(i_a, i_b, o_c, &l_br); } } } l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if ( i_print_jit_info == 0 ) { printf("function pointer address: %llx\n", (unsigned long long)l_test_jit.xmm); printf("%fs for creating jit\n", l_jittime); } free( (void*)l_a_addr ); free( (void*)l_b_addr ); free( (void*)l_a_offs ); free( (void*)l_b_offs ); return l_runtime; } LIBXSMM_INLINE double run_jit_float( const gemm_def* i_gemm_def, const float* i_a, const float* i_b, float* o_c, const unsigned int i_print_jit_info ) { /* define function pointer */ libxsmm_xmmfunction l_test_jit = { NULL }; libxsmm_timer_tickint l_start; libxsmm_mmkernel_info l_info; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); double l_jittime, l_runtime; size_t l_t, l_r; const float** l_a_addr = (const float**)malloc(i_gemm_def->br_count*sizeof(float*)); const float** l_b_addr = (const float**)malloc(i_gemm_def->br_count*sizeof(float*)); unsigned long long* l_a_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned 
long long)); unsigned long long* l_b_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); float l_alpha = (float)i_gemm_def->alpha; float l_beta = (float)i_gemm_def->beta; unsigned long long l_br = (unsigned long long)i_gemm_def->br_count; if (0 == i_gemm_def) { fprintf(stderr, "JIT: unsupported descriptor arguments or data type!\n"); return EXIT_FAILURE; } /* setup brgemm offsets */ if ( i_gemm_def->br_type == 2 ) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_offs[l_r] = l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k * sizeof(float); if (i_gemm_def->trans_b == 0) { l_b_offs[l_r] = l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n * sizeof(float); } else { l_b_offs[l_r] = l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->k * sizeof(float); } } } /* set up the flags */ if ( i_gemm_def->trans_b != 0 ) { l_flags |= LIBXSMM_GEMM_FLAG_TRANS_B; } if ( i_gemm_def->trans_a != 0 ) { fprintf(stderr, "trans_a needs to be 0\n"); return EXIT_FAILURE; } l_flags |= (0 != i_gemm_def->aligned_a ? LIBXSMM_GEMM_FLAG_ALIGN_A : 0); l_flags |= (0 != i_gemm_def->aligned_c ? 
LIBXSMM_GEMM_FLAG_ALIGN_C : 0); l_start = libxsmm_timer_tick(); if (i_gemm_def->br_type == 0) { l_test_jit.smm = libxsmm_smmdispatch(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else if (i_gemm_def->br_type == 1) { if (i_gemm_def->br_unroll == 0) { l_test_jit.smra = libxsmm_smmdispatch_reducebatch_addr(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.smra = libxsmm_smmdispatch_reducebatch_addr_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 2) { if (i_gemm_def->br_unroll == 0) { l_test_jit.smro = libxsmm_smmdispatch_reducebatch_offs(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.smro = libxsmm_smmdispatch_reducebatch_offs_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 3) { if (i_gemm_def->br_unroll == 0) { if (i_gemm_def->trans_b == 0) { l_test_jit.smrs = libxsmm_smmdispatch_reducebatch_strd(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(float), i_gemm_def->ldb*i_gemm_def->n*sizeof(float), &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.smrs = libxsmm_smmdispatch_reducebatch_strd(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(float), i_gemm_def->ldb*i_gemm_def->k*sizeof(float), 
&(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else { if (i_gemm_def->trans_b == 0) { l_test_jit.smrs = libxsmm_smmdispatch_reducebatch_strd_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(float), i_gemm_def->ldb*i_gemm_def->n*sizeof(float), i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.smrs = libxsmm_smmdispatch_reducebatch_strd_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(float), i_gemm_def->ldb*i_gemm_def->k*sizeof(float), i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } } else { /* nothing */ } l_jittime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if (l_test_jit.xmm == 0) { printf("JIT failed, please run with LIBXSMM_VERBOSE=-1 and/or with debug mode LIBXSMM library!\n"); exit(EXIT_FAILURE); } /* receive kernel information */ libxsmm_get_mmkernel_info(l_test_jit, &l_info); l_start = libxsmm_timer_tick(); if ( l_info.prefetch == LIBXSMM_GEMM_PREFETCH_NONE ) { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.smm(i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (float*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); if (i_gemm_def->trans_b == 0) { l_b_addr[l_r] = (float*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } else { l_b_addr[l_r] = (float*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->k); } } l_test_jit.smra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.smro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if 
(i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.smrs(i_a, i_b, o_c, &l_br); } } } else { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.smm(i_a, i_b, o_c, i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (float*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); if (i_gemm_def->trans_b == 0) { l_b_addr[l_r] = (float*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } else { l_b_addr[l_r] = (float*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->k); } } l_test_jit.smra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.smro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.smrs(i_a, i_b, o_c, &l_br); } } } l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if ( i_print_jit_info == 0 ) { printf("function pointer address: %llx\n", (unsigned long long)l_test_jit.xmm); printf("%fs for creating jit\n", l_jittime); } free( (void*)l_a_addr ); free( (void*)l_b_addr ); free( (void*)l_a_offs ); free( (void*)l_b_offs ); return l_runtime; } LIBXSMM_INLINE double run_jit_short_int( const gemm_def* i_gemm_def, const short* i_a, const short* i_b, int* o_c, const unsigned int i_print_jit_info ) { /* define function pointer */ libxsmm_xmmfunction l_test_jit = { NULL }; libxsmm_timer_tickint l_start; libxsmm_mmkernel_info l_info; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); double l_jittime, l_runtime; size_t l_t, l_r; const short** l_a_addr = (const short**)malloc(i_gemm_def->br_count*sizeof(short*)); const short** l_b_addr = (const short**)malloc(i_gemm_def->br_count*sizeof(short*)); unsigned long long* l_a_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); unsigned long 
long* l_b_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); int l_alpha = (int)i_gemm_def->alpha; int l_beta = (int)i_gemm_def->beta; unsigned long long l_br = (unsigned long long)i_gemm_def->br_count; l_flags |= LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_VNNI_A; if (0 == i_gemm_def) { fprintf(stderr, "JIT: unsupported descriptor arguments or data type!\n"); return EXIT_FAILURE; } /* setup brgemm offsets */ if ( i_gemm_def->br_type == 2 ) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_offs[l_r] = l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k * sizeof(short); l_b_offs[l_r] = l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n * sizeof(short); } } /* set up the flags */ if ( i_gemm_def->trans_b != 0 ) { fprintf(stderr, "trans_b needs to be 0\n"); return EXIT_FAILURE; } if ( i_gemm_def->trans_a != 0 ) { fprintf(stderr, "trans_a needs to be 0\n"); return EXIT_FAILURE; } l_flags |= (0 != i_gemm_def->aligned_a ? LIBXSMM_GEMM_FLAG_ALIGN_A : 0); l_flags |= (0 != i_gemm_def->aligned_c ? 
LIBXSMM_GEMM_FLAG_ALIGN_C : 0); l_start = libxsmm_timer_tick(); if (i_gemm_def->br_type == 0) { l_test_jit.wimm = libxsmm_wimmdispatch(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else if (i_gemm_def->br_type == 1) { if (i_gemm_def->br_unroll == 0) { l_test_jit.wimra = libxsmm_wimmdispatch_reducebatch_addr(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.wimra = libxsmm_wimmdispatch_reducebatch_addr_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 2) { if (i_gemm_def->br_unroll == 0) { l_test_jit.wimro = libxsmm_wimmdispatch_reducebatch_offs(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.wimro = libxsmm_wimmdispatch_reducebatch_offs_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 3) { if (i_gemm_def->br_unroll == 0) { l_test_jit.wimrs = libxsmm_wimmdispatch_reducebatch_strd(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(short), i_gemm_def->ldb*i_gemm_def->n*sizeof(short), &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.wimrs = libxsmm_wimmdispatch_reducebatch_strd_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(short), i_gemm_def->ldb*i_gemm_def->n*sizeof(short), i_gemm_def->br_count, 
&(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else { /* nothing */ } l_jittime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if (l_test_jit.xmm == 0) { printf("JIT failed, please run with LIBXSMM_VERBOSE=-1 and/or with debug mode LIBXSMM library!\n"); exit(EXIT_FAILURE); } /* receive kernel information */ libxsmm_get_mmkernel_info(l_test_jit, &l_info); l_start = libxsmm_timer_tick(); if ( l_info.prefetch == LIBXSMM_GEMM_PREFETCH_NONE ) { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.wimm(i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (short*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); l_b_addr[l_r] = (short*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } l_test_jit.wimra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.wimro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.wimrs(i_a, i_b, o_c, &l_br); } } } else { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.wimm(i_a, i_b, o_c, i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (short*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); l_b_addr[l_r] = (short*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } l_test_jit.wimra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.wimro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.wimrs(i_a, i_b, o_c, &l_br); } } } l_runtime 
= libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if ( i_print_jit_info == 0 ) { printf("function pointer address: %llx\n", (unsigned long long)l_test_jit.xmm); printf("%fs for creating jit\n", l_jittime); } free( (void*)l_a_addr ); free( (void*)l_b_addr ); free( (void*)l_a_offs ); free( (void*)l_b_offs ); return l_runtime; } LIBXSMM_INLINE double run_jit_uschar_int( const gemm_def* i_gemm_def, const unsigned char* i_a, const char* i_b, int* o_c, const unsigned int i_print_jit_info ) { /* define function pointer */ libxsmm_xmmfunction l_test_jit = { NULL }; libxsmm_timer_tickint l_start; libxsmm_mmkernel_info l_info; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); double l_jittime, l_runtime; size_t l_t, l_r; const unsigned char** l_a_addr = (const unsigned char**)malloc(i_gemm_def->br_count*sizeof(unsigned char*)); const char** l_b_addr = (const char**)malloc(i_gemm_def->br_count*sizeof(char*)); unsigned long long* l_a_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); unsigned long long* l_b_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); int l_alpha = (int)i_gemm_def->alpha; int l_beta = (int)i_gemm_def->beta; unsigned long long l_br = (unsigned long long)i_gemm_def->br_count; l_flags |= LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_VNNI_A; if (0 == i_gemm_def) { fprintf(stderr, "JIT: unsupported descriptor arguments or data type!\n"); return EXIT_FAILURE; } /* setup brgemm offsets */ if ( i_gemm_def->br_type == 2 ) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_offs[l_r] = l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k * sizeof(unsigned char); l_b_offs[l_r] = l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n * sizeof(char); } } /* set up the flags */ if ( i_gemm_def->trans_b != 0 ) { fprintf(stderr, "trans_b needs to be 0\n"); return EXIT_FAILURE; } if ( i_gemm_def->trans_a != 0 ) { fprintf(stderr, "trans_a needs to be 0\n"); return EXIT_FAILURE; } l_flags |= (0 
!= i_gemm_def->aligned_a ? LIBXSMM_GEMM_FLAG_ALIGN_A : 0); l_flags |= (0 != i_gemm_def->aligned_c ? LIBXSMM_GEMM_FLAG_ALIGN_C : 0); l_start = libxsmm_timer_tick(); if (i_gemm_def->br_type == 0) { l_test_jit.usbimm = libxsmm_usbimmdispatch(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else if (i_gemm_def->br_type == 1) { if (i_gemm_def->br_unroll == 0) { l_test_jit.usbimra = libxsmm_usbimmdispatch_reducebatch_addr(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.usbimra = libxsmm_usbimmdispatch_reducebatch_addr_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 2) { if (i_gemm_def->br_unroll == 0) { l_test_jit.usbimro = libxsmm_usbimmdispatch_reducebatch_offs(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.usbimro = libxsmm_usbimmdispatch_reducebatch_offs_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 3) { if (i_gemm_def->br_unroll == 0) { l_test_jit.usbimrs = libxsmm_usbimmdispatch_reducebatch_strd(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(unsigned char), i_gemm_def->ldb*i_gemm_def->n*sizeof(char), &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.usbimrs = libxsmm_usbimmdispatch_reducebatch_strd_unroll(i_gemm_def->m, 
i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(unsigned char), i_gemm_def->ldb*i_gemm_def->n*sizeof(char), i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else { /* nothing */ } l_jittime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if (l_test_jit.xmm == 0) { printf("JIT failed, please run with LIBXSMM_VERBOSE=-1 and/or with debug mode LIBXSMM library!\n"); exit(EXIT_FAILURE); } /* receive kernel information */ libxsmm_get_mmkernel_info(l_test_jit, &l_info); l_start = libxsmm_timer_tick(); if ( l_info.prefetch == LIBXSMM_GEMM_PREFETCH_NONE ) { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.usbimm(i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (unsigned char*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); l_b_addr[l_r] = (char*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } l_test_jit.usbimra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.usbimro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.usbimrs(i_a, i_b, o_c, &l_br); } } } else { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.usbimm(i_a, i_b, o_c, i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (unsigned char*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); l_b_addr[l_r] = (char*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } l_test_jit.usbimra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { 
l_test_jit.usbimro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.usbimrs(i_a, i_b, o_c, &l_br); } } } l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if ( i_print_jit_info == 0 ) { printf("function pointer address: %llx\n", (unsigned long long)l_test_jit.xmm); printf("%fs for creating jit\n", l_jittime); } free( (void*)l_a_addr ); free( (void*)l_b_addr ); free( (void*)l_a_offs ); free( (void*)l_b_offs ); return l_runtime; } LIBXSMM_INLINE double run_jit_suchar_int( const gemm_def* i_gemm_def, const char* i_a, const unsigned char* i_b, int* o_c, const unsigned int i_print_jit_info ) { /* define function pointer */ libxsmm_xmmfunction l_test_jit = { NULL }; libxsmm_timer_tickint l_start; libxsmm_mmkernel_info l_info; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); double l_jittime, l_runtime; size_t l_t, l_r; const char** l_a_addr = (const char**)malloc(i_gemm_def->br_count*sizeof(char*)); const unsigned char** l_b_addr = (const unsigned char**)malloc(i_gemm_def->br_count*sizeof(unsigned char*)); unsigned long long* l_a_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); unsigned long long* l_b_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); int l_alpha = (int)i_gemm_def->alpha; int l_beta = (int)i_gemm_def->beta; unsigned long long l_br = (unsigned long long)i_gemm_def->br_count; l_flags |= LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_VNNI_A; if (0 == i_gemm_def) { fprintf(stderr, "JIT: unsupported descriptor arguments or data type!\n"); return EXIT_FAILURE; } /* setup brgemm offsets */ if ( i_gemm_def->br_type == 2 ) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_offs[l_r] = l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k * sizeof(char); l_b_offs[l_r] = l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n * sizeof(unsigned char); } } /* set up the flags */ if ( 
i_gemm_def->trans_b != 0 ) { fprintf(stderr, "trans_b needs to be 0\n"); return EXIT_FAILURE; } if ( i_gemm_def->trans_a != 0 ) { fprintf(stderr, "trans_a needs to be 0\n"); return EXIT_FAILURE; } l_flags |= (0 != i_gemm_def->aligned_a ? LIBXSMM_GEMM_FLAG_ALIGN_A : 0); l_flags |= (0 != i_gemm_def->aligned_c ? LIBXSMM_GEMM_FLAG_ALIGN_C : 0); l_start = libxsmm_timer_tick(); if (i_gemm_def->br_type == 0) { l_test_jit.subimm = libxsmm_subimmdispatch(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else if (i_gemm_def->br_type == 1) { if (i_gemm_def->br_unroll == 0) { l_test_jit.subimra = libxsmm_subimmdispatch_reducebatch_addr(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.subimra = libxsmm_subimmdispatch_reducebatch_addr_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 2) { if (i_gemm_def->br_unroll == 0) { l_test_jit.subimro = libxsmm_subimmdispatch_reducebatch_offs(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.subimro = libxsmm_subimmdispatch_reducebatch_offs_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 3) { if (i_gemm_def->br_unroll == 0) { l_test_jit.subimrs = libxsmm_subimmdispatch_reducebatch_strd(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(char), i_gemm_def->ldb*i_gemm_def->n*sizeof(unsigned char), 
&(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.subimrs = libxsmm_subimmdispatch_reducebatch_strd_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(char), i_gemm_def->ldb*i_gemm_def->n*sizeof(unsigned char), i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else { /* nothing */ } l_jittime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if (l_test_jit.xmm == 0) { printf("JIT failed, please run with LIBXSMM_VERBOSE=-1 and/or with debug mode LIBXSMM library!\n"); exit(EXIT_FAILURE); } /* receive kernel information */ libxsmm_get_mmkernel_info(l_test_jit, &l_info); l_start = libxsmm_timer_tick(); if ( l_info.prefetch == LIBXSMM_GEMM_PREFETCH_NONE ) { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.subimm(i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (char*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); l_b_addr[l_r] = (unsigned char*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } l_test_jit.subimra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.subimro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.subimrs(i_a, i_b, o_c, &l_br); } } } else { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.subimm(i_a, i_b, o_c, i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (char*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); l_b_addr[l_r] = (unsigned char*)i_b + 
(l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } l_test_jit.subimra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.subimro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.subimrs(i_a, i_b, o_c, &l_br); } } } l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if ( i_print_jit_info == 0 ) { printf("function pointer address: %llx\n", (unsigned long long)l_test_jit.xmm); printf("%fs for creating jit\n", l_jittime); } free( (void*)l_a_addr ); free( (void*)l_b_addr ); free( (void*)l_a_offs ); free( (void*)l_b_offs ); return l_runtime; } #if 0 LIBXSMM_INLINE double run_jit_uschar_uchar( const gemm_def* i_gemm_def, const unsigned char* i_a, const char* i_b, unsigned char* o_c, const unsigned int i_print_jit_info ) { return 0.0; } #endif LIBXSMM_INLINE double run_jit_suchar_uchar( const gemm_def* i_gemm_def, const char* i_a, const unsigned char* i_b, unsigned char* o_c, float i_scf, const unsigned int i_print_jit_info ) { /* define function pointer */ libxsmm_xmmfunction l_test_jit = { NULL }; libxsmm_timer_tickint l_start; libxsmm_mmkernel_info l_info; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); double l_jittime, l_runtime; size_t l_t, l_r; const char** l_a_addr = (const char**)malloc(i_gemm_def->br_count*sizeof(char*)); const unsigned char** l_b_addr = (const unsigned char**)malloc(i_gemm_def->br_count*sizeof(unsigned char*)); unsigned long long* l_a_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); unsigned long long* l_b_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); int l_alpha = (int)i_gemm_def->alpha; int l_beta = (int)i_gemm_def->beta; unsigned long long l_br = (unsigned long long)i_gemm_def->br_count; l_flags |= LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_VNNI_A; if (0 == 
i_gemm_def) { fprintf(stderr, "JIT: unsupported descriptor arguments or data type!\n"); return EXIT_FAILURE; } /* setup brgemm offsets */ if ( i_gemm_def->br_type == 2 ) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_offs[l_r] = l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k * sizeof(char); l_b_offs[l_r] = l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n * sizeof(unsigned char); } } /* set up the flags */ if ( i_gemm_def->trans_b != 0 ) { fprintf(stderr, "trans_b needs to be 0\n"); return EXIT_FAILURE; } if ( i_gemm_def->trans_a != 0 ) { fprintf(stderr, "trans_a needs to be 0\n"); return EXIT_FAILURE; } l_flags |= (0 != i_gemm_def->aligned_a ? LIBXSMM_GEMM_FLAG_ALIGN_A : 0); l_flags |= (0 != i_gemm_def->aligned_c ? LIBXSMM_GEMM_FLAG_ALIGN_C : 0); l_start = libxsmm_timer_tick(); if (i_gemm_def->br_type == 0) { l_test_jit.sububmm = libxsmm_sububmmdispatch(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else if (i_gemm_def->br_type == 1) { if (i_gemm_def->br_unroll == 0) { l_test_jit.sububmra = libxsmm_sububmmdispatch_reducebatch_addr(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.sububmra = libxsmm_sububmmdispatch_reducebatch_addr_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 2) { if (i_gemm_def->br_unroll == 0) { l_test_jit.sububmro = libxsmm_sububmmdispatch_reducebatch_offs(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.sububmro = libxsmm_sububmmdispatch_reducebatch_offs_unroll(i_gemm_def->m, 
i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 3) { if (i_gemm_def->br_unroll == 0) { l_test_jit.sububmrs = libxsmm_sububmmdispatch_reducebatch_strd(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(char), i_gemm_def->ldb*i_gemm_def->n*sizeof(unsigned char), &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.sububmrs = libxsmm_sububmmdispatch_reducebatch_strd_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(char), i_gemm_def->ldb*i_gemm_def->n*sizeof(unsigned char), i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else { /* nothing */ } l_jittime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if (l_test_jit.xmm == 0) { printf("JIT failed, please run with LIBXSMM_VERBOSE=-1 and/or with debug mode LIBXSMM library!\n"); exit(EXIT_FAILURE); } /* receive kernel information */ libxsmm_get_mmkernel_info(l_test_jit, &l_info); l_start = libxsmm_timer_tick(); if ( l_info.prefetch == LIBXSMM_GEMM_PREFETCH_NONE ) { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.sububmm(i_a, i_b, o_c, &i_scf); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (char*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); l_b_addr[l_r] = (unsigned char*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } l_test_jit.sububmra(l_a_addr, l_b_addr, o_c, &l_br, &i_scf); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.sububmro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs, &i_scf); } } else if 
(i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.sububmrs(i_a, i_b, o_c, &l_br, &i_scf); } } } else { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.sububmm(i_a, i_b, o_c, &i_scf); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (char*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); l_b_addr[l_r] = (unsigned char*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } l_test_jit.sububmra(l_a_addr, l_b_addr, o_c, &l_br, &i_scf); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.sububmro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs, &i_scf); } } else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.sububmrs(i_a, i_b, o_c, &l_br, &i_scf); } } } l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if ( i_print_jit_info == 0 ) { printf("function pointer address: %llx\n", (unsigned long long)l_test_jit.xmm); printf("%fs for creating jit\n", l_jittime); } free( (void*)l_a_addr ); free( (void*)l_b_addr ); free( (void*)l_a_offs ); free( (void*)l_b_offs ); return l_runtime; } LIBXSMM_INLINE double run_jit_bfloat16_float( const gemm_def* i_gemm_def, const libxsmm_bfloat16* i_a, const libxsmm_bfloat16* i_b, float* o_c, const unsigned int i_print_jit_info ) { /* define function pointer */ libxsmm_xmmfunction l_test_jit = { NULL }; libxsmm_timer_tickint l_start; libxsmm_mmkernel_info l_info; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); double l_jittime, l_runtime; size_t l_t, l_r; const libxsmm_bfloat16** l_a_addr = (const libxsmm_bfloat16**)malloc(i_gemm_def->br_count*sizeof(libxsmm_bfloat16*)); const libxsmm_bfloat16** l_b_addr = (const libxsmm_bfloat16**)malloc(i_gemm_def->br_count*sizeof(libxsmm_bfloat16*)); unsigned long long* l_a_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); 
unsigned long long* l_b_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); float l_alpha = (float)i_gemm_def->alpha; float l_beta = (float)i_gemm_def->beta; unsigned long long l_br = (unsigned long long)i_gemm_def->br_count; l_flags |= LIBXSMM_GEMM_FLAG_VNNI_A; if (0 == i_gemm_def) { fprintf(stderr, "JIT: unsupported descriptor arguments or data type!\n"); return EXIT_FAILURE; } /* setup brgemm offsets */ if ( i_gemm_def->br_type == 2 ) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_offs[l_r] = l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k * sizeof(libxsmm_bfloat16); l_b_offs[l_r] = l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n * sizeof(libxsmm_bfloat16); } } /* set up the flags */ if ( i_gemm_def->trans_b != 0 ) { fprintf(stderr, "trans_b needs to be 0\n"); return EXIT_FAILURE; } if ( i_gemm_def->trans_a != 0 ) { fprintf(stderr, "trans_a needs to be 0\n"); return EXIT_FAILURE; } l_flags |= (0 != i_gemm_def->aligned_a ? LIBXSMM_GEMM_FLAG_ALIGN_A : 0); l_flags |= (0 != i_gemm_def->aligned_c ? 
LIBXSMM_GEMM_FLAG_ALIGN_C : 0); l_start = libxsmm_timer_tick(); if (i_gemm_def->br_type == 0) { l_test_jit.bsmm = libxsmm_bsmmdispatch(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else if (i_gemm_def->br_type == 1) { if (i_gemm_def->br_unroll == 0) { l_test_jit.bsmra = libxsmm_bsmmdispatch_reducebatch_addr(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.bsmra = libxsmm_bsmmdispatch_reducebatch_addr_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 2) { if (i_gemm_def->br_unroll == 0) { l_test_jit.bsmro = libxsmm_bsmmdispatch_reducebatch_offs(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.bsmro = libxsmm_bsmmdispatch_reducebatch_offs_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 3) { if (i_gemm_def->br_unroll == 0) { l_test_jit.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(libxsmm_bfloat16), i_gemm_def->ldb*i_gemm_def->n*sizeof(libxsmm_bfloat16), &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(libxsmm_bfloat16), 
i_gemm_def->ldb*i_gemm_def->n*sizeof(libxsmm_bfloat16), i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else { /* nothing */ } l_jittime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if (l_test_jit.xmm == 0) { printf("JIT failed, please run with LIBXSMM_VERBOSE=-1 and/or with debug mode LIBXSMM library!\n"); exit(EXIT_FAILURE); } /* receive kernel information */ libxsmm_get_mmkernel_info(l_test_jit, &l_info); l_start = libxsmm_timer_tick(); if ( l_info.prefetch == LIBXSMM_GEMM_PREFETCH_NONE ) { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.bsmm(i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (libxsmm_bfloat16*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); l_b_addr[l_r] = (libxsmm_bfloat16*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } l_test_jit.bsmra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.bsmro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.bsmrs(i_a, i_b, o_c, &l_br); } } } else { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.bsmm(i_a, i_b, o_c, i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (libxsmm_bfloat16*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); l_b_addr[l_r] = (libxsmm_bfloat16*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } l_test_jit.bsmra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.bsmro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if 
(i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.bsmrs(i_a, i_b, o_c, &l_br); } } } l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if ( i_print_jit_info == 0 ) { printf("function pointer address: %llx\n", (unsigned long long)l_test_jit.xmm); printf("%fs for creating jit\n", l_jittime); } free( (void*)l_a_addr ); free( (void*)l_b_addr ); free( (void*)l_a_offs ); free( (void*)l_b_offs ); return l_runtime; } LIBXSMM_INLINE double run_jit_bfloat16( const gemm_def* i_gemm_def, const libxsmm_bfloat16* i_a, const libxsmm_bfloat16* i_b, libxsmm_bfloat16* o_c, const unsigned int i_print_jit_info ) { /* define function pointer */ libxsmm_xmmfunction l_test_jit = { NULL }; libxsmm_timer_tickint l_start; libxsmm_mmkernel_info l_info; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); double l_jittime, l_runtime; size_t l_t, l_r; const libxsmm_bfloat16** l_a_addr = (const libxsmm_bfloat16**)malloc(i_gemm_def->br_count*sizeof(libxsmm_bfloat16*)); const libxsmm_bfloat16** l_b_addr = (const libxsmm_bfloat16**)malloc(i_gemm_def->br_count*sizeof(libxsmm_bfloat16*)); unsigned long long* l_a_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); unsigned long long* l_b_offs = (unsigned long long*)malloc(i_gemm_def->br_count*sizeof(unsigned long long)); float l_alpha = (float)i_gemm_def->alpha; float l_beta = (float)i_gemm_def->beta; unsigned long long l_br = (unsigned long long)i_gemm_def->br_count; l_flags |= LIBXSMM_GEMM_FLAG_VNNI_A; if (0 == i_gemm_def) { fprintf(stderr, "JIT: unsupported descriptor arguments or data type!\n"); return EXIT_FAILURE; } /* setup brgemm offsets */ if ( i_gemm_def->br_type == 2 ) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_offs[l_r] = l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k * sizeof(libxsmm_bfloat16); l_b_offs[l_r] = l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n * sizeof(libxsmm_bfloat16); } } /* set up the flags */ if ( i_gemm_def->trans_b != 0 
) { fprintf(stderr, "trans_b needs to be 0\n"); return EXIT_FAILURE; } if ( i_gemm_def->trans_a != 0 ) { fprintf(stderr, "trans_a needs to be 0\n"); return EXIT_FAILURE; } l_flags |= (0 != i_gemm_def->aligned_a ? LIBXSMM_GEMM_FLAG_ALIGN_A : 0); l_flags |= (0 != i_gemm_def->aligned_c ? LIBXSMM_GEMM_FLAG_ALIGN_C : 0); l_start = libxsmm_timer_tick(); if (i_gemm_def->br_type == 0) { l_test_jit.bmm = libxsmm_bmmdispatch(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else if (i_gemm_def->br_type == 1) { if (i_gemm_def->br_unroll == 0) { l_test_jit.bmra = libxsmm_bmmdispatch_reducebatch_addr(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.bmra = libxsmm_bmmdispatch_reducebatch_addr_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 2) { if (i_gemm_def->br_unroll == 0) { l_test_jit.bmro = libxsmm_bmmdispatch_reducebatch_offs(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.bmro = libxsmm_bmmdispatch_reducebatch_offs_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else if (i_gemm_def->br_type == 3) { if (i_gemm_def->br_unroll == 0) { l_test_jit.bmrs = libxsmm_bmmdispatch_reducebatch_strd(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(libxsmm_bfloat16), i_gemm_def->ldb*i_gemm_def->n*sizeof(libxsmm_bfloat16), &(i_gemm_def->lda), &(i_gemm_def->ldb), 
&(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } else { l_test_jit.bmrs = libxsmm_bmmdispatch_reducebatch_strd_unroll(i_gemm_def->m, i_gemm_def->n, i_gemm_def->k, i_gemm_def->lda*i_gemm_def->k*sizeof(libxsmm_bfloat16), i_gemm_def->ldb*i_gemm_def->n*sizeof(libxsmm_bfloat16), i_gemm_def->br_count, &(i_gemm_def->lda), &(i_gemm_def->ldb), &(i_gemm_def->ldc), &l_alpha, &l_beta, &l_flags, &(i_gemm_def->prefetch)); } } else { /* nothing */ } l_jittime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if (l_test_jit.xmm == 0) { printf("JIT failed, please run with LIBXSMM_VERBOSE=-1 and/or with debug mode LIBXSMM library!\n"); exit(EXIT_FAILURE); } /* receive kernel information */ libxsmm_get_mmkernel_info(l_test_jit, &l_info); l_start = libxsmm_timer_tick(); if ( l_info.prefetch == LIBXSMM_GEMM_PREFETCH_NONE ) { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.bmm(i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (libxsmm_bfloat16*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); l_b_addr[l_r] = (libxsmm_bfloat16*)i_b + (l_r * (size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } l_test_jit.bmra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.bmro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.bmrs(i_a, i_b, o_c, &l_br); } } } else { if (i_gemm_def->br_type == 0) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.bmm(i_a, i_b, o_c, i_a, i_b, o_c); } } else if (i_gemm_def->br_type == 1) { for (l_t = 0; l_t < g_reps; l_t++) { for ( l_r = 0 ; l_r < i_gemm_def->br_count; l_r++ ) { l_a_addr[l_r] = (libxsmm_bfloat16*)i_a + (l_r * (size_t)i_gemm_def->lda * (size_t)i_gemm_def->k); l_b_addr[l_r] = (libxsmm_bfloat16*)i_b + (l_r * 
(size_t)i_gemm_def->ldb * (size_t)i_gemm_def->n); } l_test_jit.bmra(l_a_addr, l_b_addr, o_c, &l_br); } } else if (i_gemm_def->br_type == 2) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.bmro(i_a, i_b, o_c, &l_br, l_a_offs, l_b_offs); } } else if (i_gemm_def->br_type == 3) { for (l_t = 0; l_t < g_reps; l_t++) { l_test_jit.bmrs(i_a, i_b, o_c, &l_br); } } } l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); if ( i_print_jit_info == 0 ) { printf("function pointer address: %llx\n", (unsigned long long)l_test_jit.xmm); printf("%fs for creating jit\n", l_jittime); } free( (void*)l_a_addr ); free( (void*)l_b_addr ); free( (void*)l_a_offs ); free( (void*)l_b_offs ); return l_runtime; } int main(int argc, char* argv []) { char* l_precision = NULL; libxsmm_blasint l_lda = 0, l_ldb = 0, l_ldc = 0; int l_m = 0, l_n = 0, l_k = 0; int l_aligned_a = 0; int l_aligned_c = 0; int l_trans_a = 0; int l_trans_b = 0; double l_alpha = 0; double l_beta = 0; int l_br = 1; int l_br_type = 0; int l_br_unroll = 0; libxsmm_gemm_prefetch_type l_prefetch = LIBXSMM_GEMM_PREFETCH_NONE; libxsmm_matdiff_info l_diff; gemm_def l_gemm_def; size_t l_i = 0, l_j = 0, l_s = 0, l_t = 0, l_r = 0; double l_runtime_c = 0; double l_runtime_libxsmm = 0; libxsmm_timer_tickint l_start; int l_file_input = 0; char* l_file_name = NULL; FILE *l_file_handle = NULL; int l_run_check = 0; /* input data */ double *l_a_d = 0, *l_b_d = 0, *l_c_d = 0; float *l_a_f = 0, *l_b_f = 0, *l_c_f = 0; short *l_a_w = 0, *l_b_w = 0; libxsmm_bfloat16 *l_a_bf = 0, *l_b_bf = 0, *l_c_bf = 0; unsigned char *l_ua_b = 0, *l_ub_b; char *l_sa_b = 0, *l_sb_b = 0; int* l_c_b_i = 0; int* l_c_w_i = 0; unsigned char* l_c_b_ub = 0; float* l_c_bf_f = 0; /* Gold data */ double* l_c_gold_d = 0; float* l_c_gold_f = 0; libxsmm_bfloat16* l_c_gold_bf = 0; int* l_c_gold_w_i = 0; int* l_c_gold_b_i = 0; unsigned char* l_c_gold_b_ub = 0; float* l_c_gold_bf_f = 0; double l_total_max_error = 0.0; /* scaling factor */ float l_scf = 1.0; 
libxsmm_matdiff_clear(&l_diff); /* check argument count for a valid range */ if ( argc == 19 ) { /* xgemm sizes */ l_m = atoi(argv[1]); l_n = atoi(argv[2]); l_k = atoi(argv[3]); l_lda = atoi(argv[4]); l_ldb = atoi(argv[5]); l_ldc = atoi(argv[6]); /* some sugar */ l_alpha = atof(argv[7]); l_beta = atof(argv[8]); l_aligned_a = atoi(argv[9]); l_aligned_c = atoi(argv[10]); l_trans_a = atoi(argv[11]); l_trans_b = atoi(argv[12]); /* arch specific stuff */ l_precision = argv[14]; l_br = atoi(argv[16]); l_br_unroll = atoi(argv[17]); g_reps = atoi(argv[18]); /* set value of prefetch flag */ if (strcmp("nopf", argv[13]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_NONE; } else if (strcmp("pfsigonly", argv[13]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_SIGONLY; } else if (strcmp("BL2viaC", argv[13]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_BL2_VIA_C; } else if (strcmp("curAL2", argv[13]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_AL2_AHEAD; } else if (strcmp("curAL2_BL2viaC", argv[13]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD; } else if (strcmp("AL2", argv[13]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_AL2; } else if (strcmp("AL2_BL2viaC", argv[13]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C; } else { print_help(); return EXIT_FAILURE; } if (strcmp("nobr", argv[15]) == 0) { l_br_type = 0; } else if (strcmp("addrbr", argv[15]) == 0) { l_br_type = 1; } else if (strcmp("offsbr", argv[15]) == 0) { l_br_type = 2; } else if (strcmp("strdbr", argv[15]) == 0) { l_br_type = 3; } else { print_help(); return EXIT_FAILURE; } l_file_input = 0; l_run_check = 1; } else if ( argc == 14 ) { l_file_input = 1; l_file_name = argv[1]; l_alpha = atof(argv[2]); l_beta = atof(argv[3]); l_aligned_a = atoi(argv[4]); l_aligned_c = atoi(argv[5]); l_trans_a = atoi(argv[6]); l_trans_b = atoi(argv[7]); l_precision = argv[8]; l_br = atoi(argv[10]); l_br_unroll = atoi(argv[11]); if (strcmp("nobr", argv[9]) == 0) { l_br_type = 0; } else if (strcmp("addrbr", argv[9]) == 0) { 
l_br_type = 1; } else if (strcmp("offsbr", argv[9]) == 0) { l_br_type = 2; } else if (strcmp("strdbr", argv[9]) == 0) { l_br_type = 3; } else { print_help(); return EXIT_FAILURE; } g_reps = atoi(argv[12]); l_run_check = atoi(argv[13]); l_prefetch = LIBXSMM_GEMM_PREFETCH_NONE; } else { print_help(); return EXIT_FAILURE; } l_br = (l_br < 1) ? 1 : l_br; l_br = (l_br_type == 0) ? 1 : l_br; l_br_unroll = (l_br_type == 0) ? 0 : l_br_unroll; /* check alpha */ if ( LIBXSMM_NEQ(l_alpha, 1.0) ) { fprintf(stderr, "JIT: alpha needs to be 1.0!\n"); exit(EXIT_FAILURE); } /* check beta */ if ( LIBXSMM_NEQ(l_beta, 0.0) && LIBXSMM_NEQ(l_beta, 1.0) ) { fprintf(stderr, "JIT: beta needs to be 0.0 or 1.0!\n"); exit(EXIT_FAILURE); } if ( l_file_input != 0 ) { l_file_handle = fopen( l_file_name, "r" ); } else { if ( l_trans_b == 0 ) { printf("------------------------------------------------\n"); printf("RUNNING (%ix%i) X (%ix%i) = (%ix%i), %s, BR=%i\n", l_m, l_k, l_k, l_n, l_m, l_n, l_precision, l_br); printf("------------------------------------------------\n"); } else { printf("------------------------------------------------\n"); printf("RUNNING (%ix%i) X (%ix%i)^T = (%ix%i), %s, BR=%i\n", l_m, l_k, l_k, l_n, l_m, l_n, l_precision, l_br); printf("------------------------------------------------\n"); } } if ((strcmp(l_precision, "DP") == 0) && (l_trans_b == 0)) { unsigned int l_keep_going = 0; do { if ( l_file_input != 0 ) { char l_line[512]; if ( fgets( l_line, 512, l_file_handle) == NULL ) { l_keep_going = 0; break; } else { l_keep_going = 1; } if ( 6 != sscanf( l_line, "%i %i %i %i %i %i", &l_m, &l_n, &l_k, &l_lda, &l_ldb, &l_ldc ) ) exit(EXIT_FAILURE); } l_gemm_def.m = l_m; l_gemm_def.n = l_n; l_gemm_def.k = l_k; l_gemm_def.lda = l_lda; l_gemm_def.ldb = l_ldb; l_gemm_def.ldc = l_ldc; l_gemm_def.alpha = l_alpha; l_gemm_def.beta = l_beta; l_gemm_def.trans_a = l_trans_a; l_gemm_def.trans_b = l_trans_b; l_gemm_def.aligned_a = l_aligned_a; l_gemm_def.aligned_c = l_aligned_c; 
l_gemm_def.prefetch = l_prefetch; l_gemm_def.br_type = l_br_type; l_gemm_def.br_count = l_br; l_gemm_def.br_unroll = l_br_unroll; l_a_d = (double*)libxsmm_aligned_malloc((size_t)l_lda * (size_t)l_k * (size_t)l_br * sizeof(double), 64); l_b_d = (double*)libxsmm_aligned_malloc((size_t)l_ldb * (size_t)l_n * (size_t)l_br * sizeof(double), 64); l_c_d = (double*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(double), 64); l_c_gold_d = (double*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(double), 64); /* touch A */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_lda; l_i++) { for (l_j = 0; l_j < l_k; l_j++) { l_a_d[(l_r * l_lda * l_k) + ((l_j * l_lda) + l_i)] = libxsmm_rng_f64(); } } } /* touch B */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_ldb; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_b_d[(l_r * l_ldb * l_n) + ((l_j * l_ldb) + l_i)] = libxsmm_rng_f64(); } } } /* touch C */ for (l_i = 0; l_i < l_ldc; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_c_d[(l_j * l_ldc) + l_i] = 0.0; l_c_gold_d[(l_j * l_ldc) + l_i] = 0.0; } } l_runtime_libxsmm = run_jit_double( &l_gemm_def, l_a_d, l_b_d, l_c_d, l_file_input ); if ( l_run_check == 1 ) { l_start = libxsmm_timer_tick(); for (l_t = 0; l_t < g_reps; l_t++) { for (l_r = 0; l_r < l_br; l_r++) { for (l_j = 0; l_j < l_n; l_j++) { for (l_s = 0; l_s < l_k; l_s++) { for (l_i = 0; l_i < l_m; l_i++) { l_c_gold_d[(l_j * l_ldc) + l_i] += l_a_d[(l_r * l_lda * l_k) + ((l_s * l_lda) + l_i)] * l_b_d[(l_r * l_ldb * l_n) + ((l_j * l_ldb) + l_s)]; } } } } } l_runtime_c = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); libxsmm_matdiff(&l_diff, LIBXSMM_DATATYPE_F64, l_m, l_n, l_c_gold_d, l_c_d, &l_ldc, &l_ldc); } if ( l_file_input == 0 ) { printf("%fs for C\n", l_runtime_c); printf("%f GFLOPS for C\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_c * 1.0e9)); printf("%fs for libxsmm\n", l_runtime_libxsmm); printf("%f GFLOPS for 
libxsmm\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9)); printf("max. error: %f\n", l_diff.linf_abs); } else { if ( l_run_check == 1 ) { printf("%i %i %i %i %i %i %i %i %i %s %f %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9), l_diff.linf_abs ); } else { printf("%i %i %i %i %i %i %i %i %i %s %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9) ); } } if ( (l_total_max_error < l_diff.linf_abs) && (l_run_check == 1) ) { l_total_max_error = l_diff.linf_abs; } libxsmm_free(l_a_d); libxsmm_free(l_b_d); libxsmm_free(l_c_d); libxsmm_free(l_c_gold_d); } while ( l_keep_going ); } else if ((strcmp(l_precision, "DP") == 0) && (l_trans_b != 0)) { unsigned int l_keep_going = 0; do { if ( l_file_input != 0 ) { char l_line[512]; if ( fgets( l_line, 512, l_file_handle) == NULL ) { l_keep_going = 0; break; } else { l_keep_going = 1; } if ( 6 != sscanf( l_line, "%i %i %i %i %i %i", &l_m, &l_n, &l_k, &l_lda, &l_ldb, &l_ldc ) ) exit(EXIT_FAILURE); } l_gemm_def.m = l_m; l_gemm_def.n = l_n; l_gemm_def.k = l_k; l_gemm_def.lda = l_lda; l_gemm_def.ldb = l_ldb; l_gemm_def.ldc = l_ldc; l_gemm_def.alpha = l_alpha; l_gemm_def.beta = l_beta; l_gemm_def.trans_a = l_trans_a; l_gemm_def.trans_b = l_trans_b; l_gemm_def.aligned_a = l_aligned_a; l_gemm_def.aligned_c = l_aligned_c; l_gemm_def.prefetch = l_prefetch; l_gemm_def.br_type = l_br_type; l_gemm_def.br_count = l_br; l_gemm_def.br_unroll = l_br_unroll; l_a_d = (double*)libxsmm_aligned_malloc((size_t)l_lda * (size_t)l_k * (size_t)l_br * sizeof(double), 64); l_b_d = (double*)libxsmm_aligned_malloc((size_t)l_ldb * (size_t)l_k * (size_t)l_br * sizeof(double), 64); 
l_c_d = (double*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(double), 64); l_c_gold_d = (double*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(double), 64); /* touch A */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_lda; l_i++) { for (l_j = 0; l_j < l_k; l_j++) { l_a_d[(l_r * l_lda * l_k) + (l_j * l_lda) + l_i] = libxsmm_rng_f64(); } } } /* touch B */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_ldb; l_i++) { for (l_j = 0; l_j < l_k; l_j++) { l_b_d[(l_r * l_ldb * l_k) + (l_j * l_ldb) + l_i] = libxsmm_rng_f64(); } } } /* touch C */ for (l_i = 0; l_i < l_ldc; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_c_d[(l_j * l_ldc) + l_i] = 0.0; l_c_gold_d[(l_j * l_ldc) + l_i] = 0.0; } } l_runtime_libxsmm = run_jit_double( &l_gemm_def, l_a_d, l_b_d, l_c_d, l_file_input ); if ( l_run_check == 1 ) { l_start = libxsmm_timer_tick(); for (l_t = 0; l_t < g_reps; l_t++) { for (l_r = 0; l_r < l_br; l_r++) { for (l_j = 0; l_j < l_n; l_j++) { for (l_s = 0; l_s < l_k; l_s++) { for (l_i = 0; l_i < l_m; l_i++) { l_c_gold_d[(l_j * l_ldc) + l_i] += l_a_d[(l_r * l_lda * l_k) + (l_s * l_lda) + l_i] * l_b_d[(l_r * l_ldb * l_k) + (l_s * l_ldb) + l_j]; } } } } } l_runtime_c = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); libxsmm_matdiff(&l_diff, LIBXSMM_DATATYPE_F64, l_m, l_n, l_c_gold_d, l_c_d, &l_ldc, &l_ldc); } if ( l_file_input == 0 ) { printf("%fs for C\n", l_runtime_c); printf("%f GFLOPS for C\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_c * 1.0e9)); printf("%fs for libxsmm\n", l_runtime_libxsmm); printf("%f GFLOPS for libxsmm\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9)); printf("max. 
error: %f\n", l_diff.linf_abs); } else { if ( l_run_check == 1 ) { printf("%i %i %i %i %i %i %i %i %i %s %f %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9), l_diff.linf_abs ); } else { printf("%i %i %i %i %i %i %i %i %i %s %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9) ); } } if ( (l_total_max_error < l_diff.linf_abs) && (l_run_check == 1) ) { l_total_max_error = l_diff.linf_abs; } libxsmm_free(l_a_d); libxsmm_free(l_b_d); libxsmm_free(l_c_d); libxsmm_free(l_c_gold_d); } while ( l_keep_going ); } else if ((strcmp(l_precision, "SP") == 0) && (l_trans_b == 0)) { unsigned int l_keep_going = 0; do { if ( l_file_input != 0 ) { char l_line[512]; if ( fgets( l_line, 512, l_file_handle) == NULL ) { l_keep_going = 0; break; } else { l_keep_going = 1; } if ( 6 != sscanf( l_line, "%i %i %i %i %i %i", &l_m, &l_n, &l_k, &l_lda, &l_ldb, &l_ldc ) ) exit(EXIT_FAILURE); } l_gemm_def.m = l_m; l_gemm_def.n = l_n; l_gemm_def.k = l_k; l_gemm_def.lda = l_lda; l_gemm_def.ldb = l_ldb; l_gemm_def.ldc = l_ldc; l_gemm_def.alpha = l_alpha; l_gemm_def.beta = l_beta; l_gemm_def.trans_a = l_trans_a; l_gemm_def.trans_b = l_trans_b; l_gemm_def.aligned_a = l_aligned_a; l_gemm_def.aligned_c = l_aligned_c; l_gemm_def.prefetch = l_prefetch; l_gemm_def.br_type = l_br_type; l_gemm_def.br_count = l_br; l_gemm_def.br_unroll = l_br_unroll; l_a_f = (float*)libxsmm_aligned_malloc((size_t)l_lda * (size_t)l_k * (size_t)l_br * sizeof(float), 64); l_b_f = (float*)libxsmm_aligned_malloc((size_t)l_ldb * (size_t)l_n * (size_t)l_br * sizeof(float), 64); l_c_f = (float*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(float), 64); l_c_gold_f = (float*)libxsmm_aligned_malloc((size_t)l_ldc * 
(size_t)l_n * sizeof(float), 64); /* touch A */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_lda; l_i++) { for (l_j = 0; l_j < l_k; l_j++) { l_a_f[(l_r * l_lda * l_k) + (l_j * l_lda) + l_i] = (float)libxsmm_rng_f64(); } } } /* touch B */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_ldb; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_b_f[(l_r * l_ldb * l_n) + (l_j * l_ldb) + l_i] = (float)libxsmm_rng_f64(); } } } /* touch C */ for (l_i = 0; l_i < l_ldc; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_c_f[(l_j * l_ldc) + l_i] = 0.f; l_c_gold_f[(l_j * l_ldc) + l_i] = 0.f; } } l_runtime_libxsmm = run_jit_float( &l_gemm_def, l_a_f, l_b_f, l_c_f, l_file_input ); if ( l_run_check == 1 ) { l_start = libxsmm_timer_tick(); for (l_t = 0; l_t < g_reps; l_t++) { for (l_r = 0; l_r < l_br; l_r++) { for (l_j = 0; l_j < l_n; l_j++) { for (l_s = 0; l_s < l_k; l_s++) { for (l_i = 0; l_i < l_m; l_i++) { l_c_gold_f[(l_j * l_ldc) + l_i] += l_a_f[(l_r * l_lda * l_k) + (l_s * l_lda) + l_i] * l_b_f[(l_r * l_ldb * l_n) + (l_j * l_ldb) + l_s]; } } } } } l_runtime_c = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); libxsmm_matdiff(&l_diff, LIBXSMM_DATATYPE_F32, l_m, l_n, l_c_gold_f, l_c_f, &l_ldc, &l_ldc); } if ( l_file_input == 0 ) { printf("%fs for C\n", l_runtime_c); printf("%f GFLOPS for C\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_c * 1.0e9)); printf("%fs for libxsmm\n", l_runtime_libxsmm); printf("%f GFLOPS for libxsmm\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9)); printf("max. 
error: %f\n", l_diff.linf_abs); } else { if ( l_run_check == 1 ) { printf("%i %i %i %i %i %i %i %i %i %s %f %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9), l_diff.linf_abs ); } else { printf("%i %i %i %i %i %i %i %i %i %s %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9) ); } } if ( (l_total_max_error < l_diff.linf_abs) && (l_run_check == 1) ) { l_total_max_error = l_diff.linf_abs; } libxsmm_free(l_a_f); libxsmm_free(l_b_f); libxsmm_free(l_c_f); libxsmm_free(l_c_gold_f); } while ( l_keep_going ); } else if ((strcmp(l_precision, "SP") == 0) && (l_trans_b != 0)) { unsigned int l_keep_going = 0; do { if ( l_file_input != 0 ) { char l_line[512]; if ( fgets( l_line, 512, l_file_handle) == NULL ) { l_keep_going = 0; break; } else { l_keep_going = 1; } if ( 6 != sscanf( l_line, "%i %i %i %i %i %i", &l_m, &l_n, &l_k, &l_lda, &l_ldb, &l_ldc ) ) exit(EXIT_FAILURE); } l_gemm_def.m = l_m; l_gemm_def.n = l_n; l_gemm_def.k = l_k; l_gemm_def.lda = l_lda; l_gemm_def.ldb = l_ldb; l_gemm_def.ldc = l_ldc; l_gemm_def.alpha = l_alpha; l_gemm_def.beta = l_beta; l_gemm_def.trans_a = l_trans_a; l_gemm_def.trans_b = l_trans_b; l_gemm_def.aligned_a = l_aligned_a; l_gemm_def.aligned_c = l_aligned_c; l_gemm_def.prefetch = l_prefetch; l_gemm_def.br_type = l_br_type; l_gemm_def.br_count = l_br; l_gemm_def.br_unroll = l_br_unroll; l_a_f = (float*)libxsmm_aligned_malloc((size_t)l_lda * (size_t)l_k * (size_t)l_br * sizeof(float), 64); l_b_f = (float*)libxsmm_aligned_malloc((size_t)l_ldb * (size_t)l_k * (size_t)l_br * sizeof(float), 64); l_c_f = (float*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(float), 64); l_c_gold_f = (float*)libxsmm_aligned_malloc((size_t)l_ldc * 
(size_t)l_n * sizeof(float), 64); /* touch A */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_lda; l_i++) { for (l_j = 0; l_j < l_k; l_j++) { l_a_f[(l_r * l_lda * l_k) + (l_j * l_lda) + l_i] = (float)libxsmm_rng_f64(); } } } /* touch B */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_ldb; l_i++) { for (l_j = 0; l_j < l_k; l_j++) { l_b_f[(l_r * l_ldb * l_k) + (l_j * l_ldb) + l_i] = (float)libxsmm_rng_f64(); } } } /* touch C */ for (l_i = 0; l_i < l_ldc; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_c_f[(l_j * l_ldc) + l_i] = 0.f; l_c_gold_f[(l_j * l_ldc) + l_i] = 0.f; } } l_runtime_libxsmm = run_jit_float( &l_gemm_def, l_a_f, l_b_f, l_c_f, l_file_input ); if ( l_run_check == 1 ) { l_start = libxsmm_timer_tick(); for (l_t = 0; l_t < g_reps; l_t++) { for (l_r = 0; l_r < l_br; l_r++) { for (l_j = 0; l_j < l_n; l_j++) { for (l_s = 0; l_s < l_k; l_s++) { for (l_i = 0; l_i < l_m; l_i++) { l_c_gold_f[(l_j * l_ldc) + l_i] += l_a_f[(l_r * l_lda * l_k) + (l_s * l_lda) + l_i] * l_b_f[(l_r * l_ldb * l_k) + (l_s * l_ldb) + l_j]; } } } } } l_runtime_c = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); libxsmm_matdiff(&l_diff, LIBXSMM_DATATYPE_F32, l_m, l_n, l_c_gold_f, l_c_f, &l_ldc, &l_ldc); } if ( l_file_input == 0 ) { printf("%fs for C\n", l_runtime_c); printf("%f GFLOPS for C\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_c * 1.0e9)); printf("%fs for libxsmm\n", l_runtime_libxsmm); printf("%f GFLOPS for libxsmm\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9)); printf("max. 
error: %f\n", l_diff.linf_abs); } else { if ( l_run_check == 1 ) { printf("%i %i %i %i %i %i %i %i %i %s %f %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9), l_diff.linf_abs ); } else { printf("%i %i %i %i %i %i %i %i %i %s %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9) ); } } if ( (l_total_max_error < l_diff.linf_abs) && (l_run_check == 1) ) { l_total_max_error = l_diff.linf_abs; } libxsmm_free(l_a_f); libxsmm_free(l_b_f); libxsmm_free(l_c_f); libxsmm_free(l_c_gold_f); } while ( l_keep_going ); } else if (strcmp(l_precision, "I16I32") == 0) { const int l_k_block = 2; double l_max_error = 0; int l_k2; unsigned int l_keep_going = 0; do { if ( l_file_input != 0 ) { char l_line[512]; if ( fgets( l_line, 512, l_file_handle) == NULL ) { l_keep_going = 0; break; } else { l_keep_going = 1; } if ( 6 != sscanf( l_line, "%i %i %i %i %i %i", &l_m, &l_n, &l_k, &l_lda, &l_ldb, &l_ldc ) ) exit(EXIT_FAILURE); } l_gemm_def.m = l_m; l_gemm_def.n = l_n; l_gemm_def.k = l_k; l_gemm_def.lda = l_lda; l_gemm_def.ldb = l_ldb; l_gemm_def.ldc = l_ldc; l_gemm_def.alpha = l_alpha; l_gemm_def.beta = l_beta; l_gemm_def.trans_a = l_trans_a; l_gemm_def.trans_b = l_trans_b; l_gemm_def.aligned_a = l_aligned_a; l_gemm_def.aligned_c = l_aligned_c; l_gemm_def.prefetch = l_prefetch; l_gemm_def.br_type = l_br_type; l_gemm_def.br_count = l_br; l_gemm_def.br_unroll = l_br_unroll; l_a_w = (short*)libxsmm_aligned_malloc((size_t)l_lda * (size_t)l_k * (size_t)l_br * sizeof(short), 64); l_b_w = (short*)libxsmm_aligned_malloc((size_t)l_ldb * (size_t)l_n * (size_t)l_br * sizeof(short), 64); l_c_w_i = (int*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(int), 64); l_c_gold_w_i = 
(int*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(int), 64); /* touch A */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_lda; l_i++) { for (l_j = 0; l_j < l_k; l_j++) { l_a_w[(l_r * l_lda * l_k) + (l_j * l_lda) + l_i] = (short)(libxsmm_rng_f64() * 10.0); } } } /* touch B */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_ldb; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_b_w[(l_r * l_ldb * l_n) + (l_j * l_ldb) + l_i] = (short)(libxsmm_rng_f64() * 10.0); } } } /* touch C */ for (l_i = 0; l_i < l_ldc; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_c_w_i[(l_j * l_ldc) + l_i] = 0; l_c_gold_w_i[(l_j * l_ldc) + l_i] = 0; } } l_runtime_libxsmm = run_jit_short_int( &l_gemm_def, l_a_w, l_b_w, l_c_w_i, l_file_input ); if ( l_run_check == 1 ) { l_start = libxsmm_timer_tick(); for (l_t = 0; l_t < g_reps; l_t++) { for (l_r = 0; l_r < l_br; l_r++) { for (l_j = 0; l_j < l_n; l_j++) { for (l_s = 0; l_s < (l_k / l_k_block); l_s++) { for (l_i = 0; l_i < l_m; l_i++) { for (l_k2 = 0; l_k2 < l_k_block; l_k2++) { l_c_gold_w_i[(l_j * l_ldc) + l_i] += l_a_w[(l_r * l_lda * l_k) + (l_s * (l_lda*l_k_block)) + (l_i*l_k_block) + l_k2] * l_b_w[(l_r * l_ldb * l_n) + (l_j * l_ldb) + (l_s*l_k_block) + l_k2]; } } } } } } l_runtime_c = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); l_max_error = 0; for (l_i = 0; l_i < l_m; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { const double l_fabs = fabs((double)l_c_gold_w_i[(l_j * l_ldc) + l_i] - (double)l_c_w_i[(l_j * l_ldc) + l_i]); if (l_max_error < l_fabs) l_max_error = l_fabs; } } } if ( l_file_input == 0 ) { printf("%fs for C\n", l_runtime_c); printf("%f GFLOPS for C\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_c * 1.0e9)); printf("%fs for libxsmm\n", l_runtime_libxsmm); printf("%f GFLOPS for libxsmm\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9)); printf("max. 
error: %f\n", l_diff.linf_abs); } else { if ( l_run_check == 1 ) { printf("%i %i %i %i %i %i %i %i %i %s %f %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9), l_diff.linf_abs ); } else { printf("%i %i %i %i %i %i %i %i %i %s %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9) ); } } if ( (l_total_max_error < l_max_error) && (l_run_check == 1) ) { l_total_max_error = l_max_error; } libxsmm_free(l_a_w); libxsmm_free(l_b_w); libxsmm_free(l_c_w_i); libxsmm_free(l_c_gold_w_i); } while ( l_keep_going ); } else if (strcmp(l_precision, "USI8I32") == 0) { const int l_k_block = 4; double l_max_error = 0; int l_k2; unsigned int l_keep_going = 0; do { if ( l_file_input != 0 ) { char l_line[512]; if ( fgets( l_line, 512, l_file_handle) == NULL ) { l_keep_going = 0; break; } else { l_keep_going = 1; } if ( 6 != sscanf( l_line, "%i %i %i %i %i %i", &l_m, &l_n, &l_k, &l_lda, &l_ldb, &l_ldc ) ) exit(EXIT_FAILURE); } l_gemm_def.m = l_m; l_gemm_def.n = l_n; l_gemm_def.k = l_k; l_gemm_def.lda = l_lda; l_gemm_def.ldb = l_ldb; l_gemm_def.ldc = l_ldc; l_gemm_def.alpha = l_alpha; l_gemm_def.beta = l_beta; l_gemm_def.trans_a = l_trans_a; l_gemm_def.trans_b = l_trans_b; l_gemm_def.aligned_a = l_aligned_a; l_gemm_def.aligned_c = l_aligned_c; l_gemm_def.prefetch = l_prefetch; l_gemm_def.br_type = l_br_type; l_gemm_def.br_count = l_br; l_gemm_def.br_unroll = l_br_unroll; l_ua_b = (unsigned char*)libxsmm_aligned_malloc((size_t)l_lda * (size_t)l_k * (size_t)l_br * sizeof(unsigned char), 64); l_sb_b = (char*)libxsmm_aligned_malloc((size_t)l_ldb * (size_t)l_n * (size_t)l_br * sizeof(char), 64); l_c_b_i = (int*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(int), 64); l_c_gold_b_i 
= (int*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(int), 64); /* touch A */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_lda; l_i++) { for (l_j = 0; l_j < l_k; l_j++) { l_ua_b[(l_r * l_lda * l_k) + (l_j * l_lda) + l_i] = (unsigned char)(libxsmm_rng_f64() * 5.0); } } } /* touch B */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_ldb; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_sb_b[(l_r * l_ldb * l_n) + (l_j * l_ldb) + l_i] = (char)(libxsmm_rng_f64() * 5.0); } } } /* touch C */ for (l_i = 0; l_i < l_ldc; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_c_b_i[(l_j * l_ldc) + l_i] = 0; l_c_gold_b_i[(l_j * l_ldc) + l_i] = 0; } } l_runtime_libxsmm = run_jit_uschar_int( &l_gemm_def, l_ua_b, l_sb_b, l_c_b_i, l_file_input ); if ( l_run_check == 1 ) { l_start = libxsmm_timer_tick(); for (l_t = 0; l_t < g_reps; l_t++) { for (l_r = 0; l_r < l_br; l_r++) { for (l_j = 0; l_j < l_n; l_j++) { for (l_s = 0; l_s < (l_k / l_k_block); l_s++) { for (l_i = 0; l_i < l_m; l_i++) { for (l_k2 = 0; l_k2 < l_k_block; l_k2++) { l_c_gold_b_i[(l_j * l_ldc) + l_i] += l_ua_b[(l_r * l_lda * l_k) + (l_s * (l_lda*l_k_block)) + (l_i*l_k_block) + l_k2] * l_sb_b[(l_r * l_ldb * l_n) + (l_j * l_ldb) + (l_s*l_k_block) + l_k2]; } } } } } } l_runtime_c = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); l_max_error = 0; for (l_i = 0; l_i < l_m; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { const double l_fabs = fabs((double)l_c_gold_b_i[(l_j * l_ldc) + l_i] - (double)l_c_b_i[(l_j * l_ldc) + l_i]); if (l_max_error < l_fabs) l_max_error = l_fabs; } } } if ( l_file_input == 0 ) { printf("%fs for C\n", l_runtime_c); printf("%f GFLOPS for C\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_c * 1.0e9)); printf("%fs for libxsmm\n", l_runtime_libxsmm); printf("%f GFLOPS for libxsmm\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9)); printf("max. 
error: %f\n", l_diff.linf_abs); } else { if ( l_run_check == 1 ) { printf("%i %i %i %i %i %i %i %i %i %s %f %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9), l_diff.linf_abs ); } else { printf("%i %i %i %i %i %i %i %i %i %s %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9) ); } } if ( (l_total_max_error < l_max_error) && (l_run_check == 1) ) { l_total_max_error = l_max_error; } libxsmm_free(l_ua_b); libxsmm_free(l_sb_b); libxsmm_free(l_c_b_i); libxsmm_free(l_c_gold_b_i); } while ( l_keep_going ); } else if (strcmp(l_precision, "SUI8I32") == 0) { const int l_k_block = 4; double l_max_error = 0; int l_k2; unsigned int l_keep_going = 0; do { if ( l_file_input != 0 ) { char l_line[512]; if ( fgets( l_line, 512, l_file_handle) == NULL ) { l_keep_going = 0; break; } else { l_keep_going = 1; } if ( 6 != sscanf( l_line, "%i %i %i %i %i %i", &l_m, &l_n, &l_k, &l_lda, &l_ldb, &l_ldc ) ) exit(EXIT_FAILURE); } l_gemm_def.m = l_m; l_gemm_def.n = l_n; l_gemm_def.k = l_k; l_gemm_def.lda = l_lda; l_gemm_def.ldb = l_ldb; l_gemm_def.ldc = l_ldc; l_gemm_def.alpha = l_alpha; l_gemm_def.beta = l_beta; l_gemm_def.trans_a = l_trans_a; l_gemm_def.trans_b = l_trans_b; l_gemm_def.aligned_a = l_aligned_a; l_gemm_def.aligned_c = l_aligned_c; l_gemm_def.prefetch = l_prefetch; l_gemm_def.br_type = l_br_type; l_gemm_def.br_count = l_br; l_gemm_def.br_unroll = l_br_unroll; l_sa_b = (char*)libxsmm_aligned_malloc((size_t)l_lda * (size_t)l_k * (size_t)l_br * sizeof(char), 64); l_ub_b = (unsigned char*)libxsmm_aligned_malloc((size_t)l_ldb * (size_t)l_n * (size_t)l_br * sizeof(unsigned char), 64); l_c_b_i = (int*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(int), 64); 
l_c_gold_b_i = (int*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(int), 64); /* touch A */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_lda; l_i++) { for (l_j = 0; l_j < l_k; l_j++) { l_sa_b[(l_r * l_lda * l_k) + (l_j * l_lda) + l_i] = (char)(libxsmm_rng_f64() * 5.0); } } } /* touch B */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_ldb; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_ub_b[(l_r * l_ldb * l_n) + (l_j * l_ldb) + l_i] = (unsigned char)(libxsmm_rng_f64() * 5.0); } } } /* touch C */ for (l_i = 0; l_i < l_ldc; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_c_b_i[(l_j * l_ldc) + l_i] = 0; l_c_gold_b_i[(l_j * l_ldc) + l_i] = 0; } } l_runtime_libxsmm = run_jit_suchar_int( &l_gemm_def, l_sa_b, l_ub_b, l_c_b_i, l_file_input ); if ( l_run_check == 1 ) { l_start = libxsmm_timer_tick(); for (l_t = 0; l_t < g_reps; l_t++) { for (l_r = 0; l_r < l_br; l_r++) { for (l_j = 0; l_j < l_n; l_j++) { for (l_s = 0; l_s < (l_k / l_k_block); l_s++) { for (l_i = 0; l_i < l_m; l_i++) { for (l_k2 = 0; l_k2 < l_k_block; l_k2++) { l_c_gold_b_i[(l_j * l_ldc) + l_i] += l_sa_b[(l_r * l_lda * l_k) + (l_s * (l_lda*l_k_block)) + (l_i*l_k_block) + l_k2] * l_ub_b[(l_r * l_ldb * l_n) + (l_j * l_ldb) + (l_s*l_k_block) + l_k2]; } } } } } } l_runtime_c = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); l_max_error = 0; for (l_i = 0; l_i < l_m; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { const double l_fabs = fabs((double)l_c_gold_b_i[(l_j * l_ldc) + l_i] - (double)l_c_b_i[(l_j * l_ldc) + l_i]); if (l_max_error < l_fabs) l_max_error = l_fabs; } } } if ( l_file_input == 0 ) { printf("%fs for C\n", l_runtime_c); printf("%f GFLOPS for C\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_c * 1.0e9)); printf("%fs for libxsmm\n", l_runtime_libxsmm); printf("%f GFLOPS for libxsmm\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9)); 
printf("max. error: %f\n", l_diff.linf_abs); } else { if ( l_run_check == 1 ) { printf("%i %i %i %i %i %i %i %i %i %s %f %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9), l_diff.linf_abs ); } else { printf("%i %i %i %i %i %i %i %i %i %s %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9) ); } } if ( (l_total_max_error < l_max_error) && (l_run_check == 1) ) { l_total_max_error = l_max_error; } libxsmm_free(l_sa_b); libxsmm_free(l_ub_b); libxsmm_free(l_c_b_i); libxsmm_free(l_c_gold_b_i); } while ( l_keep_going ); } else if (strcmp(l_precision, "SUI8UI8") == 0) { const int l_k_block = 4; double l_max_error = 0; int l_k2; unsigned int l_keep_going = 0; do { if ( l_file_input != 0 ) { char l_line[512]; if ( fgets( l_line, 512, l_file_handle) == NULL ) { l_keep_going = 0; break; } else { l_keep_going = 1; } if ( 6 != sscanf( l_line, "%i %i %i %i %i %i", &l_m, &l_n, &l_k, &l_lda, &l_ldb, &l_ldc ) ) exit(EXIT_FAILURE); } l_gemm_def.m = l_m; l_gemm_def.n = l_n; l_gemm_def.k = l_k; l_gemm_def.lda = l_lda; l_gemm_def.ldb = l_ldb; l_gemm_def.ldc = l_ldc; l_gemm_def.alpha = l_alpha; l_gemm_def.beta = l_beta; l_gemm_def.trans_a = l_trans_a; l_gemm_def.trans_b = l_trans_b; l_gemm_def.aligned_a = l_aligned_a; l_gemm_def.aligned_c = l_aligned_c; l_gemm_def.prefetch = l_prefetch; l_gemm_def.br_type = l_br_type; l_gemm_def.br_count = l_br; l_gemm_def.br_unroll = l_br_unroll; l_sa_b = (char*)libxsmm_aligned_malloc((size_t)l_lda * (size_t)l_k * (size_t)l_br * sizeof(char), 64); l_ub_b = (unsigned char*)libxsmm_aligned_malloc((size_t)l_ldb * (size_t)l_n * (size_t)l_br * sizeof(unsigned char), 64); l_c_b_ub = (unsigned char*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * 
sizeof(unsigned char), 64); l_c_gold_b_ub = (unsigned char*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(unsigned char), 64); /* touch A */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_lda; l_i++) { for (l_j = 0; l_j < l_k; l_j++) { l_sa_b[(l_r * l_lda * l_k) + (l_j * l_lda) + l_i] = (char)(libxsmm_rng_f64() * 2.0); } } } /* touch B */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_ldb; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_ub_b[(l_r * l_ldb * l_n) + (l_j * l_ldb) + l_i] = (unsigned char)(libxsmm_rng_f64() * 2.0); } } } /* touch C */ for (l_i = 0; l_i < l_ldc; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_c_b_ub[(l_j * l_ldc) + l_i] = 0; l_c_gold_b_ub[(l_j * l_ldc) + l_i] = 0; } } l_runtime_libxsmm = run_jit_suchar_uchar( &l_gemm_def, l_sa_b, l_ub_b, l_c_b_ub, l_scf, l_file_input ); if ( l_run_check == 1 ) { l_start = libxsmm_timer_tick(); for (l_t = 0; l_t < g_reps; l_t++) { for (l_r = 0; l_r < l_br; l_r++) { for (l_j = 0; l_j < l_n; l_j++) { for (l_i = 0; l_i < l_m; l_i++) { int tmp = (int)l_c_gold_b_ub[(l_j * l_ldc) + l_i]; float ftmp; for (l_s = 0; l_s < (l_k / l_k_block); l_s++) { for (l_k2 = 0; l_k2 < l_k_block; l_k2++) { tmp += l_sa_b[(l_r * l_lda * l_k) + (l_s * (l_lda*l_k_block)) + (l_i*l_k_block) + l_k2] * l_ub_b[(l_r * l_ldb * l_n) + (l_j * l_ldb) + (l_s*l_k_block) + l_k2]; } } ftmp = (float)tmp; ftmp *= l_scf; l_c_gold_b_ub[(l_j * l_ldc) + l_i] = (unsigned char)ftmp; } } } } l_runtime_c = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); l_max_error = 0; for (l_i = 0; l_i < l_m; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { const double l_fabs = fabs((double)l_c_gold_b_ub[(l_j * l_ldc) + l_i] - (double)l_c_b_ub[(l_j * l_ldc) + l_i]); if (l_max_error < l_fabs) l_max_error = l_fabs; } } } if ( l_file_input == 0 ) { printf("%fs for C\n", l_runtime_c); printf("%f GFLOPS for C\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_c * 1.0e9)); printf("%fs for 
libxsmm\n", l_runtime_libxsmm); printf("%f GFLOPS for libxsmm\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9)); printf("max. error: %f\n", l_diff.linf_abs); } else { if ( l_run_check == 1 ) { printf("%i %i %i %i %i %i %i %i %i %s %f %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9), l_diff.linf_abs ); } else { printf("%i %i %i %i %i %i %i %i %i %s %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9) ); } } if ( (l_total_max_error < l_max_error) && (l_run_check == 1) ) { l_total_max_error = l_max_error; } libxsmm_free(l_sa_b); libxsmm_free(l_ub_b); libxsmm_free(l_c_b_ub); libxsmm_free(l_c_gold_b_ub); } while ( l_keep_going ); } else if (strcmp(l_precision, "BF16F32") == 0) { const int l_k_block = 2; double l_max_error = 0; int l_k2; unsigned int l_keep_going = 0; do { if ( l_file_input != 0 ) { char l_line[512]; if ( fgets( l_line, 512, l_file_handle) == NULL ) { l_keep_going = 0; break; } else { l_keep_going = 1; } if ( 6 != sscanf( l_line, "%i %i %i %i %i %i", &l_m, &l_n, &l_k, &l_lda, &l_ldb, &l_ldc ) ) exit(EXIT_FAILURE); } l_gemm_def.m = l_m; l_gemm_def.n = l_n; l_gemm_def.k = l_k; l_gemm_def.lda = l_lda; l_gemm_def.ldb = l_ldb; l_gemm_def.ldc = l_ldc; l_gemm_def.alpha = l_alpha; l_gemm_def.beta = l_beta; l_gemm_def.trans_a = l_trans_a; l_gemm_def.trans_b = l_trans_b; l_gemm_def.aligned_a = l_aligned_a; l_gemm_def.aligned_c = l_aligned_c; l_gemm_def.prefetch = l_prefetch; l_gemm_def.br_type = l_br_type; l_gemm_def.br_count = l_br; l_gemm_def.br_unroll = l_br_unroll; l_a_bf = (libxsmm_bfloat16*)libxsmm_aligned_malloc((size_t)l_lda * (size_t)l_k * (size_t)l_br * 
sizeof(libxsmm_bfloat16), 64); l_b_bf = (libxsmm_bfloat16*)libxsmm_aligned_malloc((size_t)l_ldb * (size_t)l_n * (size_t)l_br * sizeof(libxsmm_bfloat16), 64); l_c_bf_f = (float*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(float), 64); l_c_gold_bf_f = (float*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(float), 64); /* touch A */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_lda; l_i++) { for (l_j = 0; l_j < l_k; l_j++) { union libxsmm_bfloat16_hp tmp; tmp.f = (float)libxsmm_rng_f64(); l_a_bf[(l_r * l_lda * l_k) + (l_j * l_lda) + l_i] = tmp.i[1]; } } } /* touch B */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_ldb; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { union libxsmm_bfloat16_hp tmp; tmp.f = (float)libxsmm_rng_f64(); l_b_bf[(l_r * l_ldb * l_n) + (l_j * l_ldb) + l_i] = tmp.i[1]; } } } /* touch C */ for (l_i = 0; l_i < l_ldc; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { l_c_bf_f[(l_j * l_ldc) + l_i] = 0.0f; l_c_gold_bf_f[(l_j * l_ldc) + l_i] = 0.0f; } } l_runtime_libxsmm = run_jit_bfloat16_float( &l_gemm_def, l_a_bf, l_b_bf, l_c_bf_f, l_file_input ); if ( l_run_check == 1 ) { l_start = libxsmm_timer_tick(); for (l_t = 0; l_t < g_reps; l_t++) { for (l_r = 0; l_r < l_br; l_r++) { for (l_j = 0; l_j < l_n; l_j++) { for (l_s = 0; l_s < (l_k / l_k_block); l_s++) { for (l_i = 0; l_i < l_m; l_i++) { for (l_k2 = 0; l_k2 < l_k_block; l_k2++) { union libxsmm_bfloat16_hp tmp_a_f; union libxsmm_bfloat16_hp tmp_b_f; tmp_a_f.i[1] = l_a_bf[(l_r * l_lda * l_k) + (l_s * (l_lda*l_k_block)) + (l_i*l_k_block) + l_k2]; tmp_a_f.i[0] = 0; tmp_b_f.i[1] = l_b_bf[(l_r * l_ldb * l_n) + (l_j * l_ldb) + (l_s*l_k_block) + l_k2]; tmp_b_f.i[0] = 0; l_c_gold_bf_f[(l_j * l_ldc) + l_i] += (float)(tmp_a_f.f * tmp_b_f.f); } } } } } } l_runtime_c = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); l_max_error = 0; for (l_i = 0; l_i < l_m; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { const double l_fabs = fabs((double)l_c_gold_bf_f[(l_j * l_ldc) + 
l_i] - (double)l_c_bf_f[(l_j * l_ldc) + l_i]); if (l_max_error < l_fabs) l_max_error = l_fabs; } } } if ( l_file_input == 0 ) { printf("%fs for C\n", l_runtime_c); printf("%f GFLOPS for C\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_c * 1.0e9)); printf("%fs for libxsmm\n", l_runtime_libxsmm); printf("%f GFLOPS for libxsmm\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9)); printf("max. error: %f\n", l_diff.linf_abs); } else { if ( l_run_check == 1 ) { printf("%i %i %i %i %i %i %i %i %i %s %f %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9), l_diff.linf_abs ); } else { printf("%i %i %i %i %i %i %i %i %i %s %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9) ); } } if ( (l_total_max_error < l_max_error) && (l_run_check == 1) ) { l_total_max_error = l_max_error; } libxsmm_free(l_a_bf); libxsmm_free(l_b_bf); libxsmm_free(l_c_bf_f); libxsmm_free(l_c_gold_bf_f); } while ( l_keep_going ); } else if (strcmp(l_precision, "BF16") == 0) { const int l_k_block = 2; double l_max_error = 0; int l_k2; unsigned int l_keep_going = 0; do { if ( l_file_input != 0 ) { char l_line[512]; if ( fgets( l_line, 512, l_file_handle) == NULL ) { l_keep_going = 0; break; } else { l_keep_going = 1; } if ( 6 != sscanf( l_line, "%i %i %i %i %i %i", &l_m, &l_n, &l_k, &l_lda, &l_ldb, &l_ldc ) ) exit(EXIT_FAILURE); } l_gemm_def.m = l_m; l_gemm_def.n = l_n; l_gemm_def.k = l_k; l_gemm_def.lda = l_lda; l_gemm_def.ldb = l_ldb; l_gemm_def.ldc = l_ldc; l_gemm_def.alpha = l_alpha; l_gemm_def.beta = l_beta; l_gemm_def.trans_a = l_trans_a; l_gemm_def.trans_b = l_trans_b; 
l_gemm_def.aligned_a = l_aligned_a; l_gemm_def.aligned_c = l_aligned_c; l_gemm_def.prefetch = l_prefetch; l_gemm_def.br_type = l_br_type; l_gemm_def.br_count = l_br; l_gemm_def.br_unroll = l_br_unroll; l_a_bf = (libxsmm_bfloat16*)libxsmm_aligned_malloc((size_t)l_lda * (size_t)l_k * (size_t)l_br * sizeof(libxsmm_bfloat16), 64); l_b_bf = (libxsmm_bfloat16*)libxsmm_aligned_malloc((size_t)l_ldb * (size_t)l_n * (size_t)l_br * sizeof(libxsmm_bfloat16), 64); l_c_bf = (libxsmm_bfloat16*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(libxsmm_bfloat16), 64); l_c_gold_bf = (libxsmm_bfloat16*)libxsmm_aligned_malloc((size_t)l_ldc * (size_t)l_n * sizeof(libxsmm_bfloat16), 64); /* touch A */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_lda; l_i++) { for (l_j = 0; l_j < l_k; l_j++) { union libxsmm_bfloat16_hp tmp; tmp.f = (float)libxsmm_rng_f64(); l_a_bf[(l_r * l_lda * l_k) + (l_j * l_lda) + l_i] = tmp.i[1]; } } } /* touch B */ for (l_r = 0; l_r < l_br; l_r++) { for (l_i = 0; l_i < l_ldb; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { union libxsmm_bfloat16_hp tmp; tmp.f = (float)libxsmm_rng_f64(); l_b_bf[(l_r * l_ldb * l_n) + (l_j * l_ldb) + l_i] = tmp.i[1]; } } } /* touch C */ for (l_i = 0; l_i < l_ldc; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { union libxsmm_bfloat16_hp tmp; tmp.f = 0.0f; l_c_bf[(l_j * l_ldc) + l_i] = tmp.i[1]; l_c_gold_bf[(l_j * l_ldc) + l_i] = tmp.i[1]; } } l_runtime_libxsmm = run_jit_bfloat16( &l_gemm_def, l_a_bf, l_b_bf, l_c_bf, l_file_input ); if ( l_run_check == 1 ) { l_start = libxsmm_timer_tick(); for (l_t = 0; l_t < g_reps; l_t++) { for (l_r = 0; l_r < l_br; l_r++) { for (l_j = 0; l_j < l_n; l_j++) { for (l_i = 0; l_i < l_m; l_i++) { union libxsmm_bfloat16_hp fprod; fprod.i[1] = l_c_gold_bf[(l_j * l_ldc) + l_i]; fprod.i[0] = 0; for (l_s = 0; l_s < (l_k / l_k_block); l_s++) { for (l_k2 = 0; l_k2 < l_k_block; l_k2++) { union libxsmm_bfloat16_hp tmp_a_f; union libxsmm_bfloat16_hp tmp_b_f; tmp_a_f.i[1] = l_a_bf[(l_r * l_lda * l_k) + 
(l_s * (l_lda*l_k_block)) + (l_i*l_k_block) + l_k2]; tmp_a_f.i[0] = 0; tmp_b_f.i[1] = l_b_bf[(l_r * l_ldb * l_n) + (l_j * l_ldb) + (l_s*l_k_block) + l_k2]; tmp_b_f.i[0] = 0; fprod.f += (float)(tmp_a_f.f * tmp_b_f.f); } } l_c_gold_bf[(l_j * l_ldc) + l_i] = fprod.i[1]; } } } } l_runtime_c = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); l_max_error = 0; for (l_i = 0; l_i < l_m; l_i++) { for (l_j = 0; l_j < l_n; l_j++) { union libxsmm_bfloat16_hp tmp_c; union libxsmm_bfloat16_hp tmp_gold; double l_fabs; tmp_c.i[1] = l_c_bf[(l_j * l_ldc) + l_i]; tmp_c.i[0] = 0; tmp_gold.i[1] = l_c_gold_bf[(l_j * l_ldc) + l_i]; tmp_gold.i[0] = 0; l_fabs = fabs((double)tmp_gold.f - (double)tmp_c.f); if (l_max_error < l_fabs) l_max_error = l_fabs; } } } if ( l_file_input == 0 ) { printf("%fs for C\n", l_runtime_c); printf("%f GFLOPS for C\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_c * 1.0e9)); printf("%fs for libxsmm\n", l_runtime_libxsmm); printf("%f GFLOPS for libxsmm\n", ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9)); printf("max. 
error: %f\n", l_diff.linf_abs); } else { if ( l_run_check == 1 ) { printf("%i %i %i %i %i %i %i %i %i %s %f %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9), l_diff.linf_abs ); } else { printf("%i %i %i %i %i %i %i %i %i %s %f\n", l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_br, l_br_type, l_br_unroll, l_precision, ((double)((double)g_reps * (double)l_m * (double)l_n * (double)l_k * (double)l_br) * 2.0) / (l_runtime_libxsmm * 1.0e9) ); } } if ( (l_total_max_error < l_max_error) && (l_run_check == 1) ) { l_total_max_error = l_max_error; } libxsmm_free(l_a_bf); libxsmm_free(l_b_bf); libxsmm_free(l_c_bf); libxsmm_free(l_c_gold_bf); } while ( l_keep_going ); } if ( l_file_input != 0 ) { fclose( l_file_handle ); } else { printf("------------------------------------------------\n"); } /* Print total max error */ printf("\n\n Total Max Error %f\n\n", l_total_max_error ); if ( l_total_max_error >= 0.00005 && l_br_type == 0) { return EXIT_FAILURE; } else if ( l_total_max_error >= 0.0005 && l_br_type > 0) { return EXIT_FAILURE; } else { return EXIT_SUCCESS; } } libxsmm-1.17/samples/xgemm/kernel.sh000077500000000000000000000050141415223013700175130ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/xgemm/kernel.vcxproj000066400000000000000000000541771415223013700206070ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 kernel {5528C759-B5AD-4325-8A45-EFE647B1702E} 
10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true 
MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 
$(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/xgemm/kernel_appsizes.sh000077500000000000000000000031351415223013700214330ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### REPS=10000 TEST_EDGE="4_9_4 4_9_9 10_9_10 10_9_9 20_9_20 20_9_9 35_9_35 35_9_9 56_9_56 56_9_9" TEST_EDGE_PAD="4_9_4 4_9_9 12_9_12 12_9_9 20_9_20 20_9_9 36_9_36 36_9_9 56_9_56 56_9_9" TEST_SU2="1008_5_75 75_5_756 147_5_75 48_5_35 184_5_35 35_5_138 75_5_147 35_5_48 48_5_75 108_5_75 75_5_48 16_5_15 15_5_16 49_5_25 25_5_49" TEST_SU2_2F="1008_10_75 75_10_756 147_10_75 48_10_35 184_10_35 35_10_138 75_10_147 35_10_48 48_10_75 108_10_75 75_10_48 16_10_15 15_10_16 49_10_25 25_10_49" TEST_SU2_3F="1008_15_75 75_15_756 147_15_75 48_15_35 184_15_35 35_15_138 75_15_147 35_15_48 48_15_75 108_15_75 75_15_48 16_15_15 15_15_16 49_15_25 25_15_49" TEST=${TEST_EDGE}$ # select precision PREC=DP if [ $# -eq 1 ] then PREC=$1 fi for t in ${TEST} do M=`echo ${t} | awk -F"_" '{print $1}'` N=`echo ${t} | awk -F"_" '{print $2}'` K=`echo ${t} | awk -F"_" '{print $3}'` lda=$M ldb=$K ldc=$M ./kernel $M $N $K $lda $ldb $ldc 1 1 0 0 0 0 nopf ${PREC} nobr 1 0 ${REPS} done libxsmm-1.17/samples/xgemm/xgemm.c000066400000000000000000000161201415223013700171550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if defined(__MKL) # include #endif #include #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if !defined(ITYPE) # define ITYPE double #endif #if !defined(OTYPE) # define OTYPE ITYPE #endif #if !defined(SEQUENTIAL) && 0 # define SEQUENTIAL #endif #if !defined(XGEMM) # if defined(SEQUENTIAL) # define XGEMM(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ libxsmm_xgemm(LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(OTYPE), \ TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) # else # define XGEMM(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ LIBXSMM_YGEMM_SYMBOL(ITYPE)(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) # endif #endif #if !defined(CHECK) && (LIBXSMM_EQUAL(ITYPE, float) || LIBXSMM_EQUAL(ITYPE, double)) # if !defined(MKL_DIRECT_CALL_SEQ) && !defined(MKL_DIRECT_CALL) LIBXSMM_BLAS_SYMBOL_DECL(ITYPE, gemm) # endif # define XGEMM_GOLD(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ LIBXSMM_GEMM_SYMBOL(ITYPE)(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) # define CHECK #endif int main(int argc, char* argv[]) { LIBXSMM_BLAS_CONST libxsmm_blasint m = (1 < argc ? atoi(argv[1]) : 512); LIBXSMM_BLAS_CONST libxsmm_blasint k = (3 < argc ? atoi(argv[3]) : m); LIBXSMM_BLAS_CONST libxsmm_blasint n = (2 < argc ? atoi(argv[2]) : k), nn = n; LIBXSMM_BLAS_CONST OTYPE alpha = (OTYPE)(7 < argc ? atof(argv[7]) : 1.0); LIBXSMM_BLAS_CONST OTYPE beta = (OTYPE)(8 < argc ? atof(argv[8]) : 1.0); LIBXSMM_BLAS_CONST char transa = (/*LIBXSMM_BLAS_CONST*/ char)( 9 < argc ? *argv[9] : 'N'); LIBXSMM_BLAS_CONST char transb = (/*LIBXSMM_BLAS_CONST*/ char)(10 < argc ? *argv[10] : 'N'); LIBXSMM_BLAS_CONST libxsmm_blasint mm = (('N' == transa || 'n' == transa) ? 
m : k); LIBXSMM_BLAS_CONST libxsmm_blasint kk = (('N' == transb || 'n' == transb) ? k : n); LIBXSMM_BLAS_CONST libxsmm_blasint ka = (('N' == transa || 'n' == transa) ? k : m); LIBXSMM_BLAS_CONST libxsmm_blasint kb = (('N' == transb || 'n' == transb) ? n : k); LIBXSMM_BLAS_CONST libxsmm_blasint lda = ((4 < argc && mm < atoi(argv[4])) ? atoi(argv[4]) : mm); LIBXSMM_BLAS_CONST libxsmm_blasint ldb = ((5 < argc && kk < atoi(argv[5])) ? atoi(argv[5]) : kk); LIBXSMM_BLAS_CONST libxsmm_blasint ldc = ((6 < argc && m < atoi(argv[6])) ? atoi(argv[6]) : m); const int nrepeat = ((11 < argc && 0 < atoi(argv[11])) ? atoi(argv[11]) : LIBXSMM_MAX(13 / LIBXSMM_MAX(1, (int)(libxsmm_icbrt_u64(1ULL * m * n * k) >> 10)), 3)); const double gflops = 2.0 * m * n * k * 1E-9; int result = EXIT_SUCCESS; #if defined(CHECK) const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(NULL == env_check ? 0 : atof(env_check)); #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { const char *const env_tasks = getenv("TASKS"); const int tasks = (NULL == env_tasks || 0 == *env_tasks) ? 
0/*default*/ : atoi(env_tasks); ITYPE *const a = (ITYPE*)libxsmm_malloc((size_t)(lda * ka * sizeof(ITYPE))); ITYPE *const b = (ITYPE*)libxsmm_malloc((size_t)(ldb * kb * sizeof(ITYPE))); OTYPE *const c = (OTYPE*)libxsmm_malloc((size_t)(ldc * nn * sizeof(OTYPE))); #if defined(CHECK) OTYPE* d = 0; if (!LIBXSMM_FEQ(0, check)) { d = (OTYPE*)libxsmm_malloc((size_t)(ldc * nn * sizeof(OTYPE))); LIBXSMM_MATINIT_OMP(OTYPE, 0, d, m, n, ldc, 1.0); } #endif LIBXSMM_MATINIT_OMP(OTYPE, 0, c, m, n, ldc, 1.0); LIBXSMM_MATINIT_OMP(ITYPE, 42, a, mm, ka, lda, 1.0); LIBXSMM_MATINIT_OMP(ITYPE, 24, b, kk, kb, ldb, 1.0); #if defined(MKL_ENABLE_AVX512) mkl_enable_instructions(MKL_ENABLE_AVX512); #endif /* warm-up OpenMP (populate thread pool) */ #if defined(CHECK) && (!defined(__BLAS) || (0 != __BLAS)) if (0 != d) XGEMM_GOLD(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, d, &ldc); #endif XGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); libxsmm_gemm_print(stdout, LIBXSMM_GEMM_PRECISION(ITYPE), &transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); fprintf(stdout, "\n\n"); if (0 == tasks) { /* tiled xGEMM (with library-internal parallelization) */ int i; double duration; unsigned long long start = libxsmm_timer_tick(); for (i = 0; i < nrepeat; ++i) { XGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s\n", gflops * nrepeat / duration); } } else { /* tiled xGEMM (with external parallelization) */ int i; double duration; unsigned long long start = libxsmm_timer_tick(); for (i = 0; i < nrepeat; ++i) { #if defined(_OPENMP) # pragma omp parallel # pragma omp single nowait #endif XGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { fprintf(stdout, "\tLIBXSMM: %.1f 
GFLOPS/s\n", gflops * nrepeat / duration); } } #if defined(CHECK) && (!defined(__BLAS) || (0 != __BLAS)) if (0 != d) { /* validate result against LAPACK/BLAS xGEMM */ libxsmm_matdiff_info diff; int i; double duration; unsigned long long start = libxsmm_timer_tick(); for (i = 0; i < nrepeat; ++i) { XGEMM_GOLD(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, d, &ldc); } duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { fprintf(stdout, "\tBLAS: %.1f GFLOPS/s\n", gflops * nrepeat / duration); } result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(OTYPE), m, n, d, c, &ldc, &ldc); if (EXIT_SUCCESS == result) { fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < diff.l2_rel) { fprintf(stderr, "FAILED.\n"); result = EXIT_FAILURE; } } libxsmm_free(d); } #endif libxsmm_free(c); libxsmm_free(a); libxsmm_free(b); } fprintf(stdout, "Finished\n"); return result; } libxsmm-1.17/samples/xgemm/xgemm.sh000077500000000000000000000050141415223013700173500ustar00rootroot00000000000000#!/usr/bin/env sh ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) NAME=$(basename $0 .sh) GREP=$(command -v grep) ENV=$(command -v env) if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. 
linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi MICINFO=$(command -v micinfo) if [ "${MICINFO}" ]; then MICCORES=$(${MICINFO} 2>/dev/null | sed -n "0,/[[:space:]]\+Total No of Active Cores :[[:space:]]\+\([0-9]\+\)/s//\1/p") fi if [ "" = "${MICCORES}" ]; then MICCORES=61 fi MICTPERC=3 if [ "-mic" != "$1" ]; then if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ KMP_AFFINITY=compact,granularity=fine,1 \ MIC_KMP_AFFINITY=compact,granularity=fine \ MIC_KMP_HW_SUBSET=$((MICCORES-1))c${MICTPERC}t \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} "$@" fi else shift ${ENV} \ SINK_LD_LIBRARY_PATH=${SINK_LD_LIBRARY_PATH}:${MIC_LD_LIBRARY_PATH}:${HERE}/../../lib \ micnativeloadex \ ${HERE}/${NAME}${EXE} -a "$*" \ -e "KMP_AFFINITY=compact,granularity=fine" \ -e "MIC_KMP_HW_SUBSET=$((MICCORES-1))${MICTPERC}t" fi libxsmm-1.17/samples/xgemm/xgemm.vcxproj000066400000000000000000000553051415223013700204360ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 xgemm {E4DB8253-79E9-4E5D-BF01-90ABA3A76893} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false SingleFile 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console 
$(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false SingleFile 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true 
true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false 3948,10373,10382 HOST true true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/samples/xgemm/xgemm_opentuner.py000077500000000000000000000126731415223013700214760ustar00rootroot00000000000000#!/usr/bin/env python3 ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### # # This script is based on OpenTuner's tutorial: # "Optimizing Block Matrix Multiplication". 
# import opentuner from opentuner import ConfigurationManipulator from opentuner import IntegerParameter from opentuner import MeasurementInterface from opentuner import Result import inspect import json import time import math import sys import os import re try: here = os.path.dirname(inspect.getfile(inspect.currentframe())) scripts = os.path.realpath(os.path.join(here, "..", "..", "scripts")) if scripts not in sys.path: sys.path.insert(0, scripts) import libxsmm_utilities except ImportError: pass class XgemmTuner(MeasurementInterface): def manipulator(self): """ Define the search space by creating a ConfigurationManipulator """ self.dimset = libxsmm_utilities.load_mnklist(self.args.mnk, 0, -1) self.granularity = 1 assert(0 < self.granularity) m_max = (64 + self.granularity - 1) / self.granularity n_max = (256 + self.granularity - 1) / self.granularity k_max = (256 + self.granularity - 1) / self.granularity m_param = IntegerParameter("M", self.granularity, m_max) n_param = IntegerParameter("N", self.granularity, n_max) k_param = IntegerParameter("K", self.granularity, k_max) manipulator = ConfigurationManipulator() manipulator.add_parameter(m_param) manipulator.add_parameter(n_param) manipulator.add_parameter(k_param) return manipulator def seed_configurations(self): m_seed = self.args.m n_seed = [self.args.n, m_seed][0 == self.args.n] k_seed = [self.args.k, n_seed][0 == self.args.k] if 0 == m_seed or 0 == n_seed or 0 == k_seed: return [] else: return [{"M": (m_seed + self.granularity - 1) / self.granularity, "N": (n_seed + self.granularity - 1) / self.granularity, "K": (k_seed + self.granularity - 1) / self.granularity}] def objective(self): return opentuner.search.objective.MaximizeAccuracyMinimizeSize() def run(self, desired_result, input, limit): """ Compile and run a given configuration then return performance """ cfg = desired_result.configuration.data run_cmd = ( "CHECK=0" " LIBXSMM_TGEMM_M=" + str(self.granularity * cfg["M"]) + " LIBXSMM_TGEMM_N=" + 
str(self.granularity * cfg["N"]) + " LIBXSMM_TGEMM_K=" + str(self.granularity * cfg["K"]) + " ./xgemm.sh") geoperf = 0 # geometric mean compensation = 0 # see Kahan for dims in self.dimset: run_result = self.call_program( run_cmd + " " + " ".join(map(str, dims))) assert(run_result["returncode"] == 0) match = re.search( "\\s*LIBXSMM:\\s+([0-9]+(\\.[0-9]*)*)", str(run_result["stdout"])) assert(match is not None) gflops = float(match.group(1)) assert(0 < gflops) kha = math.log(gflops) - compensation khb = geoperf + kha compensation = (khb - geoperf) - kha geoperf = khb geoperf = math.exp(geoperf / len(self.dimset)) geotime = 1000000.0 / geoperf mnk = (self.granularity**3) * cfg["M"] * cfg["N"] * cfg["K"] return Result(time=geotime, accuracy=geoperf, size=mnk) def save_final_config(self, configuration): """called at the end of tuning""" matrices = ( # collects requested matrix shapes into string "-".join(map(str, map(lambda mnk: "x".join( map(str, mnk)), self.dimset)))) filename = "xgemm-" + matrices + time.strftime( "-%Y%m%d-%H%M%S") + ".json" print("Optimal block size written to " + filename + ": ", configuration.data) # self.manipulator().save_to_file(configuration.data, filename) with open(filename, 'w') as fd: json.dump(configuration.data, fd) if __name__ == "__main__": argparser = opentuner.default_argparser() argparser.add_argument( "mnk", nargs="*", default=["1024,1280,1536,1792"], help="Set of MNK parameters to be tuned") argparser.add_argument( "-m", "--initial-m", type=int, default=0, nargs='?', dest="m", help="Initial tile size (M)") argparser.add_argument( "-n", "--initial-n", type=int, default=0, nargs='?', dest="n", help="Initial tile size (N)") argparser.add_argument( "-k", "--initial-k", type=int, default=0, nargs='?', dest="k", help="Initial tile size (K)") XgemmTuner.main(argparser.parse_args()) 
libxsmm-1.17/scripts/000077500000000000000000000000001415223013700146025ustar00rootroot00000000000000libxsmm-1.17/scripts/libxsmm_config.py000077500000000000000000000122771415223013700201700ustar00rootroot00000000000000#!/usr/bin/env python3 ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### from string import Template from datetime import date import libxsmm_utilities import fnmatch import sys if __name__ == "__main__": argc = len(sys.argv) if 1 < argc: # required argument(s) filename = sys.argv[1] # default configuration if no arguments are given ilp64 = offload = precision = flags = threshold = 0 sync = jit = 1 alpha = beta = 1 cacheline = 64 prefetch = -1 wrap = 1 malloc = 0 mnklist = list() # optional argument(s) if 2 < argc: ilp64 = int(sys.argv[2]) if 3 < argc: offload = int(sys.argv[3]) if 4 < argc: cacheline = libxsmm_utilities.sanitize_alignment(int(sys.argv[4])) if 5 < argc: precision = int(sys.argv[5]) if 6 < argc: prefetch = int(sys.argv[6]) if 7 < argc: threshold = int(sys.argv[7]) if 8 < argc: sync = int(sys.argv[8]) if 9 < argc: jit = int(sys.argv[9]) if 10 < argc: flags = int(sys.argv[10]) if 11 < argc: alpha = int(sys.argv[11]) if 12 < argc: beta = int(sys.argv[12]) if 13 < argc: wrap = int(sys.argv[13]) if 14 < argc: malloc = int(sys.argv[14]) if 15 < argc: mnklist = sorted(libxsmm_utilities.load_mnklist(sys.argv[15:], 0)) version, branch, realversion = libxsmm_utilities.version_branch() major, minor, update, patch = libxsmm_utilities.version_numbers( version ) if 0 == 
threshold: threshold = 64 * 64 * 64 maxmnk = libxsmm_utilities.max_mnk(mnklist, threshold) maxdim = int(maxmnk ** (1.0 / 3.0) + 0.5) avgdim = int(0.5 * maxdim + 0.5) avgm = libxsmm_utilities.median( list(map(lambda mnk: mnk[0], mnklist)), avgdim, False ) avgn = libxsmm_utilities.median( list(map(lambda mnk: mnk[1], mnklist)), avgdim, False ) avgk = libxsmm_utilities.median( list(map(lambda mnk: mnk[2], mnklist)), avgdim, False ) maxm = libxsmm_utilities.max_mnk(mnklist, avgdim, 0) maxn = libxsmm_utilities.max_mnk(mnklist, avgdim, 1) maxk = libxsmm_utilities.max_mnk(mnklist, avgdim, 2) substitute = { "VERSION": realversion, "BRANCH": branch, "MAJOR": major, "MINOR": minor, "UPDATE": update, "PATCH": patch, "DATE": date.today().strftime("%Y%m%d"), "CACHELINE": cacheline, "PREFETCH": [-1, prefetch][0 <= prefetch], "MAX_MNK": maxmnk, "MAX_DIM": maxdim, "AVG_DIM": int((maxdim + 1) / 2), "MAX_M": [maxdim, maxm][avgm < maxm], "MAX_N": [maxdim, maxn][avgn < maxn], "MAX_K": [maxdim, maxk][avgk < maxk], "FLAGS": flags, "ILP64": [0, 1][0 != ilp64], "ALPHA": alpha, "BETA": beta, "WRAP": wrap, "MALLOC": malloc, "SYNC": [0, 1][0 != sync], "JIT": [0, 1][0 != jit], "LIBXSMM_OFFLOAD_BUILD": ["", "\n#define LIBXSMM_OFFLOAD_BUILD"][ 0 != offload ], "MNK_PREPROCESSOR_LIST": "", } template = Template(open(filename, "r").read()) if fnmatch.fnmatch(filename, "*.h*"): if mnklist: first = mnklist[0] for mnk in mnklist: mnkstr = "_".join(map(str, mnk)) if mnk != first: substitute["MNK_PREPROCESSOR_LIST"] += "\n" if 2 != precision: substitute["MNK_PREPROCESSOR_LIST"] += ( "#define LIBXSMM_SMM_" + mnkstr ) if mnk != first or 0 == precision: substitute["MNK_PREPROCESSOR_LIST"] += "\n" if 1 != precision: substitute["MNK_PREPROCESSOR_LIST"] += ( "#define LIBXSMM_DMM_" + mnkstr ) print(template.substitute(substitute)) else: substitute["BLASINT_KIND"] = ["C_INT", "C_LONG_LONG"][0 != ilp64] print(template.safe_substitute(substitute)) else: sys.tracebacklimit = 0 raise ValueError(sys.argv[0] + ": 
wrong number of arguments!") libxsmm-1.17/scripts/libxsmm_dispatch.py000077500000000000000000000112101415223013700205040ustar00rootroot00000000000000#!/usr/bin/env python3 ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### import libxsmm_utilities import sys import os if __name__ == "__main__": argc = len(sys.argv) if 1 < argc: arg1_filename = [sys.argv[1], ""]["0" == sys.argv[1]] arg1_isfile = os.path.isfile(arg1_filename) base = 1 if arg1_isfile: print("#if !defined(_WIN32)") print("{ static const char *const build_state =") print('# include "../' + os.path.basename(arg1_filename) + '"') print(" ;") print(" internal_build_state = build_state;") print("}") print("#endif") base = 2 if (base + 2) < argc: precision = int(sys.argv[base + 0]) threshold = int(sys.argv[base + 1]) mnklist = libxsmm_utilities.load_mnklist(sys.argv[base + 2:], 0) print( "/* omit registering code if JIT is enabled" " and if an ISA extension is found" ) print( " * which is beyond the static code" " path used to compile the library" ) print(" */") print("#if (0 != LIBXSMM_JIT) && !defined(__MIC__)") print( "if (LIBXSMM_X86_GENERIC > libxsmm_target_archid " "/* JIT code gen. 
is not available */" ) print( " /* conditions allows to avoid JIT " "(if static code is good enough) */" ) print( " || (LIBXSMM_STATIC_TARGET_ARCH == libxsmm_target_archid)" ) print( " || (LIBXSMM_X86_AVX512_CORE <= libxsmm_target_archid &&" ) print( " libxsmm_cpuid_vlen32(LIBXSMM_STATIC_TARGET_ARCH) ==" ) print( " libxsmm_cpuid_vlen32(libxsmm_target_archid)))" ) print("#endif") print("{") print(" libxsmm_xmmfunction func;") for mnk in mnklist: mstr, nstr, kstr, mnkstr = ( str(mnk[0]), str(mnk[1]), str(mnk[2]), "_".join(map(str, mnk)), ) mnksig = mstr + ", " + nstr + ", " + kstr # prefer registering double-precision kernels # when approaching an exhausted registry if 1 != precision: # only double-precision print( " func.dmm = (libxsmm_dmmfunction)libxsmm_dmm_" + mnkstr + ";" ) print( " internal_register_static_code(" + "LIBXSMM_GEMM_PRECISION_F64, " + mnksig + ", func, new_registry);" ) for mnk in mnklist: mstr, nstr, kstr, mnkstr = ( str(mnk[0]), str(mnk[1]), str(mnk[2]), "_".join(map(str, mnk)), ) mnksig = mstr + ", " + nstr + ", " + kstr # prefer registering double-precision kernels # when approaching an exhausted registry if 2 != precision: # only single-precision print( " func.smm = (libxsmm_smmfunction)libxsmm_smm_" + mnkstr + ";" ) print( " internal_register_static_code(" + "LIBXSMM_GEMM_PRECISION_F32, " + mnksig + ", func, new_registry);" ) print("}") else: sys.tracebacklimit = 0 raise ValueError(sys.argv[0] + ": wrong number of arguments!") libxsmm-1.17/scripts/libxsmm_interface.py000077500000000000000000000205151415223013700206550ustar00rootroot00000000000000#!/usr/bin/env python3 ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### from string import Template import libxsmm_utilities import fnmatch import sys if __name__ == "__main__": argc = len(sys.argv) if 1 < argc: # required argument(s) filename = sys.argv[1] # default configuration if no arguments are given precision = 0 # all ifversion = 1 # interface prefetch = -1 # auto mnklist = list() # optional argument(s) if 2 < argc: ivalue = int(sys.argv[2]) ifversion = (ivalue >> 2) precision = (ivalue & 3) if 3 < argc: prefetch = int(sys.argv[3]) if 4 < argc: mnklist = sorted(libxsmm_utilities.load_mnklist(sys.argv[4:], 0)) template = Template(open(filename, "r").read()) if fnmatch.fnmatch(filename, "*.h*"): optional = [", ...", ""][0 <= prefetch] substitute = {"MNK_INTERFACE_LIST": ""} for mnk in mnklist: mnkstr = "_".join(map(str, mnk)) if 2 != precision: pfsig = [ optional + ");", ",\n " "const float* pa, " "const float* pb, " "const float* pc);" ][0 < prefetch] substitute["MNK_INTERFACE_LIST"] += ( "\nLIBXSMM_API void libxsmm_smm_" + mnkstr + "(const float* a, const float* b, float* c" + pfsig ) if 1 != precision: pfsig = [ optional + ");", ",\n " "const double* pa, " "const double* pb, " "const double* pc);" ][0 < prefetch] substitute["MNK_INTERFACE_LIST"] += ( "\nLIBXSMM_API void libxsmm_dmm_" + mnkstr + "(const double* a, const double* b, double* c" + pfsig ) if 0 == precision: substitute["MNK_INTERFACE_LIST"] += "\n" if mnklist and 0 != precision: substitute["MNK_INTERFACE_LIST"] += "\n" print(template.substitute(substitute)) else: # Fortran interface if 1 > ifversion and 0 != ifversion: raise ValueError("Fortran interface level is inconsistent!") # Fortran's OPTIONAL allows to always generate an interface # with prefetch signature (more flexible 
usage) if 0 == prefetch: prefetch = -1 version, branch, realversion = libxsmm_utilities.version_branch(16) major, minor, update, patch = libxsmm_utilities.version_numbers( version ) substitute = { "VERSION": realversion, "BRANCH": branch, "MAJOR": major, "MINOR": minor, "UPDATE": update, "PATCH": patch, "MNK_INTERFACE_LIST": "", "CONTIGUOUS": ["", ", CONTIGUOUS"][1 < ifversion] } if mnklist: substitute["MNK_INTERFACE_LIST"] += "\n" for mnk in mnklist: mnkstr = "_".join(map(str, mnk)) if 0 == precision: substitute["MNK_INTERFACE_LIST"] += ( "\n " "!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_smm_" + mnkstr + ", libxsmm_dmm_" + mnkstr ) elif 2 != precision: substitute["MNK_INTERFACE_LIST"] += ( "\n " "!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_smm_" + mnkstr ) elif 1 != precision: substitute["MNK_INTERFACE_LIST"] += ( "\n " "!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_dmm_" + mnkstr ) substitute["MNK_INTERFACE_LIST"] += "\n INTERFACE" optional = [", OPTIONAL", ""][0 < prefetch] bindc = ["", "BIND(C)"][0 < prefetch] for mnk in mnklist: mnkstr = "_".join(map(str, mnk)) if 2 != precision: pfsiga = [ ") BIND(C)\n", "," + "&".rjust(26 - len(mnkstr)) + "\n & pa, pb, pc) " + bindc + "\n" ][0 != prefetch] pfsigb = [ "", " REAL(C_FLOAT), " "INTENT(IN)" + optional + " :: " "pa(*), " "pb(*), " "pc(*)\n" ][0 != prefetch] substitute["MNK_INTERFACE_LIST"] += ( "\n " "PURE SUBROUTINE libxsmm_smm_" + mnkstr + "(a, b, c" + pfsiga + " IMPORT :: C_FLOAT\n" " REAL(C_FLOAT), " "INTENT(IN) :: a(*), b(*)\n" " REAL(C_FLOAT), " "INTENT(INOUT) :: c(*)\n" + pfsigb + " END SUBROUTINE" ) if 1 != precision: pfsiga = [ ") BIND(C)\n", "," + "&".rjust(26 - len(mnkstr)) + "\n & pa, pb, pc) " + bindc + "\n" ][0 != prefetch] pfsigb = [ "", " REAL(C_DOUBLE), " "INTENT(IN)" + optional + " :: " "pa(*), " "pb(*), " "pc(*)\n" ][0 != prefetch] substitute["MNK_INTERFACE_LIST"] += ( "\n " "PURE SUBROUTINE libxsmm_dmm_" + mnkstr + "(a, b, c" + pfsiga + " IMPORT :: C_DOUBLE\n" " REAL(C_DOUBLE), " "INTENT(IN) :: a(*), 
b(*)\n" " REAL(C_DOUBLE), " "INTENT(INOUT) :: c(*)\n" + pfsigb + " END SUBROUTINE" ) substitute["MNK_INTERFACE_LIST"] += "\n END INTERFACE" print(template.safe_substitute(substitute)) else: sys.tracebacklimit = 0 raise ValueError(sys.argv[0] + ": wrong number of arguments!") libxsmm-1.17/scripts/libxsmm_source.sh000077500000000000000000000050161415223013700201760ustar00rootroot00000000000000#!/usr/bin/env sh SRCDIR=../src HERE=$(cd "$(dirname "$0")" && pwd -P) GREP=$(command -v grep) GIT=$(command -v git) if [ "" = "${GREP}" ]; then >&2 echo "Error: missing prerequisites!" exit 1 fi if [ "${GIT}" ] && [ "" = "$(${GIT} ls-files "${HERE}/${SRCDIR}/libxsmm_main.c" 2>/dev/null)" ]; then GIT="" fi cat << EOM /****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_SOURCE_H #define LIBXSMM_SOURCE_H #if defined(LIBXSMM_MACROS_H) # error Please do not include any LIBXSMM header other than libxsmm_source.h! #endif #if defined(LIBXSMM_BUILD) # error LIBXSMM_BUILD cannot be defined for the header-only LIBXSMM! #endif /** * This header is intentionally called "libxsmm_source.h" since the followings block * includes *internal* files, and thereby exposes LIBXSMM's implementation. * The so-called "header-only" usage model gives up the clearly defined binary interface * (including support for hot-fixes after deployment), and requires to rebuild client * code for every (internal) change of LIBXSMM. 
Please make sure to only rely on the * public interface as the internal implementation may change without notice. */ #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif EOM if [ "" = "$1" ]; then DSTDIR=${SRCDIR} else DSTDIR=$1 fi # determine order of filenames in directory list export LC_ALL=C # good-enough pattern to match a main function, and to exclude this translation unit for FILE in $(cd "${HERE}/${SRCDIR}" && ${GREP} -L "main[[:space:]]*(.*)" ./*.c); do BASENAME=$(basename "${FILE}") if [ "" = "${GIT}" ] || [ "$(${GIT} ls-files "${HERE}/${SRCDIR}/${BASENAME}" 2>/dev/null)" ]; then echo "#include \"${DSTDIR}/${BASENAME}\"" fi done cat << EOM #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #endif /*LIBXSMM_SOURCE_H*/ EOM libxsmm-1.17/scripts/libxsmm_specialized.py000077500000000000000000000171011415223013700212060ustar00rootroot00000000000000#!/usr/bin/env python3 ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### import sys if __name__ == "__main__": argc = len(sys.argv) if 6 == argc: precision = int(sys.argv[1]) m, n, k = int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) prefetch = int(sys.argv[5]) mnkstr = str(m) + "_" + str(n) + "_" + str(k) optional = ["", ", ..."][0 > prefetch] signature = ["a, b, c", "a, b, c, pa, pb, pc"][0 < prefetch] if 2 != precision: pfsig = [ optional + ")", "\n" ", const float* pa" ", const float* pb" ", const float* pc)", ][0 < prefetch] print print print( "LIBXSMM_API void libxsmm_smm_" + mnkstr + "(const float* a, const float* b, float* c" + pfsig ) print("{") print( "#if defined(__AVX512F__) && " "defined(LIBXSMM_GENTARGET_skx_sp) && \\" ) print(" !(defined(__AVX512PF__) && defined(__AVX512ER__))") print(" libxsmm_smm_" + mnkstr + "_skx(" + signature + ");") print( "#elif defined(__AVX512F__) && " "defined(LIBXSMM_GENTARGET_knl_sp)" ) print(" libxsmm_smm_" + mnkstr + "_knl(" + signature + ");") print( "#elif defined(__AVX2__) && " "defined(LIBXSMM_GENTARGET_hsw_sp)" ) print(" libxsmm_smm_" + mnkstr + "_hsw(" + signature + ");") print( "#elif defined(__AVX__) && " "defined(LIBXSMM_GENTARGET_snb_sp)" ) print(" libxsmm_smm_" + mnkstr + "_snb(" + signature + ");") print( "#elif defined(__SSE3__) && " "defined(LIBXSMM_GENTARGET_wsm_sp)" ) print(" libxsmm_smm_" + mnkstr + "_wsm(" + signature + ");") print("#else") print( " const char transa = (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & " "LIBXSMM_FLAGS) ? 'N' : 'T');" ) print( " const char transb = (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & " "LIBXSMM_FLAGS) ? 
'N' : 'T');" ) print(" const float alpha = LIBXSMM_ALPHA, beta = LIBXSMM_BETA;") print( " const libxsmm_blasint " "m = " + str(m) + ", " "n = " + str(n) + ", " "k = " + str(k) + ";" ) if 0 < prefetch: print( " LIBXSMM_UNUSED(pa);" " LIBXSMM_UNUSED(pb);" " LIBXSMM_UNUSED(pc);" ) print( " LIBXSMM_INLINE_XGEMM(float, float, &transa, &transb," " &m, &n, &k, &alpha, a, &m, b, &k, &beta, c, &m);" ) print("#endif") print("}") print print print( "LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_smm_" + mnkstr + ")(const float* a, const float* b, float* c" + pfsig + ";" ) print( "LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_smm_" + mnkstr + ")(const float* a, const float* b, float* c" + pfsig ) print("{") print(" libxsmm_smm_" + mnkstr + "(" + signature + ");") print("}") if 1 != precision: pfsig = [ optional + ")", "\n" ", const double* pa" ", const double* pb" ", const double* pc)", ][0 < prefetch] print print print( "LIBXSMM_API void libxsmm_dmm_" + mnkstr + "(const double* a, const double* b, double* c" + pfsig ) print("{") print( "#if defined(__AVX512F__) && " "defined(LIBXSMM_GENTARGET_skx_dp) && \\" ) print(" !(defined(__AVX512PF__) && defined(__AVX512ER__))") print(" libxsmm_dmm_" + mnkstr + "_skx(" + signature + ");") print( "#elif defined(__AVX512F__) && " "defined(LIBXSMM_GENTARGET_knl_dp)" ) print(" libxsmm_dmm_" + mnkstr + "_knl(" + signature + ");") print( "#elif defined(__AVX2__) && " "defined(LIBXSMM_GENTARGET_hsw_dp)" ) print(" libxsmm_dmm_" + mnkstr + "_hsw(" + signature + ");") print( "#elif defined(__AVX__) && " "defined(LIBXSMM_GENTARGET_snb_dp)" ) print(" libxsmm_dmm_" + mnkstr + "_snb(" + signature + ");") print( "#elif defined(__SSE3__) && " "defined(LIBXSMM_GENTARGET_wsm_dp)" ) print(" libxsmm_dmm_" + mnkstr + "_wsm(" + signature + ");") print("#else") print( " const char transa = (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & " "LIBXSMM_FLAGS) ? 'N' : 'T');" ) print( " const char transb = (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & " "LIBXSMM_FLAGS) ? 
'N' : 'T');" ) print(" const double alpha = LIBXSMM_ALPHA, beta = LIBXSMM_BETA;") print( " const libxsmm_blasint " "m = " + str(m) + ", " "n = " + str(n) + ", " "k = " + str(k) + ";" ) if 0 < prefetch: print( " LIBXSMM_UNUSED(pa);" " LIBXSMM_UNUSED(pb);" " LIBXSMM_UNUSED(pc);" ) print( " LIBXSMM_INLINE_XGEMM(double, double, &transa, &transb," " &m, &n, &k, &alpha, a, &m, b, &k, &beta, c, &m);" ) print("#endif") print("}") print print print( "LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_dmm_" + mnkstr + ")(const double* a, const double* b, double* c" + pfsig + ";" ) print( "LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_dmm_" + mnkstr + ")(const double* a, const double* b, double* c" + pfsig ) print("{") print(" libxsmm_dmm_" + mnkstr + "(" + signature + ");") print("}") else: sys.tracebacklimit = 0 raise ValueError(sys.argv[0] + ": wrong number of arguments!") libxsmm-1.17/scripts/libxsmm_utilities.py000077500000000000000000000240731415223013700207330ustar00rootroot00000000000000#!/usr/bin/env python3 ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### import itertools import operator import inspect import sys import os try: from functools import reduce except ImportError: pass def upper_list(lists, level): nlist = len(lists) upper = [level, level + nlist][1 > level] - 1 above = lists[upper] if above: return above elif -nlist <= level: return upper_list(lists, level - 1) else: return [] # https://docs.python.org/3/library/itertools.html#itertools.product def itertools_product(*args): # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111 pools = [tuple(pool) for pool in args] result = [[]] for pool in pools: result = [x + [y] for x in result for y in pool] for prod in result: yield tuple(prod) def load_mnklist(argv, threshold, inputformat=0, resultset=None): if resultset is None: resultset = set() if 0 == inputformat: # indexes format resultset = set(map(lambda mnk: tuple(map(int, mnk.split("_"))), argv)) elif -1 == inputformat: # new input format groups = map( lambda group: [int(i) for i in group.split()], " ".join(argv[0:]).split(","), ) resultset = set( itertools.chain( *[list(itertools_product(*(i, i, i))) for i in groups] ) ) elif -2 == inputformat: # legacy format mlist = list( map( int, map( lambda s: str(s).replace(",", " ").strip(), argv[2:2 + int(argv[0])], ), ) ) nlist = list( map( int, map( lambda s: str(s).replace(",", " ").strip(), argv[2 + int(argv[0]):2 + int(argv[0]) + int(argv[1])], ), ) ) klist = list( map( int, map( lambda s: str(s).replace(",", " ").strip(), argv[2 + int(argv[0]) + int(argv[1]):], ), ) ) mnk = [mlist, nlist, klist] top = [ [mlist, upper_list(mnk, 0)][0 == len(mlist)], [nlist, upper_list(mnk, 1)][0 == len(nlist)], [klist, upper_list(mnk, 2)][0 == len(klist)], ] for m in top[0]: for n in top[1]: if not nlist: n = m for k in top[2]: if not klist: k = n if not mlist: m = k resultset.add((m, n, k)) else: sys.tracebacklimit = 0 raise 
ValueError("load_mnklist: unexpected input format!") if 0 != threshold: # threshold requested return set( filter( lambda mnk: (0 < mnk[0]) and (0 < mnk[1]) and (0 < mnk[2]) and (threshold >= (mnk[0] * mnk[1] * mnk[2])), resultset, ) ) else: return set( filter( lambda mnk: (0 < mnk[0]) and (0 < mnk[1]) and (0 < mnk[2]), resultset, ) ) def max_mnk(mnklist, init=0, index=None): if index is not None and 0 <= index and index < 3: mapped = map(lambda mnk: mnk[index], mnklist) else: mapped = map(lambda mnk: mnk[0] * mnk[1] * mnk[2], mnklist) return reduce(max, mapped, init) def median(list_of_numbers, fallback=None, average=True): size = len(list_of_numbers) if 0 < size: # TODO: use nth element list_of_numbers.sort() size2 = int(size / 2) if average and 0 == (size - size2 * 2): medval = int( 0.5 * (list_of_numbers[size2 - 1] + list_of_numbers[size2]) + 0.5 ) else: medval = list_of_numbers[size2] if fallback is not None: result = min(medval, fallback) else: result = medval elif fallback is not None: result = fallback else: sys.tracebacklimit = 0 raise ValueError("median: empty list!") return result def is_pot(num): return 0 <= num or 0 == (num & (num - 1)) def sanitize_alignment(alignment): if 0 >= alignment: alignment = [1, 64][0 != alignment] elif not is_pot(alignment): sys.tracebacklimit = 0 raise ValueError( "sanitize_alignment: alignment must be a Power of Two (POT)!" 
) return alignment def align_value(n, typesize, alignment): if 0 < typesize and 0 < alignment: return ( ((n * typesize + alignment - 1) / alignment) * alignment ) / typesize else: sys.tracebacklimit = 0 raise ValueError("align_value: invalid input!") def version_branch_from_file(version_filepath): version_file = open(version_filepath, "r") version, branch, sep = "1.0", "", "-" try: version_list, n = version_file.read().replace("\n", "").split(sep), 0 for word in version_list: if not reduce( operator.and_, (subword.isdigit() for subword in word.split(".")), True, ): branch += [sep + word, word][0 == n] n += 1 else: break version = sep.join(version_list[n:]) finally: version_file.close() return (version, branch) def version_numbers(version, branch=None): version_list = version.split("-") if version_list and not version_list[0][0].isdigit(): vbranch = version_list[0] else: vbranch = "master" if branch is None or vbranch == branch: minor = update = patch = 0 major = 1 n = len(version_list) if 1 < n: patch_list = version_list[n - 1] if 1 == len(patch_list.split(".")): version_list = version_list[n - 2].split(".") if version_list != [vbranch]: patch = int(patch_list) else: major = int(patch_list) else: version_list = patch_list.split(".") else: version_list = version.split(".") n = len(version_list) try: if 0 < n: major = int(version_list[0]) if 1 < n: minor = int(version_list[1]) if 2 < n: update = int(version_list[2]) except ValueError: # if 1 == n: major = 0 pass else: major = minor = update = patch = -1 return major, minor, update, patch def version_branch(max_strlen=-1): version_filename = "version.txt" filepath_default = os.path.realpath( os.path.join( os.path.dirname(inspect.getfile(inspect.currentframe())), "..", version_filename, ) ) filepath_local = os.path.realpath(version_filename) # local version file realversion, branch = version_branch_from_file(filepath_default) version = realversion out_of_tree = filepath_default != filepath_local if out_of_tree and 
os.path.isfile(filepath_local): local, ignored = version_branch_from_file(filepath_local) if version_numbers(realversion) < version_numbers(local): version = local if 0 < max_strlen: start = int(max_strlen / 3) cut = max( branch.rfind("-", start, max_strlen), branch.rfind("_", start, max_strlen), branch.rfind(".", start, max_strlen), ) if start < cut: branch = branch[0:cut] else: branch = branch[0:max_strlen] return (version, branch, realversion) if __name__ == "__main__": argc = len(sys.argv) if 1 < argc: arg1 = int(sys.argv[1]) else: arg1 = 0 if -1 == arg1: if 5 < argc: # threshold = int(sys.argv[2]) mnk_size = int(sys.argv[3]) dims = load_mnklist(sys.argv[4:4 + mnk_size], 0, -1) dims = load_mnklist(sys.argv[4 + mnk_size:], 0, -2, dims) mnklist = map(lambda mnk: "_".join(map(str, mnk)), sorted(dims)) print(" ".join(mnklist)) elif 3 == argc: major, minor, update, patch = ( version_numbers(sys.argv[2], "release") ) print(["0", "1"][0 == patch]) elif 0 <= arg1: if 0 == arg1 and 3 == argc: major, minor, update, patch = version_numbers(sys.argv[2]) print(major) # soname version else: version, branch, realversion = version_branch() major, minor, update, patch = version_numbers(version) if 1 == arg1: print(major) elif 2 == arg1: print(minor) elif 3 == arg1: print(update) elif 4 == arg1: print(patch) elif "" != branch: print("{0}-{1}".format(branch, realversion)) else: print(realversion) else: sys.tracebacklimit = 0 raise ValueError( "{0}: wrong ({1}) number of arguments ('{2}') given!".format( sys.argv[0], argc - 1, " ".join(sys.argv[1:])) ) libxsmm-1.17/scripts/tool_analyze.sh000077500000000000000000000040671415223013700176500ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) MKTEMP=${HERE}/../.mktmp.sh MAKE=$(command -v make) GREP=$(command -v grep) SORT=$(command -v sort) CXX=$(command -v clang++) CC=$(command -v clang) CP=$(command -v cp) MV=$(command -v mv) if [ "${MKTEMP}" ] && [ "${MAKE}" ] && \ [ "${GREP}" ] && [ "${SORT}" ] && \ [ "${CXX}" ] && [ "${CC}" ] && \ [ "${CP}" ] && [ "${MV}" ]; then cd "${HERE}/.." || exit 1 ARG=$* if [ "" = "${ARG}" ]; then ARG=lib fi TMPF=$("${MKTEMP}" .tool_analyze.XXXXXX) ${CP} "${HERE}/../include/libxsmm_config.h" "${TMPF}" ${MAKE} -e CXX="${CXX}" CC="${CC}" FC= FORCE_CXX=1 DBG=1 ILP64=1 EFLAGS="--analyze" ${ARG} 2> .analyze.log ${MV} "${TMPF}" "${HERE}/../include/libxsmm_config.h" ISSUES=$(${GREP} -e "error:" -e "warning:" .analyze.log \ | ${GREP} -v "make:" \ | ${GREP} -v "is never read" \ | ${SORT} -u) echo echo "================================================================================" if [ "" = "${ISSUES}" ]; then echo "SUCCESS" echo "================================================================================" else echo "Errors (warnings)" echo "================================================================================" echo "${ISSUES}" exit 1 fi else >&2 echo "Error: missing prerequisites!" exit 1 fi libxsmm-1.17/scripts/tool_cpuinfo.sh000077500000000000000000000053301415223013700176420ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### GREP=$(command -v grep) SORT=$(command -v sort) CUT=$(command -v cut) TR=$(command -v tr) WC=$(command -v wc) if [ "${GREP}" ] && [ "${SORT}" ] && [ "${CUT}" ] && [ "${TR}" ] && [ "${WC}" ]; then if [ "$(command -v lscpu)" ]; then NS=$(lscpu | ${GREP} -m1 "Socket(s)" | ${TR} -d " " | ${CUT} -d: -f2) if [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(lscpu | ${GREP} -m1 "Core(s) per socket" | ${TR} -d " " | ${CUT} -d: -f2))) NT=$((NC*$(lscpu | ${GREP} -m1 "Thread(s) per core" | ${TR} -d " " | ${CUT} -d: -f2))) elif [ -e /proc/cpuinfo ]; then NS=$(${GREP} "physical id" /proc/cpuinfo | ${SORT} -u | ${WC} -l | ${TR} -d " ") if [ "" = "${NS}" ]; then NS=1; fi NC=$((NS*$(${GREP} -m1 "cpu cores" /proc/cpuinfo | ${TR} -d " " | ${CUT} -d: -f2))) NT=$(${GREP} "core id" /proc/cpuinfo | ${WC} -l | ${TR} -d " ") elif [ "Darwin" = "$(uname)" ]; then NS=$(sysctl hw.packages | ${CUT} -d: -f2 | ${TR} -d " ") NC=$(sysctl hw.physicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") NT=$(sysctl hw.logicalcpu | ${CUT} -d: -f2 | ${TR} -d " ") fi if [ "${NC}" ] && [ "${NT}" ]; then HT=$((NT/NC)) else NS=1 NC=1 NT=1 HT=1 fi if [ "$(command -v numactl)" ]; then NN=$(numactl -H | ${GREP} "available:" | ${CUT} -d' ' -f2) else NN=${NS} fi if [ "-ns" = "$1" ] || [ "--sockets" = "$1" ]; then echo "${NS}" elif [ "-nc" = "$1" ] || [ "--cores" = "$1" ]; then echo "${NC}" elif [ "-nt" = "$1" ] || [ "--threads" = "$1" ]; then echo "${NT}" elif [ "-ht" = "$1" ] || [ "--smt" = "$1" ]; then echo "${HT}" elif [ "-nn" = "$1" ] || [ "--numa" = "$1" ]; then echo "${NN}" elif [ "-h" = "$1" ] || [ "--help" = "$1" ]; then echo "$0 [-ns|--sockets] [-nc|--cores] [-nt|--threads] [-ht|--smt] [-nn|--numa]" else echo -e "sockets\t: ${NS}" echo -e 
"cores\t: ${NC}" echo -e "threads\t: ${NT}" echo -e "smt\t: ${HT}" echo -e "numa:\t: ${NN}" fi else >&2 echo "Error: missing prerequisites!" exit 1 fi libxsmm-1.17/scripts/tool_getenvars.sh000077500000000000000000000032431415223013700201760ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### FIND=$(command -v find) SORT=$(command -v sort) SED=$(command -v gsed) # GNU sed is desired (macOS) if [ "" = "${SED}" ]; then SED=$(command -v sed) fi HERE="$(cd "$(dirname "$0")" && pwd -P)" SRC="${HERE}/../src" EXT="c" if [ "${FIND}" ] && [ "${SORT}" ] && [ "${SED}" ] && [ -d "${SRC}" ]; then export LC_ALL=C ENVARS="$(${FIND} "${SRC}" -type f -name "*.${EXT}" -exec \ "${SED}" "s/getenv[[:space:]]*([[:space:]]*\".[^\"]*/\n&/g" {} \; | \ ${SED} -n "s/.*getenv[[:space:]]*([[:space:]]*\"\(.[^\"]*\)..*/\1/p" | \ ${SORT} -u)" echo "=============================" echo "Other environment variables" echo "=============================" echo "${ENVARS}" | ${SED} "/LIBXSMM_/d" echo "=============================" echo "LIBXSMM environment variables" echo "=============================" echo "${ENVARS}" | ${SED} -n "/LIBXSMM_/p" else >&2 echo "Error: missing prerequisites!" exit 1 fi libxsmm-1.17/scripts/tool_inspector.sh000077500000000000000000000046251415223013700202130ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. 
# # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### RPT=inspector KIND=mi1 BASENAME=$(command -v basename) TOOL=$(command -v inspxe-cl) GREP=$(command -v grep) SED=$(command -v sed) TR=$(command -v tr) RM=$(command -v rm) if [ "${TOOL_ENABLED}" != "" ] && [ "${TOOL_ENABLED}" != "0" ]; then if [ "$1" ] && [ "${BASENAME}" ] && [ "${TOOL}" ] && \ [ "${TR}" ] && [ "${GREP}" ] && [ "${SED}" ] && \ [ "${RM}" ]; then HERE=$(cd "$(dirname "$0")" && pwd -P) if [ "" = "${TRAVIS_BUILD_DIR}" ]; then export TRAVIS_BUILD_DIR=${HERE}/.. fi if [ "${TESTID}" ]; then ID=${TESTID} fi if [ "" = "${ID}" ]; then ID=${COVID} fi if [ "${ID}" ]; then RPTNAME=$(${BASENAME} $1)-${KIND}-${ID} else RPTNAME=$(${BASENAME} $1)-${KIND} fi DIR=${TRAVIS_BUILD_DIR}/${RPT} ${RM} -rf ${DIR}/${ID} ${TOOL} -collect ${KIND} -r ${DIR}/${ID} -no-auto-finalize -return-app-exitcode -- "$@" RESULT=$? if [ "0" = "${RESULT}" ]; then ${TOOL} -report problems -r ${DIR}/${ID} > ${DIR}/${RPTNAME}.txt RESULT2=$? if [ "" = "${TOOL_REPORT_ONLY}" ] && [ "0" != "$((2&2 echo 'ERROR: {}' && exit 255)" < /dev/stdin 2> >( \ ${SED} "/xargs/d" >&2) else >&2 echo "Error: missing prerequisites!" exit 1 fi libxsmm-1.17/scripts/tool_test.sh000077500000000000000000000404761415223013700171700ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. 
# # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) ############################################################################### # shellcheck disable=SC1090,SC2129,SC2155,SC2164,SC2178,SC2206,SC2207 set -o pipefail HERE=$(cd "$(dirname "$0")" && pwd -P) BASENAME=$(command -v basename) MKDIR=$(command -v mkdir) CHMOD=$(command -v chmod) UNAME=$(command -v uname) DIFF=$(command -v diff) # flush asynchronous NFS mount SYNC=$(command -v sync) GREP=$(command -v grep) WGET=$(command -v wget) GIT=$(command -v git) SED=$(command -v gsed) CUT=$(command -v cut) LS=$(command -v ls) TR=$(command -v tr) RM=$(command -v rm) CP=$(command -v cp) MKTEMP=${HERE}/../.mktmp.sh RUN_CMD="--session-command" #RUN_CMD="-c" # GNU sed is desired (macOS) if [ "" = "${SED}" ]; then SED=$(command -v sed) fi if [ "${WGET}" ] && [ "${SED}" ] && [ "${PIPELINE}" ] && \ [ "${BUILDKITE_ORGANIZATION_SLUG}" ] && \ [ "${BUILDKITE_AGENT_ACCESS_TOKEN}" ]; then REVSTART=$(${WGET} -qO- \ "https://api.buildkite.com/v2/organizations/${BUILDKITE_ORGANIZATION_SLUG}/pipelines/${PIPELINE}/builds?access_token=${BUILDKITE_AGENT_ACCESS_TOKEN}" \ | ${SED} -n '/ *\"commit\": / {0,/ *\"commit\": / s/ *\"commit\": \"\(..*\)\".*/\1/p}') fi if [ "" = "${REVSTART}" ]; then REVSTART="HEAD^" fi if [ "${MKTEMP}" ] && [ "${MKDIR}" ] && [ "${CHMOD}" ] && \ [ "${DIFF}" ] && [ "${GREP}" ] && [ "${SED}" ] && \ [ "${LS}" ] && [ "${TR}" ] && \ [ "${RM}" ] && [ "${CP}" ]; then # check if full/unlimited tests are triggered if [ "${FULLCI}" ] && [ "0" != "${FULLCI}" ]; then LIMIT=0 fi if [ "0" != "${LIMIT}" ] && [ "${GIT}" ] && \ [ "$(${GIT} log ${REVSTART}...HEAD 2>/dev/null | ${GREP} -e "\[full ci\]")" ]; then LIMIT=0 fi # set the case number if [ "$1" ] && [ -e "$1" ]; then export TESTSETFILE=$1 if [ "${BASENAME}" ]; then export TESTID=$(${BASENAME} ${TESTSETFILE%.*}) else 
export TESTID=${TESTSETFILE} fi export TESTSET=${TESTID} else # case number given if [ "$1" ] && [ "0" != "$1" ]; then export TESTID=$1 else export TESTID=1 fi fi # should be source'd after the above variables are set source "${HERE}/../.env/buildkite.env" "" #source "${HERE}/../.env/travis.env" "" # support yml-files for Travis-CI that depend on TRAVIS_* variables if [ "" = "${TRAVIS_BUILD_DIR}" ]; then export TRAVIS_BUILD_DIR=${REPOROOT} fi if [ "" = "${TRAVIS_OS_NAME}" ] && [ "${UNAME}" ]; then export TRAVIS_OS_NAME=$(${UNAME}) fi if [ "" = "${HOSTNAME}" ]; then HOSTNAME=$(hostname -s 2>/dev/null) fi # setup PARTITIONS for multi-tests if [ "" = "${PARTITIONS}" ]; then if [ "${PARTITION}" ]; then PARTITIONS=${PARTITION} else PARTITIONS=none fi fi if [ "random" = "${PARTITION}" ]; then if [ "random" != "${PARTITIONS}" ]; then PARTITIONS=(${PARTITIONS}) NPARTITIONS=${#PARTITIONS[@]} PARTITIONS=${PARTITIONS[RANDOM%NPARTITIONS]} else PARTITIONS=none fi fi export PARTITIONS # setup CONFIGS (multiple configurations) if [ "" = "${CONFIGS}" ]; then if [ "${CONFIG}" ]; then CONFIGS=${CONFIG} else CONFIGS=none fi elif [ "${CONFIG}" ]; then # singular CONFIG replaces set of CONFIGS CONFIGS=${CONFIG} fi # setup ENVS (multiple environments) if [ "" = "${ENVS}" ]; then if [ "${ENV}" ]; then ENVS=${ENV} else ENVS=none fi fi # select test-set ("travis" by default) if [ "" = "${TESTSET}" ]; then TESTSET=travis fi if [ "" = "${TESTSETFILE}" ] || [ ! -e "${TESTSETFILE}" ]; then if [ -e ".${TESTSET}.yml" ]; then TESTSETFILE=.${TESTSET}.yml elif [ -e "${TESTSET}.yml" ]; then TESTSETFILE=${TESTSET}.yml elif [ -e "${TESTSET}" ]; then TESTSETFILE=${TESTSET} else echo "ERROR: Cannot find file with test set!" 
exit 1 fi else TEST=${TESTSETFILE} fi if [ "${LIMITRUN}" ] && [ "0" != "${LIMITRUN}" ] && \ [ "${LIMIT}" ] && [ "0" != "${LIMIT}" ]; then LIMITRUN=$((LIMIT minutes SRUN_FLAGS="${SRUN_FLAGS} --time=$((LIMITRUN/60))" fi #SRUN_FLAGS="${SRUN_FLAGS} --preserve-env" umask 007 TESTSCRIPT=$(${MKTEMP} "${HERE}/../.tool_XXXXXX.sh") ${CHMOD} +rx "${TESTSCRIPT}" LAUNCH="${SRUN} --ntasks=1 --partition=\${PARTITION} ${SRUN_FLAGS} \ --unbuffered ${TESTSCRIPT}" elif [[ ("${SLURMSCRIPT}" && "0" != "${SLURMSCRIPT}") || (-d "$1") ]]; then umask 007 TESTSCRIPT=$(${MKTEMP} "${HERE}/../.tool_XXXXXX.sh") ${CHMOD} +rx "${TESTSCRIPT}" LAUNCH="${LAUNCH_CMD} ${TESTSCRIPT}" else # avoid temporary script in case of non-batch execution if [ "" = "${MAKEJ}" ]; then export MAKEJ="-j $(eval "${HERE}/tool_cpuinfo.sh" -nc)" fi SHOW_PARTITION=0 LAUNCH="\${TEST}" fi if [ "${LAUNCH_USER}" ] && [ "0" != "${SLURM}" ]; then LAUNCH="su ${LAUNCH_USER} -p ${RUN_CMD} \'${LAUNCH}\'" fi # backup current environment (snapshot) ${RM} -f "${HERE}"/../.env_?????? 
ENVFILE=$(${MKTEMP} "${HERE}/../.env_XXXXXX") ${CHMOD} +r "${ENVFILE}" declare -px > "${ENVFILE}" RESULT=0 # control log echo && echo "^^^ +++" while [ "${TEST}" ] || TEST=$(eval " \ ${SED} -n -e '/^ *script: *$/,\$p' ${HERE}/../${TESTSETFILE} | ${SED} -e '/^ *script: *$/d' | \ ${SED} -n -E \"/^ *- */H;//,/^ *$/G;s/\n(\n[^\n]*){\${TESTID}}$//p\" | \ ${SED} -e 's/^ *- *//' -e 's/^ *//' | ${TR} '\n' ' ' | \ ${SED} -e 's/ *$//'") && [ "${TEST}" ]; do if [ -d "${TEST}" ]; then SLURMDIR=${TEST} else # dummy SLURMDIR=$0 fi for SLURMFILE in $(${LS} -1 "${SLURMDIR}"); do if [[ (-d ${SLURMDIR}) && ("" = "${SLURMSCRIPT}" || "0" = "${SLURMSCRIPT}") ]]; then SLURMFILE=${SLURMDIR}/${SLURMFILE} TESTID=$(${BASENAME} ${SLURMFILE%.*}) elif [ -e "${TEST}" ]; then SLURMFILE=${TEST} fi if [ "none" = "${PARTITIONS}" ] && [ "$0" != "${SLURMFILE}" ] && [ -e "${SLURMFILE}" ]; then PARTITION=$(${SED} -n "s/^#SBATCH[[:space:]][[:space:]]*\(--partition=\|-p\)\(..*\)/\2/p" "${SLURMFILE}") if [ "${PARTITION}" ]; then PARTITIONS=${PARTITION}; fi fi if [ "${LIMIT}" ] && [ "0" != "${LIMIT}" ] && \ [ "$(command -v stat)" ] && \ [ "$(command -v date)" ]; then NOW=$(date +%s) LIMITFILE=$(echo "${LABEL}" | ${SED} -e "s/[^A-Za-z0-9._-]//g") if [ "" = "${LIMITFILE}" ]; then LIMITFILE=$(echo "${TESTID}" | ${SED} -e "s/[^A-Za-z0-9._-]//g") fi if [ "${LIMITFILE}" ]; then if [ "${PIPELINE}" ]; then LIMITBASE="${PIPELINE}-"; fi if [ "${LIMITDIR}" ] && [ -d "${LIMITDIR}" ]; then LIMITFILE=${LIMITDIR}/${LIMITBASE}${LIMITFILE} else LIMITFILE=${REPOROOT}/${LIMITBASE}${LIMITFILE} fi fi if [ "${LIMITFILE}" ] && [ -e "${LIMITFILE}" ]; then OLD=$(stat -c %Y "${LIMITFILE}") else # ensure build is not skipped OLD=${NOW} LIMIT=0 fi fi if [ "" = "${NOW}" ]; then NOW=0; fi if [ "" = "${OLD}" ]; then OLD=0; fi if [ "0" != "$((NOW<(OLD+LIMIT)))" ]; then echo "================================================================================" echo "Skipped ${TESTID} due to LIMIT=${LIMIT} seconds." 
echo "================================================================================" continue else TOUCHFILE=${LIMITFILE} fi for PARTITION in ${PARTITIONS}; do for CONFIG in ${CONFIGS}; do # make execution environment locally available (always) CONFIGFILE="" if [ "${HOSTNAME}" ] && [ "none" != "${CONFIG}" ]; then CONFIGPAT=$(echo "${CONFIGEX}" | ${SED} "s/[[:space:]][[:space:]]*/\\\|/g" | ${SED} "s/\\\|$//") if [ "${CONFIGPAT}" ]; then CONFIGFILES=($(bash -c "${LS} -1 ${REPOROOT}/.env/${HOSTNAME}/${CONFIG}.env 2>/dev/null" | ${SED} "/\(${CONFIGPAT}\)/d")) else CONFIGFILES=($(bash -c "${LS} -1 ${REPOROOT}/.env/${HOSTNAME}/${CONFIG}.env 2>/dev/null")) fi CONFIGCOUNT=${#CONFIGFILES[@]} if [ "0" != "${CONFIGCOUNT}" ]; then CONFIGFILE=${CONFIGFILES[RANDOM%CONFIGCOUNT]} CONFIG=$(${BASENAME} "${CONFIGFILE}" .env) else echo "WARNING: configuration \"${CONFIG}\" not found!" CONFIGFILE="" fi fi for ENV in ${ENVS}; do if [ "none" != "${ENV}" ]; then if [ "${CUT}" ]; then ENVVAL=$(echo "${ENV}" | ${CUT} -d= -f2); fi ENVSTR=${ENV} fi # print some header if all tests are selected or in case of multi-tests if [[ "none" != "${CONFIG}" && ("" = "$1" || "none" != "${PARTITION}" || "none" != "${ENV}") ]]; then if [ "none" != "${PARTITION}" ] && [ "0" != "${SHOW_PARTITION}" ]; then if [ "${ENVVAL}" ]; then echo "+++ TEST ${TESTID} (${PARTITION}/${CONFIG}/${ENVVAL})" else echo "+++ TEST ${TESTID} (${PARTITION}/${CONFIG})" fi elif [ "${ENVVAL}" ]; then echo "+++ TEST ${TESTID} (${CONFIG}/${ENVVAL})" else echo "+++ TEST ${TESTID} (${CONFIG})" fi fi # prepare temporary script for remote environment/execution if [ "${TESTSCRIPT}" ] && [ -e "${TESTSCRIPT}" ]; then echo "#!/usr/bin/env bash" > "${TESTSCRIPT}" echo "set -eo pipefail" >> "${TESTSCRIPT}" echo "cd ${REPOROOT}" >> "${TESTSCRIPT}" echo "if [ \"\$(command -v sync)\" ]; then sync; fi" >> "${TESTSCRIPT}" if [ "0" != "${SHOW_PARTITION}" ]; then echo "echo \"-> \${USER}@\${HOSTNAME} (\${PWD})\"" >> "${TESTSCRIPT}"; fi echo "if [ 
\"\" = \"\${MAKEJ}\" ]; then MAKEJ=\"-j \$(eval ${HERE}/tool_cpuinfo.sh -nc)\"; fi" >> "${TESTSCRIPT}" # make execution environment available if [ "" = "${INTEL_LICENSE_FILE}" ]; then LICSDIR=$(command -v icc | ${SED} -e "s/\(\/.*intel\)\/.*$/\1/") ${MKDIR} -p "${REPOROOT}/licenses" ${CP} -u "${HOME}"/intel/licenses/* "${REPOROOT}/licenses" 2>/dev/null ${CP} -u "${LICSDIR}"/licenses/* "${REPOROOT}/licenses" 2>/dev/null ${CP} -u /opt/intel/licenses/* "${REPOROOT}/licenses" 2>/dev/null echo "export INTEL_LICENSE_FILE=${REPOROOT}/licenses" >> "${TESTSCRIPT}" fi # setup environment on a per-test basis echo "if [ -e \"${ENVFILE}\" ]; then" >> "${TESTSCRIPT}" if [ "${LAUNCH_CMD}" ]; then echo " eval ${HERE}/tool_envrestore.sh \"${ENVFILE}\" \"${HERE}/../.env.sh\"" >> "${TESTSCRIPT}" echo " source \"${HERE}/../.env.sh\"" >> "${TESTSCRIPT}" else echo " eval ${HERE}/tool_envrestore.sh \"${ENVFILE}\"" >> "${TESTSCRIPT}" fi echo "fi" >> "${TESTSCRIPT}" if [ -e "${CONFIGFILE}" ]; then echo " source \"${CONFIGFILE}\" \"\"" >> "${TESTSCRIPT}" fi # record the current test case if [ "$0" != "${SLURMFILE}" ] && [ -e "${SLURMFILE}" ]; then ABSDIR=$(dirname "${SLURMFILE}") if [ ! -e "${ABSDIR}/Makefile" ] && [ -d "${ABSDIR}" ] && [ -e "${ABSDIR}/../Makefile" ]; then ABSDIR=${ABSDIR}/.. fi ABSDIR=$(cd "${ABSDIR}"; pwd -P) echo "cd ${REPOROOT} && make -e \${MAKEJ} && cd ${ABSDIR} && make -e \${MAKEJ}" >> "${TESTSCRIPT}" echo "RESULT=\$?" 
>> "${TESTSCRIPT}" echo "if [ \"0\" != \"\${RESULT}\" ]; then exit \${RESULT}; fi" >> "${TESTSCRIPT}" # control log echo "echo \"--- RUN ${TESTID}\"" >> "${TESTSCRIPT}" DIRSED=$(echo "${ABSDIR}" | ${SED} "s/\//\\\\\//g") ${SED} \ -e "s/#\!..*/#\!\/bin\/bash\nset -eo pipefail/" -e "s/\(^\|[[:space:]]\)\(\.\|\.\.\)\//\1${DIRSED}\/\2\//" \ -e "s/^[./]*\([[:print:]][[:print:]]*\/\)*slurm[[:space:]][[:space:]]*//" \ -e "/^#SBATCH/d" -e "/^[[:space:]]*$/d" \ "${SLURMFILE}" > "${SLURMFILE}.run" && ${CHMOD} +rx "${SLURMFILE}.run" RUNFILE=$(readlink -f "${SLURMFILE}.run") if [ "${TOOL_COMMAND}" ]; then if [ "0" = "${TOOL_INJECT}" ] || [ "" = "$(${SED} -n "/^taskset/p" "${RUNFILE}")" ]; then echo -n "${TOOL_COMMAND} ${RUNFILE} ${TOOL_COMMAND_POST}" >> "${TESTSCRIPT}" else # inject TOOL_COMMAND TOOL_COMMAND_SED1="$(echo "${TOOL_COMMAND}" | ${SED} "s/\//\\\\\//g") " if [ "${TOOL_COMMAND_POST}" ]; then TOOL_COMMAND_SED2=" $(echo "${TOOL_COMMAND_POST}" | ${SED} "s/\//\\\\\//g")" fi ${SED} -i "s/\(^taskset[[:space:]]..*\)/${TOOL_COMMAND_SED1}\1${TOOL_COMMAND_SED2}/" "${RUNFILE}" echo -n "${RUNFILE}" >> "${TESTSCRIPT}" fi else echo -n "${RUNFILE}" >> "${TESTSCRIPT}" fi if [ "${LIMITLOG}" ] && [ "0" != "${LIMITLOG}" ] && \ [ "$(command -v cat)" ] && [ "$(command -v tail)" ]; then echo " | cat -s | tail -n ${LIMITLOG}" >> "${TESTSCRIPT}" elif [ "0" = "${LIMITLOG}" ]; then echo " >/dev/null" >> "${TESTSCRIPT}" else echo >> "${TESTSCRIPT}" fi echo "${RM} -f ${RUNFILE}" >> "${TESTSCRIPT}" else echo "${TEST}" >> "${TESTSCRIPT}" fi echo >> "${TESTSCRIPT}" if [ "${SYNC}" ]; then ${SYNC}; fi else # setup environment on a per-test basis if [ "${CONFIGFILE}" ]; then if [ -e "${ENVFILE}" ]; then eval "${HERE}/tool_envrestore.sh" "${ENVFILE}" fi source "${CONFIGFILE}" "" fi fi COMMAND=$(eval echo "${ENVSTR} ${LAUNCH}") # run the prepared test case/script if [ "${LABEL}" ] && [ "$(command -v tee)" ]; then if [ -t 0 ]; then eval "${COMMAND} 2>&1 | tee .test-${LABEL}.log" else eval "${COMMAND} 
2>&1 | ${GREP} -v '^srun: error:' | tee .test-${LABEL}.log" fi else eval "${COMMAND}" fi # capture test status RESULT=$? # exit the loop in case of an error if [ "0" != "${RESULT}" ] && [ "1" != "${LIMITHARD}" ]; then if [ "${TOUCHFILE}" ]; then ${RM} -f "${TOUCHFILE}" TOUCHFILE="" fi break 4 fi done # ENVS done # CONFIGS done # PARTITIONS if [ "${TOUCHFILE}" ]; then echo "${JOBID}" > "${TOUCHFILE}" TOUCHFILE="" fi done # SLURMFILE # increment the case number, or exit the script if [ "0" = "$1" ] && [ "0" = "${RESULT}" ]; then TESTID=$((TESTID+1)) else # finish break fi # clear captured test TEST="" done # TEST # remove temporary files if [ "${TESTSCRIPT}" ] && [ -e "${TESTSCRIPT}" ]; then ${RM} "${TESTSCRIPT}" fi if [ "${ENVFILE}" ] && [ -e "${ENVFILE}" ]; then ${RM} "${ENVFILE}" fi # control log if [ "0" = "${RESULT}" ]; then echo "+++ ------------------------------------------------------------------------------" echo "SUCCESS" else echo "^^^ +++" echo "+++ ------------------------------------------------------------------------------" echo "FAILURE" echo fi # override result code (alternative outcome) if [ "${RESULTCODE}" ]; then RESULT=${RESULTCODE} fi exit "${RESULT}" fi libxsmm-1.17/src/000077500000000000000000000000001415223013700137025ustar00rootroot00000000000000libxsmm-1.17/src/generator_common.c000066400000000000000000001621601415223013700174120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "generator_common.h" #include "libxsmm_main.h" #if !defined(GENERATOR_COMMON_MAX_ERROR_LENGTH) # define GENERATOR_COMMON_MAX_ERROR_LENGTH 511 #endif LIBXSMM_API_INLINE void libxsmm_strncpy( char* o_dest, const char* i_src, unsigned int i_dest_length, unsigned int i_src_length ) { if ( i_dest_length < i_src_length ) { fprintf( stderr, "LIBXSMM fatal error: libxsmm_strncpy destination buffer is too small!\n" ); exit(-1); } /* @TODO check for aliasing? */ strcpy( o_dest, i_src ); } LIBXSMM_API_INTERN void libxsmm_append_code_as_string( libxsmm_generated_code* io_generated_code, const char* i_code_to_append, const int i_append_length ) { size_t l_length_1 = 0; size_t l_length_2 = 0; char* l_new_string = NULL; char* current_code = (char*)io_generated_code->generated_code; /* check if end up here accidentally */ if ( io_generated_code->code_type > 1 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_APPEND_STR ); return; } /* some safety checks */ if (current_code != NULL) { l_length_1 = io_generated_code->code_size; } else { /* nothing to do */ l_length_1 = 0; } if (i_code_to_append != NULL) { l_length_2 = i_append_length; } else { fprintf(stderr, "LIBXSMM WARNING libxsmm_append_code_as_string was called with an empty string for appending code" ); } /* allocate new string */ l_new_string = (char*) malloc( (l_length_1+l_length_2+1)*sizeof(char) ); if (l_new_string == NULL) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ALLOC ); return; } /* copy old content */ if (l_length_1 > 0) { /* @TODO using memcpy instead? */ libxsmm_strncpy( l_new_string, current_code, (unsigned int)(l_length_1+l_length_2), (unsigned int)l_length_1 ); } else { l_new_string[0] = '\0'; } /* append new string */ /* @TODO using memcpy instead? 
*/ if (i_code_to_append != NULL) { strcat(l_new_string, i_code_to_append); } /* free old memory and overwrite pointer */ if (l_length_1 > 0) free(current_code); io_generated_code->generated_code = (void*)l_new_string; /* update counters */ io_generated_code->code_size = (unsigned int)(l_length_1+l_length_2); io_generated_code->buffer_size = (io_generated_code->code_size) + 1; } LIBXSMM_API_INTERN void libxsmm_close_function( libxsmm_generated_code* io_generated_code ) { if ( io_generated_code->code_type == 0 ) { char l_new_code[512]; const int l_max_code_length = 511; const int l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "}\n\n" ); libxsmm_append_code_as_string(io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN unsigned int libxsmm_check_x86_gp_reg_callee_save( const unsigned int i_gp_reg_number ) { if ( (i_gp_reg_number == LIBXSMM_X86_GP_REG_RBX) || (i_gp_reg_number == LIBXSMM_X86_GP_REG_RBP) || (i_gp_reg_number == LIBXSMM_X86_GP_REG_R12) || (i_gp_reg_number == LIBXSMM_X86_GP_REG_R13) || (i_gp_reg_number == LIBXSMM_X86_GP_REG_R14) || (i_gp_reg_number == LIBXSMM_X86_GP_REG_R15) ) { return 1; } else { return 0; } } LIBXSMM_API_INTERN void libxsmm_get_x86_gp_reg_name( const unsigned int i_gp_reg_number, char* o_gp_reg_name, const int i_gp_reg_name_max_length ) { switch (i_gp_reg_number) { case LIBXSMM_X86_GP_REG_RAX: libxsmm_strncpy(o_gp_reg_name, "rax", i_gp_reg_name_max_length, 3 ); break; case LIBXSMM_X86_GP_REG_RCX: libxsmm_strncpy(o_gp_reg_name, "rcx", i_gp_reg_name_max_length, 3 ); break; case LIBXSMM_X86_GP_REG_RDX: libxsmm_strncpy(o_gp_reg_name, "rdx", i_gp_reg_name_max_length, 3 ); break; case LIBXSMM_X86_GP_REG_RBX: libxsmm_strncpy(o_gp_reg_name, "rbx", i_gp_reg_name_max_length, 3 ); break; case LIBXSMM_X86_GP_REG_RSP: libxsmm_strncpy(o_gp_reg_name, "rsp", i_gp_reg_name_max_length, 3 ); break; case LIBXSMM_X86_GP_REG_RBP: libxsmm_strncpy(o_gp_reg_name, "rbp", i_gp_reg_name_max_length, 3 ); break; case 
LIBXSMM_X86_GP_REG_RSI: libxsmm_strncpy(o_gp_reg_name, "rsi", i_gp_reg_name_max_length, 3 ); break; case LIBXSMM_X86_GP_REG_RDI: libxsmm_strncpy(o_gp_reg_name, "rdi", i_gp_reg_name_max_length, 3 ); break; case LIBXSMM_X86_GP_REG_R8: libxsmm_strncpy(o_gp_reg_name, "r8", i_gp_reg_name_max_length, 2 ); break; case LIBXSMM_X86_GP_REG_R9: libxsmm_strncpy(o_gp_reg_name, "r9", i_gp_reg_name_max_length, 2 ); break; case LIBXSMM_X86_GP_REG_R10: libxsmm_strncpy(o_gp_reg_name, "r10", i_gp_reg_name_max_length, 3 ); break; case LIBXSMM_X86_GP_REG_R11: libxsmm_strncpy(o_gp_reg_name, "r11", i_gp_reg_name_max_length, 3 ); break; case LIBXSMM_X86_GP_REG_R12: libxsmm_strncpy(o_gp_reg_name, "r12", i_gp_reg_name_max_length, 3 ); break; case LIBXSMM_X86_GP_REG_R13: libxsmm_strncpy(o_gp_reg_name, "r13", i_gp_reg_name_max_length, 3 ); break; case LIBXSMM_X86_GP_REG_R14: libxsmm_strncpy(o_gp_reg_name, "r14", i_gp_reg_name_max_length, 3 ); break; case LIBXSMM_X86_GP_REG_R15: libxsmm_strncpy(o_gp_reg_name, "r15", i_gp_reg_name_max_length, 3 ); break; default: fprintf(stderr, "libxsmm_get_x86_64_gp_req_name i_gp_reg_number is out of range!\n"); exit(-1); } } LIBXSMM_API_INTERN void libxsmm_get_x86_instr_name( const unsigned int i_instr_number, char* o_instr_name, const int i_instr_name_max_length ) { switch (i_instr_number) { /* AVX vector moves */ case LIBXSMM_X86_INSTR_VMOVAPD: libxsmm_strncpy(o_instr_name, "vmovapd", i_instr_name_max_length, 7 ); break; case LIBXSMM_X86_INSTR_VMOVUPD: libxsmm_strncpy(o_instr_name, "vmovupd", i_instr_name_max_length, 7 ); break; case LIBXSMM_X86_INSTR_VMOVAPS: libxsmm_strncpy(o_instr_name, "vmovaps", i_instr_name_max_length, 7 ); break; case LIBXSMM_X86_INSTR_VMOVUPS: libxsmm_strncpy(o_instr_name, "vmovups", i_instr_name_max_length, 7 ); break; case LIBXSMM_X86_INSTR_VBROADCASTSD: libxsmm_strncpy(o_instr_name, "vbroadcastsd", i_instr_name_max_length, 12 ); break; case LIBXSMM_X86_INSTR_VBROADCASTSS: libxsmm_strncpy(o_instr_name, "vbroadcastss", 
i_instr_name_max_length, 12 ); break; case LIBXSMM_X86_INSTR_VMOVDDUP: libxsmm_strncpy(o_instr_name, "vmovddup", i_instr_name_max_length, 8 ); break; case LIBXSMM_X86_INSTR_VMOVSD: libxsmm_strncpy(o_instr_name, "vmovsd", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VMOVSS: libxsmm_strncpy(o_instr_name, "vmovss", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VPBROADCASTB: libxsmm_strncpy(o_instr_name, "vpbroadcastb", i_instr_name_max_length, 12 ); break; case LIBXSMM_X86_INSTR_VPBROADCASTW: libxsmm_strncpy(o_instr_name, "vpbroadcastw", i_instr_name_max_length, 12 ); break; case LIBXSMM_X86_INSTR_VPBROADCASTD: libxsmm_strncpy(o_instr_name, "vpbroadcastd", i_instr_name_max_length, 12 ); break; case LIBXSMM_X86_INSTR_VPBROADCASTQ: libxsmm_strncpy(o_instr_name, "vpbroadcastq", i_instr_name_max_length, 12 ); break; /* SSE vector moves */ case LIBXSMM_X86_INSTR_MOVAPD: libxsmm_strncpy(o_instr_name, "movapd", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_MOVUPD: libxsmm_strncpy(o_instr_name, "movupd", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_MOVAPS: libxsmm_strncpy(o_instr_name, "movaps", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_MOVUPS: libxsmm_strncpy(o_instr_name, "movups", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_MOVDDUP: libxsmm_strncpy(o_instr_name, "movddup", i_instr_name_max_length, 7 ); break; case LIBXSMM_X86_INSTR_MOVSD: libxsmm_strncpy(o_instr_name, "movsd", i_instr_name_max_length, 5 ); break; case LIBXSMM_X86_INSTR_MOVSS: libxsmm_strncpy(o_instr_name, "movss", i_instr_name_max_length, 5 ); break; case LIBXSMM_X86_INSTR_SHUFPS: libxsmm_strncpy(o_instr_name, "shufps", i_instr_name_max_length, 6 ); break; /* Gather/scatter single precision */ case LIBXSMM_X86_INSTR_VGATHERDPS: libxsmm_strncpy(o_instr_name, "vgatherdps", i_instr_name_max_length, 10 ); break; case LIBXSMM_X86_INSTR_VGATHERQPS: libxsmm_strncpy(o_instr_name, "vgatherqps", i_instr_name_max_length, 10 ); 
break; case LIBXSMM_X86_INSTR_VSCATTERDPS: libxsmm_strncpy(o_instr_name, "vscatterdps", i_instr_name_max_length, 11 ); break; case LIBXSMM_X86_INSTR_VSCATTERQPS: libxsmm_strncpy(o_instr_name, "vscatterqps", i_instr_name_max_length, 11 ); break; /* Gather/scatter double precision */ case LIBXSMM_X86_INSTR_VGATHERDPD: libxsmm_strncpy(o_instr_name, "vgatherdpd", i_instr_name_max_length, 10 ); break; case LIBXSMM_X86_INSTR_VGATHERQPD: libxsmm_strncpy(o_instr_name, "vgatherqpd", i_instr_name_max_length, 10 ); break; case LIBXSMM_X86_INSTR_VSCATTERDPD: libxsmm_strncpy(o_instr_name, "vscatterdpd", i_instr_name_max_length, 11 ); break; case LIBXSMM_X86_INSTR_VSCATTERQPD: libxsmm_strncpy(o_instr_name, "vscatterqpd", i_instr_name_max_length, 11 ); break; /* AVX double precision */ case LIBXSMM_X86_INSTR_VXORPD: libxsmm_strncpy(o_instr_name, "vxorpd", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VMULPD: libxsmm_strncpy(o_instr_name, "vmulpd", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VADDPD: libxsmm_strncpy(o_instr_name, "vaddpd", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VFMADD231PD: libxsmm_strncpy(o_instr_name, "vfmadd231pd", i_instr_name_max_length, 11 ); break; case LIBXSMM_X86_INSTR_VMULSD: libxsmm_strncpy(o_instr_name, "vmulsd", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VADDSD: libxsmm_strncpy(o_instr_name, "vaddsd", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VFMADD231SD: libxsmm_strncpy(o_instr_name, "vfmadd231sd", i_instr_name_max_length, 11 ); break; /* AVX single precision */ case LIBXSMM_X86_INSTR_VXORPS: libxsmm_strncpy(o_instr_name, "vxorps", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VMULPS: libxsmm_strncpy(o_instr_name, "vmulps", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VADDPS: libxsmm_strncpy(o_instr_name, "vaddps", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VFMADD231PS: libxsmm_strncpy(o_instr_name, "vfmadd231ps", 
i_instr_name_max_length, 11 ); break; case LIBXSMM_X86_INSTR_VMULSS: libxsmm_strncpy(o_instr_name, "vmulss", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VADDSS: libxsmm_strncpy(o_instr_name, "vaddss", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VFMADD231SS: libxsmm_strncpy(o_instr_name, "vfmadd231ss", i_instr_name_max_length, 11 ); break; /* SSE double precision */ case LIBXSMM_X86_INSTR_XORPD: libxsmm_strncpy(o_instr_name, "xorpd", i_instr_name_max_length, 5 ); break; case LIBXSMM_X86_INSTR_MULPD: libxsmm_strncpy(o_instr_name, "mulpd", i_instr_name_max_length, 5 ); break; case LIBXSMM_X86_INSTR_ADDPD: libxsmm_strncpy(o_instr_name, "addpd", i_instr_name_max_length, 5 ); break; case LIBXSMM_X86_INSTR_MULSD: libxsmm_strncpy(o_instr_name, "mulsd", i_instr_name_max_length, 5 ); break; case LIBXSMM_X86_INSTR_ADDSD: libxsmm_strncpy(o_instr_name, "addsd", i_instr_name_max_length, 5 ); break; /* SSE single precision */ case LIBXSMM_X86_INSTR_XORPS: libxsmm_strncpy(o_instr_name, "xorps", i_instr_name_max_length, 5 ); break; case LIBXSMM_X86_INSTR_MULPS: libxsmm_strncpy(o_instr_name, "mulps", i_instr_name_max_length, 5 ); break; case LIBXSMM_X86_INSTR_ADDPS: libxsmm_strncpy(o_instr_name, "addps", i_instr_name_max_length, 5 ); break; case LIBXSMM_X86_INSTR_MULSS: libxsmm_strncpy(o_instr_name, "mulss", i_instr_name_max_length, 5 ); break; case LIBXSMM_X86_INSTR_ADDSS: libxsmm_strncpy(o_instr_name, "addss", i_instr_name_max_length, 5 ); break; /* XOR AVX512F */ case LIBXSMM_X86_INSTR_VPXORD: libxsmm_strncpy(o_instr_name, "vpxord", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VPADDB: libxsmm_strncpy(o_instr_name, "vpaddb", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VPADDW: libxsmm_strncpy(o_instr_name, "vpaddw", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VPADDD: libxsmm_strncpy(o_instr_name, "vpaddd", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VPADDQ: libxsmm_strncpy(o_instr_name, 
"vpaddq", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VPMADDWD: libxsmm_strncpy(o_instr_name, "vpmaddwd", i_instr_name_max_length, 8 ); break; case LIBXSMM_X86_INSTR_VPMADDUBSW: libxsmm_strncpy(o_instr_name, "vpmaddubsw", i_instr_name_max_length, 10 ); break; case LIBXSMM_X86_INSTR_VPSRAVD: libxsmm_strncpy(o_instr_name, "vpsravd", i_instr_name_max_length, 7 ); break; case LIBXSMM_X86_INSTR_VPSRAD: libxsmm_strncpy(o_instr_name, "vpsrad", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VPSLLD: libxsmm_strncpy(o_instr_name, "vpslld", i_instr_name_max_length, 6 ); break; case LIBXSMM_X86_INSTR_VPCMPD: libxsmm_strncpy(o_instr_name, "vpcmpd", i_instr_name_max_length, 6 ); break; /* AVX512, QFMA */ case LIBXSMM_X86_INSTR_V4FMADDPS: libxsmm_strncpy(o_instr_name, "v4fmaddps", i_instr_name_max_length, 9 ); break; case LIBXSMM_X86_INSTR_V4FNMADDPS: libxsmm_strncpy(o_instr_name, "v4fnmaddps", i_instr_name_max_length, 10 ); break; case LIBXSMM_X86_INSTR_V4FMADDSS: libxsmm_strncpy(o_instr_name, "v4fmaddss", i_instr_name_max_length, 9 ); break; case LIBXSMM_X86_INSTR_V4FNMADDSS: libxsmm_strncpy(o_instr_name, "v4fnmaddss", i_instr_name_max_length, 10 ); break; case LIBXSMM_X86_INSTR_VP4DPWSSD: libxsmm_strncpy(o_instr_name, "vp4dpwssd", i_instr_name_max_length, 9 ); break; case LIBXSMM_X86_INSTR_VP4DPWSSDS: libxsmm_strncpy(o_instr_name, "vp4dpwssds", i_instr_name_max_length, 10 ); break; /* AVX512, VNNI */ case LIBXSMM_X86_INSTR_VPDPWSSD: libxsmm_strncpy(o_instr_name, "vpdpwssd", i_instr_name_max_length, 8 ); break; case LIBXSMM_X86_INSTR_VPDPWSSDS: libxsmm_strncpy(o_instr_name, "vpdpwssds", i_instr_name_max_length, 9 ); break; case LIBXSMM_X86_INSTR_VPDPBUSD: libxsmm_strncpy(o_instr_name, "vpdpbusd", i_instr_name_max_length, 8 ); break; case LIBXSMM_X86_INSTR_VPDPBUSDS: libxsmm_strncpy(o_instr_name, "vpdpbusds", i_instr_name_max_length, 9 ); break; /* AVX512, BF16 */ case LIBXSMM_X86_INSTR_VDPBF16PS: libxsmm_strncpy(o_instr_name, "vdpbf16ps", 
i_instr_name_max_length, 9 ); break; case LIBXSMM_X86_INSTR_VCVTNEPS2BF16: libxsmm_strncpy(o_instr_name, "vcvtneps2bf16", i_instr_name_max_length, 13 ); break; case LIBXSMM_X86_INSTR_VCVTNE2PS2BF16: libxsmm_strncpy(o_instr_name, "vcvtne2ps2bf16", i_instr_name_max_length, 14 ); break; /* GP instructions */ case LIBXSMM_X86_INSTR_ADDQ: libxsmm_strncpy(o_instr_name, "addq", i_instr_name_max_length, 4 ); break; case LIBXSMM_X86_INSTR_SUBQ: libxsmm_strncpy(o_instr_name, "subq", i_instr_name_max_length, 4 ); break; case LIBXSMM_X86_INSTR_MOVQ: libxsmm_strncpy(o_instr_name, "movq", i_instr_name_max_length, 4 ); break; case LIBXSMM_X86_INSTR_CMPQ: libxsmm_strncpy(o_instr_name, "cmpq", i_instr_name_max_length, 4 ); break; case LIBXSMM_X86_INSTR_JL: libxsmm_strncpy(o_instr_name, "jl", i_instr_name_max_length, 2 ); break; case LIBXSMM_X86_INSTR_JE: libxsmm_strncpy(o_instr_name, "je", i_instr_name_max_length, 2 ); break; case LIBXSMM_X86_INSTR_JZ: libxsmm_strncpy(o_instr_name, "jz", i_instr_name_max_length, 2 ); break; case LIBXSMM_X86_INSTR_JG: libxsmm_strncpy(o_instr_name, "jg", i_instr_name_max_length, 2 ); break; case LIBXSMM_X86_INSTR_JNE: libxsmm_strncpy(o_instr_name, "jne", i_instr_name_max_length, 3 ); break; case LIBXSMM_X86_INSTR_JNZ: libxsmm_strncpy(o_instr_name, "jnz", i_instr_name_max_length, 3 ); break; case LIBXSMM_X86_INSTR_JGE: libxsmm_strncpy(o_instr_name, "jge", i_instr_name_max_length, 3 ); break; case LIBXSMM_X86_INSTR_JLE: libxsmm_strncpy(o_instr_name, "jle", i_instr_name_max_length, 3 ); break; case LIBXSMM_X86_INSTR_PREFETCHT0: libxsmm_strncpy(o_instr_name, "prefetcht0", i_instr_name_max_length, 10 ); break; case LIBXSMM_X86_INSTR_PREFETCHT1: libxsmm_strncpy(o_instr_name, "prefetcht1", i_instr_name_max_length, 10 ); break; case LIBXSMM_X86_INSTR_PREFETCHT2: libxsmm_strncpy(o_instr_name, "prefetcht2", i_instr_name_max_length, 10 ); break; case LIBXSMM_X86_INSTR_PREFETCHNTA: libxsmm_strncpy(o_instr_name, "prefetchnta", i_instr_name_max_length, 11 ); 
/* tail of libxsmm_get_x86_64_instr_name(): opmask moves and non-temporal stores */
break;
    case LIBXSMM_X86_INSTR_KMOV: libxsmm_strncpy(o_instr_name, "kmov", i_instr_name_max_length, 4 ); break;
    case LIBXSMM_X86_INSTR_KMOVW: libxsmm_strncpy(o_instr_name, "kmovw", i_instr_name_max_length, 5 ); break;
    case LIBXSMM_X86_INSTR_KMOVB: libxsmm_strncpy(o_instr_name, "kmovb", i_instr_name_max_length, 5 ); break;
    case LIBXSMM_X86_INSTR_KMOVD: libxsmm_strncpy(o_instr_name, "kmovd", i_instr_name_max_length, 5 ); break;
    case LIBXSMM_X86_INSTR_KMOVQ: libxsmm_strncpy(o_instr_name, "kmovq", i_instr_name_max_length, 5 ); break;
    case LIBXSMM_X86_INSTR_KXNORW: libxsmm_strncpy(o_instr_name, "kxnorw", i_instr_name_max_length, 6 ); break;
    case LIBXSMM_X86_INSTR_VMOVNTPD: libxsmm_strncpy(o_instr_name, "vmovntpd", i_instr_name_max_length, 8 ); break;
    case LIBXSMM_X86_INSTR_VMOVNTPS: libxsmm_strncpy(o_instr_name, "vmovntps", i_instr_name_max_length, 8 ); break;
    case LIBXSMM_X86_INSTR_VMOVNTDQ: libxsmm_strncpy(o_instr_name, "vmovntdq", i_instr_name_max_length, 8 ); break;
    /* default, we didn't had a match */
    default:
      /* unknown opcode id: report and abort, callers cannot recover from a bad mnemonic */
      fprintf(stderr, "libxsmm_get_x86_64_instr_name i_instr_number (%u) is out of range!\n", i_instr_number);
      exit(-1);
  }
}

/**
 * Classifies an x86 FP vector instruction by element precision.
 * @param i_instr_number  LIBXSMM_X86_INSTR_* opcode id (must be an FP vector instruction).
 * @return 1 for single precision, 0 for double precision.
 * Aborts the process (exit(-1)) when the opcode is not a known FP vector instruction.
 */
LIBXSMM_API_INTERN unsigned int libxsmm_is_x86_vec_instr_single_precision( const unsigned int i_instr_number ) {
  unsigned int l_return = 0;
  switch (i_instr_number) {
    /* AVX/AVX512 vector moves and broadcasts */
    case LIBXSMM_X86_INSTR_VMOVAPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VMOVUPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VMOVAPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VMOVUPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VBROADCASTSD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VBROADCASTSS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VMOVDDUP: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VMOVSD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VMOVSS: l_return = 1; break;
    /* SSE vector moves */
    case LIBXSMM_X86_INSTR_MOVAPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_MOVUPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_MOVAPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_MOVUPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_MOVDDUP: l_return = 0; break;
    case LIBXSMM_X86_INSTR_MOVSD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_MOVSS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_SHUFPS: l_return = 1; break;
    /* Gather/Scatter single precision */
    case LIBXSMM_X86_INSTR_VGATHERDPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VGATHERQPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VSCATTERDPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VSCATTERQPS: l_return = 1; break;
    /* Gather/Scatter double precision */
    case LIBXSMM_X86_INSTR_VGATHERDPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VGATHERQPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VSCATTERDPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VSCATTERQPD: l_return = 0; break;
    /* AVX double precision */
    case LIBXSMM_X86_INSTR_VXORPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VMULPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VADDPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VFMADD231PD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VMULSD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VADDSD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_VFMADD231SD: l_return = 0; break;
    /* AVX single precision */
    case LIBXSMM_X86_INSTR_VXORPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VMULPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VADDPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VFMADD231PS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VMULSS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VADDSS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VFMADD231SS: l_return = 1; break;
    /* SSE double precision */
    case LIBXSMM_X86_INSTR_XORPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_MULPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_ADDPD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_MULSD: l_return = 0; break;
    case LIBXSMM_X86_INSTR_ADDSD: l_return = 0; break;
    /* SSE single precision */
    case LIBXSMM_X86_INSTR_XORPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_MULPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_ADDPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_MULSS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_ADDSS: l_return = 1; break;
    /* AVX512, QFMA */
    case LIBXSMM_X86_INSTR_V4FMADDPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_V4FNMADDPS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_V4FMADDSS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_V4FNMADDSS: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VP4DPWSSD: l_return = 1; break;
    case LIBXSMM_X86_INSTR_VP4DPWSSDS: l_return = 1; break;
    /* default, we didn't had a match */
    default:
      fprintf(stderr, "libxsmm_is_x86_vec_instr_single_precision i_instr_number (%u) is not a x86 FP vector instruction!\n", i_instr_number);
      exit(-1);
  }
  return l_return;
}

/**
 * Resets every field of a GP register mapping to LIBXSMM_X86_GP_REG_UNDEF
 * so kernel generators start from a clean, fully-undefined mapping.
 * @param io_gp_reg_mapping  mapping structure to reset (must be non-NULL).
 */
LIBXSMM_API_INTERN void libxsmm_reset_x86_gp_reg_mapping( libxsmm_gp_reg_mapping* io_gp_reg_mapping ) {
  io_gp_reg_mapping->gp_reg_a = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_b = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_c = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_UNDEF;
  /* io_gp_reg_mapping->gp_reg_c_prefetch = LIBXSMM_X86_GP_REG_UNDEF;*/
  io_gp_reg_mapping->gp_reg_mloop = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_nloop = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_kloop = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_reduce_count = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_reduce_loop = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_scf = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_help_0 = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_help_1 = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF;
  io_gp_reg_mapping->gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF;
}

/* Zeroes a loop-label tracker (signature continues on the next chunk). */
LIBXSMM_API_INTERN void libxsmm_reset_loop_label_tracker(
libxsmm_loop_label_tracker* io_loop_label_tracker ) {
  /* zero-initialize: no loop labels are in use after a reset */
  memset(io_loop_label_tracker, 0, sizeof(*io_loop_label_tracker));
}

/**
 * Resets a jump-label tracker to its empty state (no labels defined or referenced).
 * @param io_jump_label_tracker  tracker to reset (must be non-NULL).
 */
LIBXSMM_API_INTERN void libxsmm_reset_jump_label_tracker( libxsmm_jump_label_tracker* io_jump_label_tracker ) {
  memset(io_jump_label_tracker, 0, sizeof(*io_jump_label_tracker));
}

/**
 * Emits the function signature for a generated GEMM kernel.
 * code_type > 1: nothing is emitted (binary JIT needs no textual signature).
 * code_type == 1: emits an assembly label/.global/.type prologue.
 * code_type == 0: emits a C function header whose pointer types follow the
 *                 descriptor's input precision (F32/F64/I16); prefetch variants
 *                 carry three extra *_prefetch pointer parameters.
 * @param io_generated_code  code buffer the signature string is appended to.
 * @param i_routine_name     name used for the emitted function/label.
 * @param i_xgemm_desc       GEMM descriptor providing datatype and prefetch flags.
 */
LIBXSMM_API_INTERN void libxsmm_mmfunction_signature( libxsmm_generated_code* io_generated_code, const char* i_routine_name, const libxsmm_gemm_descriptor* i_xgemm_desc ) {
  char l_new_code[512];
  int l_max_code_length = 511;
  int l_code_length = 0;
  LIBXSMM_ASSERT_MSG(NULL != i_xgemm_desc, "Invalid descriptor");
  if ( io_generated_code->code_type > 1 ) {
    return;
  } else if ( io_generated_code->code_type == 1 ) {
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, ".global %s\n.type %s, @function\n%s:\n", i_routine_name, i_routine_name, i_routine_name);
  } else {
    /* selecting the correct signature */
    if (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
      if (LIBXSMM_GEMM_PREFETCH_NONE == i_xgemm_desc->prefetch) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const float* A, const float* B, float* C) {\n", i_routine_name);
      } else {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const float* A, const float* B, float* C, const float* A_prefetch, const float* B_prefetch, const float* C_prefetch) {\n", i_routine_name);
      }
    } else if (LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
      if (LIBXSMM_GEMM_PREFETCH_NONE == i_xgemm_desc->prefetch) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const double* A, const double* B, double* C) {\n", i_routine_name);
      } else {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const double* A, const double* B, double* C, const double* A_prefetch, const double* B_prefetch, const double* C_prefetch) {\n", i_routine_name);
      }
    } else if (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
      if (LIBXSMM_GEMM_PREFETCH_NONE == i_xgemm_desc->prefetch) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const short* A, const short* B, int* C) {\n", i_routine_name);
      } else {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const short* A, const short* B, int* C, const short* A_prefetch, const short* B_prefetch, const int* C_prefetch) {\n", i_routine_name);
      }
    } else {
      /* unsupported precision: emit nothing (l_code_length stays 0) */
    }
  }
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
}

/**
 * For C source output (code_type == 0) emits an #ifdef guard matching the
 * target ISA, plus a #pragma warning when compiled on a newer architecture.
 * Continues on the next chunk; no-op for non-source code types.
 */
LIBXSMM_API_INTERN void libxsmm_generator_isa_check_header( libxsmm_generated_code* io_generated_code ) {
  if ( io_generated_code->code_type == 0 ) {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    if ( io_generated_code->arch <= LIBXSMM_X86_SSE4 ) {
      /* SSE3 kernel: guard with __SSE3__ and warn when an AVX compiler is used */
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#ifdef __SSE3__\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#ifdef __AVX__\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#pragma message (\"LIBXSMM KERNEL COMPILATION WARNING: compiling SSE3 code on AVX or newer architecture: \" __FILE__)\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#endif\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( io_generated_code->arch == LIBXSMM_X86_AVX ) {
      /* AVX kernel: guard with __AVX__ and warn when an AVX2 compiler is used */
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#ifdef __AVX__\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#ifdef __AVX2__\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      /* (continuation of the AVX branch of libxsmm_generator_isa_check_header) */
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#pragma message (\"LIBXSMM KERNEL COMPILATION WARNING: compiling AVX code on AVX2 or newer architecture: \" __FILE__)\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#endif\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( io_generated_code->arch == LIBXSMM_X86_AVX2 ) {
      /* AVX2 kernel: guard with __AVX2__ and warn when an AVX512 compiler is used */
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#ifdef __AVX2__\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#ifdef __AVX512F__\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#pragma message (\"LIBXSMM KERNEL COMPILATION WARNING: compiling AVX2 code on AVX512 or newer architecture: \" __FILE__)\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#endif\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) {
      /* all AVX512 targets share the __AVX512F__ guard */
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#ifdef __AVX512F__\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( io_generated_code->arch < LIBXSMM_X86_SSE3 ) {
      /* arch-independent kernel: no guard, only a build-time notice */
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#pragma message (\"LIBXSMM KERNEL COMPILATION WARNING: compiling arch-independent gemm kernel in: \" __FILE__)\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH );
      return;
    }
  }
}

/**
 * Closes the #ifdef guard opened by libxsmm_generator_isa_check_header for
 * C source output: emits an #else branch that fails the build with a clear
 * message when the guard's ISA macro was not defined. No-op for arch-independent
 * kernels and for non-source code types.
 * @param io_generated_code  code buffer the footer is appended to.
 */
LIBXSMM_API_INTERN void libxsmm_generator_isa_check_footer( libxsmm_generated_code* io_generated_code ) {
  if ( io_generated_code->code_type == 0 ) {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    if ( ( io_generated_code->arch >= LIBXSMM_X86_SSE3 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) {
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#else\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#pragma message (\"LIBXSMM KERNEL COMPILATION ERROR in: \" __FILE__)\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#error No kernel was compiled, lacking support for current architecture?\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#endif\n\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( io_generated_code->arch < LIBXSMM_X86_SSE3 ) {
      /* arch-independent kernel: header emitted no guard, so nothing to close */
    } else {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH );
      return;
    }
  }
}

/**
 * Records an error code on the generated-code object and optionally prints it.
 * A thread-local "last seen" code suppresses repeated messages for the same error.
 * @param io_generated_code  receives the code in its last_error field.
 * @param i_error_code       LIBXSMM_ERR_* code.
 * @param context            optional prefix for the message; NOTE(review): a
 *                           context starting with '0' also suppresses the prefix —
 *                           presumably a sentinel used by callers, confirm.
 * @param emit_message       non-zero to print to stderr.
 */
LIBXSMM_API_INTERN void libxsmm_handle_error( libxsmm_generated_code* io_generated_code, const unsigned int i_error_code, const char* context, int emit_message ) {
  static LIBXSMM_TLS unsigned int last_error_code;
  if (i_error_code != last_error_code) {
    if (0 != emit_message) {
      LIBXSMM_STDIO_ACQUIRE();
      if (0 != context && 0 != *context && '0' != *context) {
        fprintf(stderr, "LIBXSMM ERROR (%s): %s\n", context, libxsmm_strerror(i_error_code));
      } else {
        fprintf(stderr, "LIBXSMM ERROR: %s\n", libxsmm_strerror(i_error_code));
      }
      LIBXSMM_STDIO_RELEASE();
    }
    last_error_code = i_error_code;
  }
  io_generated_code->last_error = i_error_code;
}

/**
 * Maps a LIBXSMM_ERR_* code to a human-readable message.
 * Returns a pointer to a thread-local buffer that is overwritten by the
 * next call on the same thread. (Case table continues on following chunks.)
 */
LIBXSMM_API const char* libxsmm_strerror(unsigned int i_error_code) {
  static LIBXSMM_TLS char error_message[GENERATOR_COMMON_MAX_ERROR_LENGTH+1];
  /* body of libxsmm_strerror(): format the message for each known error code */
  switch (i_error_code) {
    case LIBXSMM_ERR_GENERAL:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "a general error occurred (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_ALLOC:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "memory allocation failed (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_BUFFER_TOO_SMALL:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "code generation ran out of buffer capacity (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_APPEND_STR:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "append code as string was called for generation mode which does not support this (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_ARCH_PREC:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "unknown architecture or unsupported precision (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_ARCH:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "unknown architecture (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_UNSUP_ARCH:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "unsupported arch for the selected module was specified (error #%u)!", i_error_code );
      break;
    /* leading-dimension validation errors */
    case LIBXSMM_ERR_LDA:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "lda needs to be greater than or equal to m (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_LDB:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "ldb needs to be greater than or equal to k (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_LDC:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "ldc needs to be greater than or equal to m (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_SPGEMM_GEN:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "could not determine which sparse code generation variant is requested (error #%u)!", i_error_code );
      break;
    /* CSC sparse-matrix input errors */
    case LIBXSMM_ERR_CSC_INPUT:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "could not open the CSC input file, or invalid file content found (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_CSC_READ_LEN:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "exceeded predefined line-length when reading line of CSC file (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_CSC_READ_DESC:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "error when reading descriptor of CSC file (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_CSC_READ_ELEMS:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "error when reading line of CSC file (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_CSC_LEN:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "number of elements read differs from number of elements specified in CSC file (error #%u)!", i_error_code );
      break;
    /* microkernel blocking errors */
    case LIBXSMM_ERR_N_BLOCK:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "invalid N blocking in microkernel (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_M_BLOCK:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "invalid M blocking in microkernel (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_K_BLOCK:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "invalid K blocking in microkernel (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_REG_BLOCK:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "invalid MxN register blocking was specified (error #%u)!", i_error_code );
      break;
    /* ISA capability errors */
    case LIBXSMM_ERR_NO_AVX512_BCAST:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "fused memory broadcast is not supported on other platforms than AVX512 (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_NO_AVX512_QFMA:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "there is no QFMA instruction set extension available (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_NO_INDEX_SCALE_ADDR:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "Index + Scale addressing mode is currently not implemented (error #%u)!", i_error_code );
      break;
    /* jump/label bookkeeping errors */
    case LIBXSMM_ERR_UNSUPPORTED_JUMP:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "Unsupported jump instruction requested (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_NO_JMPLBL_AVAIL:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "No destination jump label is available (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_EXCEED_JMPLBL:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "too many nested loops, exceeding loop label tracker (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_JMPLBL_USED:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "attempted to use an already used jump label (error #%u)!", i_error_code );
      break;
    /* CSR sparse-matrix input errors */
    case LIBXSMM_ERR_CSC_ALLOC_DATA:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "could not allocate temporary memory for reading CSC file (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_CSR_ALLOC_DATA:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "could not allocate temporary memory for reading CSR file (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_CSR_INPUT:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "could not open the specified CSR input file (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_CSR_READ_LEN:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "exceeded predefined line-length when reading line of CSR file (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_CSR_READ_DESC:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "error when reading descriptor of CSR file (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_CSR_READ_ELEMS:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "error when reading line of CSR file (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_CSR_LEN:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "number of elements read differs from number of elements specified in CSR file (error #%u)!", i_error_code );
      break;
    /* datatype/format errors */
    case LIBXSMM_ERR_UNSUP_DATATYPE:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "unsupported datatype was requested (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_UNSUP_DT_FORMAT:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "unsupported datatype and format combination was requested (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_INVALID_GEMM_CONFIG:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "invalid GEMM config in setup detected (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_UNIQUE_VAL:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "for sparse-A in reg: too many values in A (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_VEC_REG_MUST_BE_UNDEF:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "input vector register parameter must be undefined here (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_TRANS_B:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "GEMM kernel with trans B requested, but target/datatype not supported (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_LDB_TRANS:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "ldb needs to be greater than or equal to n (error #%u)!", i_error_code );
      break;
    /* VNNI layout errors */
    case LIBXSMM_ERR_VNNI_A:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "A is not provided in supported VNNI format (error #%u)!", i_error_code );
      break;
    case LIBXSMM_ERR_VNNI_B:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "B is not provided in supported VNNI format (error #%u)!", i_error_code );
      break;
    /* tail of libxsmm_strerror(): final case and catch-all */
    case LIBXSMM_ERR_NO_AVX512VL:
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "the AVX512VL instruction set extension is currently not available (error #%u)!", i_error_code );
      break;
    default:
      /* we do not know what happened */
      LIBXSMM_SNPRINTF( error_message, GENERATOR_COMMON_MAX_ERROR_LENGTH, "an unknown error occurred (error #%u)!", i_error_code );
      break;
  }
  return error_message;
}

/**
 * Splits i_size into (up to) two ranges of equally-sized blocks, each block
 * at most i_max_block elements, with the two block sizes differing by one.
 * On perfect blocking only the first range is populated and the second is zero.
 * @param i_size       total number of elements to block (treated as >= 1).
 * @param i_max_block  maximum block size.
 * @param o_range_1    [out] total elements covered by blocks of size *o_block_1.
 * @param o_block_1    [out] first (larger) block size.
 * @param o_range_2    [out] total elements covered by blocks of size *o_block_2.
 * @param o_block_2    [out] second block size (0 when unused).
 * @return 0 on success, 1 when a range is not divisible by its block size.
 */
LIBXSMM_API_INTERN unsigned int libxsmm_compute_equalized_blocking( unsigned int i_size, unsigned int i_max_block, unsigned int* o_range_1, unsigned int* o_block_1, unsigned int* o_range_2, unsigned int* o_block_2 ) {
  unsigned int l_size = LIBXSMM_MAX(i_size, 1);
  unsigned int l_number_of_chunks = ((l_size - 1) / i_max_block) + 1;
  unsigned int l_modulo = l_size % l_number_of_chunks;
  unsigned int l_n2 = l_size / l_number_of_chunks;
  unsigned int l_n1 = l_n2 + 1;
  unsigned int l_N2 = 0;
  unsigned int l_N1 = 0;
  unsigned int l_chunk = 0;
  unsigned int l_ret = 0;
  /* ranges */
  if (l_n1 > i_max_block) l_n1 = i_max_block;
  for (l_chunk = 0; l_chunk < l_number_of_chunks; ++l_chunk) {
    if (l_chunk < l_modulo) {
      l_N1 += l_n1;
    } else {
      l_N2 += l_n2;
    }
  }
  /* if we have perfect blocking, swap n2 and n1 */
  if ( l_modulo == 0 ) {
    l_n1 = l_n2;
    l_N1 = l_N2;
    l_n2 = 0;
    l_N2 = 0;
  }
  /* some checks */
  if ( (l_N1 % l_n1) != 0 ) {
    l_ret = 1;
  }
  if ( l_n2 != 0 ) {
    if ( l_N2 % l_n2 != 0 ) {
      l_ret = 1;
    }
  }
  /* set output variables */
  *o_range_1 = l_N1;
  *o_block_1 = l_n1;
  *o_range_2 = l_N2;
  *o_block_2 = l_n2;
  return l_ret;
}

/* Translates public reduce-eltwise flags into their internal "comp" counterparts
 * (one-to-one mapping; unknown values fall back to the NONE flag). */
LIBXSMM_API_INTERN libxsmm_meltw_comp_redu_flags libxsmm_get_meltw_comp_redu_flags( libxsmm_meltw_redu_flags flags ) {
  switch ( flags ) {
    case LIBXSMM_MELTW_FLAG_REDUCE_NONE: return LIBXSMM_MELTW_COMP_FLAG_REDUCE_NONE;
    case LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD: return LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD;
    case LIBXSMM_MELTW_FLAG_REDUCE_OP_MAX: return LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_MAX;
    case LIBXSMM_MELTW_FLAG_REDUCE_OP_MUL: return LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_MUL;
    case LIBXSMM_MELTW_FLAG_REDUCE_ROWS: return LIBXSMM_MELTW_COMP_FLAG_REDUCE_ROWS;
    case LIBXSMM_MELTW_FLAG_REDUCE_COLS: return LIBXSMM_MELTW_COMP_FLAG_REDUCE_COLS;
    case LIBXSMM_MELTW_FLAG_REDUCE_ELTS: return LIBXSMM_MELTW_COMP_FLAG_REDUCE_ELTS;
    case LIBXSMM_MELTW_FLAG_REDUCE_ELTS_SQUARED: return LIBXSMM_MELTW_COMP_FLAG_REDUCE_ELTS_SQUARED;
    case LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_ROWS: return LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD_ROWS;
    case LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_COLS: return LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD_COLS;
    default: return LIBXSMM_MELTW_COMP_FLAG_REDUCE_NONE;
  }
}

/* Inverse of libxsmm_get_meltw_comp_redu_flags: internal "comp" flags back to
 * public reduce-eltwise flags (unknown values fall back to the NONE flag). */
LIBXSMM_API_INTERN libxsmm_meltw_redu_flags libxsmm_get_meltw_redu_flags( libxsmm_meltw_comp_redu_flags flags ) {
  switch ( flags ) {
    case LIBXSMM_MELTW_COMP_FLAG_REDUCE_NONE: return LIBXSMM_MELTW_FLAG_REDUCE_NONE;
    case LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD: return LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD;
    case LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_MAX: return LIBXSMM_MELTW_FLAG_REDUCE_OP_MAX;
    case LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_MUL: return LIBXSMM_MELTW_FLAG_REDUCE_OP_MUL;
    case LIBXSMM_MELTW_COMP_FLAG_REDUCE_ROWS: return LIBXSMM_MELTW_FLAG_REDUCE_ROWS;
    case LIBXSMM_MELTW_COMP_FLAG_REDUCE_COLS: return LIBXSMM_MELTW_FLAG_REDUCE_COLS;
    case LIBXSMM_MELTW_COMP_FLAG_REDUCE_ELTS: return LIBXSMM_MELTW_FLAG_REDUCE_ELTS;
    case LIBXSMM_MELTW_COMP_FLAG_REDUCE_ELTS_SQUARED: return LIBXSMM_MELTW_FLAG_REDUCE_ELTS_SQUARED;
    case LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD_ROWS: return LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_ROWS;
    case LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD_COLS: return LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_COLS;
    default: return LIBXSMM_MELTW_FLAG_REDUCE_NONE;
  }
}

/* Translates public scale-eltwise flags into their internal "comp" counterparts
 * (case table continues on the next chunk). */
LIBXSMM_API_INTERN libxsmm_meltw_comp_scal_flags libxsmm_get_meltw_comp_scal_flags( libxsmm_meltw_scal_flags flags ) {
  switch( flags ) {
    case LIBXSMM_MELTW_FLAG_SCALE_NONE: return LIBXSMM_MELTW_COMP_FLAG_SCALE_NONE;
    case LIBXSMM_MELTW_FLAG_SCALE_MULT: return LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT;
    case LIBXSMM_MELTW_FLAG_SCALE_SHIFT: return
      LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT;
    /* remaining public -> internal scale-flag cases */
    case LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS;
    case LIBXSMM_MELTW_FLAG_SCALE_ROWS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_ROWS;
    case LIBXSMM_MELTW_FLAG_SCALE_COLS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_COLS;
    case LIBXSMM_MELTW_FLAG_SCALE_MULT_ROWS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ROWS;
    case LIBXSMM_MELTW_FLAG_SCALE_SHIFT_ROWS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT_ROWS;
    case LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS_ROWS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_ROWS;
    case LIBXSMM_MELTW_FLAG_SCALE_MULT_SHIFT_ROWS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ROWS;
    case LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS_SHIFT_ROWS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_SHIFT_ROWS;
    case LIBXSMM_MELTW_FLAG_SCALE_MULT_ADD_BIAS_ROWS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ADD_BIAS_ROWS;
    case LIBXSMM_MELTW_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_ROWS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_ROWS;
    case LIBXSMM_MELTW_FLAG_SCALE_MULT_COLS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_COLS;
    case LIBXSMM_MELTW_FLAG_SCALE_SHIFT_COLS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT_COLS;
    case LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS_COLS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_COLS;
    case LIBXSMM_MELTW_FLAG_SCALE_MULT_SHIFT_COLS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_COLS;
    case LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS_SHIFT_COLS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_SHIFT_COLS;
    case LIBXSMM_MELTW_FLAG_SCALE_MULT_ADD_BIAS_COLS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ADD_BIAS_COLS;
    case LIBXSMM_MELTW_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_COLS: return LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_COLS;
    default: return LIBXSMM_MELTW_COMP_FLAG_SCALE_NONE;
  }
}

/* Inverse of libxsmm_get_meltw_comp_scal_flags: internal "comp" flags back to
 * public scale-eltwise flags (unknown values fall back to the NONE flag). */
LIBXSMM_API_INTERN libxsmm_meltw_scal_flags libxsmm_get_meltw_scal_flags( libxsmm_meltw_comp_scal_flags flags ) {
  switch( flags ) {
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_NONE: return LIBXSMM_MELTW_FLAG_SCALE_NONE;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT: return LIBXSMM_MELTW_FLAG_SCALE_MULT;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT: return LIBXSMM_MELTW_FLAG_SCALE_SHIFT;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS: return LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_ROWS: return LIBXSMM_MELTW_FLAG_SCALE_ROWS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_COLS: return LIBXSMM_MELTW_FLAG_SCALE_COLS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ROWS: return LIBXSMM_MELTW_FLAG_SCALE_MULT_ROWS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT_ROWS: return LIBXSMM_MELTW_FLAG_SCALE_SHIFT_ROWS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_ROWS: return LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS_ROWS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ROWS: return LIBXSMM_MELTW_FLAG_SCALE_MULT_SHIFT_ROWS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_SHIFT_ROWS: return LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS_SHIFT_ROWS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ADD_BIAS_ROWS: return LIBXSMM_MELTW_FLAG_SCALE_MULT_ADD_BIAS_ROWS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_ROWS: return LIBXSMM_MELTW_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_ROWS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_COLS: return LIBXSMM_MELTW_FLAG_SCALE_MULT_COLS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT_COLS: return LIBXSMM_MELTW_FLAG_SCALE_SHIFT_COLS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_COLS: return LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS_COLS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_COLS: return LIBXSMM_MELTW_FLAG_SCALE_MULT_SHIFT_COLS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_SHIFT_COLS: return LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS_SHIFT_COLS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ADD_BIAS_COLS: return LIBXSMM_MELTW_FLAG_SCALE_MULT_ADD_BIAS_COLS;
    case LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_COLS: return LIBXSMM_MELTW_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_COLS;
    default: return LIBXSMM_MELTW_FLAG_SCALE_NONE;
  }
}

/* Translates public cvta (convert+activation) flags into internal "comp" flags
 * (switch body continues on the next chunk). */
LIBXSMM_API_INTERN libxsmm_meltw_comp_cvta_flags libxsmm_get_meltw_comp_cvta_flags( libxsmm_meltw_cvta_flags flags ) {
  /* public -> internal cvta (convert+activation) flag mapping */
  switch ( flags ) {
    case LIBXSMM_MELTW_FLAG_CVTA_NONE: return LIBXSMM_MELTW_COMP_FLAG_CVTA_NONE;
    case LIBXSMM_MELTW_FLAG_CVTA_FUSE_RELU: return LIBXSMM_MELTW_COMP_FLAG_CVTA_FUSE_RELU;
    case LIBXSMM_MELTW_FLAG_CVTA_FUSE_TANH: return LIBXSMM_MELTW_COMP_FLAG_CVTA_FUSE_TANH;
    case LIBXSMM_MELTW_FLAG_CVTA_FUSE_SIGM: return LIBXSMM_MELTW_COMP_FLAG_CVTA_FUSE_SIGM;
    default: return LIBXSMM_MELTW_COMP_FLAG_CVTA_NONE;
  }
}

/* Inverse of libxsmm_get_meltw_comp_cvta_flags: internal "comp" flags back to
 * public cvta flags (unknown values fall back to the NONE flag). */
LIBXSMM_API_INTERN libxsmm_meltw_cvta_flags libxsmm_get_meltw_cvta_flags( libxsmm_meltw_comp_cvta_flags flags ) {
  switch ( flags ) {
    case LIBXSMM_MELTW_COMP_FLAG_CVTA_NONE: return LIBXSMM_MELTW_FLAG_CVTA_NONE;
    case LIBXSMM_MELTW_COMP_FLAG_CVTA_FUSE_RELU: return LIBXSMM_MELTW_FLAG_CVTA_FUSE_RELU;
    case LIBXSMM_MELTW_COMP_FLAG_CVTA_FUSE_TANH: return LIBXSMM_MELTW_FLAG_CVTA_FUSE_TANH;
    case LIBXSMM_MELTW_COMP_FLAG_CVTA_FUSE_SIGM: return LIBXSMM_MELTW_FLAG_CVTA_FUSE_SIGM;
    default: return LIBXSMM_MELTW_FLAG_CVTA_NONE;
  }
}

/* Translates public acvt flags into their internal "comp" counterparts
 * (unknown values fall back to the NONE flag). */
LIBXSMM_API_INTERN libxsmm_meltw_comp_acvt_flags libxsmm_get_meltw_comp_acvt_flags( libxsmm_meltw_acvt_flags flags ) {
  switch ( flags ) {
    case LIBXSMM_MELTW_FLAG_ACVT_NONE: return LIBXSMM_MELTW_COMP_FLAG_ACVT_NONE;
    case LIBXSMM_MELTW_FLAG_ACVT_FUSE_TANH: return LIBXSMM_MELTW_COMP_FLAG_ACVT_FUSE_TANH;
    case LIBXSMM_MELTW_FLAG_ACVT_FUSE_SIGM: return LIBXSMM_MELTW_COMP_FLAG_ACVT_FUSE_SIGM;
    default: return LIBXSMM_MELTW_COMP_FLAG_ACVT_NONE;
  }
}

/* Inverse of libxsmm_get_meltw_comp_acvt_flags: internal "comp" flags back to
 * public acvt flags (unknown values fall back to the NONE flag). */
LIBXSMM_API_INTERN libxsmm_meltw_acvt_flags libxsmm_get_meltw_acvt_flags( libxsmm_meltw_comp_acvt_flags flags ) {
  switch ( flags ) {
    case LIBXSMM_MELTW_COMP_FLAG_ACVT_NONE: return LIBXSMM_MELTW_FLAG_ACVT_NONE;
    case LIBXSMM_MELTW_COMP_FLAG_ACVT_FUSE_TANH: return LIBXSMM_MELTW_FLAG_ACVT_FUSE_TANH;
    case LIBXSMM_MELTW_COMP_FLAG_ACVT_FUSE_SIGM: return LIBXSMM_MELTW_FLAG_ACVT_FUSE_SIGM;
    default: return LIBXSMM_MELTW_FLAG_ACVT_NONE;
  }
}

/* Translates public cbiasact (colbias+activation) flags into their internal
 * "comp" counterparts (unknown values fall back to the NONE flag). */
LIBXSMM_API_INTERN libxsmm_meltw_comp_cbiasact_flags libxsmm_get_meltw_comp_cbiasact_flags( libxsmm_meltw_cbiasact_flags flags ) {
  switch ( flags ) {
    case LIBXSMM_MELTW_FLAG_CBIASACT_NONE: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_NONE;
    case LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS;
    case LIBXSMM_MELTW_FLAG_CBIASACT_ACT_RELU: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_ACT_RELU;
    case LIBXSMM_MELTW_FLAG_CBIASACT_ACT_TANH: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_ACT_TANH;
    case LIBXSMM_MELTW_FLAG_CBIASACT_ACT_SIGM: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_ACT_SIGM;
    case LIBXSMM_MELTW_FLAG_CBIASACT_ACT_GELU: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_ACT_GELU;
    case LIBXSMM_MELTW_FLAG_CBIASACT_OVERWRITE_C: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_OVERWRITE_C;
    case LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_RELU: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_RELU;
    case LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_TANH: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_TANH;
    case LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_SIGM: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_SIGM;
    case LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_GELU: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_GELU;
    case LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_RELU_OVERWRITE_C: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_RELU_OVERWRITE_C;
    case LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_TANH_OVERWRITE_C: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_TANH_OVERWRITE_C;
    case LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_SIGM_OVERWRITE_C: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_SIGM_OVERWRITE_C;
    case LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_GELU_OVERWRITE_C: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_GELU_OVERWRITE_C;
    default: return LIBXSMM_MELTW_COMP_FLAG_CBIASACT_NONE;
  }
}

/* Inverse of libxsmm_get_meltw_comp_cbiasact_flags (switch continues on the next chunk). */
LIBXSMM_API_INTERN libxsmm_meltw_cbiasact_flags libxsmm_get_meltw_cbiasact_flags( libxsmm_meltw_comp_cbiasact_flags flags ) {
  switch ( flags ) {
    case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_NONE: return LIBXSMM_MELTW_FLAG_CBIASACT_NONE;
    case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS: return LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS;
    case
LIBXSMM_MELTW_COMP_FLAG_CBIASACT_ACT_RELU: return LIBXSMM_MELTW_FLAG_CBIASACT_ACT_RELU; case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_ACT_TANH: return LIBXSMM_MELTW_FLAG_CBIASACT_ACT_TANH; case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_ACT_SIGM: return LIBXSMM_MELTW_FLAG_CBIASACT_ACT_SIGM; case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_ACT_GELU: return LIBXSMM_MELTW_FLAG_CBIASACT_ACT_GELU; case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_OVERWRITE_C: return LIBXSMM_MELTW_FLAG_CBIASACT_OVERWRITE_C; case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_RELU: return LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_RELU; case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_TANH: return LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_TANH; case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_SIGM: return LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_SIGM; case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_GELU: return LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_GELU; case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_RELU_OVERWRITE_C: return LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_RELU_OVERWRITE_C; case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_TANH_OVERWRITE_C: return LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_TANH_OVERWRITE_C; case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_SIGM_OVERWRITE_C: return LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_SIGM_OVERWRITE_C; case LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_GELU_OVERWRITE_C: return LIBXSMM_MELTW_FLAG_CBIASACT_COLBIAS_ACT_GELU_OVERWRITE_C; default: return LIBXSMM_MELTW_FLAG_CBIASACT_NONE; } } libxsmm-1.17/src/generator_common.h000066400000000000000000000702121415223013700174130ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_COMMON_H #define GENERATOR_COMMON_H #include #include /*@TODO check if we want to use enums here? Has this implications in the encoder? */ /* defining register mappings */ #define LIBXSMM_X86_GP_REG_RAX 0 #define LIBXSMM_X86_GP_REG_RCX 1 #define LIBXSMM_X86_GP_REG_RDX 2 #define LIBXSMM_X86_GP_REG_RBX 3 #define LIBXSMM_X86_GP_REG_RSP 4 #define LIBXSMM_X86_GP_REG_RBP 5 #define LIBXSMM_X86_GP_REG_RSI 6 #define LIBXSMM_X86_GP_REG_RDI 7 #define LIBXSMM_X86_GP_REG_R8 8 #define LIBXSMM_X86_GP_REG_R9 9 #define LIBXSMM_X86_GP_REG_R10 10 #define LIBXSMM_X86_GP_REG_R11 11 #define LIBXSMM_X86_GP_REG_R12 12 #define LIBXSMM_X86_GP_REG_R13 13 #define LIBXSMM_X86_GP_REG_R14 14 #define LIBXSMM_X86_GP_REG_R15 15 #define LIBXSMM_X86_GP_REG_UNDEF 127 /* define a place holder to handle AVX and SSE with a single encoder function using this values as the third operand means SSE */ #define LIBXSMM_X86_VEC_REG_UNDEF 255 #define LIBXSMM_X86_MASK_REG_UNDEF 255 #define LIBXSMM_X86_AVX512_MASK 1 /* this specifies k1 */ /* special value for undefined immediate */ #define LIBXSMM_X86_IMM_UNDEF 1024 /* special instruction */ #define LIBXSMM_X86_INSTR_UNDEF 9999 /* Load/Store/Move instructions */ /* AVX1,AVX2,AVX512 */ #define LIBXSMM_X86_INSTR_VMOVAPD 10000 #define LIBXSMM_X86_INSTR_VMOVUPD 10001 #define LIBXSMM_X86_INSTR_VMOVAPS 10002 #define LIBXSMM_X86_INSTR_VMOVUPS 10003 #define LIBXSMM_X86_INSTR_VBROADCASTSD 10004 #define LIBXSMM_X86_INSTR_VBROADCASTSS 10005 #define LIBXSMM_X86_INSTR_VMOVDDUP 10006 #define LIBXSMM_X86_INSTR_VMOVSD 10007 #define LIBXSMM_X86_INSTR_VMOVSS 10008 #define LIBXSMM_X86_INSTR_VPBROADCASTB 10025 #define LIBXSMM_X86_INSTR_VPBROADCASTW 10026 #define 
LIBXSMM_X86_INSTR_VPBROADCASTD 10027 #define LIBXSMM_X86_INSTR_VPBROADCASTQ 10028 #define LIBXSMM_X86_INSTR_VMOVDQA32 10029 #define LIBXSMM_X86_INSTR_VMOVDQA64 10030 #define LIBXSMM_X86_INSTR_VMOVDQU8 10031 #define LIBXSMM_X86_INSTR_VMOVDQU16 10032 #define LIBXSMM_X86_INSTR_VMOVDQU32 10033 #define LIBXSMM_X86_INSTR_VMOVDQU64 10034 #define LIBXSMM_X86_INSTR_VMASKMOVPD 10035 #define LIBXSMM_X86_INSTR_VMASKMOVPS 10036 #define LIBXSMM_X86_INSTR_VMOVNTPD 10037 #define LIBXSMM_X86_INSTR_VMOVNTPS 10038 #define LIBXSMM_X86_INSTR_VMOVNTDQ 10039 /* SSE */ #define LIBXSMM_X86_INSTR_MOVAPD 10009 #define LIBXSMM_X86_INSTR_MOVUPD 10010 #define LIBXSMM_X86_INSTR_MOVAPS 10011 #define LIBXSMM_X86_INSTR_MOVUPS 10012 #define LIBXSMM_X86_INSTR_MOVSD 10013 #define LIBXSMM_X86_INSTR_MOVSS 10014 #define LIBXSMM_X86_INSTR_MOVDDUP 10015 #define LIBXSMM_X86_INSTR_SHUFPS 10016 #define LIBXSMM_X86_INSTR_SHUFPD 10017 /* Gather/Scatter instructions */ #define LIBXSMM_X86_INSTR_VGATHERDPS 11000 #define LIBXSMM_X86_INSTR_VGATHERDPD 11001 #define LIBXSMM_X86_INSTR_VGATHERQPS 11002 #define LIBXSMM_X86_INSTR_VGATHERQPD 11003 #define LIBXSMM_X86_INSTR_VSCATTERDPS 11004 #define LIBXSMM_X86_INSTR_VSCATTERDPD 11005 #define LIBXSMM_X86_INSTR_VSCATTERQPS 11006 #define LIBXSMM_X86_INSTR_VSCATTERQPD 11007 /* Shuffle/Permute/Blend instructions */ #define LIBXSMM_X86_INSTR_VSHUFPS 12000 #define LIBXSMM_X86_INSTR_VPERM2F128 12001 #define LIBXSMM_X86_INSTR_VSHUFF64X2 12002 #define LIBXSMM_X86_INSTR_VEXTRACTF32X8 12003 #define LIBXSMM_X86_INSTR_VEXTRACTF64X4 12004 #define LIBXSMM_X86_INSTR_VSHUFPD 12005 #define LIBXSMM_X86_INSTR_VSHUFF32X4 12006 #define LIBXSMM_X86_INSTR_VSHUFI32X4 12007 #define LIBXSMM_X86_INSTR_VSHUFI64X2 12008 /* Vector compute instructions */ /* AVX1,AVX2,AVX512 */ #define LIBXSMM_X86_INSTR_VXORPD 20000 #define LIBXSMM_X86_INSTR_VMULPD 20001 #define LIBXSMM_X86_INSTR_VADDPD 20002 #define LIBXSMM_X86_INSTR_VSUBPD 20003 #define LIBXSMM_X86_INSTR_VFMADD231PD 20004 #define 
LIBXSMM_X86_INSTR_VFMSUB231PD 20005 #define LIBXSMM_X86_INSTR_VFNMADD231PD 20006 #define LIBXSMM_X86_INSTR_VFNMSUB231PD 20007 #define LIBXSMM_X86_INSTR_VMULSD 20008 #define LIBXSMM_X86_INSTR_VADDSD 20009 #define LIBXSMM_X86_INSTR_VSUBSD 20010 #define LIBXSMM_X86_INSTR_VFMADD231SD 20011 #define LIBXSMM_X86_INSTR_VFMSUB231SD 20012 #define LIBXSMM_X86_INSTR_VFNMADD231SD 20013 #define LIBXSMM_X86_INSTR_VFNMSUB231SD 20014 #define LIBXSMM_X86_INSTR_VXORPS 20015 #define LIBXSMM_X86_INSTR_VMULPS 20016 #define LIBXSMM_X86_INSTR_VADDPS 20017 #define LIBXSMM_X86_INSTR_VSUBPS 20018 #define LIBXSMM_X86_INSTR_VFMADD231PS 20019 #define LIBXSMM_X86_INSTR_VFMSUB231PS 20020 #define LIBXSMM_X86_INSTR_VFNMADD231PS 20021 #define LIBXSMM_X86_INSTR_VFNMSUB231PS 20022 #define LIBXSMM_X86_INSTR_VMULSS 20023 #define LIBXSMM_X86_INSTR_VADDSS 20024 #define LIBXSMM_X86_INSTR_VSUBSS 20025 #define LIBXSMM_X86_INSTR_VFMADD231SS 20026 #define LIBXSMM_X86_INSTR_VFMSUB231SS 20027 #define LIBXSMM_X86_INSTR_VFNMADD231SS 20028 #define LIBXSMM_X86_INSTR_VFNMSUB231SS 20029 #define LIBXSMM_X86_INSTR_VPERMW 20030 #define LIBXSMM_X86_INSTR_VFMADD132PD 20031 #define LIBXSMM_X86_INSTR_VFMSUB132PD 20032 #define LIBXSMM_X86_INSTR_VFNMADD132PD 20033 #define LIBXSMM_X86_INSTR_VFNMSUB132PD 20034 #define LIBXSMM_X86_INSTR_VFMADD132SD 20035 #define LIBXSMM_X86_INSTR_VFMSUB132SD 20036 #define LIBXSMM_X86_INSTR_VFNMADD132SD 20037 #define LIBXSMM_X86_INSTR_VFNMSUB132SD 20038 #define LIBXSMM_X86_INSTR_VFMADD132PS 20039 #define LIBXSMM_X86_INSTR_VFMSUB132PS 20040 #define LIBXSMM_X86_INSTR_VFNMADD132PS 20041 #define LIBXSMM_X86_INSTR_VFNMSUB132PS 20042 #define LIBXSMM_X86_INSTR_VFMADD132SS 20043 #define LIBXSMM_X86_INSTR_VFMSUB132SS 20044 #define LIBXSMM_X86_INSTR_VFNMADD132SS 20045 #define LIBXSMM_X86_INSTR_VFNMSUB132SS 20046 #define LIBXSMM_X86_INSTR_VFMADD213PD 20047 #define LIBXSMM_X86_INSTR_VFMSUB213PD 20048 #define LIBXSMM_X86_INSTR_VFNMADD213PD 20049 #define LIBXSMM_X86_INSTR_VFNMSUB213PD 20050 #define 
LIBXSMM_X86_INSTR_VFMADD213SD 20051 #define LIBXSMM_X86_INSTR_VFMSUB213SD 20052 #define LIBXSMM_X86_INSTR_VFNMADD213SD 20053 #define LIBXSMM_X86_INSTR_VFNMSUB213SD 20054 #define LIBXSMM_X86_INSTR_VFMADD213PS 20055 #define LIBXSMM_X86_INSTR_VFMSUB213PS 20056 #define LIBXSMM_X86_INSTR_VFNMADD213PS 20057 #define LIBXSMM_X86_INSTR_VFNMSUB213PS 20058 #define LIBXSMM_X86_INSTR_VFMADD213SS 20059 #define LIBXSMM_X86_INSTR_VFMSUB213SS 20060 #define LIBXSMM_X86_INSTR_VFNMADD213SS 20061 #define LIBXSMM_X86_INSTR_VFNMSUB213SS 20062 /* SSE */ #define LIBXSMM_X86_INSTR_XORPD 20063 #define LIBXSMM_X86_INSTR_MULPD 20064 #define LIBXSMM_X86_INSTR_ADDPD 20065 #define LIBXSMM_X86_INSTR_SUBPD 20066 #define LIBXSMM_X86_INSTR_MULSD 20067 #define LIBXSMM_X86_INSTR_ADDSD 20068 #define LIBXSMM_X86_INSTR_SUBSD 20069 #define LIBXSMM_X86_INSTR_XORPS 20070 #define LIBXSMM_X86_INSTR_MULPS 20071 #define LIBXSMM_X86_INSTR_ADDPS 20072 #define LIBXSMM_X86_INSTR_SUBPS 20073 #define LIBXSMM_X86_INSTR_MULSS 20074 #define LIBXSMM_X86_INSTR_ADDSS 20075 #define LIBXSMM_X86_INSTR_SUBSS 20076 /* AVX512F: Integer XOR as there is no FP */ #define LIBXSMM_X86_INSTR_VPXORD 20077 /* additional integer stuff */ #define LIBXSMM_X86_INSTR_VPADDQ 20078 #define LIBXSMM_X86_INSTR_VPADDD 20079 #define LIBXSMM_X86_INSTR_VPADDW 20080 #define LIBXSMM_X86_INSTR_VPADDB 20081 #define LIBXSMM_X86_INSTR_VPMADDWD 20082 #define LIBXSMM_X86_INSTR_VPMADDUBSW 20083 #define LIBXSMM_X86_INSTR_VPADDSW 20084 #define LIBXSMM_X86_INSTR_VPADDSB 20085 #define LIBXSMM_X86_INSTR_VPSUBD 20086 /* Additional vector manipulations */ #define LIBXSMM_X86_INSTR_VUNPCKLPD 20087 #define LIBXSMM_X86_INSTR_VUNPCKLPS 20088 #define LIBXSMM_X86_INSTR_VUNPCKHPD 20089 #define LIBXSMM_X86_INSTR_VUNPCKHPS 20090 #define LIBXSMM_X86_INSTR_VPSRAVD 20091 #define LIBXSMM_X86_INSTR_VCVTDQ2PS 20092 #define LIBXSMM_X86_INSTR_VDIVPS 20093 #define LIBXSMM_X86_INSTR_VDIVPD 20094 #define LIBXSMM_X86_INSTR_VCVTPS2PD 20095 #define LIBXSMM_X86_INSTR_VBLENDMPS 20096 #define 
LIBXSMM_X86_INSTR_VCMPPS 20097 #define LIBXSMM_X86_INSTR_VPANDD 20098 #define LIBXSMM_X86_INSTR_VPANDQ 20099 #define LIBXSMM_X86_INSTR_VMAXPD 20100 #define LIBXSMM_X86_INSTR_VMAXPS 20101 #define LIBXSMM_X86_INSTR_VCVTPS2PH 20102 #define LIBXSMM_X86_INSTR_VCVTPH2PS 20103 #define LIBXSMM_X86_INSTR_VPERMD 20104 #define LIBXSMM_X86_INSTR_VPMOVDW 20105 #define LIBXSMM_X86_INSTR_VPSRAD 20106 #define LIBXSMM_X86_INSTR_VPSLLD 20107 #define LIBXSMM_X86_INSTR_VPCMPB 20108 #define LIBXSMM_X86_INSTR_VPCMPD 20109 #define LIBXSMM_X86_INSTR_VPCMPW 20110 #define LIBXSMM_X86_INSTR_VPCMPUB 20111 #define LIBXSMM_X86_INSTR_VPCMPUD 20112 #define LIBXSMM_X86_INSTR_VPCMPUW 20113 #define LIBXSMM_X86_INSTR_VPORD 20114 #define LIBXSMM_X86_INSTR_VPSRLD 20115 #define LIBXSMM_X86_INSTR_VPERMT2W 20116 #define LIBXSMM_X86_INSTR_VPMOVSXWD 20117 #define LIBXSMM_X86_INSTR_VPMOVDB 20118 #define LIBXSMM_X86_INSTR_VPMOVSDB 20119 #define LIBXSMM_X86_INSTR_VPMOVUSDB 20120 #define LIBXSMM_X86_INSTR_VCVTPS2DQ 20121 #define LIBXSMM_X86_INSTR_VCVTPS2UDQ 20122 #define LIBXSMM_X86_INSTR_VPMOVZXWD 20123 #define LIBXSMM_X86_INSTR_VPMOVSXBD 20124 #define LIBXSMM_X86_INSTR_VPMOVZXBD 20125 #define LIBXSMM_X86_INSTR_VPBLENDMB 20126 #define LIBXSMM_X86_INSTR_VPBLENDMW 20127 #define LIBXSMM_X86_INSTR_VRCP14PS 20128 #define LIBXSMM_X86_INSTR_VPMAXSD 20129 #define LIBXSMM_X86_INSTR_VPMINSD 20130 /* AVX512, QUAD MADD, QUAD VNNI and VNNI */ #define LIBXSMM_X86_INSTR_V4FMADDPS 26000 #define LIBXSMM_X86_INSTR_V4FNMADDPS 26001 #define LIBXSMM_X86_INSTR_V4FMADDSS 26002 #define LIBXSMM_X86_INSTR_V4FNMADDSS 26003 #define LIBXSMM_X86_INSTR_VP4DPWSSD 26004 #define LIBXSMM_X86_INSTR_VP4DPWSSDS 26005 #define LIBXSMM_X86_INSTR_VPDPWSSD 26006 #define LIBXSMM_X86_INSTR_VPDPWSSDS 26007 #define LIBXSMM_X86_INSTR_VPDPBUSD 26008 #define LIBXSMM_X86_INSTR_VPDPBUSDS 26009 /* AVX512 BF16 */ #define LIBXSMM_X86_INSTR_VDPBF16PS 27000 #define LIBXSMM_X86_INSTR_VCVTNEPS2BF16 27001 #define LIBXSMM_X86_INSTR_VCVTNE2PS2BF16 27002 /* GP 
instructions */ #define LIBXSMM_X86_INSTR_ADDQ 30000 #define LIBXSMM_X86_INSTR_SUBQ 30001 #define LIBXSMM_X86_INSTR_MOVQ 30002 #define LIBXSMM_X86_INSTR_CMPQ 30003 #define LIBXSMM_X86_INSTR_JL 30004 #define LIBXSMM_X86_INSTR_VPREFETCH0 30005 #define LIBXSMM_X86_INSTR_VPREFETCH1 30006 #define LIBXSMM_X86_INSTR_PREFETCHT0 30007 #define LIBXSMM_X86_INSTR_PREFETCHT1 30008 #define LIBXSMM_X86_INSTR_PREFETCHT2 30009 #define LIBXSMM_X86_INSTR_PREFETCHNTA 30010 #define LIBXSMM_X86_INSTR_MOVL 30011 #define LIBXSMM_X86_INSTR_MOVSLQ 30012 #define LIBXSMM_X86_INSTR_SALQ 30013 #define LIBXSMM_X86_INSTR_IMUL 30014 #define LIBXSMM_X86_INSTR_CMOVZ 30015 #define LIBXSMM_X86_INSTR_CMOVNZ 30016 #define LIBXSMM_X86_INSTR_JE 30017 #define LIBXSMM_X86_INSTR_JZ 30018 #define LIBXSMM_X86_INSTR_JG 30019 #define LIBXSMM_X86_INSTR_JNE 30020 #define LIBXSMM_X86_INSTR_JNZ 30021 #define LIBXSMM_X86_INSTR_JGE 30022 #define LIBXSMM_X86_INSTR_JLE 30023 #define LIBXSMM_X86_INSTR_JMP 30024 #define LIBXSMM_X86_INSTR_POPCNT 30025 #define LIBXSMM_X86_INSTR_TZCNT 30026 #define LIBXSMM_X86_INSTR_LEAQ 30027 #define LIBXSMM_X86_INSTR_ANDQ 30028 /* Mask move instructions */ #define LIBXSMM_X86_INSTR_KMOV 40000 #define LIBXSMM_X86_INSTR_KMOVW 40001 #define LIBXSMM_X86_INSTR_KMOVB 40002 #define LIBXSMM_X86_INSTR_KMOVD 40003 #define LIBXSMM_X86_INSTR_KMOVQ 40004 /* Mask compute instructions */ #define LIBXSMM_X86_INSTR_KXNORW 45000 /* define error codes */ #define LIBXSMM_ERR_GENERAL 90000 #define LIBXSMM_ERR_ALLOC 90001 #define LIBXSMM_ERR_BUFFER_TOO_SMALL 90002 #define LIBXSMM_ERR_APPEND_STR 90003 #define LIBXSMM_ERR_ARCH_PREC 90004 #define LIBXSMM_ERR_ARCH 90005 #define LIBXSMM_ERR_UNSUP_ARCH 90006 #define LIBXSMM_ERR_LDA 90007 #define LIBXSMM_ERR_LDB 90008 #define LIBXSMM_ERR_LDC 90009 #define LIBXSMM_ERR_SPGEMM_GEN 90010 #define LIBXSMM_ERR_CSC_INPUT 90011 #define LIBXSMM_ERR_CSC_READ_LEN 90012 #define LIBXSMM_ERR_CSC_READ_DESC 90013 #define LIBXSMM_ERR_CSC_READ_ELEMS 90014 #define LIBXSMM_ERR_CSC_LEN 
90015 #define LIBXSMM_ERR_N_BLOCK 90016 #define LIBXSMM_ERR_M_BLOCK 90017 #define LIBXSMM_ERR_K_BLOCK 90018 #define LIBXSMM_ERR_REG_BLOCK 90019 #define LIBXSMM_ERR_NO_AVX512_BCAST 90020 #define LIBXSMM_ERR_NO_AVX512_QFMA 90021 #define LIBXSMM_ERR_NO_INDEX_SCALE_ADDR 90022 #define LIBXSMM_ERR_UNSUPPORTED_JUMP 90023 #define LIBXSMM_ERR_NO_JMPLBL_AVAIL 90024 #define LIBXSMM_ERR_EXCEED_JMPLBL 90025 #define LIBXSMM_ERR_CSC_ALLOC_DATA 90026 #define LIBXSMM_ERR_CSR_ALLOC_DATA 90027 #define LIBXSMM_ERR_CSR_INPUT 90028 #define LIBXSMM_ERR_CSR_READ_LEN 90029 #define LIBXSMM_ERR_CSR_READ_DESC 90030 #define LIBXSMM_ERR_CSR_READ_ELEMS 90031 #define LIBXSMM_ERR_CSR_LEN 90032 #define LIBXSMM_ERR_UNSUP_DATATYPE 90033 #define LIBXSMM_ERR_UNSUP_DT_FORMAT 90034 #define LIBXSMM_ERR_INVALID_GEMM_CONFIG 90035 #define LIBXSMM_ERR_UNIQUE_VAL 90036 #define LIBXSMM_ERR_VEC_REG_MUST_BE_UNDEF 90037 #define LIBXSMM_ERR_JMPLBL_USED 90038 #define LIBXSMM_ERR_TRANS_B 90039 #define LIBXSMM_ERR_LDB_TRANS 90040 #define LIBXSMM_ERR_VNNI_A 90041 #define LIBXSMM_ERR_VNNI_B 90042 #define LIBXSMM_ERR_NO_AVX512VL 90043 #if defined(LIBXSMM_HANDLE_ERROR_QUIET) # define LIBXSMM_HANDLE_ERROR(GENERATED_CODE, ERROR_CODE) # define LIBXSMM_HANDLE_ERROR_VERBOSE(GENERATED_CODE, ERROR_CODE) #else # define LIBXSMM_HANDLE_ERROR(GENERATED_CODE, ERROR_CODE) libxsmm_handle_error( \ GENERATED_CODE, ERROR_CODE, LIBXSMM_FUNCNAME, 1 < libxsmm_ninit ? 
libxsmm_verbosity : 1) # define LIBXSMM_HANDLE_ERROR_VERBOSE(GENERATED_CODE, ERROR_CODE) libxsmm_handle_error( \ GENERATED_CODE, ERROR_CODE, LIBXSMM_FUNCNAME, 1) #endif /* micro kernel configuration */ LIBXSMM_EXTERN_C typedef struct libxsmm_micro_kernel_config { unsigned int instruction_set; unsigned int vector_reg_count; unsigned int vector_length; unsigned int datatype_size; unsigned int a_vmove_instruction; unsigned int b_vmove_instruction; unsigned int b_shuff_instruction; unsigned int c_vmove_instruction; unsigned int c_vmove_nts_instruction; unsigned int use_masking_a_c; unsigned int prefetch_instruction; unsigned int vxor_instruction; unsigned int vmul_instruction; unsigned int vadd_instruction; unsigned int alu_add_instruction; unsigned int alu_sub_instruction; unsigned int alu_cmp_instruction; unsigned int alu_jmp_instruction; unsigned int alu_mov_instruction; char vector_name; } libxsmm_micro_kernel_config; /* structure for storing the current gp reg mapping */ LIBXSMM_EXTERN_C typedef struct libxsmm_gp_reg_mapping_struct { unsigned int gp_reg_a; unsigned int gp_reg_b; unsigned int gp_reg_c; unsigned int gp_reg_a_prefetch; unsigned int gp_reg_a_offset; unsigned int gp_reg_b_prefetch; unsigned int gp_reg_b_offset; /* unsigned int gp_reg_c_prefetch;*/ unsigned int gp_reg_mloop; unsigned int gp_reg_nloop; unsigned int gp_reg_kloop; unsigned int gp_reg_reduce_count; unsigned int gp_reg_reduce_loop; unsigned int gp_reg_scf; unsigned int gp_reg_help_0; unsigned int gp_reg_help_1; unsigned int gp_reg_help_2; unsigned int gp_reg_help_3; unsigned int gp_reg_help_4; unsigned int gp_reg_help_5; } libxsmm_gp_reg_mapping; /* structure for storing the current gp reg mapping for matcopy */ LIBXSMM_EXTERN_C typedef struct libxsmm_matcopy_gp_reg_mapping_struct { unsigned int gp_reg_a; unsigned int gp_reg_lda; unsigned int gp_reg_b; unsigned int gp_reg_ldb; unsigned int gp_reg_a_pf; unsigned int gp_reg_b_pf; unsigned int gp_reg_m_loop; unsigned int gp_reg_n_loop; unsigned 
int gp_reg_help_0; } libxsmm_matcopy_gp_reg_mapping; /* matcopy kernel configuration */ LIBXSMM_EXTERN_C typedef struct libxsmm_matcopy_kernel_config_struct { unsigned int instruction_set; unsigned int vector_reg_count; unsigned int vector_length; unsigned int datatype_size; unsigned int prefetch_instruction; unsigned int vmove_instruction; unsigned int alu_add_instruction; unsigned int alu_cmp_instruction; unsigned int alu_jmp_instruction; unsigned int alu_mov_instruction; unsigned int vxor_instruction; char vector_name; } libxsmm_matcopy_kernel_config; /* structure for storing the current gp reg mapping for mateltwise */ LIBXSMM_EXTERN_C typedef struct libxsmm_mateltwise_gp_reg_mapping_struct { unsigned int gp_reg_param_struct; unsigned int gp_reg_in; unsigned int gp_reg_ldi; unsigned int gp_reg_out; unsigned int gp_reg_ldo; unsigned int gp_reg_relumask; unsigned int gp_reg_reduced_elts; unsigned int gp_reg_reduced_elts_squared; unsigned int gp_reg_scale_vals; unsigned int gp_reg_shift_vals; unsigned int gp_reg_bias_vals; unsigned int gp_reg_m_loop; unsigned int gp_reg_n_loop; } libxsmm_mateltwise_gp_reg_mapping; /* mateltwise kernel configuration */ LIBXSMM_EXTERN_C typedef struct libxsmm_mateltwise_kernel_config_struct { unsigned int instruction_set; unsigned int vector_reg_count; unsigned int vector_length_in; unsigned int vector_length_out; unsigned int datatype_size_in; unsigned int datatype_size_out; unsigned int vmove_instruction_in; unsigned int vmove_instruction_out; unsigned int alu_add_instruction; unsigned int alu_cmp_instruction; unsigned int alu_jmp_instruction; unsigned int alu_mov_instruction; unsigned int vxor_instruction; char vector_name; } libxsmm_mateltwise_kernel_config; /* structure for storing the current gp reg mapping for transpose */ LIBXSMM_EXTERN_C typedef struct libxsmm_transpose_gp_reg_mapping_struct { unsigned int gp_reg_a; unsigned int gp_reg_lda; unsigned int gp_reg_b; unsigned int gp_reg_ldb; unsigned int gp_reg_m_loop; unsigned 
int gp_reg_n_loop; unsigned int gp_reg_help_0; unsigned int gp_reg_help_1; unsigned int gp_reg_help_2; unsigned int gp_reg_help_3; unsigned int gp_reg_help_4; unsigned int gp_reg_help_5; } libxsmm_transpose_gp_reg_mapping; /* transpose kernel configuration */ LIBXSMM_EXTERN_C typedef struct libxsmm_transpose_kernel_config_struct { unsigned int instruction_set; unsigned int vector_reg_count; char vector_name; } libxsmm_transpose_kernel_config; /* structure for tracking local labels in assembly we don't allow overlapping loops */ LIBXSMM_EXTERN_C typedef struct libxsmm_loop_label_tracker_struct { unsigned int label_address[32]; unsigned int label_count; } libxsmm_loop_label_tracker; /* structure to save jump properties to the same destination */ LIBXSMM_EXTERN_C typedef struct libxsmm_jump_source_struct { unsigned int instr_type[32]; unsigned int instr_addr[32]; unsigned int ref_count; } libxsmm_jump_source; /* structure for tracking arbitrary jump labels in assembly code */ LIBXSMM_EXTERN_C typedef struct libxsmm_jump_label_tracker_struct { unsigned int label_address[32]; libxsmm_jump_source label_source[32]; } libxsmm_jump_label_tracker; /* compressed meltw reduce structure */ typedef enum libxsmm_meltw_comp_redu_flags { LIBXSMM_MELTW_COMP_FLAG_REDUCE_NONE = 0, LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD = 1, LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_MAX = 2, LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_MUL = 3, LIBXSMM_MELTW_COMP_FLAG_REDUCE_ROWS = 4, LIBXSMM_MELTW_COMP_FLAG_REDUCE_COLS = 5, LIBXSMM_MELTW_COMP_FLAG_REDUCE_ELTS = 6, LIBXSMM_MELTW_COMP_FLAG_REDUCE_ELTS_SQUARED = 7, LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD_ROWS = 8, LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD_COLS = 9 } libxsmm_meltw_comp_redu_flags; /* compressed meltw scale structure */ typedef enum libxsmm_meltw_comp_scal_flags { LIBXSMM_MELTW_COMP_FLAG_SCALE_NONE = 0, LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT = 1, LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT = 2, LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS = 3, LIBXSMM_MELTW_COMP_FLAG_SCALE_ROWS = 4, 
LIBXSMM_MELTW_COMP_FLAG_SCALE_COLS = 5, LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ROWS = 6, LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT_ROWS = 7, LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_ROWS = 8, LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ROWS = 9, LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_SHIFT_ROWS = 10, LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ADD_BIAS_ROWS = 11, LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_ROWS = 12, LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_COLS = 13, LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT_COLS = 14, LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_COLS = 15, LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_COLS = 16, LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_SHIFT_COLS = 17, LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ADD_BIAS_COLS = 18, LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_COLS = 19 } libxsmm_meltw_comp_scal_flags; /* compressed metlw cvta strcuture */ typedef enum libxsmm_meltw_comp_cvta_flags { LIBXSMM_MELTW_COMP_FLAG_CVTA_NONE = 0, LIBXSMM_MELTW_COMP_FLAG_CVTA_FUSE_RELU = 1, LIBXSMM_MELTW_COMP_FLAG_CVTA_FUSE_TANH = 2, LIBXSMM_MELTW_COMP_FLAG_CVTA_FUSE_SIGM = 3 } libxsmm_meltw_comp_cvta_flags; /* compressed meltw acvt structure */ typedef enum libxsmm_meltw_comp_acvt_flags { LIBXSMM_MELTW_COMP_FLAG_ACVT_NONE = 0, LIBXSMM_MELTW_COMP_FLAG_ACVT_FUSE_TANH = 1, LIBXSMM_MELTW_COMP_FLAG_ACVT_FUSE_SIGM = 2 } libxsmm_meltw_comp_acvt_flags; /* compressed meltw cbiasact strcuture */ typedef enum libxsmm_meltw_comp_cbiasact_flags { LIBXSMM_MELTW_COMP_FLAG_CBIASACT_NONE = 0, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS = 1, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_ACT_RELU = 2, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_ACT_TANH = 3, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_ACT_SIGM = 4, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_ACT_GELU = 5, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_OVERWRITE_C = 6, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_RELU = 7, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_TANH = 8, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_SIGM = 9, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_GELU = 10, 
LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_RELU_OVERWRITE_C = 11, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_TANH_OVERWRITE_C = 12, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_SIGM_OVERWRITE_C = 13, LIBXSMM_MELTW_COMP_FLAG_CBIASACT_COLBIAS_ACT_GELU_OVERWRITE_C = 14 } libxsmm_meltw_comp_cbiasact_flags; LIBXSMM_API_INTERN void libxsmm_reset_loop_label_tracker( libxsmm_loop_label_tracker* io_loop_label_tracker ); LIBXSMM_API_INTERN void libxsmm_reset_jump_label_tracker( libxsmm_jump_label_tracker* io_jump_lable_tracker ); LIBXSMM_API_INTERN void libxsmm_get_x86_gp_reg_name( const unsigned int i_gp_reg_number, char* o_gp_reg_name, const int i_gp_reg_name_max_length ); LIBXSMM_API_INTERN unsigned int libxsmm_check_x86_gp_reg_callee_save( const unsigned int i_gp_reg_number ); LIBXSMM_API_INTERN void libxsmm_get_x86_instr_name( const unsigned int i_instr_number, char* o_instr_name, const int i_instr_name_max_length ); LIBXSMM_API_INTERN void libxsmm_reset_x86_gp_reg_mapping( libxsmm_gp_reg_mapping* io_gp_reg_mapping ); LIBXSMM_API_INTERN unsigned int libxsmm_is_x86_vec_instr_single_precision( const unsigned int i_instr_number ); /* some string manipulation helper needed to generated code */ LIBXSMM_API_INTERN void libxsmm_append_code_as_string( libxsmm_generated_code* io_generated_code, const char* i_code_to_append, const int i_append_length ); LIBXSMM_API_INTERN void libxsmm_close_function( libxsmm_generated_code* io_generated_code ); LIBXSMM_API_INTERN void libxsmm_mmfunction_signature( libxsmm_generated_code* io_generated_code, const char* i_routine_name, const libxsmm_gemm_descriptor* i_xgemm_desc ); LIBXSMM_API_INTERN void libxsmm_generator_isa_check_header( libxsmm_generated_code* io_generated_code ); LIBXSMM_API_INTERN void libxsmm_generator_isa_check_footer( libxsmm_generated_code* io_generated_code ); LIBXSMM_API_INTERN void libxsmm_handle_error( libxsmm_generated_code* io_generated_code, const unsigned int i_error_code, const char* context, int emit_message 
); LIBXSMM_API_INTERN unsigned int libxsmm_compute_equalized_blocking( unsigned int i_size, unsigned int i_max_block, unsigned int* o_range_1, unsigned int* o_block_1, unsigned int* o_range_2, unsigned int* o_block_2 ); /** helper functions for compressing and decompressing meltw flags */ LIBXSMM_API_INTERN libxsmm_meltw_comp_redu_flags libxsmm_get_meltw_comp_redu_flags( libxsmm_meltw_redu_flags flags ); LIBXSMM_API_INTERN libxsmm_meltw_redu_flags libxsmm_get_meltw_redu_flags( libxsmm_meltw_comp_redu_flags flags ); LIBXSMM_API_INTERN libxsmm_meltw_comp_scal_flags libxsmm_get_meltw_comp_scal_flags( libxsmm_meltw_scal_flags flags ); LIBXSMM_API_INTERN libxsmm_meltw_scal_flags libxsmm_get_meltw_scal_flags( libxsmm_meltw_comp_scal_flags flags ); LIBXSMM_API_INTERN libxsmm_meltw_comp_cvta_flags libxsmm_get_meltw_comp_cvta_flags( libxsmm_meltw_cvta_flags flags ); LIBXSMM_API_INTERN libxsmm_meltw_cvta_flags libxsmm_get_meltw_cvta_flags( libxsmm_meltw_comp_cvta_flags flags ); LIBXSMM_API_INTERN libxsmm_meltw_comp_acvt_flags libxsmm_get_meltw_comp_acvt_flags( libxsmm_meltw_acvt_flags flags ); LIBXSMM_API_INTERN libxsmm_meltw_acvt_flags libxsmm_get_meltw_acvt_flags( libxsmm_meltw_comp_acvt_flags flags ); LIBXSMM_API_INTERN libxsmm_meltw_comp_cbiasact_flags libxsmm_get_meltw_comp_cbiasact_flags( libxsmm_meltw_cbiasact_flags flags ); LIBXSMM_API_INTERN libxsmm_meltw_cbiasact_flags libxsmm_get_meltw_cbiasact_flags( libxsmm_meltw_comp_cbiasact_flags flags ); #endif /* GENERATOR_COMMON_H */ libxsmm-1.17/src/generator_gemm.c000066400000000000000000000310651415223013700170460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "generator_common.h" #include "generator_gemm_common.h" #include "generator_gemm_sse3_avx_avx2_avx512.h" #include "generator_gemm_noarch.h" #include "libxsmm_main.h" LIBXSMM_API void libxsmm_generator_gemm_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc ) { /* apply the alignment override */ libxsmm_gemm_descriptor l_xgemm_desc_mod = *i_xgemm_desc; unsigned int l_vector_length = 1; /* determining vector length depending on architecture and precision */ /* @TODO fix me */ if ( io_generated_code->arch <= LIBXSMM_X86_GENERIC ) { /* nothing todo */ } else if ( ( io_generated_code->arch <= LIBXSMM_X86_SSE4 ) && LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype ) ) { l_vector_length = 2; } else if ( ( io_generated_code->arch <= LIBXSMM_X86_SSE4 ) && LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype ) ) { l_vector_length = 4; } else if ( ( io_generated_code->arch <= LIBXSMM_X86_AVX2 ) && LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype ) ) { l_vector_length = 4; } else if ( ( io_generated_code->arch <= LIBXSMM_X86_AVX2 ) && LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype ) ) { l_vector_length = 8; } else if ( ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) && LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype ) ) { l_vector_length = 8; } else if ( ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) && LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype ) ) { l_vector_length = 16; } else if ( ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) && ( 
io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch != LIBXSMM_X86_AVX512_MIC ) && ( LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype ) ) ) { l_vector_length = 16; /* some checks as we cannot mask everything */ if ( (l_xgemm_desc_mod.k % 8 != 0) && (io_generated_code->arch == LIBXSMM_X86_AVX512_KNM) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH_PREC ); return; } else if (l_xgemm_desc_mod.k % 2 != 0) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH_PREC ); return; } l_xgemm_desc_mod.k = l_xgemm_desc_mod.k/2; l_xgemm_desc_mod.ldb = l_xgemm_desc_mod.ldb/2; } else if ( ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) && ( io_generated_code->arch >= LIBXSMM_X86_AVX512_CORE ) && ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype ) ) ) { l_vector_length = 16; /* some checks as we cannot mask everything */ if ( (l_xgemm_desc_mod.k % 4 != 0) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH_PREC ); return; } l_xgemm_desc_mod.k = l_xgemm_desc_mod.k/4; l_xgemm_desc_mod.ldb = l_xgemm_desc_mod.ldb/4; } else if ( ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) && ( io_generated_code->arch >= LIBXSMM_X86_AVX512_CORE ) && LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype ) ) { l_vector_length = 16; /* some checks as we cannot mask everything */ if ( (l_xgemm_desc_mod.k % 2 != 0) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH_PREC ); return; } l_xgemm_desc_mod.k = l_xgemm_desc_mod.k/2; l_xgemm_desc_mod.ldb = l_xgemm_desc_mod.ldb/2; } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH_PREC ); return; } /* check LDA */ if ( l_xgemm_desc_mod.lda < l_xgemm_desc_mod.m ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA ); return; } /* check LDB */ if ( (l_xgemm_desc_mod.flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { if ( l_xgemm_desc_mod.ldb < l_xgemm_desc_mod.n ) { LIBXSMM_HANDLE_ERROR( 
io_generated_code, LIBXSMM_ERR_LDB_TRANS ); return; } } else { if ( l_xgemm_desc_mod.ldb < l_xgemm_desc_mod.k ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDB ); return; } } /* check LDC */ if ( l_xgemm_desc_mod.ldc < l_xgemm_desc_mod.m ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDC ); return; } /* check for trans B cases which are not supported in the generator */ if ( (l_xgemm_desc_mod.flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { if ( (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype )) || (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype )) || (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype )) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_TRANS_B ); return; } else { /* we are fine, we have transpose support */ } } /* check for VNNI flag being set in case of low precision GEMM */ if ( ( LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype ) ) || ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype ) ) || ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( l_xgemm_desc_mod.datatype ) ) ) { if ( (l_xgemm_desc_mod.flags & LIBXSMM_GEMM_FLAG_VNNI_B) > 0 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_VNNI_B ); return; } if ( (l_xgemm_desc_mod.flags & LIBXSMM_GEMM_FLAG_VNNI_A) == 0 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_VNNI_A ); return; } } /* check if alignment is not possible */ if ( 0 != (l_xgemm_desc_mod.lda % l_vector_length) ) { l_xgemm_desc_mod.flags &= ~LIBXSMM_GEMM_FLAG_ALIGN_A; } if ( 0 != (l_xgemm_desc_mod.ldc % l_vector_length) ) { l_xgemm_desc_mod.flags &= ~LIBXSMM_GEMM_FLAG_ALIGN_C; } if ( io_generated_code->arch <= LIBXSMM_X86_GENERIC ) { /* call actual kernel generation with revised parameters */ libxsmm_generator_gemm_noarch_kernel( io_generated_code, &l_xgemm_desc_mod ); } else if ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) { /* call actual kernel 
generation with revised parameters */ libxsmm_generator_gemm_sse3_avx_avx2_avx512_kernel( io_generated_code, &l_xgemm_desc_mod ); } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); return; } } LIBXSMM_API void libxsmm_generator_gemm_inlineasm( const char* i_file_out, const char* i_routine_name, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch ) { /* init generated code object */ libxsmm_generated_code l_generated_code; l_generated_code.generated_code = NULL; l_generated_code.buffer_size = 0; l_generated_code.code_size = 0; l_generated_code.code_type = 0; l_generated_code.last_error = 0; l_generated_code.arch = 0; l_generated_code.sf_size = 0; /* set arch */ if ( strcmp(i_arch, "wsm") == 0 ) { l_generated_code.arch = LIBXSMM_X86_SSE4; } else if ( strcmp(i_arch, "snb") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX; } else if ( strcmp(i_arch, "hsw") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX2; } else if ( strcmp(i_arch, "knl") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX512_MIC; } else if ( strcmp(i_arch, "knm") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX512_KNM; } else if ( strcmp(i_arch, "skx") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX512_CORE; } else if ( strcmp(i_arch, "clx") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX512_CLX; } else if ( strcmp(i_arch, "cpx") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX512_CPX; } else { l_generated_code.arch = LIBXSMM_X86_GENERIC; } /* add signature to code string */ libxsmm_mmfunction_signature( &l_generated_code, i_routine_name, i_xgemm_desc ); /* add instruction set mismatch check to code, header */ libxsmm_generator_isa_check_header( &l_generated_code ); /* generate the actual kernel code for current description depending on the architecture */ libxsmm_generator_gemm_kernel( &l_generated_code, i_xgemm_desc ); /* add instruction set mismatch check to code, footer */ libxsmm_generator_isa_check_footer( &l_generated_code ); /* add flop counter for debug 
compilation */ libxsmm_generator_gemm_add_flop_counter( &l_generated_code, i_xgemm_desc ); /* close current function */ libxsmm_close_function( &l_generated_code ); /* check for errors during code generation */ if ( l_generated_code.last_error != 0 ) { LIBXSMM_HANDLE_ERROR_VERBOSE( &l_generated_code, l_generated_code.last_error ); return; } /* append code to source file */ { FILE *const l_file_handle = fopen( i_file_out, "a" ); if ( l_file_handle != NULL ) { assert(l_generated_code.generated_code != NULL); fputs( (const char*)l_generated_code.generated_code, l_file_handle ); fclose( l_file_handle ); } else { fprintf(stderr, "LIBXSMM ERROR libxsmm_generator_gemm_inlineasm could not write to into destination source file\n"); return; } } /* free code memory */ free( l_generated_code.generated_code ); } LIBXSMM_API void libxsmm_generator_gemm_directasm(const char* i_file_out, const char* i_routine_name, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch ) { /* init generated code object */ libxsmm_generated_code l_generated_code; l_generated_code.generated_code = NULL; l_generated_code.buffer_size = 0; l_generated_code.code_size = 0; l_generated_code.code_type = 1; l_generated_code.last_error = 0; l_generated_code.arch = 0; l_generated_code.sf_size = 0; /* set arch */ if ( strcmp(i_arch, "wsm") == 0 ) { l_generated_code.arch = LIBXSMM_X86_SSE4; } else if ( strcmp(i_arch, "snb") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX; } else if ( strcmp(i_arch, "hsw") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX2; } else if ( strcmp(i_arch, "knl") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX512_MIC; } else if ( strcmp(i_arch, "knm") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX512_KNM; } else if ( strcmp(i_arch, "skx") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX512_CORE; } else if ( strcmp(i_arch, "clx") == 0 ) { l_generated_code.arch = LIBXSMM_X86_AVX512_CLX; } else if ( strcmp(i_arch, "cpx") == 0 ) { l_generated_code.arch = 
LIBXSMM_X86_AVX512_CPX; } else { l_generated_code.arch = LIBXSMM_X86_GENERIC; } /* check if we are not noarch */ if ( strcmp( i_arch, "noarch" ) == 0 ) { fprintf(stderr, "LIBXSMM ERROR, libxsmm_generator_gemm_direct: we cannot create ASM when noarch is specified!\n"); return; } /* add signature to code string */ libxsmm_mmfunction_signature( &l_generated_code, i_routine_name, i_xgemm_desc ); /* generate the actual kernel code for current description depending on the architecture */ libxsmm_generator_gemm_kernel( &l_generated_code, i_xgemm_desc ); /* check for errors during code generation */ if ( l_generated_code.last_error != 0 ) { LIBXSMM_HANDLE_ERROR_VERBOSE( &l_generated_code, l_generated_code.last_error ); return; } /* append code to source file */ { FILE *const l_file_handle = fopen( i_file_out, "w" ); if ( l_file_handle != NULL ) { assert(l_generated_code.generated_code != NULL); fputs( (const char*)l_generated_code.generated_code, l_file_handle ); fclose( l_file_handle ); } else { fprintf(stderr, "LIBXSMM ERROR, libxsmm_generator_gemm_direct: could not write to into destination source file!\n"); return; } } /* free code memory */ free( l_generated_code.generated_code ); } libxsmm-1.17/src/generator_gemm_avx2_microkernel.c000066400000000000000000000305721415223013700224020ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/
#include "generator_gemm_avx2_microkernel.h"
#include "generator_x86_instructions.h"
#include "libxsmm_main.h"

/* Emit the innermost AVX2 GEMM micro-kernel (one k-step) into io_generated_code:
 * loads column vectors of A, loads/broadcasts elements of B, and issues one
 * vector multiply/FMA per (m,n) register-block entry, advancing the A pointer
 * by one column (lda) and - when i_offset == -1 - the B pointer by one element.
 *
 * i_gp_reg_mapping:      GP registers holding the A/B/C pointers
 * i_micro_kernel_config: instruction selection (moves, vmul, alu add),
 *                        vector length/name, datatype size
 * i_xgemm_desc:          GEMM shape (lda/ldb) and flags (TRANS_B)
 * i_m_blocking:          rows handled per iteration; must be a multiple of the
 *                        configured vector length
 * i_n_blocking:          columns handled per iteration; must be in [1,3]
 * i_offset:              fixed element offset into B, or -1 to use/advance the
 *                        running B pointer */
LIBXSMM_API_INTERN
void libxsmm_generator_gemm_avx2_microkernel( libxsmm_generated_code*            io_generated_code,
                                              const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                              const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                              const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                              const unsigned int                 i_m_blocking,
                                              const unsigned int                 i_n_blocking,
                                              const int                          i_offset ) {
  /* deriving register blocking from kernel config */
  unsigned int l_m_blocking = i_m_blocking/i_micro_kernel_config->vector_length;
  /* register blocking counter in n */
  unsigned int l_n = 0;
  /* register blocking counter in m */
  unsigned int l_m = 0;
  /* start register of accumulator; accumulators occupy the top of the 16 ymm registers */
  unsigned int l_vec_reg_acc_start = 16 - (i_n_blocking * l_m_blocking);
  /* temp variable for b-offset to handle no-trans/trans B */
  int l_b_offset = 0;

  /* check that m_blocking is a multiple of vlen and that n_blocking is valid */
  if ( (i_n_blocking > 3) || (i_n_blocking < 1) ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK );
    return;
  }
  if ( i_m_blocking % i_micro_kernel_config->vector_length != 0 ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_M_BLOCK );
    return;
  }

  if (l_m_blocking == 1) {
    /* single-m-register case: A column is loaded once into register i_n_blocking */
    /* load column vectors of A */
    libxsmm_x86_instruction_vec_move( io_generated_code,
                                      i_micro_kernel_config->instruction_set,
                                      i_micro_kernel_config->a_vmove_instruction,
                                      i_gp_reg_mapping->gp_reg_a,
                                      LIBXSMM_X86_GP_REG_UNDEF, 0,
                                      0,
                                      i_micro_kernel_config->vector_name,
                                      i_n_blocking, 0, 1, 0 );
    /* loop over columns of B */
    for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
      /* post increment of a pointer early */
      if ( l_n == 0 ) {
        libxsmm_x86_instruction_alu_imm( io_generated_code,
                                         i_micro_kernel_config->alu_add_instruction,
                                         i_gp_reg_mapping->gp_reg_a,
                                         (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
      }
      /* different ways of using B */
      if ( i_offset != (-1) ) {
        /* handle trans B */
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset * i_xgemm_desc->ldb) + (l_n * i_micro_kernel_config->datatype_size);
        } else {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset) + (i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size);
        }
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_b_offset,
                                          i_micro_kernel_config->vector_name,
                                          l_n, 0, 1, 0 );
      } else {
        /* handle trans B */
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = l_n * i_micro_kernel_config->datatype_size;
        } else {
          l_b_offset = i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size;
        }
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_b_offset,
                                          i_micro_kernel_config->vector_name,
                                          l_n, 0, 1, 0 );
        /* after the last column was read, advance the running B pointer by one element (row for trans B) */
        if ( l_n == (i_n_blocking -1) ) {
          /* handle trans B */
          if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
            l_b_offset = i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size;
          } else {
            l_b_offset = i_micro_kernel_config->datatype_size;
          }
          libxsmm_x86_instruction_alu_imm( io_generated_code,
                                           i_micro_kernel_config->alu_add_instruction,
                                           i_gp_reg_mapping->gp_reg_b,
                                           l_b_offset );
        }
      }
      /* issue fma */
      libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                               i_micro_kernel_config->instruction_set,
                                               i_micro_kernel_config->vmul_instruction,
                                               i_micro_kernel_config->vector_name,
                                               i_n_blocking,
                                               l_n,
                                               l_vec_reg_acc_start + l_n );
    }
  } else {
    /* broadcast from B -> into vec registers 0 to i_n_blocking */
    if ( i_offset != (-1) ) {
      for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
        /* handle trans B */
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset * i_xgemm_desc->ldb) + (l_n * i_micro_kernel_config->datatype_size);
        } else {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset) + (i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size);
        }
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_b_offset,
                                          i_micro_kernel_config->vector_name,
                                          l_n, 0, 1, 0 );
      }
    } else {
      for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
        /* handle trans B */
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = l_n * i_micro_kernel_config->datatype_size;
        } else {
          l_b_offset = i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size;
        }
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_b_offset,
                                          i_micro_kernel_config->vector_name,
                                          l_n, 0, 1, 0 );
      }
      /* handle trans B: advance the running B pointer by one element (row for trans B) */
      if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
        l_b_offset = i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size;
      } else {
        l_b_offset = i_micro_kernel_config->datatype_size;
      }
      libxsmm_x86_instruction_alu_imm( io_generated_code,
                                       i_micro_kernel_config->alu_add_instruction,
                                       i_gp_reg_mapping->gp_reg_b,
                                       l_b_offset );
    }

    if (l_m_blocking == 4) {
      /* full m-blocking: A loads are interleaved with the FMAs, reusing register i_n_blocking */
      /* load column vectors of A and multiply with all broadcasted row entries of B */
      for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->a_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_a,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m,
                                          i_micro_kernel_config->vector_name,
                                          i_n_blocking, 0, 1, 0 );
        for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
          /* post increment early */
          if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) {
            libxsmm_x86_instruction_alu_imm( io_generated_code,
                                             i_micro_kernel_config->alu_add_instruction,
                                             i_gp_reg_mapping->gp_reg_a,
                                             (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
          }
          /* issue fma */
          libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                   i_micro_kernel_config->instruction_set,
                                                   i_micro_kernel_config->vmul_instruction,
                                                   i_micro_kernel_config->vector_name,
                                                   i_n_blocking,
                                                   l_n,
                                                   l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
        }
      }
    } else {
      /* smaller m-blocking: enough registers to load all A columns upfront into
       * registers i_n_blocking .. i_n_blocking+l_m_blocking-1 */
      /* load column vectors of A and multiply with all broadcasted row entries of B */
      for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->a_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_a,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m,
                                          i_micro_kernel_config->vector_name,
                                          i_n_blocking+l_m, 0, 1, 0 );
      }
      for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
        for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
          /* post increment early */
          if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) {
            libxsmm_x86_instruction_alu_imm( io_generated_code,
                                             i_micro_kernel_config->alu_add_instruction,
                                             i_gp_reg_mapping->gp_reg_a,
                                             (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
          }
          /* issue fma */
          libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                   i_micro_kernel_config->instruction_set,
                                                   i_micro_kernel_config->vmul_instruction,
                                                   i_micro_kernel_config->vector_name,
                                                   i_n_blocking+l_m,
                                                   l_n,
                                                   l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
        }
      }
    }
  }
}
libxsmm-1.17/src/generator_gemm_avx2_microkernel.h000066400000000000000000000032001415223013700223730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_GEMM_AVX2_MICROKERNEL_H #define GENERATOR_GEMM_AVX2_MICROKERNEL_H #include "generator_common.h" #include "generator_gemm_common.h" LIBXSMM_API_INTERN void libxsmm_generator_gemm_avx2_microkernel( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_n_blocking, const int i_offset ); #endif /* GENERATOR_GEMM_AVX2_MICROKERNEL_H */ libxsmm-1.17/src/generator_gemm_avx512_microkernel.c000066400000000000000000001731641415223013700225550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas (Intel Corp.) 
******************************************************************************/
#include "generator_gemm_avx512_microkernel.h"
#include "generator_x86_instructions.h"
#include "libxsmm_main.h"

/* Emit the innermost AVX-512 GEMM micro-kernel (one k-step) without the
 * fused-broadcast form: B elements are loaded/broadcast into low registers,
 * A columns are loaded (masked on the last m-register via use_masking_a_c)
 * and one vmul/FMA is issued per (m,n) entry; the A pointer is advanced by
 * one column (lda) and, when i_offset == -1, the B pointer by one element.
 * For signed/unsigned I8 the FMA source order is swapped depending on the
 * A_UNSIGNED/B_UNSIGNED flag. Various software prefetches (BL1, AL2 variants,
 * batch-reduce A lookahead) are emitted depending on descriptor flags.
 *
 * i_m_blocking: rows per iteration (rounded up to whole vector registers)
 * i_n_blocking: columns per iteration, in [1,30]
 * i_offset:     fixed element offset into B, or -1 to use/advance the
 *               running B pointer */
LIBXSMM_API_INTERN
void libxsmm_generator_gemm_avx512_microkernel_nofsdbcst( libxsmm_generated_code*            io_generated_code,
                                                          const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                                          const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                                          const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                                          const unsigned int                 i_m_blocking,
                                                          const unsigned int                 i_n_blocking,
                                                          const int                          i_offset ) {
  /* deriving register blocking from kernel config; round up when m is not a multiple of the vector length */
  unsigned int l_m_blocking = ( i_m_blocking % i_micro_kernel_config->vector_length == 0 ) ? i_m_blocking/i_micro_kernel_config->vector_length : (i_m_blocking/i_micro_kernel_config->vector_length)+1;
  /* register blocking counter in n */
  unsigned int l_n = 0;
  /* register blocking counter in m */
  unsigned int l_m = 0;
  /* start register of accumulator; accumulators occupy the top of the vector register file */
  unsigned int l_vec_reg_acc_start = i_micro_kernel_config->vector_reg_count - (i_n_blocking * l_m_blocking);
  /* temp variable for b-offset to handle no-trans/trans B */
  int l_b_offset = 0;

#if !defined(NDEBUG)
  if ( (i_n_blocking > 30) || (i_n_blocking < 1) ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK );
    return;
  }
  if ( (l_m_blocking < 1) || (l_m_blocking > 4) ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_M_BLOCK );
    return;
  }
  if ( (((l_m_blocking*i_n_blocking) + l_m_blocking + 1) > 32) && (i_n_blocking < 7) ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_REG_BLOCK );
    return;
  }
#endif

  /* Special case that arises in GEMMS from Resnet50 layers */
  /* 28 accumulators + 4 temporaries: B columns are processed in three groups
   * (0..2, 3..5, 6) so B broadcasts fit into registers 0..2 and A reuses register 3 */
  if (i_n_blocking == 7 && l_m_blocking == 4) {
    /* group 1: broadcast B columns 0..2 into registers 0..2 */
    if ( i_offset != (-1) ) {
      for ( l_n = 0; l_n < 3; l_n++ ) {
        /* handle trans B */
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset * i_xgemm_desc->ldb) + (l_n * i_micro_kernel_config->datatype_size);
        } else {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset) + (i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size);
        }
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_b_offset,
                                          i_micro_kernel_config->vector_name,
                                          l_n, 0, 1, 0 );
      }
      /* BL1 prefetch: touch B 16 columns ahead */
      if ( i_xgemm_desc->prefetch & LIBXSMM_GEMM_PREFETCH_BL1 ) {
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset * i_xgemm_desc->ldb);
        } else {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset);
        }
        libxsmm_x86_instruction_prefetch(io_generated_code, LIBXSMM_X86_INSTR_PREFETCHT0, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset + 16 * i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size);
      }
    } else {
      for ( l_n = 0; l_n < 3; l_n++ ) {
        /* handle trans B */
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = l_n * i_micro_kernel_config->datatype_size;
        } else {
          l_b_offset = i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size;
        }
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_b_offset,
                                          i_micro_kernel_config->vector_name,
                                          l_n, 0, 1, 0 );
      }
      if (i_xgemm_desc->prefetch & LIBXSMM_GEMM_PREFETCH_BL1) {
        libxsmm_x86_instruction_prefetch(io_generated_code, LIBXSMM_X86_INSTR_PREFETCHT0, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, 16 * i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size);
      }
    }
    /* load column vectors of A and multiply with all broadcasted row entries of B */
    for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
      libxsmm_x86_instruction_vec_move( io_generated_code,
                                        i_micro_kernel_config->instruction_set,
                                        i_micro_kernel_config->a_vmove_instruction,
                                        i_gp_reg_mapping->gp_reg_a,
                                        LIBXSMM_X86_GP_REG_UNDEF, 0,
                                        (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m,
                                        i_micro_kernel_config->vector_name,
                                        3,
                                        ( l_m == (l_m_blocking - 1) ) ? i_micro_kernel_config->use_masking_a_c : 0, 1, 0 );
      /* In case of batch reduce try to prefetch a few more columns ahead... */
      if ((LIBXSMM_GEMM_PRECISION_I8 != LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype)) && (LIBXSMM_GEMM_PRECISION_BF16 != LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype)) && ((i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS) || (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET) || (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE))) {
        unsigned int pf_a_cols_ahead = 16;
        /* shorter lookahead for large leading dimensions */
        if (i_xgemm_desc->lda == 1024) {
          pf_a_cols_ahead = 4;
        }
        libxsmm_x86_instruction_prefetch( io_generated_code, LIBXSMM_X86_INSTR_PREFETCHT0, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m + pf_a_cols_ahead * i_xgemm_desc->lda * i_micro_kernel_config->datatype_size);
      }
      for ( l_n = 0; l_n < 3; l_n++ ) {
        /* issue fma; for I8 the operand order depends on which side is unsigned */
        if ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
          if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_A_UNSIGNED) > 0 ) {
            libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, l_n, 3, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
          } else if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_B_UNSIGNED) > 0 ) {
            libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, 3, l_n, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
          } else {
            /* should not happen */
          }
        } else {
          libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, 3, l_n, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
        }
      }
    }
    /* group 2: broadcast B columns 3..5 into registers 0..2 */
    if ( i_offset != (-1) ) {
      for ( l_n = 3; l_n < 6; l_n++ ) {
        /* handle trans B */
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset * i_xgemm_desc->ldb) + (l_n * i_micro_kernel_config->datatype_size);
        } else {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset) + (i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size);
        }
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_b_offset,
                                          i_micro_kernel_config->vector_name,
                                          l_n-3, 0, 1, 0 );
      }
    } else {
      for ( l_n = 3; l_n < 6; l_n++ ) {
        /* handle trans B */
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = l_n * i_micro_kernel_config->datatype_size;
        } else {
          l_b_offset = i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size;
        }
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_b_offset,
                                          i_micro_kernel_config->vector_name,
                                          l_n-3, 0, 1, 0 );
      }
    }
    for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
      libxsmm_x86_instruction_vec_move( io_generated_code,
                                        i_micro_kernel_config->instruction_set,
                                        i_micro_kernel_config->a_vmove_instruction,
                                        i_gp_reg_mapping->gp_reg_a,
                                        LIBXSMM_X86_GP_REG_UNDEF, 0,
                                        (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m,
                                        i_micro_kernel_config->vector_name,
                                        3,
                                        ( l_m == (l_m_blocking - 1) ) ? i_micro_kernel_config->use_masking_a_c : 0, 1, 0 );
      for ( l_n = 3; l_n < 6; l_n++ ) {
        /* issue fma */
        if ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
          if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_A_UNSIGNED) > 0 ) {
            libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, l_n-3, 3, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
          } else if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_B_UNSIGNED) > 0 ) {
            libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, 3, l_n-3, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
          } else {
            /* should not happen */
          }
        } else {
          libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, 3, l_n-3, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
        }
      }
    }
    /* group 3: broadcast B column 6 into register 0; advance pointers here */
    if ( i_offset != (-1) ) {
      for ( l_n = 6; l_n < 7; l_n++ ) {
        /* handle trans B */
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset * i_xgemm_desc->ldb) + (l_n * i_micro_kernel_config->datatype_size);
        } else {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset) + (i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size);
        }
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_b_offset,
                                          i_micro_kernel_config->vector_name,
                                          l_n-6, 0, 1, 0 );
      }
    } else {
      for ( l_n = 6; l_n < 7; l_n++ ) {
        /* handle trans B */
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = l_n * i_micro_kernel_config->datatype_size;
        } else {
          l_b_offset = i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size;
        }
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_b_offset,
                                          i_micro_kernel_config->vector_name,
                                          l_n-6, 0, 1, 0 );
      }
      /* handle trans B: advance the running B pointer by one element (row for trans B) */
      if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
        l_b_offset = i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size;
      } else {
        l_b_offset = i_micro_kernel_config->datatype_size;
      }
      libxsmm_x86_instruction_alu_imm( io_generated_code,
                                       i_micro_kernel_config->alu_add_instruction,
                                       i_gp_reg_mapping->gp_reg_b,
                                       l_b_offset );
    }
    for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
      libxsmm_x86_instruction_vec_move( io_generated_code,
                                        i_micro_kernel_config->instruction_set,
                                        i_micro_kernel_config->a_vmove_instruction,
                                        i_gp_reg_mapping->gp_reg_a,
                                        LIBXSMM_X86_GP_REG_UNDEF, 0,
                                        (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m,
                                        i_micro_kernel_config->vector_name,
                                        3,
                                        ( l_m == (l_m_blocking - 1) ) ? i_micro_kernel_config->use_masking_a_c : 0, 1, 0 );
      for ( l_n = 6; l_n < 7; l_n++ ) {
        /* post increment early */
        if ( (l_m == (l_m_blocking-1)) && (l_n == 6) ) {
          libxsmm_x86_instruction_alu_imm( io_generated_code,
                                           i_micro_kernel_config->alu_add_instruction,
                                           i_gp_reg_mapping->gp_reg_a,
                                           (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
        }
        /* issue fma */
        if ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
          if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_A_UNSIGNED) > 0 ) {
            libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, l_n-6, 3, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
          } else if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_B_UNSIGNED) > 0 ) {
            libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, 3, l_n-6, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
          } else {
            /* should not happen */
          }
        } else {
          libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, 3, l_n-6, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
        }
      }
    }
  } else {
    /* general case: all A columns fit into registers 1..l_m_blocking, B is broadcast into register 0 */
    /* load column vectors of A upfront */
    for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
      libxsmm_x86_instruction_vec_move( io_generated_code,
                                        i_micro_kernel_config->instruction_set,
                                        i_micro_kernel_config->a_vmove_instruction,
                                        i_gp_reg_mapping->gp_reg_a,
                                        LIBXSMM_X86_GP_REG_UNDEF, 0,
                                        (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m,
                                        i_micro_kernel_config->vector_name,
                                        1+l_m,
                                        ( l_m == (l_m_blocking - 1) ) ? i_micro_kernel_config->use_masking_a_c : 0, 1, 0 );
      /* current A prefetch, next rows for the current column */
      if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2_AHEAD || i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD ) {
        libxsmm_x86_instruction_prefetch( io_generated_code,
                                          LIBXSMM_X86_INSTR_PREFETCHT1,
                                          i_gp_reg_mapping->gp_reg_a,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          ((i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m) + (64 * l_m_blocking) );
      }
      /* prefetch a different A matrix provided by the prefetch pointers */
      if ( (i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2) || (i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C) ) {
        libxsmm_x86_instruction_prefetch( io_generated_code,
                                          LIBXSMM_X86_INSTR_PREFETCHT1,
                                          i_gp_reg_mapping->gp_reg_a_prefetch,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m );
      }
    }
    for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
      if ( i_offset != (-1) ) {
        /* handle trans B */
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset * i_xgemm_desc->ldb) + (l_n * i_micro_kernel_config->datatype_size);
        } else {
          l_b_offset = (i_micro_kernel_config->datatype_size * i_offset) + (i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size);
        }
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_b_offset,
                                          i_micro_kernel_config->vector_name,
                                          0, 0, 1, 0 );
        if (l_n == i_n_blocking - 1) {
          if (i_xgemm_desc->prefetch & LIBXSMM_GEMM_PREFETCH_BL1) {
            if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
              l_b_offset = (i_micro_kernel_config->datatype_size * i_offset * i_xgemm_desc->ldb);
            } else {
              l_b_offset = (i_micro_kernel_config->datatype_size * i_offset);
            }
            libxsmm_x86_instruction_prefetch(io_generated_code, LIBXSMM_X86_INSTR_PREFETCHT0, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset + 16 * i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size);
          }
        }
      } else {
        /* handle trans B */
        if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
          l_b_offset = l_n * i_micro_kernel_config->datatype_size;
        } else {
          l_b_offset = i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size;
        }
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_b_offset,
                                          i_micro_kernel_config->vector_name,
                                          0, 0, 1, 0 );
        if (l_n == i_n_blocking - 1) {
          if (i_xgemm_desc->prefetch & LIBXSMM_GEMM_PREFETCH_BL1) {
            libxsmm_x86_instruction_prefetch(io_generated_code, LIBXSMM_X86_INSTR_PREFETCHT0, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, 16 * i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size);
          }
          /* handle trans B: advance the running B pointer by one element (row for trans B) */
          if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
            l_b_offset = i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size;
          } else {
            l_b_offset = i_micro_kernel_config->datatype_size;
          }
          libxsmm_x86_instruction_alu_imm( io_generated_code,
                                           i_micro_kernel_config->alu_add_instruction,
                                           i_gp_reg_mapping->gp_reg_b,
                                           l_b_offset );
        }
      }
      /* In case of batch reduce try to prefetch a few more columns ahead for A... */
      if ((l_n < l_m_blocking) && (LIBXSMM_GEMM_PRECISION_I8 != LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype)) && (LIBXSMM_GEMM_PRECISION_BF16 != LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype)) && ((i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS) || (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET) || (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE))) {
        unsigned int pf_a_cols_ahead = 16;
        /* shorter lookahead for large leading dimensions */
        if (i_xgemm_desc->lda == 1024) {
          pf_a_cols_ahead = 4;
        }
        libxsmm_x86_instruction_prefetch( io_generated_code, LIBXSMM_X86_INSTR_PREFETCHT0, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_n + pf_a_cols_ahead * i_xgemm_desc->lda * i_micro_kernel_config->datatype_size);
      }
      for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
        /* post increment early */
        if ( (l_m == 0) && (l_n == i_n_blocking-1) ) {
          libxsmm_x86_instruction_alu_imm( io_generated_code,
                                           i_micro_kernel_config->alu_add_instruction,
                                           i_gp_reg_mapping->gp_reg_a,
                                           (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
          /* if we prefetch next A into L2, we need to also increment the prefetch pointer */
          if ( (i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2) || (i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C) ) {
            libxsmm_x86_instruction_alu_imm( io_generated_code,
                                             i_micro_kernel_config->alu_add_instruction,
                                             i_gp_reg_mapping->gp_reg_a_prefetch,
                                             (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
          }
        }
        /* issue fma; for I8 the operand order depends on which side is unsigned */
        if ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
          if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_A_UNSIGNED) > 0 ) {
            libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, 0, 1+l_m, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
          } else if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_B_UNSIGNED) > 0 ) {
            libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, 1+l_m, 0, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
          } else {
            /* should not happen */
          }
        } else {
          libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, 1+l_m, 0, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
        }
      }
    }
  }
}

LIBXSMM_API_INTERN
void libxsmm_generator_gemm_avx512_microkernel_fsdbcst( libxsmm_generated_code*            io_generated_code,
                                                        const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                                        const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                                        const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                                        const unsigned int                 i_n_blocking,
                                                        const unsigned int                 i_k_blocking ) {
  unsigned int l_n;
  unsigned int l_k;
  unsigned int l_n_accs = 0;

#if !defined(NDEBUG)
  if ( i_n_blocking > 30 ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK );
    return;
  }
#endif

  /* compute number of n accumulators to hide FMA latencies */
  if (i_n_blocking >= 12) {
    l_n_accs = 1;
  } else if (i_n_blocking >= 6) {
    l_n_accs = 2;
  } else {
    l_n_accs = 4;
  }
  if ( l_n_accs > i_k_blocking ) {
    l_n_accs = i_k_blocking;
    l_n_accs = (l_n_accs == 0) ?
1 : l_n_accs; } /* xor additional accumulator, if needed */ for ( l_k = 1; l_k < l_n_accs; l_k++) { for ( l_n = 0; l_n < i_n_blocking; l_n++) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, i_micro_kernel_config->vxor_instruction, i_micro_kernel_config->vector_name, i_micro_kernel_config->vector_reg_count - (i_n_blocking*(l_k+1)) + l_n, i_micro_kernel_config->vector_reg_count - (i_n_blocking*(l_k+1)) + l_n, i_micro_kernel_config->vector_reg_count - (i_n_blocking*(l_k+1)) + l_n ); } } /* in case of int8 GEMM on SKX use zmm2 for 16bit 1's */ if ( (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) && (io_generated_code->arch < LIBXSMM_X86_AVX512_CLX) ) { short l_all_ones[32] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *)l_all_ones, "my_int16_ones", i_micro_kernel_config->vector_name, 2 ); } /* apply k blocking */ for ( l_k = 0; l_k < i_k_blocking; l_k++ ) { if ( l_k == 0 ) { /* load A */ libxsmm_x86_instruction_vec_move( io_generated_code, io_generated_code->arch, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, i_xgemm_desc->lda * l_k * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, 0, i_micro_kernel_config->use_masking_a_c, 1, 0 ); /* current A prefetch, next rows for the current column */ if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2_AHEAD || i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD) { libxsmm_x86_instruction_prefetch( io_generated_code, i_micro_kernel_config->prefetch_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_xgemm_desc->lda * l_k + i_micro_kernel_config->datatype_size) + 64 ); } if ( i_k_blocking > 1 ) { /* second A load in first iteration, in case of large blockings -> hiding L1 latencies */ 
libxsmm_x86_instruction_vec_move( io_generated_code, io_generated_code->arch, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, i_xgemm_desc->lda * (l_k+1) * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, 1, i_micro_kernel_config->use_masking_a_c, 1, 0 ); /* current A prefetch, next rows for the current column */ if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2_AHEAD || i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD) { libxsmm_x86_instruction_prefetch( io_generated_code, i_micro_kernel_config->prefetch_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_xgemm_desc->lda * (l_k+1) * i_micro_kernel_config->datatype_size) + 64 ); } } } else if ( l_k < (i_k_blocking - 1) ) { /* pipelined load of A, one k iteration ahead */ libxsmm_x86_instruction_vec_move( io_generated_code, io_generated_code->arch, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, i_xgemm_desc->lda * (l_k+1) * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, (l_k+1)%2, i_micro_kernel_config->use_masking_a_c, 1, 0 ); /* current A prefetch, next rows for the current column */ if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2_AHEAD || i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD ) { libxsmm_x86_instruction_prefetch( io_generated_code, i_micro_kernel_config->prefetch_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_xgemm_desc->lda * (l_k+1) * i_micro_kernel_config->datatype_size) + 64 ); } } /* next A prefetch "same" rows in "same" column, but in a different matrix */ if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2 || i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C) { libxsmm_x86_instruction_prefetch( io_generated_code, i_micro_kernel_config->prefetch_instruction, i_gp_reg_mapping->gp_reg_a_prefetch, LIBXSMM_X86_GP_REG_UNDEF, 0, 
(i_xgemm_desc->lda * l_k * i_micro_kernel_config->datatype_size) ); if ( l_k == (i_k_blocking - 1) ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a_prefetch, i_k_blocking * i_micro_kernel_config->datatype_size * i_xgemm_desc->lda ); } } /* in last k-iteration: advance pointers */ if ( l_k == (i_k_blocking - 1) ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, i_k_blocking * i_micro_kernel_config->datatype_size * i_xgemm_desc->lda ); } /* in case of bfloat16 "prepare" A matrix in registers zmm l_k%2 and zmm3 using FP32 numbers */ if ( (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) && (io_generated_code->arch < LIBXSMM_X86_AVX512_CPX) ) { /* we put "0" elements of A matrix into zmm3 */ libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPSLLD, i_micro_kernel_config->vector_name, l_k%2, 3, LIBXSMM_X86_VEC_REG_UNDEF, 16); /* we put "1" elements of A matrix into l_k%2 zmm*/ libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPSRAD, i_micro_kernel_config->vector_name, l_k%2, l_k%2, LIBXSMM_X86_VEC_REG_UNDEF, 16); libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPSLLD, i_micro_kernel_config->vector_name, l_k%2, l_k%2, LIBXSMM_X86_VEC_REG_UNDEF, 16); } /* compute vectorwidth (A) * column broadcast (B) */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) == 0 ) { for ( l_n = 0; l_n < i_n_blocking; l_n++) { /* determining base, idx and scale values */ unsigned int l_b_reg = i_gp_reg_mapping->gp_reg_b; unsigned int l_b_idx = LIBXSMM_X86_GP_REG_UNDEF; unsigned int l_scale = 0; unsigned int l_disp = (l_k*i_micro_kernel_config->datatype_size)+(l_n*i_xgemm_desc->ldb*i_micro_kernel_config->datatype_size); if ( LIBXSMM_GEMM_PRECISION_F32 
== LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) || LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { libxsmm_x86_instruction_vec_compute_mem( io_generated_code, io_generated_code->arch, i_micro_kernel_config->vmul_instruction, 1, l_b_reg, l_b_idx, l_scale, l_disp, i_micro_kernel_config->vector_name, l_k%2, i_micro_kernel_config->vector_reg_count - (i_n_blocking*((l_k%l_n_accs)+1)) + l_n ); } else if (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { if ( io_generated_code->arch == LIBXSMM_X86_AVX512_CORE ) { libxsmm_x86_instruction_vec_move( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPBROADCASTD, l_b_reg, l_b_idx, l_scale, l_disp, i_micro_kernel_config->vector_name, 3, 0, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPMADDWD, i_micro_kernel_config->vector_name, l_k%2, 3, 3 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPADDD, i_micro_kernel_config->vector_name, 3, i_micro_kernel_config->vector_reg_count - (i_n_blocking*((l_k%l_n_accs)+1)) + l_n, i_micro_kernel_config->vector_reg_count - (i_n_blocking*((l_k%l_n_accs)+1)) + l_n ); } else if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512_CLX ) || ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_x86_instruction_vec_compute_mem( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPDPWSSD, 1, l_b_reg, l_b_idx, l_scale, l_disp, i_micro_kernel_config->vector_name, l_k%2, i_micro_kernel_config->vector_reg_count - (i_n_blocking*((l_k%l_n_accs)+1)) + l_n ); } else { /* shouldn't happen */ } } else if (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { if ( io_generated_code->arch < LIBXSMM_X86_AVX512_CLX ) { /* let's broadcast B into zmm3 */ libxsmm_x86_instruction_vec_move( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPBROADCASTD, l_b_reg, l_b_idx, 
l_scale, l_disp, i_micro_kernel_config->vector_name, 3, 0, 1, 0 ); /* 8 bit mix-sign Mul */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_A_UNSIGNED) > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPMADDUBSW, i_micro_kernel_config->vector_name, 3, l_k%2, 3 ); } else if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_B_UNSIGNED) > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPMADDUBSW, i_micro_kernel_config->vector_name, l_k%2, 3, 3 ); } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH_PREC ); return; } /* 16 bit mul with 1 */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPMADDWD, i_micro_kernel_config->vector_name, 2, 3, 3 ); /* add to accumulator */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPADDD, i_micro_kernel_config->vector_name, 3, i_micro_kernel_config->vector_reg_count - (i_n_blocking*((l_k%l_n_accs)+1)) + l_n, i_micro_kernel_config->vector_reg_count - (i_n_blocking*((l_k%l_n_accs)+1)) + l_n ); } else if ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) { if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_A_UNSIGNED) > 0 ) { libxsmm_x86_instruction_vec_compute_mem( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPDPBUSD, 1, l_b_reg, l_b_idx, l_scale, l_disp, i_micro_kernel_config->vector_name, l_k%2, i_micro_kernel_config->vector_reg_count - (i_n_blocking*((l_k%l_n_accs)+1)) + l_n ); } else if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_B_UNSIGNED) > 0 ) { libxsmm_x86_instruction_vec_move( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPBROADCASTD, l_b_reg, l_b_idx, l_scale, l_disp, i_micro_kernel_config->vector_name, 3, 0, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPDPBUSD, i_micro_kernel_config->vector_name, 
l_k%2, 3, i_micro_kernel_config->vector_reg_count - (i_n_blocking*((l_k%l_n_accs)+1)) + l_n ); } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH_PREC ); return; } } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH_PREC ); return; } } else if (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { if ( io_generated_code->arch < LIBXSMM_X86_AVX512_CPX ) { /* broadcast pair of B matrix values into zmm2 */ libxsmm_x86_instruction_vec_move( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VBROADCASTSS, l_b_reg, l_b_idx, l_scale, l_disp, i_micro_kernel_config->vector_name, 2, 0, 1, 0 ); /* we put "1" elements of B matrix into zmm2 */ libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPSRAD, i_micro_kernel_config->vector_name, 2, 2, LIBXSMM_X86_VEC_REG_UNDEF, 16); libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPSLLD, i_micro_kernel_config->vector_name, 2, 2, LIBXSMM_X86_VEC_REG_UNDEF, 16); /* perform fma operations for multiplying "1" elements of A and B */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VFMADD231PS, i_micro_kernel_config->vector_name, l_k%2, 2, i_micro_kernel_config->vector_reg_count - (i_n_blocking*((l_k%l_n_accs)+1)) + l_n ); /* broadcast pair of B matrix values into zmm2 */ libxsmm_x86_instruction_vec_move( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VBROADCASTSS, l_b_reg, l_b_idx, l_scale, l_disp, i_micro_kernel_config->vector_name, 2, 0, 1, 0 ); /* we put "0" elements of B matrix into zmm2 */ libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPSLLD, i_micro_kernel_config->vector_name, 2, 2, LIBXSMM_X86_VEC_REG_UNDEF, 16); /* perform fma operations for multiplying "0" elements of A and B */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, 
io_generated_code->arch, LIBXSMM_X86_INSTR_VFMADD231PS, i_micro_kernel_config->vector_name, 3, 2, i_micro_kernel_config->vector_reg_count - (i_n_blocking*((l_k%l_n_accs)+1)) + l_n ); } else { libxsmm_x86_instruction_vec_compute_mem( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VDPBF16PS, 1, l_b_reg, l_b_idx, l_scale, l_disp, i_micro_kernel_config->vector_name, l_k%2, i_micro_kernel_config->vector_reg_count - (i_n_blocking*((l_k%l_n_accs)+1)) + l_n ); } } else { /* should not happen */ } } } else { for ( l_n = 0; l_n < i_n_blocking; l_n++) { libxsmm_x86_instruction_vec_compute_mem( io_generated_code, io_generated_code->arch, i_micro_kernel_config->vmul_instruction, 1, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, (l_k*i_xgemm_desc->ldb*i_micro_kernel_config->datatype_size) + (l_n*i_micro_kernel_config->datatype_size), i_micro_kernel_config->vector_name, l_k%2, i_micro_kernel_config->vector_reg_count - (i_n_blocking*((l_k%l_n_accs)+1)) + l_n ); } } } if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) == 0 ) { /* advance pointers of B only when we are not fully unrolling K and taking care of intermediate advances */ if ( i_k_blocking < (unsigned int)i_xgemm_desc->k ) { /* advance pointers of B */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_k_blocking * i_micro_kernel_config->datatype_size ); } } else { /* advance pointers of B only when we are not fully unrolling K and taking care of intermediate advances */ if ( i_k_blocking < (unsigned int)i_xgemm_desc->k ) { /* advance B ptr by K rows */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, (i_k_blocking * i_micro_kernel_config->datatype_size * i_xgemm_desc->ldb) ); } } /* add additional accumulators, if needed */ for ( l_k = 1; l_k < l_n_accs; l_k++) { for ( l_n = 0; l_n < i_n_blocking; l_n++) { if ( (LIBXSMM_GEMM_PRECISION_F32 == 
LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) || (LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) || (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, i_micro_kernel_config->vector_reg_count - (i_n_blocking*(l_k+1)) + l_n, i_micro_kernel_config->vector_reg_count - i_n_blocking + l_n, i_micro_kernel_config->vector_reg_count - i_n_blocking + l_n ); } else if ( (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) || (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VPADDD, i_micro_kernel_config->vector_name, i_micro_kernel_config->vector_reg_count - (i_n_blocking*(l_k+1)) + l_n, i_micro_kernel_config->vector_reg_count - i_n_blocking + l_n, i_micro_kernel_config->vector_reg_count - i_n_blocking + l_n ); } else { /* shouldn't happen */ } } } } LIBXSMM_API_INTERN void libxsmm_generator_gemm_avx512_microkernel_fsdbcst_qfma( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_n_blocking, const unsigned int i_k_blocking ) { unsigned int l_n; unsigned int l_k; unsigned int l_z; unsigned int l_n_accs = 0; #if !defined(NDEBUG) if ( i_n_blocking > 28 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK ); return; } #endif /* lazy fix of when QMADD doesn't work (transB and not FP32/I16)*/ if ( ((i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) != 0) || ( ( LIBXSMM_GEMM_PRECISION_F32 != LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) && ( LIBXSMM_GEMM_PRECISION_I16 != LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) ) { 
libxsmm_generator_gemm_avx512_microkernel_fsdbcst( io_generated_code, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_n_blocking, i_k_blocking ); return; } /* compute number of n accumulators to hide FMA latencies */ if (i_n_blocking >= 14) { l_n_accs = 1; } else if (i_n_blocking >= 7) { l_n_accs = 2; } else { l_n_accs = 4; } if ( l_n_accs > (i_k_blocking/4) ) { l_n_accs = (i_k_blocking/4); l_n_accs = (l_n_accs == 0) ? 1 : l_n_accs; } /* xor additional accumulator, if needed */ for ( l_k = 1; l_k < l_n_accs; l_k++) { for ( l_n = 0; l_n < i_n_blocking; l_n++) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, i_micro_kernel_config->vxor_instruction, i_micro_kernel_config->vector_name, i_micro_kernel_config->vector_reg_count - (i_n_blocking*(l_k+1)) + l_n, i_micro_kernel_config->vector_reg_count - (i_n_blocking*(l_k+1)) + l_n, i_micro_kernel_config->vector_reg_count - (i_n_blocking*(l_k+1)) + l_n ); } } /* apply k blocking */ for ( l_k = 0; l_k < i_k_blocking; ++l_k ) { unsigned int l_lcl_k = (l_k+4 <= i_k_blocking) ? 
4 : 1; /* load A matrix */ for ( l_z = 0; l_z < l_lcl_k; l_z++ ) { libxsmm_x86_instruction_vec_move( io_generated_code, io_generated_code->arch, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, i_xgemm_desc->lda * (l_k+l_z) * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_z, i_micro_kernel_config->use_masking_a_c, 1, 0 ); /* current A prefetch, next rows for the current column */ if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2_AHEAD || i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD) { libxsmm_x86_instruction_prefetch( io_generated_code, i_micro_kernel_config->prefetch_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_xgemm_desc->lda * (l_k+l_z) * i_micro_kernel_config->datatype_size) + 64 ); } } /* next A prefetch "same" rows in "same" column, but in a different matrix */ if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2 || i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C ) { for ( l_z = 0; l_z < l_lcl_k; l_z++ ) { libxsmm_x86_instruction_prefetch( io_generated_code, i_micro_kernel_config->prefetch_instruction, i_gp_reg_mapping->gp_reg_a_prefetch, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_xgemm_desc->lda * (l_k+l_z) * i_micro_kernel_config->datatype_size) ); } if ( (l_k+l_lcl_k) == i_k_blocking ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a_prefetch, i_k_blocking * i_micro_kernel_config->datatype_size * i_xgemm_desc->lda ); } } /* in last k-iteration: advance pointers */ if ( (l_k+l_lcl_k) == i_k_blocking ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, i_k_blocking * i_micro_kernel_config->datatype_size * i_xgemm_desc->lda ); } /* compute vectorwidth (A) * column broadcast (B) */ for ( l_n = 0; l_n < i_n_blocking; l_n++) { /* determining base, idx and scale values */ 
unsigned int l_b_reg = i_gp_reg_mapping->gp_reg_b; unsigned int l_b_idx = LIBXSMM_X86_GP_REG_UNDEF; unsigned int l_scale = 0; unsigned int l_disp = (l_k*i_micro_kernel_config->datatype_size)+(l_n*i_xgemm_desc->ldb*i_micro_kernel_config->datatype_size); if ( l_lcl_k == 4 ) { if (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { libxsmm_x86_instruction_vec_compute_qfma( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_V4FMADDPS, l_b_reg, l_b_idx, l_scale, l_disp, i_micro_kernel_config->vector_name, 0, i_micro_kernel_config->vector_reg_count - (i_n_blocking*(((l_k/4)%l_n_accs)+1)) + l_n ); } else if (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { libxsmm_x86_instruction_vec_compute_qfma( io_generated_code, io_generated_code->arch, LIBXSMM_X86_INSTR_VP4DPWSSD, l_b_reg, l_b_idx, l_scale, l_disp, i_micro_kernel_config->vector_name, 0, i_micro_kernel_config->vector_reg_count - (i_n_blocking*(((l_k/4)%l_n_accs)+1)) + l_n ); } else { /* shouldn't happen */ } } else { if (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { libxsmm_x86_instruction_vec_compute_mem( io_generated_code, io_generated_code->arch, i_micro_kernel_config->vmul_instruction, 1, l_b_reg, l_b_idx, l_scale, l_disp, i_micro_kernel_config->vector_name, 0, i_micro_kernel_config->vector_reg_count - (i_n_blocking*(((l_k)%l_n_accs)+1)) + l_n ); } else if (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_K_BLOCK ); return; } else { /* shouldn't happen */ } } } if (l_lcl_k == 4) { l_k+=3; } } /* advance pointers of B only when we are not fully unrolling K and taking care of intermediate advances */ if ( i_k_blocking < (unsigned int)i_xgemm_desc->k ) { /* advance pointers of B */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_k_blocking * 
i_micro_kernel_config->datatype_size ); } /* add additional accumulators, if needed */ for ( l_k = 1; l_k < l_n_accs; l_k++) { for ( l_n = 0; l_n < i_n_blocking; l_n++) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, io_generated_code->arch, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, i_micro_kernel_config->vector_reg_count - (i_n_blocking*(l_k+1)) + l_n, i_micro_kernel_config->vector_reg_count - i_n_blocking + l_n, i_micro_kernel_config->vector_reg_count - i_n_blocking + l_n ); } } } libxsmm-1.17/src/generator_gemm_avx512_microkernel.h000066400000000000000000000065351415223013700225570ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_GEMM_AVX512_MICROKERNEL_H #define GENERATOR_GEMM_AVX512_MICROKERNEL_H #include "generator_common.h" #include "generator_gemm_common.h" LIBXSMM_API_INTERN void libxsmm_generator_gemm_avx512_microkernel_nofsdbcst( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_n_blocking, const int i_offset ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_avx512_microkernel_fsdbcst( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_n_blocking, const unsigned int i_k_blocking ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_avx512_microkernel_fsdbcst_qfma( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_n_blocking, const unsigned int i_k_blocking ); #endif /* GENERATOR_GEMM_AVX512_MICROKERNEL_H */ libxsmm-1.17/src/generator_gemm_avx_microkernel.c000066400000000000000000000340161415223013700223150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "generator_gemm_avx_microkernel.h" #include "generator_x86_instructions.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_generator_gemm_avx_microkernel( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_n_blocking, const int i_offset ) { /* deriving register blocking from kernel config */ unsigned int l_m_blocking = i_m_blocking/i_micro_kernel_config->vector_length; /* register blocking counter in n */ unsigned int l_n = 0; /* register blocking counter in m */ unsigned int l_m = 0; /* start register of accumulator */ unsigned int l_vec_reg_acc_start = 16 - (i_n_blocking * l_m_blocking); /* temp variable for b-offset to handle no-trans/trans B */ int l_b_offset = 0; /* check that m_blocking is a multiple of vlen and that n_blocking is valid */ if ( (i_n_blocking > 3) || (i_n_blocking < 1) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK ); return; } if ( i_m_blocking % i_micro_kernel_config->vector_length != 0 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_M_BLOCK ); return; } if (l_m_blocking == 1) { /* load column vectors of A */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, i_n_blocking, 0, 1, 0 ); /* loop over columns of B */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* post increment of a pointer early */ if ( l_n == 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) ); } /* different ways of using B */ if ( i_offset != 
(-1) ) { /* handle trans B */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = (i_micro_kernel_config->datatype_size * i_offset * i_xgemm_desc->ldb) + (l_n * i_micro_kernel_config->datatype_size); } else { l_b_offset = (i_micro_kernel_config->datatype_size * i_offset) + (i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size); } libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, l_n, 0, 1, 0 ); } else { /* handle trans B */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = l_n * i_micro_kernel_config->datatype_size; } else { l_b_offset = i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size; } libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, l_n, 0, 1, 0 ); if ( l_n == (i_n_blocking -1) ) { /* handle trans B */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size; } else { l_b_offset = i_micro_kernel_config->datatype_size; } libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, l_b_offset ); } } /* issue mul-add */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_n_blocking, l_n, i_n_blocking + l_n + 1 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, i_n_blocking + l_n + 1, l_vec_reg_acc_start + l_n, 
l_vec_reg_acc_start + l_n ); } } else { /* broadcast from B -> into vec registers 0 to i_n_blocking */ if ( i_offset != (-1) ) { for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* handle trans B */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = (i_micro_kernel_config->datatype_size * i_offset * i_xgemm_desc->ldb) + (l_n * i_micro_kernel_config->datatype_size); } else { l_b_offset = (i_micro_kernel_config->datatype_size * i_offset) + (i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size); } libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, l_n, 0, 1, 0 ); } } else { for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* handle trans B */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = l_n * i_micro_kernel_config->datatype_size; } else { l_b_offset = i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size; } libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, l_n, 0, 1, 0 ); } /* handle trans B */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size; } else { l_b_offset = i_micro_kernel_config->datatype_size; } libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, l_b_offset ); } if (l_m_blocking == 3) { /* load column vectors of A and multiply with all broadcasted row entries of B */ for ( l_m = 0; l_m < l_m_blocking; l_m++ ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, 
i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m, i_micro_kernel_config->vector_name, i_n_blocking, 0, 1, 0 ); for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* post increment early */ if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) ); } /* issue mul+add */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_n_blocking, l_n, i_n_blocking + l_m + 1 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, i_n_blocking + l_m + 1, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) ); } } } else { /* load column vectors of A and multiply with all broadcasted row entries of B */ for ( l_m = 0; l_m < l_m_blocking; l_m++ ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m, i_micro_kernel_config->vector_name, i_n_blocking+l_m, 0, 1, 0 ); } for ( l_m = 0; l_m < l_m_blocking; l_m++ ) { for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* post increment early */ if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) ); } /* issue mul/add */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, 
i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_n_blocking + l_m, l_n, i_n_blocking + l_m_blocking + l_m ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, i_n_blocking + l_m_blocking + l_m, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) ); } } } } } libxsmm-1.17/src/generator_gemm_avx_microkernel.h000066400000000000000000000031311415223013700223140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_GEMM_AVX_MICROKERNEL_H #define GENERATOR_GEMM_AVX_MICROKERNEL_H #include "generator_gemm_common.h" LIBXSMM_API_INTERN void libxsmm_generator_gemm_avx_microkernel( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_n_blocking, const int i_offset ); #endif /* GENERATOR_GEMM_AVX_MICROKERNEL_H */ libxsmm-1.17/src/generator_gemm_common.c000066400000000000000000002417311415223013700204210ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. 
* * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "generator_gemm_common.h" #include "generator_common.h" #include "generator_x86_instructions.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_generator_gemm_init_micro_kernel_config_fullvector( libxsmm_micro_kernel_config* io_micro_kernel_config, const unsigned int i_arch, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_use_masking_a_c ) { memset(io_micro_kernel_config, 0, sizeof(*io_micro_kernel_config)); /* avoid warning "maybe used uninitialized" */ if ( (i_arch < LIBXSMM_X86_SSE3) || (i_arch > LIBXSMM_X86_ALLFEAT) ) { io_micro_kernel_config->instruction_set = LIBXSMM_X86_GENERIC; io_micro_kernel_config->vector_reg_count = 0; io_micro_kernel_config->use_masking_a_c = 0; io_micro_kernel_config->vector_name = 'a'; io_micro_kernel_config->vector_length = 0; io_micro_kernel_config->datatype_size = 0; io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_UNDEF; } else if ( i_arch <= LIBXSMM_X86_SSE4 ) { io_micro_kernel_config->instruction_set = LIBXSMM_X86_SSE3; io_micro_kernel_config->vector_reg_count = 16; io_micro_kernel_config->use_masking_a_c = i_use_masking_a_c; io_micro_kernel_config->vector_name = 'x'; if ( LIBXSMM_GEMM_PRECISION_F64 == 
LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { io_micro_kernel_config->vector_length = 2; io_micro_kernel_config->datatype_size = 8; if ( (LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_MOVAPD; } else { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_MOVUPD; } io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_MOVDDUP; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; if ( (LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_MOVAPD; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_MOVAPD; } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_MOVUPD; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_MOVUPD; } io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_XORPD; io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_MULPD; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_ADDPD; } else { io_micro_kernel_config->vector_length = 4; io_micro_kernel_config->datatype_size = 4; if ( (LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_MOVAPS; } else { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_MOVUPS; } io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_MOVSS; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_SHUFPS; if ( (LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_MOVAPS; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_MOVAPS; } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_MOVUPS; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_MOVUPS; } io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_XORPS; 
io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_MULPS; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_ADDPS; } } else if ( i_arch <= LIBXSMM_X86_AVX2 ) { io_micro_kernel_config->instruction_set = i_arch; io_micro_kernel_config->vector_reg_count = 16; io_micro_kernel_config->use_masking_a_c = i_use_masking_a_c; io_micro_kernel_config->vector_name = 'y'; if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { io_micro_kernel_config->vector_length = 4; io_micro_kernel_config->datatype_size = 8; if ( (LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPD; } else { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPD; } io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_VBROADCASTSD; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; if ( (LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPD; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVNTPD; } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPD; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVUPD; } io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_VXORPD; if ( i_arch == LIBXSMM_X86_AVX ) { io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VMULPD; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VADDPD; } else { io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VFMADD231PD; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VADDPD; } } else { io_micro_kernel_config->vector_length = 8; io_micro_kernel_config->datatype_size = 4; if ( (LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; } else { io_micro_kernel_config->a_vmove_instruction = 
LIBXSMM_X86_INSTR_VMOVUPS; } io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_VBROADCASTSS; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; if ( (LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVNTPS; } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_VXORPS; if ( i_arch == LIBXSMM_X86_AVX ) { io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VMULPS; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VADDPS; } else { io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VFMADD231PS; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VADDPS; } } } else if ( i_arch <= LIBXSMM_X86_ALLFEAT ) { io_micro_kernel_config->instruction_set = i_arch; io_micro_kernel_config->vector_reg_count = 32; io_micro_kernel_config->use_masking_a_c = i_use_masking_a_c; io_micro_kernel_config->vector_name = 'z'; if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { io_micro_kernel_config->vector_length = 8; io_micro_kernel_config->datatype_size = 8; if ( (LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPD; } else { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPD; } io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_VBROADCASTSD; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; if ( (LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPD; if ( (i_use_masking_a_c == 0) ) { io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVNTPD; } else { 
io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVAPD; } } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPD; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVUPD; } io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_VPXORD; io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VFMADD231PD; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VADDPD; } else if ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { io_micro_kernel_config->vector_length = 16; io_micro_kernel_config->datatype_size = 4; if ( (LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; } else { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_VBROADCASTSS; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; if ( (LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; if ( (i_use_masking_a_c == 0) ) { io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVNTPS; } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; } } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_VPXORD; io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VFMADD231PS; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VADDPS; } else if ( LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { /* C is 32bit, so we treat all 3 matrices as 32bit element arrays */ io_micro_kernel_config->vector_length = 16; io_micro_kernel_config->datatype_size = 4; if ( (LIBXSMM_GEMM_FLAG_ALIGN_A & 
i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; } else { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_VPBROADCASTD; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; if ( (LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; if ( (i_use_masking_a_c == 0) ) { io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVNTPS; } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; } } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_VPXORD; io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VPDPWSSD; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VPADDD; } else if ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { /* C is 32bit, so we treat all 3 matrices as 32bit element arrays */ io_micro_kernel_config->vector_length = 16; io_micro_kernel_config->datatype_size = 4; if ( (LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; } else { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_VPBROADCASTD; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; if ( (LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; if ( (i_use_masking_a_c == 0) ) { io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVNTPS; } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; } } else { 
io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_VPXORD; io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VPDPBUSD; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VPADDD; } else if ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { /* C is 32bit, so we treat all 3 matrices as 32bit element arrays */ io_micro_kernel_config->vector_length = 16; io_micro_kernel_config->datatype_size = 4; if ( (LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; } else { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_VPBROADCASTD; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; if ( (LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; if ( (i_use_masking_a_c == 0) ) { io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVNTPS; } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; } } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_VPXORD; io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VDPBF16PS; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VADDPS; } else { /* shouldn't happen as we caught this case earlier */ io_micro_kernel_config->instruction_set = LIBXSMM_X86_GENERIC; io_micro_kernel_config->vector_reg_count = 0; io_micro_kernel_config->use_masking_a_c = 0; io_micro_kernel_config->vector_name = 'a'; io_micro_kernel_config->vector_length = 0; 
io_micro_kernel_config->datatype_size = 0; io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_UNDEF; } } else { /* that should no happen */ } io_micro_kernel_config->prefetch_instruction = LIBXSMM_X86_INSTR_PREFETCHT1; io_micro_kernel_config->alu_add_instruction = LIBXSMM_X86_INSTR_ADDQ; io_micro_kernel_config->alu_sub_instruction = LIBXSMM_X86_INSTR_SUBQ; io_micro_kernel_config->alu_cmp_instruction = LIBXSMM_X86_INSTR_CMPQ; io_micro_kernel_config->alu_jmp_instruction = LIBXSMM_X86_INSTR_JL; io_micro_kernel_config->alu_mov_instruction = LIBXSMM_X86_INSTR_MOVQ; } LIBXSMM_API_INTERN void libxsmm_generator_gemm_init_micro_kernel_config_halfvector( libxsmm_micro_kernel_config* io_micro_kernel_config, const unsigned int i_arch, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_use_masking_a_c ) { if ( (i_arch < LIBXSMM_X86_SSE3) || (i_arch > LIBXSMM_X86_ALLFEAT) ) { io_micro_kernel_config->instruction_set = LIBXSMM_X86_GENERIC; io_micro_kernel_config->vector_reg_count = 0; io_micro_kernel_config->use_masking_a_c = 0; io_micro_kernel_config->vector_name = 'a'; io_micro_kernel_config->vector_length = 0; io_micro_kernel_config->datatype_size = 0; io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->vmul_instruction = 
LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_UNDEF; } else if ( i_arch <= LIBXSMM_X86_SSE4 ) { #if !defined(NDEBUG) fprintf(stderr, "LIBXSMM WARNING, libxsmm_generator_gemm_init_micro_kernel_config_halfvector, redirecting to scalar, please fix the generation code!!!\n"); #endif libxsmm_generator_gemm_init_micro_kernel_config_scalar( io_micro_kernel_config, i_arch, i_xgemm_desc, i_use_masking_a_c ); } else if ( i_arch <= LIBXSMM_X86_AVX2 ) { io_micro_kernel_config->instruction_set = LIBXSMM_X86_AVX; io_micro_kernel_config->vector_reg_count = 16; io_micro_kernel_config->use_masking_a_c = i_use_masking_a_c; io_micro_kernel_config->vector_name = 'x'; if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { io_micro_kernel_config->vector_length = 2; io_micro_kernel_config->datatype_size = 8; if ( (LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPD; } else { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPD; } io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_VMOVDDUP; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; if ( (LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPD; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVNTPD; } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPD; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVUPD; } io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_VXORPD; if ( i_arch == LIBXSMM_X86_AVX ) { io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VMULPD; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VADDPD; } else { io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VFMADD231PD; io_micro_kernel_config->vadd_instruction = 
LIBXSMM_X86_INSTR_UNDEF; } } else { io_micro_kernel_config->vector_length = 4; io_micro_kernel_config->datatype_size = 4; if ( (LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; } else { io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_VBROADCASTSS; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; if ( (LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0 ) { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVAPS; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVNTPS; } else { io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_VXORPS; if ( i_arch == LIBXSMM_X86_AVX ) { io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VMULPS; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VADDPS; } else { io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VFMADD231PS; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_UNDEF; } } } else if ( i_arch <= LIBXSMM_X86_ALLFEAT ) { #if !defined(NDEBUG) fprintf(stderr, "LIBXSMM WARNING, libxsmm_generator_gemm_init_micro_kernel_config_halfvector, AVX512 redirecting to fullvector!\n"); #endif libxsmm_generator_gemm_init_micro_kernel_config_fullvector( io_micro_kernel_config, i_arch, i_xgemm_desc, i_use_masking_a_c ); } else { /* should not happen */ } io_micro_kernel_config->prefetch_instruction = LIBXSMM_X86_INSTR_PREFETCHT1; io_micro_kernel_config->alu_add_instruction = LIBXSMM_X86_INSTR_ADDQ; io_micro_kernel_config->alu_sub_instruction = LIBXSMM_X86_INSTR_SUBQ; io_micro_kernel_config->alu_cmp_instruction = LIBXSMM_X86_INSTR_CMPQ; io_micro_kernel_config->alu_jmp_instruction = LIBXSMM_X86_INSTR_JL; 
io_micro_kernel_config->alu_mov_instruction = LIBXSMM_X86_INSTR_MOVQ; } LIBXSMM_API_INTERN void libxsmm_generator_gemm_init_micro_kernel_config_scalar( libxsmm_micro_kernel_config* io_micro_kernel_config, const unsigned int i_arch, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_use_masking_a_c ) { if ( ( i_arch < LIBXSMM_X86_SSE3 ) || ( i_arch > LIBXSMM_X86_ALLFEAT ) ) { io_micro_kernel_config->instruction_set = LIBXSMM_X86_GENERIC; io_micro_kernel_config->vector_reg_count = 0; io_micro_kernel_config->use_masking_a_c = 0; io_micro_kernel_config->vector_name = 'a'; io_micro_kernel_config->vector_length = 0; io_micro_kernel_config->datatype_size = 0; io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_UNDEF; } else if ( i_arch <= LIBXSMM_X86_SSE4 ) { io_micro_kernel_config->instruction_set = LIBXSMM_X86_SSE3; io_micro_kernel_config->vector_reg_count = 16; io_micro_kernel_config->use_masking_a_c = i_use_masking_a_c; io_micro_kernel_config->vector_name = 'x'; if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { io_micro_kernel_config->vector_length = 1; io_micro_kernel_config->datatype_size = 8; io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_MOVSD; io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_MOVSD; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_MOVSD; io_micro_kernel_config->c_vmove_nts_instruction = 
LIBXSMM_X86_INSTR_MOVSD; io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_XORPD; io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_MULSD; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_ADDSD; } else { io_micro_kernel_config->vector_length = 1; io_micro_kernel_config->datatype_size = 4; io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_MOVSS; io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_MOVSS; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_MOVSS; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_MOVSS; io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_XORPS; io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_MULSS; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_ADDSS; } } else if ( i_arch <= LIBXSMM_X86_ALLFEAT ) { io_micro_kernel_config->instruction_set = i_arch; io_micro_kernel_config->vector_reg_count = 16; io_micro_kernel_config->use_masking_a_c = i_use_masking_a_c; io_micro_kernel_config->vector_name = 'x'; if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { io_micro_kernel_config->vector_length = 1; io_micro_kernel_config->datatype_size = 8; io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVSD; io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_VMOVSD; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVSD; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVSD; io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_VXORPD; if ( i_arch == LIBXSMM_X86_AVX ) { io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VMULSD; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VADDSD; } else { io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VFMADD231SD; 
io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_UNDEF; } } else { io_micro_kernel_config->vector_length = 1; io_micro_kernel_config->datatype_size = 4; io_micro_kernel_config->a_vmove_instruction = LIBXSMM_X86_INSTR_VMOVSS; io_micro_kernel_config->b_vmove_instruction = LIBXSMM_X86_INSTR_VMOVSS; io_micro_kernel_config->b_shuff_instruction = LIBXSMM_X86_INSTR_UNDEF; io_micro_kernel_config->c_vmove_instruction = LIBXSMM_X86_INSTR_VMOVSS; io_micro_kernel_config->c_vmove_nts_instruction = LIBXSMM_X86_INSTR_VMOVSS; io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_VXORPS; if ( i_arch == LIBXSMM_X86_AVX ) { io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VMULSS; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_VADDSS; } else { io_micro_kernel_config->vmul_instruction = LIBXSMM_X86_INSTR_VFMADD231SS; io_micro_kernel_config->vadd_instruction = LIBXSMM_X86_INSTR_UNDEF; } } } else { /* should not happen */ } io_micro_kernel_config->prefetch_instruction = LIBXSMM_X86_INSTR_PREFETCHT1; io_micro_kernel_config->alu_add_instruction = LIBXSMM_X86_INSTR_ADDQ; io_micro_kernel_config->alu_sub_instruction = LIBXSMM_X86_INSTR_SUBQ; io_micro_kernel_config->alu_cmp_instruction = LIBXSMM_X86_INSTR_CMPQ; io_micro_kernel_config->alu_jmp_instruction = LIBXSMM_X86_INSTR_JL; io_micro_kernel_config->alu_mov_instruction = LIBXSMM_X86_INSTR_MOVQ; } LIBXSMM_API_INTERN void libxsmm_generator_gemm_add_flop_counter( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc ) { if ( io_generated_code->code_type == 0 ) { char l_new_code[512]; const unsigned int l_max_code_length = sizeof(l_new_code) - 1; int l_code_length = 0; l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#ifndef NDEBUG\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#ifdef _OPENMP\n" ); libxsmm_append_code_as_string( io_generated_code, 
l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#pragma omp atomic\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#endif\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "libxsmm_num_total_flops += %u;\n", 2u * i_xgemm_desc->m * i_xgemm_desc->n * i_xgemm_desc->k); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#endif\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_generator_gemm_header_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const unsigned int i_m_blocking, const unsigned int i_k_blocking ) { LIBXSMM_UNUSED(i_m_blocking); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_kloop, 0); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_kloop, i_k_blocking); } LIBXSMM_API_INTERN void libxsmm_generator_gemm_footer_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_max_blocked_k, const unsigned int i_kloop_complete ) { LIBXSMM_UNUSED(i_m_blocking); libxsmm_x86_instruction_alu_imm( io_generated_code, 
i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_kloop, i_max_blocked_k ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); if ( i_kloop_complete != 0 ) { int l_b_offset = 0; if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = i_xgemm_desc->ldb * i_xgemm_desc->k * i_micro_kernel_config->datatype_size; } else { l_b_offset = i_xgemm_desc->k * i_micro_kernel_config->datatype_size; } libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_b, l_b_offset ); } } LIBXSMM_API_INTERN void libxsmm_generator_gemm_header_reduceloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_reduce_loop, 0); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); } LIBXSMM_API_INTERN void libxsmm_generator_gemm_footer_reduceloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc) { LIBXSMM_UNUSED(i_xgemm_desc); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_reduce_loop, 1); libxsmm_x86_instruction_alu_reg( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_reduce_count, i_gp_reg_mapping->gp_reg_reduce_loop); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); } LIBXSMM_API_INTERN void 
libxsmm_generator_gemm_header_nloop( libxsmm_generated_code*            io_generated_code,
                                     libxsmm_loop_label_tracker*        io_loop_label_tracker,
                                     const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                     const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                     const unsigned int                 i_n_blocking) {
  /* loop-start label; the N counter is advanced here and tested in the footer */
  libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker );
  /* nloop += i_n_blocking */
  libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_nloop, i_n_blocking );
  /* mloop = 0: every N iteration runs a fresh M sweep */
  libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_mloop, 0 );
}

/**
 * Closes the N loop: advances the C pointer (and the B-prefetch pointer) to
 * the next i_n_blocking-column panel while rewinding the M extent walked by
 * the inner loops, repositions the A/B pointers for the next N iteration
 * (for batch-reduce-address kernels the pointers live in an address array and
 * are rewritten element by element inside a reduce loop), then compares the
 * N counter against i_n_done and emits the backward jump.
 */
LIBXSMM_API_INTERN void libxsmm_generator_gemm_footer_nloop( libxsmm_generated_code*            io_generated_code,
                                                             libxsmm_loop_label_tracker*        io_loop_label_tracker,
                                                             const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                                             const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                                             const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                                             const unsigned int                 i_n_blocking,
                                                             const unsigned int                 i_n_done ) {
  /* advance C: n_blocking*ldc forward minus the m extent already covered.
   * For BF16/I8 output the element size is datatype_size/2 resp. /4 —
   * presumably datatype_size reflects the FP32/INT32 accumulator width;
   * TODO(review) confirm against micro_kernel_config setup. */
  if ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c,
                                     (i_n_blocking*(i_xgemm_desc->ldc)*(i_micro_kernel_config->datatype_size/2)) - ((i_xgemm_desc->m)*(i_micro_kernel_config->datatype_size/2)) );
  } else if ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c,
                                     (i_n_blocking*(i_xgemm_desc->ldc)*(i_micro_kernel_config->datatype_size/4)) - ((i_xgemm_desc->m)*(i_micro_kernel_config->datatype_size/4)) );
  } else {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c,
                                     (i_n_blocking*(i_xgemm_desc->ldc)*(i_micro_kernel_config->datatype_size)) - ((i_xgemm_desc->m)*(i_micro_kernel_config->datatype_size)) );
  }

  /* B prefetch */
  if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_BL2_VIA_C ||
       i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C ||
       i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD ) {
    if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) == 0 ) {
      /* input element size relative to datatype_size (see note above) */
      unsigned int l_type_scaling;
      if ( (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) ||
           (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) ) {
        l_type_scaling = 2;
      } else if ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
        l_type_scaling = 4;
      } else {
        l_type_scaling = 1;
      }
      /* move the B-prefetch pointer in lockstep with C */
      libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b_prefetch,
                                       (i_n_blocking*(i_xgemm_desc->ldc)*(i_micro_kernel_config->datatype_size/l_type_scaling)) - ((i_xgemm_desc->m)*(i_micro_kernel_config->datatype_size/l_type_scaling)) );
    }
  }
#if 0
  if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_CL2 ||
       i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2CL2BL2_VIA_C ) {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c_prefetch,
                                     (i_n_blocking*(i_xgemm_desc->ldc)*(i_micro_kernel_config->datatype_size)) - ((i_xgemm_desc->m)*(i_micro_kernel_config->datatype_size)) );
  }
#endif
  if (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS) {
    /* handle trans B */
    int l_b_offset = 0;
    if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
      l_b_offset = i_n_blocking * i_micro_kernel_config->datatype_size;
    } else {
      l_b_offset = i_n_blocking * i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size;
    }
    /* batch-reduce-address kernels read A/B pointers from arrays: rewrite
     * every entry (scale 8 = pointer stride; last alu_mem arg 0 appears to
     * select load, 1 store — inferred from the paired usage here) */
    libxsmm_x86_instruction_push_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_0 );
    libxsmm_x86_instruction_push_reg( io_generated_code, i_gp_reg_mapping->gp_reg_reduce_loop );
    libxsmm_generator_gemm_header_reduceloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping,
      i_micro_kernel_config );
    /* A[i] -= m*datatype_size: rewind the finished M extent */
    libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction,
      i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_reduce_loop, 8, 0, i_gp_reg_mapping->gp_reg_help_0, 0 );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction,
      i_gp_reg_mapping->gp_reg_help_0, ((i_xgemm_desc->m)*(i_micro_kernel_config->datatype_size)) );
    libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction,
      i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_reduce_loop, 8, 0, i_gp_reg_mapping->gp_reg_help_0, 1 );
    /* B[i] += l_b_offset: step to the next N panel */
    libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction,
      i_gp_reg_mapping->gp_reg_b, i_gp_reg_mapping->gp_reg_reduce_loop, 8, 0, i_gp_reg_mapping->gp_reg_help_0, 0 );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction,
      i_gp_reg_mapping->gp_reg_help_0, l_b_offset );
    libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction,
      i_gp_reg_mapping->gp_reg_b, i_gp_reg_mapping->gp_reg_reduce_loop, 8, 0, i_gp_reg_mapping->gp_reg_help_0, 1 );
    if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2 ||
         i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C ) {
      /* keep the A-prefetch address array in sync with A */
      libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction,
        i_gp_reg_mapping->gp_reg_a_prefetch, i_gp_reg_mapping->gp_reg_reduce_loop, 8, 0, i_gp_reg_mapping->gp_reg_help_0, 0 );
      libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction,
        i_gp_reg_mapping->gp_reg_help_0, ((i_xgemm_desc->m)*(i_micro_kernel_config->datatype_size)) );
      libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction,
        i_gp_reg_mapping->gp_reg_a_prefetch, i_gp_reg_mapping->gp_reg_reduce_loop, 8, 0, i_gp_reg_mapping->gp_reg_help_0, 1 );
    }
    libxsmm_generator_gemm_footer_reduceloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc);
    libxsmm_x86_instruction_pop_reg( io_generated_code, i_gp_reg_mapping->gp_reg_reduce_loop );
    libxsmm_x86_instruction_pop_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_0 );
  } else {
    /* handle trans B */
    int l_b_offset = 0;
    if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) {
      l_b_offset = i_n_blocking * i_micro_kernel_config->datatype_size;
    } else {
      l_b_offset = i_n_blocking * i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size;
    }
    /* plain kernels adjust the pointer registers directly */
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction,
      i_gp_reg_mapping->gp_reg_b, l_b_offset );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction,
      i_gp_reg_mapping->gp_reg_a, ((i_xgemm_desc->m)*(i_micro_kernel_config->datatype_size)) );
    if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2 ||
         i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C ) {
      libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction,
        i_gp_reg_mapping->gp_reg_a_prefetch, ((i_xgemm_desc->m)*(i_micro_kernel_config->datatype_size)) );
    }
  }
  /* loop while nloop < i_n_done */
  libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_nloop, i_n_done );
  libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker );
}

/**
 * Opens the M loop: places the loop-start label and advances the M counter
 * by the M blocking factor (tested against i_m_done in the footer).
 */
LIBXSMM_API_INTERN void libxsmm_generator_gemm_header_mloop( libxsmm_generated_code*            io_generated_code,
                                                             libxsmm_loop_label_tracker*        io_loop_label_tracker,
                                                             const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                                             const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                                             const unsigned int                 i_m_blocking ) {
  libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker );
  libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_mloop, i_m_blocking );
}
/**
 * Closes the M loop: advances the C pointer (and prefetch pointers) by the
 * M block just computed, rewinds the A pointer from the end of its K sweep
 * back to the next M block (k*lda forward minus m_blocking), with the same
 * rewrite applied element-wise to the address array for batch-reduce-address
 * kernels, then compares the M counter against i_m_done and jumps back.
 */
LIBXSMM_API_INTERN void libxsmm_generator_gemm_footer_mloop( libxsmm_generated_code*            io_generated_code,
                                                             libxsmm_loop_label_tracker*        io_loop_label_tracker,
                                                             const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                                             const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                                             const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                                             const unsigned int                 i_m_blocking,
                                                             const unsigned int                 i_m_done ) {
  /* advance C pointer */
  /* BF16/I8 output stores elements at datatype_size/2 resp. /4 — presumably
   * datatype_size is the accumulator width; TODO(review) confirm */
  if ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction,
      i_gp_reg_mapping->gp_reg_c, i_m_blocking*(i_micro_kernel_config->datatype_size/2) );
  } else if ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction,
      i_gp_reg_mapping->gp_reg_c, i_m_blocking*(i_micro_kernel_config->datatype_size/4) );
  } else {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction,
      i_gp_reg_mapping->gp_reg_c, i_m_blocking*(i_micro_kernel_config->datatype_size) );
  }

  /* C prefetch */
#if 0
  if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_CL2 ||
       i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2CL2BL2_VIA_C ) {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction,
      i_gp_reg_mapping->gp_reg_c_prefetch, i_m_blocking*(i_micro_kernel_config->datatype_size) );
  }
#endif

  /* B prefetch */
  if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_BL2_VIA_C ||
       i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C ||
       i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD ) {
    if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) == 0 ) {
      /* input element size relative to datatype_size */
      unsigned int l_type_scaling;
      if ( (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) ||
           (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) ) {
        l_type_scaling = 2;
      } else if ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
        l_type_scaling = 4;
      } else {
        l_type_scaling = 1;
      }
      libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction,
        i_gp_reg_mapping->gp_reg_b_prefetch, i_m_blocking*(i_micro_kernel_config->datatype_size/l_type_scaling) );
    }
  }

  /* A prefetch */
  if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2 ||
       i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C) {
    if (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS) {
      if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2 ) {
        /* rewrite every A-prefetch array entry: back from the K sweep end to
         * the next M block (scale 8 = pointer stride in the address array) */
        libxsmm_x86_instruction_push_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_0 );
        libxsmm_x86_instruction_push_reg( io_generated_code, i_gp_reg_mapping->gp_reg_reduce_loop );
        libxsmm_generator_gemm_header_reduceloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config );
        libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction,
          i_gp_reg_mapping->gp_reg_a_prefetch, i_gp_reg_mapping->gp_reg_reduce_loop, 8, 0, i_gp_reg_mapping->gp_reg_help_0, 0 );
        libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction,
          i_gp_reg_mapping->gp_reg_help_0,
          ((i_xgemm_desc->k) * (i_micro_kernel_config->datatype_size) * (i_xgemm_desc->lda) ) - (i_m_blocking * (i_micro_kernel_config->datatype_size)) );
        libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction,
          i_gp_reg_mapping->gp_reg_a_prefetch, i_gp_reg_mapping->gp_reg_reduce_loop, 8, 0, i_gp_reg_mapping->gp_reg_help_0, 1 );
        libxsmm_generator_gemm_footer_reduceloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc);
        libxsmm_x86_instruction_pop_reg( io_generated_code, i_gp_reg_mapping->gp_reg_reduce_loop );
        libxsmm_x86_instruction_pop_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_0 );
      }
    } else {
      libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction,
        i_gp_reg_mapping->gp_reg_a_prefetch,
        ((i_xgemm_desc->k) * (i_micro_kernel_config->datatype_size) * (i_xgemm_desc->lda) ) - (i_m_blocking * (i_micro_kernel_config->datatype_size)) );
    }
  }

  /* advance A pointer */
  if (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS) {
    /* same rewind applied to every entry of the A address array */
    libxsmm_x86_instruction_push_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_0 );
    libxsmm_x86_instruction_push_reg( io_generated_code, i_gp_reg_mapping->gp_reg_reduce_loop );
    libxsmm_generator_gemm_header_reduceloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config );
    libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction,
      i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_reduce_loop, 8, 0, i_gp_reg_mapping->gp_reg_help_0, 0 );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction,
      i_gp_reg_mapping->gp_reg_help_0,
      ((i_xgemm_desc->k) * (i_micro_kernel_config->datatype_size) * (i_xgemm_desc->lda) ) - (i_m_blocking * (i_micro_kernel_config->datatype_size)) );
    libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction,
      i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_reduce_loop, 8, 0, i_gp_reg_mapping->gp_reg_help_0, 1 );
    libxsmm_generator_gemm_footer_reduceloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc);
    libxsmm_x86_instruction_pop_reg( io_generated_code, i_gp_reg_mapping->gp_reg_reduce_loop );
    libxsmm_x86_instruction_pop_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_0 );
  } else {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction,
      i_gp_reg_mapping->gp_reg_a,
      ((i_xgemm_desc->k) * (i_micro_kernel_config->datatype_size) * (i_xgemm_desc->lda) ) - (i_m_blocking * (i_micro_kernel_config->datatype_size)) );
  }

  /* loop handling */
  libxsmm_x86_instruction_alu_imm(
    io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_mloop, i_m_done );
  libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker );
}

/**
 * Loads (beta=1) or zero-initializes (beta=0) the C accumulator register
 * block: i_n_blocking x l_m_blocking vector registers taken from the top of
 * the register file. BF16/I8 output is widened to 32-bit lanes on load.
 */
LIBXSMM_API_INTERN void libxsmm_generator_gemm_load_C( libxsmm_generated_code*            io_generated_code,
                                                       const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                                       const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                                       const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                                       const unsigned int                 i_m_blocking,
                                                       const unsigned int                 i_n_blocking ) {
  unsigned int l_m_blocking, l_vec_reg_acc_start;
  /* register blocking counter in n */
  unsigned int l_n = 0;
  /* register blocking counter in m */
  unsigned int l_m = 0;
  assert(0 < i_micro_kernel_config->vector_length);
  /* deriving register blocking from kernel config (round m up to full vectors) */
  l_m_blocking = ( i_m_blocking % i_micro_kernel_config->vector_length == 0 ) ? i_m_blocking/i_micro_kernel_config->vector_length : (i_m_blocking/i_micro_kernel_config->vector_length)+1;
  /* start register of accumulator */
  l_vec_reg_acc_start = i_micro_kernel_config->vector_reg_count - (i_n_blocking * l_m_blocking);

#if !defined(NDEBUG)
  /* Do some test if it is possible to generate the requested code.
     This is not done in release mode and therefore bad things might happen....
HUAAH */
  if (i_micro_kernel_config->instruction_set == LIBXSMM_X86_SSE3 ||
      i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX ||
      i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX2 ) {
    /* SSE3/AVX/AVX2: 16 vector registers limit the n blocking */
    if ( (i_n_blocking > 3) || (i_n_blocking < 1) || (i_m_blocking < 1) ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_REG_BLOCK );
      return;
    }
  } else if ( i_micro_kernel_config->instruction_set < LIBXSMM_X86_AVX512_CORE ) {
    /* pre-CORE AVX-512 (e.g. KNL-class): only a single m vector is supported */
    if ( (i_n_blocking > 30) || (i_n_blocking < 1) || (l_m_blocking != 1) ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_REG_BLOCK );
      return;
    }
  } else if ( i_micro_kernel_config->instruction_set >= LIBXSMM_X86_AVX512_CORE ) {
    if ( (i_n_blocking > 30) || (i_n_blocking < 1) || (l_m_blocking < 1) || (l_m_blocking > 6) ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_REG_BLOCK );
      return;
    }
  } else {}
#if 0
  if ( i_m_blocking % i_micro_kernel_config->vector_length != 0 ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_M_BLOCK );
    return;
  }
#endif
#endif /*!defined(NDEBUG)*/

  /* load C accumulator */
  if (0 == (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=1 */
    /* pure BF16 kernel */
    if ( ( (i_micro_kernel_config->instruction_set >= LIBXSMM_X86_AVX512_CORE) && (i_micro_kernel_config->instruction_set <= LIBXSMM_X86_ALLFEAT) ) &&
         ( (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) && (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) ) ) {
      /* we add when scaling during conversion to FP32 */
      for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
        for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
          /* load 16 bit values into ymm portion of the register */
          if ( (i_micro_kernel_config->use_masking_a_c != 0) && ( l_m == (l_m_blocking - 1) ) ) {
            /* masked load of the m remainder (VMOVDQU16 with mask register 2) */
            libxsmm_x86_instruction_vec_move( io_generated_code,
                                              i_micro_kernel_config->instruction_set,
                                              LIBXSMM_X86_INSTR_VMOVDQU16,
                                              i_gp_reg_mapping->gp_reg_c,
                                              LIBXSMM_X86_GP_REG_UNDEF, 0,
                                              ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size/2),
                                              'z', 0, 2, 1, 0 );
          } else {
            libxsmm_x86_instruction_vec_move( io_generated_code,
                                              i_micro_kernel_config->instruction_set,
                                              i_micro_kernel_config->c_vmove_instruction,
                                              i_gp_reg_mapping->gp_reg_c,
                                              LIBXSMM_X86_GP_REG_UNDEF, 0,
                                              ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size/2),
                                              'y', 0, 0, 1, 0 );
          }
          /* convert 16 bit values into 32 bit (integer convert) */
          libxsmm_x86_instruction_vec_compute_convert( io_generated_code, i_micro_kernel_config->instruction_set,
                                                       LIBXSMM_X86_INSTR_VPMOVSXWD, i_micro_kernel_config->vector_name,
                                                       0, LIBXSMM_X86_VEC_REG_UNDEF, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), LIBXSMM_X86_VEC_REG_UNDEF);
          /* shift 16 bits to the left to generate valid FP32 numbers */
          libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, i_micro_kernel_config->instruction_set,
                                                  LIBXSMM_X86_INSTR_VPSLLD, i_micro_kernel_config->vector_name,
                                                  l_vec_reg_acc_start + l_m + (l_m_blocking * l_n),
                                                  l_vec_reg_acc_start + l_m + (l_m_blocking * l_n),
                                                  LIBXSMM_X86_VEC_REG_UNDEF, 16);
        }
      }
    /* pure int8 kernel */
    } else if ( ( (i_micro_kernel_config->instruction_set >= LIBXSMM_X86_AVX512_CORE) && (i_micro_kernel_config->instruction_set <= LIBXSMM_X86_ALLFEAT) ) &&
                ( (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) && (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) ) ) {
      /* we need to up convert int8 to int32 */
      for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
        for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
          /* load 8 bit values into xmm portion of the register */
          if ( (i_micro_kernel_config->use_masking_a_c != 0) && ( l_m == (l_m_blocking - 1) ) ) {
            libxsmm_x86_instruction_vec_move( io_generated_code,
                                              i_micro_kernel_config->instruction_set,
                                              LIBXSMM_X86_INSTR_VMOVDQU8,
                                              i_gp_reg_mapping->gp_reg_c,
                                              LIBXSMM_X86_GP_REG_UNDEF, 0,
                                              ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size/4),
                                              'z', 0, 2, 1, 0 );
          } else {
            libxsmm_x86_instruction_vec_move( io_generated_code,
                                              i_micro_kernel_config->instruction_set,
                                              i_micro_kernel_config->c_vmove_instruction,
                                              i_gp_reg_mapping->gp_reg_c,
                                              LIBXSMM_X86_GP_REG_UNDEF, 0,
                                              ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size/4),
                                              'x', 0, 0, 1, 0 );
          }
          /* convert 8 bit values into 32 bit (integer convert); zero- vs.
             sign-extend depends on the C_UNSIGNED flag */
          if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_C_UNSIGNED) != 0 ) {
            libxsmm_x86_instruction_vec_compute_convert( io_generated_code, i_micro_kernel_config->instruction_set,
                                                         LIBXSMM_X86_INSTR_VPMOVZXBD, i_micro_kernel_config->vector_name,
                                                         0, LIBXSMM_X86_VEC_REG_UNDEF, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), LIBXSMM_X86_VEC_REG_UNDEF);
          } else {
            libxsmm_x86_instruction_vec_compute_convert( io_generated_code, i_micro_kernel_config->instruction_set,
                                                         LIBXSMM_X86_INSTR_VPMOVSXBD, i_micro_kernel_config->vector_name,
                                                         0, LIBXSMM_X86_VEC_REG_UNDEF, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), LIBXSMM_X86_VEC_REG_UNDEF);
          }
        }
      }
    } else {
      /* adding to C, so let's load C */
      for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
        for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
          /* we only mask the last m-blocked load */
          libxsmm_x86_instruction_vec_move( io_generated_code,
                                            i_micro_kernel_config->instruction_set,
                                            i_micro_kernel_config->c_vmove_instruction,
                                            i_gp_reg_mapping->gp_reg_c,
                                            LIBXSMM_X86_GP_REG_UNDEF, 0,
                                            ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size),
                                            i_micro_kernel_config->vector_name,
                                            l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), ( l_m == (l_m_blocking - 1) ) ? i_micro_kernel_config->use_masking_a_c : 0, 1, 0 );
        }
#if 0
        /* NOTE(review): "l_m += l_m++" below is ill-formed C; harmless while
           this block stays compiled out, but fix before re-enabling */
        if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_CL2 ||
             i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2CL2BL2_VIA_C ) {
          for (l_m = 0; l_m < l_m_blocking; l_m += l_m++ ) {
            libxsmm_x86_instruction_prefetch( io_generated_code,
                                              i_micro_kernel_config->prefetch_instruction,
                                              i_gp_reg_mapping->gp_reg_c_prefetch,
                                              LIBXSMM_X86_GP_REG_UNDEF, 0,
                                              ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size));
          }
        }
#endif
      }
    }
  } else {
    /* overwriting C, so let's xout accumulator */
    for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
      for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
        /* reg ^= reg zeroes the accumulator */
        libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                 i_micro_kernel_config->instruction_set,
                                                 i_micro_kernel_config->vxor_instruction,
                                                 i_micro_kernel_config->vector_name,
                                                 l_vec_reg_acc_start + l_m + (l_m_blocking * l_n),
                                                 l_vec_reg_acc_start + l_m + (l_m_blocking * l_n),
                                                 l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
      }
#if 0
      /* NOTE(review): same ill-formed "l_m += l_m++" as above */
      if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_CL2 ||
           i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2CL2BL2_VIA_C ) {
        for (l_m = 0; l_m < l_m_blocking; l_m += l_m++ ) {
          libxsmm_x86_instruction_prefetch( io_generated_code,
                                            i_micro_kernel_config->prefetch_instruction,
                                            i_gp_reg_mapping->gp_reg_c_prefetch,
                                            LIBXSMM_X86_GP_REG_UNDEF, 0,
                                            ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size));
        }
      }
#endif
    }
  }
}

/**
 * Stores the C accumulator register block back to memory, down-converting
 * for BF16/I8 output and honoring the non-temporal-store hint flag.
 */
LIBXSMM_API_INTERN void libxsmm_generator_gemm_store_C( libxsmm_generated_code*            io_generated_code,
                                                        const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                                        const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                                        const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                                        const unsigned int                 i_m_blocking,
                                                        const unsigned int                 i_n_blocking ) {
  /* deriving register blocking from kernel config */
  unsigned int l_m_blocking = ( i_m_blocking % i_micro_kernel_config->vector_length == 0 ) ?
i_m_blocking/i_micro_kernel_config->vector_length : (i_m_blocking/i_micro_kernel_config->vector_length)+1; /* register blocking counter in n */ unsigned int l_n = 0; /* register blocking counter in m */ unsigned int l_m = 0; /* start register of accumulator */ unsigned int l_vec_reg_acc_start = i_micro_kernel_config->vector_reg_count - (i_n_blocking * l_m_blocking); /* select store instruction */ unsigned int l_vstore = (LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT == (LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT & i_xgemm_desc->flags)) ? i_micro_kernel_config->c_vmove_nts_instruction : i_micro_kernel_config->c_vmove_instruction; /* @TODO fix this test */ #if !defined(NDEBUG) if (i_micro_kernel_config->instruction_set == LIBXSMM_X86_SSE3 || i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX || i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX2 ) { if ( (i_n_blocking > 3) || (i_n_blocking < 1) || (i_m_blocking < 1) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_REG_BLOCK ); return; } } else if ( i_micro_kernel_config->instruction_set < LIBXSMM_X86_AVX512_CORE ) { if ( (i_n_blocking > 30) || (i_n_blocking < 1) || (i_m_blocking != i_micro_kernel_config->vector_length) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_REG_BLOCK ); return; } } else if ( i_micro_kernel_config->instruction_set >= LIBXSMM_X86_AVX512_CORE ) { if ( (i_n_blocking > 30) || (i_n_blocking < 1) || (l_m_blocking < 1) || (l_m_blocking > 6) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_REG_BLOCK ); return; } } else {} #if 0 if ( i_m_blocking % i_micro_kernel_config->vector_length != 0 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_M_BLOCK ); return; } #endif #endif if ( ( (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_CORE) || (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_CLX) ) && ( (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) && (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_OUT( 
i_xgemm_desc->datatype ) ) ) ) { /* init stack with helper variables for SW-based RNE rounding */ /* push 0x7f800000 on the stack, naninf masking */ libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_help_2, 0x7f800000); libxsmm_x86_instruction_push_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_2 ); /* push 0x00010000 on the stack, fixup masking */ libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_help_2, 0x00010000); libxsmm_x86_instruction_push_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_2 ); /* push 0x00007fff on the stack, rneadd */ libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_help_2, 0x00007fff); libxsmm_x86_instruction_push_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_2 ); /* push 0x00000001 on the stack, fixup */ libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_help_2, 0x00000001); libxsmm_x86_instruction_push_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_2 ); /* storing downconverted and rounded C accumulator */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { for ( l_m = 0; l_m < l_m_blocking; l_m++ ) { unsigned int reg_X = l_vec_reg_acc_start + l_m + (l_m_blocking * l_n); /* and with naninf */ libxsmm_x86_instruction_vec_compute_mem( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPANDD, 1, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, 24, i_micro_kernel_config->vector_name, reg_X, 0 ); /* and with fixup */ libxsmm_x86_instruction_vec_compute_mem( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPANDD, 1, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, 16, i_micro_kernel_config->vector_name, reg_X, 1 ); /* compute naninf mask k7 */ libxsmm_x86_instruction_vec_compute_mem_mask( io_generated_code, i_micro_kernel_config->instruction_set, 
LIBXSMM_X86_INSTR_VPCMPD, 1, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, 24, i_micro_kernel_config->vector_name, 0, LIBXSMM_X86_VEC_REG_UNDEF, 4, 7, 0 ); /* compute fixup mask k6 */ libxsmm_x86_instruction_vec_compute_mem_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPCMPD, 1, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, 16, i_micro_kernel_config->vector_name, 1, LIBXSMM_X86_VEC_REG_UNDEF, 0, 6, 0 ); /* load rneadd */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBROADCASTSS, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, 8, i_micro_kernel_config->vector_name, 0, 0, 1, 0 ); /* load fixup */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBROADCASTSS, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 1, 0, 1, 0 ); /* compute fixup */ libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPADDD, i_micro_kernel_config->vector_name, 1, 0, 0, LIBXSMM_X86_IMM_UNDEF, 6, 0 ); /* compute fixup */ libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPADDD, i_micro_kernel_config->vector_name, 0, reg_X, reg_X, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); /* shift FP32 by 16bit to right */ libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPSRAD, i_micro_kernel_config->vector_name, reg_X, reg_X, LIBXSMM_X86_VEC_REG_UNDEF, 16); /* shift FP32 by 16bit to right */ libxsmm_x86_instruction_vec_compute_convert( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPMOVDW, i_micro_kernel_config->vector_name, reg_X, LIBXSMM_X86_VEC_REG_UNDEF, 0, LIBXSMM_X86_VEC_REG_UNDEF); /* store 16 bit values into ymm portion of the register */ if ( 
(i_micro_kernel_config->use_masking_a_c != 0) && ( l_m == (l_m_blocking - 1) ) ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMOVDQU16, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size/2), 'z', 0, 2, 0, 1 ); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, l_vstore, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size/2), 'y', 0, 0, 0, 1 ); } } } /* clean stack and restore help5 */ libxsmm_x86_instruction_pop_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_2 ); libxsmm_x86_instruction_pop_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_2 ); libxsmm_x86_instruction_pop_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_2 ); libxsmm_x86_instruction_pop_reg( io_generated_code, i_gp_reg_mapping->gp_reg_help_2 ); } else if ( ( (i_micro_kernel_config->instruction_set <= LIBXSMM_X86_ALLFEAT) && (i_micro_kernel_config->instruction_set >= LIBXSMM_X86_AVX512_CPX) ) && ( (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) && (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) ) ) { /* storing downconverted and rounded C accumulator */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { unsigned int l_m_2_blocking = (l_m_blocking/2)*2; l_m = 0; if ( i_micro_kernel_config->use_masking_a_c != 0 ) { for ( l_m = 0 ; l_m < l_m_blocking; l_m++ ) { unsigned int reg_X = l_vec_reg_acc_start + l_m + (l_m_blocking * l_n); libxsmm_x86_instruction_vec_compute_convert( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VCVTNEPS2BF16, i_micro_kernel_config->vector_name, reg_X, LIBXSMM_X86_VEC_REG_UNDEF, 0, 0); /* store 16 bit values into ymm 
portion of the register */ if ( l_m == (l_m_blocking - 1) ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMOVDQU16, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size/2), 'z', 0, 2, 0, 1 ); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, l_vstore, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size/2), 'y', 0, 0, 0, 1 ); } } } else { for (; l_m < l_m_2_blocking; l_m+=2 ) { unsigned int reg_X = l_vec_reg_acc_start + l_m + (l_m_blocking * l_n); unsigned int reg_X2 = l_vec_reg_acc_start + l_m+1 + (l_m_blocking * l_n); libxsmm_x86_instruction_vec_compute_convert( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VCVTNE2PS2BF16, i_micro_kernel_config->vector_name, reg_X, reg_X2, 0, 0); libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, l_vstore, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size/2), 'z', 0, 0, 0, 1 ); } for (; l_m < l_m_blocking; l_m++ ) { unsigned int reg_X = l_vec_reg_acc_start + l_m + (l_m_blocking * l_n); libxsmm_x86_instruction_vec_compute_convert( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VCVTNEPS2BF16, i_micro_kernel_config->vector_name, reg_X, LIBXSMM_X86_VEC_REG_UNDEF, 0, 0); libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, l_vstore, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size/2), 'y', 0, 0, 
0, 1 ); } } } } else if ( ( (i_micro_kernel_config->instruction_set <= LIBXSMM_X86_ALLFEAT) || (i_micro_kernel_config->instruction_set >= LIBXSMM_X86_AVX512_CORE) ) && ( (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) && (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) ) ) { /* pick the right instrucitons */ unsigned int inst_f32_i32 = ( ( i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_C_UNSIGNED ) != 0 ) ? LIBXSMM_X86_INSTR_VCVTPS2UDQ : LIBXSMM_X86_INSTR_VCVTPS2DQ; unsigned int inst_i32_i8 = ( ( i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_C_UNSIGNED ) != 0 ) ? LIBXSMM_X86_INSTR_VPMOVUSDB : LIBXSMM_X86_INSTR_VPMOVSDB; /* there are case where we need to load the scaling factor's address from the stack argument list */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET) != 0 ) { libxsmm_x86_instruction_load_arg_to_reg( io_generated_code, 0, i_gp_reg_mapping->gp_reg_scf ); } /* loading scf into register 3 */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBROADCASTSS, i_gp_reg_mapping->gp_reg_scf, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 3, 0, 1, 0 ); /* Zero out register 0 to perform relu */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vxor_instruction, i_micro_kernel_config->vector_name, 0, 0, 0); /* storing downconverted and rounded C accumulator */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { for ( l_m = 0; l_m < l_m_blocking; l_m++ ) { unsigned int reg_X = l_vec_reg_acc_start + l_m + (l_m_blocking * l_n); /* Convert result to F32 */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VCVTDQ2PS, i_micro_kernel_config->vector_name, reg_X, reg_X, LIBXSMM_X86_VEC_REG_UNDEF); /* Multiply with scaling factor */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, 
i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, reg_X, 3, reg_X ); /* Perform RELU */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMAXPS, i_micro_kernel_config->vector_name, reg_X, 0, reg_X); /* Round result to int32 */ libxsmm_x86_instruction_vec_compute_convert( io_generated_code, i_micro_kernel_config->instruction_set, inst_f32_i32, i_micro_kernel_config->vector_name, reg_X, LIBXSMM_X86_VEC_REG_UNDEF, reg_X, 0); /* down-convert to int8 */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, inst_i32_i8, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size/4), i_micro_kernel_config->vector_name, reg_X, ( ( l_m == (l_m_blocking - 1)) && ( i_micro_kernel_config->use_masking_a_c != 0 ) ) ? 2 : 0, 0, 1 ); } } } else { /* storing C accumulator */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { for ( l_m = 0; l_m < l_m_blocking; l_m++ ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, l_vstore, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size), i_micro_kernel_config->vector_name, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), ( l_m == (l_m_blocking - 1) ) ? 
i_micro_kernel_config->use_masking_a_c : 0, 0, 1 ); } if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_BL2_VIA_C || i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C || i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD ) { if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) == 0 ) { /* determining how many prefetches we need in M direction as we just need one prefetch per cache line */ unsigned int l_m_advance = 64 / ((i_micro_kernel_config->vector_length) * (i_micro_kernel_config->datatype_size)); /* 64: hardcoded cache line length */ for (l_m = 0; l_m < l_m_blocking; l_m += l_m_advance ) { libxsmm_x86_instruction_prefetch( io_generated_code, i_micro_kernel_config->prefetch_instruction, i_gp_reg_mapping->gp_reg_b_prefetch, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size)); } } } } } } LIBXSMM_API_INTERN void libxsmm_generator_gemm_initialize_avx512_mask( libxsmm_generated_code* io_generated_code, const unsigned int i_gp_reg_tmp, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_mask_count ) { unsigned int l_mask; /* init full mask */ if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { l_mask = 0xff; } else { l_mask = 0xffff; } /* shift right by "inverse" remainder */ l_mask = l_mask >> i_mask_count; /* move mask to GP register */ libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_tmp, l_mask ); if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_x86_instruction_mask_move( io_generated_code, LIBXSMM_X86_INSTR_KMOVW, i_gp_reg_tmp, LIBXSMM_X86_AVX512_MASK, 0 ); if ( ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) && ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) ) { libxsmm_x86_instruction_mask_move( io_generated_code, 
LIBXSMM_X86_INSTR_KMOVD, i_gp_reg_tmp, 2, 0 ); } else if ( ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) && ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) ) { libxsmm_x86_instruction_mask_move( io_generated_code, LIBXSMM_X86_INSTR_KMOVQ, i_gp_reg_tmp, 2, 0 ); } else { /* no addtional mask is needed */ } } else { /* shouldn't happen */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); return; } } libxsmm-1.17/src/generator_gemm_common.h000066400000000000000000000211021415223013700204120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_GEMM_COMMON_H #define GENERATOR_GEMM_COMMON_H #include "generator_common.h" LIBXSMM_API_INTERN void libxsmm_generator_gemm_init_micro_kernel_config_fullvector( libxsmm_micro_kernel_config* io_micro_kernel_config, const unsigned int i_arch, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_use_masking_a_c ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_init_micro_kernel_config_halfvector( libxsmm_micro_kernel_config* io_micro_kernel_config, const unsigned int i_arch, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_use_masking_a_c ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_init_micro_kernel_config_scalar( libxsmm_micro_kernel_config* io_micro_kernel_config, const unsigned int i_arch, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_use_masking_a_c ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_add_flop_counter( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_header_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const unsigned int i_m_blocking, const unsigned int i_k_blocking ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_footer_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_max_blocked_k, const unsigned int i_kloop_complete ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_header_reduceloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const 
libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config); LIBXSMM_API_INTERN void libxsmm_generator_gemm_footer_reduceloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc); LIBXSMM_API_INTERN void libxsmm_generator_gemm_header_nloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const unsigned int i_n_blocking ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_footer_nloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_n_blocking, const unsigned int i_n_done ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_header_mloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const unsigned int i_m_blocking ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_footer_mloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_m_done ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_load_C( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const 
unsigned int i_m_blocking, const unsigned int i_n_blocking ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_store_C( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_n_blocking ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_initialize_avx512_mask( libxsmm_generated_code* io_generated_code, const unsigned int i_gp_reg_tmp, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_mask_count ); #endif /* GENERATOR_GEMM_COMMON_H */ libxsmm-1.17/src/generator_gemm_noarch.c000066400000000000000000000075641415223013700204070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "generator_gemm_noarch.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_generator_gemm_noarch_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc ) { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_m = 0;\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_n = 0;\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_k = 0;\n\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_n = 0; l_n < %u; l_n++ ) {\n", (unsigned int)i_xgemm_desc->n); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */ if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++ ) { C[(l_n*%u)+l_m] = 0.0; }\n\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++ ) { C[(l_n*%u)+l_m] = 0.0f; }\n\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_k = 0; l_k < %u; l_k++ ) {\n", (unsigned int)i_xgemm_desc->k); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = 
LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++ ) {\n", (unsigned int)i_xgemm_desc->m); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " C[(l_n*%u)+l_m] += A[(l_k*%u)+l_m] * B[(l_n*%u)+l_k];\n", (unsigned int)i_xgemm_desc->ldc, (unsigned int)i_xgemm_desc->lda, (unsigned int)i_xgemm_desc->ldb); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } libxsmm-1.17/src/generator_gemm_noarch.h000066400000000000000000000021101415223013700203720ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_GEMM_NOARCH_H #define GENERATOR_GEMM_NOARCH_H #include "generator_common.h" LIBXSMM_API_INTERN void libxsmm_generator_gemm_noarch_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc ); #endif /* GENERATOR_GEMM_NOARCH_H */ libxsmm-1.17/src/generator_gemm_sse3_avx_avx2_avx512.c000066400000000000000000001362071415223013700227330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "generator_common.h" #include "generator_x86_instructions.h" #include "generator_gemm_common.h" #include "generator_gemm_sse3_avx_avx2_avx512.h" #include "generator_gemm_sse3_microkernel.h" #include "generator_gemm_avx_microkernel.h" #include "generator_gemm_avx2_microkernel.h" #include "generator_gemm_avx512_microkernel.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_generator_gemm_sse3_avx_avx2_avx512_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc ) { libxsmm_micro_kernel_config l_micro_kernel_config; libxsmm_loop_label_tracker l_loop_label_tracker; libxsmm_gp_reg_mapping l_gp_reg_mapping; /* initialize n-blocking */ unsigned int l_n_count = 0; /* array counter for blocking arrays */ unsigned int l_n_done = 0; /* progress tracker */ unsigned int l_n_n[2] = {0,0}; /* blocking sizes for blocks */ unsigned int l_n_N[2] = {0,0}; /* size of blocks */ unsigned int adjust_A_pf_ptrs 
= 0; unsigned int adjust_B_pf_ptrs = 0; /* Make sure we properly adjust A,B prefetch pointers in case of batch-reduce gemm kernel */ if (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS) { if ( i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2 || i_xgemm_desc->prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C ) { adjust_A_pf_ptrs = 1; } } /* define gp register mapping */ libxsmm_reset_x86_gp_reg_mapping( &l_gp_reg_mapping ); #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_R8; /* TODO: full support for Windows calling convention */ l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_UNDEF; #else /* match calling convention on Linux */ l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8; if ( (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) && (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype )) ) { l_gp_reg_mapping.gp_reg_scf = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R9; } else { l_gp_reg_mapping.gp_reg_scf = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8; } /* If we are generating the batchreduce kernel, then we rename the registers */ if ((i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS) || (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE)) { l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI; 
l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_reduce_count = LIBXSMM_X86_GP_REG_RCX; if ( (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) && (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype )) ) { l_gp_reg_mapping.gp_reg_scf = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_R9; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_UNDEF; } else { l_gp_reg_mapping.gp_reg_scf = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R9; } l_gp_reg_mapping.gp_reg_reduce_loop = LIBXSMM_X86_GP_REG_R13; l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_R14; } else if (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET) { l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_reduce_count = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_a_offset = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_b_offset = LIBXSMM_X86_GP_REG_R9; if ( (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) && (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype )) ) { l_gp_reg_mapping.gp_reg_scf = LIBXSMM_X86_GP_REG_RAX; } else { l_gp_reg_mapping.gp_reg_scf = LIBXSMM_X86_GP_REG_UNDEF; } l_gp_reg_mapping.gp_reg_reduce_loop = LIBXSMM_X86_GP_REG_R13; l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_R14; } #endif l_gp_reg_mapping.gp_reg_mloop = LIBXSMM_X86_GP_REG_R10; l_gp_reg_mapping.gp_reg_nloop = LIBXSMM_X86_GP_REG_R11; l_gp_reg_mapping.gp_reg_kloop = LIBXSMM_X86_GP_REG_R12; l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_R15; l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_RBX; /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* define the micro kernel code gen 
properties */ libxsmm_generator_gemm_init_micro_kernel_config_fullvector( &l_micro_kernel_config, io_generated_code->arch, i_xgemm_desc, 0 ); /* block according to the number of available registers or given limits */ if ( i_xgemm_desc->n == 7 && io_generated_code->arch >= LIBXSMM_X86_AVX512_CORE && io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) { libxsmm_compute_equalized_blocking( i_xgemm_desc->n, 7, &(l_n_N[0]), &(l_n_n[0]), &(l_n_N[1]), &(l_n_n[1]) ); } else { unsigned int max_n_blocking = libxsmm_generator_gemm_sse3_avx_avx2_avx512_get_max_n_blocking( &l_micro_kernel_config, i_xgemm_desc, io_generated_code->arch ); #if 1 if (3 < max_n_blocking) #endif { const unsigned int init_m_blocking = libxsmm_generator_gemm_sse3_avx_avx2_avx512_get_initial_m_blocking( &l_micro_kernel_config, i_xgemm_desc, io_generated_code->arch ); const unsigned int init_m_blocks = LIBXSMM_UPDIV(init_m_blocking, l_micro_kernel_config.vector_length); while ((init_m_blocks * max_n_blocking + init_m_blocks + 1) > l_micro_kernel_config.vector_reg_count) { max_n_blocking--; } } libxsmm_compute_equalized_blocking( i_xgemm_desc->n, max_n_blocking, &(l_n_N[0]), &(l_n_n[0]), &(l_n_N[1]), &(l_n_n[1]) ); } /* check that l_n_N1 is non-zero */ if ( l_n_N[0] == 0 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK ); return; } /* open asm */ libxsmm_x86_instruction_open_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch ); /* Load the actual batch-reduce trip count */ if ((i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS) || (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET) || (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE)) { libxsmm_x86_instruction_alu_mem( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_reduce_count, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, l_gp_reg_mapping.gp_reg_reduce_count, 0 ); } /* apply n_blocking */ while (l_n_done != (unsigned int)i_xgemm_desc->n) { unsigned int l_n_blocking 
= l_n_n[l_n_count]; unsigned int l_m_done = 0; unsigned int l_m_done_old = 0; unsigned int l_m_blocking = 0; /* advance N */ l_n_done += l_n_N[l_n_count]; l_n_count++; /* open N loop */ libxsmm_generator_gemm_header_nloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, l_n_blocking ); /* define the micro kernel code gen properties, especially m-blocking affects the vector instruction length */ l_m_blocking = libxsmm_generator_gemm_sse3_avx_avx2_avx512_get_initial_m_blocking( &l_micro_kernel_config, i_xgemm_desc, io_generated_code->arch ); /* apply m_blocking */ while (l_m_done != (unsigned int)i_xgemm_desc->m) { if ( l_m_blocking == 0 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_M_BLOCK ); return; } if (l_m_done == 0) { /* This is a SeisSol Order 6, HSW, DP performance fix */ if ( ( io_generated_code->arch == LIBXSMM_X86_AVX2 ) && ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) { l_m_done_old = l_m_done; if (i_xgemm_desc->m == 56) { l_m_done = 32; } else { LIBXSMM_ASSERT(0 != l_m_blocking); /* coverity[divide_by_zero] */ l_m_done = l_m_done + (((i_xgemm_desc->m - l_m_done_old) / l_m_blocking) * l_m_blocking); } } else { l_m_done_old = l_m_done; LIBXSMM_ASSERT(0 != l_m_blocking); /* coverity[divide_by_zero] */ l_m_done = l_m_done + (((i_xgemm_desc->m - l_m_done_old) / l_m_blocking) * l_m_blocking); } } else { l_m_done_old = l_m_done; LIBXSMM_ASSERT(0 != l_m_blocking); /* coverity[divide_by_zero] */ l_m_done = l_m_done + (((i_xgemm_desc->m - l_m_done_old) / l_m_blocking) * l_m_blocking); } if ( (l_m_done != l_m_done_old) && (l_m_done > 0) ) { /* when on AVX512, load mask, if needed */ if ( ( l_micro_kernel_config.use_masking_a_c != 0 ) && ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { /* compute the mask count, depends on vlen as block in M */ unsigned int l_corrected_vlen = l_micro_kernel_config.vector_length; unsigned int 
l_mask_count = l_corrected_vlen - ( l_m_blocking % l_corrected_vlen ); libxsmm_generator_gemm_initialize_avx512_mask( io_generated_code, l_gp_reg_mapping.gp_reg_help_1, i_xgemm_desc, l_mask_count ); } libxsmm_generator_gemm_header_mloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, l_m_blocking ); libxsmm_generator_gemm_load_C( io_generated_code, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, l_m_blocking, l_n_blocking ); if ((i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS) || (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET) || (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE)) { if ((i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET) || (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE)) { libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_b); libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_a); } /* This is the reduce loop */ libxsmm_generator_gemm_header_reduceloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config ); if (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS) { libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_a); libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_b); if (adjust_A_pf_ptrs) { /* coverity[dead_error_line] */ libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_a_prefetch ); } if (adjust_B_pf_ptrs) { libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_b_prefetch ); } /* load to reg_a the proper array based on the reduce loop index */ libxsmm_x86_instruction_alu_mem( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_a, l_gp_reg_mapping.gp_reg_reduce_loop, 8, 0, l_gp_reg_mapping.gp_reg_a, 0 ); /* load to reg_b the proper array based on the reduce loop index */ libxsmm_x86_instruction_alu_mem( 
io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_b, l_gp_reg_mapping.gp_reg_reduce_loop, 8, 0, l_gp_reg_mapping.gp_reg_b, 0 ); if (adjust_A_pf_ptrs) { libxsmm_x86_instruction_alu_mem( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_a_prefetch, l_gp_reg_mapping.gp_reg_reduce_loop, 8, 0, l_gp_reg_mapping.gp_reg_a_prefetch, 0 ); } if (adjust_B_pf_ptrs) { /* coverity[dead_error_line] */ libxsmm_x86_instruction_alu_mem( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_b_prefetch, l_gp_reg_mapping.gp_reg_reduce_loop, 8, 0, l_gp_reg_mapping.gp_reg_b_prefetch, 0 ); } } else if (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET) { libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_a); libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_b); libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_b); libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_a); /* Calculate to reg_a the proper address based on the reduce loop index */ libxsmm_x86_instruction_alu_mem( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_a_offset, l_gp_reg_mapping.gp_reg_reduce_loop, 8, 0, l_gp_reg_mapping.gp_reg_help_0, 0 ); libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_a); /* Calculate to reg_b the proper address based on the reduce loop index */ libxsmm_x86_instruction_alu_mem( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_b_offset, l_gp_reg_mapping.gp_reg_reduce_loop, 8, 0, l_gp_reg_mapping.gp_reg_help_0, 0 ); libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_b); } else if (i_xgemm_desc->flags & 
LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE) { libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_a); libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_b); libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_b); libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_a); /* Calculate to reg_a the proper address based on the reduce loop index */ libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_reduce_loop, l_gp_reg_mapping.gp_reg_help_0); libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_IMUL, l_gp_reg_mapping.gp_reg_help_0, i_xgemm_desc->c1); libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_a); /* Calculate to reg_b the proper address based on the reduce loop index */ libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_reduce_loop, l_gp_reg_mapping.gp_reg_help_0); libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_IMUL, l_gp_reg_mapping.gp_reg_help_0, i_xgemm_desc->c2); libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_b); } } libxsmm_generator_gemm_sse3_avx_avx2_avx512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, l_m_blocking, l_n_blocking ); if ((i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS) || (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET) || (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE)) { if (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS) { if (adjust_B_pf_ptrs) { /* coverity[dead_error_begin] */ libxsmm_x86_instruction_pop_reg( io_generated_code, 
l_gp_reg_mapping.gp_reg_help_0); libxsmm_x86_instruction_alu_mem( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_reduce_loop, 8, 0, l_gp_reg_mapping.gp_reg_b_prefetch, 1 ); libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_b_prefetch); } if (adjust_A_pf_ptrs) { libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_help_0); libxsmm_x86_instruction_alu_mem( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_reduce_loop, 8, 0, l_gp_reg_mapping.gp_reg_a_prefetch, 1 ); libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_a_prefetch); } /* Pop address of B_array to help_0 and store proper address of B */ libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_help_0); libxsmm_x86_instruction_alu_mem( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_reduce_loop, 8, 0, l_gp_reg_mapping.gp_reg_b, 1 ); /* Move to reg_b the address of B_array */ libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_b); /* Pop address of A_array to help_0 and store proper address of A */ libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_help_0); libxsmm_x86_instruction_alu_mem( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_reduce_loop, 8, 0, l_gp_reg_mapping.gp_reg_a, 1 ); /* Move to reg_a the address of A_array */ libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_help_0, 
l_gp_reg_mapping.gp_reg_a); } libxsmm_generator_gemm_footer_reduceloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc); if (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET) { /* Calculate to reg_a the proper A advance form the microkernel */ libxsmm_x86_instruction_alu_mem( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_a_offset, l_gp_reg_mapping.gp_reg_reduce_loop, 8, -8, l_gp_reg_mapping.gp_reg_help_0, 0 ); libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_a); /* Calculate to reg_b the proper B advance form the microkernel */ libxsmm_x86_instruction_alu_mem( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_b_offset, l_gp_reg_mapping.gp_reg_reduce_loop, 8, -8, l_gp_reg_mapping.gp_reg_help_0, 0 ); libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_b); /* Consume the last two pushes form the stack */ libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_help_0); libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_help_0); } if (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE) { /* Calculate to reg_a the proper A advance form the microkernel */ libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_reduce_count, l_gp_reg_mapping.gp_reg_help_0); libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_IMUL, l_gp_reg_mapping.gp_reg_help_0, i_xgemm_desc->c1); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_help_0, i_xgemm_desc->c1); libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_sub_instruction, 
l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_a); /* Calculate to reg_b the proper B advance form the microkernel */ libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_reduce_count, l_gp_reg_mapping.gp_reg_help_0); libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_IMUL, l_gp_reg_mapping.gp_reg_help_0, i_xgemm_desc->c2); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_help_0, i_xgemm_desc->c2); libxsmm_x86_instruction_alu_reg( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_help_0, l_gp_reg_mapping.gp_reg_b); /* Consume the last two pushes form the stack */ libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_help_0); libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_help_0); } } libxsmm_generator_gemm_store_C( io_generated_code, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, l_m_blocking, l_n_blocking ); libxsmm_generator_gemm_footer_mloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, l_m_blocking, l_m_done ); } /* switch to next smaller m_blocking */ l_m_blocking = libxsmm_generator_gemm_sse3_avx_avx2_avx512_update_m_blocking( &l_micro_kernel_config, i_xgemm_desc, io_generated_code->arch, l_m_blocking ); } libxsmm_generator_gemm_footer_nloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, l_n_blocking, l_n_done ); } /* close asm */ libxsmm_x86_instruction_close_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch ); } LIBXSMM_API_INTERN void libxsmm_generator_gemm_sse3_avx_avx2_avx512_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* 
i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc,
    const unsigned int i_m_blocking, const unsigned int i_n_blocking ) {
  /* k-loop generator: selects a k-blocking strategy and emits the arch-specific
   * compute microkernel(s) for one (m,n) register block. */
  /* function pointer to the architecture-dependent compute microkernel generator chosen below */
  void (*l_generator_microkernel)(libxsmm_generated_code*, const libxsmm_gp_reg_mapping*, const libxsmm_micro_kernel_config*,
                                  const libxsmm_gemm_descriptor*, const unsigned int, const unsigned int, const int);
  /* some hard coded parameters for k-blocking */
  unsigned int l_k_blocking = 0;
  unsigned int l_k_threshold = 0;
  /* calculate m_blocking such that we choose the right AVX512 kernel (rounded up to full vector registers) */
  unsigned int l_m_vector = ( i_m_blocking % i_micro_kernel_config->vector_length == 0 ) ? i_m_blocking/i_micro_kernel_config->vector_length : (i_m_blocking/i_micro_kernel_config->vector_length)+1;

  /* in case of 1d blocking and KNL/KNM we unroll aggressively */
  if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_AVX512_KNM ) && ( l_m_vector == 1 ) ) {
    l_k_blocking = 16;
    l_k_threshold = 47;
  } else {
    l_k_blocking = 4;
    l_k_threshold = 23;
  }

  /* set up architecture dependent compute micro kernel generator */
  if ( io_generated_code->arch < LIBXSMM_X86_SSE3 ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH );
    return;
  } else if ( io_generated_code->arch <= LIBXSMM_X86_SSE4 ) {
    l_generator_microkernel = libxsmm_generator_gemm_sse3_microkernel;
  } else if ( io_generated_code->arch == LIBXSMM_X86_AVX ) {
    l_generator_microkernel = libxsmm_generator_gemm_avx_microkernel;
  } else if ( io_generated_code->arch == LIBXSMM_X86_AVX2 ) {
    l_generator_microkernel = libxsmm_generator_gemm_avx2_microkernel;
  } else if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) {
    l_generator_microkernel = libxsmm_generator_gemm_avx512_microkernel_nofsdbcst;
  } else {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH );
    return;
  }

  /* apply multiple k_blocking strategies */
  /* 1. we are larger than the k_threshold and a multiple of a predefined blocking parameter */
  if ((i_xgemm_desc->k % l_k_blocking) == 0 && (l_k_threshold < (unsigned int)i_xgemm_desc->k)) {
    unsigned int l_k;
    libxsmm_generator_gemm_header_kloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_m_blocking, l_k_blocking);

    if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( l_m_vector == 1 ) ) {
      /* KNM gets the QFMA variant of the fused-broadcast kernel, all other AVX512 the plain one */
      if ( io_generated_code->arch != LIBXSMM_X86_AVX512_KNM ) {
        libxsmm_generator_gemm_avx512_microkernel_fsdbcst( io_generated_code, i_gp_reg_mapping, i_micro_kernel_config,
                                                           i_xgemm_desc, i_n_blocking, l_k_blocking );
      } else {
        libxsmm_generator_gemm_avx512_microkernel_fsdbcst_qfma( io_generated_code, i_gp_reg_mapping, i_micro_kernel_config,
                                                                i_xgemm_desc, i_n_blocking, l_k_blocking );
      }
    } else {
      /* i_offset of -1: the microkernel advances the B pointer itself instead of using an absolute k offset */
      for ( l_k = 0; l_k < l_k_blocking; l_k++) {
        l_generator_microkernel(io_generated_code, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_m_blocking, i_n_blocking, -1);
      }
    }

    libxsmm_generator_gemm_footer_kloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_m_blocking, i_xgemm_desc->k, 1 );
  } else {
    /* 2. we want to fully unroll below the threshold */
    if ((unsigned int)i_xgemm_desc->k <= l_k_threshold) {
      unsigned int l_k;
      if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( l_m_vector == 1 ) ) {
        if ( io_generated_code->arch != LIBXSMM_X86_AVX512_KNM ) {
          libxsmm_generator_gemm_avx512_microkernel_fsdbcst( io_generated_code, i_gp_reg_mapping, i_micro_kernel_config,
                                                             i_xgemm_desc, i_n_blocking, (unsigned int)i_xgemm_desc->k );
        } else {
          libxsmm_generator_gemm_avx512_microkernel_fsdbcst_qfma( io_generated_code, i_gp_reg_mapping, i_micro_kernel_config,
                                                                  i_xgemm_desc, i_n_blocking, (unsigned int)i_xgemm_desc->k );
        }
      } else {
        /* fully unrolled: pass the absolute k index so B can be addressed via immediate offsets */
        for ( l_k = 0; l_k < (unsigned int)i_xgemm_desc->k; l_k++) {
          l_generator_microkernel(io_generated_code, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_m_blocking, i_n_blocking, l_k);
        }
      }
    /* 3. we are larger than the threshold but not a multiple of the blocking factor -> largest possible blocking + remainder handling */
    } else {
      /* largest multiple of l_k_blocking not exceeding k; the rest is handled as a remainder */
      unsigned int l_max_blocked_k = ((i_xgemm_desc->k)/l_k_blocking)*l_k_blocking;
      unsigned int l_k;
      int l_b_offset = 0;
      /* we can block as k is large enough */
      if ( l_max_blocked_k > 0 ) {
        libxsmm_generator_gemm_header_kloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_m_blocking, l_k_blocking);
        if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( l_m_vector == 1 ) ) {
          if ( io_generated_code->arch != LIBXSMM_X86_AVX512_KNM ) {
            libxsmm_generator_gemm_avx512_microkernel_fsdbcst( io_generated_code, i_gp_reg_mapping, i_micro_kernel_config,
                                                               i_xgemm_desc, i_n_blocking, l_k_blocking );
          } else {
            libxsmm_generator_gemm_avx512_microkernel_fsdbcst_qfma( io_generated_code, i_gp_reg_mapping, i_micro_kernel_config,
                                                                    i_xgemm_desc, i_n_blocking, l_k_blocking );
          }
        } else {
          for ( l_k = 0; l_k < l_k_blocking; l_k++) {
            l_generator_microkernel(io_generated_code, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_m_blocking, i_n_blocking, -1);
          }
        }
        libxsmm_generator_gemm_footer_kloop(
io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_m_blocking, l_max_blocked_k, 0 ); } /* now we handle the remainder handling */ if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( l_m_vector == 1 ) ) { if ( io_generated_code->arch != LIBXSMM_X86_AVX512_KNM ) { libxsmm_generator_gemm_avx512_microkernel_fsdbcst( io_generated_code, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_n_blocking, ((unsigned int)i_xgemm_desc->k) - l_max_blocked_k ); } else { libxsmm_generator_gemm_avx512_microkernel_fsdbcst_qfma( io_generated_code, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_n_blocking, ((unsigned int)i_xgemm_desc->k) - l_max_blocked_k ); } } else { for ( l_k = l_max_blocked_k; l_k < (unsigned int)i_xgemm_desc->k; l_k++) { l_generator_microkernel(io_generated_code, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_m_blocking, i_n_blocking, -1); } } /* reset B pointer */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = i_xgemm_desc->ldb * i_xgemm_desc->k * i_micro_kernel_config->datatype_size; } else { l_b_offset = i_xgemm_desc->k * i_micro_kernel_config->datatype_size; } libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_b, l_b_offset ); } } } LIBXSMM_API_INTERN unsigned int libxsmm_generator_gemm_sse3_avx_avx2_avx512_get_initial_m_blocking( libxsmm_micro_kernel_config* io_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_arch ) { unsigned int l_use_masking_a_c = 0; unsigned int l_m_blocking = 0; if ( ( i_arch <= LIBXSMM_X86_SSE4 ) && ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) { l_m_blocking = 12; } else if ( ( i_arch <= LIBXSMM_X86_SSE4 ) && ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) { l_m_blocking = 6; } else if ( ( i_arch == LIBXSMM_X86_AVX ) && ( LIBXSMM_GEMM_PRECISION_F32 == 
LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* AVX, FP32 */
    l_m_blocking = 24;
  } else if ( ( i_arch == LIBXSMM_X86_AVX ) && ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* AVX, FP64 */
    l_m_blocking = 12;
  } else if ( ( i_arch == LIBXSMM_X86_AVX2 ) && ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* AVX2, FP32 */
    l_m_blocking = 32;
  } else if ( ( i_arch == LIBXSMM_X86_AVX2 ) && ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* AVX2, FP64 */
    l_m_blocking = 16;
  } else if ( ( (i_arch == LIBXSMM_X86_AVX512_MIC) || (i_arch == LIBXSMM_X86_AVX512_KNM) ) && ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* KNL/KNM, FP32 */
    /* @TODO check if there is a better blocking strategy */
    if ( i_xgemm_desc->m >= 16 ) {
      l_m_blocking = 16;
    } else {
      l_m_blocking = i_xgemm_desc->m;
      /* in case we don't have a full vector length, we use masking */
      if ( l_m_blocking % 16 != 0 ) {
        l_use_masking_a_c = 1;
      }
    }
  } else if ( ( (i_arch == LIBXSMM_X86_AVX512_MIC) || (i_arch == LIBXSMM_X86_AVX512_KNM) ) && ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* KNL/KNM, FP64 */
    /* @TODO check if there is a better blocking strategy */
    if ( i_xgemm_desc->m >= 8 ) {
      l_m_blocking = 8;
    } else {
      l_m_blocking = i_xgemm_desc->m;
      /* in case we don't have a full vector length, we use masking */
      if ( l_m_blocking % 8 != 0 ) {
        l_use_masking_a_c = 1;
      }
    }
  } else if ( ( i_arch <= LIBXSMM_X86_AVX512_CLX ) && ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* up to CLX, BF16 input */
    /* @TODO check if there is a better blocking strategy */
    if ( i_xgemm_desc->m >= 16 ) {
      l_m_blocking = 16;
    } else {
      l_m_blocking = i_xgemm_desc->m;
      /* in case we don't have a full vector length, we use masking */
      if ( l_m_blocking % 16 != 0 ) {
        l_use_masking_a_c = 1;
      }
    }
  } else if ( ( i_arch <= LIBXSMM_X86_AVX512_CORE ) && ( ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) || ( LIBXSMM_GEMM_PRECISION_I16 ==
LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) ) { /* @TODO check if there is a better blocking strategy */ if ( i_xgemm_desc->m >= 16 ) { l_m_blocking = 16; } else { l_m_blocking = i_xgemm_desc->m; /* in case we don't have a full vector length, we use masking */ if ( l_m_blocking % 16 != 0 ) { l_use_masking_a_c = 1; } } } else if ( ( i_arch <= LIBXSMM_X86_ALLFEAT ) && ( ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) || ( LIBXSMM_GEMM_PRECISION_I32 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) || ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) || ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) ) ) { /* Remark switching ti OUT datatype check here to cover BF16 in, Fp32/Int32 out kernel with the same logic */ /* @TODO check if there is a better blocking strategy */ if ( i_xgemm_desc->m >= 64 ) { l_m_blocking = 64; } else { l_m_blocking = i_xgemm_desc->m; /* in case we don't have a full vector length, we use masking */ if ( l_m_blocking % 16 != 0 ) { l_use_masking_a_c = 1; } } } else if ( ( i_arch <= LIBXSMM_X86_ALLFEAT ) && ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) { /* @TODO check if there is a better blocking strategy */ if ( i_xgemm_desc->m >= 32 ) { l_m_blocking = 32; } else { l_m_blocking = i_xgemm_desc->m; /* in case we don't have a full vector length, we use masking */ if ( l_m_blocking % 8 != 0 ) { l_use_masking_a_c = 1; } } } else { /* we should never end up here, if we do let the user know */ /*LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_M_BLOCK ); return 0;*/ } libxsmm_generator_gemm_init_micro_kernel_config_fullvector( io_micro_kernel_config, i_arch, i_xgemm_desc, l_use_masking_a_c ); return l_m_blocking; } LIBXSMM_API_INTERN unsigned int libxsmm_generator_gemm_sse3_avx_avx2_avx512_update_m_blocking( libxsmm_micro_kernel_config* io_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const 
unsigned int i_arch, const unsigned int i_current_m_blocking ) {
  /* Steps the m-blocking down to the next smaller value for the given arch/precision;
   * also re-initializes the micro-kernel config (scalar/halfvector/fullvector) where the
   * vector width of the generated code changes. */
  unsigned int l_use_masking_a_c = 0;
  /* 0 signals that no smaller blocking exists, i.e. the caller's m-loop is done */
  unsigned int l_m_blocking = 0;

  if ( ( i_arch <= LIBXSMM_X86_SSE4 ) && ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* SSE, FP32: 12 -> 8 -> 4 -> 1 (scalar) */
    if (i_current_m_blocking == 4) {
      l_m_blocking = 1;
      libxsmm_generator_gemm_init_micro_kernel_config_scalar( io_micro_kernel_config, i_arch, i_xgemm_desc, 0 );
    } else if (i_current_m_blocking == 8) {
      l_m_blocking = 4;
    } else if (i_current_m_blocking == 12) {
      l_m_blocking = 8;
    } else {
      /* we are done with m_blocking */
    }
  } else if ( ( i_arch <= LIBXSMM_X86_SSE4 ) && ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* SSE, FP64: 6 -> 4 -> 2 -> 1 (scalar) */
    if (i_current_m_blocking == 2) {
      l_m_blocking = 1;
      libxsmm_generator_gemm_init_micro_kernel_config_scalar( io_micro_kernel_config, i_arch, i_xgemm_desc, 0 );
    } else if (i_current_m_blocking == 4) {
      l_m_blocking = 2;
    } else if (i_current_m_blocking == 6) {
      l_m_blocking = 4;
    } else {
      /* we are done with m_blocking */
    }
  } else if ( ( i_arch == LIBXSMM_X86_AVX ) && ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* AVX, FP32: 24 -> 16 -> 8 -> 4 (halfvector) -> 1 (scalar) */
    if (i_current_m_blocking == 4) {
      l_m_blocking = 1;
      libxsmm_generator_gemm_init_micro_kernel_config_scalar( io_micro_kernel_config, i_arch, i_xgemm_desc, 0 );
    } else if (i_current_m_blocking == 8) {
      l_m_blocking = 4;
      libxsmm_generator_gemm_init_micro_kernel_config_halfvector( io_micro_kernel_config, i_arch, i_xgemm_desc, 0 );
    } else if (i_current_m_blocking == 16) {
      l_m_blocking = 8;
    } else if (i_current_m_blocking == 24) {
      l_m_blocking = 16;
    } else {
      /* we are done with m_blocking */
    }
  } else if ( ( i_arch == LIBXSMM_X86_AVX ) && ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* AVX, FP64: 12 -> 8 -> 4 -> 2 (halfvector) -> 1 (scalar) */
    if (i_current_m_blocking == 2) {
      l_m_blocking = 1;
      libxsmm_generator_gemm_init_micro_kernel_config_scalar( io_micro_kernel_config, i_arch, i_xgemm_desc, 0 );
    } else if (i_current_m_blocking == 4) {
      l_m_blocking = 2;
      libxsmm_generator_gemm_init_micro_kernel_config_halfvector( io_micro_kernel_config, i_arch, i_xgemm_desc, 0 );
    } else if (i_current_m_blocking == 8) {
      l_m_blocking = 4;
    } else if (i_current_m_blocking == 12) {
      l_m_blocking = 8;
    } else {
      /* we are done with m_blocking */
    }
  } else if ( ( i_arch == LIBXSMM_X86_AVX2 ) && ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* AVX2, FP32: 32 -> 24 -> 16 -> 8 -> 4 (halfvector) -> 1 (scalar) */
    if (i_current_m_blocking == 4) {
      l_m_blocking = 1;
      libxsmm_generator_gemm_init_micro_kernel_config_scalar( io_micro_kernel_config, i_arch, i_xgemm_desc, 0 );
    } else if (i_current_m_blocking == 8) {
      l_m_blocking = 4;
      libxsmm_generator_gemm_init_micro_kernel_config_halfvector( io_micro_kernel_config, i_arch, i_xgemm_desc, 0 );
    } else if (i_current_m_blocking == 16) {
      l_m_blocking = 8;
    } else if (i_current_m_blocking == 24) {
      l_m_blocking = 16;
    } else if (i_current_m_blocking == 32) {
      l_m_blocking = 24;
    } else {
      /* we are done with m_blocking */
    }
  } else if ( ( i_arch == LIBXSMM_X86_AVX2 ) && ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* AVX2, FP64: 16 -> 12 -> 8 -> 4 -> 2 (halfvector) -> 1 (scalar) */
    if (i_current_m_blocking == 2) {
      l_m_blocking = 1;
      libxsmm_generator_gemm_init_micro_kernel_config_scalar( io_micro_kernel_config, i_arch, i_xgemm_desc, 0 );
    } else if (i_current_m_blocking == 4) {
      l_m_blocking = 2;
      libxsmm_generator_gemm_init_micro_kernel_config_halfvector( io_micro_kernel_config, i_arch, i_xgemm_desc, 0 );
    } else if (i_current_m_blocking == 8) {
      l_m_blocking = 4;
    } else if (i_current_m_blocking == 12) {
      l_m_blocking = 8;
    } else if (i_current_m_blocking == 16) {
      l_m_blocking = 12;
    } else {
      /* we are done with m_blocking */
    }
  } else if ( ( (i_arch == LIBXSMM_X86_AVX512_MIC) || (i_arch == LIBXSMM_X86_AVX512_KNM) ) && ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* KNL/KNM, FP32: one step from the full 16-wide block to the masked remainder m % 16 */
    if (i_current_m_blocking == 16) {
      l_m_blocking = i_xgemm_desc->m % 16;
      if ( l_m_blocking % 16 != 0 ) {
        l_use_masking_a_c = 1;
      }
      libxsmm_generator_gemm_init_micro_kernel_config_fullvector( io_micro_kernel_config, i_arch, i_xgemm_desc, l_use_masking_a_c );
    } else {
      /* we are done with m_blocking */
    }
  } else if ( ( (i_arch == LIBXSMM_X86_AVX512_MIC) || (i_arch == LIBXSMM_X86_AVX512_KNM) ) && ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* KNL/KNM, FP64: one step from the full 8-wide block to the masked remainder m % 8 */
    if (i_current_m_blocking == 8) {
      l_m_blocking = i_xgemm_desc->m % 8;
      if ( l_m_blocking % 8 != 0 ) {
        l_use_masking_a_c = 1;
      }
      libxsmm_generator_gemm_init_micro_kernel_config_fullvector( io_micro_kernel_config, i_arch, i_xgemm_desc, l_use_masking_a_c );
    } else {
      /* we are done with m_blocking */
    }
  } else if ( ( i_arch <= LIBXSMM_X86_AVX512_CLX ) && ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
    /* up to CLX, BF16 input: 16 -> masked remainder m % 16 */
    if (i_current_m_blocking == 16) {
      l_m_blocking = i_xgemm_desc->m % 16;
      if ( l_m_blocking % 16 != 0 ) {
        l_use_masking_a_c = 1;
      }
      libxsmm_generator_gemm_init_micro_kernel_config_fullvector( io_micro_kernel_config, i_arch, i_xgemm_desc, l_use_masking_a_c );
    } else {
      /* we are done with m_blocking */
    }
  } else if ( ( i_arch <= LIBXSMM_X86_AVX512_CORE ) && ( ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) || ( LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) ) {
    /* up to SKX, int8/int16 input: 16 -> masked remainder m % 16 */
    if (i_current_m_blocking == 16) {
      l_m_blocking = i_xgemm_desc->m % 16;
      if ( l_m_blocking % 16 != 0 ) {
        l_use_masking_a_c = 1;
      }
      libxsmm_generator_gemm_init_micro_kernel_config_fullvector( io_micro_kernel_config, i_arch, i_xgemm_desc, l_use_masking_a_c );
    } else {
      /* we are done with m_blocking */
    }
  } else if ( ( i_arch <= LIBXSMM_X86_ALLFEAT ) && ( ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) || ( LIBXSMM_GEMM_PRECISION_I32 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) || ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) || ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_OUT( i_xgemm_desc->datatype ) ) ) ) {
    /* Remark: switching to OUT datatype check here to cover BF16 in,
Fp32 out kernel with the same logic */ if (i_current_m_blocking == 64) { l_m_blocking = i_xgemm_desc->m % 64; if ( l_m_blocking % 16 != 0 ) { l_use_masking_a_c = 1; } libxsmm_generator_gemm_init_micro_kernel_config_fullvector( io_micro_kernel_config, i_arch, i_xgemm_desc, l_use_masking_a_c ); } else { /* we are done with m_blocking */ } } else if ( ( i_arch <= LIBXSMM_X86_ALLFEAT ) && ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) { if (i_current_m_blocking == 32) { l_m_blocking = i_xgemm_desc->m % 32; if ( l_m_blocking % 8 != 0 ) { l_use_masking_a_c = 1; } libxsmm_generator_gemm_init_micro_kernel_config_fullvector( io_micro_kernel_config, i_arch, i_xgemm_desc, l_use_masking_a_c ); } else { /* we are done with m_blocking */ } } else { /* we should never end up here, if we do let the user know */ /*LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_M_BLOCK ); return 0;*/ } return l_m_blocking; } LIBXSMM_API_INTERN unsigned int libxsmm_generator_gemm_sse3_avx_avx2_avx512_get_max_n_blocking( const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_arch ) { if ( i_arch <= LIBXSMM_X86_ALLFEAT ) { if ( i_arch >= LIBXSMM_X86_AVX512 ) { /* handle KNM qmadd */ if ( ( i_arch == LIBXSMM_X86_AVX512_KNM ) && ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) { return 28; } /* handle KNM qvnni */ if ( ( i_arch == LIBXSMM_X86_AVX512_KNM ) && ( LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) { return 28; } /* handle int16 on SKX */ if ( ( i_arch == LIBXSMM_X86_AVX512_CORE ) && ( LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) { return 28; } /* handle int8 on all AVX512 */ if ( ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) { return 28; } /* handle bf16 */ if ( ( i_arch < LIBXSMM_X86_AVX512_CPX ) && ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( 
i_xgemm_desc->datatype ) ) ) { return 28; } return 30; } else { #if 1 LIBXSMM_UNUSED(i_micro_kernel_config); return 3; #else return i_micro_kernel_config->vector_reg_count - 2; #endif } } else { return 3; } } libxsmm-1.17/src/generator_gemm_sse3_avx_avx2_avx512.h000066400000000000000000000067741415223013700227450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_GEMM_SSE3_AVX_AVX2_AVX512_H #define GENERATOR_GEMM_SSE3_AVX_AVX2_AVX512_H #include "generator_common.h" #include "generator_gemm_common.h" LIBXSMM_API_INTERN void libxsmm_generator_gemm_sse3_avx_avx2_avx512_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc ); LIBXSMM_API_INTERN void libxsmm_generator_gemm_sse3_avx_avx2_avx512_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_n_blocking ); LIBXSMM_API_INTERN unsigned int libxsmm_generator_gemm_sse3_avx_avx2_avx512_get_initial_m_blocking( libxsmm_micro_kernel_config* io_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_arch ); LIBXSMM_API_INTERN unsigned int libxsmm_generator_gemm_sse3_avx_avx2_avx512_update_m_blocking( libxsmm_micro_kernel_config* io_micro_kernel_config, const 
libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_arch, const unsigned int i_current_m_blocking ); LIBXSMM_API_INTERN unsigned int libxsmm_generator_gemm_sse3_avx_avx2_avx512_get_max_n_blocking( const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_arch ); #endif /* GENERATOR_GEMM_SSE3_AVX_AVX2_AVX512_H */ libxsmm-1.17/src/generator_gemm_sse3_microkernel.c000066400000000000000000000504461415223013700224010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "generator_gemm_sse3_microkernel.h" #include "generator_x86_instructions.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_generator_gemm_sse3_microkernel( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_n_blocking, const int i_offset ) { /* deriving register blocking from kernel config */ unsigned int l_m_blocking = i_m_blocking/i_micro_kernel_config->vector_length; /* register blocking counter in n */ unsigned int l_n = 0; /* register blocking counter in m */ unsigned int l_m = 0; /* start register of accumulator */ unsigned int l_vec_reg_acc_start = 16 - (i_n_blocking * l_m_blocking); /* temp variable for b-offset to handle no-trans/trans B */ int l_b_offset = 0; /* check that m_blocking is a multiple of 
vlen and that n_blocking is valid */ if ( (i_n_blocking > 3) || (i_n_blocking < 1) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK ); return; } if ( i_m_blocking % i_micro_kernel_config->vector_length != 0 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_M_BLOCK ); return; } if (l_m_blocking == 1) { /* load column vectors of A */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, i_n_blocking, 0, 1, 0 ); /* loop over columns of B */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* post increment of a pointer early */ if ( l_n == 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) ); } /* different ways of using B */ if ( i_offset != (-1) ) { /* handle trans B */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = (i_micro_kernel_config->datatype_size * i_offset * i_xgemm_desc->ldb) + (l_n * i_micro_kernel_config->datatype_size); } else { l_b_offset = (i_micro_kernel_config->datatype_size * i_offset) + (i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size); } libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, l_n, 0, 1, 0 ); /* generate shuffle as SSE3 has no broadcast load for single precision */ if ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) && ( i_micro_kernel_config->b_shuff_instruction != LIBXSMM_X86_INSTR_UNDEF ) ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_shuff_instruction, 
i_micro_kernel_config->vector_name, l_n, l_n, LIBXSMM_X86_VEC_REG_UNDEF, 0 ); } } else { /* handle trans B */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = l_n * i_micro_kernel_config->datatype_size; } else { l_b_offset = i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size; } libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, l_n, 0, 1, 0 ); /* generate shuffle as SSE3 has no broadcast load for single precision */ if ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) && ( i_micro_kernel_config->b_shuff_instruction != LIBXSMM_X86_INSTR_UNDEF ) ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_shuff_instruction, i_micro_kernel_config->vector_name, l_n, l_n, LIBXSMM_X86_VEC_REG_UNDEF, 0 ); } if ( l_n == (i_n_blocking -1) ) { /* handle trans B */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size; } else { l_b_offset = i_micro_kernel_config->datatype_size; } libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, l_b_offset ); } } /* issue mul-add */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_n_blocking, l_n, LIBXSMM_X86_VEC_REG_UNDEF ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, l_n, l_vec_reg_acc_start + l_n, LIBXSMM_X86_VEC_REG_UNDEF ); } } else { /* broadcast from B -> into vec registers 0 to i_n_blocking */ if ( i_offset 
!= (-1) ) { for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* handle trans B */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = (i_micro_kernel_config->datatype_size * i_offset * i_xgemm_desc->ldb) + (l_n * i_micro_kernel_config->datatype_size); } else { l_b_offset = (i_micro_kernel_config->datatype_size * i_offset) + (i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size); } libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, l_n, 0, 1, 0 ); /* generate shuffle as SSE3 has no broadcast load for single precision */ if ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) && ( i_micro_kernel_config->b_shuff_instruction != LIBXSMM_X86_INSTR_UNDEF ) ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_shuff_instruction, i_micro_kernel_config->vector_name, l_n, l_n, LIBXSMM_X86_VEC_REG_UNDEF, 0 ); } } } else { for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* handle trans B */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = l_n * i_micro_kernel_config->datatype_size; } else { l_b_offset = i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size; } libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, l_n, 0, 1, 0 ); /* generate shuffle as SSE3 has no broadcast load for single precision */ if ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_shuff_instruction, 
i_micro_kernel_config->vector_name, l_n, l_n, LIBXSMM_X86_VEC_REG_UNDEF, 0 ); } } /* handle trans B */ if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_B) > 0 ) { l_b_offset = i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size; } else { l_b_offset = i_micro_kernel_config->datatype_size; } libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, l_b_offset ); } if (l_m_blocking == 3) { /* load column vectors of A and multiply with all broadcasted row entries of B */ for ( l_m = 0; l_m < l_m_blocking; l_m++ ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m, i_micro_kernel_config->vector_name, i_n_blocking, 0, 1, 0 ); for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* post increment early */ if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) ); } if (l_n < i_n_blocking - 1) { /* issued vmove to save loads from A */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_micro_kernel_config->vector_name, i_n_blocking + l_n, i_n_blocking + l_n + 1, LIBXSMM_X86_VEC_REG_UNDEF ); } /* issue mul+add */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, l_n, i_n_blocking + l_n, LIBXSMM_X86_VEC_REG_UNDEF ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, 
i_n_blocking + l_n, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), LIBXSMM_X86_VEC_REG_UNDEF ); } } } else { /* load column vectors of A and multiply with all broadcasted row entries of B */ for ( l_m = 0; l_m < l_m_blocking; l_m++ ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m, i_micro_kernel_config->vector_name, i_n_blocking + l_m, 0, 1, 0 ); } for ( l_m = 0; l_m < l_m_blocking; l_m++ ) { for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* post increment early */ if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) ); } if (l_n < i_n_blocking - 1) { /* issued vmove to save loads from A */ if (l_n == 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_micro_kernel_config->vector_name, i_n_blocking + l_m + l_n, i_n_blocking + l_m_blocking + l_n, LIBXSMM_X86_VEC_REG_UNDEF ); } else { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_micro_kernel_config->vector_name, i_n_blocking + l_m_blocking + l_n - 1, i_n_blocking + l_m_blocking + l_n, LIBXSMM_X86_VEC_REG_UNDEF ); } } /* issue mul/add */ if (l_n == 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, l_n, i_n_blocking + l_m + l_n, LIBXSMM_X86_VEC_REG_UNDEF ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, 
i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, i_n_blocking + l_m + l_n, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), LIBXSMM_X86_VEC_REG_UNDEF ); } else { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, l_n, i_n_blocking + l_m_blocking + l_n - 1, LIBXSMM_X86_VEC_REG_UNDEF ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, i_n_blocking + l_m_blocking + l_n - 1, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), LIBXSMM_X86_VEC_REG_UNDEF ); } } } } } } libxsmm-1.17/src/generator_gemm_sse3_microkernel.h000066400000000000000000000032001415223013700223700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_GEMM_SSE3_MICROKERNEL_H #define GENERATOR_GEMM_SSE3_MICROKERNEL_H #include "generator_common.h" #include "generator_gemm_common.h" LIBXSMM_API_INTERN void libxsmm_generator_gemm_sse3_microkernel( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_n_blocking, const int i_offset ); #endif /* GENERATOR_GEMM_SSE3_MICROKERNEL_H */ libxsmm-1.17/src/generator_matcopy.c000066400000000000000000000032461415223013700175750ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include #include "generator_common.h" #include "generator_matcopy_avx_avx512.h" /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_matcopy_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_mcopy_descriptor* i_matcopy_desc, const char* i_arch ) { /* generate kernel */ if ( (strcmp(i_arch, "skx") == 0) || (strcmp(i_arch, "knm") == 0) || (strcmp(i_arch, "knl") == 0) || (strcmp(i_arch, "hsw") == 0) || (strcmp(i_arch, "snb") == 0) || (strcmp(i_arch, "clx") == 0) || (strcmp(i_arch, "cpx") == 0) ) { libxsmm_generator_matcopy_avx_avx512_kernel( io_generated_code, i_matcopy_desc, i_arch ); } else { /* TODO fix this error */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); return; } } libxsmm-1.17/src/generator_matcopy_avx_avx512.c000066400000000000000000000567671415223013700216010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "generator_matcopy_avx_avx512.h" #include "generator_x86_instructions.h" #include "generator_common.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_generator_matcopy_header_m_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_matcopy_kernel_config* i_kernel_config, const unsigned int i_gp_reg_m_loop ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_kernel_config->alu_mov_instruction, i_gp_reg_m_loop, 0); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_kernel_config->alu_add_instruction, i_gp_reg_m_loop, 1); } LIBXSMM_API_INTERN void libxsmm_generator_matcopy_footer_m_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_matcopy_kernel_config* i_kernel_config, const unsigned int i_gp_reg_m_loop, const unsigned int i_m ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_kernel_config->alu_cmp_instruction, i_gp_reg_m_loop, i_m ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); } LIBXSMM_API_INTERN void libxsmm_generator_matcopy_header_n_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_matcopy_kernel_config* i_kernel_config, const unsigned int i_gp_reg_n_loop ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_kernel_config->alu_mov_instruction, i_gp_reg_n_loop, 0); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_kernel_config->alu_add_instruction, i_gp_reg_n_loop, 1); } LIBXSMM_API_INTERN void libxsmm_generator_matcopy_footer_n_loop( libxsmm_generated_code* io_generated_code, 
libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_matcopy_kernel_config* i_kernel_config, const unsigned int i_gp_reg_n_loop, const unsigned int i_n ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_kernel_config->alu_cmp_instruction, i_gp_reg_n_loop, i_n ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); } LIBXSMM_API_INLINE void libxsmm_generator_matcopy_avx_avx512_kernel_initialize_mask( libxsmm_generated_code* io_generated_code, const libxsmm_matcopy_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_matcopy_kernel_config* i_micro_kernel_config, unsigned int remainder ) { unsigned long long l_mask = (1ULL << remainder) - 1; /* If we have int16 input and KNM arch, we should make the remainder mask "half", since we have only VMOVUPS instruction (i.e. treat the int16 entries in pairs, thus the mask length should be half) */ if ( (i_micro_kernel_config->vector_length == 32) && (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_KNM) ) { l_mask = (1ULL << (remainder/2)) - 1; } /* Move mask to GP register */ if (i_micro_kernel_config->vector_length == 64) { libxsmm_x86_instruction_alu_imm_i64( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_help_0, (size_t)l_mask ); } else { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_help_0, /* immediate is passed as an integer */ (int)l_mask ); } /* Set mask register */ if ( i_micro_kernel_config->instruction_set >= LIBXSMM_X86_AVX512_CORE ) { libxsmm_x86_instruction_mask_move( io_generated_code, LIBXSMM_X86_INSTR_KMOVQ, i_gp_reg_mapping->gp_reg_help_0, 1, 0 ); } else if ( i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_MIC || i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_KNM ) { libxsmm_x86_instruction_mask_move( io_generated_code, LIBXSMM_X86_INSTR_KMOVW, 
i_gp_reg_mapping->gp_reg_help_0, 1, 0 ); } else { /* Should not happen! */ } } LIBXSMM_API_INTERN void libxsmm_generator_matcopy_avx_avx512_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_mcopy_descriptor* i_matcopy_desc, const char* i_arch ) { libxsmm_matcopy_kernel_config l_kernel_config; libxsmm_matcopy_gp_reg_mapping l_gp_reg_mapping; libxsmm_loop_label_tracker l_loop_label_tracker; unsigned int m_trips, remaining_unrolled, remaining, i; /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* define gp register mapping */ memset(&l_gp_reg_mapping, 0, sizeof(l_gp_reg_mapping)); /* avoid warning "maybe used uninitialized" */ #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_lda = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_ldb = LIBXSMM_X86_GP_REG_R9; /* TODO: full support for Windows calling convention */ l_gp_reg_mapping.gp_reg_a_pf = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_b_pf = LIBXSMM_X86_GP_REG_RSI; #else /* match calling convention on Linux */ l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_lda = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_ldb = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_a_pf = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_b_pf = LIBXSMM_X86_GP_REG_R9; #endif l_gp_reg_mapping.gp_reg_m_loop = LIBXSMM_X86_GP_REG_R15; l_gp_reg_mapping.gp_reg_n_loop = LIBXSMM_X86_GP_REG_R12; l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_RAX; /* define matcopy kernel config */ memset(&l_kernel_config, 0, sizeof(l_kernel_config)); /* avoid warning "maybe used uninitialized" */ if ( strcmp( i_arch, "snb" ) == 0 ) { l_kernel_config.instruction_set = LIBXSMM_X86_AVX; l_kernel_config.vector_reg_count = 16; l_kernel_config.vector_name = 'y'; l_kernel_config.vxor_instruction = 
LIBXSMM_X86_INSTR_VXORPS; } else if ( strcmp( i_arch, "hsw" ) == 0 ) { l_kernel_config.instruction_set = LIBXSMM_X86_AVX2; l_kernel_config.vector_reg_count = 16; l_kernel_config.vector_name = 'y'; l_kernel_config.vxor_instruction = LIBXSMM_X86_INSTR_VXORPS; } else if ( strcmp( i_arch, "skx" ) == 0 ) { l_kernel_config.instruction_set = LIBXSMM_X86_AVX512_CORE; l_kernel_config.vector_reg_count = 32; l_kernel_config.vector_name = 'z'; l_kernel_config.vxor_instruction = LIBXSMM_X86_INSTR_VPXORD; } else if ( strcmp( i_arch, "clx" ) == 0 ) { l_kernel_config.instruction_set = LIBXSMM_X86_AVX512_CLX; l_kernel_config.vector_reg_count = 32; l_kernel_config.vector_name = 'z'; l_kernel_config.vxor_instruction = LIBXSMM_X86_INSTR_VPXORD; } else if ( strcmp( i_arch, "cpx" ) == 0 ) { l_kernel_config.instruction_set = LIBXSMM_X86_AVX512_CPX; l_kernel_config.vector_reg_count = 32; l_kernel_config.vector_name = 'z'; l_kernel_config.vxor_instruction = LIBXSMM_X86_INSTR_VPXORD; } else if ( strcmp( i_arch, "knl" ) == 0 ) { /* For now make the code work for KNL */ l_kernel_config.instruction_set = LIBXSMM_X86_AVX512_MIC; l_kernel_config.vector_reg_count = 32; l_kernel_config.vector_name = 'z'; l_kernel_config.vxor_instruction = LIBXSMM_X86_INSTR_VPXORD; } else if ( strcmp( i_arch, "knm" ) == 0 ) { l_kernel_config.instruction_set = LIBXSMM_X86_AVX512_KNM; l_kernel_config.vector_reg_count = 32; l_kernel_config.vector_name = 'z'; l_kernel_config.vxor_instruction = LIBXSMM_X86_INSTR_VPXORD; } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNSUP_ARCH ); return; } /* More setup in the kernel config based on architecture and data type */ if ( l_kernel_config.vector_name == 'y' ) { assert(0 < i_matcopy_desc->typesize); l_kernel_config.datatype_size = i_matcopy_desc->typesize; if ( i_matcopy_desc->typesize == 4 ) { l_kernel_config.vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; l_kernel_config.vector_length = 8; } else if ( i_matcopy_desc->typesize == 2 ) { 
l_kernel_config.vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; l_kernel_config.vector_length = 16; } else if ( i_matcopy_desc->typesize == 1 ) { l_kernel_config.vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; l_kernel_config.vector_length = 32; } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNSUP_DATATYPE ); return; } assert(l_kernel_config.vector_length == 32 / l_kernel_config.datatype_size); } else if ( l_kernel_config.vector_name == 'z' ) { assert(0 < i_matcopy_desc->typesize); l_kernel_config.datatype_size = i_matcopy_desc->typesize; if ( i_matcopy_desc->typesize == 4 ) { /* TODO: use streaming stores if we want to zero the destination */ /*if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & i_matcopy_desc->flags)) { l_kernel_config.vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } else*/ { l_kernel_config.vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } l_kernel_config.vector_length = 16; } else if ( i_matcopy_desc->typesize == 2 ) { if (l_kernel_config.instruction_set == LIBXSMM_X86_AVX512_KNM) { l_kernel_config.vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } else if ( l_kernel_config.instruction_set >= LIBXSMM_X86_AVX512_CORE ) { l_kernel_config.vmove_instruction = LIBXSMM_X86_INSTR_VMOVDQU16; } else { /* Should not happen!!! 
*/ } l_kernel_config.vector_length = 32; } else if ( i_matcopy_desc->typesize == 1 ) { l_kernel_config.vmove_instruction = LIBXSMM_X86_INSTR_VMOVDQU8; l_kernel_config.vector_length = 64; } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNSUP_DATATYPE ); return; } assert(l_kernel_config.vector_length == 64 / l_kernel_config.datatype_size); } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNSUP_ARCH ); return; } l_kernel_config.alu_add_instruction = LIBXSMM_X86_INSTR_ADDQ; l_kernel_config.alu_cmp_instruction = LIBXSMM_X86_INSTR_CMPQ; l_kernel_config.alu_mov_instruction = LIBXSMM_X86_INSTR_MOVQ; l_kernel_config.alu_jmp_instruction = LIBXSMM_X86_INSTR_JL; l_kernel_config.prefetch_instruction = LIBXSMM_X86_INSTR_PREFETCHT2; /* Calculate the trips in the m dimension (perform unrolling if requested) */ assert(0 != i_matcopy_desc->unroll_level); m_trips = i_matcopy_desc->m / (l_kernel_config.vector_length * i_matcopy_desc->unroll_level); remaining_unrolled = (i_matcopy_desc->m % (l_kernel_config.vector_length * i_matcopy_desc->unroll_level)) / l_kernel_config.vector_length; remaining = (i_matcopy_desc->m % (l_kernel_config.vector_length * i_matcopy_desc->unroll_level)) % l_kernel_config.vector_length; /* open asm */ libxsmm_x86_instruction_open_stream_matcopy( io_generated_code, l_gp_reg_mapping.gp_reg_a, l_gp_reg_mapping.gp_reg_lda, l_gp_reg_mapping.gp_reg_b, l_gp_reg_mapping.gp_reg_ldb, l_gp_reg_mapping.gp_reg_a_pf, l_gp_reg_mapping.gp_reg_b_pf, i_arch ); /* In case we should do masked load/store and we have AVX512 arch, precompute the mask */ if (remaining && l_kernel_config.instruction_set >= LIBXSMM_X86_AVX512) { libxsmm_generator_matcopy_avx_avx512_kernel_initialize_mask(io_generated_code, &l_gp_reg_mapping, &l_kernel_config, remaining); } /* Initialize register 0 with zeros if we want to zero the destination */ if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & i_matcopy_desc->flags)) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, 
l_kernel_config.instruction_set, l_kernel_config.vxor_instruction, l_kernel_config.vector_name, 0, 0, 0); /* In case of AVX/AVX2 and if we have remaining, set also scalar register to zero */ if (remaining && (l_kernel_config.instruction_set == LIBXSMM_X86_AVX || l_kernel_config.instruction_set == LIBXSMM_X86_AVX2)) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, l_kernel_config.instruction_set, LIBXSMM_X86_INSTR_VXORPS, 'x', 0, 0, 0); } } if (i_matcopy_desc->n > 1) { /* open n loop */ libxsmm_generator_matcopy_header_n_loop( io_generated_code, &l_loop_label_tracker, &l_kernel_config, l_gp_reg_mapping.gp_reg_n_loop ); } if (m_trips > 1) { /* open m loop */ libxsmm_generator_matcopy_header_m_loop( io_generated_code, &l_loop_label_tracker, &l_kernel_config, l_gp_reg_mapping.gp_reg_m_loop ); } if (m_trips >= 1) { /* Unroll the innermost loop as requested */ for (i = 0; i < i_matcopy_desc->unroll_level; i++) { if (0 == (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & i_matcopy_desc->flags)) { /* load input line to register 0 */ libxsmm_x86_instruction_vec_move( io_generated_code, l_kernel_config.instruction_set, l_kernel_config.vmove_instruction, l_gp_reg_mapping.gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, i*l_kernel_config.vector_length*l_kernel_config.datatype_size, l_kernel_config.vector_name, 0, 0, 1, 0 ); } /* Prefetch if requested */ if (i_matcopy_desc->prefetch) { libxsmm_x86_instruction_prefetch( io_generated_code, l_kernel_config.prefetch_instruction, l_gp_reg_mapping.gp_reg_a_pf, LIBXSMM_X86_GP_REG_UNDEF, 0, i*l_kernel_config.vector_length*l_kernel_config.datatype_size ); } /* store register 0 to destination line */ libxsmm_x86_instruction_vec_move( io_generated_code, l_kernel_config.instruction_set, l_kernel_config.vmove_instruction, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, i*l_kernel_config.vector_length*l_kernel_config.datatype_size, l_kernel_config.vector_name, 0, 0, 0, 1 ); } if (0 == (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & i_matcopy_desc->flags)) 
{ /* adjust input pointer by VLEN * unroll-level elements */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_a, i_matcopy_desc->unroll_level * l_kernel_config.vector_length * l_kernel_config.datatype_size); } /* adjust destination pointer by VLEN * unroll-level elements */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_b, i_matcopy_desc->unroll_level * l_kernel_config.vector_length * l_kernel_config.datatype_size); /* Adjust prefetch pointer by VLEN * unroll-level elements */ if (i_matcopy_desc->prefetch) { libxsmm_x86_instruction_alu_imm( io_generated_code, l_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_a_pf, i_matcopy_desc->unroll_level * l_kernel_config.vector_length * l_kernel_config.datatype_size); } } if (m_trips > 1) { /* close m loop */ libxsmm_generator_matcopy_footer_m_loop( io_generated_code, &l_loop_label_tracker, &l_kernel_config, l_gp_reg_mapping.gp_reg_m_loop, m_trips ); } /* Add unrolled load/stores for remaining without mask */ for (i = 0; i < remaining_unrolled; i++) { if (0 == (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & i_matcopy_desc->flags)) { libxsmm_x86_instruction_vec_move( io_generated_code, l_kernel_config.instruction_set, l_kernel_config.vmove_instruction, l_gp_reg_mapping.gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, i*l_kernel_config.vector_length*l_kernel_config.datatype_size, l_kernel_config.vector_name, 0, 0, 1, 0 ); } if (i_matcopy_desc->prefetch) { libxsmm_x86_instruction_prefetch( io_generated_code, l_kernel_config.prefetch_instruction, l_gp_reg_mapping.gp_reg_a_pf, LIBXSMM_X86_GP_REG_UNDEF, 0, i*l_kernel_config.vector_length*l_kernel_config.datatype_size ); } libxsmm_x86_instruction_vec_move( io_generated_code, l_kernel_config.instruction_set, l_kernel_config.vmove_instruction, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, i*l_kernel_config.vector_length*l_kernel_config.datatype_size, 
l_kernel_config.vector_name, 0, 0, 0, 1 ); } /* Add load/store with mask if there is remaining and we have AVX512 arch */ if (remaining && l_kernel_config.instruction_set >= LIBXSMM_X86_AVX512) { if (0 == (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & i_matcopy_desc->flags)) { libxsmm_x86_instruction_vec_move( io_generated_code, l_kernel_config.instruction_set, l_kernel_config.vmove_instruction, l_gp_reg_mapping.gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, remaining_unrolled * l_kernel_config.vector_length * l_kernel_config.datatype_size, l_kernel_config.vector_name, 0, 1, 1, 0 ); } if (i_matcopy_desc->prefetch) { libxsmm_x86_instruction_prefetch( io_generated_code, l_kernel_config.prefetch_instruction, l_gp_reg_mapping.gp_reg_a_pf, LIBXSMM_X86_GP_REG_UNDEF, 0, remaining_unrolled * l_kernel_config.vector_length * l_kernel_config.datatype_size ); } libxsmm_x86_instruction_vec_move( io_generated_code, l_kernel_config.instruction_set, l_kernel_config.vmove_instruction, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, remaining_unrolled * l_kernel_config.vector_length * l_kernel_config.datatype_size, l_kernel_config.vector_name, 0, 1, 0, 1 ); } else if (remaining && (l_kernel_config.instruction_set == LIBXSMM_X86_AVX || l_kernel_config.instruction_set == LIBXSMM_X86_AVX2)) { /* Use scalar moves in case of remaining and AVX/AVX2 arch */ for (i=0; iflags)) { libxsmm_x86_instruction_vec_move( io_generated_code, l_kernel_config.instruction_set, LIBXSMM_X86_INSTR_VMOVSS, l_gp_reg_mapping.gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (remaining_unrolled * l_kernel_config.vector_length + i) * l_kernel_config.datatype_size, 'x', 0, 0, 1, 0 ); } if (i_matcopy_desc->prefetch) { /* Issue just one prefetch */ if (i == 0) { libxsmm_x86_instruction_prefetch( io_generated_code, l_kernel_config.prefetch_instruction, l_gp_reg_mapping.gp_reg_a_pf, LIBXSMM_X86_GP_REG_UNDEF, 0, remaining_unrolled * l_kernel_config.vector_length * l_kernel_config.datatype_size ); } } libxsmm_x86_instruction_vec_move( 
io_generated_code, l_kernel_config.instruction_set, LIBXSMM_X86_INSTR_VMOVSS, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, (remaining_unrolled * l_kernel_config.vector_length + i) * l_kernel_config.datatype_size, 'x', 0, 0, 0, 1 ); } } if (i_matcopy_desc->n > 1) { if (0 == (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & i_matcopy_desc->flags)) { /* adjust input pointer by (lda - m_trips * VLEN * unroll-level) elements (already has been increased by m_trips * VLEN * unroll-level in the above m_trips loop ) */ if ( (i_matcopy_desc->ldi - m_trips * l_kernel_config.vector_length * i_matcopy_desc->unroll_level) != 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, l_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_a, (i_matcopy_desc->ldi - m_trips * l_kernel_config.vector_length * i_matcopy_desc->unroll_level) * l_kernel_config.datatype_size); } } /* adjust destination pointer by (ldb - m_trips * VLEN * unroll-level) elements (already has been increased by m_trips * VLEN * unroll-level in the above m_trips loop ) */ if ( (i_matcopy_desc->ldo - m_trips * l_kernel_config.vector_length * i_matcopy_desc->unroll_level) != 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, l_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_b, (i_matcopy_desc->ldo - m_trips * l_kernel_config.vector_length * i_matcopy_desc->unroll_level) * l_kernel_config.datatype_size); } /* Adjust prefetch pointer if requested */ if (i_matcopy_desc->prefetch) { if ( (i_matcopy_desc->ldi - m_trips * l_kernel_config.vector_length * i_matcopy_desc->unroll_level) != 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, l_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_a_pf, (i_matcopy_desc->ldi - m_trips * l_kernel_config.vector_length * i_matcopy_desc->unroll_level) * l_kernel_config.datatype_size); } } /* close n loop */ libxsmm_generator_matcopy_footer_n_loop( io_generated_code, &l_loop_label_tracker, &l_kernel_config, l_gp_reg_mapping.gp_reg_n_loop, 
i_matcopy_desc->n ); } /* close asm */ libxsmm_x86_instruction_close_stream_matcopy( io_generated_code, i_arch ); } libxsmm-1.17/src/generator_matcopy_avx_avx512.h000066400000000000000000000062711415223013700215670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_MATCOPY_AVX_AVX512_H #define GENERATOR_MATCOPY_AVX_AVX512_H #include "generator_common.h" LIBXSMM_API_INTERN void libxsmm_generator_matcopy_header_m_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_matcopy_kernel_config* i_kernel_config, const unsigned int i_gp_reg_m_loop ); LIBXSMM_API_INTERN void libxsmm_generator_matcopy_footer_m_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_matcopy_kernel_config* i_kernel_config, const unsigned int i_gp_reg_m_loop, const unsigned int i_m ); LIBXSMM_API_INTERN void libxsmm_generator_matcopy_header_n_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_matcopy_kernel_config* i_kernel_config, const unsigned int i_gp_reg_n_loop ); LIBXSMM_API_INTERN void libxsmm_generator_matcopy_footer_n_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_matcopy_kernel_config* i_kernel_config, const unsigned int i_gp_reg_n_loop, const unsigned int i_n ); LIBXSMM_API_INTERN void 
libxsmm_generator_matcopy_avx_avx512_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_mcopy_descriptor* i_trans_desc, const char* i_arch ); #endif /* GENERATOR_MATCOPY_AVX_AVX512_H */ libxsmm-1.17/src/generator_mateltwise.c000066400000000000000000000026161415223013700202770ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include #include "generator_common.h" #include "generator_mateltwise_avx_avx512.h" LIBXSMM_API void libxsmm_generator_mateltwise_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_meltw_descriptor* i_mateltw_desc ) { /* generate kernel */ if ( io_generated_code->arch >= LIBXSMM_X86_AVX512_CORE ) { libxsmm_generator_mateltwise_avx_avx512_kernel( io_generated_code, i_mateltw_desc ); } else { /* TODO fix this errori and support for more architectures */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); return; } } libxsmm-1.17/src/generator_mateltwise_avx_avx512.c000066400000000000000000005540451415223013700222730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "generator_mateltwise_avx_avx512.h" #include "generator_x86_instructions.h" #include "generator_common.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_header_m_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_mateltwise_kernel_config* i_kernel_config, const unsigned int i_gp_reg_m_loop ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_kernel_config->alu_mov_instruction, i_gp_reg_m_loop, 0); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_kernel_config->alu_add_instruction, i_gp_reg_m_loop, 1); } LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_footer_m_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_mateltwise_kernel_config* i_kernel_config, const unsigned int i_gp_reg_m_loop, const unsigned int i_m ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_kernel_config->alu_cmp_instruction, i_gp_reg_m_loop, i_m ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); } LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_header_n_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_mateltwise_kernel_config* i_kernel_config, const unsigned int i_gp_reg_n_loop ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_kernel_config->alu_mov_instruction, i_gp_reg_n_loop, 0); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, 
io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_kernel_config->alu_add_instruction, i_gp_reg_n_loop, 1); } LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_footer_n_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_mateltwise_kernel_config* i_kernel_config, const unsigned int i_gp_reg_n_loop, const unsigned int i_n ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_kernel_config->alu_cmp_instruction, i_gp_reg_n_loop, i_n ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); } LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_initialize_avx512_mask( libxsmm_generated_code* io_generated_code, const unsigned int i_gp_reg_tmp, const unsigned int i_mask_reg, const unsigned int i_mask_count, const unsigned int i_precision) { unsigned long long l_mask = 0; if (i_precision == LIBXSMM_GEMM_PRECISION_F32) { l_mask = 0xffff; } else if (i_precision == LIBXSMM_GEMM_PRECISION_BF16) { l_mask = 0xffffffff; } else if (i_precision == LIBXSMM_GEMM_PRECISION_I8) { l_mask = 0xffffffffffffffff; } /* shift right by "inverse" remainder */ l_mask = l_mask >> i_mask_count; /* move mask to GP register */ libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_tmp, l_mask ); if ( io_generated_code->arch >= LIBXSMM_X86_AVX512_CORE ) { if ( LIBXSMM_GEMM_PRECISION_F32 == i_precision ) { libxsmm_x86_instruction_mask_move( io_generated_code, LIBXSMM_X86_INSTR_KMOVW, i_gp_reg_tmp, i_mask_reg, 0 ); } else if ( LIBXSMM_GEMM_PRECISION_BF16 == i_precision ) { libxsmm_x86_instruction_mask_move( io_generated_code, LIBXSMM_X86_INSTR_KMOVD, i_gp_reg_tmp, i_mask_reg, 0 ); } else if ( LIBXSMM_GEMM_PRECISION_I8 == i_precision ) { libxsmm_x86_instruction_mask_move( io_generated_code, LIBXSMM_X86_INSTR_KMOVQ, i_gp_reg_tmp, i_mask_reg, 0 ); } else { LIBXSMM_HANDLE_ERROR( io_generated_code, 
LIBXSMM_ERR_UNSUP_DATATYPE ); return; } } else { /* shouldn't happen */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); return; } } LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_init_micro_kernel_config_fullvector( libxsmm_generated_code* io_generated_code, libxsmm_mateltwise_kernel_config* io_micro_kernel_config, const unsigned int i_arch, const libxsmm_meltw_descriptor* i_mateltwise_desc) { memset(io_micro_kernel_config, 0, sizeof(*io_micro_kernel_config)); /* avoid warning "maybe used uninitialized" */ if ( i_arch >= LIBXSMM_X86_AVX512_CORE ) { io_micro_kernel_config->instruction_set = LIBXSMM_X86_AVX512_CORE; io_micro_kernel_config->vector_reg_count = 16; /* Configure input specific microkernel options */ if ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_mateltwise_desc->datatype ) ) { io_micro_kernel_config->datatype_size_in = 4; io_micro_kernel_config->vector_length_in = 16; io_micro_kernel_config->vmove_instruction_in = LIBXSMM_X86_INSTR_VMOVUPS; } else if ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP( i_mateltwise_desc->datatype ) ) { io_micro_kernel_config->datatype_size_in = 2; io_micro_kernel_config->vector_length_in = 32; io_micro_kernel_config->vmove_instruction_in = LIBXSMM_X86_INSTR_VMOVDQU16; } else if ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP( i_mateltwise_desc->datatype ) ) { io_micro_kernel_config->datatype_size_in = 1; io_micro_kernel_config->vector_length_in = 64; io_micro_kernel_config->vmove_instruction_in = LIBXSMM_X86_INSTR_VMOVDQU8; } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNSUP_DATATYPE ); return; } /* Configure output specific microkernel options */ if ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT( i_mateltwise_desc->datatype ) ) { io_micro_kernel_config->datatype_size_out = 4; io_micro_kernel_config->vector_length_out = 16; io_micro_kernel_config->vmove_instruction_out = LIBXSMM_X86_INSTR_VMOVUPS; } else if ( LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_OUT( 
i_mateltwise_desc->datatype ) ) { io_micro_kernel_config->datatype_size_out = 2; io_micro_kernel_config->vector_length_out = 32; io_micro_kernel_config->vmove_instruction_out = LIBXSMM_X86_INSTR_VMOVDQU16; } else if ( LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_OUT( i_mateltwise_desc->datatype ) ) { io_micro_kernel_config->datatype_size_out = 1; io_micro_kernel_config->vector_length_out = 64; io_micro_kernel_config->vmove_instruction_out = LIBXSMM_X86_INSTR_VMOVDQU8; } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNSUP_DATATYPE ); return; } io_micro_kernel_config->alu_add_instruction = LIBXSMM_X86_INSTR_ADDQ; io_micro_kernel_config->alu_cmp_instruction = LIBXSMM_X86_INSTR_CMPQ; io_micro_kernel_config->alu_jmp_instruction = LIBXSMM_X86_INSTR_JL; io_micro_kernel_config->alu_mov_instruction = LIBXSMM_X86_INSTR_MOVQ; io_micro_kernel_config->vxor_instruction = LIBXSMM_X86_INSTR_VPXORD; io_micro_kernel_config->vector_name = 'z'; } else { /* That should not happen */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); } } LIBXSMM_API_INTERN void libxsmm_generator_tanh_ps_rational_78_avx512( libxsmm_generated_code* io_generated_code, const libxsmm_mateltwise_kernel_config* i_micro_kernel_config, const unsigned int i_vec_x, const unsigned int i_vec_x2, const unsigned int i_vec_nom, const unsigned int i_vec_denom, const unsigned int i_mask_hi, const unsigned int i_mask_lo, const unsigned int i_vec_c0, const unsigned int i_vec_c1, const unsigned int i_vec_c2, const unsigned int i_vec_c3, const unsigned int i_vec_c1_d, const unsigned int i_vec_c2_d, const unsigned int i_vec_c3_d, const unsigned int i_vec_hi_bound, const unsigned int i_vec_lo_bound, const unsigned int i_vec_ones, const unsigned int i_vec_neg_ones ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, i_vec_x, i_vec_x, i_vec_x2 ); libxsmm_x86_instruction_vec_compute_reg_mask( 
io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VCMPPS, i_micro_kernel_config->vector_name, i_vec_hi_bound, i_vec_x, i_vec_x, 17, i_mask_hi, 0 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VCMPPS, i_micro_kernel_config->vector_name, i_vec_lo_bound, i_vec_x, i_vec_x, 30, i_mask_lo, 0 ); /* TODO: replace with zmm movs */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, i_vec_x, i_vec_x, i_vec_nom ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VFMADD213PS, i_micro_kernel_config->vector_name, i_vec_c2, i_vec_c3, i_vec_nom ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VFMADD213PS, i_micro_kernel_config->vector_name, i_vec_c1, i_vec_x2, i_vec_nom ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VFMADD213PS, i_micro_kernel_config->vector_name, i_vec_c0, i_vec_x2, i_vec_nom ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, i_vec_nom, i_vec_x, i_vec_nom ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, i_vec_x2, i_vec_c3_d, i_vec_denom ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VFMADD213PS, i_micro_kernel_config->vector_name, i_vec_c2_d, i_vec_x2, i_vec_denom ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VFMADD213PS, i_micro_kernel_config->vector_name, i_vec_c1_d, 
i_vec_x2, i_vec_denom ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VFMADD213PS, i_micro_kernel_config->vector_name, i_vec_c0, i_vec_x2, i_vec_denom ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VRCP14PS, i_micro_kernel_config->vector_name, i_vec_denom, i_vec_denom, LIBXSMM_X86_VEC_REG_UNDEF ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, i_vec_denom, i_vec_nom, i_vec_x ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, i_vec_x, i_vec_ones, i_vec_x, LIBXSMM_X86_IMM_UNDEF, i_mask_hi, 0 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, i_vec_x, i_vec_neg_ones, i_vec_x, LIBXSMM_X86_IMM_UNDEF, i_mask_lo, 0 ); } LIBXSMM_API_INTERN void libxsmm_generator_cvtfp32bf16_avx512_replacement_sequence( libxsmm_generated_code* io_generated_code, const libxsmm_mateltwise_kernel_config* i_micro_kernel_config, const unsigned int i_vec_reg ) { /* and with naninf */ libxsmm_x86_instruction_vec_compute_mem( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPANDD, 1, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, 24, i_micro_kernel_config->vector_name, i_vec_reg, 0 ); /* and with fixup */ libxsmm_x86_instruction_vec_compute_mem( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPANDD, 1, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, 16, i_micro_kernel_config->vector_name, i_vec_reg, 1 ); /* compute naninf mask k7 */ libxsmm_x86_instruction_vec_compute_mem_mask( io_generated_code, 
i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPCMPD, 1, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, 24, i_micro_kernel_config->vector_name, 0, LIBXSMM_X86_VEC_REG_UNDEF, 4, 7, 0 ); /* compute fixup mask k6 */ libxsmm_x86_instruction_vec_compute_mem_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPCMPD, 1, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, 16, i_micro_kernel_config->vector_name, 1, LIBXSMM_X86_VEC_REG_UNDEF, 0, 6, 0 ); /* load rneadd */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBROADCASTSS, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, 8, i_micro_kernel_config->vector_name, 0, 0, 1, 0 ); /* load fixup */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBROADCASTSS, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 1, 0, 1, 0 ); /* compute fixup */ libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPADDD, i_micro_kernel_config->vector_name, 1, 0, 0, LIBXSMM_X86_IMM_UNDEF, 6, 0 ); /* compute fixup */ libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPADDD, i_micro_kernel_config->vector_name, 0, i_vec_reg, i_vec_reg, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); } LIBXSMM_API_INTERN void libxsmm_generator_cvtfp32bf16_avx512_microkernel( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, libxsmm_mateltwise_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_mateltwise_kernel_config* i_micro_kernel_config, const libxsmm_meltw_descriptor* i_mateltwise_desc ) { unsigned int i = 0, im, in, m, n, n_trips, m_trips, use_m_masking, mask_in_count, mask_out_count, reg_0, reg_1, unroll_iter = 0, zero_reg = 0; unsigned int reserved_zmms = 0, max_nm_unrolling = 
31, reserved_mask_regs = 1, current_mask_reg = 1; unsigned int n_unroll_factor = 1, eager_result_store = 0; unsigned int vec_x2 = 0, vec_nom = 0, vec_denom = 0, vec_c0 = 0, vec_c1 = 0, vec_c2 = 0, vec_c3 = 0, vec_c1_d = 0, vec_c2_d = 0, vec_c3_d = 0, vec_hi_bound = 0, vec_lo_bound = 0, vec_ones = 0, vec_neg_ones = 0, vec_halves = 0, mask_hi = 0, mask_lo = 0; unsigned int acvt_flags = (i_mateltwise_desc->operation == LIBXSMM_MELTW_OPERATION_ACT_CVTFP32BF16) ? libxsmm_get_meltw_acvt_flags( (libxsmm_meltw_comp_acvt_flags)i_mateltwise_desc->flags ) : LIBXSMM_MELTW_FLAG_ACVT_NONE; unsigned int cvta_flags = (i_mateltwise_desc->operation == LIBXSMM_MELTW_OPERATION_CVTFP32BF16_ACT) ? libxsmm_get_meltw_cvta_flags( (libxsmm_meltw_comp_cvta_flags)i_mateltwise_desc->flags ) : LIBXSMM_MELTW_FLAG_CVTA_NONE; unsigned int fuse_tanh_before_cvt = ( (i_mateltwise_desc->operation == LIBXSMM_MELTW_OPERATION_ACT_CVTFP32BF16) && ((acvt_flags & LIBXSMM_MELTW_FLAG_ACVT_FUSE_TANH) > 0) ) ? 1 : 0; unsigned int fuse_sigmoid_before_cvt = ( (i_mateltwise_desc->operation == LIBXSMM_MELTW_OPERATION_ACT_CVTFP32BF16) && ((acvt_flags & LIBXSMM_MELTW_FLAG_ACVT_FUSE_SIGM) > 0) ) ? 1 : 0; unsigned int fuse_relu_after_cvt = ((i_mateltwise_desc->operation == LIBXSMM_MELTW_OPERATION_CVTFP32BF16_ACT) && ((cvta_flags & LIBXSMM_MELTW_FLAG_CVTA_FUSE_RELU) > 0) ) ? 1 : 0; unsigned int fuse_tanh_after_cvt = ( (i_mateltwise_desc->operation == LIBXSMM_MELTW_OPERATION_CVTFP32BF16_ACT) && ((cvta_flags & LIBXSMM_MELTW_FLAG_CVTA_FUSE_TANH) > 0) ) ? 1 : 0; unsigned int fuse_sigmoid_after_cvt = ( (i_mateltwise_desc->operation == LIBXSMM_MELTW_OPERATION_CVTFP32BF16_ACT) && ((cvta_flags & LIBXSMM_MELTW_FLAG_CVTA_FUSE_SIGM) > 0) ) ? 
1 : 0; if ((fuse_tanh_before_cvt == 1) && (fuse_sigmoid_before_cvt == 1)) { /* This should not happen */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_GENERAL ); return; } /* For now the below options are not supported in JITer */ if ((fuse_tanh_after_cvt == 1) || (fuse_sigmoid_after_cvt == 1)) { /* This should not happen */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_GENERAL ); return; } /* Some rudimentary checking of M, N and LDs*/ if ( (i_mateltwise_desc->m > i_mateltwise_desc->ldi) || (i_mateltwise_desc->m > i_mateltwise_desc->ldo) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA ); return; } /* Configure the register mapping for this eltwise kernel */ i_gp_reg_mapping->gp_reg_in = LIBXSMM_X86_GP_REG_R8; i_gp_reg_mapping->gp_reg_out = LIBXSMM_X86_GP_REG_R9; i_gp_reg_mapping->gp_reg_m_loop = LIBXSMM_X86_GP_REG_R10; i_gp_reg_mapping->gp_reg_n_loop = LIBXSMM_X86_GP_REG_R11; if ( fuse_relu_after_cvt == 1 ) { i_gp_reg_mapping->gp_reg_relumask = LIBXSMM_X86_GP_REG_R13; } /* load the input pointer and output pointer */ libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_param_struct, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_gp_reg_mapping->gp_reg_in, 0 ); libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_param_struct, LIBXSMM_X86_GP_REG_UNDEF, 0, 8, i_gp_reg_mapping->gp_reg_out, 0 ); if ( fuse_relu_after_cvt == 1 ) { libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_param_struct, LIBXSMM_X86_GP_REG_UNDEF, 0, 16, i_gp_reg_mapping->gp_reg_relumask, 0 ); } if (io_generated_code->arch < LIBXSMM_X86_AVX512_CPX) { reserved_zmms += 3; reserved_mask_regs += 2; } /* Determine the names of the reserved registers and load with constants when applicable... 
*/ if ( (fuse_tanh_before_cvt == 1) || (fuse_sigmoid_before_cvt == 1) ) { float c0_array[16] = { 2027025.0f, 2027025.0f, 52027025.0f, 2027025.0f, 2027025.0f, 2027025.0f, 2027025.0f, 2027025.0f, 2027025.0f, 2027025.0f, 2027025.0f, 2027025.0f, 2027025.0f, 2027025.0f, 2027025.0f, 2027025.0f }; float c1_array[16] = { 270270.0f, 270270.0f, 270270.0f, 270270.0f, 270270.0f, 270270.0f, 270270.0f, 270270.0f, 270270.0f, 270270.0f, 270270.0f, 270270.0f, 270270.0f, 270270.0f, 270270.0f, 270270.0f }; float c2_array[16] = { 6930.0f, 6930.0f, 6930.0f, 6930.0f, 6930.0f, 6930.0f, 6930.0f, 6930.0f, 6930.0f, 6930.0f, 6930.0f, 6930.0f, 6930.0f, 6930.0f, 6930.0f, 6930.0f }; float c3_array[16] = { 36.0f, 36.0f, 36.0f, 36.0f, 36.0f, 36.0f, 36.0f, 36.0f, 36.0f, 36.0f, 36.0f, 36.0f, 36.0f, 36.0f, 36.0f, 36.0f }; float c1_d_array[16] = { 945945.0f, 945945.0f, 945945.0f, 945945.0f, 945945.0f, 945945.0f, 945945.0f, 945945.0f, 945945.0f, 945945.0f, 945945.0f, 945945.0f, 945945.0f, 945945.0f, 945945.0f, 945945.0f }; float c2_d_array[16] = { 51975.0f, 51975.0f, 51975.0f, 51975.0f, 51975.0f, 51975.0f, 51975.0f, 51975.0f, 51975.0f, 51975.0f, 51975.0f, 51975.0f, 51975.0f, 51975.0f, 51975.0f, 51975.0f }; float c3_d_array[16] = { 630.0f, 630.0f, 630.0f, 630.0f, 630.0f, 630.0f, 630.0f, 630.0f, 630.0f, 630.0f, 630.0f, 630.0f, 630.0f, 630.0f, 630.0f, 630.0f }; float hi_b_array[16] = { 4.97f, 4.97f, 4.97f, 4.97f, 4.97f, 4.97f, 4.97f, 4.97f, 4.97f, 4.97f, 4.97f, 4.97f, 4.97f, 4.97f, 4.97f, 4.97f }; float lo_b_array[16] = { -4.97f, -4.97f, -4.97f, -4.97f, -4.97f, -4.97f, -4.97f, -4.97f, -4.97f, -4.97f, -4.97f, -4.97f, -4.97f, -4.97f, -4.97f, -4.97f }; float ones_array[16] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; float neg_ones_array[16] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f }; reserved_zmms += 14; if (fuse_sigmoid_before_cvt == 1) { reserved_zmms++; } vec_x2 = 
reserved_zmms - 1; vec_nom = reserved_zmms - 2; vec_denom = reserved_zmms - 3; vec_c0 = reserved_zmms - 4; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) c0_array, "c0_array_", i_micro_kernel_config->vector_name, vec_c0); vec_c1 = reserved_zmms - 5; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) c1_array, "c1_array_", i_micro_kernel_config->vector_name, vec_c1); vec_c2 = reserved_zmms - 6; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) c2_array, "c2_array_", i_micro_kernel_config->vector_name, vec_c2); vec_c3 = reserved_zmms - 7; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) c3_array, "c3_array_", i_micro_kernel_config->vector_name, vec_c3); vec_c1_d = reserved_zmms - 8; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) c1_d_array, "c1_d_array_", i_micro_kernel_config->vector_name, vec_c1_d); vec_c2_d = reserved_zmms - 9; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) c2_d_array, "c2_d_array_", i_micro_kernel_config->vector_name, vec_c2_d); vec_c3_d = reserved_zmms - 10; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) c3_d_array, "c3_d_array_", i_micro_kernel_config->vector_name, vec_c3_d); vec_hi_bound = reserved_zmms - 11; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) hi_b_array, "hi_b_array_", i_micro_kernel_config->vector_name, vec_hi_bound); vec_lo_bound = reserved_zmms - 12; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) lo_b_array, "lo_b_array_", i_micro_kernel_config->vector_name, vec_lo_bound); vec_ones = reserved_zmms - 13; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) 
ones_array, "ones_array_", i_micro_kernel_config->vector_name, vec_ones); vec_neg_ones = reserved_zmms - 14; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) neg_ones_array, "neg_ones_array_", i_micro_kernel_config->vector_name, vec_neg_ones); if (fuse_sigmoid_before_cvt == 1) { float halves_array[16] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; vec_halves = reserved_zmms - 15; libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) halves_array, "halves_array_", i_micro_kernel_config->vector_name, vec_halves); } } /* Set zero register neede for relu */ if ( fuse_relu_after_cvt == 1 ) { reserved_zmms++; zero_reg = reserved_zmms - 1; libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPXORD, i_micro_kernel_config->vector_name, zero_reg, zero_reg, zero_reg ); } /* We fully unroll in M dimension, calculate mask if there is remainder */ m = i_mateltwise_desc->m; n = i_mateltwise_desc->n; use_m_masking = ( m % 32 == 0 ) ? 0 : 1; m_trips = (m + 31) / 32; max_nm_unrolling = max_nm_unrolling - reserved_zmms; if (m_trips > max_nm_unrolling) { eager_result_store = 1; n_unroll_factor = 1; } else { /* Explore n unrolling opportunities... We unroll only by factors that divide N */ eager_result_store = 0; n_unroll_factor = n; while (m_trips * n_unroll_factor > max_nm_unrolling) { n_unroll_factor--; } while (n % n_unroll_factor > 0) { n_unroll_factor--; } } n_trips = n / n_unroll_factor; /* Calculate input and output masks in case we see m_masking */ if (use_m_masking == 1) { /* If the remaining elements are < 16, then we read a full vector and a partial one at the last m trip */ /* If the remaining elements are >= 16, then we read a partial vector at the last m trip */ /* Calculate mask reg 1 for input-reading */ mask_in_count = ( (m % 32) > 16) ? 
32 - (m % 32) : 16 - (m % 32); libxsmm_generator_mateltwise_initialize_avx512_mask(io_generated_code, LIBXSMM_X86_GP_REG_R12, 1, mask_in_count, LIBXSMM_GEMM_PRECISION_F32); /* Calculate mask reg 2 for output-writing */ mask_out_count = 32 - (m % 32); libxsmm_generator_mateltwise_initialize_avx512_mask(io_generated_code, LIBXSMM_X86_GP_REG_R12, 2, mask_out_count, LIBXSMM_GEMM_PRECISION_BF16); reserved_mask_regs += 2; } /* Determine the names of the reserved registers... */ if ( (fuse_tanh_before_cvt == 1) || (fuse_sigmoid_before_cvt == 1) ) { reserved_mask_regs += 2; mask_hi = reserved_mask_regs - 1; mask_lo = reserved_mask_regs - 2; } /* In this case we have to use CPX replacement sequence for downconverts... */ if (io_generated_code->arch < LIBXSMM_X86_AVX512_CPX) { /* init stack with helper variables for SW-based RNE rounding */ /* push 0x7f800000 on the stack, naninf masking */ libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, LIBXSMM_X86_GP_REG_R12, 0x7f800000); libxsmm_x86_instruction_push_reg( io_generated_code, LIBXSMM_X86_GP_REG_R12 ); /* push 0x00010000 on the stack, fixup masking */ libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, LIBXSMM_X86_GP_REG_R12, 0x00010000); libxsmm_x86_instruction_push_reg( io_generated_code, LIBXSMM_X86_GP_REG_R12 ); /* push 0x00007fff on the stack, rneadd */ libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, LIBXSMM_X86_GP_REG_R12, 0x00007fff); libxsmm_x86_instruction_push_reg( io_generated_code, LIBXSMM_X86_GP_REG_R12); /* push 0x00000001 on the stack, fixup */ libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, LIBXSMM_X86_GP_REG_R12, 0x00000001); libxsmm_x86_instruction_push_reg( io_generated_code, LIBXSMM_X86_GP_REG_R12 ); /* If we are using the 3 operant convert variant, then generate the proper permute table in zmm2 for the replacement code */ if (m > 16) { short perm_array[32] = { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 
23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; short selector_array[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0 ,0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }; for (i = 0; i < 32; i++) { perm_array[i] = (short)(perm_array[i] | selector_array[i]); } libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (const unsigned char *) perm_array, "perm_arrray_", i_micro_kernel_config->vector_name, 2); } } if (n_trips > 1) { /* open n loop */ libxsmm_generator_mateltwise_header_n_loop( io_generated_code, io_loop_label_tracker, i_micro_kernel_config, i_gp_reg_mapping->gp_reg_n_loop ); } for (in = 0; in < n_unroll_factor; in++) { for (im = 0; im < m_trips; im++) { unroll_iter = in * m_trips + im; if (unroll_iter + reserved_zmms < 16) { reg_0 = unroll_iter % (16-reserved_zmms) + reserved_zmms; reg_1 = unroll_iter % (16-reserved_zmms) + reserved_zmms + 16; } else { reg_0 = 16 + ((unroll_iter-16+reserved_zmms) % 15); reg_1 = reg_0 + 1; } libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, i_gp_reg_mapping->gp_reg_in, LIBXSMM_X86_GP_REG_UNDEF, 0, (im * 32 + in * i_mateltwise_desc->ldi) * i_micro_kernel_config->datatype_size_in, i_micro_kernel_config->vector_name, reg_0, ((im == (m_trips-1)) && (m % 32 < 16)) ? use_m_masking : 0, 1, 0 ); /* If last iteration and remainder is less than 16, do not load anything */ if (!((use_m_masking == 1) && (im == m_trips-1) && (m % 32 <= 16))) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, i_gp_reg_mapping->gp_reg_in, LIBXSMM_X86_GP_REG_UNDEF, 0, (im * 32 + 16 + in * i_mateltwise_desc->ldi) * i_micro_kernel_config->datatype_size_in, i_micro_kernel_config->vector_name, reg_1, (im == (m_trips-1)) ? 
use_m_masking : 0, 1, 0 ); } if ( (fuse_tanh_before_cvt == 1) || (fuse_sigmoid_before_cvt == 1) ) { if (fuse_sigmoid_before_cvt == 1) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, reg_0, vec_halves, reg_0 ); } libxsmm_generator_tanh_ps_rational_78_avx512(io_generated_code, i_micro_kernel_config, reg_0, vec_x2, vec_nom, vec_denom, mask_hi, mask_lo, vec_c0, vec_c1, vec_c2, vec_c3, vec_c1_d, vec_c2_d, vec_c3_d, vec_hi_bound, vec_lo_bound, vec_ones, vec_neg_ones); if (fuse_sigmoid_before_cvt == 1) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, reg_0, vec_ones, reg_0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, reg_0, vec_halves, reg_0 ); } if (!((use_m_masking == 1) && (im == m_trips-1) && (m % 32 <= 16))) { if (fuse_sigmoid_before_cvt == 1) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, reg_1, vec_halves, reg_1 ); } libxsmm_generator_tanh_ps_rational_78_avx512(io_generated_code, i_micro_kernel_config, reg_1, vec_x2, vec_nom, vec_denom, mask_hi, mask_lo, vec_c0, vec_c1, vec_c2, vec_c3, vec_c1_d, vec_c2_d, vec_c3_d, vec_hi_bound, vec_lo_bound, vec_ones, vec_neg_ones); if (fuse_sigmoid_before_cvt == 1) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, reg_1, vec_ones, reg_1 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, reg_1, vec_halves, reg_1 ); } } } /* Downconvert to BF16 */ 
if (io_generated_code->arch >= LIBXSMM_X86_AVX512_CPX) { if (!((use_m_masking == 1) && (im == m_trips-1) && (m % 32 <= 16))) { libxsmm_x86_instruction_vec_compute_convert( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VCVTNE2PS2BF16, i_micro_kernel_config->vector_name, reg_0, reg_1, reg_0, 0); } else { libxsmm_x86_instruction_vec_compute_convert( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VCVTNEPS2BF16, i_micro_kernel_config->vector_name, reg_0, LIBXSMM_X86_VEC_REG_UNDEF, reg_0, 0); } } else { if (!((use_m_masking == 1) && (im == m_trips-1) && (m % 32 <= 16))) { /* RNE convert reg_0 and reg_1 */ libxsmm_generator_cvtfp32bf16_avx512_replacement_sequence( io_generated_code, i_micro_kernel_config, reg_0 ); libxsmm_generator_cvtfp32bf16_avx512_replacement_sequence( io_generated_code, i_micro_kernel_config, reg_1 ); /* Properly interleave reg_0 and reg_1 into reg_0 */ libxsmm_x86_instruction_vec_compute_reg(io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPERMT2W, i_micro_kernel_config->vector_name, reg_1, 2, reg_0); } else { /* RNE convert reg_0 */ libxsmm_generator_cvtfp32bf16_avx512_replacement_sequence( io_generated_code, i_micro_kernel_config, reg_0 ); /* shift FP32 by 16bit to right */ libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPSRAD, i_micro_kernel_config->vector_name, reg_0, reg_0, LIBXSMM_X86_VEC_REG_UNDEF, 16); /* store 16 bit values into lower portion of reg_0 */ libxsmm_x86_instruction_vec_compute_convert( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPMOVDW, i_micro_kernel_config->vector_name, reg_0, LIBXSMM_X86_VEC_REG_UNDEF, reg_0, LIBXSMM_X86_VEC_REG_UNDEF); } } if ( fuse_relu_after_cvt == 1 ) { /* Compute relu mask */ if (io_generated_code->arch >= LIBXSMM_X86_AVX512_CPX) { current_mask_reg = reserved_mask_regs + (unroll_iter % (8-reserved_mask_regs)); } 
else { current_mask_reg = reserved_mask_regs - 2 + (unroll_iter % (8-reserved_mask_regs)); } libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPCMPW, i_micro_kernel_config->vector_name, zero_reg, reg_0, reg_0, 6, current_mask_reg, 0 ); /* Store relu mask */ libxsmm_x86_instruction_mask_move_mem( io_generated_code, LIBXSMM_X86_INSTR_KMOVD, i_gp_reg_mapping->gp_reg_relumask, LIBXSMM_X86_GP_REG_UNDEF, 0, (im * 32 + in * i_mateltwise_desc->ldo)/8, current_mask_reg, 1 ); /* Blend output result with zero reg based on relu mask */ libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPBLENDMW, i_micro_kernel_config->vector_name, reg_0, zero_reg, reg_0, LIBXSMM_X86_IMM_UNDEF, current_mask_reg, 0 ); } /* Store the result here if have "eager store" decision */ if (eager_result_store == 1) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_out, LIBXSMM_X86_GP_REG_UNDEF, 0, (im * 32 + in * i_mateltwise_desc->ldo) * i_micro_kernel_config->datatype_size_out, i_micro_kernel_config->vector_name, reg_0, (im == (m_trips-1)) ? use_m_masking * 2 : 0, 0, 1 ); } } } /* Store computed results... 
*/ if (eager_result_store == 0) { for (in = 0; in < n_unroll_factor; in++) { for (im = 0; im < m_trips; im++) { unroll_iter = in * m_trips + im; if (unroll_iter + reserved_zmms < 16) { reg_0 = unroll_iter % (16-reserved_zmms) + reserved_zmms; } else { reg_0 = 16 + ((unroll_iter-16+reserved_zmms) % 15); } libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_out, LIBXSMM_X86_GP_REG_UNDEF, 0, (im * 32 + in * i_mateltwise_desc->ldo) * i_micro_kernel_config->datatype_size_out, i_micro_kernel_config->vector_name, reg_0, (im == (m_trips-1)) ? use_m_masking * 2 : 0, 0, 1 ); } } } if (n_trips > 1) { /* Adjust input and output pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_in, i_mateltwise_desc->ldi * n_unroll_factor * i_micro_kernel_config->datatype_size_in); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_out, i_mateltwise_desc->ldo * n_unroll_factor * i_micro_kernel_config->datatype_size_out); /* In case of fused relu adjust also relu ptr, datatype for relumask tensor is "bit" and also it has always the same shape as output */ if ( fuse_relu_after_cvt == 1 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_relumask, (i_mateltwise_desc->ldo * n_unroll_factor)/8); } /* close n loop */ libxsmm_generator_mateltwise_footer_n_loop( io_generated_code, io_loop_label_tracker, i_micro_kernel_config, i_gp_reg_mapping->gp_reg_n_loop, n_trips); } if (io_generated_code->arch < LIBXSMM_X86_AVX512_CPX) { libxsmm_x86_instruction_pop_reg( io_generated_code, LIBXSMM_X86_GP_REG_R12 ); libxsmm_x86_instruction_pop_reg( io_generated_code, LIBXSMM_X86_GP_REG_R12 ); libxsmm_x86_instruction_pop_reg( io_generated_code, LIBXSMM_X86_GP_REG_R12 ); 
libxsmm_x86_instruction_pop_reg( io_generated_code, LIBXSMM_X86_GP_REG_R12 );
  }
}

/* Emits an AVX-512 microkernel that reduces an (m x n) input over its columns:
 * for each of the m rows it accumulates, across all n columns, the running sum
 * of the elements (REDUCE_ELTS flag) and/or the running sum of their squares
 * (REDUCE_ELTS_SQUARED flag), writing one m-length result vector per selected
 * reduction.  M is tiled in chunks of 16 lanes (mask register 1 covers the
 * m % 16 remainder); N is handled by a fully unrolled loop that cycles the
 * loads through zmm0..zmm29 while zmm30/zmm31 hold the running sums.
 *
 * io_generated_code      - JIT buffer the instructions are appended to
 * io_loop_label_tracker  - label bookkeeping for the generated m-loop
 * i_gp_reg_mapping       - GP register mapping; in/out pointer registers are
 *                          (re)assigned by this function (R8..R12, R13 scratch)
 * i_micro_kernel_config  - ISA/instruction/datatype-size configuration
 * i_mateltwise_desc      - descriptor carrying m, n, ldi and the reduce flags
 */
LIBXSMM_API_INTERN
void libxsmm_generator_reduce_cols_avx512_microkernel( libxsmm_generated_code* io_generated_code,
    libxsmm_loop_label_tracker* io_loop_label_tracker,
    libxsmm_mateltwise_gp_reg_mapping* i_gp_reg_mapping,
    const libxsmm_mateltwise_kernel_config* i_micro_kernel_config,
    const libxsmm_meltw_descriptor* i_mateltwise_desc ) {
  /* reg_sum = zmm30 and reg_sum_squared = zmm31 are the pinned accumulators;
   * reg_n cycles the column loads through zmm0..zmm29 (in % 30 below) */
  unsigned int in, m, n, m_full_trips, use_m_masking, mask_count, compute_squared_vals_reduce, compute_plain_vals_reduce, reg_n, reg_sum = 30, reg_sum_squared = 31;

  /* Some rudimentary checking of M, N and LDs: a 16-lane column load strides by
   * ldi, so m must not exceed ldi */
  if ( i_mateltwise_desc->m > i_mateltwise_desc->ldi ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA );
    return;
  }

  /* Decode which of the two reductions the descriptor requests */
  if ( (libxsmm_get_meltw_redu_flags((libxsmm_meltw_comp_redu_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_REDUCE_ELTS) > 0 ) {
    compute_plain_vals_reduce= 1;
  } else {
    compute_plain_vals_reduce= 0;
  }

  if ( (libxsmm_get_meltw_redu_flags((libxsmm_meltw_comp_redu_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_REDUCE_ELTS_SQUARED) > 0 ) {
    compute_squared_vals_reduce = 1;
  } else {
    compute_squared_vals_reduce = 0;
  }

  /* Configure the register mapping for this eltwise kernel */
  i_gp_reg_mapping->gp_reg_in = LIBXSMM_X86_GP_REG_R8;
  i_gp_reg_mapping->gp_reg_reduced_elts = LIBXSMM_X86_GP_REG_R9;
  i_gp_reg_mapping->gp_reg_reduced_elts_squared = LIBXSMM_X86_GP_REG_R10;
  i_gp_reg_mapping->gp_reg_m_loop = LIBXSMM_X86_GP_REG_R11;
  i_gp_reg_mapping->gp_reg_n_loop = LIBXSMM_X86_GP_REG_R12;

  /* load the input pointer and output pointer(s) from the parameter struct:
   * offset 0 = input, offset 8 = reduced elts, offset 16 = reduced squared elts */
  libxsmm_x86_instruction_alu_mem( io_generated_code,
      i_micro_kernel_config->alu_mov_instruction,
      i_gp_reg_mapping->gp_reg_param_struct,
      LIBXSMM_X86_GP_REG_UNDEF, 0, 0,
      i_gp_reg_mapping->gp_reg_in, 0 );

  if ( compute_plain_vals_reduce > 0 ) {
    libxsmm_x86_instruction_alu_mem( io_generated_code,
        i_micro_kernel_config->alu_mov_instruction,
        i_gp_reg_mapping->gp_reg_param_struct,
        LIBXSMM_X86_GP_REG_UNDEF, 0, 8,
        i_gp_reg_mapping->gp_reg_reduced_elts, 0 );
  }

  if ( compute_squared_vals_reduce > 0 ) {
    libxsmm_x86_instruction_alu_mem( io_generated_code,
        i_micro_kernel_config->alu_mov_instruction,
        i_gp_reg_mapping->gp_reg_param_struct,
        LIBXSMM_X86_GP_REG_UNDEF, 0, 16,
        i_gp_reg_mapping->gp_reg_reduced_elts_squared, 0 );
  }

  /* We fully unroll in N dimension, calculate m-mask if there is remainder */
  m = i_mateltwise_desc->m;
  n = i_mateltwise_desc->n;
  use_m_masking = ( m % 16 == 0 ) ? 0 : 1;
  m_full_trips = m / 16;

  /* Calculate input mask in case we see m_masking */
  if (use_m_masking == 1) {
    /* Calculate mask reg 1 for input-reading: mask off the 16 - (m % 16)
     * lanes beyond the matrix edge (mask is set up for F32 lane width) */
    mask_count = 16 - (m % 16);
    libxsmm_generator_mateltwise_initialize_avx512_mask(io_generated_code, LIBXSMM_X86_GP_REG_R13, 1, mask_count, LIBXSMM_GEMM_PRECISION_F32);
  }

  if ( m_full_trips >= 1 ) {
    if (m_full_trips > 1) {
      /* open m loop */
      libxsmm_generator_mateltwise_header_m_loop( io_generated_code, io_loop_label_tracker, i_micro_kernel_config, i_gp_reg_mapping->gp_reg_m_loop );
    }

    /* Initialize accumulators to zero (VPXORD reg with itself) */
    if ( compute_plain_vals_reduce > 0 ) {
      libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
          i_micro_kernel_config->instruction_set,
          LIBXSMM_X86_INSTR_VPXORD,
          i_micro_kernel_config->vector_name,
          reg_sum, reg_sum, reg_sum );
    }

    if ( compute_squared_vals_reduce > 0 ) {
      libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
          i_micro_kernel_config->instruction_set,
          LIBXSMM_X86_INSTR_VPXORD,
          i_micro_kernel_config->vector_name,
          reg_sum_squared, reg_sum_squared, reg_sum_squared );
    }

    /* Fully unrolled n loop: load column in (stride ldi), then accumulate */
    for (in = 0; in < n; in++) {
      reg_n = in % 30;

      libxsmm_x86_instruction_vec_move( io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->vmove_instruction_in,
          i_gp_reg_mapping->gp_reg_in,
          LIBXSMM_X86_GP_REG_UNDEF, 0,
          in * i_mateltwise_desc->ldi * i_micro_kernel_config->datatype_size_in,
          i_micro_kernel_config->vector_name,
          reg_n, 0, 1, 0 );

      if ( compute_plain_vals_reduce > 0 ) {
        /* reg_sum += column */
        libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
            i_micro_kernel_config->instruction_set,
            LIBXSMM_X86_INSTR_VADDPS,
            i_micro_kernel_config->vector_name,
            reg_n, reg_sum, reg_sum );
      }

      if ( compute_squared_vals_reduce > 0 ) {
        /* reg_sum_squared += column * column (fused multiply-add) */
        libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
            i_micro_kernel_config->instruction_set,
            LIBXSMM_X86_INSTR_VFMADD231PS,
            i_micro_kernel_config->vector_name,
            reg_n, reg_n, reg_sum_squared );
      }
    }

    /* Store computed results */
    if ( compute_plain_vals_reduce > 0 ) {
      libxsmm_x86_instruction_vec_move( io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->vmove_instruction_out,
          i_gp_reg_mapping->gp_reg_reduced_elts,
          LIBXSMM_X86_GP_REG_UNDEF, 0, 0,
          i_micro_kernel_config->vector_name,
          reg_sum, 0, 0, 1 );
    }

    if ( compute_squared_vals_reduce > 0 ) {
      libxsmm_x86_instruction_vec_move( io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->vmove_instruction_out,
          i_gp_reg_mapping->gp_reg_reduced_elts_squared,
          LIBXSMM_X86_GP_REG_UNDEF, 0, 0,
          i_micro_kernel_config->vector_name,
          reg_sum_squared, 0, 0, 1 );
    }

    /* Advance pointers by one 16-lane chunk when another trip (looped or the
     * masked remainder below) follows */
    if ((m_full_trips > 1) || (use_m_masking == 1)) {
      /* Adjust input and output pointer */
      libxsmm_x86_instruction_alu_imm( io_generated_code,
          i_micro_kernel_config->alu_add_instruction,
          i_gp_reg_mapping->gp_reg_in,
          16 * i_micro_kernel_config->datatype_size_in);

      if ( compute_plain_vals_reduce > 0 ) {
        libxsmm_x86_instruction_alu_imm( io_generated_code,
            i_micro_kernel_config->alu_add_instruction,
            i_gp_reg_mapping->gp_reg_reduced_elts,
            16 * i_micro_kernel_config->datatype_size_out);
      }

      if ( compute_squared_vals_reduce > 0 ) {
        libxsmm_x86_instruction_alu_imm( io_generated_code,
            i_micro_kernel_config->alu_add_instruction,
            i_gp_reg_mapping->gp_reg_reduced_elts_squared,
            16 * i_micro_kernel_config->datatype_size_out);
      }
    }

    if (m_full_trips > 1) {
      /* close m loop */
      libxsmm_generator_mateltwise_footer_m_loop( io_generated_code, io_loop_label_tracker, i_micro_kernel_config, i_gp_reg_mapping->gp_reg_m_loop, m_full_trips);
    }
  }

  /* Remainder chunk (m % 16 != 0): identical accumulation, but all input loads
   * and result stores are predicated with mask register 1 (use_m_masking == 1
   * doubles as the mask-register id passed to the move emitters) */
  if (use_m_masking == 1) {
    /* Initialize accumulators to zero */
    if ( compute_plain_vals_reduce > 0 ) {
      libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
          i_micro_kernel_config->instruction_set,
          LIBXSMM_X86_INSTR_VPXORD,
          i_micro_kernel_config->vector_name,
          reg_sum, reg_sum, reg_sum );
    }

    if ( compute_squared_vals_reduce > 0 ) {
      libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
          i_micro_kernel_config->instruction_set,
          LIBXSMM_X86_INSTR_VPXORD,
          i_micro_kernel_config->vector_name,
          reg_sum_squared, reg_sum_squared, reg_sum_squared );
    }

    for (in = 0; in < n; in++) {
      reg_n = in % 30;

      /* masked load of the remainder column chunk */
      libxsmm_x86_instruction_vec_move( io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->vmove_instruction_in,
          i_gp_reg_mapping->gp_reg_in,
          LIBXSMM_X86_GP_REG_UNDEF, 0,
          in * i_mateltwise_desc->ldi * i_micro_kernel_config->datatype_size_in,
          i_micro_kernel_config->vector_name,
          reg_n, use_m_masking, 1, 0 );

      if ( compute_plain_vals_reduce > 0 ) {
        libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
            i_micro_kernel_config->instruction_set,
            LIBXSMM_X86_INSTR_VADDPS,
            i_micro_kernel_config->vector_name,
            reg_n, reg_sum, reg_sum );
      }

      if ( compute_squared_vals_reduce > 0 ) {
        libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
            i_micro_kernel_config->instruction_set,
            LIBXSMM_X86_INSTR_VFMADD231PS,
            i_micro_kernel_config->vector_name,
            reg_n, reg_n, reg_sum_squared );
      }
    }

    /* Store computed results (masked) */
    if ( compute_plain_vals_reduce > 0 ) {
      libxsmm_x86_instruction_vec_move( io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->vmove_instruction_out,
          i_gp_reg_mapping->gp_reg_reduced_elts,
          LIBXSMM_X86_GP_REG_UNDEF, 0, 0,
          i_micro_kernel_config->vector_name,
          reg_sum, use_m_masking, 0, 1 );
    }

    if ( compute_squared_vals_reduce > 0 ) {
      libxsmm_x86_instruction_vec_move( io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->vmove_instruction_out,
          i_gp_reg_mapping->gp_reg_reduced_elts_squared,
          LIBXSMM_X86_GP_REG_UNDEF, 0, 0,
          i_micro_kernel_config->vector_name,
          reg_sum_squared, use_m_masking, 0, 1 );
    }
  }
}

/* Emits an AVX-512 microkernel that reduces an (m x n) input over its rows
 * (sum and/or sum-of-squares per column, per the descriptor's reduce flags).
 * N is processed in blocks of 16 columns via an in-register 16x16 transpose-
 * reduce (VBLENDMPS/VSHUFF64X2/VSHUFPS stages further below); masks cover both
 * the m % 16 and n % 16 remainders.  NOTE: definition continues past this
 * visible region. */
LIBXSMM_API_INTERN
void libxsmm_generator_reduce_rows_avx512_microkernel( libxsmm_generated_code* io_generated_code,
    libxsmm_loop_label_tracker* io_loop_label_tracker,
    libxsmm_mateltwise_gp_reg_mapping* i_gp_reg_mapping,
    const libxsmm_mateltwise_kernel_config* i_micro_kernel_config,
    const libxsmm_meltw_descriptor* i_mateltwise_desc ) {
  unsigned int i = 0, im, m, n, m_trips, n_trips, n_full_trips, use_m_masking, use_n_masking, mask_in_count, mask_out_count, n_cols_load = 16, compute_squared_vals_reduce, compute_plain_vals_reduce;

  /* Some rudimentary checking of M, N and LDs */
  if ( i_mateltwise_desc->m > i_mateltwise_desc->ldi ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA );
    return;
  }

  /* Decode which of the two reductions the descriptor requests */
  if ( (libxsmm_get_meltw_redu_flags((libxsmm_meltw_comp_redu_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_REDUCE_ELTS) > 0 ) {
    compute_plain_vals_reduce= 1;
  } else {
    compute_plain_vals_reduce= 0;
  }

  if ( (libxsmm_get_meltw_redu_flags((libxsmm_meltw_comp_redu_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_REDUCE_ELTS_SQUARED) > 0 ) {
    compute_squared_vals_reduce = 1;
  } else {
    compute_squared_vals_reduce = 0;
  }

  /* Configure the register mapping for this eltwise kernel */
  i_gp_reg_mapping->gp_reg_in = LIBXSMM_X86_GP_REG_R8;
  i_gp_reg_mapping->gp_reg_reduced_elts = LIBXSMM_X86_GP_REG_R9;
  i_gp_reg_mapping->gp_reg_reduced_elts_squared = LIBXSMM_X86_GP_REG_R10;
  i_gp_reg_mapping->gp_reg_m_loop = LIBXSMM_X86_GP_REG_R11;
  i_gp_reg_mapping->gp_reg_n_loop = LIBXSMM_X86_GP_REG_R12;

  /* load the input pointer and output pointer */
  libxsmm_x86_instruction_alu_mem( io_generated_code,
      i_micro_kernel_config->alu_mov_instruction,
      i_gp_reg_mapping->gp_reg_param_struct,
      LIBXSMM_X86_GP_REG_UNDEF, 0, 0,
      i_gp_reg_mapping->gp_reg_in, 0 );

  if ( compute_plain_vals_reduce > 0 ) {
    libxsmm_x86_instruction_alu_mem( io_generated_code,
        i_micro_kernel_config->alu_mov_instruction,
i_gp_reg_mapping->gp_reg_param_struct, LIBXSMM_X86_GP_REG_UNDEF, 0, 8, i_gp_reg_mapping->gp_reg_reduced_elts, 0 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_param_struct, LIBXSMM_X86_GP_REG_UNDEF, 0, 16, i_gp_reg_mapping->gp_reg_reduced_elts_squared, 0 ); } /* We fully unroll in M dimension, calculate mask if there is remainder */ m = i_mateltwise_desc->m; n = i_mateltwise_desc->n; use_m_masking = ( m % 16 == 0 ) ? 0 : 1; use_n_masking = ( n % 16 == 0 ) ? 0 : 1; m_trips = (m + 15) / 16; n_trips = (n + 15) / 16; n_full_trips = ( n % 16 == 0 ) ? n_trips : n_trips-1; /* Calculate input mask in case we see m_masking */ if (use_m_masking == 1) { /* Calculate mask reg 1 for input-reading */ mask_in_count = 16 - (m % 16); libxsmm_generator_mateltwise_initialize_avx512_mask(io_generated_code, LIBXSMM_X86_GP_REG_R13, 1, mask_in_count, LIBXSMM_GEMM_PRECISION_F32); } /* Calculate output mask in case we see n_masking */ if (use_n_masking == 1) { /* Calculate mask reg 2 for output-writing */ mask_out_count = 16 - (n % 16); libxsmm_generator_mateltwise_initialize_avx512_mask(io_generated_code, LIBXSMM_X86_GP_REG_R13, 2, mask_out_count, LIBXSMM_GEMM_PRECISION_F32); } /* move blend mask value to GP register and to mask register 7 */ libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, LIBXSMM_X86_GP_REG_R13, 0xff00 ); libxsmm_x86_instruction_mask_move( io_generated_code, LIBXSMM_X86_INSTR_KMOVW, LIBXSMM_X86_GP_REG_R13, 7, 0 ); if (n_full_trips >= 1) { if (n_full_trips > 1) { /* open n loop */ libxsmm_generator_mateltwise_header_n_loop( io_generated_code, io_loop_label_tracker, i_micro_kernel_config, i_gp_reg_mapping->gp_reg_n_loop ); } /* We fully unroll M loop here... 
*/ for (im = 0; im < m_trips; im++) { /* load 16 columns of input matrix */ for (i = 0 ; i < 16; i++) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, i_gp_reg_mapping->gp_reg_in, LIBXSMM_X86_GP_REG_UNDEF, 0, (im * 16 + i * i_mateltwise_desc->ldi) * i_micro_kernel_config->datatype_size_in, i_micro_kernel_config->vector_name, i, (im == (m_trips-1)) ? use_m_masking : 0, 1, 0 ); } /* 1st stage */ /* zmm0/zmm4; 4444 4444 4444 4444 / 0000 0000 0000 0000 -> zmm0: 4444 4444 0000 0000 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 4, 0, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 4, 0, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 0, 0, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 4, 4, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, 
i_micro_kernel_config->vector_name, 16, 17, 0 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 24 ); } /* zmm8/zmm12; cccc cccc cccc cccc / 8888 8888 8888 8888 -> zmm8: cccc cccc 8888 8888 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 12, 8, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 12, 8, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 8, 8, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 12, 12, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 8 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, 
i_micro_kernel_config->vector_name, 20, 21, 28 ); } /* zmm1/zmm5; 5555 5555 5555 5555 / 1111 1111 1111 1111 -> zmm1: 5555 5555 1111 1111 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 5, 1, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 5, 1, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 1, 1, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 5, 5, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 1 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 25 ); } /* zmm9/zmm13; dddd dddd dddd dddd / 9999 9999 9999 9999 -> zmm9: dddd dddd 9999 9999 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, 
i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 13, 9, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 13, 9, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 9, 9, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 13, 13, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 9 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 29 ); } /* zmm2/zmm6; 6666 6666 6666 6666 / 2222 2222 2222 2222 -> zmm2: 6666 6666 2222 2222 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 6, 2, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, 
LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 6, 2, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 2, 2, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 6, 6, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 2 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 26 ); } /* zmm10/zmm14; eeee eeee eeee eeee / aaaa aaaa aaaa aaaa -> zmm10: eeee eeee aaaa aaaa */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 14, 10, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 14, 10, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, 
LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 10, 10, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 14, 14, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 10 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 30 ); } /* zmm3/zmm7; 7777 7777 7777 7777 / 3333 3333 3333 3333 -> zmm3: 7777 7777 3333 3333 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 7, 3, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 7, 3, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 3, 3, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 7, 7, 19 
); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 3 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 27 ); } /* zmm11/zmm15; ffff ffff ffff ffff / bbbb bbbb bbbb bbbb -> zmm11: ffff ffff bbbb bbbb */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 15, 11, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 15, 11, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 11, 11, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 15, 15, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); 
libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 11 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 31 ); } /* 2nd stage */ /* zmm0/zmm8; 4444 4444 0000 0000 / cccc cccc 8888 8888 -> zmm0: cccc 8888 4444 0000 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 8, 0, 16, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 8, 0, 17, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 0 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 28, 24, 20, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 28, 24, 21, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 24 ); } /* zmm1/zmm9; 5555 5555 1111 1111 / dddd dddd 9999 9999 -> zmm1: dddd 9999 5555 1111 */ if ( 
compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 9, 1, 16, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 9, 1, 17, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 1 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 29, 25, 20, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 29, 25, 21, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 25 ); } /* zmm2/zmm10; 6666 6666 2222 2222 / eeee eeee aaaa aaaa -> zmm2: eeee aaaa 6666 2222 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 10, 2, 16, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 10, 2, 17, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 2 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, 
LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 30, 26, 20, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 30, 26, 21, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 26 ); } /* zmm3/zmm11: 7777 7777 3333 3333 / ffff ffff bbbb bbbb -> zmm3: ffff bbbb 7777 3333 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 11, 3, 16, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 11, 3, 17, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 3 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 31, 27, 20, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 31, 27, 21, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 27 ); } /* 3rd stage */ /* zmm0/zmm1; cccc 8888 4444 0000 / dddd 9999 5555 1111 -> zmm0: ddcc 9988 5544 1100 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, 
i_micro_kernel_config->vector_name, 1, 0, 16, 0x44 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 1, 0, 17, 0xee ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 0 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 25, 24, 20, 0x44 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 25, 24, 21, 0xee ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 24 ); } /* zmm2/zmm3; eeee aaaa 6666 2222 / ffff bbbb 7777 3333 -> zmm2: ffee bbaa 7766 3322 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 3, 2, 16, 0x44 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 3, 2, 17, 0xee ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 2 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 27, 26, 20, 0x44 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, 
LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 27, 26, 21, 0xee ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 26 ); } /* 4th stage */ /* zmm0/zmm2; ddcc 9988 5544 1100 / ffee bbaa 7766 3322 -> zmm0: fedc ba98 7654 3210 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 2, 0, 16, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 2, 0, 17, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 0 ); /* Update the running reduction result */ if (im == 0) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_reduced_elts, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 0, 0, 0, 1 ); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_reduced_elts, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 1, 0, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 1, 0, 0 ); libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_reduced_elts, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 0, 0, 0, 1 ); } } if ( compute_squared_vals_reduce > 
0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 26, 24, 20, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 26, 24, 21, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 24 ); /* Update the running reduction result */ if (im == 0) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_reduced_elts_squared, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 24, 0, 0, 1 ); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_reduced_elts_squared, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 25, 0, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 25, 24, 24 ); libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_reduced_elts_squared, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 24, 0, 0, 1 ); } } } if ((n_full_trips > 1) || (n % 16 != 0)) { /* Adjust input and output pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_in, 16 * i_mateltwise_desc->ldi * i_micro_kernel_config->datatype_size_in); if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, 
i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_reduced_elts, 16 * i_micro_kernel_config->datatype_size_out); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_reduced_elts_squared, 16 * i_micro_kernel_config->datatype_size_out); } } if (n_full_trips > 1) { /* close n loop */ libxsmm_generator_mateltwise_footer_n_loop( io_generated_code, io_loop_label_tracker, i_micro_kernel_config, i_gp_reg_mapping->gp_reg_n_loop, n_full_trips); } } /* In this case we load only partial number of columns */ n_cols_load = n % 16; if (n_cols_load != 0) { /* We fully unroll M loop here... */ for (im = 0; im < m_trips; im++) { /* load 16 columns of input matrix */ for (i = 0 ; i < n_cols_load; i++) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, i_gp_reg_mapping->gp_reg_in, LIBXSMM_X86_GP_REG_UNDEF, 0, (im * 16 + i * i_mateltwise_desc->ldi) * i_micro_kernel_config->datatype_size_in, i_micro_kernel_config->vector_name, i, (im == (m_trips-1)) ? 
use_m_masking : 0, 1, 0 ); } for ( i = n_cols_load; i < 16; i++) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VPXORD, i_micro_kernel_config->vector_name, i, i, i ); } /* 1st stage */ /* zmm0/zmm4; 4444 4444 4444 4444 / 0000 0000 0000 0000 -> zmm0: 4444 4444 0000 0000 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 4, 0, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 4, 0, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 0, 0, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 4, 4, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 0 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 
24 ); } if (n_cols_load > 7) { /* zmm8/zmm12; cccc cccc cccc cccc / 8888 8888 8888 8888 -> zmm8: cccc cccc 8888 8888 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 12, 8, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 12, 8, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 8, 8, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 12, 12, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 8 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 28 ); } } /* zmm1/zmm5; 5555 5555 5555 5555 / 1111 1111 1111 1111 -> zmm1: 5555 5555 1111 1111 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, 
i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 5, 1, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 5, 1, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 1, 1, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 5, 5, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 1 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 25 ); } if (n_cols_load > 8) { /* zmm9/zmm13; dddd dddd dddd dddd / 9999 9999 9999 9999 -> zmm9: dddd dddd 9999 9999 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 13, 9, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, 
i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 13, 9, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 9, 9, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 13, 13, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 9 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 29 ); } } /* zmm2/zmm6; 6666 6666 6666 6666 / 2222 2222 2222 2222 -> zmm2: 6666 6666 2222 2222 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 6, 2, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 6, 2, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, 
i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 2, 2, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 6, 6, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 2 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 26 ); } if (n_cols_load > 9) { /* zmm10/zmm14; eeee eeee eeee eeee / aaaa aaaa aaaa aaaa -> zmm10: eeee eeee aaaa aaaa */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 14, 10, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 14, 10, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 10, 10, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, 
LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 14, 14, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 10 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 30 ); } } /* zmm3/zmm7; 7777 7777 7777 7777 / 3333 3333 3333 3333 -> zmm3: 7777 7777 3333 3333 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 7, 3, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 7, 3, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 3, 3, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 7, 7, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 
19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 3 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 27 ); } if (n_cols_load > 10) { /* zmm11/zmm15; ffff ffff ffff ffff / bbbb bbbb bbbb bbbb -> zmm11: ffff ffff bbbb bbbb */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 15, 11, 16, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 15, 11, 17, 0x4e ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 11, 11, 18 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, 15, 15, 19 ); libxsmm_x86_instruction_vec_compute_reg_mask( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBLENDMPS, i_micro_kernel_config->vector_name, 19, 18, 20, LIBXSMM_X86_IMM_UNDEF, 7, 0 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 19, 
18, 21, 0x4e ); } if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 11 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 31 ); } } /* 2nd stage */ /* zmm0/zmm8; 4444 4444 0000 0000 / cccc cccc 8888 8888 -> zmm0: cccc 8888 4444 0000 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 8, 0, 16, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 8, 0, 17, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 0 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 28, 24, 20, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 28, 24, 21, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 24 ); } /* zmm1/zmm9; 5555 5555 1111 1111 / dddd dddd 9999 9999 -> zmm1: dddd 9999 5555 1111 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, 
i_micro_kernel_config->vector_name, 9, 1, 16, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 9, 1, 17, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 1 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 29, 25, 20, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 29, 25, 21, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 25 ); } /* zmm2/zmm10; 6666 6666 2222 2222 / eeee eeee aaaa aaaa -> zmm2: eeee aaaa 6666 2222 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 10, 2, 16, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 10, 2, 17, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 2 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 30, 26, 20, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, 
i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 30, 26, 21, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 26 ); } /* zmm3/zmm11: 7777 7777 3333 3333 / ffff ffff bbbb bbbb -> zmm3: ffff bbbb 7777 3333 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 11, 3, 16, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 11, 3, 17, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 3 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 31, 27, 20, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 31, 27, 21, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 27 ); } /* 3rd stage */ /* zmm0/zmm1; cccc 8888 4444 0000 / dddd 9999 5555 1111 -> zmm0: ddcc 9988 5544 1100 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 1, 0, 16, 0x44 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, 
LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 1, 0, 17, 0xee ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 0 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 25, 24, 20, 0x44 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 25, 24, 21, 0xee ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 24 ); } /* zmm2/zmm3; eeee aaaa 6666 2222 / ffff bbbb 7777 3333 -> zmm2: ffee bbaa 7766 3322 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 3, 2, 16, 0x44 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 3, 2, 17, 0xee ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 2 ); } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 27, 26, 20, 0x44 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 27, 26, 21, 0xee ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, 
i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 26 ); } /* 4th stage */ /* zmm0/zmm2; ddcc 9988 5544 1100 / ffee bbaa 7766 3322 -> zmm0: fedc ba98 7654 3210 */ if ( compute_plain_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 2, 0, 16, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 2, 0, 17, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 0 ); /* Update the running reduction result */ if (im == 0) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_reduced_elts, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 0, 2, 0, 1 ); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_reduced_elts, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 1, 2, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 1, 0, 0 ); libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_reduced_elts, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 0, 2, 0, 1 ); } } if ( compute_squared_vals_reduce > 0 ) { libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, 
i_micro_kernel_config->vector_name, 26, 24, 20, 0x88 ); libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 26, 24, 21, 0xdd ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 20, 21, 24 ); /* Update the running reduction result */ if (im == 0) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_reduced_elts_squared, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 24, 2, 0, 1 ); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_reduced_elts_squared, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 25, 2, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 25, 24, 24 ); libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_reduced_elts_squared, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, 24, 2, 0, 1 ); } } } } } LIBXSMM_API_INTERN void libxsmm_generator_scale_avx512_microkernel( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, libxsmm_mateltwise_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_mateltwise_kernel_config* i_micro_kernel_config, const libxsmm_meltw_descriptor* i_mateltwise_desc ) { unsigned int in, im, m, n, m_full_trips, m_trips, use_m_masking, mask_count, reg_n, reg_m; unsigned int scale_rows = 0, scale_cols = 0, perform_scale = 0, perform_shift = 0, perform_addbias = 
0; unsigned int reg_shift = 31, reg_bias = 30, reg_scale = 29; /* Some rudimentary checking of M, N and LDs*/ if ( i_mateltwise_desc->m > i_mateltwise_desc->ldi ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA ); return; } if ( (((libxsmm_get_meltw_scal_flags((libxsmm_meltw_comp_scal_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_SCALE_ROWS) > 0) && ((libxsmm_get_meltw_scal_flags((libxsmm_meltw_comp_scal_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_SCALE_COLS) > 0)) || (((libxsmm_get_meltw_scal_flags((libxsmm_meltw_comp_scal_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_SCALE_ROWS) == 0) && ((libxsmm_get_meltw_scal_flags((libxsmm_meltw_comp_scal_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_SCALE_COLS) == 0)) ) { /* This should not happen */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_GENERAL ); return; } /* Determine what operations to perform */ scale_rows = ((libxsmm_get_meltw_scal_flags((libxsmm_meltw_comp_scal_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_SCALE_ROWS) > 0) ? 1 : 0; scale_cols = ((libxsmm_get_meltw_scal_flags((libxsmm_meltw_comp_scal_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_SCALE_COLS) > 0) ? 1 : 0; perform_scale = ((libxsmm_get_meltw_scal_flags((libxsmm_meltw_comp_scal_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_SCALE_MULT) > 0) ? 1 : 0; perform_shift = ((libxsmm_get_meltw_scal_flags((libxsmm_meltw_comp_scal_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_SCALE_SHIFT) > 0) ? 1 : 0; perform_addbias = ((libxsmm_get_meltw_scal_flags((libxsmm_meltw_comp_scal_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS) > 0) ? 
1 : 0; /* Configure the register mapping for this eltwise kernel */ i_gp_reg_mapping->gp_reg_in = LIBXSMM_X86_GP_REG_R8; i_gp_reg_mapping->gp_reg_out = LIBXSMM_X86_GP_REG_R9; i_gp_reg_mapping->gp_reg_shift_vals = LIBXSMM_X86_GP_REG_R10; i_gp_reg_mapping->gp_reg_scale_vals = LIBXSMM_X86_GP_REG_R11; i_gp_reg_mapping->gp_reg_bias_vals = LIBXSMM_X86_GP_REG_R12; i_gp_reg_mapping->gp_reg_m_loop = LIBXSMM_X86_GP_REG_R13; i_gp_reg_mapping->gp_reg_n_loop = LIBXSMM_X86_GP_REG_R14; /* We fully unroll in N dimension, calculate m-mask if there is remainder */ m = i_mateltwise_desc->m; n = i_mateltwise_desc->n; use_m_masking = ( m % 16 == 0 ) ? 0 : 1; /* Calculate input mask in case we see m_masking */ if (use_m_masking == 1) { /* Calculate mask reg 1 for input-reading */ mask_count = 16 - (m % 16); libxsmm_generator_mateltwise_initialize_avx512_mask(io_generated_code, LIBXSMM_X86_GP_REG_R13, 1, mask_count, LIBXSMM_GEMM_PRECISION_F32); } /* load the input pointer(s) and output pointer */ libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_param_struct, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_gp_reg_mapping->gp_reg_in, 0 ); if ( perform_shift > 0 ) { libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_param_struct, LIBXSMM_X86_GP_REG_UNDEF, 0, 8, i_gp_reg_mapping->gp_reg_shift_vals, 0 ); } if ( perform_scale > 0 ) { libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_param_struct, LIBXSMM_X86_GP_REG_UNDEF, 0, 16, i_gp_reg_mapping->gp_reg_scale_vals, 0 ); } if ( perform_addbias > 0 ) { libxsmm_x86_instruction_alu_mem( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_param_struct, LIBXSMM_X86_GP_REG_UNDEF, 0, 24, i_gp_reg_mapping->gp_reg_bias_vals, 0 ); } libxsmm_x86_instruction_alu_mem( io_generated_code, 
i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_param_struct, LIBXSMM_X86_GP_REG_UNDEF, 0, 32, i_gp_reg_mapping->gp_reg_out, 0 ); /* If scaling cols: follow an MN loop order with fully unrolled N loop */ if (scale_cols == 1) { m_full_trips = m / 16; if ( m_full_trips >= 1 ) { if (m_full_trips > 1) { /* open m loop */ libxsmm_generator_mateltwise_header_m_loop( io_generated_code, io_loop_label_tracker, i_micro_kernel_config, i_gp_reg_mapping->gp_reg_m_loop ); } /* Load the correspodning columns to be used for scaling */ if ( perform_shift > 0 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, i_gp_reg_mapping->gp_reg_shift_vals, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, reg_shift, 0, 1, 0 ); } if ( perform_scale > 0 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, i_gp_reg_mapping->gp_reg_scale_vals, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, reg_scale, 0, 1, 0 ); } if ( perform_addbias > 0 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, i_gp_reg_mapping->gp_reg_bias_vals, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, reg_bias, 0, 1, 0 ); } for (in = 0; in < n; in++) { reg_n = in % 29; /* Load part of the column */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, i_gp_reg_mapping->gp_reg_in, LIBXSMM_X86_GP_REG_UNDEF, 0, in * i_mateltwise_desc->ldi * i_micro_kernel_config->datatype_size_in, i_micro_kernel_config->vector_name, reg_n, 0, 1, 0 ); /* Perform transformations */ if ( perform_shift > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, 
LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, reg_n, reg_shift, reg_n ); } if ( perform_scale > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, reg_n, reg_scale, reg_n ); } if ( perform_addbias> 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, reg_n, reg_bias, reg_n ); } /* Store part of the column */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_out, LIBXSMM_X86_GP_REG_UNDEF, 0, in * i_mateltwise_desc->ldo * i_micro_kernel_config->datatype_size_out, i_micro_kernel_config->vector_name, reg_n, 0, 0, 1 ); } if ((m_full_trips > 1) || (use_m_masking == 1)) { /* Adjust input and output pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_in, 16 * i_micro_kernel_config->datatype_size_in); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_out, 16 * i_micro_kernel_config->datatype_size_out); if ( perform_shift > 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_shift_vals, 16 * i_micro_kernel_config->datatype_size_in); } if ( perform_scale > 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_scale_vals, 16 * i_micro_kernel_config->datatype_size_in); } if ( perform_addbias > 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_bias_vals, 16 * i_micro_kernel_config->datatype_size_in); } } if (m_full_trips > 1) { /* close m loop */ 
libxsmm_generator_mateltwise_footer_m_loop( io_generated_code, io_loop_label_tracker, i_micro_kernel_config, i_gp_reg_mapping->gp_reg_m_loop, m_full_trips); } } if (use_m_masking == 1) { /* Load the correspodning columns to be used for scaling */ if ( perform_shift > 0 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, i_gp_reg_mapping->gp_reg_shift_vals, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, reg_shift, 0, 1, 0 ); } if ( perform_scale > 0 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, i_gp_reg_mapping->gp_reg_scale_vals, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, reg_scale, 0, 1, 0 ); } if ( perform_addbias > 0 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, i_gp_reg_mapping->gp_reg_bias_vals, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, reg_bias, 0, 1, 0 ); } for (in = 0; in < n; in++) { reg_n = in % 29; /* Load part of the column */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, i_gp_reg_mapping->gp_reg_in, LIBXSMM_X86_GP_REG_UNDEF, 0, in * i_mateltwise_desc->ldi * i_micro_kernel_config->datatype_size_in, i_micro_kernel_config->vector_name, reg_n, use_m_masking, 1, 0 ); /* Perform transformations */ if ( perform_shift > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, reg_n, reg_shift, reg_n ); } if ( perform_scale > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, reg_n, reg_scale, reg_n 
); } if ( perform_addbias> 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, reg_n, reg_bias, reg_n ); } /* Store part of the column */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_out, LIBXSMM_X86_GP_REG_UNDEF, 0, in * i_mateltwise_desc->ldo * i_micro_kernel_config->datatype_size_out, i_micro_kernel_config->vector_name, reg_n, use_m_masking, 0, 1 ); } } } /* If scaling rows: follow an NM loop order with fully unrolled M loop */ if (scale_rows == 1) { m_trips = (m + 15) / 16; if (n > 1) { /* open n loop */ libxsmm_generator_mateltwise_header_n_loop( io_generated_code, io_loop_label_tracker, i_micro_kernel_config, i_gp_reg_mapping->gp_reg_n_loop ); } /* Load the correspodning columns to be used for scaling */ if ( perform_shift > 0 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBROADCASTSS, i_gp_reg_mapping->gp_reg_shift_vals, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, reg_shift, 0, 1, 0 ); } if ( perform_scale > 0 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBROADCASTSS, i_gp_reg_mapping->gp_reg_scale_vals, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, reg_scale, 0, 1, 0 ); } if ( perform_addbias > 0 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VBROADCASTSS, i_gp_reg_mapping->gp_reg_bias_vals, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, reg_bias, 0, 1, 0 ); } for (im = 0; im < m_trips; im++) { reg_m = im % 29; libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_in, 
i_gp_reg_mapping->gp_reg_in, LIBXSMM_X86_GP_REG_UNDEF, 0, im * 16 * i_micro_kernel_config->datatype_size_in, i_micro_kernel_config->vector_name, reg_m, (im == (m_trips-1)) ? use_m_masking : 0, 1, 0 ); /* Perform transformations */ if ( perform_shift > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, reg_m, reg_shift, reg_m ); } if ( perform_scale > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMULPS, i_micro_kernel_config->vector_name, reg_m, reg_scale, reg_m ); } if ( perform_addbias > 0 ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, reg_m, reg_bias, reg_m ); } /* Store the result */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmove_instruction_out, i_gp_reg_mapping->gp_reg_out, LIBXSMM_X86_GP_REG_UNDEF, 0, im * 16 * i_micro_kernel_config->datatype_size_out, i_micro_kernel_config->vector_name, reg_m, (im == (m_trips-1)) ? 
use_m_masking : 0, 0, 1 ); } if (n > 1) { /* Adjust input and output pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_in, i_mateltwise_desc->ldi * i_micro_kernel_config->datatype_size_in); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_out, i_mateltwise_desc->ldo * i_micro_kernel_config->datatype_size_out); if ( perform_shift > 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_shift_vals, i_micro_kernel_config->datatype_size_in); } if ( perform_scale > 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_scale_vals, i_micro_kernel_config->datatype_size_in); } if ( perform_addbias > 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_bias_vals, i_micro_kernel_config->datatype_size_in); } /* close n loop */ libxsmm_generator_mateltwise_footer_n_loop( io_generated_code, io_loop_label_tracker, i_micro_kernel_config, i_gp_reg_mapping->gp_reg_n_loop, n); } } } LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_avx_avx512_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_meltw_descriptor* i_mateltwise_desc) { libxsmm_mateltwise_kernel_config l_kernel_config; libxsmm_mateltwise_gp_reg_mapping l_gp_reg_mapping; libxsmm_loop_label_tracker l_loop_label_tracker; /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* define gp register mapping */ memset(&l_gp_reg_mapping, 0, sizeof(l_gp_reg_mapping)); #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_param_struct = LIBXSMM_X86_GP_REG_RCX; #else /* match calling convention on Linux */ l_gp_reg_mapping.gp_reg_param_struct = LIBXSMM_X86_GP_REG_RDI; #endif /* define mateltwise kernel 
config */ libxsmm_generator_mateltwise_init_micro_kernel_config_fullvector( io_generated_code, &l_kernel_config, io_generated_code->arch, i_mateltwise_desc); /* open asm */ libxsmm_x86_instruction_open_stream_mateltwise( io_generated_code, l_gp_reg_mapping.gp_reg_param_struct, NULL ); /* Depending on the elementwise function, dispatch the proper code JITer */ if ((i_mateltwise_desc->operation == LIBXSMM_MELTW_OPERATION_CVTFP32BF16) || (i_mateltwise_desc->operation == LIBXSMM_MELTW_OPERATION_CVTFP32BF16_ACT) || (i_mateltwise_desc->operation == LIBXSMM_MELTW_OPERATION_ACT_CVTFP32BF16)) { if ( (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_mateltwise_desc->datatype )) && (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_OUT( i_mateltwise_desc->datatype ))) { libxsmm_generator_cvtfp32bf16_avx512_microkernel( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_kernel_config, i_mateltwise_desc ); } else { /* This should not happen */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNSUP_DATATYPE ); return; } } else if (i_mateltwise_desc->operation == LIBXSMM_MELTW_OPERATION_REDUCE) { if ( (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_mateltwise_desc->datatype )) && (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT( i_mateltwise_desc->datatype ))) { if ((libxsmm_get_meltw_redu_flags((libxsmm_meltw_comp_redu_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_REDUCE_ROWS) > 0) { libxsmm_generator_reduce_rows_avx512_microkernel( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_kernel_config, i_mateltwise_desc ); } else if ((libxsmm_get_meltw_redu_flags((libxsmm_meltw_comp_redu_flags)i_mateltwise_desc->flags) & LIBXSMM_MELTW_FLAG_REDUCE_COLS) > 0) { libxsmm_generator_reduce_cols_avx512_microkernel( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_kernel_config, i_mateltwise_desc ); } else { /* This should not happen */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_GENERAL ); return; } } else { /* This 
should not happen */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNSUP_DATATYPE ); return; } } else if (i_mateltwise_desc->operation == LIBXSMM_MELTW_OPERATION_SCALE) { if ( (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_mateltwise_desc->datatype )) && (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT( i_mateltwise_desc->datatype ))) { libxsmm_generator_scale_avx512_microkernel( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_kernel_config, i_mateltwise_desc ); } else { /* This should not happen */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNSUP_DATATYPE ); return; } } else { /* This should not happen */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_GENERAL ); return; } /* close asm */ libxsmm_x86_instruction_close_stream_mateltwise( io_generated_code, NULL ); } libxsmm-1.17/src/generator_mateltwise_avx_avx512.h000066400000000000000000000160121415223013700222630ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evanelos Georganas, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_MATELTWISE_AVX_AVX512_H #define GENERATOR_MATELTWISE_AVX_AVX512_H #include "generator_common.h" LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_header_m_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_mateltwise_kernel_config* i_kernel_config, const unsigned int i_gp_reg_m_loop ); LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_footer_m_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_mateltwise_kernel_config* i_kernel_config, const unsigned int i_gp_reg_m_loop, const unsigned int i_m ); LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_header_n_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_mateltwise_kernel_config* i_kernel_config, const unsigned int i_gp_reg_n_loop ); LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_footer_n_loop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_mateltwise_kernel_config* i_kernel_config, const unsigned int i_gp_reg_n_loop, const unsigned int i_n ); LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_initialize_avx512_mask( libxsmm_generated_code* io_generated_code, const unsigned int i_gp_reg_tmp, const unsigned int i_mask_reg, const unsigned int i_mask_count, const unsigned int i_precision); LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_init_micro_kernel_config_fullvector( libxsmm_generated_code* io_generated_code, libxsmm_mateltwise_kernel_config* io_micro_kernel_config, const unsigned int i_arch, const libxsmm_meltw_descriptor* i_mateltwise_desc); LIBXSMM_API_INTERN void libxsmm_generator_tanh_ps_rational_78_avx512( libxsmm_generated_code* io_generated_code, const libxsmm_mateltwise_kernel_config* i_micro_kernel_config, const unsigned int i_vec_x, const 
unsigned int i_vec_x2, const unsigned int i_vec_nom, const unsigned int i_vec_denom, const unsigned int i_mask_hi, const unsigned int i_mask_lo, const unsigned int i_vec_c0, const unsigned int i_vec_c1, const unsigned int i_vec_c2, const unsigned int i_vec_c3, const unsigned int i_vec_c1_d, const unsigned int i_vec_c2_d, const unsigned int i_vec_c3_d, const unsigned int i_vec_hi_bound, const unsigned int i_vec_lo_bound, const unsigned int i_vec_ones, const unsigned int i_vec_neg_ones); LIBXSMM_API_INTERN void libxsmm_generator_cvtfp32bf16_avx512_replacement_sequence( libxsmm_generated_code* io_generated_code, const libxsmm_mateltwise_kernel_config* i_micro_kernel_config, const unsigned int i_vec_reg ); LIBXSMM_API_INTERN void libxsmm_generator_cvtfp32bf16_avx512_microkernel( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, libxsmm_mateltwise_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_mateltwise_kernel_config* i_micro_kernel_config, const libxsmm_meltw_descriptor* i_mateltwise_desc ); LIBXSMM_API_INTERN void libxsmm_generator_reduce_cols_avx512_microkernel( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, libxsmm_mateltwise_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_mateltwise_kernel_config* i_micro_kernel_config, const libxsmm_meltw_descriptor* i_mateltwise_desc ); LIBXSMM_API_INTERN void libxsmm_generator_reduce_rows_avx512_microkernel( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, libxsmm_mateltwise_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_mateltwise_kernel_config* i_micro_kernel_config, const libxsmm_meltw_descriptor* i_mateltwise_desc ); LIBXSMM_API_INTERN void libxsmm_generator_scale_avx512_microkernel( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, libxsmm_mateltwise_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_mateltwise_kernel_config* 
i_micro_kernel_config, const libxsmm_meltw_descriptor* i_mateltwise_desc ); LIBXSMM_API_INTERN void libxsmm_generator_mateltwise_avx_avx512_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_meltw_descriptor* i_mateltw_desc ); #endif /* GENERATOR_MATELTWISE_AVX_AVX512_H */ libxsmm-1.17/src/generator_packed.c000066400000000000000000000105021415223013700173410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry, Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include "generator_packed_getrf_avx_avx512.h" #include "generator_packed_trsm_avx_avx512.h" #include "generator_packed_trmm_avx_avx512.h" #include "generator_packed_gemm_avx_avx512.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif LIBXSMM_API void libxsmm_generator_pgemm_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_pgemm_descriptor* i_packed_pgemm_desc, int i_arch, ... 
) { const char *const cpuid = libxsmm_cpuid_name( i_arch ); /* generate kernel */ if ( LIBXSMM_X86_AVX <= i_arch ) { #if defined(GARBAGE_PARAMETERS) unsigned int iunroll, junroll, loopi, loopj; va_list args; va_start(args, i_arch); iunroll = va_arg(args, unsigned int); junroll = va_arg(args, unsigned int); loopi = va_arg(args, unsigned int); loopj = va_arg(args, unsigned int); va_end(args); libxsmm_generator_packed_gemm_avx_avx512_kernel( io_generated_code, i_packed_pgemm_desc, cpuid, iunroll, junroll, loopi, loopj ); #else libxsmm_generator_packed_gemm_avx_avx512_kernel( io_generated_code, i_packed_pgemm_desc, cpuid ); #endif } else { /* TODO fix this error */ LIBXSMM_HANDLE_ERROR(io_generated_code, LIBXSMM_ERR_ARCH); return; } } LIBXSMM_API void libxsmm_generator_getrf_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_getrf_descriptor* i_packed_getrf_desc, int i_arch ) { const char *const cpuid = libxsmm_cpuid_name( i_arch ); /* generate kernel */ if ( LIBXSMM_X86_AVX <= i_arch ) { libxsmm_generator_packed_getrf_avx_avx512_kernel( io_generated_code, i_packed_getrf_desc, cpuid ); } else { /* TODO fix this error */ LIBXSMM_HANDLE_ERROR(io_generated_code, LIBXSMM_ERR_ARCH); return; } } /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_trsm_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_trsm_descriptor* i_packed_trsm_desc, const char* i_arch ) { /* generate kernel */ if ( (strcmp(i_arch, "skx") == 0) || (strcmp(i_arch, "knm") == 0) || (strcmp(i_arch, "knl") == 0) || (strcmp(i_arch, "hsw") == 0) || (strcmp(i_arch, "snb") == 0) ) { libxsmm_generator_packed_trsm_avx_avx512_kernel( io_generated_code, i_packed_trsm_desc, i_arch ); } else { /* TODO fix this error */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); return; } } /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_trmm_kernel(libxsmm_generated_code* io_generated_code, const libxsmm_trmm_descriptor* 
i_packed_trmm_desc, const char* i_arch) { /* generate kernel */ if ( (strcmp(i_arch, "skx") == 0) || (strcmp(i_arch, "knm") == 0) || (strcmp(i_arch, "knl") == 0) || (strcmp(i_arch, "hsw") == 0) || (strcmp(i_arch, "snb") == 0) ) { libxsmm_generator_packed_trmm_avx_avx512_kernel( io_generated_code, i_packed_trmm_desc, i_arch ); } else { /* TODO fix this error */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); return; } } libxsmm-1.17/src/generator_packed_aux.h000066400000000000000000000541611415223013700202340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_PACKED_AUX_H #define GENERATOR_PACKED_AUX_H #include "generator_x86_instructions.h" #include "generator_common.h" #include LIBXSMM_API_INLINE void compact_load_parameter_ ( libxsmm_generated_code* io_code, double alpha, unsigned int reg, unsigned int number, char regset ) { int datasize; int i; if ( (number == 2) && (regset=='x') ) { datasize = 8; } else if ( (number == 4) && (regset=='x') ) { datasize = 4; } else if ( (number == 4) && (regset=='y') ) { datasize = 8; } else if ( (number == 8) && (regset=='y') ) { datasize = 4; } else if ( (number == 8) && (regset=='z') ) { datasize = 8; } else if ( (number == 16) && (regset=='z') ) { datasize = 4; } else { fprintf(stderr,"Unknown number=%u regset=%c combo for compact_load_parameter\n",number,regset); exit(-1); } if ( datasize == 8 ) { double vector[16]; for ( i = 0; i < (int)number; i++ ) vector[i]=alpha; libxsmm_x86_instruction_full_vec_load_of_constants ( io_code, (unsigned char*) vector, "loadconst", regset, reg ); } else { float vector[16]; for ( i = 0; i < (int)number; i++ ) vector[i]=(float)alpha; libxsmm_x86_instruction_full_vec_load_of_constants ( io_code, (unsigned char*) vector, "loadconst", regset, reg ); } } LIBXSMM_API_INLINE void compact_set_zero_ ( libxsmm_generated_code* io_code, unsigned int reg, unsigned int number, unsigned int datasize, char regset ) { LIBXSMM_UNUSED(datasize); if ( (number == 8) && (regset=='z') ) { libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX512, LIBXSMM_X86_INSTR_VXORPD, regset, reg, reg, reg ); } else if ( (number == 16) && (regset=='z') ) { libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX512, LIBXSMM_X86_INSTR_VXORPS, regset, reg, reg, reg ); } else if ( (number == 8) && (regset=='y') ) { libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VXORPS, regset, reg, reg, reg ); } else if ( (number == 4) && 
(regset=='y') ) { libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VXORPD, regset, reg, reg, reg ); } } LIBXSMM_API_INLINE void compact_set_one_ ( libxsmm_generated_code* io_code, unsigned int reg, unsigned int number, unsigned int datasize, char regset ) { double dvector[16]; float svector[16]; int i; if ( number > 16 ) { fprintf(stderr,"loading too large a parameter for compact_set_one_\n"); exit(-1); } for ( i = 0; i < (int)number; i++ ) { dvector[i]=1.0; svector[i]=1.0; } if ( datasize == 4 ) libxsmm_x86_instruction_full_vec_load_of_constants ( io_code, (unsigned char*) svector, "loadone", regset, reg ); else if ( datasize == 8 ) libxsmm_x86_instruction_full_vec_load_of_constants ( io_code, (unsigned char*) dvector, "loadone", regset, reg ); else printf("Unknown datasize in compact_set_one_ error\n"); } LIBXSMM_API_INLINE void compact_store_matrix_gen_ ( libxsmm_generated_code* io_code, unsigned int trans, unsigned int lda, unsigned int i, unsigned int j, unsigned int reg, unsigned int number, unsigned int datasize, char regset, unsigned int matrix_gpreg ) { int element; int offset; unsigned int i_vmove_instr; int i_instruction_set; if ( !trans ) element = number*(j-1)*lda + number*(i-1); else element = number*(i-1)*lda + number*(j-1); offset = element * datasize; if ( /*(reg < 0) ||*/ (reg >=32) ) { printf("compact_store_matrix_gen trying to store from an invalid register: %u\n",reg); exit(-1); } if ( datasize == 8 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPD; } else if ( datasize == 4 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPS; } else { fprintf(stderr,"compact_store_matrix_gen has strange datasize=%u\n",datasize); exit(-1); } if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_store_matrix1\n"); exit(-1); } libxsmm_x86_instruction_vec_move ( io_code, i_instruction_set, 
i_vmove_instr, matrix_gpreg, LIBXSMM_X86_GP_REG_UNDEF, 1, offset, regset, reg, 0, 0, 1 ); } LIBXSMM_API_INLINE void compact_store_matrix1_ ( libxsmm_generated_code* io_code, unsigned int lda, unsigned int i, unsigned int j, unsigned int reg, unsigned int number, unsigned int datasize, char regset ) { int element = number*(j-1)*lda + number*(i-1); int offset = element * datasize; unsigned int i_vmove_instr; int i_instruction_set; if ( datasize == 8 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPD; } else if ( datasize == 4 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPS; } else { fprintf(stderr,"compact_store_matrix1 has strange datasize=%u\n",datasize); exit(-1); } if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_store_matrix1\n"); exit(-1); } libxsmm_x86_instruction_vec_move ( io_code, i_instruction_set, i_vmove_instr, LIBXSMM_X86_GP_REG_RDI, LIBXSMM_X86_GP_REG_UNDEF, 1, offset, regset, reg, 0, 0, 1 ); } LIBXSMM_API_INLINE void compact_store_matrix2_ ( libxsmm_generated_code* io_code, unsigned int lda, unsigned int i, unsigned int j, unsigned int reg, unsigned int number, unsigned int datasize, char regset ) { int element = number*(j-1)*lda + number*(i-1); int offset = element * datasize; unsigned int i_vmove_instr; int i_instruction_set; if ( datasize == 8 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPD; } else if ( datasize == 4 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPS; } else { fprintf(stderr,"compact_store_matrix2 has strange datasize=%u\n",datasize); exit(-1); } if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_store_matrix2\n"); exit(-1); } libxsmm_x86_instruction_vec_move ( io_code, i_instruction_set, i_vmove_instr, LIBXSMM_X86_GP_REG_RSI, LIBXSMM_X86_GP_REG_UNDEF, 1, 
offset, regset, reg, 0, 0, 1 ); } LIBXSMM_API_INLINE void compact_store_matrix3_ ( libxsmm_generated_code* io_code, unsigned int lda, unsigned int i, unsigned int j, unsigned int reg, unsigned int number, unsigned int datasize, char regset ) { int element = number*(j-1)*lda + number*(i-1); int offset = element * datasize; unsigned int i_vmove_instr; int i_instruction_set; if ( datasize == 8 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPD; } else if ( datasize == 4 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPS; } else { fprintf(stderr,"compact_store_matrix3 has strange datasize=%u\n",datasize); exit(-1); } if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_store_matrix3\n"); exit(-1); } libxsmm_x86_instruction_vec_move ( io_code, i_instruction_set, i_vmove_instr, LIBXSMM_X86_GP_REG_RDX, LIBXSMM_X86_GP_REG_UNDEF, 1, offset, regset, reg, 0, 0, 1 ); } LIBXSMM_API_INLINE void compact_load_matrix1_ ( libxsmm_generated_code* io_code, unsigned int lda, unsigned int i, unsigned int j, unsigned int reg, unsigned int number, unsigned int datasize, char regset ) { int element = number*(j-1)*lda + number*(i-1); int offset = element * datasize; unsigned int i_vmove_instr; int i_instruction_set; if ( datasize == 8 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPD; } else if ( datasize == 4 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPS; } else { fprintf(stderr,"compact_load_matrix1 has strange datasize=%u\n",datasize); exit(-1); } if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_load_matrix1\n"); exit(-1); } libxsmm_x86_instruction_vec_move ( io_code, i_instruction_set, i_vmove_instr, LIBXSMM_X86_GP_REG_RDI, LIBXSMM_X86_GP_REG_UNDEF, 1, offset, regset, reg, 0, 0, 0 ); } LIBXSMM_API_INLINE void 
compact_load_matrix_gen_ ( libxsmm_generated_code* io_code, unsigned int trans, unsigned int lda, unsigned int i, unsigned int j, unsigned int reg, unsigned int number, unsigned int datasize, char regset, unsigned int matrix_gpreg ) { int element; int offset; unsigned int i_vmove_instr; int i_instruction_set; if ( /*(reg < 0) ||*/ (reg >=32) ) { printf("compact_load_matrix_gen trying to load to an invalid register: %u\n",reg); printf("lda=%u i=%u j=%u reg=%u number=%u datasize=%u regset=%c matrix_gpreg=%u\n",lda,i,j,reg,number,datasize,regset,matrix_gpreg); exit(-1); } if ( !trans ) element = number*(j-1)*lda + number*(i-1); else element = number*(i-1)*lda + number*(j-1); offset = element * datasize; if ( datasize == 8 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPD; } else if ( datasize == 4 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPS; } else { fprintf(stderr,"compact_load_matrix_gen has strange datasize=%u\n",datasize); exit(-1); } if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_load_matrix_gen\n"); exit(-1); } libxsmm_x86_instruction_vec_move ( io_code, i_instruction_set, i_vmove_instr, matrix_gpreg, LIBXSMM_X86_GP_REG_UNDEF, 1, offset, regset, reg, 0, 0, 0 ); } LIBXSMM_API_INLINE void compact_load_matrix2_ ( libxsmm_generated_code* io_code, unsigned int lda, unsigned int i, unsigned int j, unsigned int reg, unsigned int number, unsigned int datasize, char regset ) { int element = number*(j-1)*lda + number*(i-1); int offset = element * datasize; unsigned int i_vmove_instr; int i_instruction_set; if ( datasize == 8 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPD; } else if ( datasize == 4 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPS; } else { fprintf(stderr,"compact_load_matrix2 has strange datasize=%u\n",datasize); exit(-1); } if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { 
i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_load_matrix2\n"); exit(-1); } libxsmm_x86_instruction_vec_move ( io_code, i_instruction_set, i_vmove_instr, LIBXSMM_X86_GP_REG_RSI, LIBXSMM_X86_GP_REG_UNDEF, 1, offset, regset, reg, 0, 0, 0 ); } LIBXSMM_API_INLINE void compact_load_matrix3_ ( libxsmm_generated_code* io_code, unsigned int lda, unsigned int i, unsigned int j, unsigned int reg, unsigned int number, unsigned int datasize, char regset ) { int element = number*(j-1)*lda + number*(i-1); int offset = element * datasize; unsigned int i_vmove_instr; int i_instruction_set; if ( datasize == 8 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPD; } else if ( datasize == 4 ) { i_vmove_instr = LIBXSMM_X86_INSTR_VMOVUPS; } else { fprintf(stderr,"compact_load_matrix3 has strange datasize=%u\n",datasize); exit(-1); } if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_load_matrix3\n"); exit(-1); } libxsmm_x86_instruction_vec_move ( io_code, i_instruction_set, i_vmove_instr, LIBXSMM_X86_GP_REG_RDX, LIBXSMM_X86_GP_REG_UNDEF, 1, offset, regset, reg, 0, 0, 0 ); } LIBXSMM_API_INLINE void compact_mult_two_nums_ ( libxsmm_generated_code* io_code, unsigned int reg0, unsigned int reg1, unsigned int reg2, unsigned int number, char regset ) { unsigned int i_vmove_instr; int i_instruction_set; #if 0 int datasize = 0; LIBXSMM_UNUSED(datasize); #endif if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_mult_two_nums\n"); exit(-1); } if ( (number==4) && (regset=='y') ) { #if 0 datasize = 8; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VMULPD; } else if ( (number==8) && (regset=='z') ) { #if 0 datasize = 8; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VMULPD; } else 
if ( (number==8) && (regset=='y') ) { #if 0 datasize = 4; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VMULPS; } else if ( (number==16) && (regset=='z') ) { #if 0 datasize = 4; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VMULPS; } else { fprintf(stderr,"Unsupported combo of number and regset in compact_mult_two_nums\n"); exit(-1); } libxsmm_x86_instruction_vec_compute_reg ( io_code, i_instruction_set, i_vmove_instr, regset, reg1, reg0, reg2 ); } LIBXSMM_API_INLINE void compact_add_two_nums_ ( libxsmm_generated_code* io_code, unsigned int reg0, unsigned int reg1, unsigned int reg2, unsigned int number, char regset ) { unsigned int i_vmove_instr; int i_instruction_set; #if 0 int datasize = 0; LIBXSMM_UNUSED(datasize); #endif if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_mult_two_nums\n"); exit(-1); } if ( (number==4) && (regset=='y') ) { #if 0 datasize = 8; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VADDPD; } else if ( (number==8) && (regset=='z') ) { #if 0 datasize = 8; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VADDPD; } else if ( (number==8) && (regset=='y') ) { #if 0 datasize = 4; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VADDPS; } else if ( (number==16) && (regset=='z') ) { #if 0 datasize = 4; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VADDPS; } else { fprintf(stderr,"Unsupported combo of number and regset in compact_mult_two_nums\n"); exit(-1); } libxsmm_x86_instruction_vec_compute_reg ( io_code, i_instruction_set, i_vmove_instr, regset, reg1, reg0, reg2 ); } LIBXSMM_API_INLINE void compact_sub_two_nums_ ( libxsmm_generated_code* io_code, unsigned int reg0, unsigned int reg1, unsigned int reg2, unsigned int number, char regset ) { unsigned int i_vmove_instr; int i_instruction_set; #if 0 int datasize = 0; LIBXSMM_UNUSED(datasize); #endif if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { 
i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_mult_two_nums\n"); exit(-1); } if ( (number==4) && (regset=='y') ) { #if 0 datasize = 8; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VSUBPD; } else if ( (number==8) && (regset=='z') ) { #if 0 datasize = 8; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VSUBPD; } else if ( (number==8) && (regset=='y') ) { #if 0 datasize = 4; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VSUBPS; } else if ( (number==16) && (regset=='z') ) { #if 0 datasize = 4; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VSUBPS; } else { fprintf(stderr,"Unsupported combo of number and regset in compact_mult_two_nums\n"); exit(-1); } libxsmm_x86_instruction_vec_compute_reg ( io_code, i_instruction_set, i_vmove_instr, regset, reg1, reg0, reg2 ); } LIBXSMM_API_INLINE void compact_fms_cminusab_( libxsmm_generated_code* io_code, unsigned int reg0, unsigned int reg1, unsigned int reg2, unsigned int number, char regset ) { unsigned int i_vmove_instr; int i_instruction_set; #if 0 int datasize = 0; LIBXSMM_UNUSED(datasize); #endif if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_fms_cminusab\n"); exit(-1); } if ( (number==4) && (regset=='y') ) { #if 0 datasize = 8; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VFNMADD231PD; } else if ( (number==8) && (regset=='z') ) { #if 0 datasize = 8; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VFNMADD231PD; } else if ( (number==8) && (regset=='y') ) { #if 0 datasize = 4; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VFNMADD231PS; } else if ( (number==16) && (regset=='z') ) { #if 0 datasize = 4; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VFNMADD231PS; } else { fprintf(stderr,"Unsupported combo of number and regset in compact_fms_cminusab\n"); exit(-1); } libxsmm_x86_instruction_vec_compute_reg ( io_code, i_instruction_set, i_vmove_instr, regset, reg1, reg2, reg0 
); } LIBXSMM_API_INLINE void compact_fma_cplusab_( libxsmm_generated_code* io_code, unsigned int reg0, unsigned int reg1, unsigned int reg2, unsigned int number, char regset ) { unsigned int i_vmove_instr; int i_instruction_set; #if 0 int datasize = 0; LIBXSMM_UNUSED(datasize); #endif if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_fms_cminusab\n"); exit(-1); } if ( (number==4) && (regset=='y') ) { #if 0 datasize = 8; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VFMADD231PD; } else if ( (number==8) && (regset=='z') ) { #if 0 datasize = 8; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VFMADD231PD; } else if ( (number==8) && (regset=='y') ) { #if 0 datasize = 4; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VFMADD231PS; } else if ( (number==16) && (regset=='z') ) { #if 0 datasize = 4; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VFMADD231PS; } else { fprintf(stderr,"Unsupported combo of number and regset in compact_fms_cminusab\n"); exit(-1); } libxsmm_x86_instruction_vec_compute_reg ( io_code, i_instruction_set, i_vmove_instr, regset, reg1, reg2, reg0 ); } LIBXSMM_API_INLINE void compact_divide_two_nums_ ( libxsmm_generated_code* io_code, unsigned int reg0, unsigned int reg1, unsigned int reg2, unsigned int number, char regset ) { unsigned int i_vmove_instr; int i_instruction_set; #if 0 int datasize = 0; LIBXSMM_UNUSED(datasize); #endif if ( regset == 'z' ) { i_instruction_set = LIBXSMM_X86_AVX512; } else if ( regset == 'y' ) { i_instruction_set = LIBXSMM_X86_AVX2; } else { fprintf(stderr,"Unsupported instruction set in compact_divide_two_nums\n"); exit(-1); } if ( (number==4) && (regset=='y') ) { #if 0 datasize = 8; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VDIVPD; } else if ( (number==8) && (regset=='z') ) { #if 0 datasize = 8; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VDIVPD; } else if ( (number==8) && (regset=='y') ) { #if 0 datasize = 
4; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VDIVPS; } else if ( (number==16) && (regset=='z') ) { #if 0 datasize = 4; #endif i_vmove_instr = LIBXSMM_X86_INSTR_VDIVPS; } else { fprintf(stderr,"Unsupported combo of number and regset in compact_divide_two_nums\n"); exit(-1); } libxsmm_x86_instruction_vec_compute_reg ( io_code, i_instruction_set, i_vmove_instr, regset, reg1, reg0, reg2 ); } #endif /*GENERATOR_PACKED_AUX_H*/ libxsmm-1.17/src/generator_packed_gemm_ac_rm_avx_avx2_avx512.c000066400000000000000000001065151415223013700244450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "generator_packed_gemm_ac_rm_avx_avx2_avx512.h" #include "generator_gemm_common.h" #include "generator_x86_instructions.h" #include "libxsmm_main.h" LIBXSMM_API void libxsmm_generator_packed_gemm_ac_rm( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width, const char* i_arch ) { if ( strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0 || strcmp(i_arch, "skx") == 0 || strcmp(i_arch, "clx") == 0 || strcmp(i_arch, "cpx") == 0 || strcmp(i_arch, "hsw") == 0 || strcmp(i_arch, "snb") == 0 ) { if ( strcmp(i_arch, "snb") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX; } else if ( strcmp(i_arch, "hsw") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX2; } else if ( strcmp(i_arch, "knl") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_MIC; } else if ( strcmp(i_arch, "knm") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_KNM; } else if ( strcmp(i_arch, "skx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CORE; } else if ( strcmp(i_arch, "clx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CLX; } else if ( strcmp(i_arch, "cpx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CPX; } else { /* cannot happen */ } libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512( io_generated_code, i_xgemm_desc, i_packed_width ); } else { fprintf( stderr, "RM AC SOA is only available for AVX/AVX2/AVX512 at this point\n" ); exit(-1); } } LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width ) { unsigned int l_max_reg_block = 0; unsigned int l_n1_range = 0; unsigned int l_n2_range = 0; unsigned int l_n1_block = 0; unsigned int l_n2_block = 0; libxsmm_micro_kernel_config l_micro_kernel_config; libxsmm_loop_label_tracker l_loop_label_tracker; 
libxsmm_gp_reg_mapping l_gp_reg_mapping; /* select accumulator blocking */ /* @TODO we could do more agressive blockings if needed */ if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_max_reg_block = 28; } else { l_max_reg_block = 13; } /* define gp register mapping */ libxsmm_reset_x86_gp_reg_mapping( &l_gp_reg_mapping ); /* matching calling convention on Linux */ #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_R8; /* TODO: full support for Windows calling convention */ l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_UNDEF; #else /* match calling convention on Linux */ l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8; #endif /* l_gp_reg_mapping.gp_reg_c_prefetch = LIBXSMM_X86_GP_REG_R9;*/ l_gp_reg_mapping.gp_reg_mloop = LIBXSMM_X86_GP_REG_R12; l_gp_reg_mapping.gp_reg_nloop = LIBXSMM_X86_GP_REG_R13; l_gp_reg_mapping.gp_reg_kloop = LIBXSMM_X86_GP_REG_R14; l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_R10; /* this is the SIMD packed register loop */ l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_R11; /* this is the gp register to load the AVX512 mask, if needed */ l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF; /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* define the micro kernel code gen properties */ 
libxsmm_generator_gemm_init_micro_kernel_config_fullvector( &l_micro_kernel_config, io_generated_code->arch, i_xgemm_desc, 0 ); /* calculate the chunk size of current columns to work on */ if ( libxsmm_compute_equalized_blocking( i_xgemm_desc->n, l_max_reg_block, &l_n1_range, &l_n1_block, &l_n2_range, &l_n2_block ) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK ); return; } /* open asm */ libxsmm_x86_instruction_open_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch ); /* m loop */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_mloop, 0 ); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_mloop, 1 ); /* loop over n-blocks */ if ( l_n1_block == i_xgemm_desc->n ) { /* no N loop at all */ libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_packed_width, i_xgemm_desc->n ); } else if ( (l_n1_range > 0) && (l_n2_range > 0) ) { /* reset n loop */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_nloop, 0 ); /* we have two ranges */ /* first range */ libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n1_block ); libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_packed_width, l_n1_block ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n1_range ); 
libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker ); /* second range */ libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n2_block ); libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_packed_width, l_n2_block ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_nloop, i_xgemm_desc->n ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker ); /* reset B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_b, i_xgemm_desc->n * l_micro_kernel_config.datatype_size ); /* reset C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_c, i_xgemm_desc->n * i_packed_width * l_micro_kernel_config.datatype_size ); } else if ( (l_n1_range > 0) && (l_n2_range == 0) ) { /* reset n loop */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_nloop, 0 ); /* we have one range */ libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n1_block ); libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_packed_width, l_n1_block ); libxsmm_x86_instruction_alu_imm( io_generated_code, 
l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_nloop, i_xgemm_desc->n ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker ); /* reset B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_b, i_xgemm_desc->n * l_micro_kernel_config.datatype_size ); /* reset C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_c, i_xgemm_desc->n * i_packed_width * l_micro_kernel_config.datatype_size ); } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK ); return; } /* advance A pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_a, l_micro_kernel_config.datatype_size*i_packed_width*i_xgemm_desc->lda); /* advance C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_c, l_micro_kernel_config.datatype_size*i_packed_width*i_xgemm_desc->ldc); /* close m loop */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_mloop, i_xgemm_desc->m ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker ); /* close asm */ libxsmm_x86_instruction_close_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch ); } LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width, const unsigned int i_n_blocking ) { /* calculate how many 
iterations we need */ unsigned int l_simd_packed_remainder = 0; unsigned int l_simd_packed_iters = 0; unsigned int l_simd_packed_width = 0; /* select simd packing width and accumulator blocking */ if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_simd_packed_width = 8; } else { l_simd_packed_width = 4; } } else { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_simd_packed_width = 16; } else { l_simd_packed_width = 8; } } l_simd_packed_remainder = i_packed_width % l_simd_packed_width; l_simd_packed_iters = i_packed_width/l_simd_packed_width; /* check if we have a single SIMD devisor */ if ( l_simd_packed_width == i_packed_width ) { /* run inner compute kernel */ libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop_simd_packed( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_packed_width, l_simd_packed_width, l_simd_packed_width, i_n_blocking ); /* check if we have a perfect SIMD devisor */ } else if ( l_simd_packed_remainder == 0 ) { /* initilize packed loop */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_help_0, 0 ); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_help_0, l_simd_packed_width ); /* run inner compute kernel */ libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop_simd_packed( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_packed_width, l_simd_packed_width, l_simd_packed_width, i_n_blocking ); /* advance A pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, 
i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* advance C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c, l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* jump back to pack loop label */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_help_0, i_packed_width ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); /* reset A pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_a, l_simd_packed_iters * l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* reset C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_c, l_simd_packed_iters * l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* we need masking and have less than SIMD width */ } else if ( l_simd_packed_width > i_packed_width ) { /* run remainder compute kernel */ libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop_simd_packed( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_packed_width, l_simd_packed_width, l_simd_packed_remainder, i_n_blocking ); /* we need the general case */ } else { /* initilize packed loop */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_help_0, 0 ); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_help_0, l_simd_packed_width ); /* run inner compute kernel */ 
libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop_simd_packed( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_packed_width, l_simd_packed_width, l_simd_packed_width, i_n_blocking ); /* advance A pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* advance C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c, l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* jump back to pack loop label */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_help_0, i_packed_width - l_simd_packed_remainder ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); /* run remainder compute kernel */ libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop_simd_packed( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_packed_width, l_simd_packed_width, l_simd_packed_remainder, i_n_blocking ); /* reset A pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_a, l_simd_packed_iters * l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* reset C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_c, l_simd_packed_iters * l_simd_packed_width * i_micro_kernel_config->datatype_size ); } /* advance B and C pointers if N is bigger than our register blocking */ if ( i_xgemm_desc->n != i_n_blocking ) { /* advance B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, 
i_gp_reg_mapping->gp_reg_b, i_n_blocking * i_micro_kernel_config->datatype_size ); /* advance C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c, i_n_blocking * i_packed_width * i_micro_kernel_config->datatype_size ); } } LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop_simd_packed( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width, const unsigned int i_simd_packed_width, const unsigned int i_simd_packed_valid, const unsigned int i_n_blocking ) { unsigned int l_n = 0; unsigned int l_lcl_k = 0; unsigned int l_use_masking = 0; unsigned int l_avx_ac_move_instr = 0; /* check if we need to compute a mask */ if ( i_simd_packed_width > i_simd_packed_valid ) { /* on AVX512 we can use mask registers */ if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_generator_gemm_initialize_avx512_mask( io_generated_code, i_gp_reg_mapping->gp_reg_help_1, i_xgemm_desc, i_simd_packed_width - i_simd_packed_valid ); } else { char l_id = (char)i_n_blocking; unsigned char l_data[32]; unsigned int l_count; if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { unsigned long long* l_i64_ptr = (unsigned long long*)l_data; for ( l_count = 0; l_count < 4; ++l_count ) { if ( l_count < i_simd_packed_valid ) { l_i64_ptr[l_count] = 0xffffffffffffffff; } else { l_i64_ptr[l_count] = 0x0; } } l_avx_ac_move_instr = LIBXSMM_X86_INSTR_VMASKMOVPD; } else { unsigned int* l_i32_ptr = (unsigned int*)l_data; for ( l_count = 0; l_count < 8; ++l_count ) { if ( l_count < i_simd_packed_valid ) { l_i32_ptr[l_count] = 0xffffffff; } else { l_i32_ptr[l_count] = 0x0; } } 
l_avx_ac_move_instr = LIBXSMM_X86_INSTR_VMASKMOVPS; } libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, l_data, &l_id, 'y', 15 ); } l_use_masking = LIBXSMM_X86_AVX512_MASK; } /* load C accumulator */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vxor_instruction, i_micro_kernel_config->vector_name, l_n, l_n, l_n ); } else { /* in case of masking we need to distinguish between AVX/AVX2 and AVX512 */ if ( l_use_masking ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, l_use_masking, 1, 0 ); } else { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_ac_move_instr, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 15, 0); } } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 0, 1, 0 ); } } } /* k loop */ libxsmm_generator_gemm_header_kloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, 0, 1 ); /* full vector load of A */ /* @TODO: prepare KNM's QMADD */ for ( l_lcl_k = 0; l_lcl_k < 1; l_lcl_k++ ) { /* in case of masking we need to distinguish between AVX/AVX2 and AVX512 */ if ( l_use_masking ) { if ( ( 
io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, l_lcl_k*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_n_blocking+l_lcl_k, l_use_masking, 1, 0 ); } else { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_ac_move_instr, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, l_lcl_k*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_n_blocking+l_lcl_k, 15, 0); } } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, l_lcl_k*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_n_blocking+l_lcl_k, 0, 1, 0 ); } } /* loop over the register block */ for ( l_n = 0; l_n < i_n_blocking; ++l_n ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_x86_instruction_vec_compute_mem( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, 1, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_n_blocking, l_n ); } else if ( io_generated_code->arch == LIBXSMM_X86_AVX2 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_n_blocking+1, 0, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, 
i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_n_blocking, i_n_blocking+1, l_n ); } else if ( io_generated_code->arch == LIBXSMM_X86_AVX ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_n_blocking+1, 0, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_n_blocking, i_n_blocking+1, i_n_blocking+1 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, i_n_blocking+1, l_n, l_n ); } else { /* cannot happen */ } } /* advance A pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, i_packed_width * i_micro_kernel_config->datatype_size ); /* advance B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size ); /* close k loop */ libxsmm_generator_gemm_footer_kloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, 0, i_xgemm_desc->k, 0 ); /* store C accumulator */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* in case of masking we need to distinguish between AVX/AVX2 and AVX512 */ if ( l_use_masking ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, 
i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, l_use_masking, 0, 1 ); } else { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_ac_move_instr, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 15, 1); } } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 0, 0, 1 ); } } /* reset A pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_a, i_xgemm_desc->k * i_packed_width * i_micro_kernel_config->datatype_size ); /* reset B Pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_b, i_xgemm_desc->k * i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size ); } libxsmm-1.17/src/generator_packed_gemm_ac_rm_avx_avx2_avx512.h000066400000000000000000000071201415223013700244420ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_PACKED_GEMM_AC_RM_AVX_AVX2_AVX512_H #define GENERATOR_PACKED_GEMM_AC_RM_AVX_AVX2_AVX512_H #include #include "generator_common.h" LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width ); LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width, const unsigned int i_n_blocking ); LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_ac_rm_avx_avx2_avx512_kloop_simd_packed( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width, const unsigned int i_simd_packed_width, const unsigned int i_simd_packed_valid, const unsigned int i_n_blocking ); #endif /* GENERATOR_PACKED_GEMM_AC_RM_AVX_AVX2_AVX512_H */ libxsmm-1.17/src/generator_packed_gemm_avx_avx512.c000066400000000000000000000235431415223013700223430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry, Hans Pabst, Timothy Costa (Intel Corp.) ******************************************************************************/ #include "generator_packed_gemm_avx_avx512.h" #include "generator_x86_instructions.h" #include "generator_packed_aux.h" #include "generator_packed_gemmnn.h" #include "generator_common.h" #include "libxsmm_main.h" #if 0 # define GENERATOR_PACKED_GEMM_DEBUG #endif /* TODO: Remove the extra garbage parameters from this calling sequence: */ #define GARBAGE_PARAMETERS LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_avx_avx512_kernel( libxsmm_generated_code* io_code, const libxsmm_pgemm_descriptor* i_packed_pgemm_desc, const char* i_arch #ifdef GARBAGE_PARAMETERS , unsigned int iunroll, unsigned int junroll, unsigned int loopi, unsigned int loopj #endif ) { unsigned char *const buf = (unsigned char *) io_code->generated_code; libxsmm_loop_label_tracker l_loop_label_tracker /*= { 0 }*/; /* avx512 just represents whether we want to use zmm registers or not * * A value of 0 says not, a value of 1 targets AVX512_CORE, a value * * of 2 targets AVX512_MIC */ int avx512; #if 0 /* TOD: introduce/use register mapping rather than directly/hard-coding registers */ /* Just reuse transpose gp mapping */ libxsmm_gemm_gp_reg_mapping l_gp_reg_mapping = { 0/*avoid warning "maybe used uninitialized" */ }; /* define gp register mapping */ #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_lda = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_ldb = LIBXSMM_X86_GP_REG_R9; l_gp_reg_mapping.gp_reg_m_loop = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_n_loop = LIBXSMM_X86_GP_REG_RSI; #else /* match calling convention on Linux */ 
l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_lda = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_ldb = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_m_loop = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_n_loop = LIBXSMM_X86_GP_REG_R9; #endif l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF; /* Actually, the logic is this: we need a, lda, and b. We don't need ldb * * If n>=6, we need rbx * * If n>=8, we need rbp * * If LIBXSMM_MIN(n,REGSIZE)>=5 and m%REGSIZE==1, we need r12 * * If LIBXSMM_MIN(n,REGSIZE)>=6 and m%REGSIZE==1, we need r13 * * If LIBXSMM_MIN(n,REGSIZE)>=7 and m%REGSIZE==1, we need r14 * * If LIBXSMM_MIN(n,REGSIZE)>=8 and m%REGSIZE==1, we need r15 * * Otherwise, we get by with registers that don't require pushing/popping */ #endif /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* define transposition kernel config */ if (strcmp(i_arch, "skx") == 0) { avx512 = 1; } else if (strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0) { avx512 = 2; } else if (strcmp(i_arch, "snb") == 0 || strcmp(i_arch, "hsw") == 0) { avx512 = 0; } else { LIBXSMM_HANDLE_ERROR( io_code, LIBXSMM_ERR_UNSUP_ARCH ); return; } /* @Greg add more fields here */ /* @Greg add generator code here, please use functions defined in generator_x86_instructions.h */ /* Todo-> I first want this code to work, and verify it works, then I can * convert one instruction at a time to those in * generator_x86_instructions.h. 
Or add to the existing instructions */ if ( io_code->code_type > 1 ) { unsigned int i = io_code->code_size; unsigned int m = i_packed_pgemm_desc->m; unsigned int n = i_packed_pgemm_desc->n; unsigned int k = i_packed_pgemm_desc->k; unsigned int lda = i_packed_pgemm_desc->lda; unsigned int ldb = i_packed_pgemm_desc->ldb; unsigned int ldc = i_packed_pgemm_desc->ldc; char transa = i_packed_pgemm_desc->transa; char transb = i_packed_pgemm_desc->transb; unsigned layout = (unsigned int) i_packed_pgemm_desc->layout; unsigned int datasz = (unsigned int)i_packed_pgemm_desc->typesize; #if 0 const double alpha = (8 == datasz ? i_packed_pgemm_desc->alpha.d : ((double)i_packed_pgemm_desc->alpha.s)); #else double alpha=1.0; #endif #if defined(_WIN32) || defined(__CYGWIN__) unsigned int areg = LIBXSMM_X86_GP_REG_RCX; unsigned int breg = LIBXSMM_X86_GP_REG_RDX; unsigned int creg = LIBXSMM_X86_GP_REG_R8; #else unsigned int areg = LIBXSMM_X86_GP_REG_RDI; unsigned int breg = LIBXSMM_X86_GP_REG_RSI; unsigned int creg = LIBXSMM_X86_GP_REG_RDX; #endif const double beta = 1.0; unsigned int m1=m, n1=n, k1=k; unsigned int j; /*int REGSIZE;*/ int numb = 0; /*int scalealpha = 0;*/ /*int nounit=0;*/ int tra, trb, trc; char regset = 0; if ( i_packed_pgemm_desc->alpha_val == 0 ) { alpha = 1.0; } else if ( i_packed_pgemm_desc->alpha_val == 1 ) { alpha = -1.0; } else { printf("Warning: libxsmm_generator_packed_gemm_avx_avx512 has unknown alpha, using 1.0\n"); } #if defined(GENERATOR_PACKED_GEMM_DEBUG) printf("Inside libxsmm_generator_packed_gemm_avx_avx512_kernel: transa=%c transb=%c m=%d n=%d k=%d lda=%d ldb=%d ldc=%d alpha=%g beta=%g datasz=%d avx512=%d lay=%d\n",transa,transb,m,n,k,lda,ldb,ldc,alpha,beta,datasz,avx512,layout); printf("Extra parameters: iunroll=%d junroll=%d loopi=%d loopj=%d\n",iunroll,junroll,loopi,loopj); #endif if ( ( datasz !=4 ) && (datasz != 8) ) { fprintf(stderr,"Expecting a datasize of 4 or 8 but got %u\n",datasz); exit(-1); } if ( avx512 < 0 || avx512 > 2 ) { 
fprintf(stderr,"Expecting an integer between 0 and 2 for avx512, got %i\n",avx512); exit(-1); } if ( datasz == 4 && avx512 == 0 ) { numb = 8; regset = 'y'; } else if ( datasz == 8 && avx512 == 0 ) { numb = 4; regset = 'y'; } else if ( datasz == 4 && avx512 > 0 ) { numb = 16; regset = 'z'; } else if ( datasz == 8 && avx512 > 0 ) { numb = 8; regset = 'z'; } if ( LIBXSMM_FEQ(0, alpha) ) { compact_set_zero_ ( io_code, 0, numb, datasz, regset ); for ( j = 1; j <= n1; j++ ) { for ( i = 1; i <= m1; i++ ) { compact_store_matrix1_ ( io_code, lda, i, j, 0, numb, datasz, regset ); } } i = io_code->code_size; buf[i++] = 0xc3; /* retq */ io_code->code_size = i; return; } #if 0 if ( LIBXSMM_NEQ(1, alpha) ) { compact_load_parameter_ ( io_code, alpha, 2, numb, regset ); } nounit = ( (diag=='N') || (diag=='n') ); #endif if ( transa == 'T' || transa == 't' ) tra = 1; else tra = 0; if ( transb == 'T' || transb == 't' ) trb = 1; else trb = 0; trc = 0; if ( layout == 101 ) { /* Row-major swaps tra/trb/trc */ if ( tra ) tra = 0; else tra = 1; if ( trb ) trb = 0; else trb = 1; #if !defined(NDEBUG) /* TODO: code protected by !defined(NDEBUG) is logically dead */ LIBXSMM_ASSERT(0 == trc); /* coverity[dead_error_line] */ if ( trc ) trc = 0; else #endif trc = 1; } /* Change which registers to use for windows builds */ #if defined(GENERATOR_PACKED_GEMM_DEBUG) printf("Using compact_gemmnn header file\n"); #endif compact_gemmnn_ ( tra, trb, trc, 1, m1, 1, k1, 1, k1, 1, n1, 1, m1, 1, n1, alpha, areg, lda, breg, ldb, beta, creg, ldc, io_code, numb, regset, iunroll, junroll, loopi, loopj ); #if defined(GENERATOR_PACKED_GEMM_DEBUG) printf("Done using compact_gemmnn header file\n"); #endif } { int i = io_code->code_size; buf[i++] = 0xc3; /* retq */ io_code->code_size = i; } /* close asm: note that we really didn't need to push everything */ /* libxsmm_x86_instruction_close_stream_transpose( io_code, i_arch ); */ #if 0 #define DEBUG_GIVE_BYTE_CODE_OUTPUT #endif #ifdef DEBUG_GIVE_BYTE_CODE_OUTPUT buf 
= (unsigned char *) io_code->generated_code; printf("#Final Routine: \n"); for ( i = 0; i < io_code->code_size; i+=8 ) { printf("#\tBytes %d-%d\n",i,i+7); printf(".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",buf[i],buf[i+1],buf[i+2],buf[i+3],buf[i+4],buf[i+5],buf[i+6],buf[i+7]); } #endif } libxsmm-1.17/src/generator_packed_gemm_avx_avx512.h000066400000000000000000000032771415223013700223520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry, Timothy Costa (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_PACKED_GEMM_AVX_AVX512_H #define GENERATOR_PACKED_GEMM_AVX_AVX512_H #include "generator_common.h" #define GARBAGE_PARAMETERS LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_avx_avx512_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_pgemm_descriptor* i_packed_pgemm_desc, const char* i_arch #ifdef GARBAGE_PARAMETERS , unsigned int iunroll, unsigned int junroll, unsigned int loopi, unsigned int loopj #endif ); #endif /*GENERATOR_PACKED_GEMM_AVX_AVX512_H*/ libxsmm-1.17/src/generator_packed_gemm_bc_rm_avx_avx2_avx512.c000066400000000000000000001113161415223013700244410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "generator_packed_gemm_bc_rm_avx_avx2_avx512.h" #include "generator_gemm_common.h" #include "generator_x86_instructions.h" #include "libxsmm_main.h" LIBXSMM_API void libxsmm_generator_packed_gemm_bc_rm( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width, const char* i_arch ) { if ( strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0 || strcmp(i_arch, "skx") == 0 || strcmp(i_arch, "clx") == 0 || strcmp(i_arch, "cpx") == 0 || strcmp(i_arch, "hsw") == 0 || strcmp(i_arch, "snb") == 0 ) { if ( strcmp(i_arch, "snb") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX; } else if ( strcmp(i_arch, "hsw") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX2; } else if ( strcmp(i_arch, "knl") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_MIC; } else if ( strcmp(i_arch, "knm") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_KNM; } else if ( strcmp(i_arch, "skx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CORE; } else if ( strcmp(i_arch, "clx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CLX; } else if ( strcmp(i_arch, "cpx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CPX; } else { /* cannot happen */ } libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512( io_generated_code, i_xgemm_desc, i_packed_width ); } else { fprintf( stderr, "RM AC SOA is only available for AVX/AVX2/AVX512 at this point\n" ); exit(-1); } } LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width ) { unsigned int l_max_reg_block = 0; unsigned int 
l_n1_range = 0; unsigned int l_n2_range = 0; unsigned int l_n1_block = 0; unsigned int l_n2_block = 0; libxsmm_micro_kernel_config l_micro_kernel_config; libxsmm_loop_label_tracker l_loop_label_tracker; libxsmm_gp_reg_mapping l_gp_reg_mapping; /* select accumulator blocking */ /* @TODO we could do more agressive blocking if needed */ if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_max_reg_block = 28; } else { l_max_reg_block = 13; } /* define gp register mapping */ libxsmm_reset_x86_gp_reg_mapping( &l_gp_reg_mapping ); /* matching calling convention on Linux */ #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_R8; /* TODO: full support for Windows calling convention */ l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_UNDEF; #else /* match calling convention on Linux */ l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8; #endif /* l_gp_reg_mapping.gp_reg_c_prefetch = LIBXSMM_X86_GP_REG_UNDEF;*/ l_gp_reg_mapping.gp_reg_mloop = LIBXSMM_X86_GP_REG_R12; l_gp_reg_mapping.gp_reg_nloop = LIBXSMM_X86_GP_REG_R13; l_gp_reg_mapping.gp_reg_kloop = LIBXSMM_X86_GP_REG_R14; l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_R10; /* this is the SIMD packed register loop */ l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_R11; /* this is the gp register to load the AVX512, if needed */ l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF; 
l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF; /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* define the micro kernel code gen properties */ libxsmm_generator_gemm_init_micro_kernel_config_fullvector( &l_micro_kernel_config, io_generated_code->arch, i_xgemm_desc, 0 ); /* for this kernel we need to overwrite the A and B load instructions */ if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { l_micro_kernel_config.a_vmove_instruction = LIBXSMM_X86_INSTR_VBROADCASTSD; l_micro_kernel_config.b_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPD; } else { l_micro_kernel_config.a_vmove_instruction = LIBXSMM_X86_INSTR_VBROADCASTSS; l_micro_kernel_config.b_vmove_instruction = LIBXSMM_X86_INSTR_VMOVUPS; } /* calculate the chunk size of current columns to work on */ if ( libxsmm_compute_equalized_blocking( i_xgemm_desc->n, l_max_reg_block, &l_n1_range, &l_n1_block, &l_n2_range, &l_n2_block ) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK ); return; } /* open asm */ libxsmm_x86_instruction_open_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch ); /* m loop */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_mloop, 0 ); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_mloop, 1 ); /* loop over n-blocks */ if ( l_n1_block == i_xgemm_desc->n ) { /* no N loop at all */ libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_packed_width, i_xgemm_desc->n ); } else if ( (l_n1_range > 0) && (l_n2_range > 0) ) { /* reset n loop */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_mov_instruction, 
l_gp_reg_mapping.gp_reg_nloop, 0 ); /* we have two ranges */ /* first range */ libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n1_block ); libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_packed_width, l_n1_block ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n1_range ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker ); /* second range */ libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n2_block ); libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_packed_width, l_n2_block ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_nloop, i_xgemm_desc->n ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker ); /* reset B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_b, i_xgemm_desc->n * i_packed_width * l_micro_kernel_config.datatype_size ); /* reset C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_c, i_xgemm_desc->n * i_packed_width * l_micro_kernel_config.datatype_size ); } else if ( (l_n1_range > 0) && (l_n2_range == 0) ) { /* reset n loop */ 
libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_nloop, 0 ); /* we have one range */ libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n1_block ); libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_packed_width, l_n1_block ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_nloop, i_xgemm_desc->n ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker ); /* reset B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_b, i_xgemm_desc->n * i_packed_width * l_micro_kernel_config.datatype_size ); /* reset C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_c, i_xgemm_desc->n * i_packed_width * l_micro_kernel_config.datatype_size ); } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK ); return; } /* advance A pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_a, l_micro_kernel_config.datatype_size*i_xgemm_desc->lda); /* advance C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_c, l_micro_kernel_config.datatype_size*i_packed_width*i_xgemm_desc->ldc); /* close m loop */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_mloop, i_xgemm_desc->m ); libxsmm_x86_instruction_jump_back_to_label( 
io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker ); /* close asm */ libxsmm_x86_instruction_close_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch ); } LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width, const unsigned int i_n_blocking ) { /* calculate how many iterations we need */ unsigned int l_simd_packed_remainder = 0; unsigned int l_simd_packed_iters = 0; unsigned int l_simd_packed_width = 0; /* select simd packing width */ if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_simd_packed_width = 8; } else { l_simd_packed_width = 4; } } else { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_simd_packed_width = 16; } else { l_simd_packed_width = 8; } } l_simd_packed_remainder = i_packed_width % l_simd_packed_width; l_simd_packed_iters = i_packed_width/l_simd_packed_width; /* check if we have a single SIMD devisor */ if ( l_simd_packed_width == i_packed_width ) { /* run inner compute kernel */ libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop_simd_packed( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_packed_width, l_simd_packed_width, l_simd_packed_width, i_n_blocking ); /* check if we have a perfect SIMD devisor */ } else if ( l_simd_packed_remainder == 0 ) { /* initilize packed loop */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_help_0, 0 ); 
libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_help_0, l_simd_packed_width ); /* run inner compute kernel */ libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop_simd_packed( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_packed_width, l_simd_packed_width, l_simd_packed_width, i_n_blocking ); /* advance B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* advance pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c, l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* jump back to pack loop label */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_help_0, i_packed_width ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); /* reset B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_b, l_simd_packed_iters * l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* reset C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_c, l_simd_packed_iters * l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* we need masking and have less than SIMD width */ } else if ( l_simd_packed_width > i_packed_width ) { /* run remainder compute kernel */ libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop_simd_packed( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, 
i_micro_kernel_config, i_xgemm_desc, i_packed_width, l_simd_packed_width, l_simd_packed_remainder, i_n_blocking ); /* we need the general case */ } else { /* initilize packed loop */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_help_0, 0 ); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_help_0, l_simd_packed_width ); /* run inner compute kernel */ libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop_simd_packed( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_packed_width, l_simd_packed_width, l_simd_packed_width, i_n_blocking ); /* advance B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* advance C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c, l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* jump back to pack loop label */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_help_0, i_packed_width - l_simd_packed_remainder ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); /* run remainder compute kernel */ libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop_simd_packed( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, i_packed_width, l_simd_packed_width, l_simd_packed_remainder, i_n_blocking ); /* reset B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, 
i_gp_reg_mapping->gp_reg_b, l_simd_packed_iters * l_simd_packed_width * i_micro_kernel_config->datatype_size ); /* reset C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_c, l_simd_packed_iters * l_simd_packed_width * i_micro_kernel_config->datatype_size ); } /* advance B and C pointers if N is bigger than our register blocking */ if ( i_xgemm_desc->n != i_n_blocking ) { /* advance B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_n_blocking * i_packed_width * i_micro_kernel_config->datatype_size ); /* advance C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c, i_n_blocking * i_packed_width * i_micro_kernel_config->datatype_size ); } } LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop_simd_packed( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width, const unsigned int i_simd_packed_width, const unsigned int i_simd_packed_valid, const unsigned int i_n_blocking ) { unsigned int l_n = 0; unsigned int l_use_masking = 0; unsigned int l_avx_ac_move_instr = 0; /* check if we need to compute a mask */ if ( i_simd_packed_width > i_simd_packed_valid ) { /* on AVX512 we can use mask registers */ if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_generator_gemm_initialize_avx512_mask( io_generated_code, i_gp_reg_mapping->gp_reg_help_1, i_xgemm_desc, i_simd_packed_width - i_simd_packed_valid ); } else { char l_id = (char)i_n_blocking; unsigned char l_data[32]; unsigned int l_count; if ( 
LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { unsigned long long* l_i64_ptr = (unsigned long long*)l_data; for ( l_count = 0; l_count < 4; ++l_count ) { if ( l_count < i_simd_packed_valid ) { l_i64_ptr[l_count] = 0xffffffffffffffff; } else { l_i64_ptr[l_count] = 0x0; } } l_avx_ac_move_instr = LIBXSMM_X86_INSTR_VMASKMOVPD; } else { unsigned int* l_i32_ptr = (unsigned int*)l_data; for ( l_count = 0; l_count < 8; ++l_count ) { if ( l_count < i_simd_packed_valid ) { l_i32_ptr[l_count] = 0xffffffff; } else { l_i32_ptr[l_count] = 0x0; } } l_avx_ac_move_instr = LIBXSMM_X86_INSTR_VMASKMOVPS; } libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, l_data, &l_id, 'y', 15 ); } l_use_masking = LIBXSMM_X86_AVX512_MASK; } /* load C accumulator */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vxor_instruction, i_micro_kernel_config->vector_name, l_n, l_n, l_n ); } else { /* in case of masking we need to distinguish between AVX/AVX2 and AVX512 */ if ( l_use_masking ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, l_use_masking, 1, 0 ); } else { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_ac_move_instr, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 15, 0); } } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, 
i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 0, 1, 0 ); } } } /* k loop */ libxsmm_generator_gemm_header_kloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, 0, 1 ); /* broadcast of A */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, i_n_blocking, 0, 1, 0 ); /* loop over the register block */ for ( l_n = 0; l_n < i_n_blocking; ++l_n ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { if ( i_simd_packed_width > i_simd_packed_valid ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n * i_packed_width * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_n_blocking+1, l_use_masking, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_n_blocking, i_n_blocking+1, l_n ); } else { libxsmm_x86_instruction_vec_compute_mem( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, 0, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n * i_packed_width * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_n_blocking, l_n ); } } else if ( io_generated_code->arch == LIBXSMM_X86_AVX2 ) { if ( i_simd_packed_width > i_simd_packed_valid ) { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_ac_move_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 
0, l_n * i_packed_width * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_n_blocking+1, 15, 0); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n * i_packed_width * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_n_blocking+1, 0, 1, 0 ); } libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_n_blocking, i_n_blocking+1, l_n ); } else if ( io_generated_code->arch == LIBXSMM_X86_AVX ) { if ( i_simd_packed_width > i_simd_packed_valid ) { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_ac_move_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n * i_packed_width * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_n_blocking+1, 15, 0); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n * i_packed_width * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_n_blocking+1, 0, 1, 0 ); } libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_n_blocking, i_n_blocking+1, i_n_blocking+1 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, i_n_blocking+1, l_n, l_n ); } else { /* cannot happen */ } } /* advance A pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, 
i_micro_kernel_config->datatype_size ); /* advance B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_packed_width * i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size ); /* close k loop */ libxsmm_generator_gemm_footer_kloop( io_generated_code, io_loop_label_tracker, i_gp_reg_mapping, i_micro_kernel_config, i_xgemm_desc, 0, i_xgemm_desc->k, 0 ); /* store C accumulator */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* in case of masking we need to distinguish between AVX/AVX2 and AVX512 */ if ( l_use_masking ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, l_use_masking, 0, 1 ); } else { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_ac_move_instr, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 15, 1); } } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 0, 0, 1 ); } } /* reset A pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_a, i_xgemm_desc->k * i_micro_kernel_config->datatype_size ); /* reset B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_b, i_xgemm_desc->k * i_xgemm_desc->ldb * 
i_packed_width * i_micro_kernel_config->datatype_size ); } libxsmm-1.17/src/generator_packed_gemm_bc_rm_avx_avx2_avx512.h000066400000000000000000000071201415223013700244430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_PACKED_GEMM_BC_RM_AVX_AVX2_AVX512_H #define GENERATOR_PACKED_GEMM_BC_RM_AVX_AVX2_AVX512_H #include #include "generator_common.h" LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width ); LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width, const unsigned int i_n_blocking ); LIBXSMM_API_INTERN void libxsmm_generator_packed_gemm_bc_rm_avx_avx2_avx512_kloop_simd_packed( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width, const unsigned int i_simd_packed_width, const unsigned int i_simd_packed_valid, const unsigned int i_n_blocking ); #endif /* 
GENERATOR_PACKED_GEMM_BC_RM_AVX_AVX2_AVX512_H */ libxsmm-1.17/src/generator_packed_gemmnn.h000066400000000000000000001121061415223013700207120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_PACKED_GEMMNN_H #define GENERATOR_PACKED_GEMMNN_H #include "generator_x86_instructions.h" #include "generator_common.h" #include /* Does C(cm1:cm2,cn1:cn2) <- alpha*A(am1:am2,ak1:ak2)*B(bk1:bk2,bn1:bn2) + beta *C(cm1:cm2,cn1:cn2) Obviously, the dimensions must conform Alpha and Beta are doubles even if the work should be single. 
Just convert */ LIBXSMM_API_INLINE void compact_gemmnn_ ( unsigned int tra, /* 0 if non-transpose */ unsigned int trb, /* 0 if non-transpose */ unsigned int trc, /* 0 if non-transpose */ unsigned int am1, unsigned int am2, unsigned int ak1, unsigned int ak2, unsigned int bk1, unsigned int bk2, unsigned int bn1, unsigned int bn2, unsigned int cm1, unsigned int cm2, unsigned int cn1, unsigned int cn2, double alpha, unsigned int areg, unsigned int lda, unsigned int breg, unsigned int ldb, double beta, unsigned int creg, unsigned int ldc, libxsmm_generated_code* io_code, unsigned int numb, char regset, unsigned int iunroll, unsigned int junroll, unsigned int loopi, unsigned int loopj ) { unsigned int i, j, l, datasz, nloopcnt=0, mloopcnt=0, nborder, mborder, mloopadj = 0; int aoffset, boffset, coffset; /* Address calcs for the loops */ unsigned int iun=3, jun=3; /* Register blocking sizes */ int a0 = -1, a1 = -1, a2 = -1, a3 = -1, a4 = -1, a5 = -1, a6 = -1, a7 = -1; int b0 = -1, b1 = -1, b2 = -1, b3 = -1, b4 = -1, b5 = -1, b6 = -1, b7 = -1; int c00 = -1, c01 = -1, c02 = -1, c03 = -1, c04 = -1, c05 = -1, c06 = -1, c07 = -1; int c10 = -1, c11 = -1, c12 = -1, c13 = -1, c14 = -1, c15 = -1, c16 = -1, c17 = -1; int c20 = -1, c21 = -1, c22 = -1, c23 = -1, c24 = -1, c25 = -1, c26 = -1, c27 = -1; int c30 = -1, c31 = -1, c32 = -1, c33 = -1, c34 = -1, c35 = -1, c36 = -1, c37 = -1; int c40 = -1, c41 = -1, c42 = -1, c43 = -1, c44 = -1, c45 = -1, c46 = -1, c47 = -1; int c50 = -1, c51 = -1, c52 = -1, c53 = -1, c54 = -1, c55 = -1, c56 = -1, c57 = -1; int c60 = -1, c61 = -1, c62 = -1, c63 = -1, c64 = -1, c65 = -1, c66 = -1, c67 = -1; int c70 = -1, c71 = -1, c72 = -1, c73 = -1, c74 = -1, c75 = -1, c76 = -1, c77 = -1; int c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; int j0 = 1, j1, j2, j3, j4, j5, j6, j7; int i0 = 1, i1, i2, i3, i4, i5, i6, i7; unsigned int maxregblocking = 8, maxreg = 16; libxsmm_loop_label_tracker l_loop_label_tracker; /* Test that the dimensions 
conform */ if ( (am2-am1) != (cm2-cm1) ) { printf("compact_gemmnn m-dimensions don't conform: %u != %u\n",am2-am1+1,cm2-cm1+1); exit(-1); } if ( (ak2-ak1) != (bk2-bk1) ) { printf("compact_gemmnn k-dimensions don't conform: %u != %u\n",ak2-ak1+1,bk2-bk1+1); exit(-1); } if ( (bn2-bn1) != (cn2-cn1) ) { printf("compact_gemmnn n-dimensions don't conform: %u != %u\n",ak2-ak1+1,bk2-bk1+1); exit(-1); } /* See that all dimensions are at least 1 */ if ( am2 < am1) { printf("compact_gemmnn m-dimension too small: %u\n",am2-am1+1); exit(-1); } if ( ak2 < ak1) { printf("compact_gemmnn k-dimension too small: %u\n",ak2-ak1+1); exit(-1); } if ( bn2 < bn1) { printf("compact_gemmnn n-dimension too small: %u\n",bn2-bn1+1); exit(-1); } /* Check that areg, breg, creg is valid */ if ( /*(areg < 0) ||*/ (areg > 15) ) { printf("compact_gemmnn A gp register invalid: %u\n",areg); exit(-1); } if ( /*(breg < 0) ||*/ (breg > 15) ) { printf("compact_gemmnn B gp register invalid: %u\n",breg); exit(-1); } if ( /*(creg < 0) ||*/ (creg > 15) ) { printf("compact_gemmnn C gp register invalid: %u\n",creg); exit(-1); } if ( (numb == 8) && (regset=='z') ) { datasz = 8; } else if ( (numb == 16) && (regset=='z') ) { datasz = 4; } else if ( (numb == 8) && (regset=='y') ) { datasz = 4; } else if ( (numb == 4) && (regset=='y') ) { datasz = 8; } else { printf("compact_gemmnn Unknown number=%u or regset=%c\n",numb,regset); exit(-1); } if ( regset == 'y' ) { iun = 3; jun = 3; maxreg=16; maxregblocking= 7; } if ( regset == 'z' ) { iun = 5; jun = 4; maxreg=32; maxregblocking= 8; } if ( iunroll > 0 ) iun = iunroll; if ( junroll > 0 ) jun = junroll; /* Make sure values of register blocking are between 1 and maxregblocking */ iun = LIBXSMM_MAX(LIBXSMM_MIN(iun,maxregblocking),1); jun = LIBXSMM_MAX(LIBXSMM_MIN(jun,maxregblocking),1); /* CHeck to see the register blocking parameters make sense: */ if ( maxreg < 3 ) { printf("Sorry, not enough registers available in compact gemm nn\n"); exit(-1); } while ( 
iun+jun+iun*jun > maxreg ) { if ( (iun >= jun) && (iun > 1) ) --iun; else if ( (jun >= iun) && (jun > 1) ) --jun; else { printf("Seems strange that we can't reduce the registers in compact gemm nn\n"); exit(-1); } } /* Determine if the problem is too small for loops giving this register blocking */ mloopcnt = (int)((am2-am1+1)/iun); nloopcnt = (int)((bn2-bn1+1)/jun); if ( mloopcnt < 2 ) loopi = 0; if ( nloopcnt < 2 ) loopj = 0; mborder = (am2-am1+1)-mloopcnt*iun; nborder = (bn2-bn1+1)-nloopcnt*jun; if ( loopj || loopi ) { libxsmm_reset_loop_label_tracker ( &l_loop_label_tracker ); } /* DO register blocking */ a0 = 0; if ( iun > 1 ) a1 = 1; if ( iun > 2 ) a2 = 2; if ( iun > 3 ) a3 = 3; if ( iun > 4 ) a4 = 4; if ( iun > 5 ) a5 = 5; if ( iun > 6 ) a6 = 6; if ( iun > 7 ) a7 = 7; b0 = iun; if ( jun > 1 ) b1 = b0+1; if ( jun > 2 ) b2 = b0+2; if ( jun > 3 ) b3 = b0+3; if ( jun > 4 ) b4 = b0+4; if ( jun > 5 ) b5 = b0+5; if ( jun > 6 ) b6 = b0+6; if ( jun > 7 ) b7 = b0+7; c00 = iun + jun; if ( jun > 1 ) c01 = c00 + 1; if ( jun > 2 ) c02 = c00 + 2; if ( jun > 3 ) c03 = c00 + 3; if ( jun > 4 ) c04 = c00 + 4; if ( jun > 5 ) c05 = c00 + 5; if ( jun > 6 ) c06 = c00 + 6; if ( jun > 7 ) c07 = c00 + 7; if ( iun > 1 ) c10 = c00 + jun; if ( (iun > 1) && (jun > 1) ) c11 = c10 + 1; if ( (iun > 1) && (jun > 2) ) c12 = c10 + 2; if ( (iun > 1) && (jun > 3) ) c13 = c10 + 3; if ( (iun > 1) && (jun > 4) ) c14 = c10 + 4; if ( (iun > 1) && (jun > 5) ) c15 = c10 + 5; if ( (iun > 1) && (jun > 6) ) c16 = c10 + 6; if ( (iun > 1) && (jun > 7) ) c17 = c10 + 7; if ( iun > 2 ) c20 = c10 + jun; if ( (iun > 2) && (jun > 1) ) c21 = c20 + 1; if ( (iun > 2) && (jun > 2) ) c22 = c20 + 2; if ( (iun > 2) && (jun > 3) ) c23 = c20 + 3; if ( (iun > 2) && (jun > 4) ) c24 = c20 + 4; if ( (iun > 2) && (jun > 5) ) c25 = c20 + 5; if ( (iun > 2) && (jun > 6) ) c26 = c20 + 6; if ( (iun > 2) && (jun > 7) ) c27 = c20 + 7; if ( iun > 3 ) c30 = c20 + jun; if ( (iun > 3) && (jun > 1) ) c31 = c30 + 1; if ( (iun > 3) && (jun 
> 2) ) c32 = c30 + 2; if ( (iun > 3) && (jun > 3) ) c33 = c30 + 3; if ( (iun > 3) && (jun > 4) ) c34 = c30 + 4; if ( (iun > 3) && (jun > 5) ) c35 = c30 + 5; if ( (iun > 3) && (jun > 6) ) c36 = c30 + 6; if ( (iun > 3) && (jun > 7) ) c37 = c30 + 7; if ( iun > 4 ) c40 = c30 + jun; if ( (iun > 4) && (jun > 1) ) c41 = c40 + 1; if ( (iun > 4) && (jun > 2) ) c42 = c40 + 2; if ( (iun > 4) && (jun > 3) ) c43 = c40 + 3; if ( (iun > 4) && (jun > 4) ) c44 = c40 + 4; if ( (iun > 4) && (jun > 5) ) c45 = c40 + 5; if ( (iun > 4) && (jun > 6) ) c46 = c40 + 6; if ( (iun > 4) && (jun > 7) ) c47 = c40 + 7; if ( iun > 5 ) c50 = c40 + jun; if ( (iun > 5) && (jun > 1) ) c51 = c50 + 1; if ( (iun > 5) && (jun > 2) ) c52 = c50 + 2; if ( (iun > 5) && (jun > 3) ) c53 = c50 + 3; if ( (iun > 5) && (jun > 4) ) c54 = c50 + 4; if ( (iun > 5) && (jun > 5) ) c55 = c50 + 5; if ( (iun > 5) && (jun > 6) ) c56 = c50 + 6; if ( (iun > 5) && (jun > 7) ) c57 = c50 + 7; if ( iun > 6 ) c60 = c50 + jun; if ( (iun > 6) && (jun > 1) ) c61 = c60 + 1; if ( (iun > 6) && (jun > 2) ) c62 = c60 + 2; if ( (iun > 6) && (jun > 3) ) c63 = c60 + 3; if ( (iun > 6) && (jun > 4) ) c64 = c60 + 4; if ( (iun > 6) && (jun > 5) ) c65 = c60 + 5; if ( (iun > 6) && (jun > 6) ) c66 = c60 + 6; if ( (iun > 6) && (jun > 7) ) c67 = c60 + 7; if ( iun > 7 ) c70 = c60 + jun; if ( (iun > 7) && (jun > 1) ) c71 = c70 + 1; if ( (iun > 7) && (jun > 2) ) c72 = c70 + 2; if ( (iun > 7) && (jun > 3) ) c73 = c70 + 3; if ( (iun > 7) && (jun > 4) ) c74 = c70 + 4; if ( (iun > 7) && (jun > 5) ) c75 = c70 + 5; if ( (iun > 7) && (jun > 6) ) c76 = c70 + 6; if ( (iun > 7) && (jun > 7) ) c77 = c70 + 7; #if 0 #define COMPACT_GEMMNN_DEBUG #endif #ifdef COMPACT_GEMMNN_DEBUG printf("iun=%d jun=%d loopi=%d loopj=%d\n",iun,jun,loopi,loopj); printf("areg=%d breg=%d creg=%d mborder=%d nborder=%d\n",areg,breg,creg,mborder,nborder); printf("a0:7=%d %d %d %d %d %d %d %d\n",a0,a1,a2,a3,a4,a5,a6,a7); printf("b0:7=%d %d %d %d %d %d %d %d\n",b0,b1,b2,b3,b4,b5,b6,b7); 
printf("c0,0:7=%d %d %d %d %d %d %d %d\n",c00,c01,c02,c03,c04,c05,c06,c07); if (c10>0) printf("c1,0:7=%d %d %d %d %d %d %d %d\n",c10,c11,c12,c13,c14,c15,c16,c17); if (c20>0) printf("c2,0:7=%d %d %d %d %d %d %d %d\n",c20,c21,c22,c23,c24,c25,c26,c27); if (c30>0) printf("c3,0:7=%d %d %d %d %d %d %d %d\n",c30,c31,c32,c33,c34,c35,c36,c37); if (c40>0) printf("c4,0:7=%d %d %d %d %d %d %d %d\n",c40,c41,c42,c43,c44,c45,c46,c47); if (c50>0) printf("c5,0:7=%d %d %d %d %d %d %d %d\n",c50,c51,c52,c53,c54,c55,c56,c57); if (c60>0) printf("c6,0:7=%d %d %d %d %d %d %d %d\n",c60,c61,c62,c63,c64,c65,c66,c67); if (c70>0) printf("c7,0:7=%d %d %d %d %d %d %d %d\n",c70,c71,c72,c73,c74,c75,c76,c77); #endif if ( loopj && (nloopcnt >=2) ) { #ifdef COMPACT_GEMMNN_DEBUG printf("Setting up n-loop: loopj=%d nloopcnt=%d\n",loopj,nloopcnt); #endif libxsmm_x86_instruction_alu_imm( io_code, LIBXSMM_X86_INSTR_MOVQ, LIBXSMM_X86_GP_REG_RAX, nloopcnt ); libxsmm_x86_instruction_register_jump_back_label( io_code, &l_loop_label_tracker ); } for ( j = bn1; j <= bn2; j+=jun ) { #ifdef COMPACT_GEMMNN_DEBUG printf("Doing j loop from %d to %d with blocksize %d\n",bn1,bn2,jun); #endif if ( ( j <= bn2 ) && ( jun >= 1 ) ) j0 = 1; else j0 = 0; if ( ( j+1 <= bn2 ) && ( jun >= 2 ) ) j1 = 1; else j1 = 0; if ( ( j+2 <= bn2 ) && ( jun >= 3 ) ) j2 = 1; else j2 = 0; if ( ( j+3 <= bn2 ) && ( jun >= 4 ) ) j3 = 1; else j3 = 0; if ( ( j+4 <= bn2 ) && ( jun >= 5 ) ) j4 = 1; else j4 = 0; if ( ( j+5 <= bn2 ) && ( jun >= 6 ) ) j5 = 1; else j5 = 0; if ( ( j+6 <= bn2 ) && ( jun >= 7 ) ) j6 = 1; else j6 = 0; if ( ( j+7 <= bn2 ) && ( jun >= 8 ) ) j7 = 1; else j7 = 0; if ( loopj && (j > bn1) && (j + jun -1 <= bn2) ) { /* Turn everything off, we're really supposed to be in a loop */ j0=0; j1=0; j2=0; j3=0; j4=0; j5=0; j6=0; j7=0; #ifdef COMPACT_GEMMNN_DEBUG printf("Emptying n-loop for j=%d\n",j); #endif } if ( loopi && (mloopcnt >=2) && j0 ) { #ifdef COMPACT_GEMMNN_DEBUG printf("Setting up m-loop: loopi=%d 
mloopcnt=%d\n",loopi,mloopcnt); #endif libxsmm_x86_instruction_alu_imm( io_code, LIBXSMM_X86_INSTR_MOVQ, LIBXSMM_X86_GP_REG_RCX, mloopcnt ); libxsmm_x86_instruction_register_jump_back_label( io_code, &l_loop_label_tracker ); mloopadj = 1; } for ( i = am1; i <= am2; i+=iun ) { if ( ( i <= am2 ) && ( iun >= 1 ) ) i0 = 1; else i0 = 0; if ( ( i+1 <= am2 ) && ( iun >= 2 ) ) i1 = 1; else i1 = 0; if ( ( i+2 <= am2 ) && ( iun >= 3 ) ) i2 = 1; else i2 = 0; if ( ( i+3 <= am2 ) && ( iun >= 4 ) ) i3 = 1; else i3 = 0; if ( ( i+4 <= am2 ) && ( iun >= 5 ) ) i4 = 1; else i4 = 0; if ( ( i+5 <= am2 ) && ( iun >= 6 ) ) i5 = 1; else i5 = 0; if ( ( i+6 <= am2 ) && ( iun >= 7 ) ) i6 = 1; else i6 = 0; if ( ( i+7 <= am2 ) && ( iun >= 8 ) ) i7 = 1; else i7 = 0; #ifdef COMPACT_GEMMNN_DEBUG printf("Doing i loop from %d to %d with blocksize %d (%d,%d,%d,%d,%d,%d,%d,%d)\n",am1,am2,iun,i0,i1,i2,i3,i4,i5,i6,i7); #endif if ( loopi && (i > am1) && (i + iun -1 <= am2) ) { /* Turn everything off, we're really supposed to be in a loop */ i0=0; i1=0; i2=0; i3=0; i4=0; i5=0; i6=0; i7=0; #ifdef COMPACT_GEMMNN_DEBUG printf("Emptying m-loop for i=%d j=%d i0=%d j0=%d\n",i,j,i0,j0); #endif } #ifdef COMPACT_GEMMNN_DEBUG if (i0 && j0 ) printf("Loading A into %d with tra=%d lda=%d i=%d j=%d numb=%d datasz=%d regset=%c areg=%d\n",a0,tra,lda,i,ak1,numb,datasz, regset,areg); #endif if (i0 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i, ak1, a0, numb, datasz, regset, areg ); #ifdef COMPACT_GEMMNN_DEBUG if (i0 && j0 ) printf("Loaded A into %d with tra=%d lda=%d i=%d j=%d numb=%d datasz=%d regset=%c areg=%d\n",a0,tra,lda,i,ak1,numb,datasz, regset,areg); #endif if (i0 && j0) compact_load_matrix_gen_ ( io_code, trb, ldb, bk1, j, b0, numb, datasz, regset, breg ); if (i0 && j0) compact_mult_two_nums_ ( io_code, a0, b0, c00, numb, regset ); if (i1 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+1, ak1, a1, numb, datasz, regset, areg ); if (i1 && j0) compact_mult_two_nums_ ( io_code, a1, b0, c10, numb, 
regset ); if (i2 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+2, ak1, a2, numb, datasz, regset, areg ); if (i2 && j0) compact_mult_two_nums_ ( io_code, a2, b0, c20, numb, regset ); if (i3 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+3, ak1, a3, numb, datasz, regset, areg ); if (i3 && j0) compact_mult_two_nums_ ( io_code, a3, b0, c30, numb, regset ); if (i4 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+4, ak1, a4, numb, datasz, regset, areg ); if (i4 && j0) compact_mult_two_nums_ ( io_code, a4, b0, c40, numb, regset ); if (i5 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+5, ak1, a5, numb, datasz, regset, areg ); if (i5 && j0) compact_mult_two_nums_ ( io_code, a5, b0, c50, numb, regset ); if (i6 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+6, ak1, a6, numb, datasz, regset, areg ); if (i6 && j0) compact_mult_two_nums_ ( io_code, a6, b0, c60, numb, regset ); if (i7 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+7, ak1, a7, numb, datasz, regset, areg ); if (i7 && j0) compact_mult_two_nums_ ( io_code, a7, b0, c70, numb, regset ); if (i0 && j1) compact_load_matrix_gen_ ( io_code, trb, ldb, bk1, j+1, b1, numb, datasz, regset, breg ); if (i0 && j1) compact_mult_two_nums_ ( io_code, a0, b1, c01, numb, regset ); if (i0 && j2) compact_load_matrix_gen_ ( io_code, trb, ldb, bk1, j+2, b2, numb, datasz, regset, breg ); if (i0 && j2) compact_mult_two_nums_ ( io_code, a0, b2, c02, numb, regset ); if (i0 && j3) compact_load_matrix_gen_ ( io_code, trb, ldb, bk1, j+3, b3, numb, datasz, regset, breg ); if (i0 && j3) compact_mult_two_nums_ ( io_code, a0, b3, c03, numb, regset ); if (i0 && j4) compact_load_matrix_gen_ ( io_code, trb, ldb, bk1, j+4, b4, numb, datasz, regset, breg ); if (i0 && j4) compact_mult_two_nums_ ( io_code, a0, b4, c04, numb, regset ); if (i0 && j5) compact_load_matrix_gen_ ( io_code, trb, ldb, bk1, j+5, b5, numb, datasz, regset, breg ); if (i0 && j5) compact_mult_two_nums_ ( io_code, a0, b5, c05, numb, regset 
); if (i0 && j6) compact_load_matrix_gen_ ( io_code, trb, ldb, bk1, j+6, b6, numb, datasz, regset, breg ); if (i0 && j6) compact_mult_two_nums_ ( io_code, a0, b6, c06, numb, regset ); if (i0 && j7) compact_load_matrix_gen_ ( io_code, trb, ldb, bk1, j+7, b7, numb, datasz, regset, breg ); if (i0 && j7) compact_mult_two_nums_ ( io_code, a0, b7, c07, numb, regset ); if (i1 && j1) compact_mult_two_nums_ ( io_code, a1, b1, c11, numb, regset ); if (i1 && j2) compact_mult_two_nums_ ( io_code, a1, b2, c12, numb, regset ); if (i1 && j3) compact_mult_two_nums_ ( io_code, a1, b3, c13, numb, regset ); if (i1 && j4) compact_mult_two_nums_ ( io_code, a1, b4, c14, numb, regset ); if (i1 && j5) compact_mult_two_nums_ ( io_code, a1, b5, c15, numb, regset ); if (i1 && j6) compact_mult_two_nums_ ( io_code, a1, b6, c16, numb, regset ); if (i1 && j7) compact_mult_two_nums_ ( io_code, a1, b7, c17, numb, regset ); if (i2 && j1) compact_mult_two_nums_ ( io_code, a2, b1, c21, numb, regset ); if (i2 && j2) compact_mult_two_nums_ ( io_code, a2, b2, c22, numb, regset ); if (i2 && j3) compact_mult_two_nums_ ( io_code, a2, b3, c23, numb, regset ); if (i2 && j4) compact_mult_two_nums_ ( io_code, a2, b4, c24, numb, regset ); if (i2 && j5) compact_mult_two_nums_ ( io_code, a2, b5, c25, numb, regset ); if (i2 && j6) compact_mult_two_nums_ ( io_code, a2, b6, c26, numb, regset ); if (i2 && j7) compact_mult_two_nums_ ( io_code, a2, b7, c27, numb, regset ); if (i3 && j1) compact_mult_two_nums_ ( io_code, a3, b1, c31, numb, regset ); if (i3 && j2) compact_mult_two_nums_ ( io_code, a3, b2, c32, numb, regset ); if (i3 && j3) compact_mult_two_nums_ ( io_code, a3, b3, c33, numb, regset ); if (i3 && j4) compact_mult_two_nums_ ( io_code, a3, b4, c34, numb, regset ); if (i3 && j5) compact_mult_two_nums_ ( io_code, a3, b5, c35, numb, regset ); if (i3 && j6) compact_mult_two_nums_ ( io_code, a3, b6, c36, numb, regset ); if (i3 && j7) compact_mult_two_nums_ ( io_code, a3, b7, c37, numb, regset ); if (i4 && j1) 
compact_mult_two_nums_ ( io_code, a4, b1, c41, numb, regset ); if (i4 && j2) compact_mult_two_nums_ ( io_code, a4, b2, c42, numb, regset ); if (i4 && j3) compact_mult_two_nums_ ( io_code, a4, b3, c43, numb, regset ); if (i4 && j4) compact_mult_two_nums_ ( io_code, a4, b4, c44, numb, regset ); if (i4 && j5) compact_mult_two_nums_ ( io_code, a4, b5, c45, numb, regset ); if (i4 && j6) compact_mult_two_nums_ ( io_code, a4, b6, c46, numb, regset ); if (i4 && j7) compact_mult_two_nums_ ( io_code, a4, b7, c47, numb, regset ); if (i5 && j1) compact_mult_two_nums_ ( io_code, a5, b1, c51, numb, regset ); if (i5 && j2) compact_mult_two_nums_ ( io_code, a5, b2, c52, numb, regset ); if (i5 && j3) compact_mult_two_nums_ ( io_code, a5, b3, c53, numb, regset ); if (i5 && j4) compact_mult_two_nums_ ( io_code, a5, b4, c54, numb, regset ); if (i5 && j5) compact_mult_two_nums_ ( io_code, a5, b5, c55, numb, regset ); if (i5 && j6) compact_mult_two_nums_ ( io_code, a5, b6, c56, numb, regset ); if (i5 && j7) compact_mult_two_nums_ ( io_code, a5, b7, c57, numb, regset ); if (i6 && j1) compact_mult_two_nums_ ( io_code, a6, b1, c61, numb, regset ); if (i6 && j2) compact_mult_two_nums_ ( io_code, a6, b2, c62, numb, regset ); if (i6 && j3) compact_mult_two_nums_ ( io_code, a6, b3, c63, numb, regset ); if (i6 && j4) compact_mult_two_nums_ ( io_code, a6, b4, c64, numb, regset ); if (i6 && j5) compact_mult_two_nums_ ( io_code, a6, b5, c65, numb, regset ); if (i6 && j6) compact_mult_two_nums_ ( io_code, a6, b6, c66, numb, regset ); if (i6 && j7) compact_mult_two_nums_ ( io_code, a6, b7, c67, numb, regset ); if (i7 && j1) compact_mult_two_nums_ ( io_code, a7, b1, c71, numb, regset ); if (i7 && j2) compact_mult_two_nums_ ( io_code, a7, b2, c72, numb, regset ); if (i7 && j3) compact_mult_two_nums_ ( io_code, a7, b3, c73, numb, regset ); if (i7 && j4) compact_mult_two_nums_ ( io_code, a7, b4, c74, numb, regset ); if (i7 && j5) compact_mult_two_nums_ ( io_code, a7, b5, c75, numb, regset ); if (i7 && 
j6) compact_mult_two_nums_ ( io_code, a7, b6, c76, numb, regset ); if (i7 && j7) compact_mult_two_nums_ ( io_code, a7, b7, c77, numb, regset ); for ( l = ak1+1; l <= ak2; l++ ) { #ifdef COMPACT_GEMMNN_DEBUG printf("Doing l loop from %d to %d\n",ak1+1,ak2); #endif if (i0 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i, l, a0, numb, datasz, regset, areg ); if (i0 && j0) compact_load_matrix_gen_ ( io_code, trb, ldb, l-ak1+bk1, j, b0, numb, datasz, regset, breg); if (i0 && j0) compact_fma_cplusab_ ( io_code, c00, a0, b0, numb, regset ); if (i1 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+1, l, a1, numb, datasz, regset, areg ); if (i1 && j0) compact_fma_cplusab_ ( io_code, c10, a1, b0, numb, regset ); if (i2 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+2, l, a2, numb, datasz, regset, areg ); if (i2 && j0) compact_fma_cplusab_ ( io_code, c20, a2, b0, numb, regset ); if (i3 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+3, l, a3, numb, datasz, regset, areg ); if (i3 && j0) compact_fma_cplusab_ ( io_code, c30, a3, b0, numb, regset ); if (i4 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+4, l, a4, numb, datasz, regset, areg ); if (i4 && j0) compact_fma_cplusab_ ( io_code, c40, a4, b0, numb, regset ); if (i5 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+5, l, a5, numb, datasz, regset, areg ); if (i5 && j0) compact_fma_cplusab_ ( io_code, c50, a5, b0, numb, regset ); if (i6 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+6, l, a6, numb, datasz, regset, areg ); if (i6 && j0) compact_fma_cplusab_ ( io_code, c60, a6, b0, numb, regset ); if (i7 && j0) compact_load_matrix_gen_ ( io_code, tra, lda, i+7, l, a7, numb, datasz, regset, areg ); if (i7 && j0) compact_fma_cplusab_ ( io_code, c70, a7, b0, numb, regset ); if (i0 && j1) compact_load_matrix_gen_ ( io_code, trb, ldb, l-ak1+bk1, j+1, b1, numb, datasz, regset, breg); if (i0 && j1) compact_fma_cplusab_ ( io_code, c01, a0, b1, numb, regset ); if (i0 && j2) 
compact_load_matrix_gen_ ( io_code, trb, ldb, l-ak1+bk1, j+2, b2, numb, datasz, regset, breg); if (i0 && j2) compact_fma_cplusab_ ( io_code, c02, a0, b2, numb, regset ); if (i0 && j3) compact_load_matrix_gen_ ( io_code, trb, ldb, l-ak1+bk1, j+3, b3, numb, datasz, regset, breg); if (i0 && j3) compact_fma_cplusab_ ( io_code, c03, a0, b3, numb, regset ); if (i0 && j4) compact_load_matrix_gen_ ( io_code, trb, ldb, l-ak1+bk1, j+4, b4, numb, datasz, regset, breg); if (i0 && j4) compact_fma_cplusab_ ( io_code, c04, a0, b4, numb, regset ); if (i0 && j5) compact_load_matrix_gen_ ( io_code, trb, ldb, l-ak1+bk1, j+5, b5, numb, datasz, regset, breg); if (i0 && j5) compact_fma_cplusab_ ( io_code, c05, a0, b5, numb, regset ); if (i0 && j6) compact_load_matrix_gen_ ( io_code, trb, ldb, l-ak1+bk1, j+6, b6, numb, datasz, regset, breg); if (i0 && j6) compact_fma_cplusab_ ( io_code, c06, a0, b6, numb, regset ); if (i0 && j7) compact_load_matrix_gen_ ( io_code, trb, ldb, l-ak1+bk1, j+7, b7, numb, datasz, regset, breg); if (i0 && j7) compact_fma_cplusab_ ( io_code, c07, a0, b7, numb, regset ); if (i1 && j1) compact_fma_cplusab_ ( io_code, c11, a1, b1, numb, regset ); if (i1 && j2) compact_fma_cplusab_ ( io_code, c12, a1, b2, numb, regset ); if (i1 && j3) compact_fma_cplusab_ ( io_code, c13, a1, b3, numb, regset ); if (i1 && j4) compact_fma_cplusab_ ( io_code, c14, a1, b4, numb, regset ); if (i1 && j5) compact_fma_cplusab_ ( io_code, c15, a1, b5, numb, regset ); if (i1 && j6) compact_fma_cplusab_ ( io_code, c16, a1, b6, numb, regset ); if (i1 && j7) compact_fma_cplusab_ ( io_code, c17, a1, b7, numb, regset ); if (i2 && j1) compact_fma_cplusab_ ( io_code, c21, a2, b1, numb, regset ); if (i2 && j2) compact_fma_cplusab_ ( io_code, c22, a2, b2, numb, regset ); if (i2 && j3) compact_fma_cplusab_ ( io_code, c23, a2, b3, numb, regset ); if (i2 && j4) compact_fma_cplusab_ ( io_code, c24, a2, b4, numb, regset ); if (i2 && j5) compact_fma_cplusab_ ( io_code, c25, a2, b5, numb, regset ); if (i2 && 
j6) compact_fma_cplusab_ ( io_code, c26, a2, b6, numb, regset ); if (i2 && j7) compact_fma_cplusab_ ( io_code, c27, a2, b7, numb, regset ); if (i3 && j1) compact_fma_cplusab_ ( io_code, c31, a3, b1, numb, regset ); if (i3 && j2) compact_fma_cplusab_ ( io_code, c32, a3, b2, numb, regset ); if (i3 && j3) compact_fma_cplusab_ ( io_code, c33, a3, b3, numb, regset ); if (i3 && j4) compact_fma_cplusab_ ( io_code, c34, a3, b4, numb, regset ); if (i3 && j5) compact_fma_cplusab_ ( io_code, c35, a3, b5, numb, regset ); if (i3 && j6) compact_fma_cplusab_ ( io_code, c36, a3, b6, numb, regset ); if (i3 && j7) compact_fma_cplusab_ ( io_code, c37, a3, b7, numb, regset ); if (i4 && j1) compact_fma_cplusab_ ( io_code, c41, a4, b1, numb, regset ); if (i4 && j2) compact_fma_cplusab_ ( io_code, c42, a4, b2, numb, regset ); if (i4 && j3) compact_fma_cplusab_ ( io_code, c43, a4, b3, numb, regset ); if (i4 && j4) compact_fma_cplusab_ ( io_code, c44, a4, b4, numb, regset ); if (i4 && j5) compact_fma_cplusab_ ( io_code, c45, a4, b5, numb, regset ); if (i4 && j6) compact_fma_cplusab_ ( io_code, c46, a4, b6, numb, regset ); if (i4 && j7) compact_fma_cplusab_ ( io_code, c47, a4, b7, numb, regset ); if (i5 && j1) compact_fma_cplusab_ ( io_code, c51, a5, b1, numb, regset ); if (i5 && j2) compact_fma_cplusab_ ( io_code, c52, a5, b2, numb, regset ); if (i5 && j3) compact_fma_cplusab_ ( io_code, c53, a5, b3, numb, regset ); if (i5 && j4) compact_fma_cplusab_ ( io_code, c54, a5, b4, numb, regset ); if (i5 && j5) compact_fma_cplusab_ ( io_code, c55, a5, b5, numb, regset ); if (i5 && j6) compact_fma_cplusab_ ( io_code, c56, a5, b6, numb, regset ); if (i5 && j7) compact_fma_cplusab_ ( io_code, c57, a5, b7, numb, regset ); if (i6 && j1) compact_fma_cplusab_ ( io_code, c61, a6, b1, numb, regset ); if (i6 && j2) compact_fma_cplusab_ ( io_code, c62, a6, b2, numb, regset ); if (i6 && j3) compact_fma_cplusab_ ( io_code, c63, a6, b3, numb, regset ); if (i6 && j4) compact_fma_cplusab_ ( io_code, c64, a6, b4, 
numb, regset ); if (i6 && j5) compact_fma_cplusab_ ( io_code, c65, a6, b5, numb, regset ); if (i6 && j6) compact_fma_cplusab_ ( io_code, c66, a6, b6, numb, regset ); if (i6 && j7) compact_fma_cplusab_ ( io_code, c67, a6, b7, numb, regset ); if (i7 && j1) compact_fma_cplusab_ ( io_code, c71, a7, b1, numb, regset ); if (i7 && j2) compact_fma_cplusab_ ( io_code, c72, a7, b2, numb, regset ); if (i7 && j3) compact_fma_cplusab_ ( io_code, c73, a7, b3, numb, regset ); if (i7 && j4) compact_fma_cplusab_ ( io_code, c74, a7, b4, numb, regset ); if (i7 && j5) compact_fma_cplusab_ ( io_code, c75, a7, b5, numb, regset ); if (i7 && j6) compact_fma_cplusab_ ( io_code, c76, a7, b6, numb, regset ); if (i7 && j7) compact_fma_cplusab_ ( io_code, c77, a7, b7, numb, regset ); } /* Inner loop */ /* Storing into C, do it one column at a time and reuse some regs */ for ( l = j; l <= LIBXSMM_MIN(j+jun-1,bn2); l++ ) { #ifdef COMPACT_GEMMNN_DEBUG printf("Doing j wrap-up storage from %d to %d\n",j,LIBXSMM_MIN(j+jun-1,bn2)); #endif if (l== j ) { c0=c00; c1=c10; c2=c20; c3=c30; c4=c40; c5=c50; c6=c60; c7=c70; } if (l==j+1) { c0=c01; c1=c11; c2=c21; c3=c31; c4=c41; c5=c51; c6=c61; c7=c71; } if (l==j+2) { c0=c02; c1=c12; c2=c22; c3=c32; c4=c42; c5=c52; c6=c62; c7=c72; } if (l==j+3) { c0=c03; c1=c13; c2=c23; c3=c33; c4=c43; c5=c53; c6=c63; c7=c73; } if (l==j+4) { c0=c04; c1=c14; c2=c24; c3=c34; c4=c44; c5=c54; c6=c64; c7=c74; } if (l==j+5) { c0=c05; c1=c15; c2=c25; c3=c35; c4=c45; c5=c55; c6=c65; c7=c75; } if (l==j+6) { c0=c06; c1=c16; c2=c26; c3=c36; c4=c46; c5=c56; c6=c66; c7=c76; } if (l==j+7) { c0=c07; c1=c17; c2=c27; c3=c37; c4=c47; c5=c57; c6=c67; c7=c77; } if ( beta == 1.0 ) { if (i0 && j0) compact_load_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1, l-bn1+cn1, a0, numb, datasz, regset, creg ); if (i1 && j0) compact_load_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+1, l-bn1+cn1, a1, numb, datasz, regset, creg ); if (i2 && j0) compact_load_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+2, l-bn1+cn1, 
a2, numb, datasz, regset, creg ); if (i3 && j0) compact_load_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+3, l-bn1+cn1, a3, numb, datasz, regset, creg ); if (i4 && j0) compact_load_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+4, l-bn1+cn1, a4, numb, datasz, regset, creg ); if (i5 && j0) compact_load_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+5, l-bn1+cn1, a5, numb, datasz, regset, creg ); if (i6 && j0) compact_load_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+6, l-bn1+cn1, a6, numb, datasz, regset, creg ); if (i7 && j0) compact_load_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+7, l-bn1+cn1, a7, numb, datasz, regset, creg ); } else if ( (beta == 0.0) && (alpha != 1.0) ) { if (i0 && j0) compact_set_zero_( io_code, a0, numb, datasz, regset ); if (i1 && j0) compact_set_zero_( io_code, a1, numb, datasz, regset ); if (i2 && j0) compact_set_zero_( io_code, a2, numb, datasz, regset ); if (i3 && j0) compact_set_zero_( io_code, a3, numb, datasz, regset ); if (i4 && j0) compact_set_zero_( io_code, a4, numb, datasz, regset ); if (i5 && j0) compact_set_zero_( io_code, a5, numb, datasz, regset ); if (i6 && j0) compact_set_zero_( io_code, a6, numb, datasz, regset ); if (i7 && j0) compact_set_zero_( io_code, a7, numb, datasz, regset ); } if ( alpha == -1.0 ) { if (i0 && j0) compact_sub_two_nums_ ( io_code, a0, c0, c0, numb, regset ); if (i1 && j0) compact_sub_two_nums_ ( io_code, a1, c1, c1, numb, regset ); if (i2 && j0) compact_sub_two_nums_ ( io_code, a2, c2, c2, numb, regset ); if (i3 && j0) compact_sub_two_nums_ ( io_code, a3, c3, c3, numb, regset ); if (i4 && j0) compact_sub_two_nums_ ( io_code, a4, c4, c4, numb, regset ); if (i5 && j0) compact_sub_two_nums_ ( io_code, a5, c5, c5, numb, regset ); if (i6 && j0) compact_sub_two_nums_ ( io_code, a6, c6, c6, numb, regset ); if (i7 && j0) compact_sub_two_nums_ ( io_code, a7, c7, c7, numb, regset ); } else if ( (beta != 0.0) && (alpha==1.0) ) { if (i0 && j0) compact_add_two_nums_ ( io_code, a0, c0, c0, numb, regset ); if (i1 && j0) 
compact_add_two_nums_ ( io_code, a1, c1, c1, numb, regset ); if (i2 && j0) compact_add_two_nums_ ( io_code, a2, c2, c2, numb, regset ); if (i3 && j0) compact_add_two_nums_ ( io_code, a3, c3, c3, numb, regset ); if (i4 && j0) compact_add_two_nums_ ( io_code, a4, c4, c4, numb, regset ); if (i5 && j0) compact_add_two_nums_ ( io_code, a5, c5, c5, numb, regset ); if (i6 && j0) compact_add_two_nums_ ( io_code, a6, c6, c6, numb, regset ); if (i7 && j0) compact_add_two_nums_ ( io_code, a7, c7, c7, numb, regset ); } if (i0 && j0) compact_store_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1, l-bn1+cn1, c0, numb, datasz, regset, creg ); if (i1 && j0) compact_store_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+1, l-bn1+cn1, c1, numb, datasz, regset, creg ); if (i2 && j0) compact_store_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+2, l-bn1+cn1, c2, numb, datasz, regset, creg ); if (i3 && j0) compact_store_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+3, l-bn1+cn1, c3, numb, datasz, regset, creg ); if (i4 && j0) compact_store_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+4, l-bn1+cn1, c4, numb, datasz, regset, creg ); if (i5 && j0) compact_store_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+5, l-bn1+cn1, c5, numb, datasz, regset, creg ); if (i6 && j0) compact_store_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+6, l-bn1+cn1, c6, numb, datasz, regset, creg ); if (i7 && j0) compact_store_matrix_gen_ ( io_code, trc, ldc, i-am1+cm1+7, l-bn1+cn1, c7, numb, datasz, regset, creg ); } /* Store the results */ if ( loopi && j0 ) { aoffset = datasz*iun*numb; coffset = datasz*iun*numb; if ( i == am1 ) { #ifdef COMPACT_GEMMNN_DEBUG printf("Should be putting in a m-jump soon: i=%d j=%d i0=%d j0=%d am1=%d am2=%d\n",i,j,i0,j0,am1,am2); #endif libxsmm_x86_instruction_alu_imm( io_code, LIBXSMM_X86_INSTR_ADDQ, areg, aoffset ); libxsmm_x86_instruction_alu_imm( io_code, LIBXSMM_X86_INSTR_ADDQ, creg, coffset ); libxsmm_x86_instruction_alu_imm( io_code, LIBXSMM_X86_INSTR_SUBQ, LIBXSMM_X86_GP_REG_RCX, 1 ); 
libxsmm_x86_instruction_jump_back_to_label( io_code, LIBXSMM_X86_INSTR_JG, &l_loop_label_tracker ); } if ( (am2-i+1 < 2*iun) && ((mborder > 0) || (j + jun - 1 < bn2)) && (mloopadj==1) ) { #ifdef COMPACT_GEMMNN_DEBUG printf("Finished with m-loop, doing clean-up: i=%d i0=%d j0=%d mborder=%d j=%d jun=%d bn2=%d\n",i,i0,j0,mborder,j,jun,bn2); #endif aoffset = datasz*iun*numb*mloopcnt; coffset = datasz*iun*numb*mloopcnt; libxsmm_x86_instruction_alu_imm( io_code, LIBXSMM_X86_INSTR_SUBQ, areg, aoffset ); libxsmm_x86_instruction_alu_imm( io_code, LIBXSMM_X86_INSTR_SUBQ, creg, coffset ); mloopadj = 0; } i0 = 1; /* Turn everything back on again */ } } /* M-loop */ if ( loopj ) { coffset = ldc*datasz*jun*numb; boffset = ldb*datasz*jun*numb; if ( j == bn1 ) { #ifdef COMPACT_GEMMNN_DEBUG printf("Should be putting in a n-jump soon: j=%d bn1=%d bn2=%d\n",j,bn1,bn2); #endif libxsmm_x86_instruction_alu_imm( io_code, LIBXSMM_X86_INSTR_ADDQ, creg, coffset ); libxsmm_x86_instruction_alu_imm( io_code, LIBXSMM_X86_INSTR_ADDQ, breg, boffset ); libxsmm_x86_instruction_alu_imm( io_code, LIBXSMM_X86_INSTR_SUBQ, LIBXSMM_X86_GP_REG_RAX, 1 ); libxsmm_x86_instruction_jump_back_to_label( io_code, LIBXSMM_X86_INSTR_JG, &l_loop_label_tracker ); } if ( (bn2-j+1 < 2*jun) && (nborder > 0) ) { #ifdef COMPACT_GEMMNN_DEBUG printf("Finished with n-loop, doing clean-up, j=%d\n",j); #endif coffset = ldc*datasz*jun*numb*nloopcnt; boffset = ldb*datasz*jun*numb*nloopcnt; libxsmm_x86_instruction_alu_imm( io_code, LIBXSMM_X86_INSTR_SUBQ, creg, coffset ); libxsmm_x86_instruction_alu_imm( io_code, LIBXSMM_X86_INSTR_SUBQ, breg, boffset ); } j0 = 1; /* Turn everything back on again */ } } /* N-loop */ #ifdef COMPACT_GEMMNN_DEBUG printf("Inlined Compact GEMM code pointer ends at: %u\n",io_code->code_size); #endif } #endif /*GENERATOR_PACKED_GEMMNN_H*/ 
libxsmm-1.17/src/generator_packed_getrf_avx_avx512.c000066400000000000000000000236341415223013700225260ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry, Hans Pabst, Timothy Costa (Intel Corp.) ******************************************************************************/ #include "generator_packed_getrf_avx_avx512.h" #include "generator_x86_instructions.h" #include "generator_packed_aux.h" #include "generator_packed_gemmnn.h" #include "generator_common.h" #include "libxsmm_main.h" #if 0 # define GENERATOR_PACKED_GETRF_DEBUG #endif LIBXSMM_API_INTERN void libxsmm_generator_packed_getrf_avx_avx512_kernel( libxsmm_generated_code* io_code, const libxsmm_getrf_descriptor* i_packed_getrf_desc, const char* i_arch ) { unsigned char *const buf = (unsigned char *) io_code->generated_code; libxsmm_loop_label_tracker l_loop_label_tracker /*= { 0 }*/; /* avx512 just represents whether we want to use zmm registers or not * * A value of 0 says not, a value of 1 targets AVX512_CORE, a value * * of 2 targets AVX512_MIC */ int avx512; #if defined(_WIN32) || defined(__CYGWIN__) int l_matrix_gpreg = LIBXSMM_X86_GP_REG_RCX; #else int l_matrix_gpreg = LIBXSMM_X86_GP_REG_RDI; #endif #if 0 /* TOD: introduce/use register mapping rather than directly/hard-coding registers */ /* Just reuse transpose gp mapping */ libxsmm_getrf_gp_reg_mapping l_gp_reg_mapping = { 0/*avoid warning "maybe used uninitialized" */ }; /* define gp register mapping */ #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX; 
l_gp_reg_mapping.gp_reg_lda = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_ldb = LIBXSMM_X86_GP_REG_R9; l_gp_reg_mapping.gp_reg_m_loop = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_n_loop = LIBXSMM_X86_GP_REG_RSI; #else /* match calling convention on Linux */ l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_lda = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_ldb = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_m_loop = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_n_loop = LIBXSMM_X86_GP_REG_R9; #endif l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF; /* Actually, the logic is this: we need a, lda, and b. 
We don't need ldb * * If n>=6, we need rbx * * If n>=8, we need rbp * * If LIBXSMM_MIN(n,REGSIZE)>=5 and m%REGSIZE==1, we need r12 * * If LIBXSMM_MIN(n,REGSIZE)>=6 and m%REGSIZE==1, we need r13 * * If LIBXSMM_MIN(n,REGSIZE)>=7 and m%REGSIZE==1, we need r14 * * If LIBXSMM_MIN(n,REGSIZE)>=8 and m%REGSIZE==1, we need r15 * * Otherwise, we get by with registers that don't require pushing/popping */ #endif /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* define transposition kernel config */ if (strcmp(i_arch, "skx") == 0) { avx512 = 1; } else if (strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0) { avx512 = 2; } else if (strcmp(i_arch, "snb") == 0 || strcmp(i_arch, "hsw") == 0) { avx512 = 0; } else { LIBXSMM_HANDLE_ERROR( io_code, LIBXSMM_ERR_UNSUP_ARCH ); return; } /* @Greg add more fields here */ /* @Greg add generator code here, please use functions defined in generator_x86_instructions.h */ /* Todo-> I first want this code to work, and verify it works, then I can * convert one instruction at a time to those in * generator_x86_instructions.h. 
Or add to the existing instructions */ if ( io_code->code_type > 1 ) { unsigned int i = io_code->code_size; unsigned int m = i_packed_getrf_desc->m; unsigned int n = i_packed_getrf_desc->n; unsigned int lda = i_packed_getrf_desc->lda; const unsigned int lay = (unsigned int)i_packed_getrf_desc->layout; unsigned int datasz = (unsigned int)i_packed_getrf_desc->typesize; /*const double beta = 1.0;*/ unsigned int m1=m, n1=n, mn; unsigned int j, k, ii; unsigned int tra=0, trb=0, trc=0, iunroll=3, junroll=3, loopi=1, loopj=1; /*int REGSIZE;*/ int numb = 0; unsigned int bot, fincol; /*int nounit=0;*/ unsigned int /*mb,*/ nb; /*int iun, jun;*/ char regset = 'y'; double one = 1.0; double none = -1.0; /* Register mapping: */ int a0 = 0, a1 = 1, a2 = 2; int b0 = 3/*, b1 = 4, b2 = 5, b3*/; /*int c00 = 6, c01 = 7, c02 = 8, c03;*/ /*int c10 = 9, c11 = 10, c12 = 11, c13;*/ /*int c20 = 12, c21 = 13, c22 = 14, c23;*/ /*int c30, c31, c32, c33;*/ /*int c40, c41, c42, c43;*/ /*int c0, c2, c3, c4;*/ int onereg = 15; if ( lay == 101 ) { #if 0 if (i_packed_trsm_desc->side == 'L' || i_packed_trsm_desc->side == 'l' ) side = 'R'; else side = 'L'; if (i_packed_trsm_desc->uplo == 'L' || i_packed_trsm_desc->uplo == 'l' ) uplo = 'U'; else uplo = 'L'; m1 = n; n1 = m; #endif tra = 1; trb = 1; trc = 1; } #if defined(GENERATOR_PACKED_GETRF_DEBUG) printf("Inside libxsmm_generator_packed_getrf_avx_avx512_kernel: m=%d n=%d lay=%d lda=%d datasz=%d\n",m,n,lay,lda,datasz); #endif if ( ( datasz !=4 ) && (datasz != 8) ) { fprintf(stderr,"Expecting a datasize of 4 or 8 but got %u\n",datasz); exit(-1); } if ( avx512 < 0 || avx512 > 2 ) { fprintf(stderr,"Expecting an integer between 0 and 2 for avx512, got %i\n",avx512); exit(-1); } if ( datasz == 4 && avx512 == 0 ) { numb = 8; regset = 'y'; } else if ( datasz == 8 && avx512 == 0 ) { numb = 4; regset = 'y'; } else if ( datasz == 4 && avx512 > 0 ) { numb = 16; regset = 'z'; iunroll = 4; junroll = 4; onereg = 25; } else if ( datasz == 8 && avx512 > 0 ) { numb = 
8; regset = 'z'; iunroll = 4; junroll = 4; onereg = 25; } /* Determine ideal blocksizes: */ nb = 2; if ( m1 <= 3 ) nb = 1; if ( n1 <= 2 ) nb = 1; mn = LIBXSMM_MIN(m1,n1); if ( mn >= 6 ) nb = 3; if ( mn >= 12 ) nb = 4; compact_set_one_ ( io_code, onereg, numb, datasz, regset ); #if 0 compact_store_matrix_gen_ ( io_code, tra, lda, 1, 1, onereg, numb, datasz, regset, l_matrix_gpreg ); mn=0; #endif for ( ii = 1; ii <= mn; ii += nb ) { bot = LIBXSMM_MIN(ii+nb-1,mn); for ( j = ii; j <= bot; j++ ) { for ( i = j+1; i <= m1; i++ ) { if ( i == j+1 ) { compact_load_matrix_gen_ ( io_code, tra, lda, j, j, a0, numb, datasz, regset, l_matrix_gpreg ); compact_divide_two_nums_ ( io_code, onereg, a0, a0, numb, regset ); } compact_load_matrix_gen_ ( io_code, tra, lda, i, j, a1, numb, datasz, regset, l_matrix_gpreg ); compact_mult_two_nums_ ( io_code, a0, a1, a1, numb, regset ); fincol = bot; if ( i <= bot ) fincol = n1; for ( k = j+1; k <= fincol; k++ ) { compact_load_matrix_gen_ ( io_code, tra, lda, i, k, a2, numb, datasz, regset, l_matrix_gpreg ); compact_load_matrix_gen_ ( io_code, tra, lda, j, k, b0, numb, datasz, regset, l_matrix_gpreg ); compact_fms_cminusab_ ( io_code, a2, a1, b0, numb, regset ); compact_store_matrix_gen_ ( io_code, tra, lda, i, k, a2, numb, datasz, regset, l_matrix_gpreg ); } compact_store_matrix_gen_ ( io_code, tra, lda, i, j, a1, numb, datasz, regset, l_matrix_gpreg ); } } if ( (bot < m1) && (bot < n1) ) { /* * Solve bottom right A22 part with a DGEMM("Notrans","Notrans",m-bot,n-bot,bot-ii+1,-1.0,A(bot+1,ii),lda,A(ii,bot+1),lda,1.0,A(bot+1,bot+1),lda) * A(bot+1:m,bot+1:n) = A(bot+1:m,bot+1:n) - A(bot+1:m,ii:bot)*A(ii:bot,bot+1:n); * */ compact_gemmnn_(tra,trb,trc,bot+1,m1,ii,bot,ii,bot,bot+1,n1,bot+1,m1,bot+1,n1,none,l_matrix_gpreg,lda,l_matrix_gpreg,lda,one,l_matrix_gpreg,lda,io_code,numb,regset,iunroll,junroll,loopi,loopj); } /* Nonempty DGEMM conditional */ } /* Main loop for LU */ } { int i = io_code->code_size; buf[i++] = 0xc3; /* retq */ 
io_code->code_size = i; } /* close asm: note that we really didn't need to push everything */ /* libxsmm_x86_instruction_close_stream_transpose( io_code, i_arch ); */ #if 0 #define DEBUG_GIVE_BYTE_CODE_OUTPUT #endif #ifdef DEBUG_GIVE_BYTE_CODE_OUTPUT buf = (unsigned char *) io_code->generated_code; printf("#Final Routine: \n"); for ( i = 0; i < io_code->code_size; i+=8 ) { printf("#\tBytes %d-%d\n",i,i+7); printf(".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",buf[i],buf[i+1],buf[i+2],buf[i+3],buf[i+4],buf[i+5],buf[i+6],buf[i+7]); } #endif } libxsmm-1.17/src/generator_packed_getrf_avx_avx512.h000066400000000000000000000024061415223013700225250ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry, Timothy Costa (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_PACKED_GETRF_AVX_AVX512_H #define GENERATOR_PACKED_GETRF_AVX_AVX512_H #include "generator_common.h" LIBXSMM_API_INTERN void libxsmm_generator_packed_getrf_avx_avx512_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_getrf_descriptor* i_packed_getrf_desc, const char* i_arch ); #endif /*GENERATOR_PACKED_GETRF_AVX_AVX512_H*/ libxsmm-1.17/src/generator_packed_trmm_avx_avx512.c000066400000000000000000000670751415223013700224050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. 
* * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry, Hans Pabst, Timothy Costa (Intel Corp.) ******************************************************************************/ #include "generator_packed_trmm_avx_avx512.h" #include "generator_x86_instructions.h" #include "generator_packed_aux.h" #include "generator_common.h" #include "libxsmm_main.h" #if 0 # define GENERATOR_PACKED_TRMM_DEBUG #endif LIBXSMM_API_INTERN void libxsmm_generator_packed_trmm_avx_avx512_kernel( libxsmm_generated_code* io_code, const libxsmm_trmm_descriptor* i_packed_trmm_desc, const char* i_arch ) { unsigned char *const buf = (unsigned char *) io_code->generated_code; libxsmm_loop_label_tracker l_loop_label_tracker /*= { 0 }*/; /* avx512 just represents whether we want to use zmm registers or not * * A value of 0 says not, a value of 1 targets AVX512_CORE, a value * * of 2 targets AVX512_MIC */ int avx512; #if 0 /* TOD: introduce/use register mapping rather than directly/hard-coding registers */ /* Just reuse transpose gp mapping */ libxsmm_trmm_gp_reg_mapping l_gp_reg_mapping = { 0/*avoid warning "maybe used uninitialized" */ }; /* define gp register mapping */ #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_lda = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_ldb = LIBXSMM_X86_GP_REG_R9; l_gp_reg_mapping.gp_reg_m_loop = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_n_loop = LIBXSMM_X86_GP_REG_RSI; #else /* match calling convention on Linux */ l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_lda = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_ldb = 
LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_m_loop = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_n_loop = LIBXSMM_X86_GP_REG_R9; #endif l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF; /* Actually, the logic is this: we need a, lda, and b. We don't need ldb * * If n>=6, we need rbx * * If n>=8, we need rbp * * If LIBXSMM_MIN(n,REGSIZE)>=5 and m%REGSIZE==1, we need r12 * * If LIBXSMM_MIN(n,REGSIZE)>=6 and m%REGSIZE==1, we need r13 * * If LIBXSMM_MIN(n,REGSIZE)>=7 and m%REGSIZE==1, we need r14 * * If LIBXSMM_MIN(n,REGSIZE)>=8 and m%REGSIZE==1, we need r15 * * Otherwise, we get by with registers that don't require pushing/popping */ #endif /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* define transposition kernel config */ if (strcmp(i_arch, "skx") == 0) { avx512 = 1; } else if (strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0) { avx512 = 2; } else if (strcmp(i_arch, "snb") == 0 || strcmp(i_arch, "hsw") == 0) { avx512 = 0; } else { LIBXSMM_HANDLE_ERROR( io_code, LIBXSMM_ERR_UNSUP_ARCH ); return; } /* @Greg add more fields here */ /* @Greg add generator code here, please use functions defined in generator_x86_instructions.h */ /* Todo-> I first want this code to work, and verify it works, then I can * convert one instruction at a time to those in * generator_x86_instructions.h. 
Or add to the existing instructions */ if ( io_code->code_type > 1 ) { unsigned int i = io_code->code_size; unsigned int m = i_packed_trmm_desc->m; unsigned int n = i_packed_trmm_desc->n; unsigned int lda = i_packed_trmm_desc->lda; unsigned int ldb = i_packed_trmm_desc->ldb; char trans = i_packed_trmm_desc->transa; char side = i_packed_trmm_desc->side; char uplo = i_packed_trmm_desc->uplo; char diag = i_packed_trmm_desc->diag; #if defined(_WIN32) || defined(__CYGWIN__) unsigned char areg = LIBXSMM_X86_GP_REG_RCX; unsigned char breg = LIBXSMM_X86_GP_REG_RDX; #else unsigned char areg = LIBXSMM_X86_GP_REG_RDI; unsigned char breg = LIBXSMM_X86_GP_REG_RSI; #endif const unsigned int lay = (unsigned int)i_packed_trmm_desc->layout; unsigned int datasz = (unsigned int)i_packed_trmm_desc->typesize; const double alpha = (8 == datasz ? i_packed_trmm_desc->alpha.d : ((double)i_packed_trmm_desc->alpha.s)); unsigned int m1=m, n1=n; unsigned int j, k; /*int REGSIZE;*/ int numb = 0; /*int scalealpha = 0;*/ int nounit=0; char regset = 'y'; if ( lay == 101 ) { if (i_packed_trmm_desc->side == 'L' || i_packed_trmm_desc->side == 'l' ) side = 'R'; else side = 'L'; if (i_packed_trmm_desc->uplo == 'L' || i_packed_trmm_desc->uplo == 'l' ) uplo = 'U'; else uplo = 'L'; m1 = n; n1 = m; } #ifdef GENERATOR_PACKED_TRMM_DEBUG printf("Inside libxsmm_generator_packed_trmm_avx_avx512_kernel: %c%c%c%c m=%d n=%d lay=%d alpha=%g datasz=%d\n",side,uplo,trans,diag,m1,n1,lay,alpha,datasz); #endif if ( ( datasz !=4 ) && (datasz != 8) ) { fprintf(stderr,"Expecting a datasize of 4 or 8 but got %u\n",datasz); exit(-1); } if ( avx512 < 0 || avx512 > 2 ) { fprintf(stderr,"Expecting an integer between 0 and 2 for avx512, got %i\n",avx512); exit(-1); } if ( datasz == 4 && avx512 == 0 ) { numb = 8; regset = 'y'; } else if ( datasz == 8 && avx512 == 0 ) { numb = 4; regset = 'y'; } else if ( datasz == 4 && avx512 > 0 ) { numb = 16; regset = 'z'; } else if ( datasz == 8 && avx512 > 0 ) { numb = 8; regset = 'z'; } if ( 
LIBXSMM_FEQ(0, alpha) ) { compact_set_zero_ ( io_code, 0, numb, datasz, regset ); for ( j = 1; j <= n1; j++ ) { for ( i = 1; i <= m1; i++ ) { compact_store_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); } } i = io_code->code_size; buf[i++] = 0xc3; /* retq */ io_code->code_size = i; return; } if ( LIBXSMM_NEQ(1, alpha) ) { compact_load_parameter_ ( io_code, alpha, 2, numb, regset ); } nounit = ( (diag=='N') || (diag=='n') ); if ( (side=='L') || (side=='l') ) { if ( (trans=='N') || (trans=='n') ) { if ( (uplo=='U') || (uplo=='u') ) { /* Do LUN* cases: B<- alpha*inv(A)*B */ for ( j = 1; j <= n1; j+=3 ) { for ( k = 1; k <= m1; k++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, k, j, 0, numb, datasz, regset, breg ); if ( j+1 <= n1 ) compact_load_matrix_gen_ ( io_code, 0, ldb, k, j+1, 4, numb, datasz, regset, breg ); if ( j+2 <= n1 ) compact_load_matrix_gen_ ( io_code, 0, ldb, k, j+2, 7, numb, datasz, regset, breg ); if ( LIBXSMM_NEQ(1, alpha) ) { compact_mult_two_nums_ ( io_code, 0, 2, 0, numb, regset ); if ( j+1 <= n1 ) compact_mult_two_nums_ ( io_code, 4, 2, 4, numb, regset ); if ( j+2 <= n1 ) compact_mult_two_nums_ ( io_code, 7, 2, 7, numb, regset ); } for ( i = 1; i <= k-1; i++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j, 3, numb, datasz, regset, breg ); compact_load_matrix_gen_ ( io_code, 0, lda, i, k, 1, numb, datasz, regset, areg ); compact_fma_cplusab_ ( io_code, 3, 0, 1, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, j, 3, numb, datasz, regset, breg ); if ( j+1 <= n1 ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j+1, 6, numb, datasz, regset, breg ); compact_fma_cplusab_ ( io_code, 6, 4, 1, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, j+1, 6, numb, datasz, regset, breg ); } if ( j+2 <= n1 ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j+2, 8, numb, datasz, regset, breg ); compact_fma_cplusab_ ( io_code, 8, 7, 1, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, j+2, 8, 
numb, datasz, regset, breg ); } } if ( nounit ) { compact_load_matrix_gen_ ( io_code, 0, lda, k, k, 1, numb, datasz, regset, areg ); compact_mult_two_nums_ ( io_code, 0, 1, 0, numb, regset ); if ( j+1 <= n1 ) { compact_mult_two_nums_ ( io_code, 4, 1, 4, numb, regset ); } if ( j+2 <= n1 ) { compact_mult_two_nums_ ( io_code, 7, 1, 7, numb, regset ); } } compact_store_matrix_gen_ ( io_code, 0, ldb, k, j, 0, numb, datasz, regset, breg ); if ( j+1 <= n1 ) { compact_store_matrix_gen_ ( io_code, 0, ldb, k, j+1, 4, numb, datasz, regset, breg ); } if ( j+2 <= n1 ) { compact_store_matrix_gen_ ( io_code, 0, ldb, k, j+2, 7, numb, datasz, regset, breg ); } } } } else { /* Do LLN* cases: B <- alpha * inv(A)*B */ for ( j = 1; j <= n1; j+=3 ) { for ( k = m1; k >= 1; k-- ) { compact_load_matrix_gen_ ( io_code, 0, ldb, k, j, 0, numb, datasz, regset, breg ); if ( j+1<=n1 ) compact_load_matrix_gen_ ( io_code, 0, ldb, k, j+1, 4, numb, datasz, regset, breg ); if ( j+2<=n1 ) compact_load_matrix_gen_ ( io_code, 0, ldb, k, j+2, 7, numb, datasz, regset, breg ); if ( LIBXSMM_NEQ(1, alpha) ) { compact_mult_two_nums_ ( io_code, 0, 2, 0, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, k, j, 0, numb, datasz, regset, breg ); if ( j+1 <= n1 ) { compact_mult_two_nums_ ( io_code, 4, 2, 4, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, k, j+1, 4, numb, datasz, regset, breg ); } if ( j+2 <= n1 ) { compact_mult_two_nums_ ( io_code, 7, 2, 7, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, k, j+2, 7, numb, datasz, regset, breg ); } } if ( nounit ) { compact_load_matrix_gen_ ( io_code, 0, lda, k, k, 1, numb, datasz, regset, areg ); compact_mult_two_nums_ ( io_code, 0, 1, 3, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, k, j, 3, numb, datasz, regset, breg ); if ( j+1 <= n1 ) { compact_mult_two_nums_ ( io_code, 4, 1, 6, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, k, j+1, 6, numb, datasz, regset, breg ); } if ( j+2 <= n1 ) { 
compact_mult_two_nums_ ( io_code, 7, 1, 8, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, k, j+2, 8, numb, datasz, regset, breg ); } } for ( i = k+1; i <= m1; i++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j, 3, numb, datasz, regset, breg ); compact_load_matrix_gen_ ( io_code, 0, lda, i, k, 1, numb, datasz, regset, areg ); compact_fma_cplusab_ ( io_code, 3, 0, 1, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, j, 3, numb, datasz, regset, breg ); if ( j+1 <= n1 ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j+1, 6, numb, datasz, regset, breg); compact_fma_cplusab_ ( io_code, 6, 4, 1, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, j+1, 6, numb, datasz, regset, breg ); } if ( j+2 <= n1 ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j+2, 8, numb, datasz, regset, breg); compact_fma_cplusab_ ( io_code, 8, 7, 1, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, j+2, 8, numb, datasz, regset, breg ); } } /* for i LLN main loop */ } /* for k LLN loop */ } /* for j LLN loop */ } /* uplo */ } else { if ( (uplo=='U') || (uplo=='u') ) { /* Do LUT* cases: B<- alpha*A^T*B */ for ( j = 1; j <= n1; j+=3 ) { for ( i = m1; i >= 1; i-- ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); if ( j+1 <= n1 ) compact_load_matrix_gen_ ( io_code, 0, ldb, i, j+1, 4, numb, datasz, regset, breg ); if ( j+2 <= n1 ) compact_load_matrix_gen_ ( io_code, 0, ldb, i, j+2, 7, numb, datasz, regset, breg ); if ( nounit ) { compact_load_matrix_gen_ ( io_code, 0, lda, i, i, 1, numb, datasz, regset, areg ); compact_mult_two_nums_ ( io_code, 0, 1, 0, numb, regset ); if ( j+1 <= n1 ) compact_mult_two_nums_ ( io_code, 4, 1, 4, numb, regset ); if ( j+2 <= n1 ) compact_mult_two_nums_ ( io_code, 7, 1, 7, numb, regset ); } for ( k = 1; k <= i-1; k++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, k, j, 3, numb, datasz, regset, breg ); compact_load_matrix_gen_ ( io_code, 0, lda, k, i, 1, numb, datasz, 
regset, areg ); compact_fma_cplusab_ ( io_code, 0, 1, 3, numb, regset ); if ( j+1 <= n1 ) { compact_load_matrix_gen_ ( io_code, 0, ldb, k, j+1, 6, numb, datasz, regset, breg ); compact_fma_cplusab_ ( io_code, 4, 1, 6, numb, regset ); } if ( j+2 <= n1 ) { compact_load_matrix_gen_ ( io_code, 0, ldb, k, j+2, 8, numb, datasz, regset, breg ); compact_fma_cplusab_ ( io_code, 7, 1, 8, numb, regset ); } } if ( LIBXSMM_NEQ(1, alpha) ) { compact_mult_two_nums_ ( io_code, 0, 2, 0, numb, regset ); if ( j+1 <= n1 ) compact_mult_two_nums_ ( io_code, 4, 2, 4, numb, regset ); if ( j+2 <= n1 ) compact_mult_two_nums_ ( io_code, 7, 2, 7, numb, regset ); } compact_store_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); if ( j+1 <= n1 ) compact_store_matrix_gen_ ( io_code, 0, ldb, i, j+1, 4, numb, datasz, regset, breg ); if ( j+2 <= n1 ) compact_store_matrix_gen_ ( io_code, 0, ldb, i, j+2, 7, numb, datasz, regset, breg ); } } } else { /* Do LLT* cases: B <- alpha * A*B */ for ( j = 1; j <= n1; j+=3 ) { for ( i = 1; i <= m1; i++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); if ( j+1 <= n1 ) compact_load_matrix_gen_ ( io_code, 0, ldb, i, j+1, 4, numb, datasz, regset, breg ); if ( j+2 <= n1 ) compact_load_matrix_gen_ ( io_code, 0, ldb, i, j+2, 7, numb, datasz, regset, breg ); if ( nounit ) { compact_load_matrix_gen_ ( io_code, 0, lda, i, i, 1, numb, datasz, regset, areg ); compact_mult_two_nums_ ( io_code, 0, 1, 0, numb, regset ); if ( j+1 <= n1 ) compact_mult_two_nums_ ( io_code, 4, 1, 4, numb, regset ); if ( j+2 <= n1 ) compact_mult_two_nums_ ( io_code, 7, 1, 7, numb, regset ); } for ( k = i+1; k <= m1; k++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, k, j, 3, numb, datasz, regset, breg ); compact_load_matrix_gen_ ( io_code, 0, lda, k, i, 1, numb, datasz, regset, areg ); compact_fma_cplusab_ ( io_code, 0, 1, 3, numb, regset ); if ( j+1 <= n1 ) { compact_load_matrix_gen_ ( io_code, 0, ldb, k, j+1, 6, numb, datasz, regset, 
breg ); compact_fma_cplusab_ ( io_code, 4, 1, 6, numb, regset ); } if ( j+2 <= n1 ) { compact_load_matrix_gen_ ( io_code, 0, ldb, k, j+2, 8, numb, datasz, regset, breg ); compact_fma_cplusab_ ( io_code, 7, 1, 8, numb, regset ); } } if ( LIBXSMM_NEQ(1, alpha) ) { compact_mult_two_nums_ ( io_code, 0, 2, 0, numb, regset ); if ( j+1 <= n1 ) compact_mult_two_nums_ ( io_code, 4, 2, 4, numb, regset ); if ( j+2 <= n1 ) compact_mult_two_nums_ ( io_code, 7, 2, 7, numb, regset ); } compact_store_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); if ( j+1 <= n1 ) compact_store_matrix_gen_ ( io_code, 0, ldb, i, j+1, 4, numb, datasz, regset, breg ); if ( j+2 <= n1 ) compact_store_matrix_gen_ ( io_code, 0, ldb, i, j+2, 7, numb, datasz, regset, breg ); } } } /* uplo */ } /* trans */ } else { if ( (trans=='N') || (trans=='n') ) { if ( (uplo=='U') || (uplo=='u') ) { /* Do RUN* cases: B<- alpha*B*A */ for ( j = n1; j >= 1; j-- ) { if ( nounit ) { compact_load_matrix_gen_ ( io_code, 0, lda, j, j, 1, numb, datasz, regset, areg ); } if ( LIBXSMM_NEQ(1, alpha) ) { if ( nounit ) { compact_mult_two_nums_ ( io_code, 1, 2, 1, numb, regset ); } #ifdef GENERATOR_PACKED_TRMM_DEBUG else { printf("wrong temp values for TRMM's RUN\n"); } #endif } if ( LIBXSMM_NEQ(1, alpha) || nounit ) { for ( i = 1; i <= m1; i++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); compact_mult_two_nums_ ( io_code, 0, 1, 0, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); } } for ( k = 1; k <= j - 1; k++ ) { compact_load_matrix_gen_ ( io_code, 0, lda, k, j, 1, numb, datasz, regset, areg ); if ( LIBXSMM_NEQ(1, alpha) ) { compact_mult_two_nums_ ( io_code, 1, 2, 1, numb, regset ); } for ( i = 1; i <= m1; i++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); compact_load_matrix_gen_ ( io_code, 0, ldb, i, k, 3, numb, datasz, regset, breg ); compact_fma_cplusab_ ( io_code, 0, 1, 3, 
numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); } } } } else { /* Do RLN* cases: B <- alpha * B * A */ for ( j = 1; j <= n1; j++ ) { if ( nounit ) { compact_load_matrix_gen_ ( io_code, 0, lda, j, j, 1, numb, datasz, regset, areg ); } if ( LIBXSMM_NEQ(1, alpha) ) { if ( nounit ) { compact_mult_two_nums_ ( io_code, 1, 2, 1, numb, regset ); } #ifdef GENERATOR_PACKED_TRMM_DEBUG else { printf("wrong temp values for TRMM's RLN\n"); } #endif } if ( LIBXSMM_NEQ(1, alpha) || nounit ) { for ( i = 1; i <= m1; i++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); compact_mult_two_nums_ ( io_code, 0, 1, 0, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); } } for ( k = j+1; k <= n1; k++ ) { compact_load_matrix_gen_ ( io_code, 0, lda, k, j, 1, numb, datasz, regset, areg ); if ( LIBXSMM_NEQ(1, alpha) ) { compact_mult_two_nums_ ( io_code, 1, 2, 1, numb, regset ); } for ( i = 1; i <= m1; i++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); compact_load_matrix_gen_ ( io_code, 0, ldb, i, k, 3, numb, datasz, regset, breg ); compact_fma_cplusab_ ( io_code, 0, 1, 3, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); } } } } /* uplo */ } else { if ( (uplo=='U') || (uplo=='u') ) { /* Do RUT* cases: B<- alpha*B *A^T */ for ( k = 1; k <= n1; k++ ) { for ( j = 1; j <= k-1; j++ ) { compact_load_matrix_gen_ ( io_code, 0, lda, j, k, 1, numb, datasz, regset, areg ); if ( LIBXSMM_NEQ(1, alpha) ) { compact_mult_two_nums_ ( io_code, 1, 2, 1, numb, regset ); } for ( i = 1; i <= m1; i++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); compact_load_matrix_gen_ ( io_code, 0, ldb, i, k, 3, numb, datasz, regset, breg ); compact_fma_cplusab_ ( io_code, 0, 1, 3, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, 
regset, breg ); } } if ( nounit ) { compact_load_matrix_gen_ ( io_code, 0, lda, k, k, 1, numb, datasz, regset, areg ); } if ( LIBXSMM_NEQ(1, alpha) ) { if ( nounit ) { compact_mult_two_nums_ ( io_code, 1, 2, 1, numb, regset ); } #ifdef GENERATOR_PACKED_TRMM_DEBUG else { printf("wrong temp values for TRMM's RUT\n"); } #endif } if ( LIBXSMM_NEQ(1, alpha) || nounit ) { for ( i = 1; i <= m1; i++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, k, 0, numb, datasz, regset, breg ); compact_mult_two_nums_ ( io_code, 0, 1, 0, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, k, 0, numb, datasz, regset, breg ); } } } } else { /* Do RLT* cases: B <- alpha * B *inv(A^T) */ for ( k = n1; k >= 1; k-- ) { for ( j = k+1; j <= n1; j++ ) { compact_load_matrix_gen_ ( io_code, 0, lda, j, k, 1, numb, datasz, regset, areg ); if ( LIBXSMM_NEQ(1, alpha) ) { compact_mult_two_nums_ ( io_code, 1, 2, 1, numb, regset ); } for ( i = 1; i <= m1; i++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); compact_load_matrix_gen_ ( io_code, 0, ldb, i, k, 3, numb, datasz, regset, breg ); compact_fma_cplusab_ ( io_code, 0, 1, 3, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, j, 0, numb, datasz, regset, breg ); } } if ( nounit ) { compact_load_matrix_gen_ ( io_code, 0, lda, k, k, 1, numb, datasz, regset, areg ); } if ( LIBXSMM_NEQ(1, alpha) ) { if ( nounit ) { compact_mult_two_nums_ ( io_code, 1, 2, 1, numb, regset ); } #ifdef GENERATOR_PACKED_TRMM_DEBUG else { printf("wrong temp values for TRMM's RLT\n"); } #endif } if ( LIBXSMM_NEQ(1, alpha) || nounit ) { for ( i = 1; i <= m1; i++ ) { compact_load_matrix_gen_ ( io_code, 0, ldb, i, k, 0, numb, datasz, regset, breg ); compact_mult_two_nums_ ( io_code, 0, 1, 0, numb, regset ); compact_store_matrix_gen_ ( io_code, 0, ldb, i, k, 0, numb, datasz, regset, breg ); } } } } /* uplo */ } /* trans */ } /* side */ } { int i = io_code->code_size; buf[i++] = 0xc3; /* retq */ io_code->code_size = i; 
} /* close asm: note that we really didn't need to push everything */ /* libxsmm_x86_instruction_close_stream_transpose( io_code, i_arch ); */ #if 0 #define DEBUG_GIVE_BYTE_CODE_OUTPUT #endif #ifdef DEBUG_GIVE_BYTE_CODE_OUTPUT buf = (unsigned char *) io_code->generated_code; printf("#Final Routine: \n"); for ( i = 0; i < io_code->code_size; i+=8 ) { printf("#\tBytes %d-%d\n",i,i+7); printf(".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",buf[i],buf[i+1],buf[i+2],buf[i+3],buf[i+4],buf[i+5],buf[i+6],buf[i+7]); } #endif } libxsmm-1.17/src/generator_packed_trmm_avx_avx512.h000066400000000000000000000023771415223013700224040ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry, Timothy Costa (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_PACKED_TRMM_AVX_AVX512_H #define GENERATOR_PACKED_TRMM_AVX_AVX512_H #include "generator_common.h" LIBXSMM_API_INTERN void libxsmm_generator_packed_trmm_avx_avx512_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_trmm_descriptor* i_packed_trmm_desc, const char* i_arch ); #endif /*GENERATOR_PACKED_TRMM_AVX_AVX512_H*/ libxsmm-1.17/src/generator_packed_trsm_avx_avx512.c000066400000000000000000001253341415223013700224040ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry, Hans Pabst, Timothy Costa (Intel Corp.) ******************************************************************************/ #include "generator_packed_trsm_avx_avx512.h" #include "generator_x86_instructions.h" #include "generator_packed_aux.h" #include "generator_common.h" #include "libxsmm_main.h" #if 0 # define GENERATOR_PACKED_TRSM_DEBUG #endif LIBXSMM_API_INTERN void libxsmm_generator_packed_trsm_avx_avx512_kernel( libxsmm_generated_code* io_code, const libxsmm_trsm_descriptor* i_packed_trsm_desc, const char* i_arch ) { unsigned char *const buf = (unsigned char *) io_code->generated_code; libxsmm_loop_label_tracker l_loop_label_tracker /*= { 0 }*/; /* avx512 just represents whether we want to use zmm registers or not * * A value of 0 says not, a value of 1 targets AVX512_CORE, a value * * of 2 targets AVX512_MIC */ int avx512; #if 0 /* TOD: introduce/use register mapping rather than directly/hard-coding registers */ /* Just reuse transpose gp mapping */ libxsmm_trsm_gp_reg_mapping l_gp_reg_mapping = { 0/*avoid warning "maybe used uninitialized" */ }; /* define gp register mapping */ #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_lda = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_ldb = LIBXSMM_X86_GP_REG_R9; l_gp_reg_mapping.gp_reg_m_loop = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_n_loop = LIBXSMM_X86_GP_REG_RSI; #else /* match calling convention on Linux */ l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_lda = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_ldb = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_m_loop = LIBXSMM_X86_GP_REG_R8; 
l_gp_reg_mapping.gp_reg_n_loop = LIBXSMM_X86_GP_REG_R9; #endif l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF; /* Actually, the logic is this: we need a, lda, and b. We don't need ldb * * If n>=6, we need rbx * * If n>=8, we need rbp * * If LIBXSMM_MIN(n,REGSIZE)>=5 and m%REGSIZE==1, we need r12 * * If LIBXSMM_MIN(n,REGSIZE)>=6 and m%REGSIZE==1, we need r13 * * If LIBXSMM_MIN(n,REGSIZE)>=7 and m%REGSIZE==1, we need r14 * * If LIBXSMM_MIN(n,REGSIZE)>=8 and m%REGSIZE==1, we need r15 * * Otherwise, we get by with registers that don't require pushing/popping */ #endif /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* define transposition kernel config */ if (strcmp(i_arch, "skx") == 0) { avx512 = 1; } else if (strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0) { avx512 = 2; } else if (strcmp(i_arch, "snb") == 0 || strcmp(i_arch, "hsw") == 0) { avx512 = 0; } else { LIBXSMM_HANDLE_ERROR( io_code, LIBXSMM_ERR_UNSUP_ARCH ); return; } /* @Greg add more fields here */ /* @Greg add generator code here, please use functions defined in generator_x86_instructions.h */ /* Todo-> I first want this code to work, and verify it works, then I can * convert one instruction at a time to those in * generator_x86_instructions.h. 
Or add to the existing instructions */ if (NULL == buf) { LIBXSMM_HANDLE_ERROR(io_code, LIBXSMM_ERR_BUFFER_TOO_SMALL); return; } if ( io_code->code_type > 1 ) { unsigned int i = io_code->code_size; unsigned int m = i_packed_trsm_desc->m; unsigned int n = i_packed_trsm_desc->n; unsigned int lda = i_packed_trsm_desc->lda; unsigned int ldb = i_packed_trsm_desc->ldb; char trans = i_packed_trsm_desc->transa; char side = i_packed_trsm_desc->side; char uplo = i_packed_trsm_desc->uplo; char diag = i_packed_trsm_desc->diag; const unsigned int layout = (unsigned int)i_packed_trsm_desc->layout; unsigned int datasz = (unsigned int)i_packed_trsm_desc->typesize; const double alpha = (8 == datasz ? i_packed_trsm_desc->alpha.d : ((double)i_packed_trsm_desc->alpha.s)); unsigned int m1=m, n1=n; unsigned int j, k; /*int REGSIZE;*/ int numb = 0; int scalealpha = 0; int nounit=0; char regset = 'y'; if ( layout == 101 ) { if (i_packed_trsm_desc->side == 'L' || i_packed_trsm_desc->side == 'l' ) side = 'R'; else side = 'L'; if (i_packed_trsm_desc->uplo == 'L' || i_packed_trsm_desc->uplo == 'l' ) uplo = 'U'; else uplo = 'L'; m1 = n; n1 = m; } #ifdef GENERATOR_PACKED_TRSM_DEBUG printf("Inside libxsmm_generator_packed_trsm_avx_avx512_kernel: %c%c%c%c m=%d n=%d lay=102 alpha=%g datasz=%d\n",side,uplo,trans,diag,m1,n1,alpha,datasz); #endif if ( ( datasz !=4 ) && (datasz != 8) ) { fprintf(stderr,"Expecting a datasize of 4 or 8 but got %u\n",datasz); exit(-1); } if ( avx512 < 0 ) { fprintf(stderr,"Expecting a nonnegative number for avx512: %i\n",avx512); exit(-1); } if ( datasz == 4 && avx512 == 0 ) { numb = 8; regset = 'y'; } else if ( datasz == 8 && avx512 == 0 ) { numb = 4; regset = 'y'; } else if ( datasz == 4 && avx512 > 0 ) { numb = 16; regset = 'z'; } else if ( datasz == 8 && avx512 > 0 ) { numb = 8; regset = 'z'; } if ( LIBXSMM_FEQ(0, alpha) ) { compact_set_zero_ ( io_code, 0, numb, datasz, regset ); for ( j = 1; j <= n1; j++ ) { for ( i = 1; i <= m1; i++ ) { compact_store_matrix2_ ( 
io_code, ldb, i, j, 0, numb, datasz, regset ); } } i = io_code->code_size; buf[i++] = 0xc3; /* retq */ io_code->code_size = i; return; } if ( LIBXSMM_NEQ(1, alpha) ) { compact_load_parameter_ ( io_code, alpha, 2, numb, regset ); } nounit = ( (diag=='N') || (diag=='n') ); if ( (side=='L') || (side=='l') ) { if ( (trans=='N') || (trans=='n') ) { if ( (uplo=='U') || (uplo=='u') ) { /* Do LUN* cases: B<- alpha*inv(A)*B */ if ( nounit ) { compact_set_one_ ( io_code, 15, numb, datasz, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix1_ ( io_code, lda, i, i, 3, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 15, 3, 3 , numb, regset ); compact_store_matrix3_ ( io_code, m1, i, 1, 3, numb, datasz, regset ); } } for ( j = 1; j <= n1; j++ ) { if ( LIBXSMM_NEQ(1, alpha) ) { for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 0, 2, 0, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); } } for ( k = m1; k >= 1; k-- ) { compact_load_matrix2_ ( io_code, ldb, k, j, 0, numb, datasz, regset ); if ( nounit ) { #if 0 compact_load_matrix1_ ( io_code, lda, k, k, 1, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 0, 1, 0, numb, regset ); #else compact_load_matrix3_ ( io_code, m1, k, 1, 1, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 0, 1, 0, numb, regset ); #endif compact_store_matrix2_ ( io_code, ldb, k, j, 0, numb, datasz, regset ); } for ( i = 1; i <= k-1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, j, 1, numb, datasz, regset ); compact_load_matrix1_ ( io_code, lda, i, k, 3, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 1, 0, 3, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j, 1, numb, datasz, regset ); } } } } else { /* Do LLN* cases: B <- alpha * inv(A)*B */ #if 0 #define USE_XCT_LLNN #endif #ifdef USE_XCT_LLNN int done = 0; if ( (avx512==0) && (alpha==1.0) ) { #include 
"generator_compact_xct_avx2_lln.h" done = 1; } if ( done == 0 ) #endif { /* Do LLN* cases: B <- alpha * inv(A)*B */ if ( nounit ) { compact_set_one_ ( io_code, 15, numb, datasz, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix1_ ( io_code, lda, i, i, 3, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 15, 3, 3 , numb, regset ); compact_store_matrix3_ ( io_code, m1, i, 1, 3, numb, datasz, regset ); } } for ( j = 1; j <= n1; j+=3 ) { for ( k = 1; k <= m1; k+=2 ) { scalealpha = 0; if ( LIBXSMM_NEQ(1, alpha) && (k==1) ) scalealpha = 1; compact_load_matrix2_ ( io_code, ldb, k, j, 0, numb, datasz, regset ); if ( j+1 <= n1 ) compact_load_matrix2_ ( io_code, ldb, k, j+1, 4, numb, datasz, regset ); if ( j+2 <= n1 ) compact_load_matrix2_ ( io_code, ldb, k, j+2, 7, numb, datasz, regset ); if ( scalealpha == 1 ) { compact_mult_two_nums_ ( io_code, 0, 2, 0, numb, regset ); if ( j+1 <= n1 ) compact_mult_two_nums_ ( io_code, 4, 2, 4, numb, regset ); if ( j+2 <= n1 ) compact_mult_two_nums_ ( io_code, 7, 2, 7, numb, regset ); } if ( nounit ) { compact_load_matrix3_ ( io_code, m1, k, 1, 1, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 0, 1, 0, numb, regset ); compact_store_matrix2_ ( io_code, ldb, k, j, 0, numb, datasz, regset ); if ( j+1 <= n1 ) { compact_mult_two_nums_ ( io_code, 4, 1, 4, numb, regset ); compact_store_matrix2_ ( io_code, ldb, k, j+1, 4, numb, datasz, regset ); } if ( j+2 <= n1 ) { compact_mult_two_nums_ ( io_code, 7, 1, 7, numb, regset ); compact_store_matrix2_ ( io_code, ldb, k, j+2, 7, numb, datasz, regset ); } } if ( k+1 <= m1 ) { compact_load_matrix2_ ( io_code, ldb, k+1, j, 10, numb, datasz, regset ); if ( scalealpha == 1 ) { compact_mult_two_nums_ ( io_code, 10, 2, 10, numb, regset ); } compact_load_matrix1_ ( io_code, ldb, k+1, k, 3, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 10, 0, 3, numb, regset ); if ( j+1 <= n1 ) { compact_load_matrix2_ ( io_code, ldb, k+1, j+1, 14, numb, datasz, regset ); if ( scalealpha 
== 1 ) { compact_mult_two_nums_ ( io_code, 14, 2, 14, numb, regset ); } compact_load_matrix1_ ( io_code, ldb, k+1, k, 3, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 14, 4, 3, numb, regset ); } if ( j+2 <= n1 ) { compact_load_matrix2_ ( io_code, ldb, k+1, j+2, 9, numb, datasz, regset ); if ( scalealpha == 1 ) { compact_mult_two_nums_ ( io_code, 9, 2, 9, numb, regset ); } compact_fms_cminusab_ ( io_code, 9, 7, 3, numb, regset ); } if ( nounit ) { compact_load_matrix3_ ( io_code, m1, k+1, 1, 11, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 10, 11, 10, numb, regset ); compact_store_matrix2_ ( io_code, ldb, k+1, j, 10, numb, datasz, regset ); if ( j+1 <= n1 ) { compact_mult_two_nums_ ( io_code, 14, 11, 14, numb, regset ); compact_store_matrix2_ ( io_code, ldb, k+1, j+1, 14, numb, datasz, regset ); } if ( j+2 <= n1 ) { compact_mult_two_nums_ ( io_code, 9, 11, 9, numb, regset ); compact_store_matrix2_ ( io_code, ldb, k+1, j+2, 9, numb, datasz, regset ); } } } for ( i = k+2; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, j, 1, numb, datasz, regset ); if ( scalealpha == 1 ) { compact_mult_two_nums_ ( io_code, 1, 2, 1, numb, regset ); } compact_load_matrix1_ ( io_code, ldb, i, k, 3, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 1, 0, 3, numb, regset ); if ( k+1 > m1 ) { compact_store_matrix2_ ( io_code, ldb, i, j, 1, numb, datasz, regset ); } if ( j+1 <= n1 ) { compact_load_matrix2_ ( io_code, ldb, i, j+1, 6, numb, datasz, regset ); if ( scalealpha == 1 ) { compact_mult_two_nums_ ( io_code, 6, 2, 6, numb, regset ); } compact_fms_cminusab_ ( io_code, 6, 4, 3, numb, regset ); if ( k+1 > m1 ) { compact_store_matrix2_ ( io_code, ldb, i, j+1, 6, numb, datasz, regset ); } } if ( j+2 <= n1 ) { compact_load_matrix2_ ( io_code, ldb, i, j+2, 12, numb, datasz, regset ); if ( scalealpha == 1 ) { compact_mult_two_nums_ ( io_code, 12, 2, 12, numb, regset ); } compact_fms_cminusab_ ( io_code, 12, 7, 3, numb, regset ); if ( k+1 > m1 ) { 
compact_store_matrix2_ ( io_code, ldb, i, j+2, 12, numb, datasz, regset ); } } if ( k+1 <= m1 ) { compact_load_matrix1_ ( io_code, ldb, i, k+1, 13, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 1, 10, 13, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j, 1, numb, datasz, regset ); if ( j+1 <= n1 ) { compact_fms_cminusab_ ( io_code, 6, 14, 13, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j+1, 6, numb, datasz, regset ); } if ( j+2 <= n1 ) { compact_fms_cminusab_ ( io_code, 12, 9, 13, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j+2, 12, numb, datasz, regset ); } } } /* for i LLN main loop */ } /* for k LLN loop */ } /* for j LLN loop */ } /* Call XCT LLN kernel or not */ } /* uplo */ } else { if ( (uplo=='U') || (uplo=='u') ) { /* Do LUT* cases: B<- alpha*inv(A^T)*B */ #define LUT_RECIPROCATE #ifdef LUT_RECIPROCATE if ( nounit ) { compact_set_one_ ( io_code, 15, numb, datasz, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix1_ ( io_code, lda, i, i, 3, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 15, 3, 3 , numb, regset ); compact_store_matrix3_ ( io_code, m1, i, 1, 3, numb, datasz, regset ); } } #endif #define LUT_N2 #ifdef LUT_N2 for ( j = 1; j <= n1; j+=2 ) #else for ( j = 1; j <= n1; j++ ) #endif { for ( i = 1; i <= m1; i+=2 ) { compact_load_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); #ifdef LUT_N2 if ( j+1 <= n1 ) compact_load_matrix2_ ( io_code, ldb, i, j+1, 4, numb, datasz, regset ); #endif if ( i+1 <= m1 ) compact_load_matrix2_ ( io_code, ldb, i+1, j, 7, numb, datasz, regset ); #ifdef LUT_N2 if ((i+1<=m1)&&(j+1<=n1)) compact_load_matrix2_ ( io_code, ldb, i+1, j+1, 9, numb, datasz, regset ); #endif if ( LIBXSMM_NEQ(1, alpha) ) { compact_mult_two_nums_ ( io_code, 0, 2, 0, numb, regset ); #ifdef LUT_N2 if ( j+1 <= n1 ) compact_mult_two_nums_ ( io_code, 4, 2, 4, numb, regset ); #endif if ( i+1 <= m1 ) compact_mult_two_nums_ ( io_code, 7, 2, 7, numb, regset ); #ifdef LUT_N2 if 
((i+1<=m1)&&(j+1<=n1)) compact_mult_two_nums_ ( io_code, 9, 2, 9, numb, regset ); #endif } for ( k = 1; k <= i-1; k++ ) { compact_load_matrix2_( io_code, ldb, k, j, 1, numb, datasz, regset ); compact_load_matrix1_( io_code, lda, k, i, 3, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 0, 3, 1, numb, regset ); #ifdef LUT_N2 if ( j+1 <= n1 ) { compact_load_matrix2_( io_code, ldb, k, j+1, 5, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 4, 3, 5, numb, regset ); } #endif if ( i+1 <= m1 ) { compact_load_matrix1_( io_code, lda, k, i+1, 8, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 7, 8, 1, numb, regset ); } #ifdef LUT_N2 if ((i+1<=m1)&&(j+1<=n1)) { compact_fms_cminusab_ ( io_code, 9, 8, 5, numb, regset ); } #endif } if ( nounit ) { #ifdef LUT_RECIPROCATE compact_load_matrix3_ ( io_code, m1, i, 1, 3, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 0, 3, 0, numb, regset ); # ifdef LUT_N2 if ( j+1 <= n1 ) compact_mult_two_nums_ ( io_code, 4, 3, 4, numb, regset ); # endif #else compact_load_matrix1_ ( io_code, lda, i, i, 3, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 0, 3, 0, numb, regset ); # ifdef LUT_N2 if ( j+1 <= n1 ) compact_divide_two_nums_ ( io_code, 4, 3, 4, numb, regset ); # endif #endif } compact_store_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); #ifdef LUT_N2 if ( j+1 <= n1 ) compact_store_matrix2_ ( io_code, ldb, i, j+1, 4, numb, datasz, regset ); #endif if ( i+1 <= m1 ) { compact_load_matrix1_( io_code, lda, i, i+1, 8, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 7, 8, 0, numb, regset ); #ifdef LUT_N2 if ( j+1 <= n1 ) compact_fms_cminusab_ ( io_code, 9, 8, 4, numb, regset ); #endif if ( nounit ) { #ifdef LUT_RECIPROCATE compact_load_matrix3_ ( io_code, m1, i+1, 1, 3, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 7, 3, 7, numb, regset ); # ifdef LUT_N2 if ( j+1 <= n1 ) compact_mult_two_nums_ ( io_code, 9, 3, 9, numb, regset ); # endif #else compact_load_matrix1_ ( 
io_code, lda, i+1, i+1, 3, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 7, 3, 7, numb, regset ); # ifdef LUT_N2 if ( j+1 <= n1 ) compact_divide_two_nums_ ( io_code, 9, 3, 9, numb, regset ); # endif #endif } compact_store_matrix2_ ( io_code, ldb, i+1, j, 7, numb, datasz, regset ); #ifdef LUT_N2 if ( j+1 <= n1 ) compact_store_matrix2_ ( io_code, ldb, i+1, j+1, 9, numb, datasz, regset ); #endif } } } } else { /* Do LLT* cases: B <- alpha * inv(A)*B */ #define LLT_N2 #define LLT_M2 #if 1 #define LLT_RECIPROCATE #endif #ifdef LLT_RECIPROCATE if ( nounit ) { compact_set_one_ ( io_code, 15, numb, datasz, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix1_ ( io_code, lda, i, i, 3, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 15, 3, 3 , numb, regset ); compact_store_matrix3_ ( io_code, m1, i, 1, 3, numb, datasz, regset ); } } #endif #ifdef LLT_N2 for ( j = 1; j <= n1; j+=2 ) #else for ( j = 1; j <= n1; j+=1 ) #endif { #ifdef LLT_M2 for ( i = m1; i >= 1; i-=2 ) #else for ( i = m1; i >= 1; i-=1 ) #endif { compact_load_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); #ifdef LLT_M2 if ( i-1 >= 1 ) compact_load_matrix2_ ( io_code, ldb, i-1, j, 4, numb, datasz, regset ); #endif #ifdef LLT_N2 if ( j+1 <= n1 ) compact_load_matrix2_ ( io_code, ldb, i, j+1, 7, numb, datasz, regset ); #endif #if defined(LLT_N2) && defined(LLT_M2) if ( (i-1>=1) && (j+1<=n1) ) compact_load_matrix2_ ( io_code, ldb, i-1, j+1, 10, numb, datasz, regset ); #endif if ( LIBXSMM_NEQ(1, alpha) ) { compact_mult_two_nums_ ( io_code, 0, 2, 0, numb, regset ); #ifdef LLT_M2 if ( i-1 >= 1 ) compact_mult_two_nums_ ( io_code, 4, 2, 4, numb, regset ); #endif #ifdef LLT_N2 if ( j+1 <= n1) compact_mult_two_nums_ ( io_code, 7, 2, 7, numb, regset ); #endif #if defined(LLT_N2) && defined(LLT_M2) if ((i-1>=1)&&(j+1<=n1)) compact_mult_two_nums_ ( io_code, 10, 2, 10, numb, regset ); #endif } for ( k = i+1; k <= m1; k++ ) { compact_load_matrix2_ ( io_code, ldb, k, j, 1, numb, 
datasz, regset ); compact_load_matrix1_ ( io_code, lda, k, i, 3, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 0, 3, 1, numb, regset ); #ifdef LLT_M2 if ( i-1 >= 1 ) { compact_load_matrix1_ ( io_code, lda, k, i-1, 6, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 4, 6, 1, numb, regset ); } #endif #ifdef LLT_N2 if ( j+1 <= n1) { compact_load_matrix2_ ( io_code, ldb, k, j+1, 8, numb, datasz, regset ); # if 0 compact_load_matrix1_ ( io_code, lda, k, i, 9, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 7, 9, 8, numb, regset ); # else compact_fms_cminusab_ ( io_code, 7, 3, 8, numb, regset ); # endif } #endif #if defined(LLT_N2) && defined(LLT_M2) if ((i-1>=1)&&(j+1<=n1)) { # if 0 compact_load_matrix2_ ( io_code, ldb, k, j+1, 11, numb, datasz, regset ); compact_load_matrix1_ ( io_code, lda, k, i-1, 12, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 10, 12, 11, numb, regset ); # else compact_fms_cminusab_ ( io_code, 10, 6, 8 , numb, regset ); # endif } #endif } if ( nounit ) { #ifndef LLT_RECIPROCATE compact_load_matrix1_ ( io_code, lda, i, i, 3, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 0, 3, 0, numb, regset ); # ifdef LLT_N2 if ( j+1 <= n1 ) { compact_divide_two_nums_ ( io_code, 7, 3, 7, numb, regset ); } # endif #else compact_load_matrix3_ ( io_code, m1, i, 1, 3, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 0, 3, 0, numb, regset ); # ifdef LLT_N2 if ( j+1 <= n1 ) { compact_mult_two_nums_ ( io_code, 7, 3, 7, numb, regset ); } # endif #endif } compact_store_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); #ifdef LLT_M2 if ( i-1 >= 1 ) { compact_load_matrix1_ ( io_code, lda, i, i-1, 6, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 4, 6, 0, numb, regset ); if ( nounit ) { # ifndef LLT_RECIPROCATE compact_load_matrix1_ ( io_code, lda, i-1, i-1, 6, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 4, 6, 4, numb, regset ); # else compact_load_matrix3_ ( io_code, m1, i-1, 1, 
6, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 4, 6, 4, numb, regset ); # endif } compact_store_matrix2_ ( io_code, ldb, i-1, j, 4, numb, datasz, regset ); } #endif #ifdef LLT_N2 if ( j+1 <= n1) compact_store_matrix2_ ( io_code, ldb, i, j+1, 7, numb, datasz, regset ); #endif #if defined(LLT_N2) && defined(LLT_M2) if ((i-1>=1)&&(j+1<=n1)) { compact_load_matrix1_ ( io_code, lda, i, i-1, 12, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 10, 12, 7, numb, regset ); if ( nounit ) { # ifdef LLT_RECIPROCATE compact_mult_two_nums_ ( io_code, 10, 6, 10, numb, regset ); # else compact_load_matrix1_ ( io_code, lda, i-1, i-1, 12, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 10, 12, 10, numb, regset ); # endif } compact_store_matrix2_ ( io_code, ldb, i-1, j+1, 10, numb, datasz, regset ); } #endif } } } /* uplo */ } /* trans */ } else { compact_set_one_ ( io_code, 5, numb, datasz, regset ); if ( (trans=='N') || (trans=='n') ) { if ( (uplo=='U') || (uplo=='u') ) { /* Do RUN* cases: B<- alpha*B*inv(A) */ if ( nounit ) { compact_set_one_ ( io_code, 15, numb, datasz, regset ); for ( i = 1; i <= n1; i++ ) { compact_load_matrix1_ ( io_code, lda, i, i, 3, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 15, 3, 3 , numb, regset ); compact_store_matrix3_ ( io_code, n1, i, 1, 3, numb, datasz, regset ); } } for ( j = 1; j <= n1; j+=2 ) { if ( LIBXSMM_NEQ(1, alpha) && (j==1) ) { for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); if ( j+1 <= n1 ) compact_load_matrix2_ ( io_code, ldb, i, j+1, 1, numb, datasz, regset ); #if 0 if ( j+2 <= n1 ) compact_load_matrix2_ ( io_code, ldb, i, j+2, 3, numb, datasz, regset ); if ( j+3 <= n1 ) compact_load_matrix2_ ( io_code, ldb, i, j+3, 4, numb, datasz, regset ); #endif compact_mult_two_nums_ ( io_code, 0, 2, 0, numb, regset ); if ( j+1 <= n1 ) compact_mult_two_nums_ ( io_code, 1, 2, 1, numb, regset ); #if 0 if ( j+2 <= n1 ) compact_mult_two_nums_ ( 
io_code, 3, 2, 3, numb, regset ); if ( j+3 <= n1 ) compact_mult_two_nums_ ( io_code, 4, 2, 4, numb, regset ); #endif compact_store_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); if ( j+1 <= n1 ) compact_store_matrix2_ ( io_code, ldb, i, j+1, 1, numb, datasz, regset ); #if 0 if ( j+2 <= n1 ) compact_store_matrix2_ ( io_code, ldb, i, j+2, 3, numb, datasz, regset ); if ( j+3 <= n1 ) compact_store_matrix2_ ( io_code, ldb, i, j+3, 4, numb, datasz, regset ); #endif } } for ( k = 1; k <= j-1; k++ ) { if ( (k==j-1) && (nounit) ) { compact_load_matrix3_ ( io_code, n1, j, 1, 5, numb, datasz, regset ); } compact_load_matrix1_ ( io_code, lda, k, j, 3, numb, datasz, regset ); if ( j+1 <= n1 ) compact_load_matrix1_ ( io_code, lda, k, j+1, 6, numb, datasz, regset ); #if 0 if ( j+2 <= n1 ) compact_load_matrix1_ ( io_code, lda, k, j+2, 10, numb, datasz, regset ); if ( j+3 <= n1 ) compact_load_matrix1_ ( io_code, lda, k, j+3, 12, numb, datasz, regset ); #endif for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, j, 1, numb, datasz, regset ); if (j+1<=n1) compact_load_matrix2_ ( io_code, ldb, i, j+1, 7, numb, datasz, regset ); #if 0 if (j+2<=n1) compact_load_matrix2_ ( io_code, ldb, i, j+2, 11, numb, datasz, regset ); if (j+3<=n1) compact_load_matrix2_ ( io_code, ldb, i, j+3, 13, numb, datasz, regset ); #endif if ((k==1)&&LIBXSMM_NEQ(1,alpha)) compact_mult_two_nums_ ( io_code, 1, 2, 1, numb, regset ); if ((j+1<=n1)&&(k==1)&&LIBXSMM_NEQ(1,alpha)) compact_mult_two_nums_ ( io_code, 7, 2, 7, numb, regset ); #if 0 if ((j+2<=n1)&&(k==1)&&LIBXSMM_NEQ(1,alpha)) compact_mult_two_nums_ ( io_code, 11, 2, 11, numb, regset ); if ((j+3<=n1)&&(k==1)&&LIBXSMM_NEQ(1,alpha)) compact_mult_two_nums_ ( io_code, 13, 2, 13, numb, regset ); #endif compact_load_matrix2_ ( io_code, ldb, i, k, 4, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 1, 3, 4, numb, regset ); if (j+1<=n1) compact_fms_cminusab_ ( io_code, 7, 6, 4, numb, regset ); #if 0 if (j+2<=n1) 
compact_fms_cminusab_ ( io_code, 11, 10, 4, numb, regset ); if (j+3<=n1) compact_fms_cminusab_ ( io_code, 13, 12, 4, numb, regset ); #endif if ( (k==j-1) && (nounit) ) compact_mult_two_nums_ ( io_code, 1, 5, 1, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j, 1, numb, datasz, regset ); if (j+1<=n1) compact_store_matrix2_ ( io_code, ldb, i, j+1, 7, numb, datasz, regset ); #if 0 if (j+2<=n1) compact_store_matrix2_ ( io_code, ldb, i, j+2, 11, numb, datasz, regset ); if (j+3<=n1) compact_store_matrix2_ ( io_code, ldb, i, j+3, 13, numb, datasz, regset ); #endif } } if ( j+1 <= n1 ) { for ( k = j; k <= j; k++ ) { compact_load_matrix1_ ( io_code, lda, k, j+1, 6, numb, datasz, regset ); if ( j==1 && nounit ) compact_load_matrix3_ ( io_code, n1, j, 1, 9, numb, datasz, regset ); if ( nounit ) compact_load_matrix3_ ( io_code, n1, j+1, 1, 1, numb, datasz, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, j+1, 7, numb, datasz, regset ); compact_load_matrix2_ ( io_code, ldb, i, k, 8, numb, datasz, regset ); if (j==1 && nounit) { compact_mult_two_nums_ ( io_code, 8, 9, 8, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, k, 8, numb, datasz, regset ); } compact_fms_cminusab_ ( io_code, 7, 6, 8, numb, regset ); if (nounit) compact_mult_two_nums_ ( io_code, 7, 1, 7, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j+1, 7, numb, datasz, regset ); } } } #if 0 if ( j+2 <= n1 ) { for ( k = j; k <= j+1; k++ ) { compact_load_matrix1_ ( io_code, lda, k, j+2, 6, numb, datasz, regset ); if ( (k==j+1) && nounit ) compact_load_matrix3_ ( io_code, n1, j+2, 1, 1, numb, datasz, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, j+2, 7, numb, datasz, regset ); compact_load_matrix2_ ( io_code, ldb, i, k, 8, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 7, 6, 8, numb, regset ); if ((k==j+1)&& nounit) compact_mult_two_nums_ ( io_code, 7, 1, 7, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, 
j+2, 7, numb, datasz, regset ); } } } if ( j+3 <= n1 ) { for ( k = j; k <= j+2; k++ ) { compact_load_matrix1_ ( io_code, lda, k, j+3, 6, numb, datasz, regset ); if ( (k==j+2) && nounit ) compact_load_matrix3_ ( io_code, n1, j+3, 1, 1, numb, datasz, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, j+3, 7, numb, datasz, regset ); compact_load_matrix2_ ( io_code, ldb, i, k, 8, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 7, 6, 8, numb, regset ); if ((k==j+2)&& nounit) compact_mult_two_nums_ ( io_code, 7, 1, 7, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j+3, 7, numb, datasz, regset ); } } } #endif } } else { /* Do RLN* cases: B <- alpha * B * inv(A) */ for ( j = n1; j >= 1; j-- ) { if ( LIBXSMM_NEQ(1, alpha) ) { for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 0, 2, 0, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); } } for ( k = j+1; k <= n1; k++ ) { compact_load_matrix1_ ( io_code, lda, k, j, 3, numb, datasz, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, j, 1, numb, datasz, regset ); compact_load_matrix2_ ( io_code, ldb, i, k, 4, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 1, 3, 4, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j, 1, numb, datasz, regset ); } } if ( nounit ) { compact_load_matrix1_( io_code, lda, j, j, 1, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 5, 1, 1, numb, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, j, 3, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 1, 3, 3, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j, 3, numb, datasz, regset ); } } } } /* uplo */ } else { if ( (uplo=='U') || (uplo=='u') ) { /* Do RUT* cases: B<- alpha*B *inv(A^T) */ for ( k = n1; k >= 1; k-- ) { if ( nounit ) { compact_load_matrix1_( io_code, lda, k, k, 1, 
numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 5, 1, 1, numb, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, k, 3, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 1, 3, 3, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, k, 3, numb, datasz, regset ); } } for ( j = 1; j <= k-1; j++ ) { compact_load_matrix1_ ( io_code, lda, j, k, 1, numb, datasz, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); compact_load_matrix2_ ( io_code, ldb, i, k, 3, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 0, 1, 3, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); } } if ( LIBXSMM_NEQ(1, alpha) ) { for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, k, 0, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 0, 2, 0, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, k, 0, numb, datasz, regset ); } } } } else { /* Do RLT* cases: B <- alpha * B *inv(A^T) */ for ( k = 1; k <= n1; k++ ) { if ( nounit ) { compact_load_matrix1_ ( io_code, lda, k, k, 1, numb, datasz, regset ); compact_divide_two_nums_ ( io_code, 5, 1, 1, numb, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, k, 3, numb, datasz, regset ); compact_mult_two_nums_ ( io_code, 1, 3, 3, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, k, 3, numb, datasz, regset ); } } for ( j = k+1; j <= n1; j++ ) { compact_load_matrix1_ ( io_code, lda, j, k, 1, numb, datasz, regset ); for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_( io_code, ldb, i, j, 0, numb, datasz, regset ); compact_load_matrix2_( io_code, ldb, i, k, 3, numb, datasz, regset ); compact_fms_cminusab_ ( io_code, 0, 1, 3, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, j, 0, numb, datasz, regset ); } } if ( LIBXSMM_NEQ(1, alpha) ) { for ( i = 1; i <= m1; i++ ) { compact_load_matrix2_ ( io_code, ldb, i, k, 0, numb, datasz, 
regset ); compact_mult_two_nums_ ( io_code, 0, 2, 0, numb, regset ); compact_store_matrix2_ ( io_code, ldb, i, k, 0, numb, datasz, regset ); } } } } /* uplo */ } /* trans */ } /* side */ } { int i = io_code->code_size; buf[i++] = 0xc3; /* retq */ io_code->code_size = i; } /* close asm: note that we really didn't need to push everything */ /* libxsmm_x86_instruction_close_stream_transpose( io_code, i_arch ); */ #ifdef GENERATOR_PACKED_TRSM_DEBUG printf("done with m=%d n=%d i=%d\n",i_trans_desc->m,i_trans_desc->n,io_code->code_size); #endif #if 0 #define DEBUG_GIVE_BYTE_CODE_OUTPUT #endif #ifdef DEBUG_GIVE_BYTE_CODE_OUTPUT buf = (unsigned char *) io_code->generated_code; printf("#Final Routine: \n"); for ( i = 0; i < io_code->code_size; i+=8 ) { printf("#\tBytes %d-%d\n",i,i+7); printf(".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",buf[i],buf[i+1],buf[i+2],buf[i+3],buf[i+4],buf[i+5],buf[i+6],buf[i+7]); } #endif } libxsmm-1.17/src/generator_packed_trsm_avx_avx512.h000066400000000000000000000023741415223013700224070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry, Timothy Costa (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_PACKED_TRSM_AVX_AVX512_H #define GENERATOR_PACKED_TRSM_AVX_AVX512_H #include "generator_common.h" LIBXSMM_API_INTERN void libxsmm_generator_packed_trsm_avx_avx512_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_trsm_descriptor* i_packed_trsm_desc, const char* i_arch ); #endif /*GENERATOR_PACKED_TRSM_AVX_AVX512_H*/ libxsmm-1.17/src/generator_packed_trsm_dmacros.h000066400000000000000000000062441415223013700221330ustar00rootroot00000000000000#define XCT_NOUNIT_DIAG #define N_UNROLL_AVX2 2 #define M_UNROLL_AVX2 2 #define ELE_IN_REGISTER_AVX2_F32 8 #define ELE_IN_REGISTER_AVX2_F64 4 #define P_UNROLL_AVX2_F32 ELE_IN_REGISTER_AVX2_F32 #define P_UNROLL_AVX2_F64 ELE_IN_REGISTER_AVX2_F64 #define trsm_ll_ap(i,j,lda,_is_row_) ((_is_row_) ? ((j)+((i)*(lda))) : ((i)+((j)*(lda)))) #define trsm_ll_bp(i,j,ldb,_is_row_) ((_is_row_) ? ((j)+((i)*(ldb))) : ((i)+((j)*(ldb)))) /* In the original XCT code, this was a #define that depended on what we want. But since we want this to work everywhere, I'm turning this on for everything. 
*/ #define _XCT_NOUNIT_DIAG_ #define xct_ftype float /* Obsoleted in LIBXSMM, just a carry-over from XCT */ #define SET_ZERO_PACKED(x,y,z) do {\ if (datasz==8) libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VXORPD, i_vector_name, (z), (y), (x) ); \ else libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VXORPS, i_vector_name, (z), (y), (x) ); \ } while(0) #define VMOVU_PACKED(reg, mat_ptr, mat_offset, load_store) do { \ if (load_store && datasz==8) libxsmm_x86_instruction_vec_move ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VMOVUPD, mat_ptr, LIBXSMM_X86_GP_REG_UNDEF, 1, (mat_offset)*2, i_vector_name, (reg), 0, 1 ); \ else if (load_store && datasz==4) libxsmm_x86_instruction_vec_move ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VMOVUPS, mat_ptr, LIBXSMM_X86_GP_REG_UNDEF, 1, (mat_offset), i_vector_name, (reg), 0, 1 ); \ else if (datasz==8) libxsmm_x86_instruction_vec_move ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VMOVUPD, mat_ptr, LIBXSMM_X86_GP_REG_UNDEF, 1, (mat_offset)*2, i_vector_name, (reg), 0, 0 ); \ else libxsmm_x86_instruction_vec_move ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VMOVUPS, mat_ptr, LIBXSMM_X86_GP_REG_UNDEF, 1, (mat_offset), i_vector_name, (reg), 0, 0 ); \ } while(0) #define VFMADD231_PACKED(x,y,z) do { \ if (datasz==8) libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VFMADD231PD, i_vector_name, (z), (y), (x) ); \ else libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VFMADD231PS, i_vector_name, (z), (y), (x) ); \ } while(0) #define VSUB_PACKED(x,y,z) do { \ if (datasz==8) libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VSUBPD, i_vector_name, (x), (y), (z)); \ else libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VSUBPS, i_vector_name, (z), (y), (x)); \ } while(0) #define VDIV_PACKED(x,y,z) do { \ if 
(datasz==8) libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VDIVPD, i_vector_name, (x), (y), (z)); \ else libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VDIVPS, i_vector_name, (z), (y), (x)); \ } while(0) #define VMUL_PACKED(x,y,z) do { \ if (datasz==8) libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VMULPD, i_vector_name, (z), (y), (x)); \ else libxsmm_x86_instruction_vec_compute_reg ( io_code, LIBXSMM_X86_AVX2, LIBXSMM_X86_INSTR_VMULPS, i_vector_name, (z), (y), (x)); \ } while(0) libxsmm-1.17/src/generator_packed_xct_avx2_lln.h000066400000000000000000000340321415223013700220350ustar00rootroot00000000000000#if 0 # define GENERATOR_PACKED_XCT_DEBUG #endif #ifdef GENERATOR_PACKED_XCT_DEBUG printf("******* HI *********\n"); #endif #include "packed_trsm_dmacros.h" auto int a_ptr = LIBXSMM_X86_GP_REG_RDI; auto int b_ptr = LIBXSMM_X86_GP_REG_RSI; char i_vector_name = 'y'; #ifdef GENERATOR_PACKED_XCT_DEBUG printf("Inside %c%c%c%c trsm generator\n",*side_ptr,*uplo_ptr,*transa_ptr,*diag_ptr); #endif int n_in, m_in; int _is_row_; int ii; if (*layout == 101) { n_in = m; m_in = n; _is_row_ = 1; } else { m_in = m; n_in = n; _is_row_ = 0; } if ( nounit && (datasz==8) ) { double one_vector[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; i = io_code->code_size; libxsmm_x86_instruction_full_vec_load_of_constants ( io_code, (unsigned char*) one_vector, "one_vec", i_vector_name, 15 ); i = io_code->code_size; } if ( nounit && (datasz==4) ) { float one_vector[16] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; i = io_code->code_size; libxsmm_x86_instruction_full_vec_load_of_constants ( io_code, (unsigned char*) one_vector, "one_vec", i_vector_name, 15 ); i = io_code->code_size; } int ymm0 = 0; int ymm1 = 1; int ymm2 = 2; int ymm3 = 3; int ymm4 = 4; int ymm5 = 5; int ymm6 = 6; int ymm7 = 7; int ymm8 = 8; int ymm9 = 9; int ymm10 = 10; int ymm11 = 11; int 
ymm12 = 12; int ymm13 = 13; int ymm14 = 14; int ymm15 = 15; int P_UNROLL_AVX2; if ( datasz == 8 ) P_UNROLL_AVX2 = P_UNROLL_AVX2_F64; else P_UNROLL_AVX2 = P_UNROLL_AVX2_F32; /* zero accumulation registers */ i = io_code->code_size; SET_ZERO_PACKED(ymm0, ymm0, ymm0); /* T11*/ i = io_code->code_size; if (m_in > 1) SET_ZERO_PACKED(ymm1, ymm1, ymm1); /* T21 */ if (n_in > 1) { SET_ZERO_PACKED(ymm2, ymm2, ymm2); /* T12 */ if (m_in > 1) SET_ZERO_PACKED(ymm3, ymm3, ymm3); /* T22 */ } for (j=0; j<(n_in/N_UNROLL_AVX2)*N_UNROLL_AVX2; j+=N_UNROLL_AVX2) { for (i=0; i<(m_in/M_UNROLL_AVX2)*M_UNROLL_AVX2; i+=M_UNROLL_AVX2) { /* gemm update */ for (ii=0; ii B1 */ VSUB_PACKED(ymm2, ymm9, ymm2); /* T12 = B2-T12 */ if ( nounit ) { VMUL_PACKED(ymm2, ymm2, ymm4); /* T12 *= ONE/A1 */ } VMOVU_PACKED(ymm2, b_ptr, sizeof(xct_ftype)*(P_UNROLL_AVX2*(trsm_ll_bp(0+i,1+j,ldb,_is_row_))), 1); /* Store T12 -> B2 */ /* 2nd */ if ( nounit ) { VMOVU_PACKED(ymm4, a_ptr, sizeof(xct_ftype)*(P_UNROLL_AVX2*(trsm_ll_ap(1+i,1+i,lda,_is_row_))), 0); /* A1 */ VDIV_PACKED(ymm4, ymm15, ymm4); /* A1 /= ONE */ } VMOVU_PACKED(ymm5, a_ptr, sizeof(xct_ftype)*(P_UNROLL_AVX2*(trsm_ll_ap(1+i,0+i,lda,_is_row_))), 0); /* A2 */ VMOVU_PACKED(ymm8, b_ptr, sizeof(xct_ftype)*(P_UNROLL_AVX2*(trsm_ll_bp(1+i,0+j,ldb,_is_row_))), 0); /* B1 */ VMOVU_PACKED(ymm9, b_ptr, sizeof(xct_ftype)*(P_UNROLL_AVX2*(trsm_ll_bp(1+i,1+j,ldb,_is_row_))), 0); /* B2 */ VFMADD231_PACKED(ymm1, ymm5, ymm0); /* T21 += A2*T11 */ VSUB_PACKED(ymm1, ymm8, ymm1); /* T21 = B1 - T21 */ if ( nounit ) { VMUL_PACKED(ymm1, ymm1, ymm4); /* T21 *= ONE/A1 */ } VMOVU_PACKED(ymm1, b_ptr, sizeof(xct_ftype)*(P_UNROLL_AVX2*(trsm_ll_bp(1+i,0+j,ldb,_is_row_))), 1); /* Store T21 -> B1 */ SET_ZERO_PACKED(ymm0, ymm0, ymm0); /* ZERO T11 */ SET_ZERO_PACKED(ymm1, ymm1, ymm1); /* ZERO T21 */ VFMADD231_PACKED(ymm3, ymm5, ymm2); /* T22 += A2*T12 */ VSUB_PACKED(ymm3, ymm9, ymm3); /* T22 = B2 - T22 */ #ifdef _XCT_NOUNIT_DIAG_ if ( nounit ) { VMUL_PACKED(ymm3, ymm3, ymm4); /* T22 *= 
ONE/A1 */ } #endif VMOVU_PACKED(ymm3, b_ptr, sizeof(xct_ftype)*(P_UNROLL_AVX2*(trsm_ll_bp(1+i,1+j,ldb,_is_row_))), 1); /* Store T22 -> B2 */ SET_ZERO_PACKED(ymm2, ymm2, ymm2); /* ZERO T12 */ SET_ZERO_PACKED(ymm3, ymm3, ymm3); /* ZERO T22 */ } if (m_in & 1) { /* gemm update */ for (ii=0; ii B1 */ SET_ZERO_PACKED(ymm0, ymm0, ymm0); /* ZERO T11 */ VSUB_PACKED(ymm2, ymm9, ymm2); /* T12 = B2-T12 */ #ifdef _XCT_NOUNIT_DIAG_ if ( nounit ) { VMUL_PACKED(ymm2, ymm2, ymm4); /* T12 *= ONE/A1 */ } #endif VMOVU_PACKED(ymm2, b_ptr, sizeof(xct_ftype)*(P_UNROLL_AVX2*(trsm_ll_bp(0+i,1+j,ldb,_is_row_))), 1); /* Store T12 -> B2 */ SET_ZERO_PACKED(ymm2, ymm2, ymm2); /* ZERO T12 */ } } if (n_in & 1) { for (i=0; i<(m_in/M_UNROLL_AVX2)*M_UNROLL_AVX2; i+=M_UNROLL_AVX2) { /* gemm update */ for (ii=0; ii B1 */ /* 2nd */ #ifdef _XCT_NOUNIT_DIAG_ if ( nounit ) { VMOVU_PACKED(ymm4, a_ptr, sizeof(xct_ftype)*(P_UNROLL_AVX2*(trsm_ll_ap(1+i,1+i,lda,_is_row_))), 0); /* A1 */ VDIV_PACKED(ymm4, ymm15, ymm4); /* A1 /= ONE */ } #endif VMOVU_PACKED(ymm5, a_ptr, sizeof(xct_ftype)*(P_UNROLL_AVX2*(trsm_ll_ap(1+i,0+i,lda,_is_row_))), 0); /* A2 */ VMOVU_PACKED(ymm8, b_ptr, sizeof(xct_ftype)*(P_UNROLL_AVX2*(trsm_ll_bp(1+i,0+j,ldb,_is_row_))), 0); /* B1 */ VFMADD231_PACKED(ymm1, ymm5, ymm0); /* T21 += A2*T11 */ VSUB_PACKED(ymm1, ymm8, ymm1); /* T21 = B1 - T21 */ #ifdef _XCT_NOUNIT_DIAG_ if ( nounit ) { VMUL_PACKED(ymm1, ymm1, ymm4); /* T21 *= ONE/A1 */ } #endif VMOVU_PACKED(ymm1, b_ptr, sizeof(xct_ftype)*(P_UNROLL_AVX2*(trsm_ll_bp(1+i,0+j,ldb,_is_row_))), 1); /* Store T21 -> B1 */ SET_ZERO_PACKED(ymm0, ymm0, ymm0); /* ZERO T11 */ SET_ZERO_PACKED(ymm1, ymm1, ymm1); /* ZERO T21 */ } if (m_in & 1) { /* gemm update */ for (ii=0; ii B1 */ } } libxsmm-1.17/src/generator_spgemm.c000066400000000000000000000437251415223013700174170ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. 
* * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "generator_common.h" #include "generator_spgemm_csc_reader.h" #include "generator_spgemm_csr_reader.h" #include "generator_spgemm_csc_asparse.h" #include "generator_spgemm_csc_bsparse.h" #include "generator_spgemm_csr_asparse.h" #include "generator_spgemm_csr_asparse_reg.h" #include "generator_spgemm_csr_bsparse_soa.h" #include "generator_spgemm_csr_asparse_soa.h" #include "generator_spgemm_csc_bsparse_soa.h" #include "generator_spgemm_csc_csparse_soa.h" #include "libxsmm_main.h" LIBXSMM_API void libxsmm_generator_spgemm_csc_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ) { /* A matrix is sparse */ if ( (i_xgemm_desc->lda == 0) && (i_xgemm_desc->ldb > 0) && (i_xgemm_desc->ldc > 0) ) { /* check LDB */ if ( i_xgemm_desc->ldb < i_xgemm_desc->k ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDB ); return; } /* check LDC */ if ( i_xgemm_desc->ldc < i_xgemm_desc->m ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDC ); return; } libxsmm_generator_spgemm_csc_asparse( io_generated_code, i_xgemm_desc, i_arch, i_row_idx, i_column_idx, i_values ); /* B matrix is sparse */ } else if ( (i_xgemm_desc->lda > 0) && (i_xgemm_desc->ldb == 0) && (i_xgemm_desc->ldc > 0) ) { /* check LDA */ if ( i_xgemm_desc->lda < i_xgemm_desc->m ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA ); return; } /* check LDC */ if ( i_xgemm_desc->ldc < i_xgemm_desc->m ) { LIBXSMM_HANDLE_ERROR( io_generated_code, 
LIBXSMM_ERR_LDC ); return; } libxsmm_generator_spgemm_csc_bsparse( io_generated_code, i_xgemm_desc, i_arch, i_row_idx, i_column_idx, i_values ); } else { /* something bad happened... */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_SPGEMM_GEN ); return; } } LIBXSMM_API void libxsmm_generator_spgemm_csr_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ) { /* A matrix is sparse */ if ( (i_xgemm_desc->lda == 0) && (i_xgemm_desc->ldb > 0) && (i_xgemm_desc->ldc > 0) ) { /* check LDB */ if ( i_xgemm_desc->ldb < i_xgemm_desc->n ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDB ); return; } /* check LDC */ if ( i_xgemm_desc->ldc < i_xgemm_desc->n ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDC ); return; } libxsmm_generator_spgemm_csr_asparse( io_generated_code, i_xgemm_desc, i_arch, i_row_idx, i_column_idx, i_values ); /* B matrix is sparse */ } else if ( (i_xgemm_desc->lda > 0) && (i_xgemm_desc->ldb == 0) && (i_xgemm_desc->ldc > 0) ) { /* check LDA */ if ( i_xgemm_desc->lda < i_xgemm_desc->k ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA ); return; } /* check LDC */ if ( i_xgemm_desc->ldc < i_xgemm_desc->n ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDC ); return; } /* something bad happened... */ fprintf(stderr, "LIBXSMM fatal error: B sparse for CSR data structure is not yet available!\n"); exit(-1); } else { /* something bad happened... 
*/ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_SPGEMM_GEN ); return; } } LIBXSMM_API void libxsmm_generator_spgemm_csr_reg_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ) { /* A matrix is sparse */ if ( (i_xgemm_desc->lda == 0) && (i_xgemm_desc->ldb > 0) && (i_xgemm_desc->ldc > 0) ) { /* check LDB */ if ( i_xgemm_desc->ldb < i_xgemm_desc->n ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDB ); return; } /* check LDC */ if ( i_xgemm_desc->ldc < i_xgemm_desc->n ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDC ); return; } libxsmm_generator_spgemm_csr_asparse_reg( io_generated_code, i_xgemm_desc, i_arch, i_row_idx, i_column_idx, i_values ); /* B matrix is sparse */ } else if ( (i_xgemm_desc->lda > 0) && (i_xgemm_desc->ldb == 0) && (i_xgemm_desc->ldc > 0) ) { /* check LDA */ if ( i_xgemm_desc->lda < i_xgemm_desc->k ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA ); return; } /* check LDC */ if ( i_xgemm_desc->ldc < i_xgemm_desc->n ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDC ); return; } /* something bad happened... */ fprintf(stderr, "LIBXSMM fatal error:B sparse for CSR data structure is not yet available!\n"); exit(-1); } else { /* something bad happened... 
*/ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_SPGEMM_GEN ); return; } } LIBXSMM_API void libxsmm_generator_spgemm_csr_soa_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ) { /* A matrix is sparse */ if ( (i_xgemm_desc->lda == 0) && (i_xgemm_desc->ldb > 0) && (i_xgemm_desc->ldc > 0) ) { /* check LDB */ if ( i_xgemm_desc->ldb < i_xgemm_desc->n ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDB ); return; } /* check LDC */ if ( i_xgemm_desc->ldc < i_xgemm_desc->n ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDC ); return; } libxsmm_generator_spgemm_csr_asparse_soa( io_generated_code, i_xgemm_desc, i_arch, i_row_idx, i_column_idx, i_values, i_packed_width ); /* B matrix is sparse */ } else if ( (i_xgemm_desc->lda > 0) && (i_xgemm_desc->ldb == 0) && (i_xgemm_desc->ldc > 0) ) { /* check LDA */ if ( i_xgemm_desc->lda < i_xgemm_desc->k ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA ); return; } /* check LDC */ /* coverity[copy_paste_error] */ if ( i_xgemm_desc->ldc < i_xgemm_desc->n ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDC ); return; } libxsmm_generator_spgemm_csr_bsparse_soa( io_generated_code, i_xgemm_desc, i_arch, i_row_idx, i_column_idx, i_values, i_packed_width ); } else { /* something bad happened... 
*/ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_SPGEMM_GEN ); return; } } LIBXSMM_API void libxsmm_generator_spgemm_csc_soa_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ) { /* B matrix is sparse */ if ( (i_xgemm_desc->lda > 0) && (i_xgemm_desc->ldb == 0) && (i_xgemm_desc->ldc > 0) ) { /* check LDA */ if ( i_xgemm_desc->lda < i_xgemm_desc->k ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA ); return; } /* check LDC */ if ( i_xgemm_desc->ldc < i_xgemm_desc->n ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDC ); return; } libxsmm_generator_spgemm_csc_bsparse_soa( io_generated_code, i_xgemm_desc, i_arch, i_row_idx, i_column_idx, i_values, i_packed_width ); /* C matrix is sparse */ } else if ( (i_xgemm_desc->lda > 0) && (i_xgemm_desc->ldb > 0) && (i_xgemm_desc->ldc == 0) ) { #if 0 /* check LDA */ if ( i_xgemm_desc->lda < i_xgemm_desc->k ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA ); return; } #endif /* check LDB */ if ( i_xgemm_desc->ldb < i_xgemm_desc->n ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDB ); return; } libxsmm_generator_spgemm_csc_csparse_soa( io_generated_code, i_xgemm_desc, i_arch, i_row_idx, i_column_idx, i_values, i_packed_width ); } else { /* something bad happened... 
*/ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_SPGEMM_GEN ); return; } } LIBXSMM_API void libxsmm_generator_spgemm( const char* i_file_out, const char* i_routine_name, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const char* i_file_in, const int i_is_csr ) { /* CSC/CSR structure */ unsigned int* l_row_idx = NULL; unsigned int* l_column_idx = NULL; double* l_values = NULL; unsigned int l_row_count; unsigned int l_column_count; unsigned int l_element_count = 0; /* init generated code object */ libxsmm_generated_code l_generated_code; l_generated_code.generated_code = NULL; l_generated_code.buffer_size = 0; l_generated_code.code_size = 0; l_generated_code.code_type = 0; l_generated_code.last_error = 0; l_generated_code.sf_size = 0; /* add signature to code string */ if (i_is_csr == 3) { libxsmm_mmfunction_signature_asparse_reg( &l_generated_code, i_routine_name, i_xgemm_desc ); } else { libxsmm_mmfunction_signature( &l_generated_code, i_routine_name, i_xgemm_desc ); } /* check if generate to CSC */ /* @TODO, this i_is_csr is very hacky.... 
change it in future */ if ( (i_is_csr == 0) || (i_is_csr > 9) ) { /* read CSC file and construct CSC data structure */ libxsmm_sparse_csc_reader( &l_generated_code, i_file_in, &l_row_idx, &l_column_idx, &l_values, &l_row_count, &l_column_count, &l_element_count ); if (0 != l_row_idx && 0 != l_column_idx && 0 != l_values) { #if !defined(NDEBUG) double *const l_tmp = (double*)malloc((size_t)l_row_count * l_column_count * sizeof(double)); unsigned int l_n; unsigned int l_m; /* mute static analysis about garbage content */ memset(l_tmp, 0, (size_t)l_row_count * l_column_count * sizeof(double)); printf("CSC matrix data structure we just read:\n"); printf("rows: %u, columns: %u, elements: %u\n", l_row_count, l_column_count, l_element_count); if (l_tmp == NULL) { fprintf(stderr, "LIBXSMM fatal error:Could allocate dense value array to test CSC data structure!\n"); exit(-1); } for ( l_n = 0; l_n < (l_row_count * l_column_count); l_n++) { l_tmp[l_n] = 0.0; } for ( l_n = 0; l_n < l_row_count+1; l_n++) { printf("%u ", l_column_idx[l_n]); } printf("\n"); for ( l_n = 0; l_n < l_element_count; l_n++) { printf("%u ", l_row_idx[l_n]); } printf("\n"); for ( l_n = 0; l_n < l_element_count; l_n++) { printf("%f ", l_values[l_n]); } printf("\n"); for ( l_n = 0; l_n < l_column_count; l_n++) { const unsigned int l_column_elems = l_column_idx[l_n+1] - l_column_idx[l_n]; assert(l_column_idx[l_n+1] >= l_column_idx[l_n]); for ( l_m = 0; l_m < l_column_elems; l_m++) { l_tmp[(l_row_idx[l_column_idx[l_n] + l_m]*l_column_count) + l_n] = l_values[l_column_idx[l_n] + l_m]; } } assert(0 != l_tmp); for ( l_n = 0; l_n < l_row_count; l_n++) { for ( l_m = 0; l_m < l_column_count; l_m++) { printf("%f ", l_tmp[(l_n * l_column_count) + l_m]); } printf("\n"); } free( l_tmp ); #endif /* generate the actual kernel code for current description depending on the architecture */ if (i_is_csr == 0) { libxsmm_generator_spgemm_csc_kernel( &l_generated_code, i_xgemm_desc, i_arch, l_row_idx, l_column_idx, l_values ); 
} else if (i_is_csr == 10) { assert(0/*should not happen*/); /*libxsmm_generator_spgemm_csc_soa_kernel( &l_generated_code, i_xgemm_desc, i_arch, l_row_idx, l_column_idx, l_values, 16 );*/ } else { assert(0/*should not happen*/); } } } else { /* read CSR file and construct CSR data structure */ libxsmm_sparse_csr_reader( &l_generated_code, i_file_in, &l_row_idx, &l_column_idx, &l_values, &l_row_count, &l_column_count, &l_element_count ); if (0 != l_row_idx && 0 != l_column_idx && 0 != l_values) { /* libxsmm_sparse_*_reader may have deallocated l_values */ #if !defined(NDEBUG) double *const l_tmp = (double*)malloc((size_t)l_row_count * l_column_count * sizeof(double)); unsigned int l_n; unsigned int l_m; /* mute static analysis about garbage content */ memset(l_tmp, 0, (size_t)l_row_count * l_column_count * sizeof(double)); printf("CSR matrix data structure we just read:\n"); printf("rows: %u, columns: %u, elements: %u\n", l_row_count, l_column_count, l_element_count); if (l_tmp == NULL) { fprintf(stderr, "LIBXSMM fatal error:Could allocate dense value array to test CSR data structure!\n"); exit(-1); } for ( l_n = 0; l_n < (l_row_count * l_column_count); l_n++) { l_tmp[l_n] = 0.0; } for ( l_n = 0; l_n < l_row_count+1; l_n++) { printf("%u ", l_row_idx[l_n]); } printf("\n"); for ( l_n = 0; l_n < l_element_count; l_n++) { printf("%u ", l_column_idx[l_n]); } printf("\n"); for ( l_n = 0; l_n < l_element_count; l_n++) { printf("%f ", l_values[l_n]); } printf("\n"); for ( l_n = 0; l_n < l_row_count; l_n++) { const unsigned int l_row_elems = l_row_idx[l_n+1] - l_row_idx[l_n]; assert(l_row_idx[l_n+1] >= l_row_idx[l_n]); for ( l_m = 0; l_m < l_row_elems; l_m++) { l_tmp[(l_n * l_column_count) + l_column_idx[l_row_idx[l_n] + l_m]] = l_values[l_row_idx[l_n] + l_m]; } } assert(0 != l_tmp); for ( l_n = 0; l_n < l_row_count; l_n++) { for ( l_m = 0; l_m < l_column_count; l_m++) { printf("%f ", l_tmp[(l_n * l_column_count) + l_m]); } printf("\n"); } free( l_tmp ); #endif if (i_is_csr 
== 1) { /* generate the actual kernel code for current description depending on the architecture */ libxsmm_generator_spgemm_csr_kernel( &l_generated_code, i_xgemm_desc, i_arch, l_row_idx, l_column_idx, l_values ); } else if (i_is_csr == 2) { /* generate the actual kernel code for current description depending on the architecture */ assert(0/*should not happen*/); /*libxsmm_generator_spgemm_csr_soa_kernel( &l_generated_code, i_xgemm_desc, i_arch, l_row_idx, l_column_idx, l_values, 16 );*/ } else if (i_is_csr == 3) { /* generate the actual kernel code for current description depending on the architecture */ libxsmm_generator_spgemm_csr_reg_kernel( &l_generated_code, i_xgemm_desc, i_arch, l_row_idx, l_column_idx, l_values ); } else { assert(0/*should not happen*/); } } } /* close current function */ libxsmm_close_function( &l_generated_code ); /* free if not NULL */ if ( l_row_idx != NULL ) { free( l_row_idx ); } if ( l_column_idx != NULL ) { free( l_column_idx ); } if ( l_values != NULL ) { free( l_values ); } /* check for errors during code generation */ if ( l_generated_code.last_error != 0 ) { LIBXSMM_HANDLE_ERROR_VERBOSE( &l_generated_code, l_generated_code.last_error ); exit(-1); } /* append code to source file */ if ( l_generated_code.generated_code != NULL ) { FILE *const l_file_handle = fopen( i_file_out, "a" ); if ( l_file_handle != NULL ) { fputs( (const char*)l_generated_code.generated_code, l_file_handle ); fclose( l_file_handle ); } else { fprintf(stderr, "LIBXSMM ERROR: libxsmm_generator_spgemm could not write to into destination source file\n"); exit(-1); } } /* free code memory */ free( l_generated_code.generated_code ); } libxsmm-1.17/src/generator_spgemm_csc_asparse.c000066400000000000000000000560311415223013700217570ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. 
* * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ /** * @file * This file is part of GemmCodeGenerator. * * @author Alexander Heinecke (alexander.heinecke AT mytum.de, http://www5.in.tum.de/wiki/index.php/Alexander_Heinecke,_M.Sc.,_M.Sc._with_honors) * * @section LICENSE * Copyright (c) 2012-2014, Technische Universitaet Muenchen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * @section DESCRIPTION * */ #include "generator_spgemm_csc_asparse.h" #include "generator_common.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_sparse_csc_asparse_innerloop_scalar( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_k, const unsigned int i_z, const unsigned int* i_row_idx, const unsigned int* i_column_idx ) { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128d c%u_%u = _mm_load_sd(&C[(l_n*%u)+%u]);\n", i_k, i_z, (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z] ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128d a%u_%u = _mm_load_sd(&A[%u]);\n", i_k, i_z, i_column_idx[i_k] + i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_sd(c%u_%u, _mm_mul_sd(a%u_%u, _mm256_castpd256_pd128(b%u)));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); libxsmm_append_code_as_string( 
io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_sd(c%u_%u, _mm_mul_sd(a%u_%u, b%u));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " _mm_store_sd(&C[(l_n*%u)+%u], c%u_%u);\n", (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z], i_k, i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128 c%u_%u = _mm_load_ss(&C[(l_n*%u)+%u]);\n", i_k, i_z, (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z] ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128 a%u_%u = _mm_load_ss(&A[%u]);\n", i_k, i_z, i_column_idx[i_k] + i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_ss(c%u_%u, _mm_mul_ss(a%u_%u, b%u));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " _mm_store_ss(&C[(l_n*%u)+%u], c%u_%u);\n", (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z], i_k, i_z ); 
libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_sparse_csc_asparse_innerloop_two_vector( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_k, const unsigned int i_z, const unsigned int* i_row_idx, const unsigned int* i_column_idx ) { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128d c%u_%u = _mm_loadu_pd(&C[(l_n*%u)+%u]);\n", i_k, i_z, (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z] ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128d a%u_%u = _mm_loadu_pd(&A[%u]);\n", i_k, i_z, i_column_idx[i_k] + i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_pd(c%u_%u, _mm_mul_pd(a%u_%u, _mm256_castpd256_pd128(b%u)));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_pd(c%u_%u, _mm_mul_pd(a%u_%u, b%u));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); 
libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " _mm_storeu_pd(&C[(l_n*%u)+%u], c%u_%u);\n", (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z], i_k, i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128 c%u_%u = _mm_castpd_ps(_mm_load_sd((const double*)&C[(l_n*%u)+%u]));\n", i_k, i_z, (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z] ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128 a%u_%u = _mm_castpd_ps(_mm_load_sd((const double*)&A[%u]));\n", i_k, i_z, i_column_idx[i_k] + i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_ps(c%u_%u, _mm_mul_ps(a%u_%u, b%u));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " _mm_store_sd((double*)&C[(l_n*%u)+%u], _mm_castps_pd(c%u_%u));\n", (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z], i_k, i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_sparse_csc_asparse_innerloop_four_vector( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_k, const unsigned int i_z, const unsigned int* i_row_idx, const unsigned int* i_column_idx ) { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; if ( LIBXSMM_GEMM_PRECISION_F64 == 
LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { unsigned int l_i; unsigned int l_z = i_z; l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m256d c%u_%u = _mm256_loadu_pd(&C[(l_n*%u)+%u]);\n", i_k, i_z, (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z] ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m256d a%u_%u = _mm256_loadu_pd(&A[%u]);\n", i_k, i_z, i_column_idx[i_k] + i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm256_add_pd(c%u_%u, _mm256_mul_pd(a%u_%u, b%u));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " _mm256_storeu_pd(&C[(l_n*%u)+%u], c%u_%u);\n", (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z], i_k, i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); for ( l_i = 0; l_i < 2; l_i++ ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128d c%u_%u = _mm_loadu_pd(&C[(l_n*%u)+%u]);\n", i_k, l_z, (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + l_z] ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = 
LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128d a%u_%u = _mm_loadu_pd(&A[%u]);\n", i_k, l_z, i_column_idx[i_k] + l_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_pd(c%u_%u, _mm_mul_pd(a%u_%u, b%u));\n", i_k, l_z, i_k, l_z, i_k, l_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " _mm_storeu_pd(&C[(l_n*%u)+%u], c%u_%u);\n", (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + l_z], i_k, l_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_z += 2; } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128 c%u_%u = _mm_loadu_ps(&C[(l_n*%u)+%u]);\n", i_k, i_z, (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z] ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128 a%u_%u = _mm_loadu_ps(&A[%u]);\n", i_k, i_z, i_column_idx[i_k] + i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_ps(c%u_%u, _mm_mul_ps(a%u_%u, b%u));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " _mm_storeu_ps(&C[(l_n*%u)+%u], c%u_%u);\n", (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z], i_k, i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_asparse( libxsmm_generated_code* 
io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ) { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; unsigned int l_k; unsigned int l_flop_count = 0; LIBXSMM_UNUSED(i_arch); LIBXSMM_UNUSED(i_values); /* loop over columns in C in generated code, we fully unroll inside each column */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_n = 0;\n #pragma nounroll_and_jam\n for ( l_n = 0; l_n < %u; l_n++) {\n", (unsigned int)i_xgemm_desc->n); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* reset the current column in C if needed */ if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_m = 0;\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); if ( i_xgemm_desc->m > 1 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) {\n C[(l_n*%u)+l_m] = 0.0;\n }\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) {\n C[(l_n*%u)+l_m] = 0.0f;\n }\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } assert(0 != i_column_idx); /* loop over columns in A, rows in B and fully unroll */ for ( l_k = 0; l_k < (unsigned int)i_xgemm_desc->k; l_k++ ) { unsigned int l_column_elements = i_column_idx[l_k + 1] - i_column_idx[l_k]; unsigned int l_z = 
0; l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) || defined(__AVX__)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); if ( l_column_elements > 0 ) { if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n __m256d b%u = _mm256_broadcast_sd(&B[(l_n*%u)+%u]);\n#endif\n", l_k, (unsigned int)i_xgemm_desc->ldb, l_k); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n __m128d b%u = _mm_loaddup_pd(&B[(l_n*%u)+%u]);\n#endif\n", l_k, (unsigned int)i_xgemm_desc->ldb, l_k); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n __m128 b%u = _mm_broadcast_ss(&B[(l_n*%u)+%u]);\n#endif\n", l_k, (unsigned int)i_xgemm_desc->ldb, l_k); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n __m128 b%u = _mm_load_ss(&B[(l_n*%u)+%u]); b%u = _mm_shuffle_ps(b%u, b%u, 0x00);\n#endif\n", l_k, (unsigned int)i_xgemm_desc->ldb, l_k, l_k, l_k, l_k); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } /* loop over the columns of A and look for vectorization potential */ for ( l_z = 0; l_z < l_column_elements; l_z++ ) { assert(0 != i_row_idx); /* 4 element vector might be possible */ if ( (l_z < (l_column_elements - 3)) && (l_column_elements > 3) ) { /* check for 256bit vector instruction */ if ((i_row_idx[i_column_idx[l_k] + l_z] + 1 == i_row_idx[i_column_idx[l_k] + l_z + 1]) && (i_row_idx[i_column_idx[l_k] + l_z] + 2 == i_row_idx[i_column_idx[l_k] 
+ l_z + 2]) && (i_row_idx[i_column_idx[l_k] + l_z] + 3 == i_row_idx[i_column_idx[l_k] + l_z + 3]) && (i_row_idx[i_column_idx[l_k] + l_z + 3] < (unsigned int)i_xgemm_desc->m)) { libxsmm_sparse_csc_asparse_innerloop_four_vector(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx); l_z += 3; /* check for 128bit vector instruction */ } else if ((i_row_idx[i_column_idx[l_k] + l_z] + 1 == i_row_idx[i_column_idx[l_k] + l_z + 1]) && (i_row_idx[i_column_idx[l_k] + l_z + 1] < (unsigned int)i_xgemm_desc->m) ) { libxsmm_sparse_csc_asparse_innerloop_two_vector(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx); l_z++; /* scalar instruction */ } else { if ( (i_row_idx[i_column_idx[l_k] + l_z] < (unsigned int)i_xgemm_desc->m) ) { libxsmm_sparse_csc_asparse_innerloop_scalar(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx); } } /* 2 element vector might be possible */ } else if ( (l_z < (l_column_elements - 1)) && (l_column_elements > 1)) { /* check for 128bit vector instruction */ if ((i_row_idx[i_column_idx[l_k] + l_z] + 1 == i_row_idx[i_column_idx[l_k] + l_z + 1]) && (i_row_idx[i_column_idx[l_k] + l_z + 1] < (unsigned int)i_xgemm_desc->m) ) { libxsmm_sparse_csc_asparse_innerloop_two_vector(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx); l_z++; /* scalar instruction */ } else { if ( (i_row_idx[i_column_idx[l_k] + l_z] < (unsigned int)i_xgemm_desc->m) ) { libxsmm_sparse_csc_asparse_innerloop_scalar(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx); } } /* scalar anyways */ } else { if ( (i_row_idx[i_column_idx[l_k] + l_z] < (unsigned int)i_xgemm_desc->m) ) { libxsmm_sparse_csc_asparse_innerloop_scalar(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx); } } } /* C fallback code */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#else\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loop over the columns of A */ for ( l_z = 
0; l_z < l_column_elements; l_z++ ) { if ( (i_row_idx[i_column_idx[l_k] + l_z] < (unsigned int)i_xgemm_desc->m) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " C[(l_n*%u)+%u] += A[%u] * B[(l_n*%u)+%u];\n", (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[l_k] + l_z], i_column_idx[l_k] + l_z, (unsigned int)i_xgemm_desc->ldb, l_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_flop_count += 2; } } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* add flop counter */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n#ifndef NDEBUG\n#ifdef _OPENMP\n#pragma omp atomic\n#endif\nlibxsmm_num_total_flops += %u;\n#endif\n", l_flop_count * i_xgemm_desc->n); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } libxsmm-1.17/src/generator_spgemm_csc_asparse.h000066400000000000000000000063251415223013700217650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_SPGEMM_CSC_ASPARSE_H #define GENERATOR_SPGEMM_CSC_ASPARSE_H #include LIBXSMM_API_INTERN void libxsmm_sparse_csc_asparse_innerloop_scalar( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_k, const unsigned int i_z, const unsigned int* i_row_idx, const unsigned int* i_column_idx ); LIBXSMM_API_INTERN void libxsmm_sparse_csc_asparse_innerloop_two_vector( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_k, const unsigned int i_z, const unsigned int* i_row_idx, const unsigned int* i_column_idx ); LIBXSMM_API_INTERN void libxsmm_sparse_csc_asparse_innerloop_four_vector( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_k, const unsigned int i_z, const unsigned int* i_row_idx, const unsigned int* i_column_idx ); /* @TODO change int based architecture value */ LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_asparse( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ); #endif /* GENERATOR_SPGEMM_CSC_ASPARSE_H */ libxsmm-1.17/src/generator_spgemm_csc_bsparse.c000066400000000000000000000214261415223013700217600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ /** * @file * This file is part of GemmCodeGenerator. * * @author Alexander Heinecke (alexander.heinecke AT mytum.de, http://www5.in.tum.de/wiki/index.php/Alexander_Heinecke,_M.Sc.,_M.Sc._with_honors) * * @section LICENSE * Copyright (c) 2012-2014, Technische Universitaet Muenchen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * @section DESCRIPTION * */ #include "generator_spgemm_csc_bsparse.h" #include "generator_common.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_bsparse( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ) { unsigned int l_n; unsigned int l_z; unsigned int l_column_elements; unsigned int l_flop_count = 0; char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; LIBXSMM_UNUSED(i_values); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_m = 0;\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* reset C if beta is zero */ if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_n = 0;\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_n = 0; l_n < %u; l_n++) {\n", (unsigned int)i_xgemm_desc->n); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); if ( i_xgemm_desc->m > 1 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma vector aligned\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0; }\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; 
l_m++) { C[(l_n*%u)+l_m] = 0.0f; }\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* determine the correct simd pragma for each architecture */ if ( ( strcmp( i_arch, "noarch" ) == 0 ) || ( strcmp( i_arch, "wsm" ) == 0 ) || ( strcmp( i_arch, "snb" ) == 0 ) || ( strcmp( i_arch, "hsw" ) == 0 ) ) { if ( i_xgemm_desc->m > 7 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(8)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else if ( i_xgemm_desc->m > 3 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(4)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else if ( i_xgemm_desc->m > 1 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(2)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else {} if ( (i_xgemm_desc->m > 1) && ((LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0) && ((LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma vector aligned\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } else if ( ( strcmp( i_arch, "knl" ) == 0 ) || ( strcmp( i_arch, "skx" ) == 0 ) || ( strcmp( i_arch, "clx" ) == 0 ) || ( strcmp( i_arch, "cpx" ) == 0 ) ) { if ( (i_xgemm_desc->m > 1) && ((LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0) && ((LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0) ) { l_code_length 
= LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(32)\n #pragma vector aligned\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); return; } /* generate the actual kernel */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) {\n", (unsigned int)i_xgemm_desc->m); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); for ( l_n = 0; l_n < (unsigned int)i_xgemm_desc->n; l_n++ ) { l_column_elements = i_column_idx[l_n+1] - i_column_idx[l_n]; for ( l_z = 0; l_z < l_column_elements; l_z++ ) { /* check k such that we just use rows which actually need to be multiplied */ if ( i_row_idx[i_column_idx[l_n] + l_z] < (unsigned int)i_xgemm_desc->k ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " C[%u+l_m] += A[%u+l_m] * B[%u];\n", l_n * i_xgemm_desc->ldc, i_row_idx[i_column_idx[l_n] + l_z]*i_xgemm_desc->lda, i_column_idx[l_n] + l_z); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_flop_count += 2; } } } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* add flop counter */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n#ifndef NDEBUG\n#ifdef _OPENMP\n#pragma omp atomic\n#endif\nlibxsmm_num_total_flops += %u;\n#endif\n", l_flop_count * (unsigned int)i_xgemm_desc->m); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } libxsmm-1.17/src/generator_spgemm_csc_bsparse.h000066400000000000000000000027451415223013700217700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. 
* * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_SPGEMM_CSC_BSPARSE_H #define GENERATOR_SPGEMM_CSC_BSPARSE_H #include /* @TODO change int based architecture value */ LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_bsparse( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ); #endif /* GENERATOR_SPGEMM_CSC_BSPARSE_H */ libxsmm-1.17/src/generator_spgemm_csc_bsparse_soa.c000066400000000000000000001176751415223013700226360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/
#include "generator_spgemm_csc_bsparse_soa.h"
#include "generator_gemm_common.h"
#include "generator_x86_instructions.h"
#include "generator_common.h"
#include "libxsmm_main.h"

/* Entry point for the packed (SOA) CSC B-sparse generator: maps the i_arch
 * string onto io_generated_code->arch and forwards to the AVX/AVX2/AVX512
 * implementation below; any other architecture string aborts the process
 * (fprintf + exit(-1)). */
LIBXSMM_API_INTERN
void libxsmm_generator_spgemm_csc_bsparse_soa( libxsmm_generated_code*         io_generated_code,
                                               const libxsmm_gemm_descriptor*  i_xgemm_desc,
                                               const char*                     i_arch,
                                               const unsigned int*             i_row_idx,
                                               const unsigned int*             i_column_idx,
                                               const void*                     i_values,
                                               const unsigned int              i_packed_width ) {
  if ( strcmp(i_arch, "knl") == 0 ||
       strcmp(i_arch, "knm") == 0 ||
       strcmp(i_arch, "skx") == 0 ||
       strcmp(i_arch, "clx") == 0 ||
       strcmp(i_arch, "cpx") == 0 ||
       strcmp(i_arch, "hsw") == 0 ||
       strcmp(i_arch, "snb") == 0 ) {
    /* translate the architecture string into the numeric arch id */
    if ( strcmp(i_arch, "snb") == 0 ) {
      io_generated_code->arch = LIBXSMM_X86_AVX;
    } else if ( strcmp(i_arch, "hsw") == 0 ) {
      io_generated_code->arch = LIBXSMM_X86_AVX2;
    } else if ( strcmp(i_arch, "knl") == 0 ) {
      io_generated_code->arch = LIBXSMM_X86_AVX512_MIC;
    } else if ( strcmp(i_arch, "knm") == 0 ) {
      io_generated_code->arch = LIBXSMM_X86_AVX512_KNM;
    } else if ( strcmp(i_arch, "skx") == 0 ) {
      io_generated_code->arch = LIBXSMM_X86_AVX512_CORE;
    } else if ( strcmp(i_arch, "clx") == 0 ) {
      io_generated_code->arch = LIBXSMM_X86_AVX512_CLX;
    } else if ( strcmp(i_arch, "cpx") == 0 ) {
      io_generated_code->arch = LIBXSMM_X86_AVX512_CPX;
    } else {
      /* cannot happen */
    }
    libxsmm_generator_spgemm_csc_bsparse_soa_avx256_512( io_generated_code, i_xgemm_desc, i_row_idx, i_column_idx, i_values, i_packed_width );
  } else {
    fprintf( stderr, "CSC + SOA is only available for AVX/AVX2/AVX512 at this point\n" );
    exit(-1);
  }
}

/* AVX(2)/AVX512 driver: chooses the SIMD width from the datatype and arch,
 * derives a 2D (packed x N-columns) register blocking, sets up the GP register
 * mapping and then emits an m-loop whose body is produced by the kloop helper
 * below for every (packed-range, column-range) tile. */
LIBXSMM_API_INTERN
void libxsmm_generator_spgemm_csc_bsparse_soa_avx256_512( libxsmm_generated_code*         io_generated_code,
                                                          const libxsmm_gemm_descriptor*  i_xgemm_desc,
                                                          const unsigned int*             i_row_idx,
                                                          const unsigned int*             i_column_idx,
                                                          const void*                     i_values,
                                                          const unsigned int              i_packed_width ) {
  unsigned int l_n = 0;
  unsigned int l_max_cols = 0;
  unsigned int l_max_reg_block = 0;
  unsigned int l_simd_packed_remainder = 0;
  unsigned int l_simd_packed_iters = 0;
  unsigned int l_simd_packed_iters_full = 0;
  unsigned int l_simd_packed_width = 0;
  unsigned int l_packed_done = 0;
  unsigned int l_packed_count = 0;
  unsigned int l_packed_reg_block[2] = {0,0};
  unsigned int l_packed_reg_range[2] = {0,0};
  unsigned int l_col_reg_block[2][2] = { {0,0}, {0,0} };
  unsigned int l_col_reg_range[2][2] = { {0,0}, {0,0} };

  libxsmm_micro_kernel_config l_micro_kernel_config;
  libxsmm_loop_label_tracker l_loop_label_tracker;
  libxsmm_gp_reg_mapping l_gp_reg_mapping;

  /* select simd packing width and accumulator blocking */
  if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
    if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) {
      l_simd_packed_width = 8;      /* 8 doubles per zmm */
      l_max_reg_block = 28;
    } else {
      l_simd_packed_width = 4;      /* 4 doubles per ymm */
      l_max_reg_block = 14;
    }
  } else {
    if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) {
      l_simd_packed_width = 16;     /* 16 floats per zmm */
      l_max_reg_block = 28;
    } else {
      l_simd_packed_width = 8;      /* 8 floats per ymm */
      l_max_reg_block = 14;
    }
  }
  l_simd_packed_remainder = i_packed_width % l_simd_packed_width;
  l_simd_packed_iters_full = i_packed_width/l_simd_packed_width;
  l_simd_packed_iters = ( l_simd_packed_remainder > 0 ) ? l_simd_packed_iters_full+1 : l_simd_packed_iters_full;

  /* get max column in C */
  l_max_cols = i_xgemm_desc->n;
  for ( l_n = 0; l_n < i_xgemm_desc->n; l_n++ ) {
    if ( i_column_idx[l_n] == i_column_idx[i_xgemm_desc->n] ) {
      l_max_cols = l_n+1;
    }
  }

  /* when we have remainder on lower than AVX512 we need one spare register for a mask */
  if ( ( io_generated_code->arch < LIBXSMM_X86_AVX512 ) && ( l_simd_packed_remainder != 0 ) ) {
    l_max_reg_block = 13;
  }

#if 0
  printf("packed parameters: %u, %u, %u, %u, %u\n", i_packed_width, l_simd_packed_remainder, l_simd_packed_iters, l_simd_packed_iters_full, l_simd_packed_width );
#endif
  /* packed blocking */
  /* @TODO for 2^x for l_simd_packed iters we might want to do something else */
  libxsmm_compute_equalized_blocking( l_simd_packed_iters, l_max_reg_block, &(l_packed_reg_range[0]), &(l_packed_reg_block[0]), &(l_packed_reg_range[1]), &(l_packed_reg_block[1]) );
#if 0
  printf("packed blocking (range0, block0, range1, block1): %u %u %u %u\n", l_packed_reg_range[0], l_packed_reg_block[0], l_packed_reg_range[1], l_packed_reg_block[1]);
#endif

  /* adjust max reg_blocking to allow for 2d blocking */
  if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) {
    if ( l_packed_reg_block[0] == 2 ) {
      l_max_reg_block = 20;
    }
    if ( l_packed_reg_block[0] == 4 ) {
      l_max_reg_block = 24;
    }
  }

  /* N blocking for packed blocking */
  libxsmm_compute_equalized_blocking( l_max_cols, l_max_reg_block/l_packed_reg_block[0], &(l_col_reg_range[0][0]), &(l_col_reg_block[0][0]), &(l_col_reg_range[0][1]), &(l_col_reg_block[0][1]) );
  if ( l_packed_reg_block[1] != 0 ) {
    libxsmm_compute_equalized_blocking( l_max_cols, l_max_reg_block/l_packed_reg_block[1], &(l_col_reg_range[1][0]), &(l_col_reg_block[1][0]), &(l_col_reg_range[1][1]), &(l_col_reg_block[1][1]) );
  }
#if 0
  printf("n blocking 0 (range0, block0, range1, block1): %u %u %u %u\n", l_col_reg_range[0][0], l_col_reg_block[0][0], l_col_reg_range[0][1], l_col_reg_block[0][1]);
  printf("n blocking 1 (range0, block0, range1, block1): %u %u %u %u\n", l_col_reg_range[1][0], l_col_reg_block[1][0], l_col_reg_range[1][1], l_col_reg_block[1][1]);
#endif

  /* define gp register mapping */
  libxsmm_reset_x86_gp_reg_mapping( &l_gp_reg_mapping );
  /* matching calling convention on Linux */
#if defined(_WIN32) || defined(__CYGWIN__)
  l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX;
  l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX;
  l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_R8;
  /* TODO: full support for Windows calling convention */
  l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_UNDEF;
#else /* match calling convention on Linux */
  l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI;
  l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI;
  l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX;
  l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX;
  l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8;
#endif
  l_gp_reg_mapping.gp_reg_mloop = LIBXSMM_X86_GP_REG_R12;
  l_gp_reg_mapping.gp_reg_nloop = LIBXSMM_X86_GP_REG_R13;
  l_gp_reg_mapping.gp_reg_kloop = LIBXSMM_X86_GP_REG_R14;
  l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_R15;
  l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_R11;
  l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF;

  /* define loop_label_tracker */
  libxsmm_reset_loop_label_tracker( &l_loop_label_tracker );

  /* define the micro kernel code gen properties */
  libxsmm_generator_gemm_init_micro_kernel_config_fullvector( &l_micro_kernel_config, io_generated_code->arch, i_xgemm_desc, 0 );

  /* open asm */
  libxsmm_x86_instruction_open_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch );

  /* m loop */
  libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker );
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_mloop, 1 );

  /* loop over packed blocks */
  while ( l_packed_done != l_simd_packed_iters ) {
    unsigned int l_packed_blocking = l_packed_reg_block[l_packed_count];
    unsigned int l_packed_remainder = 0;
    unsigned int l_n_done = 0;
    unsigned int l_n_count = 0;
    unsigned int l_n_processed = 0;

    /* the SIMD remainder is handled by whichever packed range comes last */
    if ( (l_simd_packed_remainder != 0) && (l_packed_count == 0) ) {
      if ( l_packed_reg_block[1] > 0 ) {
        l_packed_remainder = 0;
      } else {
        l_packed_remainder = l_simd_packed_remainder;
      }
    } else if (l_simd_packed_remainder != 0) {
      l_packed_remainder = l_simd_packed_remainder;
    }

    while ( l_n_done < l_max_cols ) {
      unsigned int l_n_blocking = l_col_reg_block[l_packed_count][l_n_count];
      for ( l_n_processed = l_n_done; l_n_processed < l_n_done + l_col_reg_range[l_packed_count][l_n_count]; l_n_processed += l_n_blocking ) {
        libxsmm_generator_spgemm_csc_bsparse_soa_avx256_512_kloop( io_generated_code,
                                                                   &l_loop_label_tracker,
                                                                   &l_gp_reg_mapping,
                                                                   &l_micro_kernel_config,
                                                                   i_xgemm_desc,
                                                                   i_row_idx,
                                                                   i_column_idx,
                                                                   i_values,
                                                                   l_n_processed,
                                                                   l_n_processed + l_n_blocking,
                                                                   l_packed_done,
                                                                   l_packed_done + l_packed_reg_range[l_packed_count],
                                                                   l_packed_blocking,
                                                                   l_packed_remainder,
                                                                   i_packed_width );
      }
      l_n_done += l_col_reg_range[l_packed_count][l_n_count];
      l_n_count++;
    }

    /* advance to the next packed range */
    l_packed_done += l_packed_reg_range[l_packed_count];
    l_packed_count++;
  }

  /* advance C pointer */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_c, l_micro_kernel_config.datatype_size*i_packed_width*i_xgemm_desc->ldc);

  /* advance A pointer */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_a, l_micro_kernel_config.datatype_size*i_packed_width*i_xgemm_desc->lda);

  /* close m loop */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_mloop, i_xgemm_desc->m );
  libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker );

  /* close asm */
  libxsmm_x86_instruction_close_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch );
}

/* Emits the inner kernel for one (column-range x packed-range) tile:
 * loads/zeros the C accumulators, walks k over the rows of A and, for every k
 * that has matching non-zeros in the CSC structure of B, emits multiply-adds
 * (V4FMADDPS "qmadd" on KNM/F32 when four consecutive k rows hit the same
 * column), then stores the accumulators back. On pre-AVX512 targets with a
 * packed remainder, ymm15 holds a VMASKMOV mask and the usable accumulator
 * range shrinks accordingly (l_avx_max_reg). */
LIBXSMM_API_INTERN
void libxsmm_generator_spgemm_csc_bsparse_soa_avx256_512_kloop( libxsmm_generated_code*            io_generated_code,
                                                                libxsmm_loop_label_tracker*        io_loop_label_tracker,
                                                                const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                                                const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                                                const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                                                const unsigned int*                i_row_idx,
                                                                const unsigned int*                i_column_idx,
                                                                const void*                        i_values,
                                                                const unsigned int                 i_n_processed,
                                                                const unsigned int                 i_n_limit,
                                                                const unsigned int                 i_packed_processed,
                                                                const unsigned int                 i_packed_range,
                                                                const unsigned int                 i_packed_blocking,
                                                                const unsigned int                 i_packed_remainder,
                                                                const unsigned int                 i_packed_width ) {
  unsigned int l_n = 0;
  unsigned int l_p = 0;
  unsigned int l_k = 0;
  unsigned int l_found_mul = 0;
  unsigned int l_max_reg_block = (i_n_limit - i_n_processed) * i_packed_blocking;
  unsigned int l_n_blocking = i_n_limit - i_n_processed;
  unsigned int l_avx_mask_instr = 0;

  LIBXSMM_UNUSED(i_values);
  LIBXSMM_ASSERT( i_packed_blocking > 0 );

  /* packed loop */
  if ( i_packed_range/i_packed_blocking > 1 ) {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_help_0, 0 );
    libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_help_0, 1 );
  }

  /* set up a load/store mask if the packed remainder is non-zero */
  if ( i_packed_remainder != 0 ) {
    /* on AVX512 we can use mask registers */
    if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) {
      libxsmm_generator_gemm_initialize_avx512_mask( io_generated_code, i_gp_reg_mapping->gp_reg_help_1, i_xgemm_desc, i_micro_kernel_config->vector_length-i_packed_remainder );
    } else {
      /* pre-AVX512: materialize an all-ones/zeros lane mask in ymm15 for VMASKMOV */
      char l_id = (char)l_n_blocking;
      unsigned char l_data[32];
      unsigned int l_count;

      if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
        unsigned long long* l_i64_ptr = (unsigned long long*)l_data;
        for ( l_count = 0; l_count < 4; ++l_count ) {
          if ( l_count < i_packed_remainder ) {
            l_i64_ptr[l_count] = 0xffffffffffffffff;
          } else {
            l_i64_ptr[l_count] = 0x0;
          }
        }
        l_avx_mask_instr = LIBXSMM_X86_INSTR_VMASKMOVPD;
      } else {
        unsigned int* l_i32_ptr = (unsigned int*)l_data;
        for ( l_count = 0; l_count < 8; ++l_count ) {
          if ( l_count < i_packed_remainder ) {
            l_i32_ptr[l_count] = 0xffffffff;
          } else {
            l_i32_ptr[l_count] = 0x0;
          }
        }
        l_avx_mask_instr = LIBXSMM_X86_INSTR_VMASKMOVPS;
      }
      libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, l_data, &l_id, 'y', 15 );
    }
  }

  /* load C accumulator */
  for ( l_n = 0; l_n < l_n_blocking; l_n++ ) {
    for ( l_p = 0; l_p < i_packed_blocking; l_p++ ) {
      if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0: zero the accumulator instead of loading C */
        libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                 i_micro_kernel_config->instruction_set,
                                                 i_micro_kernel_config->vxor_instruction,
                                                 i_micro_kernel_config->vector_name,
                                                 (l_n*i_packed_blocking) + l_p, (l_n*i_packed_blocking) + l_p, (l_n*i_packed_blocking) + l_p );
      } else {
        if ( (l_p == i_packed_blocking-1) && (i_packed_remainder != 0) ) {
          /* last packed block of a remainder range: masked load */
          if ( l_avx_mask_instr > 0 ) {
            libxsmm_x86_instruction_vec_mask_move( io_generated_code,
                                                   l_avx_mask_instr,
                                                   i_gp_reg_mapping->gp_reg_c,
                                                   LIBXSMM_X86_GP_REG_UNDEF, 0,
                                                   ( (i_n_processed + l_n)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ),
                                                   i_micro_kernel_config->vector_name,
                                                   (l_n*i_packed_blocking) + l_p, 15, 0);
          } else {
            libxsmm_x86_instruction_vec_move( io_generated_code,
                                              i_micro_kernel_config->instruction_set,
                                              i_micro_kernel_config->c_vmove_instruction,
                                              i_gp_reg_mapping->gp_reg_c,
                                              LIBXSMM_X86_GP_REG_UNDEF, 0,
                                              ( (i_n_processed + l_n)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ),
                                              i_micro_kernel_config->vector_name,
                                              (l_n*i_packed_blocking) + l_p, 1, 1, 0 );
          }
        } else {
          libxsmm_x86_instruction_vec_move( io_generated_code,
                                            i_micro_kernel_config->instruction_set,
                                            i_micro_kernel_config->c_vmove_instruction,
                                            i_gp_reg_mapping->gp_reg_c,
                                            LIBXSMM_X86_GP_REG_UNDEF, 0,
                                            ( (i_n_processed + l_n)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ),
                                            i_micro_kernel_config->vector_name,
                                            (l_n*i_packed_blocking) + l_p, 0, 1, 0 );
        }
      }
    }
  }

  /* do dense soa times sparse multiplication */
  for ( l_k = 0; l_k < (unsigned int)i_xgemm_desc->k; l_k++ ) {
    unsigned int l_found_qmadd = 0;
    unsigned int l_col_k = 0;
    unsigned int l_column_active[28] = {0};
    int l_nnz_idx[28][4] = { {0}, {0} };

    /* reset helpers */
    for ( l_n = 0; l_n < l_n_blocking; l_n++ ) {
      l_column_active[l_n] = 0;
      l_nnz_idx[l_n][0] = -1; l_nnz_idx[l_n][1] = -1; l_nnz_idx[l_n][2] = -1; l_nnz_idx[l_n][3] = -1;
    }
    l_found_mul = 0;

    /* let's figure out if we can apply qmadd when being in F32 setting and on KNM */
    if ( (l_k < ((unsigned int)i_xgemm_desc->k - 3)) && (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_KNM) && (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
      /* loop over the columns of B/C */
      for ( l_n = 0; l_n < l_n_blocking; l_n++ ) {
        unsigned int l_found = 0;
        unsigned int l_acol_k = 0;
        unsigned int l_col_elements = i_column_idx[i_n_processed+l_n+1] - i_column_idx[i_n_processed+l_n];
        unsigned int l_cur_column = i_column_idx[i_n_processed+l_n];

        /* collect up to 4 consecutive k hits (rows l_k .. l_k+3) in this column */
        for ( l_col_k = 0; l_col_k < l_col_elements; l_col_k++ ) {
          for ( l_acol_k = l_found; l_acol_k < 4; l_acol_k++ ) {
            if ( (l_k + l_acol_k) == i_row_idx[l_cur_column + l_col_k] ) {
              l_nnz_idx[l_n][l_acol_k] = l_cur_column + l_col_k;
              l_found = l_acol_k+1;
            }
            if (l_found == 4) {
              l_col_k = l_col_elements;
            }
          }
        }

        /* let's check if we can apply qmadd in col l_n */
        if ( (l_nnz_idx[l_n][0] != -1) && (l_nnz_idx[l_n][1] != -1) && (l_nnz_idx[l_n][2] != -1) && (l_nnz_idx[l_n][3] != -1) ) {
          l_column_active[l_n] = 2;
          l_found_qmadd = 1;
          l_found_mul = 1;
        } else {
          /* let's check if we have at least one entry in the column that matches one of the four entries */
          if ( (l_nnz_idx[l_n][0] != -1) || (l_nnz_idx[l_n][1] != -1) || (l_nnz_idx[l_n][2] != -1) || (l_nnz_idx[l_n][3] != -1) ) {
            l_column_active[l_n] = 1;
            l_found_mul = 1;
          } else {
            l_column_active[l_n] = 0;
          }
        }
      }
    }

    if ( l_found_qmadd == 0 ) {
      /* loop over the columns of B/C */
      for ( l_n = 0; l_n < l_n_blocking; l_n++ ) {
        unsigned int l_col_elements = i_column_idx[i_n_processed+l_n+1] - i_column_idx[i_n_processed+l_n];
        unsigned int l_cur_column = i_column_idx[i_n_processed+l_n];
        /* search for entries matching that k */
        for ( l_col_k = 0; l_col_k < l_col_elements; l_col_k++ ) {
          if ( l_k == i_row_idx[l_cur_column + l_col_k] ) {
            l_nnz_idx[l_n][0] = l_cur_column + l_col_k;
            l_col_k = l_col_elements;
          }
        }
        /* let's check if we have an entry in the column that matches the k from A */
        if ( (l_nnz_idx[l_n][0] != -1) ) {
          l_column_active[l_n] = 1;
          l_found_mul = 1;
        } else {
          l_column_active[l_n] = 0;
        }
      }
    }

    /* First case: we can use qmadd */
    if ( l_found_qmadd != 0 ) {
      unsigned int l_lcl_k = 0;
      for ( l_p = 0; l_p < i_packed_blocking; l_p ++ ) {
        /* load 4 consecutive A vectors into the scratch registers above the accumulators */
        for ( l_lcl_k = 0; l_lcl_k < 4; l_lcl_k++ ) {
          if ( (l_p == i_packed_blocking-1) && (i_packed_remainder != 0) ) {
            libxsmm_x86_instruction_vec_move( io_generated_code,
                                              i_micro_kernel_config->instruction_set,
                                              i_micro_kernel_config->a_vmove_instruction,
                                              i_gp_reg_mapping->gp_reg_a,
                                              LIBXSMM_X86_GP_REG_UNDEF, 0,
                                              ( (l_k+l_lcl_k)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ),
                                              i_micro_kernel_config->vector_name,
                                              l_max_reg_block+l_lcl_k, 1, 1, 0 );
          } else {
            libxsmm_x86_instruction_vec_move( io_generated_code,
                                              i_micro_kernel_config->instruction_set,
                                              i_micro_kernel_config->a_vmove_instruction,
                                              i_gp_reg_mapping->gp_reg_a,
                                              LIBXSMM_X86_GP_REG_UNDEF, 0,
                                              ( (l_k+l_lcl_k)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ),
                                              i_micro_kernel_config->vector_name,
                                              l_max_reg_block+l_lcl_k, 0, 1, 0 );
          }
        }
        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < i_n_limit - i_n_processed; l_n++ ) {
          /* issue a qmadd */
          if ( l_column_active[l_n] == 2 ) {
            libxsmm_x86_instruction_vec_compute_qfma( io_generated_code,
                                                      i_micro_kernel_config->instruction_set,
                                                      LIBXSMM_X86_INSTR_V4FMADDPS,
                                                      i_gp_reg_mapping->gp_reg_b,
                                                      LIBXSMM_X86_GP_REG_UNDEF, 0,
                                                      l_nnz_idx[l_n][0] * i_micro_kernel_config->datatype_size,
                                                      i_micro_kernel_config->vector_name,
                                                      l_max_reg_block,
                                                      (l_n*i_packed_blocking) + l_p );
          } else if ( l_column_active[l_n] == 1 ) {
            for ( l_lcl_k = 0; l_lcl_k < 4; l_lcl_k++ ) {
              if ( l_nnz_idx[l_n][l_lcl_k] != -1 ) {
                libxsmm_x86_instruction_vec_compute_mem( io_generated_code,
                                                         i_micro_kernel_config->instruction_set,
                                                         i_micro_kernel_config->vmul_instruction,
                                                         1,
                                                         i_gp_reg_mapping->gp_reg_b,
                                                         LIBXSMM_X86_GP_REG_UNDEF, 0,
                                                         l_nnz_idx[l_n][l_lcl_k] * i_micro_kernel_config->datatype_size,
                                                         i_micro_kernel_config->vector_name,
                                                         l_max_reg_block+l_lcl_k,
                                                         (l_n*i_packed_blocking) + l_p );
              }
            }
          }
        }
      }
      /* increment by additional 3 columns */
      l_k += 3;
    } else if ( l_found_mul != 0 ) {
      /* preload B only when enough vector registers remain beyond the accumulators */
      unsigned int l_preload_b = ( (i_packed_blocking > 1) &&
                                   ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) &&
                                   ( (1 + l_n_blocking + (i_packed_blocking * l_n_blocking)) < i_micro_kernel_config->vector_reg_count ) ) ? 1 : 0;
      /* ymm15 is occupied by the VMASKMOV mask when a remainder is handled */
      unsigned int l_avx_max_reg = ( l_avx_mask_instr > 0 ) ? 14 : 15;

      if ( l_preload_b ) {
        for ( l_n = 0; l_n < l_n_blocking; l_n++ ) {
          if ( l_nnz_idx[l_n][0] != -1 ) {
            libxsmm_x86_instruction_vec_move( io_generated_code,
                                              i_micro_kernel_config->instruction_set,
                                              i_micro_kernel_config->b_vmove_instruction,
                                              i_gp_reg_mapping->gp_reg_b,
                                              LIBXSMM_X86_GP_REG_UNDEF, 0,
                                              l_nnz_idx[l_n][0] * i_micro_kernel_config->datatype_size,
                                              i_micro_kernel_config->vector_name,
                                              l_max_reg_block + l_n + 1, 0, 1, 0 );
          }
        }
      }

      for ( l_p = 0; l_p < i_packed_blocking; l_p++ ) {
        /* load the A vector for this packed block (masked on the remainder block) */
        if ( (l_p == i_packed_blocking-1) && (i_packed_remainder != 0) ) {
          if ( l_avx_mask_instr > 0 ) {
            libxsmm_x86_instruction_vec_mask_move( io_generated_code,
                                                   l_avx_mask_instr,
                                                   i_gp_reg_mapping->gp_reg_a,
                                                   LIBXSMM_X86_GP_REG_UNDEF, 0,
                                                   (l_k*i_packed_width*i_micro_kernel_config->datatype_size) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ),
                                                   i_micro_kernel_config->vector_name,
                                                   l_max_reg_block, 15, 0 );
          } else {
            libxsmm_x86_instruction_vec_move( io_generated_code,
                                              i_micro_kernel_config->instruction_set,
                                              i_micro_kernel_config->a_vmove_instruction,
                                              i_gp_reg_mapping->gp_reg_a,
                                              LIBXSMM_X86_GP_REG_UNDEF, 0,
                                              (l_k*i_packed_width*i_micro_kernel_config->datatype_size) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ),
                                              i_micro_kernel_config->vector_name,
                                              l_max_reg_block, 1, 1, 0 );
          }
        } else {
          libxsmm_x86_instruction_vec_move( io_generated_code,
                                            i_micro_kernel_config->instruction_set,
                                            i_micro_kernel_config->a_vmove_instruction,
                                            i_gp_reg_mapping->gp_reg_a,
                                            LIBXSMM_X86_GP_REG_UNDEF, 0,
                                            (l_k*i_packed_width*i_micro_kernel_config->datatype_size) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ),
                                            i_micro_kernel_config->vector_name,
                                            l_max_reg_block, 0, 1, 0 );
        }

        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_blocking; l_n++ ) {
          if ( l_nnz_idx[l_n][0] != -1 ) {
            if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) {
              if ( l_preload_b ) {
                libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                         i_micro_kernel_config->instruction_set,
                                                         i_micro_kernel_config->vmul_instruction,
                                                         i_micro_kernel_config->vector_name,
                                                         l_max_reg_block,
                                                         l_max_reg_block + l_n + 1,
                                                         (l_n*i_packed_blocking) + l_p );
              } else {
                libxsmm_x86_instruction_vec_compute_mem( io_generated_code,
                                                         i_micro_kernel_config->instruction_set,
                                                         i_micro_kernel_config->vmul_instruction,
                                                         1,
                                                         i_gp_reg_mapping->gp_reg_b,
                                                         LIBXSMM_X86_GP_REG_UNDEF, 0,
                                                         l_nnz_idx[l_n][0] * i_micro_kernel_config->datatype_size,
                                                         i_micro_kernel_config->vector_name,
                                                         l_max_reg_block,
                                                         (l_n*i_packed_blocking) + l_p );
              }
            } else if ( io_generated_code->arch == LIBXSMM_X86_AVX2 ) {
              /* AVX2: load B element, then FMA into the accumulator */
              libxsmm_x86_instruction_vec_move( io_generated_code,
                                                i_micro_kernel_config->instruction_set,
                                                i_micro_kernel_config->b_vmove_instruction,
                                                i_gp_reg_mapping->gp_reg_b,
                                                LIBXSMM_X86_GP_REG_UNDEF, 0,
                                                l_nnz_idx[l_n][0] * i_micro_kernel_config->datatype_size,
                                                i_micro_kernel_config->vector_name,
                                                l_avx_max_reg, 0, 1, 0 );
              libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                       i_micro_kernel_config->instruction_set,
                                                       i_micro_kernel_config->vmul_instruction,
                                                       i_micro_kernel_config->vector_name,
                                                       l_max_reg_block,
                                                       l_avx_max_reg,
                                                       (l_n*i_packed_blocking) + l_p );
            } else if ( io_generated_code->arch == LIBXSMM_X86_AVX ) {
              /* AVX (no FMA): separate multiply then add */
              libxsmm_x86_instruction_vec_move( io_generated_code,
                                                i_micro_kernel_config->instruction_set,
                                                i_micro_kernel_config->b_vmove_instruction,
                                                i_gp_reg_mapping->gp_reg_b,
                                                LIBXSMM_X86_GP_REG_UNDEF, 0,
                                                l_nnz_idx[l_n][0] * i_micro_kernel_config->datatype_size,
                                                i_micro_kernel_config->vector_name,
                                                l_avx_max_reg, 0, 1, 0 );
              libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                       i_micro_kernel_config->instruction_set,
                                                       i_micro_kernel_config->vmul_instruction,
                                                       i_micro_kernel_config->vector_name,
                                                       l_max_reg_block,
                                                       l_avx_max_reg,
                                                       l_avx_max_reg );
              libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                       i_micro_kernel_config->instruction_set,
                                                       i_micro_kernel_config->vadd_instruction,
                                                       i_micro_kernel_config->vector_name,
                                                       l_avx_max_reg,
                                                       (l_n*i_packed_blocking) + l_p,
                                                       (l_n*i_packed_blocking) + l_p );
            } else {
            }
          }
        }
      }
    } else {
      /* shouldn't happen */
    }
  }

  /* store C accumulator */
  for ( l_n = 0; l_n < l_n_blocking; l_n++ ) {
    for ( l_p = 0; l_p < i_packed_blocking; l_p++ ) {
      if ( (l_p == i_packed_blocking-1) && (i_packed_remainder != 0) ) {
        if ( l_avx_mask_instr > 0 ) {
          libxsmm_x86_instruction_vec_mask_move( io_generated_code,
                                                 l_avx_mask_instr,
                                                 i_gp_reg_mapping->gp_reg_c,
                                                 LIBXSMM_X86_GP_REG_UNDEF, 0,
                                                 ( (i_n_processed + l_n)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ),
                                                 i_micro_kernel_config->vector_name,
                                                 (l_n*i_packed_blocking) + l_p, 15, 1);
        } else {
          libxsmm_x86_instruction_vec_move( io_generated_code,
                                            i_micro_kernel_config->instruction_set,
                                            i_micro_kernel_config->c_vmove_instruction,
                                            i_gp_reg_mapping->gp_reg_c,
                                            LIBXSMM_X86_GP_REG_UNDEF, 0,
                                            ( (i_n_processed + l_n)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ),
                                            i_micro_kernel_config->vector_name,
                                            (l_n*i_packed_blocking) + l_p, 1, 0, 1 );
        }
      } else {
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->c_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_c,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          ( (i_n_processed + l_n)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ),
                                          i_micro_kernel_config->vector_name,
                                          (l_n*i_packed_blocking) + l_p, 0, 0, 1 );
      }
    }
  }

  /* packed loop */
  if ( i_packed_range/i_packed_blocking > 1 ) {
    /* advance A and C pointer */
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, i_packed_blocking*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c, i_packed_blocking*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size );
    /* packed loop footer */
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_help_0, i_packed_range/i_packed_blocking );
    libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker );
    /* reset A and C pointer */
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_a, (i_packed_range/i_packed_blocking)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_c, (i_packed_range/i_packed_blocking)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size );
  }
}
libxsmm-1.17/src/generator_spgemm_csc_bsparse_soa.h000066400000000000000000000077501415223013700226310ustar00rootroot00000000000000/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.
* This file is part of the LIBXSMM library.
*
* For information on the license, see the LICENSE file.
* Further information: https://github.com/hfp/libxsmm/
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
/* Alexander Heinecke (Intel Corp.)
******************************************************************************/ #ifndef GENERATOR_SPGEMM_CSC_BSPARSE_SOA_H #define GENERATOR_SPGEMM_CSC_BSPARSE_SOA_H #include #include "generator_common.h" /* @TODO change int based architecture value */ LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_bsparse_soa( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ); LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_bsparse_soa_avx256_512( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ); LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_bsparse_soa_avx256_512_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_n_processed, const unsigned int i_n_limit, const unsigned int i_packed_processed, const unsigned int i_packed_range, const unsigned int i_packed_blocking, const unsigned int i_packed_remainder, const unsigned int i_packed_width ); #endif /* GENERATOR_SPGEMM_CSC_BSPARSE_SOA_H */ libxsmm-1.17/src/generator_spgemm_csc_csparse_soa.c000066400000000000000000001330761415223013700226300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "generator_spgemm_csc_csparse_soa.h" #include "generator_gemm_common.h" #include "generator_x86_instructions.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_csparse_soa_axv256_512_single( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const unsigned int i_packed_width, const unsigned int i_n, const unsigned int i_m ) { /* compute packed loop trip count */ #if 0 unsigned int l_simd_packed_remainder = 0; #endif unsigned int l_simd_packed_iters = 0; unsigned int l_simd_packed_width = 0; /* select simd packing width */ if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_simd_packed_width = 8; } else { l_simd_packed_width = 4; } } else { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_simd_packed_width = 16; } else { l_simd_packed_width = 8; } } #if 0 l_simd_packed_remainder = i_packed_width % l_simd_packed_width; #endif l_simd_packed_iters = i_packed_width/l_simd_packed_width; /* set c accumulator to 0 */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vxor_instruction, i_micro_kernel_config->vector_name, 31, 31, 31 ); /* k loop header */ if ( i_xgemm_desc->k > 1 ) { 
libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_kloop, 0 ); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_kloop, 1 ); } /* packed loop header */ if ( l_simd_packed_iters > 1 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_help_1, 0 ); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_help_1, 1 ); } /* load b */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, i_micro_kernel_config->datatype_size*i_packed_width*i_n, i_micro_kernel_config->vector_name, 0, 0, 1, 0 ); /* FMA with fused load of a */ libxsmm_x86_instruction_vec_compute_mem( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VFMADD231PS, 0, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, i_micro_kernel_config->datatype_size*i_packed_width*i_row_idx[i_column_idx[i_n]+i_m], i_micro_kernel_config->vector_name, 0, 31 ); /* packed loop footer */ if ( l_simd_packed_iters > 1 ) { /* advance a and b pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, i_micro_kernel_config->datatype_size*l_simd_packed_width ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_micro_kernel_config->datatype_size*l_simd_packed_width ); /* check loop bound */ libxsmm_x86_instruction_alu_imm( io_generated_code, 
i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_help_1, l_simd_packed_iters ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); /* re-set a and b pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_a, l_simd_packed_iters*i_micro_kernel_config->datatype_size*l_simd_packed_width ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_b, l_simd_packed_iters*i_micro_kernel_config->datatype_size*l_simd_packed_width ); } /* k loop footer */ if ( i_xgemm_desc->k > 1 ) { /* advance a and b pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->lda ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->ldb ); /* close k loop */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_kloop, i_xgemm_desc->k ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); /* re-set a and b pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_a, i_xgemm_desc->k*i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->lda ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_b, i_xgemm_desc->k*i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->ldb ); } /* reduce C */ /* zmm31; 0000 0000 0000 0000 -> ---- ---- 0000 0000 */ 
libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 31, 31, 0, 0x4e ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 31, 0, 31 ); /* zmm31: ---- ---- 0000 0000 -> ---- ---- ---- 0000 */ libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 31, 31, 0, 0xb1 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 31, 0, 15 ); /* ymm15; ---- 0000 -> ---- --00 */ libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, 'y', 15, 15, 0, 0x4e ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, 'y', 15, 0, 15 ); /* ymm15; ---- --00 -> ---- ---0 */ libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, 'y', 15, 15, 0, 0x1 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, 'y', 15, 0, 15 ); /* update sparse C */ if ( 0 == (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags) ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMOVSS, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, i_micro_kernel_config->datatype_size*(i_column_idx[i_n]+i_m), 'x', 0, 0, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDSS, 'x', 15, 0, 15 ); } libxsmm_x86_instruction_vec_move( io_generated_code, 
i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMOVSS, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, i_micro_kernel_config->datatype_size*(i_column_idx[i_n]+i_m), 'x', 15, 0, 1, 1 );
}

/**
 * Emits AVX-512 code that computes up to 16 sparse C elements (one column of the
 * CSC pattern, rows i_m .. i_m+i_m_blocking-1) at once: each element is a dot
 * product over the packed (SOA) dimension, kept in its own accumulator register
 * zmm0..zmm15, with b broadcast-loaded into zmm31 and a fused from memory.
 * NOTE(review): "axv" in the function name looks like a historical typo for
 * "avx"; it is part of the internal interface, so it is kept as-is.
 *
 * @param io_generated_code      code buffer the instructions are appended to
 * @param io_loop_label_tracker  tracker for the k/packed loop jump labels
 * @param i_gp_reg_mapping       GP-register assignment (a/b/c pointers, loop counters, helpers)
 * @param i_micro_kernel_config  per-arch instruction/vector configuration
 * @param i_xgemm_desc           GEMM shape descriptor (k, lda, ldb, flags)
 * @param i_row_idx              CSC row indices of C's nonzeros
 * @param i_column_idx           CSC column pointers of C's nonzeros
 * @param i_packed_width         packed (SOA) width of A and B
 * @param i_n                    current column of C
 * @param i_m                    first element of the current column to compute (multiple of 16 at call sites)
 * @param i_m_blocking           number of elements computed here (1..16)
 */
LIBXSMM_API_INTERN
void libxsmm_generator_spgemm_csc_csparse_soa_axv256_512_16accs( libxsmm_generated_code*            io_generated_code,
                                                                 libxsmm_loop_label_tracker*        io_loop_label_tracker,
                                                                 const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                                                 const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                                                 const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                                                 const unsigned int*                i_row_idx,
                                                                 const unsigned int*                i_column_idx,
                                                                 const unsigned int                 i_packed_width,
                                                                 const unsigned int                 i_n,
                                                                 const unsigned int                 i_m,
                                                                 const unsigned int                 i_m_blocking ) {
  /* some helper variables */
  unsigned int l_i, l_max_m, l_mask_reg, l_mask_val;
  /* compute packed loop trip count */
#if 0
  unsigned int l_simd_packed_remainder = 0;
#endif
  unsigned int l_simd_packed_iters = 0;
  unsigned int l_simd_packed_width = 0;

  /* select simd packing width */
  if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
    if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) {
      l_simd_packed_width = 8;
    } else {
      l_simd_packed_width = 4;
    }
  } else {
    if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) {
      l_simd_packed_width = 16;
    } else {
      l_simd_packed_width = 8;
    }
  }
#if 0
  l_simd_packed_remainder = i_packed_width % l_simd_packed_width;
#endif
  l_simd_packed_iters = i_packed_width/l_simd_packed_width;

  /* we only generate for AVX512 for now, max m is 16; max_m is used for init and reduction */
  l_max_m = 16;
  l_mask_reg = 1;

  /* load mask register: low i_m_blocking bits set, used to mask the final C load/store */
  l_mask_val = 0xffff >> (16-i_m_blocking);
  libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_help_0, l_mask_val );
  libxsmm_x86_instruction_mask_move( io_generated_code, LIBXSMM_X86_INSTR_KMOVW,
i_gp_reg_mapping->gp_reg_help_0, l_mask_reg, 0 );

  /* set c accumulator to 0 (zmm0..zmm15 via self-xor) */
  for ( l_i = 0; l_i < l_max_m; ++l_i ) {
    libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vxor_instruction, i_micro_kernel_config->vector_name, l_i, l_i, l_i );
  }

  /* k loop header */
  if ( i_xgemm_desc->k > 1 ) {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_kloop, 0 );
    libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_kloop, 1 );
  }

  /* packed loop header */
  if ( l_simd_packed_iters > 1 ) {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_help_1, 0 );
    libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_help_1, 1 );
  }

  /* load b (one packed vector of column i_n) into zmm31 */
  libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMOVUPS,
                                    i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    i_micro_kernel_config->datatype_size*i_packed_width*i_n,
                                    i_micro_kernel_config->vector_name, 31, 0, 1, 0 );

  /* FMA with fused load of a: accumulator register is l_i%16, one per element of the column */
  for ( l_i = i_m; l_i < (i_m + i_m_blocking); ++l_i ) {
    libxsmm_x86_instruction_vec_compute_mem( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VFMADD231PS, 0,
                                             i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0,
                                             i_micro_kernel_config->datatype_size*i_packed_width*i_row_idx[i_column_idx[i_n]+l_i],
                                             i_micro_kernel_config->vector_name, 31, l_i%16 );
  }

  /* packed loop footer */
  if ( l_simd_packed_iters > 1 ) {
    /* advance a and b pointer */
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, i_micro_kernel_config->datatype_size*l_simd_packed_width );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_micro_kernel_config->datatype_size*l_simd_packed_width );
    /* check loop bound */
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_help_1, l_simd_packed_iters );
    libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker );
    /* re-set a and b pointer */
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_a, l_simd_packed_iters*i_micro_kernel_config->datatype_size*l_simd_packed_width );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_b, l_simd_packed_iters*i_micro_kernel_config->datatype_size*l_simd_packed_width );
  }

  /* k loop footer */
  if ( i_xgemm_desc->k > 1 ) {
    /* advance a and b pointer */
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->lda );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->ldb );
    /* close k loop */
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_kloop, i_xgemm_desc->k );
    libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker );
    /* re-set a and b pointer */
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction,
i_gp_reg_mapping->gp_reg_a, i_xgemm_desc->k*i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->lda );
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_b, i_xgemm_desc->k*i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->ldb );
  }

  /* reduce C: pairwise 4-stage tree reduction of the 16 accumulators into zmm0,
   * so that zmm0 finally holds the 16 scalar results in lane order 0..15 */
  /* 1st stage */
  /* zmm0/zmm4; 4444 4444 4444 4444 / 0000 0000 0000 0000 -> zmm0: 4444 4444 0000 0000 */
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 4, 0, 16, 0x44 );
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 4, 0, 17, 0xee );
  libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 0 );
  if ( i_m_blocking > 7 ) {
    /* zmm8/zmm12; cccc cccc cccc cccc / 8888 8888 8888 8888 -> zmm8: cccc cccc 8888 8888 */
    libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 12, 8, 16, 0x44 );
    libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 12, 8, 17, 0xee );
    libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 8 );
  }
  /* zmm1/zmm5; 5555 5555 5555 5555 / 1111 1111 1111 1111 -> zmm1: 5555 5555 1111 1111 */
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 5, 1, 16, 0x44 );
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code,
i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 5, 1, 17, 0xee );
  libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 1 );
  if ( i_m_blocking > 8 ) {
    /* zmm9/zmm13; dddd dddd dddd dddd / 9999 9999 9999 9999 -> zmm9: dddd dddd 9999 9999 */
    libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 13, 9, 16, 0x44 );
    libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 13, 9, 17, 0xee );
    libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 9 );
  }
  /* zmm2/zmm6; 6666 6666 6666 6666 / 2222 2222 2222 2222 -> zmm2: 6666 6666 2222 2222 */
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 6, 2, 16, 0x44 );
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 6, 2, 17, 0xee );
  libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 2 );
  if ( i_m_blocking > 9 ) {
    /* zmm10/zmm14; eeee eeee eeee eeee / aaaa aaaa aaaa aaaa -> zmm10: eeee eeee aaaa aaaa */
    libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 14, 10, 16, 0x44 );
    libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set,
LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 14, 10, 17, 0xee );
    libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 10 );
  }
  /* zmm3/zmm7; 7777 7777 7777 7777 / 3333 3333 3333 3333 -> zmm3: 7777 7777 3333 3333 */
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 7, 3, 16, 0x44 );
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 7, 3, 17, 0xee );
  libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 3 );
  if ( i_m_blocking > 10 ) {
    /* zmm11/zmm15; ffff ffff ffff ffff / bbbb bbbb bbbb bbbb -> zmm11: ffff ffff bbbb bbbb */
    libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 15, 11, 16, 0x44 );
    libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 15, 11, 17, 0xee );
    libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 11 );
  }
  /* 2nd stage */
  /* zmm0/zmm8; 4444 4444 0000 0000 / cccc cccc 8888 8888 -> zmm0: cccc 8888 4444 0000 */
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 8, 0, 16, 0x88 );
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2,
i_micro_kernel_config->vector_name, 8, 0, 17, 0xdd );
  libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 0 );
  /* zmm1/zmm9; 5555 5555 1111 1111 / dddd dddd 9999 9999 -> zmm1: dddd 9999 5555 1111 */
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 9, 1, 16, 0x88 );
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 9, 1, 17, 0xdd );
  libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 1 );
  /* zmm2/zmm10; 6666 6666 2222 2222 / eeee eeee aaaa aaaa -> zmm2: eeee aaaa 6666 2222 */
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 10, 2, 16, 0x88 );
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 10, 2, 17, 0xdd );
  libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 2 );
  /* zmm3/zmm11: 7777 7777 3333 3333 / ffff ffff bbbb bbbb -> zmm3: ffff bbbb 7777 3333 */
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 11, 3, 16, 0x88 );
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFF64X2, i_micro_kernel_config->vector_name, 11, 3, 17, 0xdd );
  libxsmm_x86_instruction_vec_compute_reg(
io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 3 );
  /* 3rd stage */
  /* zmm0/zmm1; cccc 8888 4444 0000 / dddd 9999 5555 1111 -> zmm0: ddcc 9988 5544 1100 */
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 1, 0, 16, 0x44 );
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 1, 0, 17, 0xee );
  libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 0 );
  /* zmm2/zmm3; eeee aaaa 6666 2222 / ffff bbbb 7777 3333 -> zmm2: ffee bbaa 7766 3322 */
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 3, 2, 16, 0x44 );
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 3, 2, 17, 0xee );
  libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 16, 17, 2 );
  /* 4th stage */
  /* zmm0/zmm2; ddcc 9988 5544 1100 / ffee bbaa 7766 3322 -> zmm0: fedc ba98 7654 3210 */
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 2, 0, 16, 0x88 );
  libxsmm_x86_instruction_vec_shuffle_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VSHUFPS, i_micro_kernel_config->vector_name, 2, 0, 17, 0xdd );
  libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS,
i_micro_kernel_config->vector_name, 16, 17, 0 );

  /* update sparse C: masked read-modify-write of i_m_blocking contiguous values (beta=1),
   * or masked store only (beta=0) */
  if ( 0 == (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags) ) {
    libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMOVUPS,
                                      i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0,
                                      i_micro_kernel_config->datatype_size*(i_column_idx[i_n]+i_m),
                                      i_micro_kernel_config->vector_name, 1, l_mask_reg, 1, 0 );
    libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VADDPS, i_micro_kernel_config->vector_name, 0, 1, 1 );
  }
  libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, LIBXSMM_X86_INSTR_VMOVUPS,
                                    i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    i_micro_kernel_config->datatype_size*(i_column_idx[i_n]+i_m),
                                    i_micro_kernel_config->vector_name, 1, l_mask_reg, 0, 1 );
}

/**
 * Public entry for the CSC+SOA "C sparse" generator: maps the architecture
 * string onto io_generated_code->arch and dispatches to the AVX-512 code path.
 * NOTE(review): the error message below mentions AVX/AVX2, but the dispatch
 * only accepts AVX-512 arch strings (knl/knm/skx/clx/cpx); the message text is
 * runtime-visible and therefore left unchanged here.
 */
LIBXSMM_API_INTERN
void libxsmm_generator_spgemm_csc_csparse_soa( libxsmm_generated_code*        io_generated_code,
                                               const libxsmm_gemm_descriptor* i_xgemm_desc,
                                               const char*                    i_arch,
                                               const unsigned int*            i_row_idx,
                                               const unsigned int*            i_column_idx,
                                               const void*                    i_values,
                                               const unsigned int             i_packed_width ) {
  if ( strcmp(i_arch, "knl") == 0 ||
       strcmp(i_arch, "knm") == 0 ||
       strcmp(i_arch, "skx") == 0 ||
       strcmp(i_arch, "clx") == 0 ||
       strcmp(i_arch, "cpx") == 0 ) {
    if ( strcmp(i_arch, "knl") == 0 ) {
      io_generated_code->arch = LIBXSMM_X86_AVX512_MIC;
    } else if ( strcmp(i_arch, "knm") == 0 ) {
      io_generated_code->arch = LIBXSMM_X86_AVX512_KNM;
    } else if ( strcmp(i_arch, "skx") == 0 ) {
      io_generated_code->arch = LIBXSMM_X86_AVX512_CORE;
    } else if ( strcmp(i_arch, "clx") == 0 ) {
      io_generated_code->arch = LIBXSMM_X86_AVX512_CLX;
    } else if ( strcmp(i_arch, "cpx") == 0 ) {
      io_generated_code->arch = LIBXSMM_X86_AVX512_CPX;
    } else {
      /* cannot happen */
    }
    libxsmm_generator_spgemm_csc_csparse_soa_avx256_512( io_generated_code, i_xgemm_desc, i_row_idx, i_column_idx, i_values, i_packed_width );
  } else {
    fprintf( stderr, "CSC + SOA is only available for AVX/AVX2/AVX512 at this point\n" );
    exit(-1);
  }
}

/**
 * Driver for the CSC+SOA generator: validates precision/packed width, sets up
 * the GP-register mapping and micro-kernel config, opens the asm stream and,
 * per nonzero column of C, emits either the 16-accumulator kernel (columns
 * with more than two entries, 16 entries at a time plus remainder) or the
 * single-element kernel.
 *
 * @param io_generated_code code buffer the kernel is appended to
 * @param i_xgemm_desc      GEMM shape descriptor (m, n, k, ld*, flags, prefetch)
 * @param i_row_idx         CSC row indices of C's nonzeros
 * @param i_column_idx      CSC column pointers of C's nonzeros (n+1 entries)
 * @param i_values          unused (pattern only)
 * @param i_packed_width    packed (SOA) width; must be a multiple of the SIMD width
 */
LIBXSMM_API_INTERN
void libxsmm_generator_spgemm_csc_csparse_soa_avx256_512( libxsmm_generated_code*        io_generated_code,
                                                          const libxsmm_gemm_descriptor* i_xgemm_desc,
                                                          const unsigned int*            i_row_idx,
                                                          const unsigned int*            i_column_idx,
                                                          const void*                    i_values,
                                                          const unsigned int             i_packed_width ) {
  unsigned int l_n = 0;
  unsigned int l_m = 0;
  libxsmm_micro_kernel_config l_micro_kernel_config;
  libxsmm_loop_label_tracker l_loop_label_tracker;
  libxsmm_gp_reg_mapping l_gp_reg_mapping;

  LIBXSMM_UNUSED(i_values);

  /* select soa width */
  if ( LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
    if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) {
      if ( i_packed_width % 16 != 0 ) {
        LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH_PREC );
        return;
      }
    } else {
      if ( i_packed_width % 8 != 0 ) {
        LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH_PREC );
        return;
      }
    }
  } else {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH_PREC );
    return;
  }

  /* @TODO: we need to check this...
     however LIBXSMM descriptor setup disables A^T hard */
#if 0
  /* we need to have the A^T flag set */
  if ( (i_xgemm_desc->flags & LIBXSMM_GEMM_FLAG_TRANS_A) == 0 ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_INVALID_GEMM_CONFIG );
    return;
  }
#endif

  /*define gp register mapping */
  libxsmm_reset_x86_gp_reg_mapping( &l_gp_reg_mapping );
  /* matching calling convention on Linux */
#if defined(_WIN32) || defined(__CYGWIN__)
  l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX;
  l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX;
  l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_R8;
  /* TODO: full support for Windows calling convention */
  l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_UNDEF;
#else /* match calling convention on Linux */
  l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI;
  l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI;
  l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX;
  l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX;
  l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8;
#endif
  l_gp_reg_mapping.gp_reg_mloop = LIBXSMM_X86_GP_REG_R12;
  l_gp_reg_mapping.gp_reg_nloop = LIBXSMM_X86_GP_REG_R13;
  l_gp_reg_mapping.gp_reg_kloop = LIBXSMM_X86_GP_REG_R14;
  l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_R15;
  l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_R11;
  l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF;

  /* define loop_label_tracker */
  libxsmm_reset_loop_label_tracker( &l_loop_label_tracker );

  /* define the micro kernel code gen properties */
  libxsmm_generator_gemm_init_micro_kernel_config_fullvector( &l_micro_kernel_config, io_generated_code->arch, i_xgemm_desc, 0 );

  /* open asm */
  libxsmm_x86_instruction_open_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch );

  /* loop over the sparse elements of C */
  for ( l_n = 0; l_n < (unsigned int)i_xgemm_desc->n; l_n++ ) {
    unsigned int l_col_elements = i_column_idx[l_n+1] - i_column_idx[l_n];
    if ( l_col_elements > 2 ) {
      /* full 16-element chunks, then the remainder with a smaller blocking */
      for ( l_m = 0; l_m < (l_col_elements/16)*16; l_m+=16 ) {
        libxsmm_generator_spgemm_csc_csparse_soa_axv256_512_16accs( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc,
                                                                    i_row_idx, i_column_idx, i_packed_width, l_n, l_m, 16 );
      }
      if ( l_col_elements % 16 != 0 ) {
        libxsmm_generator_spgemm_csc_csparse_soa_axv256_512_16accs( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc,
                                                                    i_row_idx, i_column_idx, i_packed_width, l_n, l_m, l_col_elements%16 );
      }
    } else {
      for ( l_m = 0; l_m < l_col_elements; ++l_m ) {
        libxsmm_generator_spgemm_csc_csparse_soa_axv256_512_single( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc,
                                                                    i_row_idx, i_column_idx, i_packed_width, l_n, l_m );
      }
    }
  }

  /* close asm */
  libxsmm_x86_instruction_close_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch );
}
libxsmm-1.17/src/generator_spgemm_csc_csparse_soa.h000066400000000000000000000112421415223013700226230ustar00rootroot00000000000000/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/hfp/libxsmm/                        *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/* Alexander Heinecke (Intel Corp.)
******************************************************************************/ #ifndef GENERATOR_SPGEMM_CSC_CSPARSE_SOA_H #define GENERATOR_SPGEMM_CSC_CSPARSE_SOA_H #include "generator_common.h" #include /* @TODO change int based architecture value */ LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_csparse_soa( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ); LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_csparse_soa_avx256_512( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ); LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_csparse_soa_axv256_512_single( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const unsigned int i_packed_width, const unsigned int i_n, const unsigned int i_m ); LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_csparse_soa_axv256_512_16accs( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const unsigned int i_packed_width, const unsigned int i_n, const unsigned int i_m, const unsigned int i_m_blocking ); #endif /* GENERATOR_SPGEMM_CSC_CSPARSE_SOA_H */ 
libxsmm-1.17/src/generator_spgemm_csc_reader.c000066400000000000000000000204251415223013700215610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ /** * @file * This file is part of GemmCodeGenerator. * * @author Alexander Heinecke (alexander.heinecke AT mytum.de, http://www5.in.tum.de/wiki/index.php/Alexander_Heinecke,_M.Sc.,_M.Sc._with_honors) * * @section LICENSE * Copyright (c) 2012-2014, Technische Universitaet Muenchen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * @section DESCRIPTION * */ #include "generator_common.h" #include "generator_spgemm_csc_reader.h" LIBXSMM_API_INTERN void libxsmm_sparse_csc_reader( libxsmm_generated_code* io_generated_code, const char* i_csc_file_in, unsigned int** o_row_idx, unsigned int** o_column_idx, double** o_values, unsigned int* io_row_count, unsigned int* io_column_count, unsigned int* o_element_count ) { FILE *l_csc_file_handle; const unsigned int l_line_length = 512; char l_line[512/*l_line_length*/+1]; unsigned int l_header_read = 0; unsigned int* l_column_idx_id = NULL; unsigned int l_i = 0; l_csc_file_handle = fopen( i_csc_file_in, "r" ); if ( l_csc_file_handle == NULL ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSC_INPUT ); return; } while (fgets(l_line, l_line_length, l_csc_file_handle) != NULL) { if ( strlen(l_line) == l_line_length ) { free(*o_row_idx); free(*o_column_idx); free(*o_values); free(l_column_idx_id); *io_row_count = *io_column_count = *o_element_count = 0; *o_row_idx = *o_column_idx = NULL; *o_values = NULL; fclose( l_csc_file_handle ); /* close mtx file */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSC_READ_LEN ); return; } /* check if we are still reading comments header */ if ( l_line[0] == '%' ) { continue; } else { /* if we are the first line after comment header, we allocate our data structures */ if ( l_header_read == 0 ) { unsigned int row_count, column_count; if (3 == sscanf(l_line, "%u %u %u", &row_count, 
&column_count, o_element_count) && 0 != row_count && 0 != column_count && 0 != *o_element_count) { *io_column_count = LIBXSMM_MAX(*io_column_count, column_count); *io_row_count = LIBXSMM_MAX(*io_row_count, row_count); /* allocate CSC data structure matching mtx file, and set everything to zero for init */ /* coverity[tainted_data] */ *o_row_idx = (unsigned int*)calloc(*o_element_count, sizeof(unsigned int)); /* coverity[tainted_data] */ *o_column_idx = (unsigned int*)calloc((size_t)*io_column_count + 1, sizeof(unsigned int)); /* coverity[tainted_data] */ *o_values = (double*)calloc(*o_element_count, sizeof(double)); /* coverity[tainted_data] */ l_column_idx_id = (unsigned int*)calloc(*io_column_count, sizeof(unsigned int)); /* check if mallocs were successful */ if ( ( *o_row_idx == NULL ) || ( *o_column_idx == NULL ) || ( *o_values == NULL ) || ( l_column_idx_id == NULL ) ) { free(*o_row_idx); free(*o_column_idx); free(*o_values); free(l_column_idx_id); *io_row_count = *io_column_count = *o_element_count = 0; *o_row_idx = *o_column_idx = NULL; *o_values = NULL; fclose(l_csc_file_handle); /* close mtx file */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSC_ALLOC_DATA ); return; } /* init column idx */ /* coverity[tainted_data] */ for (l_i = 0; l_i <= *io_column_count; ++l_i) { (*o_column_idx)[l_i] = *o_element_count; } /* init */ (*o_column_idx)[0] = 0; l_i = 0; l_header_read = 1; } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSC_READ_DESC ); fclose( l_csc_file_handle ); /* close mtx file */ return; } /* now we read the actual content */ } else { unsigned int l_row = 0, l_column = 0; double l_value = 0; /* read a line of content */ if ( sscanf(l_line, "%u %u %lf", &l_row, &l_column, &l_value) != 3 || l_row > *io_row_count || l_column > *io_column_count || l_i >= *o_element_count ) { free(*o_row_idx); free(*o_column_idx); free(*o_values); free(l_column_idx_id); *io_row_count = *io_column_count = *o_element_count = 0; *o_row_idx = 
*o_column_idx = NULL; *o_values = NULL; fclose(l_csc_file_handle); /* close mtx file */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSC_READ_ELEMS ); return; } /* adjust numbers to zero termination */ LIBXSMM_ASSERT(0 != l_row && 0 != l_column); l_row--; l_column--; /* add these values to row and value structure */ (*o_row_idx)[l_i] = l_row; (*o_values)[l_i] = l_value; l_i++; /* handle columns, set id to own for this column, yeah we need to handle empty columns */ /* coverity[tainted_data] */ l_column_idx_id[l_column] = 1; (*o_column_idx)[l_column+1] = l_i; } } } /* close mtx file */ fclose( l_csc_file_handle ); /* check if we read a file which was consistent */ if ( l_i != (*o_element_count) ) { free(*o_row_idx); free(*o_column_idx); free(*o_values); free(l_column_idx_id); *io_row_count = *io_column_count = *o_element_count = 0; *o_row_idx = *o_column_idx = NULL; *o_values = NULL; LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSC_LEN ); return; } if ( l_column_idx_id != NULL ) { /* let's handle empty columns */ for ( l_i = 0; l_i < (*io_column_count); l_i++) { if ( l_column_idx_id[l_i] == 0 ) { (*o_column_idx)[l_i+1] = (*o_column_idx)[l_i]; } } /* free helper data structure */ free( l_column_idx_id ); } } libxsmm-1.17/src/generator_spgemm_csc_reader.h000066400000000000000000000027651415223013700215750ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_SPGEMM_CSC_READER_H #define GENERATOR_SPGEMM_CSC_READER_H #include #include LIBXSMM_API_INTERN void libxsmm_sparse_csc_reader( libxsmm_generated_code* io_generated_code, const char* i_csc_file_in, unsigned int** o_row_idx, unsigned int** o_column_idx, double** o_values, unsigned int* io_row_count, unsigned int* io_column_count, unsigned int* o_element_count ); #endif /* GENERATOR_SPGEMM_CSC_READER_H */ libxsmm-1.17/src/generator_spgemm_csr_asparse.c000066400000000000000000000154571415223013700220050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "generator_spgemm_csr_asparse.h" #include "generator_common.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_asparse( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ) { unsigned int l_m; unsigned int l_z; unsigned int l_row_elements; unsigned int l_flop_count = 0; char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; LIBXSMM_UNUSED(i_values); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_n = 0;\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* reset C if beta is zero */ if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_m = 0;\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) {\n", (unsigned int)i_xgemm_desc->m); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); if ( i_xgemm_desc->m > 1 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma vector aligned\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_n = 0; l_n < %u; l_n++) { C[(l_m*%u)+l_n] = 0.0; }\n", (unsigned int)i_xgemm_desc->ldc, (unsigned int)i_xgemm_desc->ldc); } else { l_code_length = 
LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_n = 0; l_n < %u; l_n++) { C[(l_m*%u)+l_n] = 0.0f; }\n", (unsigned int)i_xgemm_desc->ldc, (unsigned int)i_xgemm_desc->ldc); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* determine the correct simd pragma for each architecture */ if ( ( strcmp( i_arch, "noarch" ) == 0 ) || ( strcmp( i_arch, "wsm" ) == 0 ) || ( strcmp( i_arch, "snb" ) == 0 ) || ( strcmp( i_arch, "hsw" ) == 0 ) ) { if ( i_xgemm_desc->n > 7 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(8)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else if ( i_xgemm_desc->n > 3 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(4)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else if ( i_xgemm_desc->n > 1 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(2)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else {} } else if ( ( strcmp( i_arch, "knl" ) == 0 ) || ( strcmp( i_arch, "knm" ) == 0 ) || ( strcmp( i_arch, "skx" ) == 0 ) || ( strcmp( i_arch, "clx" ) == 0 ) || ( strcmp( i_arch, "cpx" ) == 0 ) ) { if ( (i_xgemm_desc->n > 1) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(16)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); return; } if ( (i_xgemm_desc->n > 1) && ((LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 
0) && ((LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma vector aligned\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } /* generate the actual kernel */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_n = 0; l_n < %u; l_n++) {\n", (unsigned int)i_xgemm_desc->n); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); for ( l_m = 0; l_m < (unsigned int)i_xgemm_desc->m; l_m++ ) { l_row_elements = i_row_idx[l_m+1] - i_row_idx[l_m]; for ( l_z = 0; l_z < l_row_elements; l_z++ ) { /* check k such that we just use columns which actually need to be multiplied */ if ( i_column_idx[i_row_idx[l_m] + l_z] < (unsigned int)i_xgemm_desc->k ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " C[%u+l_n] += A[%u] * B[%u+l_n];\n", l_m * i_xgemm_desc->ldc, i_row_idx[l_m] + l_z, i_column_idx[i_row_idx[l_m] + l_z]*i_xgemm_desc->ldb ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_flop_count += 2; } } } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* add flop counter */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n#ifndef NDEBUG\n#ifdef _OPENMP\n#pragma omp atomic\n#endif\nlibxsmm_num_total_flops += %u;\n#endif\n", l_flop_count * (unsigned int)i_xgemm_desc->m); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } libxsmm-1.17/src/generator_spgemm_csr_asparse.h000066400000000000000000000027451415223013700220060ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_SPGEMM_CSR_ASPARSE_H #define GENERATOR_SPGEMM_CSR_ASPARSE_H #include /* @TODO change int based architecture value */ LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_asparse( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ); #endif /* GENERATOR_SPGEMM_CSR_ASPARSE_H */ libxsmm-1.17/src/generator_spgemm_csr_asparse_reg.c000066400000000000000000000575461415223013700226470ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "generator_spgemm_csr_asparse_reg.h" #include "generator_x86_instructions.h" #include "generator_gemm_common.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_mmfunction_signature_asparse_reg( libxsmm_generated_code* io_generated_code, const char* i_routine_name, const libxsmm_gemm_descriptor* i_xgemm_desc ) { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; if ( io_generated_code->code_type > 1 ) { return; } else if ( io_generated_code->code_type == 1 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, ".global %s\n.type %s, @function\n%s:\n", i_routine_name, i_routine_name, i_routine_name); } else { /* selecting the correct signature */ if (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { if (LIBXSMM_GEMM_PREFETCH_NONE == i_xgemm_desc->prefetch) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const float* A, const float* B, float* C) {\n", i_routine_name); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const float* A, const float* B, float* C, const float* A_prefetch, const float* B_prefetch, const float* C_prefetch) {\n", i_routine_name); } } else { if (LIBXSMM_GEMM_PREFETCH_NONE == i_xgemm_desc->prefetch) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const double* A, const double* B, double* C) {\n", i_routine_name); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const double* A, const double* B, double* C, const double* A_prefetch, const double* B_prefetch, const double* C_prefetch) {\n", i_routine_name); } } } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_asparse_reg( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const 
unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ) { unsigned int l_m; unsigned int l_n; unsigned int l_z; unsigned int l_row_elements; unsigned int l_unique; unsigned int l_reg_num; unsigned int l_hit; unsigned int l_n_blocking = 1; unsigned int l_n_row_idx = i_row_idx[i_xgemm_desc->m]; double *const l_unique_values = (double*)(0 != l_n_row_idx ? malloc(sizeof(double) * l_n_row_idx) : NULL); unsigned int *const l_unique_pos = (unsigned int*)(0 != l_n_row_idx ? malloc(sizeof(unsigned int) * l_n_row_idx) : NULL); double l_code_const_dp[8]; float l_code_const_fp[16]; unsigned int l_const_perm_ops[16]; libxsmm_micro_kernel_config l_micro_kernel_config; libxsmm_loop_label_tracker l_loop_label_tracker; libxsmm_gp_reg_mapping l_gp_reg_mapping; /* check if mallocs were successful */ if ( 0 == l_unique_values || 0 == l_unique_pos ) { free(l_unique_values); free(l_unique_pos); LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSR_ALLOC_DATA ); return; } /* check that we build for AVX512 */ if ( (strcmp(i_arch, "knl") != 0) && (strcmp(i_arch, "knm") != 0) && (strcmp(i_arch, "skx") != 0) && (strcmp(i_arch, "clx") != 0) && (strcmp(i_arch, "cpx") != 0) ) { free(l_unique_values); free(l_unique_pos); LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); return; } else { if ( strcmp(i_arch, "knl") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_MIC; } else if ( strcmp(i_arch, "knm") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_KNM; } else if ( strcmp(i_arch, "skx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CORE; } else if ( strcmp(i_arch, "clx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CLX; } else if ( strcmp(i_arch, "cpx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CPX; } else { /* cannot happen */ } } /* prerequisite */ assert(0 != i_values); /* Let's figure out how many unique values we have */ l_unique = 1; l_unique_values[0] = i_values[0]; l_unique_pos[0] = 0; for ( l_m = 1; l_m < 
l_n_row_idx; l_m++ ) { l_hit = 0; /* search for the value */ for ( l_z = 0; l_z < l_unique; l_z++) { if ( /*l_unique_values[l_z] == i_values[l_m]*/!(l_unique_values[l_z] < i_values[l_m]) && !(l_unique_values[l_z] > i_values[l_m]) ) { l_unique_pos[l_m] = l_z; l_hit = 1; } } /* values was not found */ if ( l_hit == 0 ) { l_unique_values[l_unique] = i_values[l_m]; l_unique_pos[l_m] = l_unique; l_unique++; } } /* check that we have enough registers for the datatype */ if ( (LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) && l_unique > LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_L1_DP) || (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) && l_unique > LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_L1_SP) ) { free(l_unique_values); free(l_unique_pos); LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNIQUE_VAL ); return; } /* define gp register mapping */ libxsmm_reset_x86_gp_reg_mapping( &l_gp_reg_mapping ); #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_R8; /* TODO: full support for Windows calling convention */ l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_UNDEF; /* l_gp_reg_mapping.gp_reg_c_prefetch = LIBXSMM_X86_GP_REG_UNDEF;*/ #else /* match calling convention on Linux */ l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8; /* l_gp_reg_mapping.gp_reg_c_prefetch = LIBXSMM_X86_GP_REG_R9;*/ #endif l_gp_reg_mapping.gp_reg_mloop = LIBXSMM_X86_GP_REG_R12; l_gp_reg_mapping.gp_reg_nloop = LIBXSMM_X86_GP_REG_R13; l_gp_reg_mapping.gp_reg_kloop = LIBXSMM_X86_GP_REG_R14; 
l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF; /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* define the micro kernel code gen properties */ libxsmm_generator_gemm_init_micro_kernel_config_fullvector( &l_micro_kernel_config, io_generated_code->arch, i_xgemm_desc, 0 ); /* inner chunk size */ if ( i_xgemm_desc->n != l_micro_kernel_config.vector_length ) { free(l_unique_values); free(l_unique_pos); LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK ); return; } /* open asm */ libxsmm_x86_instruction_open_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch ); /* * load A into registers * pre-broadcast if possible, otherwise load for run-time broadcasting */ if (l_unique <= 31 ) { /* pre-broadcast A values into registers */ for ( l_z = 0; l_z < l_unique; l_z++) { char l_id[65]; LIBXSMM_SNPRINTF(l_id, 64, "%u", l_z); if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { for ( l_m = 0; l_m < 8; l_m++) { l_code_const_dp[l_m] = l_unique_values[l_z]; } libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (unsigned char*)l_code_const_dp, l_id, l_micro_kernel_config.vector_name, l_z ); } else { for ( l_m = 0; l_m < 16; l_m++) { l_code_const_fp[l_m] = (float)l_unique_values[l_z]; } libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, (unsigned char*)l_code_const_fp, l_id, l_micro_kernel_config.vector_name, l_z ); } } } else { /* load packed A into registers */ l_z = 0; l_reg_num = 0; while (l_z < l_unique) { char l_id[65]; LIBXSMM_SNPRINTF(l_id, 64, "%u", l_reg_num); l_m = 0; if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( 
i_xgemm_desc->datatype ) ) { while (l_z < l_unique && l_m < 8) { l_code_const_dp[l_m++] = l_unique_values[l_z++]; } libxsmm_x86_instruction_full_vec_load_of_constants( io_generated_code, (unsigned char*)l_code_const_dp, l_id, l_micro_kernel_config.vector_name, l_reg_num++ ); } else { while (l_z < l_unique && l_m < 16) { l_code_const_fp[l_m++] = (float)l_unique_values[l_z++]; } libxsmm_x86_instruction_full_vec_load_of_constants( io_generated_code, (unsigned char*)l_code_const_fp, l_id, l_micro_kernel_config.vector_name, l_reg_num++ ); } } /* load permute operands into registers if space is available (otherwise they are read from memory) */ if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) && l_unique <= LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_REG_DP ) { for (l_reg_num=LIBXSMM_SPGEMM_ASPARSE_REG_PERM_FIRST_REG_OP_DP; l_reg_numdatatype ) && l_unique <= LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_REG_SP ){ for (l_reg_num=LIBXSMM_SPGEMM_ASPARSE_REG_PERM_FIRST_REG_OP_SP; l_reg_numm; l_m++ ) { l_row_elements = i_row_idx[l_m+1] - i_row_idx[l_m]; if (l_row_elements > 0) { for ( l_n = 0; l_n < l_n_blocking; l_n++ ) { /* load C or reset to 0 depending on beta */ if (0 == (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=1 */ libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.c_vmove_instruction, l_gp_reg_mapping.gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_m*i_xgemm_desc->ldc*l_micro_kernel_config.datatype_size + l_n*l_micro_kernel_config.datatype_size*l_micro_kernel_config.vector_length, l_micro_kernel_config.vector_name, LIBXSMM_SPGEMM_ASPARSE_REG_ACC_REG+l_n, 0, 1, 0 ); } else { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vxor_instruction, l_micro_kernel_config.vector_name, LIBXSMM_SPGEMM_ASPARSE_REG_ACC_REG+l_n, LIBXSMM_SPGEMM_ASPARSE_REG_ACC_REG+l_n, LIBXSMM_SPGEMM_ASPARSE_REG_ACC_REG+l_n ); } /* only 
prefetch if we do temporal stores */ if ((LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT & i_xgemm_desc->flags) == 0) { libxsmm_x86_instruction_prefetch( io_generated_code, LIBXSMM_X86_INSTR_PREFETCHT2, l_gp_reg_mapping.gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_m*i_xgemm_desc->ldc*l_micro_kernel_config.datatype_size + (l_n+1)*l_micro_kernel_config.datatype_size*l_micro_kernel_config.vector_length ); } } } for ( l_z = 0; l_z < l_row_elements; l_z++ ) { /* check k such that we just use columns which actually need to be multiplied */ const unsigned int u = i_row_idx[l_m] + l_z; unsigned int l_unique_reg; LIBXSMM_ASSERT(u < l_n_row_idx); /* broadcast unique element of A if not in pre-broadcast mode */ if (l_unique > 31 ) { if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { /* load permute selector operand if not stored in registers */ if (l_unique > LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_REG_DP) { libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.a_vmove_instruction, l_gp_reg_mapping.gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (l_unique_pos[u] % 8)*64, l_micro_kernel_config.vector_name, LIBXSMM_SPGEMM_ASPARSE_REG_BCAST_REG, 0, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg(io_generated_code, l_micro_kernel_config.instruction_set, LIBXSMM_X86_INSTR_VPERMD, l_micro_kernel_config.vector_name, l_unique_pos[u] / 8, LIBXSMM_SPGEMM_ASPARSE_REG_BCAST_REG, LIBXSMM_SPGEMM_ASPARSE_REG_BCAST_REG); /* permute selector operand already in register */ } else { libxsmm_x86_instruction_vec_compute_reg(io_generated_code, l_micro_kernel_config.instruction_set, LIBXSMM_X86_INSTR_VPERMD, l_micro_kernel_config.vector_name, l_unique_pos[u] / 8, LIBXSMM_SPGEMM_ASPARSE_REG_PERM_FIRST_REG_OP_DP + l_unique_pos[u] % 8, LIBXSMM_SPGEMM_ASPARSE_REG_BCAST_REG); } } else { /* load permute selector operand if not stored in registers */ if (l_unique > LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_REG_SP) { 
libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.a_vmove_instruction, l_gp_reg_mapping.gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (l_unique_pos[u] % 16)*64, l_micro_kernel_config.vector_name, LIBXSMM_SPGEMM_ASPARSE_REG_BCAST_REG, 0, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg(io_generated_code, l_micro_kernel_config.instruction_set, LIBXSMM_X86_INSTR_VPERMD, l_micro_kernel_config.vector_name, l_unique_pos[u] / 16, LIBXSMM_SPGEMM_ASPARSE_REG_BCAST_REG, LIBXSMM_SPGEMM_ASPARSE_REG_BCAST_REG); /* permute selector operand already in register */ } else { libxsmm_x86_instruction_vec_compute_reg(io_generated_code, l_micro_kernel_config.instruction_set, LIBXSMM_X86_INSTR_VPERMD, l_micro_kernel_config.vector_name, l_unique_pos[u] / 16, LIBXSMM_SPGEMM_ASPARSE_REG_PERM_FIRST_REG_OP_SP + l_unique_pos[u] % 16, LIBXSMM_SPGEMM_ASPARSE_REG_BCAST_REG); } } } for ( l_n = 0; l_n < l_n_blocking; l_n++ ) { /* select correct register depending on mode */ l_unique_reg = l_unique > 31 ? 
LIBXSMM_SPGEMM_ASPARSE_REG_BCAST_REG : l_unique_pos[u]; libxsmm_x86_instruction_vec_compute_mem( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vmul_instruction, 0, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, i_column_idx[u]*i_xgemm_desc->ldb*l_micro_kernel_config.datatype_size + l_n*l_micro_kernel_config.datatype_size*l_micro_kernel_config.vector_length, l_micro_kernel_config.vector_name, l_unique_reg, LIBXSMM_SPGEMM_ASPARSE_REG_ACC_REG+l_n ); libxsmm_x86_instruction_prefetch( io_generated_code, LIBXSMM_X86_INSTR_PREFETCHT2, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, i_column_idx[u]*i_xgemm_desc->ldb*l_micro_kernel_config.datatype_size + (l_n+1)*l_micro_kernel_config.datatype_size*l_micro_kernel_config.vector_length ); } } if (l_row_elements > 0) { for ( l_n = 0; l_n < l_n_blocking; l_n++ ) { unsigned int l_store_instruction = 0; if ((LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT & i_xgemm_desc->flags) > 0) { if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { l_store_instruction = LIBXSMM_X86_INSTR_VMOVNTPD; } else { l_store_instruction = LIBXSMM_X86_INSTR_VMOVNTPS; } } else { l_store_instruction = l_micro_kernel_config.c_vmove_instruction; } libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_store_instruction, l_gp_reg_mapping.gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_m*i_xgemm_desc->ldc*l_micro_kernel_config.datatype_size + l_n*l_micro_kernel_config.datatype_size*l_micro_kernel_config.vector_length, l_micro_kernel_config.vector_name, LIBXSMM_SPGEMM_ASPARSE_REG_ACC_REG+l_n, 0, 0, 1 ); } } } /* close n loop */ #if 0 libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n_blocking ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker ); #endif /* close asm */ libxsmm_x86_instruction_close_stream( 
io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch ); free(l_unique_values); free(l_unique_pos); } libxsmm-1.17/src/generator_spgemm_csr_asparse_reg.h000066400000000000000000000044311415223013700226350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_SPGEMM_CSR_ASPARSE_REG_H #define GENERATOR_SPGEMM_CSR_ASPARSE_REG_H #include #define LIBXSMM_SPGEMM_ASPARSE_REG_BCAST_REG 30 #define LIBXSMM_SPGEMM_ASPARSE_REG_ACC_REG 31 #define LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_REG_DP 176 #define LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_REG_SP 224 #define LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_L1_DP 240 #define LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_L1_SP 480 /* first register number to store 8/16 permute operands */ #define LIBXSMM_SPGEMM_ASPARSE_REG_PERM_FIRST_REG_OP_DP 22 #define LIBXSMM_SPGEMM_ASPARSE_REG_PERM_FIRST_REG_OP_SP 14 LIBXSMM_API_INTERN void libxsmm_mmfunction_signature_asparse_reg( libxsmm_generated_code* io_generated_code, const char* i_routine_name, const libxsmm_gemm_descriptor* i_xgemm_desc ); /* @TODO change int based architecture value */ LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_asparse_reg( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ); #endif /* GENERATOR_SPGEMM_CSR_ASPARSE_REG_H */ 
libxsmm-1.17/src/generator_spgemm_csr_asparse_soa.c000066400000000000000000001047741415223013700226500ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "generator_spgemm_csr_asparse_soa.h" #include "generator_x86_instructions.h" #include "generator_gemm_common.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_asparse_soa( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ) { if ( strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0 || strcmp(i_arch, "skx") == 0 || strcmp(i_arch, "clx") == 0 || strcmp(i_arch, "cpx") == 0 || strcmp(i_arch, "snb") == 0 || strcmp(i_arch, "hsw") == 0 ) { if ( strcmp(i_arch, "snb") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX; } else if ( strcmp(i_arch, "hsw") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX2; } else if ( strcmp(i_arch, "knl") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_MIC; } else if ( strcmp(i_arch, "knm") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_KNM; } else if ( strcmp(i_arch, "skx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CORE; } else if ( strcmp(i_arch, "clx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CLX; } else if ( strcmp(i_arch, "cpx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CPX; } else { /* cannot happen */ } 
libxsmm_generator_spgemm_csr_asparse_soa_packed_loop( io_generated_code, i_xgemm_desc, i_row_idx, i_column_idx, i_values, i_packed_width ); } else { fprintf( stderr, "CSR + SOA is only available for AVX/AVX2/AVX512 at this point\n" ); exit(-1); } } LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_asparse_soa_packed_loop( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ) { unsigned int l_simd_packed_remainder = 0; unsigned int l_simd_packed_iters_full = 0; unsigned int l_simd_packed_width = 0; unsigned int l_n_max_block = 0; libxsmm_micro_kernel_config l_micro_kernel_config; libxsmm_loop_label_tracker l_loop_label_tracker; libxsmm_gp_reg_mapping l_gp_reg_mapping; /* define gp register mapping */ libxsmm_reset_x86_gp_reg_mapping( &l_gp_reg_mapping ); #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_R8; /* TODO: full support for Windows calling convention */ l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_UNDEF; #else /* match calling convention on Linux */ l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8; #endif /* l_gp_reg_mapping.gp_reg_c_prefetch = LIBXSMM_X86_GP_REG_UNDEF;*/ l_gp_reg_mapping.gp_reg_mloop = LIBXSMM_X86_GP_REG_R12; l_gp_reg_mapping.gp_reg_nloop = LIBXSMM_X86_GP_REG_R13; l_gp_reg_mapping.gp_reg_kloop = LIBXSMM_X86_GP_REG_R14; l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_R15; l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_R11; 
l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF; /* define the micro kernel code gen properties */ libxsmm_generator_gemm_init_micro_kernel_config_fullvector( &l_micro_kernel_config, io_generated_code->arch, i_xgemm_desc, 0 ); /* select soa width */ if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_simd_packed_width = 8; } else { l_simd_packed_width = 4; } l_micro_kernel_config.a_vmove_instruction = LIBXSMM_X86_INSTR_VBROADCASTSD; } else { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_simd_packed_width = 16; } else { l_simd_packed_width = 8; } l_micro_kernel_config.a_vmove_instruction = LIBXSMM_X86_INSTR_VBROADCASTSS; } /* calculate the packing count */ l_simd_packed_remainder = i_packed_width % l_simd_packed_width; l_simd_packed_iters_full = i_packed_width / l_simd_packed_width; /* select N blocking width */ if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_n_max_block = 28; } else { if ( l_simd_packed_remainder > 0 ) { l_n_max_block = 13; } else { l_n_max_block = 14; } } /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* open asm */ libxsmm_x86_instruction_open_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch ); /* loop over blocks of packing */ if ( (l_simd_packed_iters_full > 1) || (l_simd_packed_remainder > 0 && l_simd_packed_iters_full > 0 ) ) { libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_help_0, 0 ); libxsmm_x86_instruction_register_jump_back_label( 
io_generated_code, &l_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_help_0, 1 ); /* save a, b, b_prefetch, c pointers */ libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_a ); libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_b ); libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_b_prefetch ); libxsmm_x86_instruction_push_reg( io_generated_code, l_gp_reg_mapping.gp_reg_c ); } /* call N loop */ if ( l_simd_packed_iters_full > 0 ) { libxsmm_generator_spgemm_csr_asparse_soa_n_loop( io_generated_code, i_xgemm_desc, &l_loop_label_tracker, &l_micro_kernel_config, &l_gp_reg_mapping, i_row_idx, i_column_idx, i_values, l_n_max_block, i_packed_width, 0 ); } /* close packed loop */ if ( (l_simd_packed_iters_full > 1) || (l_simd_packed_remainder > 0 && l_simd_packed_iters_full > 0 ) ) { /* restore a, b, b_prefetch, c pointers */ libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_c ); libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_b_prefetch ); libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_b ); libxsmm_x86_instruction_pop_reg( io_generated_code, l_gp_reg_mapping.gp_reg_a ); /* advance B and C pointers */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_c, l_simd_packed_width*l_micro_kernel_config.datatype_size ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_b, l_simd_packed_width*l_micro_kernel_config.datatype_size ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_b_prefetch, l_simd_packed_width*l_micro_kernel_config.datatype_size ); libxsmm_x86_instruction_alu_imm( io_generated_code, 
l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_help_0, l_simd_packed_iters_full ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker ); } if ( l_simd_packed_remainder > 0 ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { /* load k1 with mask */ libxsmm_generator_gemm_initialize_avx512_mask( io_generated_code, l_gp_reg_mapping.gp_reg_help_1, i_xgemm_desc, l_micro_kernel_config.vector_length-l_simd_packed_remainder ); } else { /* load register 15 with the mask */ char l_id = (char)13; unsigned char l_data[32]; unsigned int l_count; if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { unsigned long long* l_i64_ptr = (unsigned long long*)l_data; for ( l_count = 0; l_count < 4; ++l_count ) { if ( l_count < l_simd_packed_remainder ) { l_i64_ptr[l_count] = 0xffffffffffffffff; } else { l_i64_ptr[l_count] = 0x0; } } } else { unsigned int* l_i32_ptr = (unsigned int*)l_data; for ( l_count = 0; l_count < 8; ++l_count ) { if ( l_count < l_simd_packed_remainder ) { l_i32_ptr[l_count] = 0xffffffff; } else { l_i32_ptr[l_count] = 0x0; } } } libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, l_data, &l_id, 'y', 15 ); } /* if we have packed remainder, let's call N loop with it */ libxsmm_generator_spgemm_csr_asparse_soa_n_loop( io_generated_code, i_xgemm_desc, &l_loop_label_tracker, &l_micro_kernel_config, &l_gp_reg_mapping, i_row_idx, i_column_idx, i_values, l_n_max_block, i_packed_width, l_simd_packed_remainder ); } /* close asm */ libxsmm_x86_instruction_close_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch ); } LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_asparse_soa_n_loop( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, libxsmm_loop_label_tracker* io_loop_label_tracker, const 
libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_n_max_block, const unsigned int i_packed_width, const unsigned int i_packed_mask ) { unsigned int l_gen_m_trips = 0; unsigned int l_a_is_dense = 0; unsigned int l_n_chunks = 0; unsigned int l_n_chunksize = 0; unsigned int l_n_remain = 0; unsigned int l_n_loop = 0; /* test if we should generate a dense version */ if ( i_row_idx[i_xgemm_desc->m] == (unsigned int)(i_xgemm_desc->m*i_xgemm_desc->k) ) { l_gen_m_trips = 1; l_a_is_dense = 1; } else { l_gen_m_trips = i_xgemm_desc->m; l_a_is_dense = 0; } /* calculate the chunk size of current columns to work on */ l_n_chunks = ( (i_xgemm_desc->n % i_n_max_block) == 0 ) ? (i_xgemm_desc->n / i_n_max_block) : (i_xgemm_desc->n / i_n_max_block) + 1; l_n_chunksize = ( (i_xgemm_desc->n % l_n_chunks) == 0 ) ? (i_xgemm_desc->n / l_n_chunks) : (i_xgemm_desc->n / l_n_chunks) + 1; l_n_remain = ( ((i_xgemm_desc->n % l_n_chunksize) == 0) || ((unsigned int)i_xgemm_desc->n <= i_n_max_block) ) ? 0 : 1; l_n_loop = ( l_n_remain == 0 ) ? 
(l_n_chunks * l_n_chunksize) : ((l_n_chunks-1) * l_n_chunksize); /* loop over blocks of n */ libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_nloop, l_n_chunksize ); /* do matix multiplicatoin for a block of N columns */ libxsmm_generator_spgemm_csr_asparse_soa_m_loop( io_generated_code, i_xgemm_desc, io_loop_label_tracker, i_micro_kernel_config, i_gp_reg_mapping, i_row_idx, i_column_idx, i_values, l_gen_m_trips, l_a_is_dense, l_n_chunksize, i_packed_width, i_packed_mask ); /* adjust B pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_micro_kernel_config->datatype_size*i_packed_width*l_n_chunksize); /* advance B prefetch pointer */ if ( (i_xgemm_desc->prefetch & LIBXSMM_GEMM_PREFETCH_BL2_VIA_C) > 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_b_prefetch, (i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->ldb*i_xgemm_desc->m)-(i_micro_kernel_config->datatype_size*i_packed_width*l_n_chunksize)); } /* adjust C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_c, (i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->ldc*i_xgemm_desc->m)-(i_micro_kernel_config->datatype_size*i_packed_width*l_n_chunksize)); /* N loop jump back */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_nloop, l_n_loop ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); /* handle remainder of N loop */ if ( l_n_remain != 0 ) { libxsmm_generator_spgemm_csr_asparse_soa_m_loop( io_generated_code, i_xgemm_desc, 
io_loop_label_tracker, i_micro_kernel_config, i_gp_reg_mapping, i_row_idx, i_column_idx, i_values, l_gen_m_trips, l_a_is_dense, i_xgemm_desc->n - (l_n_chunksize * (l_n_chunks - 1)), i_packed_width, i_packed_mask ); } } LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_asparse_soa_m_loop( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_gen_m_trips, const unsigned int i_a_is_dense, const unsigned int i_num_c_cols, const unsigned int i_packed_width, const unsigned int i_packed_mask ) { unsigned int l_m; unsigned int l_n; unsigned int l_z; unsigned int l_row_elements; unsigned int l_b_offset; unsigned int l_b_total_offset; unsigned int l_avx_mask_instr; LIBXSMM_UNUSED(i_values); if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { l_avx_mask_instr = LIBXSMM_X86_INSTR_VMASKMOVPD; } else { l_avx_mask_instr = LIBXSMM_X86_INSTR_VMASKMOVPS; } /* do sparse times dense soa multiplication */ for ( l_m = 0; l_m < i_gen_m_trips; l_m++ ) { /* handle b offset */ l_b_offset = 0; l_b_total_offset = 0; /* generate M loop */ if (i_a_is_dense != 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_mloop, 0 ); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_mloop, 1 ); } l_row_elements = i_row_idx[l_m+1] - i_row_idx[l_m]; if (l_row_elements > 0) { /* load C accumulator */ for ( l_n = 0; l_n < i_num_c_cols; l_n++ ) { if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */ 
libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vxor_instruction, i_micro_kernel_config->vector_name, l_n, l_n, l_n ); } else { if ( i_packed_mask == 0 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 0, 1, 0 ); } else { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 1, 1, 0 ); } else { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_mask_instr, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 15, 0); } } } if ( (i_xgemm_desc->prefetch & LIBXSMM_GEMM_PREFETCH_BL2_VIA_C) > 0 ) { libxsmm_x86_instruction_prefetch( io_generated_code, i_micro_kernel_config->prefetch_instruction, i_gp_reg_mapping->gp_reg_b_prefetch, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size ); } } /* loop over the non-zeros in A row m */ for ( l_z = 0; l_z < l_row_elements; l_z++ ) { /* broadcast values of A */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_row_idx[l_m] + l_z) * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, i_num_c_cols, 0, 1, 0 ); /* multiply with B */ for ( l_n = 0; l_n < 
i_num_c_cols; l_n++ ) { l_b_offset = ((i_column_idx[i_row_idx[l_m] + l_z]*i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->ldb) +(l_n*i_packed_width*i_micro_kernel_config->datatype_size))-l_b_total_offset; if (l_b_offset >= 8192) { l_b_total_offset += l_b_offset; libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, l_b_offset); l_b_offset = 0; } if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { if ( i_packed_mask == 0 ) { libxsmm_x86_instruction_vec_compute_mem( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, 0, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, i_num_c_cols, l_n ); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, i_num_c_cols+1, 1, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_num_c_cols+1, i_num_c_cols, l_n ); } } else if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX2 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { if ( i_packed_mask == 0 ) { libxsmm_x86_instruction_vec_compute_mem( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, 0, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, i_num_c_cols, l_n ); } else { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_mask_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, 14, 15, 0); libxsmm_x86_instruction_vec_compute_reg( 
io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, 14, i_num_c_cols, l_n ); } } else { if ( i_packed_mask == 0 ) { /* Mul with full vector load and adding result to final accumulator */ libxsmm_x86_instruction_vec_compute_mem( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, 0, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, i_num_c_cols, 15 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, 15, l_n, l_n ); } else { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_mask_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_b_offset, i_micro_kernel_config->vector_name, 14, 15, 0); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, 14, i_num_c_cols, 14 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, 14, l_n, l_n ); } } } } /* store C accumulator */ for ( l_n = 0; l_n < i_num_c_cols; l_n++ ) { if ( i_packed_mask == 0 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 0, 0, 1 ); } else { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, 
i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 1, 0, 1 ); } else { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_mask_instr, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, l_n*i_packed_width*i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, 15, 1); } } } } /* advance C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c, i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->ldc); /* advance B prefetch pointer */ if ( (i_xgemm_desc->prefetch & LIBXSMM_GEMM_PREFETCH_BL2_VIA_C) > 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b_prefetch, i_micro_kernel_config->datatype_size*i_packed_width*i_xgemm_desc->ldb); } /* adjust B pointer */ if (l_b_total_offset > 0) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_b, l_b_total_offset); } /* generate M loop */ if (i_a_is_dense != 0 ) { /* advance A pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, i_micro_kernel_config->datatype_size*i_xgemm_desc->k); /* M loop jump back */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_mloop, i_xgemm_desc->m ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); } } /* reset A pointer */ if (i_a_is_dense != 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_a, i_micro_kernel_config->datatype_size*i_xgemm_desc->k*i_xgemm_desc->m); } } 
libxsmm-1.17/src/generator_spgemm_csr_asparse_soa.h000066400000000000000000000114261415223013700226440ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_SPGEMM_CSR_ASPARSE_SOA_H #define GENERATOR_SPGEMM_CSR_ASPARSE_SOA_H #include "generator_common.h" #include /* @TODO change int based architecture value */ LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_asparse_soa( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ); LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_asparse_soa_packed_loop( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ); LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_asparse_soa_n_loop( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_n_max_block, const unsigned int i_packed_width, const unsigned int i_packed_mask ); LIBXSMM_API_INTERN void 
libxsmm_generator_spgemm_csr_asparse_soa_m_loop( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_gen_m_trips, const unsigned int i_a_is_dense, const unsigned int i_num_c_cols, const unsigned int i_packed_width, const unsigned int i_packed_mask ); #endif /* GENERATOR_SPGEMM_CSR_ASPARSE_SOA_H */ libxsmm-1.17/src/generator_spgemm_csr_bsparse_soa.c000066400000000000000000001000761415223013700226400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "generator_spgemm_csr_bsparse_soa.h" #include "generator_x86_instructions.h" #include "generator_gemm_common.h" #include "generator_common.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_bsparse_soa( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ) { if ( strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0 || strcmp(i_arch, "skx") == 0 || strcmp(i_arch, "clx") == 0 || strcmp(i_arch, "cpx") == 0 || strcmp(i_arch, "hsw") == 0 || strcmp(i_arch, "snb") == 0 ) { if ( strcmp(i_arch, "snb") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX; } else if ( strcmp(i_arch, "hsw") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX2; } else if ( strcmp(i_arch, "knl") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_MIC; } else if ( strcmp(i_arch, "knm") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_KNM; } else if ( strcmp(i_arch, "skx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CORE; } else if ( strcmp(i_arch, "clx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CLX; } else if ( strcmp(i_arch, "cpx") == 0 ) { io_generated_code->arch = LIBXSMM_X86_AVX512_CPX; } else { /* cannot happen */ } libxsmm_generator_spgemm_csr_bsparse_soa_avx256_512( io_generated_code, i_xgemm_desc, i_row_idx, i_column_idx, i_values, i_packed_width ); } else { fprintf( stderr, "CSR + SOA is only available for AVX/AVX2/AVX512 at this point\n" ); exit(-1); } } LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_bsparse_soa_avx256_512( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ) { unsigned int l_n = 0; 
unsigned int l_max_cols = 0; unsigned int l_max_reg_block = 0; unsigned int l_simd_packed_remainder = 0; unsigned int l_simd_packed_iters = 0; unsigned int l_simd_packed_iters_full = 0; unsigned int l_simd_packed_width = 0; unsigned int l_packed_done = 0; unsigned int l_packed_count = 0; unsigned int l_packed_reg_block[2] = {0,0}; unsigned int l_packed_reg_range[2] = {0,0}; unsigned int l_col_reg_block[2][2] = { {0,0}, {0,0} }; unsigned int l_col_reg_range[2][2] = { {0,0}, {0,0} }; libxsmm_micro_kernel_config l_micro_kernel_config; libxsmm_loop_label_tracker l_loop_label_tracker; libxsmm_gp_reg_mapping l_gp_reg_mapping; /* select simd packing width and accumulator blocking */ if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_simd_packed_width = 8; l_max_reg_block = 28; } else { l_simd_packed_width = 4; l_max_reg_block = 14; } } else { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { l_simd_packed_width = 16; l_max_reg_block = 28; } else { l_simd_packed_width = 8; l_max_reg_block = 14; } } l_simd_packed_remainder = i_packed_width % l_simd_packed_width; l_simd_packed_iters_full = i_packed_width/l_simd_packed_width; l_simd_packed_iters = ( l_simd_packed_remainder > 0 ) ? 
l_simd_packed_iters_full+1 : l_simd_packed_iters_full; /* get max column in C */ for ( l_n = 0; l_n < i_row_idx[i_xgemm_desc->k]; l_n++ ) { if (l_max_cols < i_column_idx[l_n]) { l_max_cols = i_column_idx[l_n]; } } l_max_cols++; /* when we have remainder on lower than AVX512 we need one spare register for a mask */ if ( ( io_generated_code->arch < LIBXSMM_X86_AVX512 ) && ( l_simd_packed_remainder != 0 ) ) { l_max_reg_block = 13; } #if 0 printf("packed parameters: %u, %u, %u, %u, %u\n", i_packed_width, l_simd_packed_remainder, l_simd_packed_iters, l_simd_packed_iters_full, l_simd_packed_width ); #endif /* packed blocking */ /* @TODO for 2^x for l_simd_packed iters we might want to todo something else */ libxsmm_compute_equalized_blocking( l_simd_packed_iters, l_max_reg_block, &(l_packed_reg_range[0]), &(l_packed_reg_block[0]), &(l_packed_reg_range[1]), &(l_packed_reg_block[1]) ); #if 0 printf("packed blocking (range0, block0, range1, block1): %u %u %u %u\n", l_packed_reg_range[0], l_packed_reg_block[0], l_packed_reg_range[1], l_packed_reg_block[1]); #endif /* adjust max reg_blocking to allow for 2d blocking */ if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { if ( l_packed_reg_block[0] == 2 ) { l_max_reg_block = 20; } if ( l_packed_reg_block[0] == 4 ) { l_max_reg_block = 24; } } /* N blocking for packed blocking */ libxsmm_compute_equalized_blocking( l_max_cols, l_max_reg_block/l_packed_reg_block[0], &(l_col_reg_range[0][0]), &(l_col_reg_block[0][0]), &(l_col_reg_range[0][1]), &(l_col_reg_block[0][1]) ); if ( l_packed_reg_block[1] != 0 ) { libxsmm_compute_equalized_blocking( l_max_cols, l_max_reg_block/l_packed_reg_block[1], &(l_col_reg_range[1][0]), &(l_col_reg_block[1][0]), &(l_col_reg_range[1][1]), &(l_col_reg_block[1][1]) ); } #if 0 printf("n blocking 0 (range0, block0, range1, block1): %u %u %u %u\n", l_col_reg_range[0][0], l_col_reg_block[0][0], l_col_reg_range[0][1], l_col_reg_block[0][1]); 
printf("n blocking 1 (range0, block0, range1, block1): %u %u %u %u\n", l_col_reg_range[1][0], l_col_reg_block[1][0], l_col_reg_range[1][1], l_col_reg_block[1][1]); #endif /* define gp register mapping */ libxsmm_reset_x86_gp_reg_mapping( &l_gp_reg_mapping ); /* matching calling convention on Linux */ #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_R8; /* TODO: full support for Windows calling convention */ l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_UNDEF; #else /* match calling convention on Linux */ l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8; #endif /* l_gp_reg_mapping.gp_reg_c_prefetch = LIBXSMM_X86_GP_REG_UNDEF;*/ l_gp_reg_mapping.gp_reg_mloop = LIBXSMM_X86_GP_REG_R12; l_gp_reg_mapping.gp_reg_nloop = LIBXSMM_X86_GP_REG_R13; l_gp_reg_mapping.gp_reg_kloop = LIBXSMM_X86_GP_REG_R14; l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_R15; l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_R11; l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF; /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* define the micro kernel code gen properties */ libxsmm_generator_gemm_init_micro_kernel_config_fullvector( &l_micro_kernel_config, io_generated_code->arch, i_xgemm_desc, 0 ); /* open asm */ libxsmm_x86_instruction_open_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch ); /* m loop */ 
libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_mloop, 1 ); /* loop over packed blocks */ while ( l_packed_done != l_simd_packed_iters ) { unsigned int l_packed_blocking = l_packed_reg_block[l_packed_count]; unsigned int l_packed_remainder = 0; unsigned int l_n_done = 0; unsigned int l_n_count = 0; unsigned int l_n_processed = 0; if ( (l_simd_packed_remainder != 0) && (l_packed_count == 0) ) { if ( l_packed_reg_block[1] > 0 ) { l_packed_remainder = 0; } else { l_packed_remainder = l_simd_packed_remainder; } } else if (l_simd_packed_remainder != 0) { l_packed_remainder = l_simd_packed_remainder; } while ( l_n_done < l_max_cols ) { unsigned int l_n_blocking = l_col_reg_block[l_packed_count][l_n_count]; for ( l_n_processed = l_n_done; l_n_processed < l_n_done + l_col_reg_range[l_packed_count][l_n_count]; l_n_processed += l_n_blocking ) { libxsmm_generator_spgemm_csr_bsparse_soa_avx256_512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_row_idx, i_column_idx, i_values, l_n_processed, l_n_processed + l_n_blocking, l_packed_done, l_packed_done + l_packed_reg_range[l_packed_count], l_packed_blocking, l_packed_remainder, i_packed_width ); } l_n_done += l_col_reg_range[l_packed_count][l_n_count]; l_n_count++; } /* advance N */ l_packed_done += l_packed_reg_range[l_packed_count]; l_packed_count++; } /* advance C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_c, l_micro_kernel_config.datatype_size*i_packed_width*i_xgemm_desc->ldc); /* advance A pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_a, l_micro_kernel_config.datatype_size*i_packed_width*i_xgemm_desc->lda); /* close m loop */ 
libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_mloop, i_xgemm_desc->m ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker ); /* close asm */ libxsmm_x86_instruction_close_stream( io_generated_code, &l_gp_reg_mapping, i_xgemm_desc->prefetch ); } LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_bsparse_soa_avx256_512_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_n_processed, const unsigned int i_n_limit, const unsigned int i_packed_processed, const unsigned int i_packed_range, const unsigned int i_packed_blocking, const unsigned int i_packed_remainder, const unsigned int i_packed_width ) { unsigned int l_n = 0; unsigned int l_p = 0; unsigned int l_k = 0; unsigned int l_found_mul = 0; unsigned int l_max_reg_block = (i_n_limit - i_n_processed) * i_packed_blocking; unsigned int l_n_blocking = i_n_limit - i_n_processed; unsigned int l_avx_mask_instr = 0; unsigned int l_row_elements = 0; LIBXSMM_UNUSED(i_values); LIBXSMM_ASSERT( i_packed_blocking > 0 ); /* packed loop */ if ( i_packed_range/i_packed_blocking > 1 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_help_0, 0 ); libxsmm_x86_instruction_register_jump_back_label( io_generated_code, io_loop_label_tracker ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_help_0, 1 ); } /* load k if packed remainder is non-zero */ if ( i_packed_remainder != 0 ) { /* on AVX512 we can use mask registers */ if ( ( 
io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_generator_gemm_initialize_avx512_mask( io_generated_code, i_gp_reg_mapping->gp_reg_help_1, i_xgemm_desc, i_micro_kernel_config->vector_length-i_packed_remainder ); } else { char l_id = (char)l_n_blocking; unsigned char l_data[32]; unsigned int l_count; if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { unsigned long long* l_i64_ptr = (unsigned long long*)l_data; for ( l_count = 0; l_count < 4; ++l_count ) { if ( l_count < i_packed_remainder ) { l_i64_ptr[l_count] = 0xffffffffffffffff; } else { l_i64_ptr[l_count] = 0x0; } } l_avx_mask_instr = LIBXSMM_X86_INSTR_VMASKMOVPD; } else { unsigned int* l_i32_ptr = (unsigned int*)l_data; for ( l_count = 0; l_count < 8; ++l_count ) { if ( l_count < i_packed_remainder ) { l_i32_ptr[l_count] = 0xffffffff; } else { l_i32_ptr[l_count] = 0x0; } } l_avx_mask_instr = LIBXSMM_X86_INSTR_VMASKMOVPS; } libxsmm_x86_instruction_full_vec_load_of_constants ( io_generated_code, l_data, &l_id, 'y', 15 ); } } /* load C accumulator */ for ( l_n = 0; l_n < l_n_blocking; l_n++ ) { for ( l_p = 0; l_p < i_packed_blocking; l_p++ ) { if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vxor_instruction, i_micro_kernel_config->vector_name, (l_n*i_packed_blocking) + l_p, (l_n*i_packed_blocking) + l_p, (l_n*i_packed_blocking) + l_p ); } else { if ( (l_p == i_packed_blocking-1) && (i_packed_remainder != 0) ) { if ( l_avx_mask_instr > 0 ) { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_mask_instr, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ( (i_n_processed + l_n)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ), 
i_micro_kernel_config->vector_name, (l_n*i_packed_blocking) + l_p, 15, 0); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ( (i_n_processed + l_n)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ), i_micro_kernel_config->vector_name, (l_n*i_packed_blocking) + l_p, 1, 1, 0 ); } } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ( (i_n_processed + l_n)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ), i_micro_kernel_config->vector_name, (l_n*i_packed_blocking) + l_p, 0, 1, 0 ); } } } } /* do dense soa times sparse multiplication */ for ( l_k = 0; l_k < (unsigned int)i_xgemm_desc->k; l_k++ ) { l_row_elements = i_row_idx[l_k+1] - i_row_idx[l_k]; l_found_mul = 0; /* check if we actually need to multiply */ for ( l_n = 0; l_n < l_row_elements; l_n++ ) { if ( (i_column_idx[i_row_idx[l_k] + l_n] < (unsigned int)i_xgemm_desc->n) && (i_column_idx[i_row_idx[l_k] + l_n] >= i_n_processed) && (i_column_idx[i_row_idx[l_k] + l_n] < i_n_limit) ) { l_found_mul = 1; } } /* only load A if multiplication loop is not empty */ if (l_found_mul != 0) { unsigned int l_avx_max_reg = ( l_avx_mask_instr > 0 ) ? 
14 : 15; for ( l_p = 0; l_p < i_packed_blocking; l_p++ ) { if ( (l_p == i_packed_blocking-1) && (i_packed_remainder != 0) ) { if ( l_avx_mask_instr > 0 ) { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_mask_instr, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (l_k*i_packed_width*i_micro_kernel_config->datatype_size) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ), i_micro_kernel_config->vector_name, l_max_reg_block, 15, 0 ); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (l_k*i_packed_width*i_micro_kernel_config->datatype_size) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ), i_micro_kernel_config->vector_name, l_max_reg_block, 1, 1, 0 ); } } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (l_k*i_packed_width*i_micro_kernel_config->datatype_size) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ), i_micro_kernel_config->vector_name, l_max_reg_block, 0, 1, 0 ); } /* loop over element in the row of B and multiply*/ for ( l_n = 0; l_n < l_row_elements; l_n++ ) { /* check k such that we just use columns which actually need to be multiplied */ if ( (i_column_idx[i_row_idx[l_k] + l_n] < (unsigned int)i_xgemm_desc->n) && (i_column_idx[i_row_idx[l_k] + l_n] >= i_n_processed) && (i_column_idx[i_row_idx[l_k] + l_n] < i_n_limit) ) { if ( ( io_generated_code->arch >= LIBXSMM_X86_AVX512 ) && ( io_generated_code->arch <= LIBXSMM_X86_ALLFEAT ) ) { libxsmm_x86_instruction_vec_compute_mem( io_generated_code, i_micro_kernel_config->instruction_set, 
i_micro_kernel_config->vmul_instruction, 1, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_row_idx[l_k] + l_n) * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_max_reg_block, ((i_column_idx[i_row_idx[l_k] + l_n] - i_n_processed)*i_packed_blocking) + l_p ); } else if ( io_generated_code->arch == LIBXSMM_X86_AVX2 ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_row_idx[l_k] + l_n) * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_avx_max_reg, 0, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, l_max_reg_block, l_avx_max_reg, ((i_column_idx[i_row_idx[l_k] + l_n] - i_n_processed)*i_packed_blocking) + l_p ); } else if ( io_generated_code->arch == LIBXSMM_X86_AVX ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_row_idx[l_k] + l_n) * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_avx_max_reg, 0, 1, 0 ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, l_max_reg_block, l_avx_max_reg, l_avx_max_reg ); libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vadd_instruction, i_micro_kernel_config->vector_name, l_avx_max_reg, ((i_column_idx[i_row_idx[l_k] + l_n] - i_n_processed)*i_packed_blocking) + l_p, ((i_column_idx[i_row_idx[l_k] + l_n] - i_n_processed)*i_packed_blocking) + l_p ); } else { } } } } } } /* store C accumulator */ for ( l_n = 0; l_n < 
l_n_blocking; l_n++ ) { for ( l_p = 0; l_p < i_packed_blocking; l_p++ ) { if ( (l_p == i_packed_blocking-1) && (i_packed_remainder != 0) ) { if ( l_avx_mask_instr > 0 ) { libxsmm_x86_instruction_vec_mask_move( io_generated_code, l_avx_mask_instr, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ( (i_n_processed + l_n)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ), i_micro_kernel_config->vector_name, (l_n*i_packed_blocking) + l_p, 15, 1); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ( (i_n_processed + l_n)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ), i_micro_kernel_config->vector_name, (l_n*i_packed_blocking) + l_p, 1, 0, 1 ); } } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ( (i_n_processed + l_n)*i_packed_width*i_micro_kernel_config->datatype_size ) + ( (i_packed_processed + l_p)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ), i_micro_kernel_config->vector_name, (l_n*i_packed_blocking) + l_p, 0, 0, 1 ); } } } /* packed loop */ if ( i_packed_range/i_packed_blocking > 1 ) { /* advance A and C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, i_packed_blocking*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_c, 
i_packed_blocking*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ); /* packed loop footer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_help_0, i_packed_range/i_packed_blocking ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker ); /* reset A and C pointer */ libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_a, (i_packed_range/i_packed_blocking)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ); libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_c, (i_packed_range/i_packed_blocking)*i_micro_kernel_config->vector_length*i_micro_kernel_config->datatype_size ); } } libxsmm-1.17/src/generator_spgemm_csr_bsparse_soa.h000066400000000000000000000077501415223013700226520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_SPGEMM_CSR_BSPARSE_SOA_H #define GENERATOR_SPGEMM_CSR_BSPARSE_SOA_H #include #include "generator_common.h" /* @TODO change int based architecture value */ LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_bsparse_soa( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ); LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_bsparse_soa_avx256_512( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_packed_width ); LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csr_bsparse_soa_avx256_512_kloop( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values, const unsigned int i_n_processed, const unsigned int i_n_limit, const unsigned int i_packed_processed, const unsigned int i_packed_range, const unsigned int i_packed_blocking, const unsigned int i_packed_remainder, const unsigned int i_packed_width ); #endif /* GENERATOR_SPGEMM_CSR_BSPARSE_SOA_H */ libxsmm-1.17/src/generator_spgemm_csr_reader.c000066400000000000000000000145661415223013700216110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "generator_common.h" #include "generator_spgemm_csr_reader.h" LIBXSMM_API_INTERN void libxsmm_sparse_csr_reader( libxsmm_generated_code* io_generated_code, const char* i_csr_file_in, unsigned int** o_row_idx, unsigned int** o_column_idx, double** o_values, unsigned int* io_row_count, unsigned int* io_column_count, unsigned int* o_element_count ) { FILE *l_csr_file_handle; const unsigned int l_line_length = 512; char l_line[512/*l_line_length*/+1]; unsigned int l_header_read = 0; unsigned int* l_row_idx_id = NULL; unsigned int l_i = 0; l_csr_file_handle = fopen( i_csr_file_in, "r" ); if ( l_csr_file_handle == NULL ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSR_INPUT ); return; } while (fgets(l_line, l_line_length, l_csr_file_handle) != NULL) { if ( strlen(l_line) == l_line_length ) { free(*o_row_idx); free(*o_column_idx); free(*o_values); free(l_row_idx_id); *io_row_count = *io_column_count = *o_element_count = 0; *o_row_idx = *o_column_idx = NULL; *o_values = NULL; fclose(l_csr_file_handle); /* close mtx file */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSR_READ_LEN ); return; } /* check if we are still reading comments header */ if ( l_line[0] == '%' ) { continue; } else { /* if we are the first line after comment header, we allocate our data structures */ if ( l_header_read == 0 ) { unsigned int row_count, column_count; if (3 == sscanf(l_line, "%u %u %u", &row_count, &column_count, o_element_count) && 0 != row_count && 0 != column_count && 0 != *o_element_count) { *io_column_count = LIBXSMM_MAX(*io_column_count, column_count); *io_row_count = LIBXSMM_MAX(*io_row_count, row_count); /* allocate CSC data-structure matching mtx file, and set 
everything to zero for init */ /* coverity[tainted_data] */ *o_column_idx = (unsigned int*)calloc(*o_element_count, sizeof(unsigned int)); /* coverity[tainted_data] */ *o_row_idx = (unsigned int*)calloc((size_t)*io_row_count + 1, sizeof(unsigned int)); /* coverity[tainted_data] */ *o_values = (double*)calloc(*o_element_count, sizeof(double)); /* coverity[tainted_data] */ l_row_idx_id = (unsigned int*)calloc(*io_row_count, sizeof(unsigned int)); /* check if mallocs were successful */ if ( ( *o_row_idx == NULL ) || ( *o_column_idx == NULL ) || ( *o_values == NULL ) || ( l_row_idx_id == NULL ) ) { free(*o_row_idx); free(*o_column_idx); free(*o_values); free(l_row_idx_id); *io_row_count = *io_column_count = *o_element_count = 0; *o_row_idx = *o_column_idx = NULL; *o_values = NULL; fclose(l_csr_file_handle); /* close mtx file */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSC_ALLOC_DATA ); return; } /* init column idx */ /* coverity[tainted_data] */ for ( l_i = 0; l_i <= *io_row_count; ++l_i ) (*o_row_idx)[l_i] = (*o_element_count); /* init */ (*o_row_idx)[0] = 0; l_i = 0; l_header_read = 1; } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSR_READ_DESC ); fclose( l_csr_file_handle ); /* close mtx file */ return; } /* now we read the actual content */ } else { unsigned int l_row = 0, l_column = 0; double l_value = 0; /* read a line of content */ if ( sscanf(l_line, "%u %u %lf", &l_row, &l_column, &l_value) != 3 || l_row > * io_row_count || l_column > * io_column_count || l_i >= * o_element_count ) { free(*o_row_idx); free(*o_column_idx); free(*o_values); free(l_row_idx_id); *io_row_count = *io_column_count = *o_element_count = 0; *o_row_idx = *o_column_idx = NULL; *o_values = NULL; fclose(l_csr_file_handle); /* close mtx file */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSR_READ_ELEMS ); return; } /* adjust numbers to zero termination */ LIBXSMM_ASSERT(0 != l_row && 0 != l_column); l_row--; l_column--; /* add these values to row and 
value structure */ (*o_column_idx)[l_i] = l_column; (*o_values)[l_i] = l_value; l_i++; /* handle columns, set id to own for this column, yeah we need to handle empty columns */ /* coverity[tainted_data] */ l_row_idx_id[l_row] = 1; (*o_row_idx)[l_row+1] = l_i; } } } /* close mtx file */ fclose( l_csr_file_handle ); /* check if we read a file which was consistent */ if ( l_i != (*o_element_count) ) { free(*o_row_idx); free(*o_column_idx); free(*o_values); free(l_row_idx_id); *io_row_count = *io_column_count = *o_element_count = 0; *o_row_idx = *o_column_idx = NULL; *o_values = NULL; LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_CSR_LEN ); return; } if ( l_row_idx_id != NULL ) { /* let's handle empty rows */ for ( l_i = 0; l_i < (*io_row_count); l_i++) { if ( l_row_idx_id[l_i] == 0 ) { (*o_row_idx)[l_i+1] = (*o_row_idx)[l_i]; } } /* free helper data structure */ free( l_row_idx_id ); } } libxsmm-1.17/src/generator_spgemm_csr_reader.h000066400000000000000000000027651415223013700216140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_SPGEMM_CSR_READER_H #define GENERATOR_SPGEMM_CSR_READER_H #include #include LIBXSMM_API_INTERN void libxsmm_sparse_csr_reader( libxsmm_generated_code* io_generated_code, const char* i_csr_file_in, unsigned int** o_row_idx, unsigned int** o_column_idx, double** o_values, unsigned int* io_row_count, unsigned int* io_column_count, unsigned int* o_element_count ); #endif /* GENERATOR_SPGEMM_CSR_READER_H */ libxsmm-1.17/src/generator_transpose.c000066400000000000000000000027111415223013700201330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry (Intel Corp.) ******************************************************************************/ #include #include "generator_common.h" #include "generator_transpose_avx_avx512.h" /* @TODO change int based architecture value */ LIBXSMM_API void libxsmm_generator_transpose_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_trans_descriptor* i_trans_desc, int i_arch ) { /* generate kernel */ if ( LIBXSMM_X86_AVX <= i_arch ) { libxsmm_generator_transpose_avx_avx512_kernel( io_generated_code, i_trans_desc, i_arch ); } else { /* TODO fix this error */ LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); return; } } libxsmm-1.17/src/generator_transpose_avx_avx512.c000066400000000000000000001413071415223013700221240ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. 
* * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry (Intel Corp.) ******************************************************************************/ #include "generator_transpose_avx_avx512.h" #include "generator_x86_instructions.h" #include "generator_common.h" #include "libxsmm_main.h" /* #define GENERATOR_TRANSPOSE_DEBUG */ /* d_ymm_or_zmm is automatically generated dispatching code Even on Skylake/KNL, zmm code doesn't always run better than ymm code. Given i_m x i_n matrix to transpose: This returns 1 if we should use xmm/ymm and not zmm This returns 2 if we should use zmm and not xmm/ymm i_m = number of rows of A to transpose i_n = number of columns of A to transpose i_avx512 (based on what the processor can do, not necessarily what's best) =0 to use AVX2 or below, 1 for zmm on Skylake, 2 for zmm on Knights Landing */ LIBXSMM_API_INLINE int d_ymm_or_zmm( const int i_m, const int i_n, const int i_avx512 ) { double dm, dn, dtmp; int l_retval; if ( !i_avx512 ) return 1; dm = (double) i_m; dn = (double) i_n; if ( i_avx512 == 1 ) { /* Skylake dispatching logic */ if ( dn <= 4.00000 ) { dtmp = 1.00000; } else { if ( dn <= 12.00000 ) { if ( dm <= 5.00000) { dtmp = 1.00000; } else { dtmp = 0.00916*dm - 0.16182*dn + 2.66904; } } else { dtmp = 0.02409*dm + 0.00486*dn + 1.25085; } } } else { /* Knights Landing dispatching logic */ if ( -2.30000*dm + 2.00000*dn <= -6.00000 ) { if ( dn <= 2.00000 ) { dtmp = 1.00000; } else { if ( dn <= 4.00000 ) { dtmp = 0.00032*dm - 0.69532*dn + 4.00575; } else { if ( -2.50000*dm - 1.50000*dn <= -32.00000) { if ( dm <= 17.00000 ) { dtmp = -0.07867*dm - 0.01862*dn + 2.97591; } else { dtmp = 2.00000; } } else { dtmp = -0.40000*dm - 0.46667*dn + 7.20000; } } } } else { dtmp = 
0.01791*dm + 0.00141*dn + 1.43536; } } /* Now turn it into an integer */ l_retval = (int) dtmp; l_retval = LIBXSMM_MAX(l_retval,1); if ( dtmp - ((double) l_retval) >= 0.5 ) ++l_retval; l_retval = LIBXSMM_MIN(l_retval,2); return ( l_retval ); } /* load_mask_into_var loads a ymm-based-mask based on the remainder m into * ymm0 or ymm13 depending on "reg" * m = size of the border (should be less than the register size) * datasize= number of bytes for 1 unit (4 for single, 8 for double or complex) * reg = 0 or 13 to indicate which ymm register to use * buf = The buffer to contain the instruction/opcode sequence * loc = The location inside the buffer to store the new instruction bytes */ LIBXSMM_API_INLINE void load_mask_into_var ( const int m, const int datasize, int reg, unsigned char *buf, int *loc ) { unsigned char by=0; int i = *loc, m2, j; m2 = m; if ( datasize > 4 ) m2 = (datasize/4)*m; /* Currently, the transpose generator uses ymm0 and ymm13 only */ if ( (reg != 0) && (reg != 13) ) { fprintf(stderr,"strange register value into load_mask_into_var\n"); exit(-1); } if ( m == 1 ) by = 0x80; else by = 0; buf[i]=0xeb; buf[i+1]=0x20; i=i+2; /* unconditional jmp past the data */ for ( j = 1; j <= 8; j++ ) { if ( m2 >= j ) by = 0; else by = 0x80; buf[i]=0x00; buf[i+1]=0x00; buf[i+2]=0x80; buf[i+3]=(unsigned char)(0xbf-by); i+=4; } /* The below is doing vmovups .data(%rip), %ymm(reg) */ if ( reg == 0 ) { buf[i]=0xc5; buf[i+1]=0xfc; buf[i+2]=0x10; buf[i+3]=0x05; i+=4; } else { /* reg == 13 */ buf[i]=0xc5; buf[i+1]=0x7c; buf[i+2]=0x10; buf[i+3]=0x2d; i+=4; } buf[i]=0xd8; buf[i+1]=0xff; buf[i+2]=0xff; buf[i+3]=0xff; i+=4; *loc = i; } /* gen_one_trans generates a mxn single transposition of a subset of A into B * * (assuming m and n are less than the register size.) We also need to * * know any offsets into A or B to know where to load. 
* * m = number of rows of source A (should be less than the register size) * * n = number of columns of source A (should be less than the register size) * * ldb = In general, we assume that ldb is the original "n", however this * * routine can be called multiple times, and the "n" here might be the * * border. For instance if the register size is 8, and the original n is 9,* * then ldb=9 but during one call, n will be 8 and the other call, n will * * be 1. So the only way to know the original "n" is to look at ldb... * * offsetA = offset in BYTES (not elements) to load for the first A * * offsetB = offset in BYTES (not elements) to store for the first B * * datasize = 4 for single, 8 for double or single complex * * avx512=0 to use AVX2 or below, 1 for zmm on Skylake, * * 2 for zmm on Knights Landing * * maskvar=value used for masking with ymm0 (in case we can reuse it, * * otherwise, we must use ymm13). Obviously only valid when avx512==0 * * * * Note: Assumes rdi = &A, rsi = lda*datasize, r8 = lda*datasize*3, * * rbx =lda*datasize*5, rbp=lda*datasize*7, rdx = &B * * TODO: fix assumptions to match register mapping! 
*/ LIBXSMM_API_INLINE void gen_one_trans( libxsmm_generated_code* io_generated_code, const libxsmm_transpose_gp_reg_mapping* i_gp_reg_mapping, const int m, const int n, const int ldb, const int offsetA, const int offsetB, const int datasize, const int avx512, const int maskvar) { int i = io_generated_code->code_size; unsigned char reg; int shiftmult; int m_nonone_border = 0; int n_nonone_border = 0; int m_fits_in_xmmymm = 0; int n_fits_in_xmm = 0; int m_fits_in_xmm = 0; int m_fits_in_ymm = 0; unsigned int REGSIZE = 4; unsigned int l_instr; char cval = 'x'; if (datasize == 8) { shiftmult = 8; if (m == 2) { cval = 'x'; } if (m == 4) { cval = 'y'; } if (m == 8) { cval = 'z'; } m_nonone_border = (m == 3); n_nonone_border = (n == 3); m_fits_in_xmmymm = ((m == 2) || (m == 4)); n_fits_in_xmm = (n == 2); m_fits_in_xmm = (m == 2); m_fits_in_ymm = (m == 4); if (avx512) REGSIZE = 8; else REGSIZE = 4; } else { shiftmult = 4; if (m == 4) { cval = 'x'; } if (m == 8) { cval = 'y'; } m_nonone_border = ((m == 2) || (m == 3) || (m == 5) || (m == 6) || (m == 7)); n_nonone_border = ((n == 2) || (n == 3) || (n == 5) || (n == 6) || (n == 7)); m_fits_in_xmmymm = ((m == 4) || (m == 8)); n_fits_in_xmm = (n == 4); m_fits_in_xmm = (m == 4); m_fits_in_ymm = (m == 8); if (avx512) REGSIZE = 16; else REGSIZE = 8; } /* Transposition has 3 parts: load the data, transpose it, store the data */ /* The following is part 1: */ if (!avx512) { if (m == 1) { if (datasize == 4) { l_instr = LIBXSMM_X86_INSTR_MOVL; } else { l_instr = LIBXSMM_X86_INSTR_MOVQ; } io_generated_code->code_size = i; /* Do instructions like: movl (%rdi), %r9d */ if (n >= 1) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetA, i_gp_reg_mapping->gp_reg_n_loop, 0); /* movl (%rdi,%rsi,1), %r10d */ if (n >= 2) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_lda, 1, offsetA, LIBXSMM_X86_GP_REG_R10, 0); 
/* movl (%rdi,%rsi,2), %r11d */ if (n >= 3) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_lda, 2, offsetA, LIBXSMM_X86_GP_REG_R11, 0); /* movl (%rdi,%r8 ,1), %eax */ if (n >= 4) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_m_loop, 1, offsetA, LIBXSMM_X86_GP_REG_RAX, 0); /* movl (%rdi,%rsi,4), %r12d */ if (n >= 5) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_lda, 4, offsetA, LIBXSMM_X86_GP_REG_R12, 0); /* movl (%rdi,%rbx,1), %r13d */ if (n >= 6) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_RBX, 1, offsetA, LIBXSMM_X86_GP_REG_R13, 0); /* movl (%rdi,%r8 ,2), %r14d */ if (n >= 7) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_m_loop, 2, offsetA, LIBXSMM_X86_GP_REG_R14, 0); /* movl (%rdi,%rbp,1), %r15d */ if (n >= 8) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_RBP, 1, offsetA, LIBXSMM_X86_GP_REG_R15, 0); i = io_generated_code->code_size; } /* m==1 */ } /* !avx512 */ if (!avx512) { reg = 0; } else { cval = 'z'; if (m % REGSIZE == 0) reg = 0; else reg = 1; } if ((!avx512 && m_fits_in_xmmymm) || (avx512)) { io_generated_code->code_size = i; /* Do instructions like: vmovups (%rdi), zmm1{%k1} */ if (n>0) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetA, cval, 1, reg, 1, 0); /* vmovups (%rdi,%rsi,1), zmm2{%k1} */ if (n>1) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_lda, 1, offsetA, cval, 2, reg, 1, 0); /* vmovups (%rdi,%rsi,2), zmm3{%k1} */ if (n>2) 
libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_lda, 2, offsetA, cval, 3, reg, 1, 0); /* vmovups (%rdi,%r8 ,1), zmm4{%k1} */ if (n>3) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_m_loop, 1, offsetA, cval, 4, reg, 1, 0); /* vmovups (%rdi,%rsi,4), zmm5{%k1} */ if (n>4) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_lda, 4, offsetA, cval, 5, reg, 1, 0); /* vmovups (%rdi,%rbx,1), zmm6{%k1} */ if (n>5) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_RBX, 1, offsetA, cval, 6, reg, 1, 0); /* vmovups (%rdi,%r8 ,2), zmm7{%k1} */ if (n>6) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_m_loop, 2, offsetA, cval, 7, reg, 1, 0); /* vmovups (%rdi,%rbp,1), zmm8{%k1} */ if (n>7) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_RBP, 1, offsetA, cval, 8, reg, 1, 0); i = io_generated_code->code_size; } if (!avx512 && m_nonone_border) { /* We need a masked mov: vmaskmovps (%rdi), %ymm0, %ymm1 */ if (datasize == 8) l_instr = LIBXSMM_X86_INSTR_VMASKMOVPD; else l_instr = LIBXSMM_X86_INSTR_VMASKMOVPS; io_generated_code->code_size = i; libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetA, 'y', 1, 0, 0); if (n>1) libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_lda, 1, offsetA, 'y', 2, 0, 0); if (n>2) libxsmm_x86_instruction_vec_mask_move(io_generated_code, 
l_instr, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_lda, 2, offsetA, 'y', 3, 0, 0); if (n>3) libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_m_loop, 1, offsetA, 'y', 4, 0, 0); if (n>4) libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_lda, 4, offsetA, 'y', 5, 0, 0); if (n>5) libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_RBX, 1, offsetA, 'y', 6, 0, 0); if (n>6) libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, i_gp_reg_mapping->gp_reg_m_loop, 2, offsetA, 'y', 7, 0, 0); if (n>7) libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_RBP, 1, offsetA, 'y', 8, 0, 0); i = io_generated_code->code_size; } /* Part 1 is done. The data is loaded */ /* Transpose the data: */ if (avx512 || (!avx512 && (m > 1) && (ldb>1))) { if (!avx512) { if (datasize == 8) { io_generated_code->code_size = i; /* vunpcklpd %ymm2, %ymm1, %ymm5 */ libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKLPD, 'y', 2, 1, 5); /* vunpcklpd %ymm4, %ymm3, %ymm6 */ libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKLPD, 'y', 4, 3, 6); /* vunpckhpd %ymm2, %ymm1, %ymm7 */ if (m > 1) libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKHPD, 'y', 2, 1, 7); /* vunpckhpd %ymm4, %ymm3, %ymm7 */ if (m > 1) libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKHPD, 'y', 4, 3, 8); if (m>0) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VPERM2F128, 'y', 6, 5, 1, 32); if (m>1) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, 
LIBXSMM_X86_INSTR_VPERM2F128, 'y', 8, 7, 2, 32); if (m>2) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VPERM2F128, 'y', 6, 5, 3, 49); if (m>3) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VPERM2F128, 'y', 8, 7, 4, 49); i = io_generated_code->code_size; } else { /* single precision: */ io_generated_code->code_size = i; /* vunpcklps %ymm2, %ymm1, %ymm9 */ libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKLPS, 'y', 2, 1, 9); /* vunpckhps %ymm2, %ymm1, %ymm1 */ libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKHPS, 'y', 2, 1, 1); /* vunpcklps %ymm4, %ymm3, %ymm10 */ libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKLPS, 'y', 4, 3, 10); /* vunpckhps %ymm4, %ymm3, %ymm2 */ libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKHPS, 'y', 4, 3, 2); /* vunpcklps %ymm6, %ymm5, %ymm11 */ libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKLPS, 'y', 6, 5, 11); /* vunpckhps %ymm6, %ymm5, %ymm3 */ libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKHPS, 'y', 6, 5, 3); /* vunpcklps %ymm8, %ymm7, %ymm12 */ libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKLPS, 'y', 8, 7, 12); /* vunpckhps %ymm8, %ymm7, %ymm4 */ libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKHPS, 'y', 8, 7, 4); if (m>0) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFPS, 'y', 10, 9, 5, 68); if (m>1) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFPS, 'y', 10, 9, 6, 238); if (m>2) 
libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFPS, 'y', 2, 1, 7, 68); if (m>3) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFPS, 'y', 2, 1, 8, 238); if (m>0) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFPS, 'y', 12, 11, 9, 68); if (m>1) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFPS, 'y', 12, 11, 10, 238); if (m>2) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFPS, 'y', 4, 3, 11, 68); if (m>3) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFPS, 'y', 4, 3, 12, 238); if (m>0) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VPERM2F128, 'y', 9, 5, 1, 32); if (m>1) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VPERM2F128, 'y', 10, 6, 2, 32); if (m>2) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VPERM2F128, 'y', 11, 7, 3, 32); if (m>3) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VPERM2F128, 'y', 12, 8, 4, 32); if (m>4) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VPERM2F128, 'y', 9, 5, 5, 49); if (m>5) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VPERM2F128, 'y', 10, 6, 6, 49); if (m>6) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VPERM2F128, 'y', 11, 7, 7, 49); if (m>7) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VPERM2F128, 'y', 12, 8, 8, 49); i = io_generated_code->code_size; } } else { /* avx512 */ /* vshuff64x2 $0xEE, %zmm3 , %zmm1 , %zmm9 */ 
io_generated_code->code_size = i; if (m>0) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 3, 1, 9, 0xEE); /* vshuff64x2 $0x44, %zmm3 , %zmm1 , %zmm1 */ if (m>0) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 3, 1, 1, 0x44); /* vshuff64x2 $0xEE, %zmm4 , %zmm2 , %zmm10 */ if (m>2) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 4, 2, 10, 0xEE); /* vshuff64x2 $0x44, %zmm4 , %zmm2 , %zmm2 */ if (m>0) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 4, 2, 2, 0x44); /* vshuff64x2 $0xEE, %zmm7 , %zmm5 , %zmm11 */ if (m>4) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 7, 5, 11, 0xEE); /* vshuff64x2 $0x44, %zmm7 , %zmm5 , %zmm3 */ if (m>0) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 7, 5, 3, 0x44); /* vshuff64x2 $0xEE, %zmm8 , %zmm6 , %zmm12 */ if (m>4) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 8, 6, 12, 0xEE); /* vshuff64x2 $0x44, %zmm8 , %zmm6 , %zmm4 */ if (m>0) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 8, 6, 4, 0x44); /* vshuff64x2 $0xDD, %zmm3 , %zmm1 , %zmm6 */ if (m>0) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 3, 1, 6, 0xDD); /* vshuff64x2 $0x88, %zmm3 , %zmm1 , %zmm5 */ if (m>0) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 3, 1, 5, 0x88); /* vshuff64x2 $0xDD, %zmm11, %zmm9 , %zmm8 */ if (m>6) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 11, 9, 
8, 0xDD); /* vshuff64x2 $0x88, %zmm11, %zmm9 , %zmm7 */ if (m>4) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 11, 9, 7, 0x88); /* vshuff64x2 $0x88, %zmm12, %zmm10, %zmm11 */ if (m>4) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 12, 10, 11, 0x88); /* vshuff64x2 $0xDD, %zmm12, %zmm10, %zmm12 */ if (m>6) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 12, 10, 12, 0xDD); /* vshuff64x2 $0xDD, %zmm4 , %zmm2 , %zmm10 */ if (m>2) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 4, 2, 10, 0xDD); /* vshuff64x2 $0x88, %zmm4 , %zmm2 , %zmm9 */ if (m>0) libxsmm_x86_instruction_vec_shuffle_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VSHUFF64X2, 'z', 4, 2, 9, 0x88); /* vunpcklpd %zmm9 , %zmm5, %zmm1 */ if (m>0) libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKLPD, 'z', 9, 5, 1); /* vunpckhpd %zmm9 , %zmm5, %zmm2 */ if (m>1) libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKHPD, 'z', 9, 5, 2); /* vunpcklpd %zmm10, %zmm6, %zmm3 */ if (m>2) libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKLPD, 'z', 10, 6, 3); /* vunpckhpd %zmm10, %zmm6, %zmm4 */ if (m>3) libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKHPD, 'z', 10, 6, 4); /* vunpcklpd %zmm11, %zmm7, %zmm5 */ if (m>4) libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKLPD, 'z', 11, 7, 5); /* vunpckhpd %zmm11, %zmm7, %zmm6 */ if (m>5) libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKHPD, 'z', 11, 7, 6); /* vunpcklpd %zmm12, %zmm8, %zmm7 */ if (m>6) 
libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKLPD, 'z', 12, 8, 7); /* vunpckhpd %zmm12, %zmm8, %zmm8 */ if (m>7) libxsmm_x86_instruction_vec_compute_reg(io_generated_code, LIBXSMM_X86_SSE3, LIBXSMM_X86_INSTR_VUNPCKHPD, 'z', 12, 8, 8); i = io_generated_code->code_size; } if (n_fits_in_xmm) { cval = 'x'; } else { cval = 'y'; } } if (!avx512) { /* Special case when ldb==1-> just do a copy */ if (ldb == 1) { if (m_fits_in_xmm) { /* vmovups %xmm1, offsetB(%rdx) */ io_generated_code->code_size = i; libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB, 'x', 1, 0, 0, 1); i = io_generated_code->code_size; } if (m_nonone_border) { io_generated_code->code_size = i; if (datasize == 8) l_instr = LIBXSMM_X86_INSTR_VMASKMOVPD; else l_instr = LIBXSMM_X86_INSTR_VMASKMOVPS; libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB, 'y', 1, 0, 1); i = io_generated_code->code_size; } if (m_fits_in_ymm) { /* vmovups %ymm1, (%rdx) */ io_generated_code->code_size = i; libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB, 'y', 1, 0, 0, 1); i = io_generated_code->code_size; } } } /* Part 3: Store out the data */ if (avx512 || (!avx512 && (m > 1) && (ldb > 1))) { if (!avx512 && ((n == 1) || n_nonone_border)) { if (maskvar == n) { reg = 0; /* ymm0 already contains the right mask */ } else { reg = 13; /* ymm13 better have the right mask */ } io_generated_code->code_size = i; if (datasize == 8) l_instr = LIBXSMM_X86_INSTR_VMASKMOVPD; else l_instr = LIBXSMM_X86_INSTR_VMASKMOVPS; libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB, 'y', 1, reg, 1); if (m>1) 
libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + shiftmult*ldb, 'y', 2, reg, 1); if (m>2) libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 2 * shiftmult*ldb, 'y', 3, reg, 1); if (m>3) libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 3 * shiftmult*ldb, 'y', 4, reg, 1); if (m>4) libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 4 * shiftmult*ldb, 'y', 5, reg, 1); if (m>5) libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 5 * shiftmult*ldb, 'y', 6, reg, 1); if (m>6) libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 6 * shiftmult*ldb, 'y', 7, reg, 1); if (m>7) libxsmm_x86_instruction_vec_mask_move(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 7 * shiftmult*ldb, 'y', 8, reg, 1); i = io_generated_code->code_size; } else { if (!avx512) { io_generated_code->code_size = i; /* vmovups %ymm1, (%rdx) or xmm1 if cval=='x'*/ libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB, cval, 1, 0, 0, 1); /* vmovups %ymm2, (%rdx) */ if (m>1) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + shiftmult*ldb, cval, 2, 0, 0, 1); /* vmovups %ymm3, (%rdx) */ if (m>2) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 2 * 
shiftmult*ldb, cval, 3, 0, 0, 1); /* vmovups %ymm4, (%rdx) */ if (m>3) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 3 * shiftmult*ldb, cval, 4, 0, 0, 1); /* vmovups %ymm5, (%rdx) */ if (m>4) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 4 * shiftmult*ldb, cval, 5, 0, 0, 1); /* vmovups %ymm6, (%rdx) */ if (m>5) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 5 * shiftmult*ldb, cval, 6, 0, 0, 1); /* vmovups %ymm7, (%rdx) */ if (m>6) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 6 * shiftmult*ldb, cval, 7, 0, 0, 1); /* vmovups %ymm8, (%rdx) */ if (m>7) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 7 * shiftmult*ldb, cval, 8, 0, 0, 1); i = io_generated_code->code_size; } else { /* avx512 */ cval = 'z'; if (n % REGSIZE == 0) reg = 0; else reg = 2; io_generated_code->code_size = i; /* vmovups %zmm1, offsetB(%rdx) {%k2} (reg is the mask reg) */ if (m>0) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB, cval, 1, reg, 0, 1); /* vmovups %zmm2, *(%rdx) {%k2} */ if (m>1) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 8 * ldb, cval, 2, reg, 0, 1); if (m>2) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, 
LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 16 * ldb, cval, 3, reg, 0, 1); if (m>3) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 24 * ldb, cval, 4, reg, 0, 1); if (m>4) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 32 * ldb, cval, 5, reg, 0, 1); if (m>5) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 40 * ldb, cval, 6, reg, 0, 1); if (m>6) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 48 * ldb, cval, 7, reg, 0, 1); if (m>7) libxsmm_x86_instruction_vec_move(io_generated_code, LIBXSMM_X86_AVX, LIBXSMM_X86_INSTR_VMOVUPS, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + 56 * ldb, cval, 8, reg, 0, 1); i = io_generated_code->code_size; } /* avx512 */ } } if (!avx512) { if (m == 1) { io_generated_code->code_size = i; if (datasize == 4) { l_instr = LIBXSMM_X86_INSTR_MOVL; } else { l_instr = LIBXSMM_X86_INSTR_MOVQ; } /* movl %r9d, (%rdx) */ if (n >= 1) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB, i_gp_reg_mapping->gp_reg_n_loop, 1); /* movl %r10d, (%rdx) */ if (n >= 2) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + shiftmult, LIBXSMM_X86_GP_REG_R10, 1); /* movl %r11d, (%rdx) */ if (n >= 3) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + shiftmult * 2, LIBXSMM_X86_GP_REG_R11, 1); /* movl %eax, (%rdx) */ if (n >= 4) libxsmm_x86_instruction_alu_mem(io_generated_code, 
l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + shiftmult * 3, LIBXSMM_X86_GP_REG_RAX, 1); /* movl %r12d, (%rdx) */ if (n >= 5) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + shiftmult * 4, LIBXSMM_X86_GP_REG_R12, 1); /* movl %r13d, (%rdx) */ if (n >= 6) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + shiftmult * 5, LIBXSMM_X86_GP_REG_R13, 1); /* movl %r14d, (%rdx) */ if (n >= 7) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + shiftmult * 6, LIBXSMM_X86_GP_REG_R14, 1); /* movl %r15d, (%rdx) */ if (n >= 8) libxsmm_x86_instruction_alu_mem(io_generated_code, l_instr, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 1, offsetB + shiftmult * 7, LIBXSMM_X86_GP_REG_R15, 1); i = io_generated_code->code_size; } } /* avx512 */ /* *loc = i; */ io_generated_code->code_size = i; } LIBXSMM_API_INTERN void libxsmm_generator_transpose_avx_avx512_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_trans_descriptor* i_trans_desc, int i_arch ) { libxsmm_transpose_gp_reg_mapping l_gp_reg_mapping; libxsmm_loop_label_tracker l_loop_label_tracker; const char *const cpuid = libxsmm_cpuid_name( i_arch ); /* avx512 just represents whether we want to use zmm registers or not * * A value of 0 says not, a value of 1 targets AVX512_CORE, a value * * of 2 targets AVX512_MIC */ int avx512; /* define loop_label_tracker */ libxsmm_reset_loop_label_tracker( &l_loop_label_tracker ); /* define gp register mapping */ memset(&l_gp_reg_mapping, 0, sizeof(l_gp_reg_mapping)); #if defined(_WIN32) || defined(__CYGWIN__) l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_lda = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_ldb = LIBXSMM_X86_GP_REG_R9; 
l_gp_reg_mapping.gp_reg_m_loop = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_n_loop = LIBXSMM_X86_GP_REG_RSI; #else /* match calling convention on Linux */ l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI; l_gp_reg_mapping.gp_reg_lda = LIBXSMM_X86_GP_REG_RSI; l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX; l_gp_reg_mapping.gp_reg_ldb = LIBXSMM_X86_GP_REG_RCX; l_gp_reg_mapping.gp_reg_m_loop = LIBXSMM_X86_GP_REG_R8; l_gp_reg_mapping.gp_reg_n_loop = LIBXSMM_X86_GP_REG_R9; #endif l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF; l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF; /* Actually, the logic is this: we need a, lda, and b. We don't need ldb * * If n>=6, we need rbx * * If n>=8, we need rbp * * If LIBXSMM_MIN(n,REGSIZE)>=5 and m%REGSIZE==1, we need r12 * * If LIBXSMM_MIN(n,REGSIZE)>=6 and m%REGSIZE==1, we need r13 * * If LIBXSMM_MIN(n,REGSIZE)>=7 and m%REGSIZE==1, we need r14 * * If LIBXSMM_MIN(n,REGSIZE)>=8 and m%REGSIZE==1, we need r15 * * Otherwise, we get by with registers that don't require pushing/popping */ /* define transposition kernel config */ if (LIBXSMM_X86_AVX512_CORE <= i_arch) { avx512 = 1; } else if (LIBXSMM_X86_AVX512 <= i_arch) { avx512 = 2; } else if (LIBXSMM_X86_AVX <= i_arch) { avx512 = 0; } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNSUP_ARCH ); return; } /* @Greg add more fields here */ /* open asm */ /* Note: I'm not sure exactly what to add to open_stream_transpose... this * is for the regular assembly coding? 
*/ libxsmm_x86_instruction_open_stream_transpose( io_generated_code, l_gp_reg_mapping.gp_reg_a, l_gp_reg_mapping.gp_reg_lda, l_gp_reg_mapping.gp_reg_b, l_gp_reg_mapping.gp_reg_ldb, cpuid ); if ( io_generated_code->code_type > 1 ) { unsigned char *buf = (unsigned char *) io_generated_code->generated_code; int i = io_generated_code->code_size; unsigned int m = i_trans_desc->m; unsigned int n = i_trans_desc->n; int loopm = 0, loopn= 0, mjmp, njmp; int imask = 0; int offsetA, offsetB, oldB; int j, k, m0, n0, shiftvalue, shiftmult; /* Note: the transpose routine only works when ldb is fixed at a value * So why do we need a variable "ldb"? Well, it's to keep track of * the original "n" and that's the only reason */ unsigned int ldo = i_trans_desc->ldo; unsigned int ldb; /* REGSIZE is used for masking. REGSIZE is just: * 4 for double on ymm (unless m=1, then it's 8), * 8 for single on ymm or double on zmm, * 16 for single on zmm */ unsigned int REGSIZE; int maskvar = 0; int datasize = i_trans_desc->typesize; if ( ldo < n ) { /* This means that we didn't store ldb correctly. Not sure why, Greg thinks we should change/fix this. 
*/ ldb = n; } else { ldb = ldo; } #ifdef GENERATOR_TRANSPOSE_DEBUG const unsigned int l_maxsize = io_generated_code->buffer_size; printf("Entering libxsmm_generator_transpose_avx_avx512_kernel with i loc=%d m=%d n=%d datasize=%d\n",i,m,n,datasize); printf("Space available: %d - %d = %d\n",l_maxsize,i,l_maxsize-i); #endif assert(0 < datasize); if ( (datasize != 4) && (datasize != 8) ) { fprintf(stderr,"Expecting a datasize of 4 or 8, but got %d\n",datasize); exit(-1); } /* Comment this next conditional out to *FORCE* AVX-512 dispatching */ if ( avx512 ) { /* Determine if we should really use ZMM registers, or not */ if ( d_ymm_or_zmm( m, n, avx512 ) == 1 ) { avx512 = 0; /* Ymm might be faster than zmm */ } if ( datasize == 4 ) avx512 = 0; /* Don't use avx512 on real*4 */ } if ( datasize == 8 ) { shiftvalue = 3; shiftmult = 8; if ( avx512 ) { REGSIZE = 8; } else { if ( m == 1 ) REGSIZE = 8; else REGSIZE = 4; } } else { shiftvalue = 2; shiftmult = 4; if ( avx512 ) REGSIZE = 16; else REGSIZE = 8; } if ( avx512 ) { m0 = m % REGSIZE; n0 = n % REGSIZE; if ( m0 > 0 ) { k = m0; if ( k == 1 ) imask = 3; else if ( k == 2 ) imask = 15; else if ( k == 3 ) imask = 63; else if ( k == 4 ) imask = 255; else if ( k == 5 ) imask = 1023; else if ( k == 6 ) imask = 4095; else if ( k == 7 ) imask = 16383; /* movq imask, %r8: */ io_generated_code->code_size = i; libxsmm_x86_instruction_alu_imm ( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, 8, imask ); /* kmovw %r8d, %k1: */ libxsmm_x86_instruction_mask_move ( io_generated_code, LIBXSMM_X86_INSTR_KMOVW, 8, 1, 0 ); i = io_generated_code->code_size; } if ( n0 > 0 ) { k = n0; if ( k == 1 ) imask = 3; else if ( k == 2 ) imask = 15; else if ( k == 3 ) imask = 63; else if ( k == 4 ) imask = 255; else if ( k == 5 ) imask = 1023; else if ( k == 6 ) imask = 4095; else if ( k == 7 ) imask = 16383; /* movq imask, %r8: */ io_generated_code->code_size = i; libxsmm_x86_instruction_alu_imm ( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, 8, imask ); /* kmovw 
%r8d, %k2: */ libxsmm_x86_instruction_mask_move ( io_generated_code, LIBXSMM_X86_INSTR_KMOVW, 8, 2, 0 ); i = io_generated_code->code_size; } } if ( n > 1 ) { /* movslq (%rsi), %rsi and salq $shiftvalue, %rsi */ io_generated_code->code_size = i; libxsmm_x86_instruction_alu_mem ( io_generated_code, LIBXSMM_X86_INSTR_MOVSLQ, LIBXSMM_X86_GP_REG_RSI, LIBXSMM_X86_GP_REG_UNDEF, 1, 0, LIBXSMM_X86_GP_REG_RSI, 0 ); libxsmm_x86_instruction_alu_imm ( io_generated_code, LIBXSMM_X86_INSTR_SALQ, LIBXSMM_X86_GP_REG_RSI, shiftvalue ); i = io_generated_code->code_size; } if ( n >= 4 ) { /* movq %rsi, %r8 and imul $3, %r8: */ io_generated_code->code_size = i; libxsmm_x86_instruction_alu_reg ( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, LIBXSMM_X86_GP_REG_RSI, LIBXSMM_X86_GP_REG_R8 ); libxsmm_x86_instruction_alu_imm ( io_generated_code, LIBXSMM_X86_INSTR_IMUL, LIBXSMM_X86_GP_REG_R8, 3 ); i = io_generated_code->code_size; if ( LIBXSMM_MIN(n,REGSIZE) >= 6 ) { /* movq %rsi, %rbx and imul $5, %rbx : */ io_generated_code->code_size = i; libxsmm_x86_instruction_alu_reg ( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, LIBXSMM_X86_GP_REG_RSI, LIBXSMM_X86_GP_REG_RBX ); libxsmm_x86_instruction_alu_imm ( io_generated_code, LIBXSMM_X86_INSTR_IMUL, LIBXSMM_X86_GP_REG_RBX, 5 ); i = io_generated_code->code_size; } if ( LIBXSMM_MIN(n,REGSIZE) >= 8 ) { /* movq %rsi, %rbp and imul $7, %rbp: */ io_generated_code->code_size = i; libxsmm_x86_instruction_alu_reg ( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, LIBXSMM_X86_GP_REG_RSI, LIBXSMM_X86_GP_REG_RBP ); libxsmm_x86_instruction_alu_imm ( io_generated_code, LIBXSMM_X86_INSTR_IMUL, LIBXSMM_X86_GP_REG_RBP, 7 ); i = io_generated_code->code_size; } } #ifdef GENERATOR_TRANSPOSE_DEBUG printf("loc1 m=%d n=%d i=%d datasize=%d\n",m,n,i,datasize); #endif /* Load any necessary masks into ymm0 and/or ymm13 */ if ( !avx512 && (m != 1) ) { int mt = m%REGSIZE; int nt = n%REGSIZE; int reg = 0; if ( datasize == 8 ) { if ( mt == 3 ) { io_generated_code->code_size = i; 
load_mask_into_var ( mt, datasize, reg, buf, &i ); io_generated_code->code_size = i; if ( reg == 0 ) maskvar = mt; reg = 13; } if ( (nt == 1) || (nt == 3) ) { io_generated_code->code_size = i; load_mask_into_var ( nt, datasize, reg, buf, &i ); io_generated_code->code_size = i; if ( reg == 0 ) maskvar = nt; } } else if ( datasize == 4 ) { if ( (mt==2) || (mt==3) || (mt==5) || (mt==6) || (mt==7) ) { io_generated_code->code_size = i; load_mask_into_var ( mt, datasize, reg, buf, &i ); io_generated_code->code_size = i; if ( reg == 0 ) maskvar = mt; reg = 13; } if ( (nt==1) || ((nt != mt) && (nt != 4)) ) { io_generated_code->code_size = i; load_mask_into_var ( nt, datasize, reg, buf, &i ); io_generated_code->code_size = i; if ( reg == 0 ) maskvar = nt; } } } /* Determine whether to use loops or not */ if ( (n / REGSIZE) >= 2 ) loopn = n / REGSIZE; else loopn = 0; if ( (m / REGSIZE) >= 2 ) loopm = m / REGSIZE; else loopm = 0; /* To prevent and disable looping, just set loopm and loopn to 0 */ #ifdef PREVENT_TRANSPOSE_LOOPING loopm = 0; loopn = 0; #endif if ( loopn > 0 ) { io_generated_code->code_size = i; libxsmm_x86_instruction_alu_imm ( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, l_gp_reg_mapping.gp_reg_ldb, loopn ); libxsmm_x86_instruction_register_jump_back_label ( io_generated_code, &l_loop_label_tracker ); i = io_generated_code->code_size; } /* Here is the main loop and it's logic is simple. We just "stamp" a bunch * of smaller transpositions using the routine "get_one_trans()". 
* Eventually, incorporate loops into this for smaller footprints */ offsetA = 0; offsetB = 0; oldB = 0; njmp = REGSIZE; if ( loopn > 0 ) njmp = REGSIZE*loopn; mjmp = REGSIZE; if ( loopm > 0 ) mjmp = REGSIZE*loopm; for (j = 1; j <= (int)n; j += njmp ) { offsetA = 0; oldB = offsetB; if ( loopm > 0 ) { io_generated_code->code_size = i; libxsmm_x86_instruction_alu_imm ( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, LIBXSMM_X86_GP_REG_R15, loopm ); libxsmm_x86_instruction_register_jump_back_label ( io_generated_code, &l_loop_label_tracker ); i = io_generated_code->code_size; } for ( k = 1; k <= (int)m; k += mjmp ) { io_generated_code->code_size = i; /* Note that the m, n parameters aren't the original m, n; which is why we also pass in this phony "ldb". Make certain this routine is never called with values in excess of REGSIZE */ #ifdef GENERATOR_TRANSPOSE_DEBUG printf("calling gen_one_trans mxn=%dx%d using %dx%d offsetA=%d offsetB=%d i=%d datasize=%d maskvar=%d\n",m,n,LIBXSMM_MIN(REGSIZE,((int)m)-k+1),LIBXSMM_MIN(REGSIZE,((int)n)-j+1),offsetA,offsetB,i,datasize,maskvar); #endif /* This routine just does a single transpose at a time. 
*/ assert(k <= (int)(m + 1) && j <= (int)(n + 1)); gen_one_trans(io_generated_code, &l_gp_reg_mapping, LIBXSMM_MIN(REGSIZE,m-k+1), LIBXSMM_MIN(REGSIZE,n-j+1), ldb, offsetA, offsetB, datasize, avx512, maskvar); if (0 != io_generated_code->last_error) return; i = io_generated_code->code_size; #ifdef GENERATOR_TRANSPOSE_DEBUG printf("done calling gen_one_trans mxn=%dx%d using %dx%d offsetA=%d offsetB=%d i=%d datasize=%d maskvar=%d\n",m,n,LIBXSMM_MIN(REGSIZE,((int)m)-k+1),LIBXSMM_MIN(REGSIZE,((int)n)-j+1),offsetA,offsetB,i,datasize,maskvar); #endif if ( loopm == 0 ) { offsetA += shiftmult*mjmp; offsetB += shiftmult*mjmp*ldb; } else if ( k==1 ) { io_generated_code->code_size = i; libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_ADDQ, l_gp_reg_mapping.gp_reg_a, shiftmult*REGSIZE ); libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_ADDQ, l_gp_reg_mapping.gp_reg_b, shiftmult*REGSIZE*ldb ); libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_SUBQ, LIBXSMM_X86_GP_REG_R15, 1 ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, LIBXSMM_X86_INSTR_JG, &l_loop_label_tracker ); i = io_generated_code->code_size; } } if ( loopm > 0 ) { io_generated_code->code_size = i; libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_SUBQ, l_gp_reg_mapping.gp_reg_b, shiftmult*REGSIZE*ldb*loopm ); libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_SUBQ, l_gp_reg_mapping.gp_reg_a, shiftmult*REGSIZE*loopm ); i = io_generated_code->code_size; } if ( j+REGSIZE <= n ) { io_generated_code->code_size = i; /* addq %r8, %rdi: */ if ( REGSIZE == 4 ) libxsmm_x86_instruction_alu_reg ( io_generated_code, LIBXSMM_X86_INSTR_ADDQ, LIBXSMM_X86_GP_REG_R8, LIBXSMM_X86_GP_REG_RDI ); /* addq %rbp, %rdi: */ if ( REGSIZE == 8 ) libxsmm_x86_instruction_alu_reg ( io_generated_code, LIBXSMM_X86_INSTR_ADDQ, LIBXSMM_X86_GP_REG_RBP, LIBXSMM_X86_GP_REG_RDI ); /* addq %rsi, %rdi: */ libxsmm_x86_instruction_alu_reg ( 
io_generated_code, LIBXSMM_X86_INSTR_ADDQ, LIBXSMM_X86_GP_REG_RSI, LIBXSMM_X86_GP_REG_RDI ); i = io_generated_code->code_size; } offsetB = oldB + shiftmult*REGSIZE; } if ( loopn > 0 ) { io_generated_code->code_size = i; libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_ADDQ, l_gp_reg_mapping.gp_reg_b, shiftmult*REGSIZE ); libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_SUBQ, l_gp_reg_mapping.gp_reg_ldb, 1 ); libxsmm_x86_instruction_jump_back_to_label( io_generated_code, LIBXSMM_X86_INSTR_JG, &l_loop_label_tracker ); i = io_generated_code->code_size; } io_generated_code->code_size = i; #ifdef GENERATOR_TRANSPOSE_DEBUG printf("almost done with m=%d n=%d i=%d datasize=%d\n",m,n,i,datasize); #endif } /* close asm: note that we really didn't need to push everything */ libxsmm_x86_instruction_close_stream_transpose( io_generated_code, cpuid ); #ifdef GENERATOR_TRANSPOSE_DEBUG printf("done with m=%d n=%d i=%d\n",i_trans_desc->m,i_trans_desc->n,io_generated_code->code_size); #endif } libxsmm-1.17/src/generator_transpose_avx_avx512.h000066400000000000000000000023351415223013700221260ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry (Intel Corp.) 
******************************************************************************/ #ifndef GENERATOR_TRANSPOSE_AVX_AVX512_H #define GENERATOR_TRANSPOSE_AVX_AVX512_H #include "generator_common.h" LIBXSMM_API_INTERN void libxsmm_generator_transpose_avx_avx512_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_trans_descriptor* i_trans_desc, int i_arch ); #endif /* GENERATOR_TRANSPOSE_AVX_AVX512_H */ libxsmm-1.17/src/generator_x86_instructions.c000066400000000000000000010310161415223013700213670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry (Intel Corp.) ******************************************************************************/ #include "generator_x86_instructions.h" #include "generator_common.h" /** * This routine is for the jit code. All offsets/displacements have similar * byte patterns, so this is used for all of them. 
*/
LIBXSMM_API_INLINE int internal_x86_instructions_add_offset(const unsigned int i_place1,
  const unsigned int i_place2, const int i_offset, const unsigned int i_forced,
  const int i_sizereg, unsigned char *buf)
{
  /* Emits the displacement bytes of a ModRM/SIB-addressed instruction.
   * i_place1:  index of the ModRM byte in buf; its mod field is bumped by
   *            0x40 (disp8) or 0x80 (disp32) depending on the form chosen
   * i_place2:  index in buf where the displacement bytes are written
   * i_offset:  byte displacement to encode
   * i_forced:  when nonzero, emit a displacement even if i_offset is 0
   * i_sizereg: divisor for the compressed 8-bit form (EVEX disp8*N
   *            compression; callers pass 1 when no compression applies)
   * Returns the number of displacement bytes written: 0, 1 or 4. */
  if ((i_offset == 0) && (i_forced == 0)) return (0);
  else if (((i_offset%i_sizereg) == 0) &&
           (i_offset / i_sizereg <= 127) &&
           (i_offset / i_sizereg >= -128))
  {
    /* compressed single-byte displacement */
    buf[i_place1] += 0x40;
    buf[i_place2] = (unsigned char)(i_offset / i_sizereg);
    return (1);
  } else {
    /* full 4-byte displacement; bytes are taken from the int in memory
     * order (little-endian on the x86 targets this JIT emits for) */
    unsigned char *l_cptr = (unsigned char *)&i_offset;
    buf[i_place1] += 0x80;
    buf[i_place2] = l_cptr[0];
    buf[i_place2 + 1] = l_cptr[1];
    buf[i_place2 + 2] = l_cptr[2];
    buf[i_place2 + 3] = l_cptr[3];
    return (4);
  }
}

/**
 * This routine is for the jump jit code. All jumps have similar patterns.
 * Back jumps can be computed immediately because the source and dest is known
 * Forward jumps can be estimated as 4-byte jumps taking 5 or 6 bytes in total
 * i_src_location: location of the start of the jump instruction. It's passed
 *    in as it may have nothing to do with the last location coded in our jit
 *    stream. For backward jumps, it's probably io_generated_code->code_size
 * i_dest_location: location of the start of the target destination we are
 *    jumping to, or -1 if it's a forward jump and currently unknown
 * i_jmp_instr is one of the jump instructions we support
 * This function returns the number of bytes it uses, or 0 if it fails
 */
LIBXSMM_API_INLINE int internal_x86_jumping( libxsmm_generated_code* io_generated_code,
  int i_src_location,
  int i_dest_location,
  const unsigned int i_jmp_instr )
{
  unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
  int l_jmptype;
  int l_dist;
  unsigned char *l_cptr = (unsigned char *) &l_dist;
  /* check that we just handle a valid jump; l_jmptype is the short-form
   * (rel8) opcode of the condition */
  switch ( i_jmp_instr ) {
    case LIBXSMM_X86_INSTR_JL:
      l_jmptype = 0x7c;
      break;
    case LIBXSMM_X86_INSTR_JE:
    case LIBXSMM_X86_INSTR_JZ:
      l_jmptype = 0x74;
      break;
    case LIBXSMM_X86_INSTR_JG:
      l_jmptype = 0x7F;
      break;
    case LIBXSMM_X86_INSTR_JNE:
    case LIBXSMM_X86_INSTR_JNZ:
      l_jmptype = 0x75;
      break;
    case LIBXSMM_X86_INSTR_JGE:
      l_jmptype = 0x7D;
      break;
    case LIBXSMM_X86_INSTR_JLE:
      l_jmptype = 0x7E;
      break;
    case LIBXSMM_X86_INSTR_JMP:
      l_jmptype = 0xEB;
      break;
    default:
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNSUPPORTED_JUMP );
      return 0;
  }
  /* The jmp instruction better be somewhere valid in the code */
  if ( i_src_location < 0 ) {
    fprintf(stderr,"Bogus source location for internal jumping routine: %i\n", i_src_location);
    exit(-1);
  }
  /* Make sure i_src_location is no bigger than the end of the code */
  if ( (unsigned int)i_src_location > io_generated_code->code_size ) {
    fprintf(stderr,"How can the source of the jump itself be an instruction far beyond where we've jitted? Something is really strange here src=%i loc=%u\n",i_src_location,io_generated_code->code_size);
    exit(-1);
  }
  if ( i_dest_location < 0 ) {
    /* Must be a forward jump and we don't yet know it's dest location */
    if ( i_jmp_instr == LIBXSMM_X86_INSTR_JMP ) {
      buf[i_src_location] = 0xe9;
      /* Fill-in zeros for now, this routine has to be called again: */
      buf[i_src_location+1] = 0x00;
      buf[i_src_location+2] = 0x00;
      buf[i_src_location+3] = 0x00;
      buf[i_src_location+4] = 0x00;
      return 5;
    } else {
      /* long conditional form: 0x0f, short opcode + 0x10 */
      buf[i_src_location] = 0x0f;
      buf[i_src_location+1] = (unsigned char)(l_jmptype + 0x10);
      /* Fill-in zeros for now, this routine has to be called again: */
      buf[i_src_location+2] = 0x00;
      buf[i_src_location+3] = 0x00;
      buf[i_src_location+4] = 0x00;
      buf[i_src_location+5] = 0x00;
      return 6;
    }
  }
  /* Make sure we aren't trying to jump to the same location as the original jump instruction */
  if ( i_src_location==i_dest_location || (i_src_location==i_dest_location+1) ) {
    fprintf(stderr,"i_src_location=%i is physically too close to i_dest_location=%i\n",i_src_location,i_dest_location);
    exit(-1);
  }
  if ( i_src_location > i_dest_location ) {
    /* Must be a backward jump */
    l_dist = -1*(i_src_location+2-i_dest_location); /* assume 1-byte */
    if ( l_dist >= -128 ) /* can it be done in 1-byte? */
    {
      /* Single byte back jump */
      buf[i_src_location] = (unsigned char)l_jmptype;
      buf[i_src_location+1] = (unsigned char)l_dist;
      return 2;
    } else {
      /* 4-byte back jump */
      if ( i_jmp_instr != LIBXSMM_X86_INSTR_JMP ) {
        /* l_cptr better point to l_dist and l_dist needs to be recalculated */
        l_dist = -1*(i_src_location+6-i_dest_location);
        buf[i_src_location] = 0x0f;
        buf[i_src_location+1] = (unsigned char)(l_jmptype + 0x10);
        buf[i_src_location+2] = l_cptr[0];
        buf[i_src_location+3] = l_cptr[1];
        buf[i_src_location+4] = l_cptr[2];
        buf[i_src_location+5] = l_cptr[3];
        return 6;
      } else {
        /* l_cptr better point to l_dist and l_dist needs to be recalculated */
        l_dist = -1*(i_src_location+5-i_dest_location);
        buf[i_src_location] = 0xE9;
        buf[i_src_location+1] = l_cptr[0];
        buf[i_src_location+2] = l_cptr[1];
        buf[i_src_location+3] = l_cptr[2];
        buf[i_src_location+4] = l_cptr[3];
        return 5;
      }
    }
  } else {
    /* Must be a 4 or 5 byte forward jump with all locations known */
    if ( i_jmp_instr == LIBXSMM_X86_INSTR_JMP ) {
      /* l_cptr better point to l_dist and l_dist needs to be recalculated */
      l_dist = (i_dest_location-i_src_location-5);
      buf[i_src_location] = 0xe9;
      buf[i_src_location+1] = l_cptr[0];
      buf[i_src_location+2] = l_cptr[1];
      buf[i_src_location+3] = l_cptr[2];
      buf[i_src_location+4] = l_cptr[3];
      return 5;
    } else {
      /* l_cptr better point to l_dist and l_dist needs to be recalculated */
      l_dist = (i_dest_location-i_src_location-6);
      buf[i_src_location] = 0x0f;
      buf[i_src_location+1] = (unsigned char)(l_jmptype + 0x10);
      buf[i_src_location+2] = l_cptr[0];
      buf[i_src_location+3] = l_cptr[1];
      buf[i_src_location+4] = l_cptr[2];
      buf[i_src_location+5] = l_cptr[3];
      return 6;
    }
  }
}

/* Emits a VEX-encoded masked vector move (VMASKMOVPS/VMASKMOVPD);
 * signature continues below with the mask register and load/store flag. */
LIBXSMM_API_INTERN
void libxsmm_x86_instruction_vec_mask_move( libxsmm_generated_code* io_generated_code,
  const unsigned int i_vmove_instr,
  const unsigned int i_gp_reg_base,
  const unsigned int i_gp_reg_idx,
  const unsigned int i_scale,
  const int i_displacement,
  const char i_vector_name,
  const unsigned int i_vec_reg_number_0,
  const unsigned int i_vec_reg_mask_0,
  const unsigned int i_is_store )
{
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* binary JIT path: append the raw VEX-encoded instruction bytes */
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size; /* int i = *loc; */
    unsigned int l_maxsize = io_generated_code->buffer_size; /* unsigned int l_maxsize = 1024; */
    /* register numbers split into low 3 bits (ModRM/SIB fields) and the
     * group selected via prefix bits */
    int l_regbas0 = i_gp_reg_base % 8;
    int l_gp8 = ((i_gp_reg_base > 7)&&(i_gp_reg_base<=15)?1:0);
    int l_regidx = 0;
    int l_ix8 = ((i_gp_reg_idx > 7)&&(i_gp_reg_idx<=15)?1:0);
    int l_vecval0 = i_vec_reg_number_0 % 8;
    int l_vecgrp0 = i_vec_reg_number_0 / 8;
    int l_oddgrp0 = ((l_vecgrp0 % 2)==1);
    int l_vecval1 = i_vec_reg_mask_0 % 8;
    int l_vecgrp1 = i_vec_reg_mask_0 / 8;
    int l_oddgrp1 = ((l_vecgrp1 % 2)==1);
    int l_sca=0;
    int l_inst = 0;
    int l_place1;
    if ( /*(i_gp_reg_idx>=0) &&*/ i_gp_reg_idx<=15 ) l_regidx = i_gp_reg_idx % 8;
    if ( l_maxsize - i < 20 ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL );
      return;
    }
    /* SIB scale field: 0x00/0x40/0x80/0xc0 for scale 1/2/4/8 */
    if (i_scale==2) l_sca=0x40;
    else if (i_scale==4) l_sca=0x80;
    else if (i_scale==8) l_sca=0xc0;
    if ( (i_vector_name != 'y') && (i_vector_name != 'Y') ) {
      fprintf(stderr, "libxsmm_instruction_vec_mask_move only works with i_vector_name as y for ymm* registers\n");
      exit(-1);
    }
    /* opcode low bits: PS=0x00/0x02, PD=0x01/0x03 (load/store) */
    switch ( i_vmove_instr ) {
      case LIBXSMM_X86_INSTR_VMASKMOVPD:
        if ( i_is_store == 0 ) l_inst= 0x01;
        else l_inst= 0x03;
        break;
      case LIBXSMM_X86_INSTR_VMASKMOVPS:
        if ( i_is_store == 0 ) l_inst= 0x00;
        else l_inst= 0x02;
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_vec_mask_move: Exactly what sort of instructions are you using?\n");
        exit(-1);
    }
    /* 3-byte VEX prefix (0xc4); register-extension bits are subtracted
     * from the following two bytes */
    buf[i++] = (unsigned char)(0xc4);
    buf[i++] = (unsigned char)(0xe2 - l_gp8 * 0x20 - l_ix8 * 0x40 - l_oddgrp0 * 0x80);
    buf[i++] = (unsigned char)(0x7d - l_oddgrp1 * 0x40 - l_vecval1*8);
    buf[i++] = (unsigned char)(0x2c + l_inst);
    l_place1 = i;
    if ( /*(i_gp_reg_idx>=0) &&*/ i_gp_reg_idx<=15 ) {
      /* base+index addressing: ModRM selects a SIB byte */
      buf[i++] = (unsigned char)(0x04 + l_vecval0*8);
      buf[i++] = (unsigned char)(l_sca + l_regbas0 + l_regidx*8);
    } else {
      buf[i++] = (unsigned char)(l_regbas0 + l_vecval0*8);
    }
    i += internal_x86_instructions_add_offset( l_place1, i, i_displacement, 0, 1, buf );
    io_generated_code->code_size = i; /* *loc = i; */
  } else {
    /* textual (assembly-printing) path: not implemented for this instruction */
  }
}

LIBXSMM_API_INTERN
void libxsmm_x86_instruction_vec_move( libxsmm_generated_code* io_generated_code,
  const unsigned int i_instruction_set,
  const unsigned int i_vmove_instr,
  const unsigned int i_gp_reg_base,
  const unsigned int i_gp_reg_idx,
  const unsigned int i_scale,
  const int i_displacement,
  const char i_vector_name,
  const unsigned int i_vec_reg_number_0,
  const unsigned int i_mask_reg_number,
  const unsigned int i_use_zero_masking,
  const unsigned int i_is_store )
{
  /* streaming (non-temporal) moves only exist as stores */
  if ( (i_is_store == 0) &&
       ( (i_vmove_instr == LIBXSMM_X86_INSTR_VMOVNTPD) ||
         (i_vmove_instr == LIBXSMM_X86_INSTR_VMOVNTPS) ||
         (i_vmove_instr == LIBXSMM_X86_INSTR_VMOVNTDQ) )) {
    fprintf(stderr, "libxsmm_instruction_vec_move: streaming stores are only available when setting storing option to true!\n");
    exit(-1);
  }
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size; /* int i = *loc; */
    unsigned int l_maxsize = io_generated_code->buffer_size; /* unsigned int l_maxsize = 1024; */
    int l_iregnum = i_gp_reg_base % 8;
    int l_vregnum = i_vec_reg_number_0 % 8;
    /* accumulators for the prefix/opcode byte adjustments chosen per
     * instruction in the big switch below */
    int l_ivectype=0, l_ivectype2=0, l_iregoff=0, l_ivectype3=0;
    int l_vregoffset=0, l_vregoffset2=0;
    int l_aligned=0, l_forced_offset=0, l_penultimate=0;
    int l_place, l_num=0, l_num2=0, l_num3=0, l_sizereg=1;
    int l_maskingoff=0;
    int l_wow = 0;
    int l_scaleadj = 0;
    int l_bytes = 4; /* base number of bytes */
    int l_sse3 = 0;
    int l_insert_extra_byte = 0;
    int l_fpadj = 0;
    if ( (i_vector_name != 'z') && (i_mask_reg_number != 0) ) {
      fprintf(stderr, "libxsmm_instruction_vec_move: Masking is only enabled with zmm registers!\n");
      exit(-1);
    }
    if ( (i_use_zero_masking != 0) &&
         (i_mask_reg_number != 0) && (i_is_store != 0) ) {
      fprintf(stderr, "libxsmm_instruction_vec_move: zero-masked store cannot operate on memory destination!\n");
      exit(-1);
    }
    if ( l_maxsize - i < 20 ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL );
      return;
    }
    l_num = i_vec_reg_number_0 / 8; /* vector register group (0..3) */
    /* per-instruction encoding adjustments; l_sizereg additionally feeds
     * the EVEX disp8*N displacement compression */
    switch ( i_vmove_instr ) {
      case LIBXSMM_X86_INSTR_VMOVAPD:
        l_aligned += 0x18;
        if ( i_vector_name=='x' ) l_ivectype += 1;
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_ivectype2 += 0x81;
        l_sizereg = 64;
        break;
      case LIBXSMM_X86_INSTR_VMOVAPS:
        l_aligned += 0x18;
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        if ( i_vector_name!='x' ) l_ivectype -= 1; /* single */
        l_sizereg = 64;
        break;
      case LIBXSMM_X86_INSTR_VMOVSS:
        if ( i_vector_name!='x' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: You want to use vmovss without xmm?\n");
          exit(-1);
        }
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_ivectype += 2;
        break;
      case LIBXSMM_X86_INSTR_VMOVSD:
        if ( i_vector_name!='x' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: You want to use vmovsd without xmm?\n");
          exit(-1);
        }
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_ivectype += 3;
        break;
      case LIBXSMM_X86_INSTR_VPBROADCASTD:
        l_bytes = 5;
        if ( i_vector_name=='x' || i_vector_name=='y' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vpbroadcastd not yet implemented for xmm/ymm\n");
          exit(-1);
        }
        if ( i_is_store == 1 ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vpbroadcastd and store?\n");
          exit(-1);
        }
        l_ivectype2 += 0x01;
        l_penultimate += 0x48;
        l_num2 += 1;
        l_num3 += 0x21;
        l_sizereg = 4;
        break;
      case LIBXSMM_X86_INSTR_VPBROADCASTQ:
        l_bytes = 5;
        if ( i_vector_name=='x' || i_vector_name=='y' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vpbroadcastq not yet implemented for xmm/ymm\n");
          exit(-1);
        }
        if ( i_is_store == 1 ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vpbroadcastq and store?\n");
          exit(-1);
        }
        l_ivectype2 += 0x81;
        l_penultimate += 0x49;
        l_num2 += 1;
        l_num3 += 0x21;
        l_sizereg = 8;
        break;
      case LIBXSMM_X86_INSTR_VPBROADCASTB:
        l_bytes = 5;
        if ( i_vector_name=='x' || i_vector_name=='y' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vpbroadcastb not yet implemented for xmm/ymm\n");
          exit(-1);
        }
        if ( i_is_store == 1 ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vpbroadcastb and store?\n");
          exit(-1);
        }
        l_ivectype2 += 0x01;
        l_penultimate += 0x68;
        l_num2 += 1;
        l_num3 += 0x21;
        l_sizereg = 1;
        break;
      case LIBXSMM_X86_INSTR_VPBROADCASTW:
        l_bytes = 5;
        if ( i_vector_name=='x' || i_vector_name=='y' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vpbroadcastw not yet implemented for xmm/ymm\n");
          exit(-1);
        }
        if ( i_is_store == 1 ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vpbroadcastw and store?\n");
          exit(-1);
        }
        l_ivectype2 += 0x01;
        l_penultimate += 0x69;
        l_num2 += 1;
        l_num3 += 0x21;
        l_sizereg = 2;
        break;
      case LIBXSMM_X86_INSTR_VMOVDQA32:
        l_bytes = 5;
        if ( i_vector_name=='x' || i_vector_name=='y' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vmovdqa32 not yet implemented for xmm/ymm\n");
          exit(-1);
        }
        l_ivectype2 += 0x01;
        l_penultimate += 0x5f;
        l_num3 += 0x21;
        l_sizereg = 64;
        if ( i_is_store == 1 ) l_aligned += 0xf;
        break;
      case LIBXSMM_X86_INSTR_VMOVDQA64:
        l_bytes = 5;
        if ( i_vector_name=='x' || i_vector_name=='y' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vmovdqa64 not yet implemented for xmm/ymm\n");
          exit(-1);
        }
        l_ivectype2 += 0x81;
        l_penultimate += 0x5f;
        l_num3 += 0x21;
        l_sizereg = 64;
        if ( i_is_store == 1 ) l_aligned += 0xf;
        break;
      case LIBXSMM_X86_INSTR_VMOVDQU8:
        l_bytes = 5;
        if ( i_vector_name=='x' || i_vector_name=='y' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vmovdqu8 not yet implemented for xmm/ymm\n");
          exit(-1);
        }
        l_ivectype2 += 0x03;
        l_penultimate += 0x5f;
        l_num3 += 0x21;
        l_sizereg = 64;
        if ( i_is_store == 1 ) l_aligned += 0xf;
        break;
      case LIBXSMM_X86_INSTR_VMOVDQU16:
        l_bytes = 5;
        if ( i_vector_name=='x' || i_vector_name=='y' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vmovdqu16 not yet implemented for xmm/ymm\n");
          exit(-1);
        }
        l_ivectype2 += 0x83;
        l_penultimate += 0x5f;
        l_num3 += 0x21;
        l_sizereg = 64;
        if ( i_is_store == 1 ) l_aligned += 0xf;
        break;
      case LIBXSMM_X86_INSTR_VMOVDQU32:
        l_bytes = 5;
        if ( i_vector_name=='x' || i_vector_name=='y' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vmovdqu32 not yet implemented for xmm/ymm\n");
          exit(-1);
        }
        l_ivectype2 += 0x02;
        l_penultimate += 0x5f;
        l_num3 += 0x21;
        l_sizereg = 64;
        if ( i_is_store == 1 ) l_aligned += 0xf;
        break;
      case LIBXSMM_X86_INSTR_VMOVDQU64:
        l_bytes = 5;
        if ( i_vector_name=='x' || i_vector_name=='y' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vmovdqu64 not yet implemented for xmm/ymm\n");
          exit(-1);
        }
        l_ivectype2 += 0x82;
        l_penultimate += 0x5f;
        l_num3 += 0x21;
        l_sizereg = 64;
        if ( i_is_store == 1 ) l_aligned += 0xf;
        break;
      case LIBXSMM_X86_INSTR_VMOVNTPD:
        l_bytes = 4;
        if ( i_vector_name=='x' ) {
          fprintf(stderr,"libxsmm_instruction_vec_move: vmovntpd not yet implemented for xmm\n");
          exit(-1);
        }
        if ( l_num == 1 ) l_ivectype3 += 0x80;
        l_ivectype2 += 0x81;
        l_penultimate += 0x1A;
        l_sizereg = 64;
        break;
      case LIBXSMM_X86_INSTR_VMOVNTPS:
        l_bytes = 4;
        if ( i_vector_name=='x' ) {
          fprintf(stderr,"libxsmm_instruction_vec_move: vmovntps not yet implemented for xmm\n");
          exit(-1);
        }
        if ( l_num == 1 ) l_ivectype3 += 0x80;
        l_ivectype -= 0x01;
        l_penultimate += 0x1A;
        l_sizereg = 64;
        break;
      case LIBXSMM_X86_INSTR_VMOVNTDQ:
        l_bytes = 4;
        if ( i_vector_name=='x' ) {
          fprintf(stderr,"libxsmm_instruction_vec_move: vmovntdq not yet implemented for xmm\n");
          exit(-1);
        }
        if ( l_num == 1 ) l_ivectype3 += 0x80;
        l_ivectype2 += 0x01;
        l_penultimate += 0xD6;
        l_sizereg = 64;
        break;
      case LIBXSMM_X86_INSTR_VBROADCASTSD:
        l_bytes = 5;
        if ( i_vector_name=='x' ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vbroadcastsd and xmm?\n");
          exit(-1);
        }
        if ( i_is_store == 1 ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vbroadcastsd and stores?\n");
          exit(-1);
        }
        l_ivectype2 += 0x81;
        l_penultimate += 9;
        l_num2 += 1;
        l_num3 += 0x21;
        l_sizereg = 8;
        break;
      case LIBXSMM_X86_INSTR_VBROADCASTSS:
        if ( i_vector_name=='x' ) {
          l_ivectype += 1;
        }
        if ( i_is_store == 1 ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vbroadcastss and stores?\n");
          exit(-1);
        }
        l_bytes = 5;
        l_ivectype2 += 0x1;
        l_penultimate += 8;
        l_sizereg = 4;
        l_num2 += 1;
        l_num3 += 0x21;
        break;
      case LIBXSMM_X86_INSTR_VMOVUPD:
        if ( i_vector_name=='x' ) l_ivectype += 1;
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_sizereg = 64;
        l_ivectype2 += 0x81;
        break;
      case LIBXSMM_X86_INSTR_VPMOVDW:
        if ( i_vector_name=='x' ) l_ivectype += 1;
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_sizereg = 32;
        l_ivectype2 += 0x02;
        l_num2 += 1;
        l_penultimate += 0x22;
        break;
      case LIBXSMM_X86_INSTR_VPMOVDB:
        if ( i_vector_name=='x' ) l_ivectype += 1;
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_sizereg = 16;
        l_ivectype2 += 0x02;
        l_num2 += 1;
        l_penultimate += 0x20;
        break;
      case LIBXSMM_X86_INSTR_VPMOVSDB:
        if ( i_vector_name=='x' ) l_ivectype += 1;
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_sizereg = 16;
        l_ivectype2 += 0x02;
        l_num2 += 1;
        l_penultimate += 0x10;
        break;
      case LIBXSMM_X86_INSTR_VPMOVUSDB:
        if ( i_vector_name=='x' ) l_ivectype += 1;
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_sizereg = 16;
        l_ivectype2 += 0x02;
        l_num2 += 1;
        /* l_penultimate += 0x00;*/
        break;
      case LIBXSMM_X86_INSTR_VPMOVSXWD:
        if ( i_vector_name=='x' ) l_ivectype += 1;
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_sizereg = 64;
        l_ivectype2 += 0x81;
        l_num3 += 1;
        l_penultimate += 0x13;
        l_bytes = 5;
        l_wow += 0x20;
        break;
      case LIBXSMM_X86_INSTR_VPMOVZXWD:
        if ( i_vector_name=='x' ) l_ivectype += 1;
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_sizereg = 32;
        l_ivectype2 += 0x01;
        l_num3 += 1;
        l_penultimate += 0x23;
        l_bytes = 5;
        l_wow += 0x20;
        l_wow += 0xE1;
        break;
      case LIBXSMM_X86_INSTR_VPMOVSXBD:
        if ( i_vector_name=='x' ) l_ivectype += 1;
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_sizereg = 16;
        l_ivectype2 += 0x01;
        l_num3 += 1;
        l_penultimate += 0x11;
        l_bytes = 5;
        l_wow += 0x20;
        l_wow += 0xE1;
        break;
      case LIBXSMM_X86_INSTR_VPMOVZXBD:
        if (
             i_vector_name=='x' ) l_ivectype += 1;
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_sizereg = 16;
        l_ivectype2 += 0x01;
        l_num3 += 1;
        l_penultimate += 0x21;
        l_bytes = 5;
        l_wow += 0x20;
        l_wow += 0xE1;
        break;
      case LIBXSMM_X86_INSTR_VMOVUPS:
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        if ( i_vector_name!='x' ) l_ivectype -= 1; /* single */
        l_sizereg = 64;
        break;
      case LIBXSMM_X86_INSTR_VMOVDDUP:
        if ( i_is_store == 1 ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: vmovddup and stores?\n");
          exit(-1);
        }
        l_ivectype += 2;
        l_ivectype2 += 0x83;
        if ( l_num == 1 ) l_ivectype3 -= 0x80;
        l_penultimate += 2;
        l_sizereg = 64;
        if ( i_vector_name=='x' ) l_ivectype += 1;
        break;
      /* legacy SSE forms take the separate 0x0f encoding path below */
      case LIBXSMM_X86_INSTR_MOVAPD:
        l_sse3 = 1;
        l_insert_extra_byte = 0x66;
        l_fpadj = 0x18;
        break;
      case LIBXSMM_X86_INSTR_MOVUPD:
        l_sse3 = 1;
        l_insert_extra_byte = 0x66;
        break;
      case LIBXSMM_X86_INSTR_MOVAPS:
        l_sse3 = 1;
        l_fpadj = 0x18;
        break;
      case LIBXSMM_X86_INSTR_MOVUPS:
        l_sse3 = 1;
        break;
      case LIBXSMM_X86_INSTR_MOVSD:
        l_sse3 = 1;
        l_insert_extra_byte = 0xF2;
        break;
      case LIBXSMM_X86_INSTR_MOVSS:
        l_sse3 = 1;
        l_insert_extra_byte = 0xF3;
        break;
      case LIBXSMM_X86_INSTR_MOVDDUP:
        l_sse3 = 1;
        l_insert_extra_byte = 0xF2;
        l_fpadj = 2;
        if ( i_is_store ) {
          fprintf(stderr,"libxsmm_instruction_vec_move: don't support a store with movddup\n");
          exit(-1);
        }
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_vec_move: unexpected instruction number: %u\n",i_vmove_instr);
        exit(-1);
    }
    /* validate the register number against the requested register width */
    switch ( i_vector_name ) {
      case 'x':
        l_sizereg = 1;
        if ( l_num > 1 ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: Are you sure xmm%u exists?\n",i_vec_reg_number_0);
          exit(-1);
        }
        break;
      case 'y':
        l_ivectype += 5;
        l_sizereg = 1;
        if ( l_num > 2 ) {
          fprintf(stderr, "libxsmm_instruction_vec_move: Are you sure ymm%u exists?\n",i_vec_reg_number_0);
          exit(-1);
        }
        break;
      case 'z':
        l_bytes = 6; /* zmm always uses the 4-byte 0x62 (EVEX) prefix */
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_vec_move: Exactly what sort of fp regs are you using?\n");
        exit(-1);
    }
    if ( i_is_store == 1 ) {
      l_aligned += 1;
      /*if ( i_use_masking != 0 ) l_maskingoff = i_mask_reg_number;*/
    } else {
      /*The following addition of 0x80 appears broken...
      if ( i_use_masking != 0 ) l_maskingoff = 0x80 + i_mask_reg_number;
      */
    }
    if ( !l_sse3 ) {
      /* AVX/AVX-512 (VEX/EVEX) encoding path */
      if ( (i_gp_reg_base >= 8) && (i_gp_reg_base <=15) ) {
        if ( l_bytes < 5 ) l_bytes = 5;
        else l_iregoff -= 0x20;
      }
      if ( (i_gp_reg_idx>=8) && (i_gp_reg_idx<=15) ) {
        if ( l_bytes < 5 ) {
          l_bytes = 5;
        } else {
          l_wow -= 0x20;
        }
        l_wow -= 0x20;
      }
      if ( (i_mask_reg_number > 0) && (i_mask_reg_number <= 127) ) {
        l_maskingoff = i_mask_reg_number;
        /* zero-masking bit is never set for stores (memory destination) */
        if ( i_use_zero_masking != 0 && i_is_store == 0 ) l_maskingoff += 0x80;
      }
      if ( l_num == 0 ) l_vregoffset = 0x90;
      else if ( l_num == 1 ) {
        l_vregoffset = 0x10;
        l_vregoffset2 = -0x80;
      }
      else if ( l_num == 2 ) l_vregoffset = 0x80;
      else if ( l_num == 3 ) l_vregoffset = 0x00;
      if ( (l_iregnum == 5) && (i_displacement==0) ) {
        /* Registers like rbp/r13 when you have a displacement of 0, we need
           force the single byte of zero to appear. */
        l_forced_offset = 1;
      }
      /* prefix: 2-byte VEX (0xc5), 3-byte VEX (0xc4) or 4-byte EVEX (0x62) */
      if ( l_bytes == 4 ) {
        buf[i++] = 0xc5;
        buf[i++] = (unsigned char)(0xf8 + l_ivectype + l_ivectype3);
      } else if ( l_bytes == 5 ) {
        buf[i++] = 0xc4;
        buf[i++] = (unsigned char)(0xc1 + l_num3 + l_vregoffset2 + l_iregoff + l_wow);
        buf[i++] = (unsigned char)(0x78 + l_ivectype);
      } else if ( l_bytes == 6 ) {
        buf[i++] = 0x62;
        buf[i++] = (unsigned char)(0x61 + l_vregoffset + l_iregoff + l_num2 + l_wow);
        buf[i++] = (unsigned char)(0x7c + l_ivectype2);
        buf[i++] = (unsigned char)(0x48 + l_maskingoff);
      }
      buf[i++] = (unsigned char)(0x10 + l_aligned + l_penultimate);
      if ( (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) && ((int)i_gp_reg_idx >= LIBXSMM_X86_GP_REG_RAX) && (i_gp_reg_idx <= LIBXSMM_X86_GP_REG_R15) ) {
        /* base+index: ModRM selects a SIB byte */
        buf[i++] = (unsigned char)(0x04 + 8*l_vregnum);
        l_place = i-1;
        if ( i_scale == 1 ) l_scaleadj = 0x00;
        else if ( i_scale == 2 ) l_scaleadj = 0x40;
        else if ( i_scale == 4 ) l_scaleadj = 0x80;
        else if ( i_scale == 8 ) l_scaleadj = 0xc0;
        else {
          fprintf(stderr, "libxsmm_instruction_vec_move: cannot handle i_scale=%u parameter\n", i_scale);
          exit(-1);
        }
        buf[i++] = (unsigned char)(l_scaleadj + l_iregnum + 8*(i_gp_reg_idx%8));
      } else {
        l_place = i;
        buf[i++] = (unsigned char)(0x00 + l_iregnum + 8*l_vregnum);
        if ( l_iregnum == LIBXSMM_X86_GP_REG_RSP ) {
          buf[i++] = 0x24; /* rsp/r12 base needs an explicit SIB byte */
        }
      }
      i += internal_x86_instructions_add_offset( l_place, i, i_displacement, l_forced_offset, l_sizereg, buf );
      io_generated_code->code_size = i; /* *loc = i; */
    } else {
      /* SSE3 code */
      int l_vecgrp0 = 0;
      int l_vecval0 = i_vec_reg_number_0 % 8;
      int l_place1=i+2;
      int l_regbas0 = i_gp_reg_base % 8;
      int l_regidx = i_gp_reg_idx % 8;
      int l_gp8 = ((i_gp_reg_base > 7)&&(i_gp_reg_base<=15)?1:0);
      if ( (i_vec_reg_number_0>=8) && (i_vec_reg_number_0<=15) ) l_vecgrp0=1;
      if ( i_is_store ) l_fpadj++;
      if ( l_insert_extra_byte != 0 ) {
        /* mandatory prefix (0x66/0xF2/0xF3) selected in the switch above */
        buf[i++]= (unsigned char)(l_insert_extra_byte);
        ++l_place1;
      }
      if (i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) {
        int l_sse_preamble2 = 64; /* REX prefix, bits added below */
        if ( l_gp8 || (l_vecgrp0>=1) ) {
          if (l_gp8) l_sse_preamble2 += 1;
          if (l_vecgrp0 >=1) l_sse_preamble2 += 4;
          buf[i++] = (unsigned char)(l_sse_preamble2);
          ++l_place1;
        }
        buf[i++] = (unsigned char)(0x0f);
        buf[i++] = (unsigned char)(0x10 + l_fpadj);
        buf[i++] = (unsigned char)(0x00 + l_regbas0 + l_vecval0*8);
        if ( l_regbas0 == 4 ) buf[i++]=0x24;
      } else {
        int l_ix8 = ((i_gp_reg_idx > 7) && (i_gp_reg_idx <= 15) ? 1 : 0);
        int l_sse_preamble2 = 64;
        if ( i_scale == 1 ) l_scaleadj = 0x00;
        else if ( i_scale == 2 ) l_scaleadj = 0x40;
        else if ( i_scale == 4 ) l_scaleadj = 0x80;
        else if ( i_scale == 8 ) l_scaleadj = 0xc0;
        else {
          fprintf(stderr, "libxsmm_instruction_vec_move sse3 section: cannot handle i_scale=%u parameter\n", i_scale);
          exit(-1);
        }
        if ( l_gp8 || l_ix8 || (l_vecgrp0>=1) ) {
          if (l_gp8) l_sse_preamble2 += 1;
          if (l_ix8) l_sse_preamble2 += 2;
          if (l_vecgrp0 >=1) l_sse_preamble2 += 4;
          buf[i++] = (unsigned char)(l_sse_preamble2);
          ++l_place1;
        }
        buf[i++] = (unsigned char)(0x0f);
        buf[i++] = (unsigned char)(0x10 + l_fpadj);
        buf[i++] = (unsigned char)(0x04 + l_vecval0*8);
        buf[i++] = (unsigned char)(0x00 + l_scaleadj + l_regbas0 + l_regidx*8);
      }
      l_forced_offset = 0;
      if ( (l_regbas0 == 5) && (i_displacement==0) ) {
        /* rbp/r13 base always needs an explicit displacement byte */
        l_forced_offset = 1;
      }
      i += internal_x86_instructions_add_offset( l_place1, i, i_displacement, l_forced_offset, l_sizereg, buf );
      io_generated_code->code_size = i; /* *loc = i; */
    }
  } else {
    /* textual path: print inline-asm (code_type 0) or plain assembly */
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_base_name[4];
    char l_instr_name[16];
    char l_masking_type[16] = { 0 };
    libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_gp_reg_base_name, 3 );
    libxsmm_get_x86_instr_name( i_vmove_instr, l_instr_name, 15 );
    if ( i_use_zero_masking == 0 || i_mask_reg_number == 0 || i_is_store != 0 ) {
      if ( io_generated_code->code_type == 0 ) {
        l_masking_type[0] = (char)0; /* no zero-masking */
      }
    } else {
      if ( io_generated_code->code_type == 0 ) {
        LIBXSMM_SNPRINTF(l_masking_type, 16, "%%{z%%}" );
      } else {
        LIBXSMM_SNPRINTF(l_masking_type, 16, "{z}" );
      }
    }
    if ( (i_instruction_set >= LIBXSMM_X86_AVX512) && (i_mask_reg_number != 0) ) {
      /* build vmovpd/ps/sd/ss instruction, load use */
      if ( i_is_store == 0 ) {
        if ( io_generated_code->code_type == 0 ) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s), %%%%%cmm%u%%{%%%%k%u%%}%s\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base_name, i_vector_name, i_vec_reg_number_0, i_mask_reg_number, l_masking_type );
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s), %%%cmm%u{%%k%u}%s\n", l_instr_name, i_displacement, l_gp_reg_base_name, i_vector_name, i_vec_reg_number_0, i_mask_reg_number, l_masking_type );
        }
      } else { /* store */
        if ( io_generated_code->code_type == 0 ) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %%%%%cmm%u, %i(%%%%%s)%%{%%%%k%u%%}%s\\n\\t\"\n", l_instr_name, i_vector_name, i_vec_reg_number_0, i_displacement, l_gp_reg_base_name, i_mask_reg_number, l_masking_type );
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %%%cmm%u, %i(%%%s) {%%k%u}%s\n", l_instr_name, i_vector_name, i_vec_reg_number_0, i_displacement, l_gp_reg_base_name, i_mask_reg_number, l_masking_type );
        }
      }
    } else {
      /* build vmovpd/ps/sd/ss instruction, load use */
      if ( i_is_store == 0 ) {
        if ( io_generated_code->code_type == 0 ) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s), %%%%%cmm%u\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base_name, i_vector_name, i_vec_reg_number_0 );
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s), %%%cmm%u\n", l_instr_name, i_displacement, l_gp_reg_base_name, i_vector_name, i_vec_reg_number_0 );
        }
      } else {
        if ( io_generated_code->code_type == 0 ) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %%%%%cmm%u, %i(%%%%%s)\\n\\t\"\n", l_instr_name, i_vector_name, i_vec_reg_number_0, i_displacement, l_gp_reg_base_name );
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %%%cmm%u, %i(%%%s)\n", l_instr_name, i_vector_name, i_vec_reg_number_0, i_displacement, l_gp_reg_base_name );
        }
      }
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}

/* Emits zmm-to-zmm conversion instructions (EVEX-encoded);
 * signature continues below. */
LIBXSMM_API_INTERN
void libxsmm_x86_instruction_vec_compute_convert ( libxsmm_generated_code* io_generated_code,
  const unsigned int i_instruction_set,
  const unsigned int i_vec_instr,
  const char i_vector_name,
  const unsigned int i_vec_reg_src_0,
  const unsigned int i_vec_reg_src_1,
  const unsigned int i_vec_reg_dst,
  const unsigned int i_shuffle_operand )
{
  LIBXSMM_UNUSED(i_instruction_set);
  if ( io_generated_code->code_type > 1 ) {
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size; /* i = *loc; */
    unsigned int l_maxsize = io_generated_code->buffer_size;
    /* l_second..l_fifth adjust the successive EVEX prefix/opcode bytes;
     * l_vec0/l_vec1 are the registers placed in ModRM r/m and reg fields */
    int l_vec0 = 0, l_vec1 = 0, l_second = 0, l_third = 0, l_fourth = 0, l_fifth = 0;
    int l_vecval0, l_vecgrp0, l_oddgrp0, l_2or3grp0;
    int l_vecval1, l_vecgrp1, l_oddgrp1, l_2or3grp1;
    /* these defines are for LIBXSMM_X86_INSTR_VCVTNE2PS2BF16 only: */
    int l_vecvalsrc1, l_vecgrpsrc1, l_oddgrpsrc1, l_2or3grpsrc1;
    if ( l_maxsize - i < 20 ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL );
      return;
    }
    switch ( i_vector_name ) {
      case 'x':
      case 'y':
        /* note: warns but still encodes with zmm semantics */
        fprintf(stderr, "libxsmm_instruction_vec_compute_convert: the highest register should be zmm: use that\n");
        break;
      case 'z':
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_vec_compute_convert: Unknown sort of fp registers\n");
        exit(-1);
    }
    if ( (i_vec_instr == LIBXSMM_X86_INSTR_VCVTNE2PS2BF16) && (i_vec_reg_src_1 == LIBXSMM_X86_VEC_REG_UNDEF) ) {
      fprintf(stderr, "libxsmm_instruction_vec_compute_convert: VCVTNE2PS2BF16 needs two inputs\n");
      exit(-1);
    }
    switch ( i_vec_instr ) {
      case LIBXSMM_X86_INSTR_VCVTDQ2PS:
        l_fifth = 0x48;
        l_vec0 = i_vec_reg_src_0; l_vec1 = i_vec_reg_dst;
        break;
      case LIBXSMM_X86_INSTR_VPMOVDB:
        l_second = 0x1; l_third += 2; l_fifth = 0x1E;
        /* down-converts swap operand roles: dst goes in the r/m field */
        l_vec0 = i_vec_reg_dst; l_vec1 = i_vec_reg_src_0;
        break;
      case LIBXSMM_X86_INSTR_VPMOVSDB:
        l_second = 0x1; l_third += 2; l_fifth = 0xE;
        l_vec0 = i_vec_reg_dst; l_vec1 = i_vec_reg_src_0;
        break;
      case LIBXSMM_X86_INSTR_VPMOVUSDB:
        l_second = 0x1; l_third += 2; l_fifth = -2;
        l_vec0 = i_vec_reg_dst; l_vec1 = i_vec_reg_src_0;
        break;
      case LIBXSMM_X86_INSTR_VCVTPS2DQ:
        l_fifth = 0x48; l_third += 1;
        l_vec0 = i_vec_reg_src_0; l_vec1 = i_vec_reg_dst;
        break;
      case LIBXSMM_X86_INSTR_VCVTPS2UDQ:
        l_fifth = 0x66;
        l_vec0 = i_vec_reg_src_0; l_vec1 = i_vec_reg_dst;
        break;
      case LIBXSMM_X86_INSTR_VCVTPS2PD:
        l_fifth = 0x47;
        l_vec0 = i_vec_reg_src_0; l_vec1 = i_vec_reg_dst;
        break;
      case LIBXSMM_X86_INSTR_VCVTPS2PH:
        l_second = 2; l_third = 1; l_fifth = 0x0a;
        l_vec1 = i_vec_reg_src_0; l_vec0 = i_vec_reg_dst;
        break;
      case LIBXSMM_X86_INSTR_VCVTPH2PS:
        l_second = 1; l_third = 1;
        l_vec0 = i_vec_reg_src_0; l_vec1 = i_vec_reg_dst;
        break;
      case LIBXSMM_X86_INSTR_VPMOVDW:
        l_second = 1; l_third = 2; l_fifth = 0x20;
        l_vec1 = i_vec_reg_src_0; l_vec0 = i_vec_reg_dst;
        break;
      case LIBXSMM_X86_INSTR_VPMOVSXWD:
        l_second = 1; l_third = 1; l_fifth = 0x10;
        l_vec0 = i_vec_reg_src_0; l_vec1 = i_vec_reg_dst;
        break;
      case LIBXSMM_X86_INSTR_VPMOVZXWD:
        l_second = 1; l_third = 1; l_fifth = 0x20;
        l_vec0 = i_vec_reg_src_0; l_vec1 = i_vec_reg_dst;
        break;
      case LIBXSMM_X86_INSTR_VPMOVSXBD:
        l_second = 1; l_third = 1; l_fifth = 0xE;
        l_vec0 = i_vec_reg_src_0; l_vec1 = i_vec_reg_dst;
        break;
      case LIBXSMM_X86_INSTR_VPMOVZXBD:
        l_second = 1; l_third = 1; l_fifth = 0x1E;
        l_vec0 = i_vec_reg_src_0; l_vec1 = i_vec_reg_dst;
        break;
      case LIBXSMM_X86_INSTR_VCVTNEPS2BF16:
        l_second = 1; l_third = 2; l_fifth = 0x5F;
        l_vec1 = i_vec_reg_dst; l_vec0 = i_vec_reg_src_0;
        break;
      case LIBXSMM_X86_INSTR_VCVTNE2PS2BF16:
        /* second source register is encoded in the EVEX vvvv field */
        l_vecvalsrc1 = i_vec_reg_src_1 % 8;
        l_vecgrpsrc1 = i_vec_reg_src_1 / 8;
        l_oddgrpsrc1 = ((l_vecgrpsrc1 % 2)==1);
        l_2or3grpsrc1 = (l_vecgrpsrc1>=2);
        l_second = 1;
        l_third = 3 - l_oddgrpsrc1*0x40 - l_vecvalsrc1*0x08;
        l_fourth = -l_2or3grpsrc1 * 0x08;
        l_fifth = 0x5F;
        l_vec1 = i_vec_reg_dst; l_vec0 = i_vec_reg_src_0;
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_vec_compute_convert: Unknown instruction type: %u\n", i_vec_instr);
        break;
    }
    l_vecval0 = l_vec0 % 8;
    l_vecgrp0 = l_vec0 / 8;
    l_oddgrp0 = ((l_vecgrp0 % 2)==1);
    l_2or3grp0 = (l_vecgrp0>=2);
    l_vecval1 = l_vec1 % 8;
    l_vecgrp1 = l_vec1 / 8;
    l_oddgrp1 = ((l_vecgrp1 % 2)==1);
    l_2or3grp1 = (l_vecgrp1>=2);
    /* 4-byte EVEX prefix, opcode, register-register ModRM (mod=11) */
    buf[i++] = (unsigned char)(0x62);
    buf[i++] = (unsigned char)(0xf1 + l_second - l_oddgrp0 * 0x20 - l_oddgrp1 * 0x80 - l_2or3grp0 * 0x40 - l_2or3grp1 * 0x10);
    buf[i++] = (unsigned char)(0x7c + l_third);
    buf[i++] = (unsigned char)(0x48 + l_fourth);
    buf[i++] = (unsigned char)(0x13 + l_fifth);
    buf[i++] = (unsigned char)(0xc0 + l_vecval0 + l_vecval1*8);
    if ( i_vec_instr == LIBXSMM_X86_INSTR_VCVTPS2PH ) {
      /* trailing immediate: rounding-control operand of vcvtps2ph */
      buf[i++] = (unsigned char)(i_shuffle_operand);
    }
    io_generated_code->code_size = i; /* *loc = i; */
  } else {
    /* textual path: not implemented for this instruction */
  }
}

LIBXSMM_API_INTERN
void libxsmm_x86_instruction_vec_compute_reg( libxsmm_generated_code* io_generated_code,
  const unsigned int i_instruction_set,
  const unsigned int i_vec_instr,
  const char i_vector_name,
  const unsigned int i_vec_reg_number_0,
  const unsigned int i_vec_reg_number_1,
  const unsigned int i_vec_reg_number_2 )
{
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size; /* int i = *loc; */
    unsigned int l_maxsize = io_generated_code->buffer_size; /* unsigned int l_maxsize = 1024; */
    int l_second=0, l_third=0, l_fourth=0, l_xreg=0;
    int l_reg0, l_reg1, l_reg2;
    int l_vreg0 = i_vec_reg_number_0;
    int l_vreg1 = i_vec_reg_number_1;
    int l_vreg2 = i_vec_reg_number_2;
    int l_fpadj=0;
    int l_fpadj2=0;
    int l_bytes=4;
    int l_sse = 0;
    int l_insert_extra_byte = 0;
    if ( l_maxsize - i < 20 ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL );
      return;
    }
    if ( (i_vector_name!='z') && ((l_vreg0>15) || (l_vreg1>15) || (l_vreg2>15)) && (l_vreg2 != LIBXSMM_X86_VEC_REG_UNDEF) ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_NO_AVX512VL );
      return;
    }
    switch ( i_vec_instr ) {
      case LIBXSMM_X86_INSTR_VXORPD:
        l_fpadj = -2;
        if ( (i_vector_name == 'x') && (l_vreg2 > 15) ) l_fourth -= 0x40;
        if ( (i_vector_name == 'y') && (l_vreg2 >
15) ) l_fourth -= 0x20; break; case LIBXSMM_X86_INSTR_VMULPD: if ( (i_vector_name == 'x') && (l_vreg2 > 15) ) l_fourth -= 0x40; if ( (i_vector_name == 'y') && (l_vreg2 > 15) ) l_fourth -= 0x20; break; case LIBXSMM_X86_INSTR_VPERMW: l_second += 0x01; l_fpadj += 0x34; if ( i_vector_name == 'x' ) { l_fourth -= 0x40; if ( l_vreg0 >= 16 ) l_fourth -= 0xc0; if ( l_vreg1 >= 16 ) l_fourth -= 0xc0; } else if ( i_vector_name == 'y' ) { l_fourth -= 0x20; if ( l_vreg0 >= 16 ) l_fourth += 0x20; if ( l_vreg1 >= 16 ) l_fourth += 0x20; } l_bytes = 6; break; case LIBXSMM_X86_INSTR_VPERMD: l_second += 0x01; l_fpadj = -0x23; l_fpadj2 = -0x80; if ( i_vector_name == 'x' ) { l_fpadj += 0x57; l_fpadj2 += 0x80; } if ( i_vector_name == 'y' ) { l_fpadj += 0x57; l_fpadj2 += 0x80; } if ( i_vector_name == 'x' ) { l_fourth -= 0x40; if ( l_vreg0 >= 16 ) l_fourth -= 0xc0; if ( l_vreg1 >= 16 ) l_fourth -= 0xc0; } else if ( i_vector_name == 'y' ) { l_fourth -= 0x20; if ( l_vreg0 >= 16 ) l_fourth += 0x20; if ( l_vreg1 >= 16 ) l_fourth += 0x20; } l_bytes = 6; break; case LIBXSMM_X86_INSTR_VUNPCKLPD: l_fpadj = -0x45; if ( (i_vector_name == 'x') && (l_vreg2 > 15) ) l_fourth -= 0x40; if ( (i_vector_name == 'y') && (l_vreg2 > 15) ) l_fourth -= 0x20; break; case LIBXSMM_X86_INSTR_VUNPCKLPS: l_fpadj = -0x45; if ( (i_vector_name!='z') && (l_vreg0<=15) && (l_vreg1<=15) && (l_vreg2<=15) ) l_fpadj2 = -1; else l_fpadj2 = -0x81; if ( (i_vector_name == 'x') && (l_vreg2 > 15) ) l_fourth -= 0x40; if ( (i_vector_name == 'y') && (l_vreg2 > 15) ) l_fourth -= 0x20; break; case LIBXSMM_X86_INSTR_VUNPCKHPD: l_fpadj = -0x44; if ( (i_vector_name == 'x') && (l_vreg2 > 15) ) l_fourth -= 0x40; if ( (i_vector_name == 'y') && (l_vreg2 > 15) ) l_fourth -= 0x20; break; case LIBXSMM_X86_INSTR_VUNPCKHPS: l_fpadj = -0x44; if ( (i_vector_name!='z') && (l_vreg0<=15) && (l_vreg1<=15) && (l_vreg2<=15) ) l_fpadj2 = -1; else l_fpadj2 = -0x81; if ( (i_vector_name == 'x') && (l_vreg2 > 15) ) l_fourth -= 0x40; if ( (i_vector_name == 'y') && 
(l_vreg2 > 15) ) l_fourth -= 0x20; break; case LIBXSMM_X86_INSTR_VADDPD: l_fpadj = -1; if ( (i_vector_name == 'x') && (l_vreg2 > 15) ) l_fourth -= 0x40; if ( (i_vector_name == 'y') && (l_vreg2 > 15) ) l_fourth -= 0x20; break; case LIBXSMM_X86_INSTR_VDIVPD: l_fpadj = 5; if ( (i_vector_name == 'x') && (l_vreg2 > 15) ) l_fourth -= 0x40; if ( (i_vector_name == 'y') && (l_vreg2 > 15) ) l_fourth -= 0x20; break; case LIBXSMM_X86_INSTR_VDPBF16PS: if ( i_vector_name == 'x' ) { l_fourth -= 0x40; if ( l_vreg0 >= 16 ) l_fourth -= 0xc0; if ( l_vreg1 >= 16 ) l_fourth -= 0xc0; } else if ( i_vector_name == 'y' ) { l_fourth -= 0x20; if ( l_vreg0 >= 16 ) l_fourth += 0x20; if ( l_vreg1 >= 16 ) l_fourth += 0x20; } l_bytes = 6; l_second += 1; l_fpadj = -7; l_fpadj2 = 0x81; break; case LIBXSMM_X86_INSTR_VDIVPS: if ( (i_vector_name!='z') && (l_vreg0 <=15) && (l_vreg1<=15) && (l_vreg2<=15) ) l_fpadj2 = -1; else l_fpadj2 = -0x81; l_fpadj = 5; break; case LIBXSMM_X86_INSTR_VPANDD: if ( (i_vector_name!='z') && (i_vector_name!='Z') ) { fprintf(stderr,"VPANDD in vec_compute_reg expects zmm registers\n"); exit(-1); } l_fpadj2 = -0x80; l_fpadj = 0x82; break; case LIBXSMM_X86_INSTR_VPANDQ: if ( (i_vector_name!='z') && (i_vector_name!='Z') ) { fprintf(stderr,"VPANDQ in vec_compute_reg expects zmm registers\n"); exit(-1); } l_fpadj2 = 0; l_fpadj = 0x82; break; case LIBXSMM_X86_INSTR_VMAXPD: if ( (i_vector_name!='z') && (i_vector_name!='Z') ) { fprintf(stderr,"VMAXPD in vec_compute_reg expects zmm registers\n"); exit(-1); } l_fpadj2 = 0; l_fpadj = 6; break; case LIBXSMM_X86_INSTR_VMAXPS: if ( (i_vector_name!='z') && (i_vector_name!='Z') ) { fprintf(stderr,"VMAXPS in vec_compute_reg expects zmm registers\n"); exit(-1); } l_fpadj2 = -0x81; l_fpadj = 6; break; case LIBXSMM_X86_INSTR_VCVTDQ2PS: l_fpadj2 -= 0x81; l_fpadj += 0x02; if ( l_vreg2 != LIBXSMM_X86_VEC_REG_UNDEF ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_VEC_REG_MUST_BE_UNDEF ); } l_vreg2 = l_vreg1; l_vreg1 = 0; break; case 
LIBXSMM_X86_INSTR_VCVTPS2PD: l_fpadj2 -= 0x81; l_fpadj += 0x01; if ( l_vreg2 != LIBXSMM_X86_VEC_REG_UNDEF ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_VEC_REG_MUST_BE_UNDEF ); exit(-1); } l_vreg2 = l_vreg1; l_vreg1 = 0; break; case LIBXSMM_X86_INSTR_VRCP14PS: l_fpadj2 -= 0x80; l_fpadj -= 0x0D; l_second += 1; if ( l_vreg2 != LIBXSMM_X86_VEC_REG_UNDEF ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_VEC_REG_MUST_BE_UNDEF ); exit(-1); } l_vreg2 = l_vreg1; l_vreg1 = 0; break; case LIBXSMM_X86_INSTR_VMOVDQU64: l_fpadj2 += 0x01; l_fpadj += 0x16; if ( l_vreg2 != LIBXSMM_X86_VEC_REG_UNDEF ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_VEC_REG_MUST_BE_UNDEF ); exit(-1); } if ( (i_vector_name == 'x') && (l_vreg0 < 16) ) l_fourth -= 0x40; if ( (i_vector_name == 'y') && (l_vreg0 < 16) ) l_fourth -= 0x20; l_bytes = 6; l_vreg2 = l_vreg1; l_vreg1 = 0; break; case LIBXSMM_X86_INSTR_VPMAXSD: l_second += 0x01; l_fpadj -= 0x1C; l_fpadj2 -= 0x80; break; case LIBXSMM_X86_INSTR_VPMINSD: l_second += 0x01; l_fpadj -= 0x20; l_fpadj2 -= 0x80; break; case LIBXSMM_X86_INSTR_VSUBPD: l_fpadj = 3; break; case LIBXSMM_X86_INSTR_VPADDD: l_fpadj2 -= 0x80; l_fpadj += 0xA5; break; case LIBXSMM_X86_INSTR_VPADDQ: l_fpadj += 0x7b; break; case LIBXSMM_X86_INSTR_VPADDW: l_fpadj2 -= 0x80; l_fpadj += 0xA4; break; case LIBXSMM_X86_INSTR_VPADDB: l_fpadj2 -= 0x80; l_fpadj += 0xA3; break; case LIBXSMM_X86_INSTR_VPMADDWD: l_fpadj2 -= 0x80; l_fpadj += 0x9C; break; case LIBXSMM_X86_INSTR_VPMADDUBSW: l_second += 0x01; l_fpadj -= 0x55; l_fpadj2 -= 0x80; break; case LIBXSMM_X86_INSTR_VPADDSW: l_fpadj += 0x94; l_fpadj2 -= 0x80; break; case LIBXSMM_X86_INSTR_VPADDSB: l_fpadj += 0x93; l_fpadj2 -= 0x80; break; case LIBXSMM_X86_INSTR_VFMADD231PD: l_second += 0x21; l_fpadj += 0x5f; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } 
l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMADD213PD: l_second += 0x21; l_fpadj += 0x4f; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMADD132PD: l_second += 0x21; l_fpadj += 0x3f; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMSUB231PD: l_second += 0x21; l_fpadj += 0x61; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } /*l_fpadj2 -= 0x80;*/ } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMSUB213PD: l_second += 0x21; l_fpadj += 0x51; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMSUB132PD: l_second += 0x21; l_fpadj += 0x41; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMADD231PD: l_second += 0x21; l_fpadj += 0x63; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } /*l_fpadj2 -= 0x80;*/ } else if ( i_vec_reg_number_0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMADD213PD: l_second += 0x21; l_fpadj += 0x53; l_fpadj2 += 0x80; if ( 
i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( i_vec_reg_number_0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMADD132PD: l_second += 0x21; l_fpadj += 0x43; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( i_vec_reg_number_0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMSUB231PD: l_second += 0x21; l_fpadj += 0x65; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } /*l_fpadj2 -= 0x80;*/ } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMSUB213PD: l_second += 0x21; l_fpadj += 0x55; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMSUB132PD: l_second += 0x21; l_fpadj += 0x45; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMADD231SD: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFMADD231SD and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x60; l_fpadj2 += 0x80; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMADD213SD: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? 
VFMADD213SD and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x50; l_fpadj2 += 0x80; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMADD132SD: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFMADD132SD and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x40; l_fpadj2 += 0x80; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMADD213SS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFMADD213SS and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x50; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMADD132SS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFMADD132SS and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x40; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMSUB213SS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFMSUB213SS and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x52; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMSUB132SS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFMSUB132SS and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x42; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMADD213SS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFNMADD213SS and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x54; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMADD132SS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? 
VFNMADD132SS and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x44; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMSUB213SS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFNMSUB213SS and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x56; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMSUB132SS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFNMSUB132SS and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x46; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMADD213SD: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFNMADD213SD and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x54; l_fpadj2 += 0x80; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMADD132SD: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFNMADD132SD and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x44; l_fpadj2 += 0x80; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMSUB213SD: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFMSUB213SD and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x52; l_fpadj2 += 0x80; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMSUB132SD: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFMSUB132SD and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x42; l_fpadj2 += 0x80; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMSUB213SD: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? 
VFNMSUB213SD and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x56; l_fpadj2 += 0x80; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMSUB132SD: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Really? VFNMSUB132SD and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x46; l_fpadj2 += 0x80; if ( l_vreg0 > 7 ) l_second -= 0x20; l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMSUB231SD: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VFMSUB231SD and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x62; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; l_fpadj2 -= 0x80; } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMADD231SD: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VFNMADD231SD and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x64; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; l_fpadj2 -= 0x80; } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMSUB231SD: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VFNMSUB231SD and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x66; l_fpadj2 += 0x80; if ( i_vector_name == 'z' ) { l_second -= 0x20; l_fpadj2 -= 0x80; } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMADD231PS: l_second += 0x21; l_fpadj += 0x5f; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMADD213PS: l_second += 0x21; l_fpadj += 0x4f; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case 
LIBXSMM_X86_INSTR_VFMADD132PS: l_second += 0x21; l_fpadj += 0x3f; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMADD213PS: l_second += 0x21; l_fpadj += 0x53; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMADD132PS: l_second += 0x21; l_fpadj += 0x43; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMSUB213PS: l_second += 0x21; l_fpadj += 0x55; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMSUB132PS: l_second += 0x21; l_fpadj += 0x45; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMSUB213PS: l_second += 0x21; l_fpadj += 0x51; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMSUB132PS: l_second += 0x21; l_fpadj += 0x41; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case 
LIBXSMM_X86_INSTR_VFMSUB231PS: l_second += 0x21; l_fpadj += 0x61; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMADD231PS: l_second += 0x21; l_fpadj += 0x63; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMSUB231PS: l_second += 0x21; l_fpadj += 0x65; if ( i_vector_name == 'z' ) { l_second -= 0x20; if ( l_vreg1 > 15 ) l_second += 0x20; if ( l_vreg2 > 15 ) { l_second += 0x20; l_fourth += 0x20; } } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMADD231SS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VFMADD231SS and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x60; if ( i_vector_name == 'z' ) { l_second -= 0x20; l_fpadj2 -= 0x80; } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFMSUB231SS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VFMSUB231SS and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x62; if ( i_vector_name == 'z' ) { l_second -= 0x20; l_fpadj2 -= 0x80; } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMADD231SS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VFNMADD231SS and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x64; if ( i_vector_name == 'z' ) { l_second -= 0x20; l_fpadj2 -= 0x80; } else if ( i_vec_reg_number_0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VFNMSUB231SS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VFNMSUB231SS and ymm/zmm?\n"); l_second += 0x21; l_fpadj += 0x66; if ( 
i_vector_name == 'z' ) { l_second -= 0x20; l_fpadj2 -= 0x80; } else if ( l_vreg0 > 7 ) { l_second -= 0x20; } l_bytes = 5; break; case LIBXSMM_X86_INSTR_VMULSD: l_fpadj2 = 2; if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VMULSD and ymm/zmm?\n"); break; case LIBXSMM_X86_INSTR_VADDSD: l_fpadj =-1; l_fpadj2 = 2; if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VADDSD and ymm/zmm?\n"); break; case LIBXSMM_X86_INSTR_VSUBSD: l_fpadj = 3; l_fpadj2 = 2; if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VSUBSD and ymm/zmm?\n"); break; case LIBXSMM_X86_INSTR_VXORPS: l_fpadj2 = -1; l_fpadj = -2; if ( i_vector_name == 'z' ) { l_fpadj2 -= 0x80; } break; case LIBXSMM_X86_INSTR_VMULPS: if ( (i_vector_name!='z') && (l_vreg0<=15) && (l_vreg1<=15) && (l_vreg2<=15) ) l_fpadj2 = -1; else l_fpadj2 = -0x81; break; case LIBXSMM_X86_INSTR_VADDPS: if ( (i_vector_name!='z') && (l_vreg0<=15) && (l_vreg1<=15) && (l_vreg2<=15) ) l_fpadj2 = -1; else l_fpadj2 = -0x81; l_fpadj = -1; break; case LIBXSMM_X86_INSTR_VSUBPS: if ( (i_vector_name!='z') && (l_vreg0<=15) && (l_vreg1<=15) && (l_vreg2<=15) ) l_fpadj2 = -1; else l_fpadj2 = -0x81; l_fpadj = 3; break; case LIBXSMM_X86_INSTR_VPSRAVD: l_second += 0x01; l_fpadj -= 0x13; l_fpadj2 -= 0x80; break; /* SSE instruction support */ case LIBXSMM_X86_INSTR_VMULSS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VMULSS and ymm/zmm?\n"); l_fpadj2 = 1; break; case LIBXSMM_X86_INSTR_VADDSS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VADDSS and ymm/zmm?\n"); l_fpadj =-1; l_fpadj2 = 1; break; case LIBXSMM_X86_INSTR_VSUBSS: if (i_vector_name != 'x') fprintf(stderr, "libxsmm_instruction_vec_compute_reg: VSUBSS and ymm/zmm?\n"); l_fpadj = 3; l_fpadj2 = 1; break; case LIBXSMM_X86_INSTR_VPERMT2W: l_second += 0x01; l_fpadj += 0x24; if ( i_vector_name == 'x' ) { l_fourth -= 0x40; if ( l_vreg0 >= 16 ) l_fourth 
-= 0xc0; if ( l_vreg1 >= 16 ) l_fourth -= 0xc0; } else if ( i_vector_name == 'y' ) { l_fourth -= 0x20; if ( l_vreg0 >= 16 ) l_fourth += 0x20; if ( l_vreg1 >= 16 ) l_fourth += 0x20; } l_bytes = 6; break; case LIBXSMM_X86_INSTR_VPXORD: l_bytes = 6; if ( i_vector_name == 'x' ) { l_fourth -= 0x40; } else if ( i_vector_name == 'y' ) { l_fourth -= 0x20; } l_fpadj += 0x96; l_fpadj2 += 0x80; break; case LIBXSMM_X86_INSTR_VPORD: l_bytes = 6; if ( i_vector_name == 'x' ) { l_fourth -= 0x40; } else if ( i_vector_name == 'y' ) { l_fourth -= 0x20; } l_fpadj += 0x92; l_fpadj2 += 0x80; break; case LIBXSMM_X86_INSTR_VPDPWSSD: if ( (i_vector_name!='z') && (i_vec_reg_number_0<=15) && (i_vec_reg_number_1<=15) && (i_vec_reg_number_2<=15) ) l_fpadj2 = -1; else l_fpadj2 = -0x81; l_fpadj = -0x07; l_second += 0x01; l_third += 0x01; break; case LIBXSMM_X86_INSTR_VPDPWSSDS: if ( (i_vector_name!='z') && (i_vec_reg_number_0<=15) && (i_vec_reg_number_1<=15) && (i_vec_reg_number_2<=15) ) l_fpadj2 = -1; else l_fpadj2 = -0x81; l_fpadj = -0x06; l_second += 0x01; l_third += 0x01; break; case LIBXSMM_X86_INSTR_VPDPBUSD: if ( (i_vector_name!='z') && (i_vec_reg_number_0<=15) && (i_vec_reg_number_1<=15) && (i_vec_reg_number_2<=15) ) l_fpadj2 = -1; else l_fpadj2 = -0x81; l_fpadj = -0x09; l_second += 0x01; l_third += 0x01; break; case LIBXSMM_X86_INSTR_VPDPBUSDS: if ( (i_vector_name!='z') && (i_vec_reg_number_0<=15) && (i_vec_reg_number_1<=15) && (i_vec_reg_number_2<=15) ) l_fpadj2 = -1; else l_fpadj2 = -0x81; l_fpadj = -0x08; l_second += 0x01; l_third += 0x01; break; case LIBXSMM_X86_INSTR_MOVAPD: l_sse = 1; l_insert_extra_byte = 0x66; l_third = 0x18; break; case LIBXSMM_X86_INSTR_MOVUPD: l_sse = 1; l_insert_extra_byte = 0x66; break; case LIBXSMM_X86_INSTR_MOVAPS: l_sse = 1; l_third = 0x18; break; case LIBXSMM_X86_INSTR_MOVUPS: l_sse = 1; break; case LIBXSMM_X86_INSTR_MOVSD: l_sse = 1; l_insert_extra_byte = 0xF2; break; case LIBXSMM_X86_INSTR_MOVSS: l_sse = 1; l_insert_extra_byte = 0xF3; break; case 
LIBXSMM_X86_INSTR_MOVDDUP: l_sse = 1; l_third = 2; l_insert_extra_byte = 0xF2; break; case LIBXSMM_X86_INSTR_XORPD: l_sse = 1; l_insert_extra_byte = 0x66; l_third = 0x47; break; case LIBXSMM_X86_INSTR_XORPS: l_sse = 1; l_third = 0x47; break; case LIBXSMM_X86_INSTR_MULPD: l_sse = 1; l_insert_extra_byte = 0x66; l_third = 0x49; break; case LIBXSMM_X86_INSTR_MULPS: l_sse = 1; l_third = 0x49; break; case LIBXSMM_X86_INSTR_ADDPD: l_sse = 1; l_insert_extra_byte = 0x66; l_third = 0x48; break; case LIBXSMM_X86_INSTR_ADDPS: l_sse = 1; l_third = 0x48; break; case LIBXSMM_X86_INSTR_SUBPD: l_sse = 1; l_insert_extra_byte = 0x66; l_third = 0x4c; break; case LIBXSMM_X86_INSTR_SUBPS: l_sse = 1; l_third = 0x4c; break; case LIBXSMM_X86_INSTR_MULSD: l_sse = 1; l_insert_extra_byte = 0xF2; l_third = 0x49; break; case LIBXSMM_X86_INSTR_MULSS: l_sse = 1; l_insert_extra_byte = 0xF3; l_third = 0x49; break; case LIBXSMM_X86_INSTR_ADDSD: l_sse = 1; l_insert_extra_byte = 0xF2; l_third = 0x48; break; case LIBXSMM_X86_INSTR_ADDSS: l_sse = 1; l_insert_extra_byte = 0xF3; l_third = 0x48; break; case LIBXSMM_X86_INSTR_SUBSD: l_sse = 1; l_insert_extra_byte = 0xF2; l_third = 0x4c; break; case LIBXSMM_X86_INSTR_SUBSS: l_sse = 1; l_insert_extra_byte = 0xF3; l_third = 0x4c; break; default: fprintf(stderr, "libxsmm_instruction_vec_compute_reg: Unknown instruction type: %u\n", i_vec_instr); exit(-1); } l_reg0 = l_vreg0 % 8; l_reg1 = l_vreg1 % 8; l_reg2 = l_vreg2 % 8; if ( !l_sse ) { if ( i_vector_name == 'x' ) l_xreg = -4; if ( l_vreg2 >= 8 ) { l_second -= 0x80; } if ( l_vreg1 >= 8 ) { l_third -= 0x40; } if ( (i_vector_name!='z') && (l_vreg0<=15) && (l_vreg1<=15) && (l_vreg2<=15) ) { if ( l_vreg0 >= 8 ) { if ( l_bytes < 5 ) l_bytes = 5; } } else { if ( l_bytes == 5 ) { l_third += 0x80; if ( l_vreg1 > 15 ) l_second += 0xE0; if ( l_vreg2 > 15 ) { l_second -= 0x20; l_fourth -= 0x20; if ( i_vector_name=='x' ) l_fourth -= 0x20; } } l_bytes = 6; } if ( l_bytes == 4 ) { buf[i++] = 0xc5; buf[i++] = (unsigned 
char)(0xfd - 8*l_reg1 + l_third + l_second + l_xreg + l_fpadj2); buf[i++] = (unsigned char)(0x59 + l_fpadj); buf[i++] = (unsigned char)(0xc0 + l_reg0 + 8*l_reg2); } else if ( l_bytes == 5 ) { buf[i++] = 0xc4; buf[i++] = (unsigned char)(0xc1 + l_second); buf[i++] = (unsigned char)(0x7d - 8*l_reg1 + l_third + l_xreg + l_fpadj2); buf[i++] = (unsigned char)(0x59 + l_fpadj); buf[i++] = (unsigned char)(0xc0 + l_reg0 + 8*l_reg2); } else if ( l_bytes == 6 ) { if ( l_vreg0 >= 8 ) { l_second -= 0x20; } if ( l_vreg0 >= 16 ) { l_second -= 0x20; if ( i_vector_name=='x' ) l_fourth -= 0x40; if ( i_vector_name=='y' ) l_fourth -= 0x20; } if ( l_vreg0 >= 24 ) { l_second -= 0x20; } if ( l_vreg1 >= 16 ) { l_third += 0x40; l_fourth -= 0x08; if ( i_vector_name=='x' ) l_fourth -= 0x40; if ( i_vector_name=='y' ) l_fourth -= 0x20; } if ( l_vreg1 >= 24 ) { l_third -= 0x40; } if ( l_vreg2 >= 16 ) { l_second += 0x70; } if ( l_vreg2 >= 24 ) { l_second -= 0x80; } buf[i++] = 0x62; buf[i++] = (unsigned char)(0xf1 + l_second); buf[i++] = (unsigned char)(0xfd - 8*l_reg1 + l_third + l_fpadj2); buf[i++] = (unsigned char)(0x48 + l_fourth); buf[i++] = (unsigned char)(0x59 + l_fpadj); buf[i++] = (unsigned char)(0xc0 + l_reg0 + 8*l_reg2); } } else { int l_vecgrp0 = 0; int l_vecgrp1 = 0; if ( (l_vreg0 >= 8) && (l_vreg0 <=15) ) l_vecgrp0 = 1; if ( (l_vreg1 >= 8) && (l_vreg1 <=15) ) l_vecgrp1 = 1; if ( l_insert_extra_byte != 0 ) { buf[i++] = (unsigned char)(l_insert_extra_byte); } if ( (l_vecgrp0 >= 1) || (l_vecgrp1 >= 1) ) { int l_extra_byte = 0; if ( l_vecgrp0 >= 1 ) l_extra_byte += 1; if ( l_vecgrp1 >= 1 ) l_extra_byte += 4; buf[i++] = (unsigned char)(0x40 + l_extra_byte); } buf[i++] = (unsigned char)(0x0f); buf[i++] = (unsigned char)(0x10 + l_third); buf[i++] = (unsigned char)(0xc0 + l_reg0 + l_reg1*8); } io_generated_code->code_size = i; /* *loc = i; */ } else { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; char l_instr_name[16]; libxsmm_get_x86_instr_name( i_vec_instr, 
l_instr_name, 15 );
  /* build vXYZpd/ps/sd/ss instruction pure register use*/
  if ( i_instruction_set != LIBXSMM_X86_SSE3 ) {
    /* three-operand (AVX-style) textual form: "vop src0, src1, dst" */
    if ( io_generated_code->code_type == 0 ) {
      /* inline-assembly string variant (escaped quotes/newlines) */
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %%%%%cmm%u, %%%%%cmm%u, %%%%%cmm%u\\n\\t\"\n", l_instr_name, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, i_vector_name, i_vec_reg_number_2 );
    } else {
      /* plain assembly listing variant */
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %%%cmm%u, %%%cmm%u, %%%cmm%u\n", l_instr_name, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, i_vector_name, i_vec_reg_number_2 );
    }
  } else {
    /* SSE3: two-operand textual form */
    if ( io_generated_code->code_type == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %%%%%cmm%u, %%%%%cmm%u\\n\\t\"\n", l_instr_name, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1);
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %%%cmm%u, %%%cmm%u\n", l_instr_name, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
    }
  }
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}

/**
 * Emits a register-register AVX-512 compute instruction carrying an opmask
 * (and optionally zero-masking), either as raw machine code bytes (when
 * io_generated_code->code_type > 1) or as textual assembly otherwise.
 * Only 'z' (zmm) registers are accepted; 'x'/'y' abort with a message.
 *
 * @param io_generated_code   code buffer/stream being appended to
 * @param i_instruction_set   target ISA level (used only on the textual path)
 * @param i_vec_instr         instruction id (LIBXSMM_X86_INSTR_*); unknown ids abort
 * @param i_vector_name       register class; must be 'z'
 * @param i_vec_reg_number_0  first source register
 * @param i_vec_reg_number_1  second source register
 * @param i_vec_reg_number_2  destination register (for the VPCMP*/
/* and VCMPPS compare forms this slot is replaced by the mask register, see below)
 * @param i_immediate         trailing immediate byte, or LIBXSMM_X86_IMM_UNDEF for none
 *                            (the blend/add/and/sub cases abort if one is supplied)
 * @param i_mask_reg_number   opmask register k0..k7
 * @param i_use_zero_masking  non-zero selects {z} zeroing semantics
 */
LIBXSMM_API_INTERN
void libxsmm_x86_instruction_vec_compute_reg_mask( libxsmm_generated_code* io_generated_code,
                                                   const unsigned int      i_instruction_set,
                                                   const unsigned int      i_vec_instr,
                                                   const char              i_vector_name,
                                                   const unsigned int      i_vec_reg_number_0,
                                                   const unsigned int      i_vec_reg_number_1,
                                                   const unsigned int      i_vec_reg_number_2,
                                                   const unsigned int      i_immediate,
                                                   const unsigned int      i_mask_reg_number,
                                                   const unsigned int      i_use_zero_masking )
{
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* binary emission path: write the encoded bytes directly into the buffer */
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size; /* int i = *loc; */
    unsigned int l_maxsize = io_generated_code->buffer_size; /* unsigned int l_maxsize = 1024; */
    /* split each register number into its low 3 bits (ModRM field) and its
     * group-of-8; odd group and group>=2 drive the EVEX high-register bits */
    int l_vecval0 = i_vec_reg_number_0 % 8;
    int l_vecgrp0 = i_vec_reg_number_0 / 8;
    int l_oddgrp0 = ((l_vecgrp0 % 2)==1);
    int l_2or3grp0 = (l_vecgrp0>=2);
    int l_vecval1 = i_vec_reg_number_1 % 8;
    int l_vecgrp1 = i_vec_reg_number_1 / 8;
    int l_oddgrp1 = ((l_vecgrp1 % 2)==1);
    int l_2or3grp1 = (l_vecgrp1>=2);
    int l_vecval2 = i_vec_reg_number_2 % 8;
    int l_vecgrp2 = i_vec_reg_number_2 / 8;
    int l_oddgrp2 = ((l_vecgrp2 % 2)==1);
    int l_2or3grp2 = (l_vecgrp2>=2);
    /* per-instruction adjustments applied to prefix bytes 2/3/4 and the opcode */
    int l_second = 0;
    int l_third = 0;
    int l_fourth = 0;
    int l_fifth = 0;
    /* refuse to encode when fewer than 20 bytes remain in the buffer */
    if ( l_maxsize - i < 20 ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL );
      return;
    }
    /* this routine only encodes the zmm (EVEX) register form */
    switch ( i_vector_name ) {
      case 'x':
      case 'y':
        fprintf(stderr, "libxsmm_instruction_vec_compute_reg_mask: the highest register should be zmm: use that\n");
        exit(-1);
        break;
      case 'z':
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_vec_compute_reg_mask: Unknown sort of fp registers\n");
        exit(-1);
    }
    /* select per-instruction byte adjustments; for the blend/arith forms the
     * mask register goes into the prefix opmask field (l_fourth), while for the
     * compare forms (VPCMP*, VCMPPS) the mask register replaces operand 2 in
     * the ModRM byte and its high-register bits are cleared */
    switch ( i_vec_instr ) {
      case LIBXSMM_X86_INSTR_VBLENDMPS:
        if ( i_immediate != LIBXSMM_X86_IMM_UNDEF ) {
          fprintf(stderr,"libxsmm_instruction_vec_compute_reg_mask for VBLENDMPS immediate=%u != %i\n",i_immediate,LIBXSMM_X86_IMM_UNDEF);
          exit(-1);
        }
        l_second = 0x1; l_fourth = i_mask_reg_number;
        break;
      case LIBXSMM_X86_INSTR_VPBLENDMB:
        if ( i_immediate != LIBXSMM_X86_IMM_UNDEF ) {
          fprintf(stderr,"libxsmm_instruction_vec_compute_reg_mask for VPBLENDMB immediate=%u != %i\n",i_immediate,LIBXSMM_X86_IMM_UNDEF);
          exit(-1);
        }
        l_second = 0x1; l_fourth = i_mask_reg_number; l_fifth = 0x1;
        break;
      case LIBXSMM_X86_INSTR_VPBLENDMW:
        if ( i_immediate != LIBXSMM_X86_IMM_UNDEF ) {
          fprintf(stderr,"libxsmm_instruction_vec_compute_reg_mask for VPBLENDMW immediate=%u != %i\n",i_immediate,LIBXSMM_X86_IMM_UNDEF);
          exit(-1);
        }
        l_second = 0x1; l_third = 0x80; l_fourth = i_mask_reg_number; l_fifth = 0x1;
        break;
      case LIBXSMM_X86_INSTR_VPCMPD:
        l_second = 0x2; l_fifth = -0x46; l_oddgrp2 = 0; l_2or3grp2 = 0; l_vecval2 = i_mask_reg_number;
        break;
      case LIBXSMM_X86_INSTR_VPCMPUD:
        l_second = 0x2; l_fifth = -0x47; l_oddgrp2 = 0; l_2or3grp2 = 0; l_vecval2 = i_mask_reg_number;
        break;
      case LIBXSMM_X86_INSTR_VPCMPW:
        l_second = 0x2; l_fifth = -0x26; l_third = 0x80; l_oddgrp2 = 0; l_2or3grp2 = 0; l_vecval2 = i_mask_reg_number;
        break;
      case LIBXSMM_X86_INSTR_VPCMPB:
        l_second = 0x2; l_fifth = -0x26; l_oddgrp2 = 0; l_2or3grp2 = 0; l_vecval2 = i_mask_reg_number;
        break;
      case LIBXSMM_X86_INSTR_VPCMPUW:
        l_second = 0x2; l_fifth = -0x27; l_third = 0x80; l_oddgrp2 = 0; l_2or3grp2 = 0; l_vecval2 = i_mask_reg_number;
        break;
      case LIBXSMM_X86_INSTR_VPCMPUB:
        l_second = 0x2; l_fifth = -0x27; l_oddgrp2 = 0; l_2or3grp2 = 0; l_vecval2 = i_mask_reg_number;
        break;
      case LIBXSMM_X86_INSTR_VCMPPS:
        l_third = -1; l_fifth = 0x5d; l_oddgrp2 = 0; l_2or3grp2 = 0; l_vecval2 = i_mask_reg_number;
        break;
      case LIBXSMM_X86_INSTR_VPADDD:
        if ( i_immediate != LIBXSMM_X86_IMM_UNDEF ) {
          fprintf(stderr,"libxsmm_instruction_vec_compute_reg_mask immediate=%u != %i\n",i_immediate,LIBXSMM_X86_IMM_UNDEF);
          exit(-1);
        }
        l_fifth = 0x99; l_fourth = i_mask_reg_number;
        break;
      case LIBXSMM_X86_INSTR_VPANDD:
        if ( i_immediate != LIBXSMM_X86_IMM_UNDEF ) {
          fprintf(stderr,"libxsmm_instruction_vec_compute_reg_mask immediate=%u != %i\n",i_immediate,LIBXSMM_X86_IMM_UNDEF);
          exit(-1);
        }
        l_fifth = 0x76; l_fourth = i_mask_reg_number;
        break;
      case LIBXSMM_X86_INSTR_VPSUBD:
        if ( i_immediate != LIBXSMM_X86_IMM_UNDEF ) {
          fprintf(stderr,"libxsmm_instruction_vec_compute_reg_mask immediate=%u != %i\n",i_immediate,LIBXSMM_X86_IMM_UNDEF);
          exit(-1);
        }
        l_fifth = 0x95; l_fourth = i_mask_reg_number;
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_vec_compute_reg_mask: Unknown instruction type: %u\n", i_vec_instr);
        exit(-1);
    }
    /* zeroing semantics only apply when a real mask (k1..k7) is in use */
    if ( i_use_zero_masking != 0 && i_mask_reg_number != 0 ) l_fourth += 0x80;
    /* assemble the 6-byte encoding (0x62 escape + 3 prefix bytes + opcode +
     * ModRM); each base byte is corrected by the adjustments chosen above and
     * by the high-register bits of the three operands */
    buf[i++] = (unsigned char)(0x62);
    buf[i++] = (unsigned char)(0xf1 + l_second - l_oddgrp0 * 0x20 - l_oddgrp2 * 0x80 - l_2or3grp0 * 0x40 - l_2or3grp2 * 0x10);
    buf[i++] = (unsigned char)(0x7d + l_third - l_oddgrp1 * 0x40 - l_vecval1*8);
    buf[i++] = (unsigned char)(0x48 - l_2or3grp1 * 0x08 + l_fourth );
    buf[i++] = (unsigned char)(0x65 + l_fifth);
    buf[i++] = (unsigned char)(0xc0 + l_vecval0 + l_vecval2*8);
    /* optional trailing immediate byte (e.g. compare predicate) */
    if ( i_immediate != LIBXSMM_X86_IMM_UNDEF ) {
      buf[i++] = (unsigned char)(i_immediate);
    }
    io_generated_code->code_size = i;
    /* *loc = i; */
  } else {
    /* textual emission path */
    /* TODO: Debug- this code was just copied from another routine */
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_instr_name[16];
    char l_masking[16];
    libxsmm_get_x86_instr_name( i_vec_instr, l_instr_name, 15 );
    if ( i_mask_reg_number != 0 ) {
      /* avoid format-truncation warning due to unsigned int (theoretically) exceeding length of string (l_masking) */
      LIBXSMM_ASSERT_MSG(i_mask_reg_number < 8, "Invalid mask register");
      if ( i_use_zero_masking == 0 ) {
        /* merge-masking suffix "{kN}" (escaped for inline asm when code_type == 0) */
        if ( io_generated_code->code_type == 0 ) {
          LIBXSMM_SNPRINTF(l_masking, 16, "%%{k%hd%%}", (unsigned short)i_mask_reg_number);
        } else {
          LIBXSMM_SNPRINTF(l_masking, 16, "{k%hd}", (unsigned short)i_mask_reg_number);
        }
      } else {
        /* zero-masking suffix "{kN}{z}" */
        if ( io_generated_code->code_type == 0 ) {
          LIBXSMM_SNPRINTF(l_masking, 16, "%%{k%hd%%}%%{z%%}", (unsigned short)i_mask_reg_number);
        } else {
          LIBXSMM_SNPRINTF(l_masking, 16, "{k%hd}{z}", (unsigned short)i_mask_reg_number);
        }
      }
    } else l_masking[0] = (char)0; /* no mask */
    /* build vXYZpd/ps/sd/ss instruction pure register use*/
    if ( i_instruction_set >= LIBXSMM_X86_AVX512 ) {
      if ( io_generated_code->code_type == 0 ) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %%%%%cmm%u, %%%%%cmm%u, %%%%%cmm%u%s\\n\\t\"\n", l_instr_name, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, i_vector_name, i_vec_reg_number_2, l_masking );
      } else {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %%%cmm%u, %%%cmm%u, %%%cmm%u%s\n", l_instr_name, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, i_vector_name, i_vec_reg_number_2, l_masking );
      }
    } else {
      /* This is an error */
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}

LIBXSMM_API_INTERN
/* Emits a vector compute instruction with one memory operand, e.g.
 * "op mem(base [+ idx*scale] + disp), vreg0, vreg1".
 * Depending on io_generated_code->code_type this either encodes raw machine
 * bytes directly into the JIT buffer (code_type > 1) or appends the textual
 * assembly / inline-assembly form of the instruction (code_type <= 1).
 *
 * i_instruction_set   - target ISA level; broadcasts require >= LIBXSMM_X86_AVX512
 * i_vec_instr         - LIBXSMM_X86_INSTR_* opcode selector (large switch below)
 * i_use_broadcast     - nonzero requests an embedded memory broadcast ({1toN})
 * i_gp_reg_base/idx   - GP registers forming the address (idx may be UNDEF)
 * i_scale             - index scale: must be 1, 2, 4 or 8 when idx is used
 * i_displacement      - signed address displacement
 * i_vector_name       - 'x', 'y' or 'z' register class
 * i_vec_reg_number_0/1 - vector register operands (reg 1 is UNDEF for the
 *                        single-source VPMOV* forms, see those cases)
 */
void libxsmm_x86_instruction_vec_compute_mem( libxsmm_generated_code* io_generated_code,
                                              const unsigned int      i_instruction_set,
                                              const unsigned int      i_vec_instr,
                                              const unsigned int      i_use_broadcast,
                                              const unsigned int      i_gp_reg_base,
                                              const unsigned int      i_gp_reg_idx,
                                              const unsigned int      i_scale,
                                              const int               i_displacement,
                                              const char              i_vector_name,
                                              const unsigned int      i_vec_reg_number_0,
                                              const unsigned int      i_vec_reg_number_1 ) {
  /* @TODO add checks in debug mode */
  if ( (i_instruction_set < LIBXSMM_X86_AVX512) && (i_use_broadcast != 0) ) {
    /* embedded broadcast is an AVX-512 (EVEX) feature only */
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_NO_AVX512_BCAST );
    return;
  }

  if ( io_generated_code->code_type > 1 ) {
    /* direct binary encoding into the JIT buffer */
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size;
    /*int i = *loc;*/
    unsigned int l_maxsize = io_generated_code->buffer_size;
    /*unsigned int l_maxsize = 1024;*/
    /* The l_second/l_third/l_fourth deltas below are added onto fixed base
     * bytes of the VEX/EVEX prefix (see the 0xc1+l_second, 0xfd+l_third,
     * 0x48+l_fourth emissions further down); l_fpadj/l_fpadj2 adjust the
     * opcode byte (base 0x59) and the prefix byte that selects the opcode
     * map / pp field, respectively. */
    int l_second=0, l_third=0, l_fourth=0, l_xreg=0;
    int l_reg0 = 0;
    int l_vec_0 = i_vec_reg_number_0;
    int l_vec_1 = i_vec_reg_number_1;
    int l_reg1 = l_vec_0;
    int l_reg2 = l_vec_1;
    int l_fpadj=0, l_place=0;
    int l_fpadj2=0;
    int l_bytes=4;            /* total prefix+opcode length selector: 4 (2-byte VEX), 5 (3-byte VEX) or 6 (EVEX) */
    int l_regi=0;
    int l_forced_offset=0;
    int l_sizereg=64;         /* NOTE(review): appears to be the disp8 scaling unit passed to
                                 internal_x86_instructions_add_offset (EVEX disp8*N compression) */
    int l_scaleadj=0;
    int l_sse3 = 0;           /* set by the legacy-SSE opcode cases; selects the SSE encoder below */
    int l_insert_extra_byte = 0; /* SSE mandatory prefix (0x66/0xF2/0xF3) when nonzero */
    int l_fma = 0;
    /* int l_iregoff = 0; */
    int l_broadcast = (int)i_use_broadcast;

    if ( l_maxsize - i < 20 ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL );
      return;
    }
    switch ( i_vector_name ) {
      case 'x':
        l_sizereg = 1;
        if ( l_broadcast == 1 ) {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: broadcasts aren't enabled with xmm yet\n");
          exit(-1);
        }
        break;
      case 'y':
        l_sizereg = 1;
        if ( l_broadcast == 1 ) {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: broadcasts aren't enabled with ymm yet\n");
          exit(-1);
        }
        break;
      case 'z':
        l_bytes = 6; /* zmm always needs the 6-byte EVEX form */
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_vec_compute_mem: Unknown sort of fp registers\n");
        exit(-1);
    }
    if ( l_broadcast == 1 ) l_sizereg = 8; /* double-precision broadcast element size; PS cases override to 4 */

    /* Per-instruction encoding deltas. The magic constants are differences
     * from the shared base bytes emitted after this switch. */
    switch ( i_vec_instr ) {
      case LIBXSMM_X86_INSTR_VXORPD:
        l_fpadj = -2;
        break;
      case LIBXSMM_X86_INSTR_VMULPD:
        break;
      case LIBXSMM_X86_INSTR_VADDPD:
        l_fpadj = -1;
        break;
      case LIBXSMM_X86_INSTR_VPANDD:
        l_fpadj = 0x82;
        l_fpadj2 = 0x80;
        l_bytes = 6;
        l_sizereg = 64;
        if ( i_vector_name == 'x' ) {
          l_fourth -= 0x40;
          l_sizereg = 16;
        } else if ( i_vector_name == 'y' ) {
          l_fourth -= 0x20;
          l_sizereg = 32;
        }
        if ( l_broadcast == 1 ) l_sizereg = 4;
        break;
      case LIBXSMM_X86_INSTR_VSUBPD:
        l_fpadj = 3;
        break;
      case LIBXSMM_X86_INSTR_VMAXPD:
        l_fpadj = 6;
        break;
      case LIBXSMM_X86_INSTR_VMAXPS:
        l_fpadj = 6;
        l_fpadj2 = -0x81;
        if ( i_vector_name == 'x' ) {
          l_fourth -= 0x40;
          l_third += 0x80;
        } else if ( i_vector_name == 'y' ) {
          l_third -= 0x80;
        }
        break;
      case LIBXSMM_X86_INSTR_VPERMW:
        l_second += 0x01;
        l_fpadj = 0x34;
        l_bytes = 6;
        if ( i_vector_name == 'x' ) {
          l_fourth -= 0x40;
          l_sizereg = 16;
        } else if ( i_vector_name == 'y' ) {
          l_fourth -= 0x20;
          l_sizereg = 32;
        }
        break;
      case LIBXSMM_X86_INSTR_VPERMD:
        if (i_vector_name == 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vpermd not enabled with xmm\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_bytes = 5;
        l_fma = 1; /* This is not a FMA, but it does decode as one */
        l_fpadj = -0x23;
        break;
      /* packed double-precision FMA family (all share l_fpadj2 += 0x80, l_fma = 1) */
      case LIBXSMM_X86_INSTR_VFMADD231PD:
        l_fpadj  += 0x5f;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMADD213PD:
        l_fpadj  += 0x4f;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMADD132PD:
        l_fpadj  += 0x3f;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMSUB231PD:
        l_fpadj  += 0x61;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMSUB213PD:
        l_fpadj  += 0x51;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMSUB132PD:
        l_fpadj  += 0x41;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMADD231PD:
        l_fpadj  += 0x63;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMADD213PD:
        l_fpadj  += 0x53;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMADD132PD:
        l_fpadj  += 0x43;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMSUB231PD:
        l_fpadj  += 0x65;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMSUB213PD:
        l_fpadj  += 0x55;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMSUB132PD:
        l_fpadj  += 0x45;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      /* scalar double-precision FMA family: xmm only */
      case LIBXSMM_X86_INSTR_VFMADD231SD:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfmadd231sd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj  += 0x60;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMADD213SD:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfmadd213sd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj  += 0x50;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMADD132SD:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfmadd132sd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj  += 0x40;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMSUB231SD:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfmsub231sd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj  += 0x62;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMSUB213SD:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfmsub213sd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj  += 0x52;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMSUB132SD:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfmsub132sd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj  += 0x42;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMADD231SD:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfnmadd231sd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj  += 0x64;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMADD213SD:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfnmadd213sd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj  += 0x54;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMADD132SD:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfnmadd132sd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj  += 0x44;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMSUB231SD:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfnmsub231sd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj  += 0x66;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMSUB213SD:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfnmsub213sd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj  += 0x56;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMSUB132SD:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfnmsub132sd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj  += 0x46;
        l_fpadj2 += 0x80;
        l_fma = 1;
        break;
      /* packed single-precision FMA family: no l_fpadj2 delta, broadcast element is 4 bytes */
      case LIBXSMM_X86_INSTR_VFMADD231PS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x5f;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMADD213PS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x4f;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMADD132PS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x3f;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMSUB231PS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x61;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMSUB213PS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x51;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMSUB132PS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x41;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMADD231PS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x63;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMADD213PS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x53;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMADD132PS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x43;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMSUB231PS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x65;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMSUB213PS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x55;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMSUB132PS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x45;
        l_fma = 1;
        break;
      /* scalar single-precision FMA family: xmm only */
      case LIBXSMM_X86_INSTR_VFMADD231SS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfmadd231ss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj += 0x60;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMADD213SS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfmadd213ss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj += 0x50;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMADD132SS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfmadd132ss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj += 0x40;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMSUB231SS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfmsub231ss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj += 0x62;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMSUB213SS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfmsub213ss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj += 0x52;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFMSUB132SS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfmsub132ss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj += 0x42;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMADD231SS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfnmadd231ss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj += 0x64;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMADD213SS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfnmadd213ss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj += 0x54;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMADD132SS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfnmadd132ss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj += 0x44;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMSUB231SS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfnmsub231ss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj += 0x66;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMSUB213SS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfnmsub213ss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj += 0x56;
        l_fma = 1;
        break;
      case LIBXSMM_X86_INSTR_VFNMSUB132SS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vfnmsub132ss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj += 0x46;
        l_fma = 1;
        break;
      /* scalar double add/mul/sub: xmm only */
      case LIBXSMM_X86_INSTR_VMULSD:
        l_fpadj2 = 2;
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vmulsd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        break;
      case LIBXSMM_X86_INSTR_VADDSD:
        l_fpadj =-1;
        l_fpadj2 = 2;
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vaddsd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        break;
      case LIBXSMM_X86_INSTR_VSUBSD:
        l_fpadj = 3;
        l_fpadj2 = 2;
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vsubsd and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        break;
      /* down-/up-convert moves: single vector operand, reg 1 must be UNDEF */
      case LIBXSMM_X86_INSTR_VPMOVDW:
        if ( l_vec_1 != LIBXSMM_X86_VEC_REG_UNDEF ) {
          /*LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_VEC_REG_MUST_BE_UNDEF );*/
          fprintf(stderr,"Please call VPMOVDW with vector reg 0 only. Use UNDEF with reg 1\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_bytes = 6;
        l_second += 0x1;
        l_fpadj2 -= 0x7F;
        l_fpadj -= 0x26;
        l_sizereg = 32;
        if ( i_vector_name == 'x' ) {
          l_fourth -= 0x40;
          l_sizereg = 8;
        } else if ( i_vector_name == 'y' ) {
          l_fourth -= 0x20;
          l_sizereg = 16;
        }
        break;
      case LIBXSMM_X86_INSTR_VPMOVSXWD:
        if ( l_vec_1 != LIBXSMM_X86_VEC_REG_UNDEF ) {
          /*LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_VEC_REG_MUST_BE_UNDEF );*/
          fprintf(stderr,"Please call VPMOVSXWD with vector reg 0 only. Use UNDEF with reg 1\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_bytes = 5;
        l_second += 0x21;
        l_fpadj -= 0x36;
        l_fpadj2 -= 0x0;
        l_sizereg = 1;
        break;
      case LIBXSMM_X86_INSTR_VPMOVZXWD:
        if ( l_vec_1 != LIBXSMM_X86_VEC_REG_UNDEF ) {
          /*LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_VEC_REG_MUST_BE_UNDEF );*/
          fprintf(stderr,"Please call VPMOVZXWD with vector reg 0 only. Use UNDEF with reg 1\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_bytes = 5;
        l_second += 0x21;
        l_fpadj -= 0x26;
        l_fpadj2 -= 0x0;
        l_sizereg = 1;
        break;
      case LIBXSMM_X86_INSTR_VPMOVSXBD:
        if ( l_vec_1 != LIBXSMM_X86_VEC_REG_UNDEF ) {
          /*LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_VEC_REG_MUST_BE_UNDEF );*/
          fprintf(stderr,"Please call VPMOVSXBD with vector reg 0 only. Use UNDEF with reg 1\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_bytes = 5;
        l_second += 0x21;
        l_fpadj -= 0x38;
        l_fpadj2 -= 0x0;
        l_sizereg = 1;
        break;
      case LIBXSMM_X86_INSTR_VPMOVZXBD:
        if ( l_vec_1 != LIBXSMM_X86_VEC_REG_UNDEF ) {
          /*LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_VEC_REG_MUST_BE_UNDEF );*/
          fprintf(stderr,"Please call VPMOVZXBD with vector reg 0 only. Use UNDEF with reg 1\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_bytes = 5;
        l_second += 0x21;
        l_fpadj -= 0x28;
        l_fpadj2 -= 0x0;
        l_sizereg = 1;
        break;
      case LIBXSMM_X86_INSTR_VXORPS:
        l_fpadj2 = -1;
        l_fpadj = -2;
        if ( l_broadcast == 1 ) l_sizereg = 4;
        if ( i_vector_name == 'z' ) {
          l_fpadj2 -= 0x80;
        }
        break;
      case LIBXSMM_X86_INSTR_VMULPS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        /* VEX encoding only reaches regs 0..15; above that (or with zmm) the EVEX form is required */
        if ( (i_vector_name!='z') && (l_vec_0<=15) && (l_vec_1<=15) )
          l_fpadj2 = -1;
        else
          l_fpadj2 = -0x81;
        break;
      case LIBXSMM_X86_INSTR_VDPBF16PS:
        if ( i_vector_name=='y' ) {
          l_sizereg = 32;
          l_fourth -= 0x20;
        }
        if ( i_vector_name=='x' ) {
          l_sizereg = 16;
          l_fourth -= 0x40;
        }
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj2 = -0x81;
        l_fpadj2 += 0x02;
        l_fpadj = -7;
        l_second += 1;
        l_bytes = 6;
        break;
      case LIBXSMM_X86_INSTR_VCVTNE2PS2BF16:
        if ( i_vector_name=='y' ) {
          l_sizereg = 32;
          l_fourth -= 0x20;
        }
        if ( i_vector_name=='x' ) {
          l_sizereg = 16;
          l_fourth -= 0x40;
        }
        if ( l_broadcast == 1 ) l_sizereg = 4;
#if 0 /* if/else branches with same outcome */
        if ( (i_vector_name!='z') && (l_vec_0<=15) && (l_vec_1<=15) )
#endif
        {
          l_fpadj2 = -0x81;
        }
#if 0 /* if/else branches with same outcome */
        else l_fpadj2 = -0x81;
#endif
        l_fpadj2 += 0x02;
        l_fpadj = 0x19;
        l_second += 1;
        l_third += 1;
        l_bytes = 6;
        break;
      case LIBXSMM_X86_INSTR_VADDPS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        if ( (i_vector_name!='z') && (l_vec_0<=15) && (l_vec_1<=15) )
          l_fpadj2 = -1;
        else
          l_fpadj2 = -0x81;
        l_fpadj = -1;
        break;
      case LIBXSMM_X86_INSTR_VSUBPS:
        if ( l_broadcast == 1 ) l_sizereg = 4;
        if ( (i_vector_name!='z') && (l_vec_0<=15) && (l_vec_1<=15) )
          l_fpadj2 = -1;
        else
          l_fpadj2 = -0x81;
        l_fpadj = 3;
        break;
      case LIBXSMM_X86_INSTR_VMULSS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vmulss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj2 = 1;
        break;
      case LIBXSMM_X86_INSTR_VADDSS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vaddss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj =-1;
        l_fpadj2 = 1;
        break;
      case LIBXSMM_X86_INSTR_VSUBSS:
        if (i_vector_name != 'x') {
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: vsubss and ymm/zmm?\n");
#ifdef ERROR_EXIT
          exit(-1);
#endif
        }
        l_fpadj = 3;
        l_fpadj2 = 1;
        break;
      case LIBXSMM_X86_INSTR_VPXORD:
        l_bytes = 6;
        if ( i_vector_name == 'x' ) {
          l_fourth -= 0x40;
          l_sizereg = 16;
        } else if ( i_vector_name == 'y' ) {
          l_fourth -= 0x20;
          l_sizereg = 32;
        }
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x96;
        l_fpadj2 += 0x80;
        break;
      case LIBXSMM_X86_INSTR_VPORD:
        l_bytes = 6;
        if ( i_vector_name == 'x' ) {
          l_fourth -= 0x40;
          l_sizereg = 16;
        } else if ( i_vector_name == 'y' ) {
          l_fourth -= 0x20;
          l_sizereg = 32;
        }
        if ( l_broadcast == 1 ) l_sizereg = 4;
        l_fpadj += 0x92;
        l_fpadj2 += 0x80;
        break;
      case LIBXSMM_X86_INSTR_VPSRAVD:
        l_second += 0x01;
        l_fpadj -= 0x13;
        l_fpadj2 -= 0x80;
#if 1
        if ( i_vector_name == 'x' ) {
          l_second += 0x20;
          l_third -= 0x80;
          l_fourth -= 0x40;
          /* high GP registers (r8..r15) flip the REX-equivalent bit in the prefix */
          if ( (i_gp_reg_base > 7) && (i_gp_reg_base <= 15 ) ) {
            l_second -= 0x20;
          } else if ( (i_gp_reg_idx > 7) && (i_gp_reg_idx <= 15) ) {
            l_second -= 0x20;
          }
        } else if ( i_vector_name == 'y' ) {
          l_second += 0x20;
          l_third -= 0x80;
          l_fourth -= 0x20;
          if ( (i_gp_reg_base > 7) && (i_gp_reg_base <= 15 ) ) {
            l_second -= 0x20;
          } else if ( (i_gp_reg_idx > 7) && (i_gp_reg_idx <= 15) ) {
            l_second -= 0x20;
          }
        }
#endif
        l_bytes = 5;
        if ( l_broadcast == 1 ) l_sizereg = 4;
        break;
      case LIBXSMM_X86_INSTR_VPADDD:
        l_fpadj2 -= 0x80;
        l_fpadj += 0xA5;
        if ( i_vector_name == 'x' ) {
          /* Either l_third, l_second, or l_fpadj2 is off by 0x80 */
          l_second += 0x80;
          if ( (i_gp_reg_base > 7) && (i_gp_reg_base <= 15 ) ) {
            l_second += 0x80;
            l_third -= 0x80;
          } else if ( (i_gp_reg_idx > 7) && (i_gp_reg_idx <= 15) ) {
            l_second += 0x80;
            l_third -= 0x80;
          }
        } else if ( i_vector_name == 'y' ) {
          l_second += 0x80;
          if ( (i_gp_reg_base > 7) && (i_gp_reg_base <= 15 ) ) {
            l_second += 0x80;
            l_third -= 0x80;
          } else if ( (i_gp_reg_idx > 7) && (i_gp_reg_idx <= 15) ) {
            l_second += 0x80;
            l_third -= 0x80;
          }
        }
        if ( l_broadcast == 1 ) l_sizereg = 4;
        break;
      /* legacy SSE opcodes: handled by the separate SSE encoder below
       * (l_insert_extra_byte is the mandatory 0x66/0xF2/0xF3 prefix) */
      case LIBXSMM_X86_INSTR_MOVAPD:
        l_sse3 = 1;
        l_insert_extra_byte = 0x66;
        l_fpadj = 0x18;
        break;
      case LIBXSMM_X86_INSTR_MOVUPD:
        l_sse3 = 1;
        l_insert_extra_byte = 0x66;
        break;
      case LIBXSMM_X86_INSTR_MOVAPS:
        l_sse3 = 1;
        l_fpadj = 0x18;
        break;
      case LIBXSMM_X86_INSTR_MOVUPS:
        l_sse3 = 1;
        break;
      case LIBXSMM_X86_INSTR_MOVSD:
        l_sse3 = 1;
        l_insert_extra_byte = 0xF2;
        break;
      case LIBXSMM_X86_INSTR_MOVSS:
        l_sse3 = 1;
        l_insert_extra_byte = 0xF3;
        break;
      case LIBXSMM_X86_INSTR_MOVDDUP:
        l_sse3 = 1;
        l_insert_extra_byte = 0xF2;
        l_fpadj = 2;
        break;
      case LIBXSMM_X86_INSTR_XORPD:
        l_sse3 = 1;
        l_insert_extra_byte = 0x66;
        l_fpadj = 0x47;
        break;
      case LIBXSMM_X86_INSTR_XORPS:
        l_sse3 = 1;
        l_fpadj = 0x47;
        break;
      case LIBXSMM_X86_INSTR_MULPD:
        l_sse3 = 1;
        l_insert_extra_byte = 0x66;
        l_fpadj = 0x49;
        break;
      case LIBXSMM_X86_INSTR_MULSS:
        l_sse3 = 1;
        l_insert_extra_byte = 0xF3;
        l_fpadj = 0x49;
        break;
      case LIBXSMM_X86_INSTR_MULPS:
        l_sse3 = 1;
        l_fpadj = 0x49;
        break;
      case LIBXSMM_X86_INSTR_ADDPD:
        l_sse3 = 1;
        l_insert_extra_byte = 0x66;
        l_fpadj = 0x48;
        break;
      case LIBXSMM_X86_INSTR_ADDSS:
        l_sse3 = 1;
        l_insert_extra_byte = 0xF3;
        l_fpadj = 0x48;
        break;
      case LIBXSMM_X86_INSTR_ADDPS:
        l_sse3 = 1;
        l_fpadj = 0x48;
        break;
      case LIBXSMM_X86_INSTR_ADDSD:
        l_sse3 = 1;
        l_insert_extra_byte = 0xF2;
        l_fpadj = 0x48;
        break;
      case LIBXSMM_X86_INSTR_SUBPD:
        l_sse3 = 1;
        l_insert_extra_byte = 0x66;
        l_fpadj = 0x4c;
        break;
      case LIBXSMM_X86_INSTR_SUBSS:
        l_sse3 = 1;
        l_insert_extra_byte = 0xF3;
        l_fpadj = 0x4c;
        break;
      case LIBXSMM_X86_INSTR_SUBPS:
        l_sse3 = 1;
        l_fpadj = 0x4c;
        break;
      case LIBXSMM_X86_INSTR_SUBSD:
        l_sse3 = 1;
        l_insert_extra_byte = 0xF2;
        l_fpadj = 0x4c;
        break;
      case LIBXSMM_X86_INSTR_MULSD:
        l_sse3 = 1;
        l_insert_extra_byte = 0xF2;
        l_fpadj = 0x49;
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_vec_compute_mem: Unknown instruction type: %u\n", i_vec_instr);
        break;
    }
    /* translate the scale factor into the SIB-byte scale field (bits 6..7) */
    if ( (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) && ((int)i_gp_reg_idx >= LIBXSMM_X86_GP_REG_RAX) && (i_gp_reg_idx <= LIBXSMM_X86_GP_REG_R15) ) {
      switch ( i_scale ) {
        case 1:
          l_scaleadj=0;
          break;
        case 2:
          l_scaleadj=0x40;
          break;
        case 4:
          l_scaleadj=0x80;
          break;
        case 8:
          l_scaleadj=0xc0;
          break;
        default:
          fprintf(stderr, "libxsmm_instruction_vec_compute_mem: cannot handle i_scale=%u parameter\n", i_scale);
          exit(-1);
      }
    }
    if ( !l_sse3 ) {
      /* AVX/AVX-512 (VEX/EVEX) encoding path */
      if ( l_fma == 1 ) {
        /* Common code to all 48 FMAs. Pulling outside case statement */
        if ( i_vector_name == 'z' ) {
          l_second += 0x1;
          l_fpadj2 -= 0x80;
        } else {
          l_second += 0x21;
          if ( (i_gp_reg_base > 7) && (i_gp_reg_base <= 15 ) ) {
            l_second -= 0x20;
          } else if ( (i_gp_reg_idx > 7) && (i_gp_reg_idx <= 15) ) {
            l_second -= 0x20;
          }
        }
        l_bytes = 5;
      }
      if ( (i_gp_reg_base >= 8) && (i_gp_reg_base != LIBXSMM_X86_GP_REG_UNDEF) ) {
        if ( l_bytes < 5 ) l_bytes = 5;
        /* else l_iregoff -= 0x20; */
      }
      /* Note to Greg: i_gp_reg_idx is an unsigned number hence no comparison against negative range */
      /* Note from Greg: Oh, I know. But I reused this code elsewhere in a case where i_gp_reg_idx
       * being UNDEF was -1. I just wanted the same code to work in both places. */
      if ( (i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF) /*|| (i_gp_reg_idx < 0)*/ || (i_gp_reg_idx > 15) ) {
        l_regi = 0;
      } else {
        l_regi = i_gp_reg_idx;
      }
      if ( (i_gp_reg_idx >= 8) && (i_gp_reg_idx <=15) && (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) ) {
        if ( l_bytes < 5 ) l_bytes = 5;
        l_regi = i_gp_reg_idx-8;
      }
      if ( i_vector_name == 'x' ) l_xreg = -4;
      /* low 3 bits of each register number go into the ModRM/prefix bytes directly */
      l_reg0 = i_gp_reg_base % 8;
      l_reg1 = l_vec_0 % 8;
      l_reg2 = l_vec_1 % 8;
      if ( i_vec_instr == LIBXSMM_X86_INSTR_VPMOVDW ) {
        /* We only have 1 vector register input */
        l_reg2 = l_vec_0 % 8;
        l_reg1 = 0;
        l_vec_1 = l_vec_0;
        l_vec_0 = 0;
      }
      if ( (i_vec_instr == LIBXSMM_X86_INSTR_VPMOVSXWD) || (i_vec_instr == LIBXSMM_X86_INSTR_VPMOVZXWD) ||
           (i_vec_instr == LIBXSMM_X86_INSTR_VPMOVSXBD) || (i_vec_instr == LIBXSMM_X86_INSTR_VPMOVZXBD) ) {
        /* We only have 1 vector register input */
        l_reg2 = i_vec_reg_number_0 % 8;
        l_reg1 = 0;
        l_vec_0 = 0;
        l_vec_1 = i_vec_reg_number_0;
        if ((i_gp_reg_base >= 8) && (i_gp_reg_base != LIBXSMM_X86_GP_REG_UNDEF)) {
          if ((i_gp_reg_idx < 8) && (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF)) {
            l_second -= 0x20;
          }
        }
        if ((i_gp_reg_base < 8) && (i_gp_reg_base != LIBXSMM_X86_GP_REG_UNDEF)) {
          if ((i_gp_reg_idx >= 8) && (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF)) {
            l_second -= 0x20;
          }
          if ( i_vector_name == 'z' ) {
            if ( (i_gp_reg_idx < 8) || (i_gp_reg_idx==LIBXSMM_X86_GP_REG_UNDEF) ) {
              l_second += 0xE0;
            }
            if ( (i_vec_instr == LIBXSMM_X86_INSTR_VPMOVSXBD) || (i_vec_instr == LIBXSMM_X86_INSTR_VPMOVZXBD) ) {
              l_sizereg = 16;
            } else {
              l_sizereg = 32;
            }
          }
        }
        if ( i_vector_name == 'z' ) l_third -= 0x80;
      }
      if ( l_vec_0 >= 8 ) { l_third -= 0x40; }
      if ( l_vec_1 >= 8 ) { l_second -= 0x80; }
      /* zmm or high vector registers (16..31) force the 6-byte EVEX form */
      if ( (i_vector_name == 'z') || (l_vec_0 > 15) || (l_vec_1 > 15) ) l_bytes = 6;

      if ( l_bytes == 4 ) {
        /* 2-byte VEX prefix (0xc5) + opcode + ModRM [+ SIB] */
        buf[i++] = 0xc5;
        buf[i++] = (unsigned char)(0xfd - 8*l_reg1 + l_third + l_second + l_xreg + l_fpadj2);
        buf[i++] = (unsigned char)(0x59 + l_fpadj);
        /* i_gp_reg_idx is an unsigned number hence no comparison against negative range */
        if ( (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) /*&& (i_gp_reg_idx >=0)*/ && (i_gp_reg_idx <=15) ) {
          buf[i++] = (unsigned char)(0x04 + 8*l_reg2);
          l_place = i-1;
          buf[i++] = (unsigned char)(0x00 + l_reg0 + l_scaleadj + 8*l_regi);
        } else {
          buf[i++] = (unsigned char)(0x00 + l_reg0 + 8*l_reg2);
        }
      } else if ( l_bytes == 5 ) {
        /* 3-byte VEX prefix (0xc4) + opcode + ModRM [+ SIB] */
        if ((i_gp_reg_base >= 8) && (i_gp_reg_base != LIBXSMM_X86_GP_REG_UNDEF)) {
          if ((i_gp_reg_idx >= 8) && (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF)) {
            l_second -= 0x20;
          }
        }
        if ((i_gp_reg_idx >= 8) && (i_gp_reg_idx <=15) && (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF)) {
          l_second -= 0x20;
        }
        buf[i++] = 0xc4;
        buf[i++] = (unsigned char)(0xc1 + l_second);
        buf[i++] = (unsigned char)(0x7d - 8*l_reg1 + l_third + l_xreg + l_fpadj2);
        buf[i++] = (unsigned char)(0x59 + l_fpadj);
        /* i_gp_reg_idx is an unsigned number hence no comparison against negative range */
        if ( (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) /*&& (i_gp_reg_idx >=0)*/ && (i_gp_reg_idx <=15) ) {
          buf[i++] = (unsigned char)(0x04 + 8*l_reg2);
          l_place = i-1;
          buf[i++] = (unsigned char)(0x00 + l_reg0 + l_scaleadj + 8*l_regi);
        } else {
          buf[i++] = (unsigned char)(0x00 + l_reg0 + 8*l_reg2);
        }
      } else if ( l_bytes == 6 ) {
        /* 4-byte EVEX prefix (0x62) + opcode + ModRM [+ SIB] */
        if ( i_gp_reg_base >= 8 ) { l_second -= 0x20; }
        if ( (i_gp_reg_idx >= 8) && (i_gp_reg_idx <= 15) && (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) ) {
          l_second -= 0x40;
        }
        /* if ( l_vec_0 >= 8 ) { l_third -= 0x40; } */
        if ( l_vec_0 >= 16) {
          l_third += 0x40;
          l_fourth -= 0x8;
        }
        if ( l_vec_0 >= 24) { l_third -= 0x40; }
        /* if ( l_vec_1 >= 8 ) { l_second -= 0x80; } */
        if ( l_vec_1 >= 16) { l_second += 0x70; }
        if ( l_vec_1 >= 24) { l_second -= 0x80; }
        if ( l_broadcast == 1 ) { l_fourth += 0x10; } /* sets the EVEX broadcast bit */
        buf[i++] = 0x62;
        buf[i++] = (unsigned char)(0xf1 + l_second);
        buf[i++] = (unsigned char)(0xfd - 8*l_reg1 + l_third + l_fpadj2);
        buf[i++] = (unsigned char)(0x48 + l_fourth);
        buf[i++] = (unsigned char)(0x59 + l_fpadj);
        /* i_gp_reg_idx is an unsigned number hence no comparison against negative range */
        if ( (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) /*&& (i_gp_reg_idx >=0)*/ && (i_gp_reg_idx <=15) ) {
          buf[i++] = (unsigned char)(0x04 + 8*l_reg2);
          l_place = i-1;
          buf[i++] = (unsigned char)(0x00 + l_reg0 + l_scaleadj + 8*l_regi);
        } else {
          buf[i++] = (unsigned char)(0x00 + l_reg0 + 8*l_reg2);
        }
      }
      /* l_place marks the ModRM byte so the displacement helper can patch its mode bits */
      if (l_place==0) l_place = i - 1;
      if ( ((i_gp_reg_base % 8) == LIBXSMM_X86_GP_REG_RSP) && (i_gp_reg_idx==LIBXSMM_X86_GP_REG_UNDEF) ) {
        buf[i++] = 0x24; /* rsp/r12 as base with no index still needs a SIB byte */
      }
      if ( ( (i_gp_reg_base%8) == 5) && (i_displacement==0) ) {
        /* Registers like rbp/r13 when you have a displacement of 0, we need
           force the single byte of zero to appear. */
        l_forced_offset = 1;
      }
      i += internal_x86_instructions_add_offset( l_place, i, i_displacement, l_forced_offset, l_sizereg, buf );
      io_generated_code->code_size = i;
      /* *loc = i; */
    } else {
      /* SSE3 code */
      int l_vecgrp0 = 0;
      int l_vecval0 = i_vec_reg_number_0 % 8;
      int l_place1=i+2;
      int l_regbas0 = i_gp_reg_base % 8;
      int l_regidx = i_gp_reg_idx % 8;
      int l_gp8 = ((i_gp_reg_base > 7)&&(i_gp_reg_base<=15)?1:0);
      LIBXSMM_ASSERT(0 == l_forced_offset);
      if ( (i_vec_reg_number_0>=8) && (i_vec_reg_number_0<=15) ) l_vecgrp0=1;
      if ( l_insert_extra_byte != 0 ) {
        /* mandatory prefix (0x66/0xF2/0xF3) precedes the 0x0f escape */
        buf[i++]= (unsigned char)(l_insert_extra_byte);
        ++l_place1;
      }
      if (i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) {
        /* REX prefix (base 0x40) only when high GP or high XMM registers are involved */
        int l_sse_preamble2 = 64;
        if ( l_gp8 || (l_vecgrp0>=1) ) {
          if (l_gp8) l_sse_preamble2 += 1;
          if (l_vecgrp0 >=1) l_sse_preamble2 += 4;
          buf[i++] = (unsigned char)(l_sse_preamble2);
          ++l_place1;
        }
        buf[i++] = (unsigned char)(0x0f);
        buf[i++] = (unsigned char)(0x10 + l_fpadj);
        buf[i++] = (unsigned char)(0x00 + l_regbas0 + l_vecval0*8);
        if ( l_regbas0 == 4 ) buf[i++]=0x24; /* rsp/r12 base requires a SIB byte */
      } else {
        int l_sse_preamble2 = 64;
        int l_ix8 = ((i_gp_reg_idx > 7)&&(i_gp_reg_idx<=15)?1:0);
        if ( l_gp8 || l_ix8 || (l_vecgrp0>=1) ) {
          if (l_gp8) l_sse_preamble2 += 1;
          if (l_ix8) l_sse_preamble2 += 2;
          if (l_vecgrp0 >=1) l_sse_preamble2 += 4;
          buf[i++] = (unsigned char)(l_sse_preamble2);
          ++l_place1;
        }
        buf[i++] = (unsigned char)(0x0f);
        buf[i++] = (unsigned char)(0x10 + l_fpadj);
        buf[i++] = (unsigned char)(0x04 + l_vecval0*8);
        buf[i++] = (unsigned char)(0x00 + l_scaleadj + l_regbas0 + l_regidx*8);
      }
      if ( (l_regbas0 == 5) && (i_displacement==0) ) {
        /* rbp/r13 base with zero displacement still needs an explicit disp8 of 0 */
        l_forced_offset = 1;
      }
      i += internal_x86_instructions_add_offset( l_place1, i, i_displacement, l_forced_offset, 1, buf );
      /* *loc = i; */
      io_generated_code->code_size = i;
    }
  } else {
    /* textual output: emit the instruction as (inline-)assembly source */
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_base[4];
    char l_gp_reg_idx[4];
    char l_instr_name[16];
    char l_broadcast[8];
    unsigned int l_single_precision = libxsmm_is_x86_vec_instr_single_precision( i_vec_instr );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_gp_reg_base, 3 );
    libxsmm_get_x86_instr_name( i_vec_instr, l_instr_name, 15 );
    if (l_single_precision == 0) {
      LIBXSMM_SNPRINTF( l_broadcast, 7, "1to8" );
    } else {
      LIBXSMM_SNPRINTF( l_broadcast, 7, "1to16" );
    }
    /* build vXYZpd/ps/sd/ss instruction pure register use*/
    if ( i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) {
      if ( io_generated_code->code_type == 0 ) {
        if (i_use_broadcast != 0) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s)%%{%s%%}, %%%%%cmm%u, %%%%%cmm%u\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base, l_broadcast, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s), %%%%%cmm%u, %%%%%cmm%u\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
        }
      } else {
        if (i_use_broadcast != 0) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s) {%s}, %%%cmm%u, %%%cmm%u\n", l_instr_name, i_displacement, l_gp_reg_base, l_broadcast, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s), %%%cmm%u, %%%cmm%u\n", l_instr_name, i_displacement, l_gp_reg_base, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
        }
      }
    } else {
      libxsmm_get_x86_gp_reg_name( i_gp_reg_idx, l_gp_reg_idx, 3 );
      if ( io_generated_code->code_type == 0 ) {
        if (i_use_broadcast != 0) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s,%%%%%s,%u)%%{%s%%}, %%%%%cmm%u, %%%%%cmm%u\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale, l_broadcast, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s,%%%%%s,%u), %%%%%cmm%u, %%%%%cmm%u\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
        }
      } else {
        if (i_use_broadcast != 0) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s,%%%s,%u) {%s}, %%%cmm%u, %%%cmm%u\n", l_instr_name, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale, l_broadcast, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s,%%%s,%u), %%%cmm%u, %%%cmm%u\n", l_instr_name, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
        }
      }
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}

LIBXSMM_API_INTERN
void libxsmm_x86_instruction_vec_compute_mem_mask ( libxsmm_generated_code* io_generated_code,
                                                    const unsigned int      i_instruction_set,
                                                    const unsigned int      i_vec_instr,
                                                    const unsigned int      i_use_broadcast,
                                                    const unsigned int      i_gp_reg_base,
                                                    const unsigned int      i_gp_reg_idx,
                                                    const unsigned int      i_scale,
                                                    const int               i_displacement,
                                                    const char              i_vector_name,
                                                    const unsigned int      i_vec_reg_number_0,
                                                    const unsigned int      i_vec_reg_number_1,
                                                    const unsigned int      i_immediate,
                                                    const unsigned int      i_mask_reg_number,
                                                    const unsigned int      i_use_zero_masking ) {
  /* @TODO add
checks in debug mode */ if ( (i_instruction_set < LIBXSMM_X86_AVX512) && (i_use_broadcast != 0) ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_NO_AVX512_BCAST ); return; } if ( io_generated_code->code_type > 1 ) { unsigned char *buf = (unsigned char *) io_generated_code->generated_code; int i = io_generated_code->code_size; /*int i = *loc;*/ unsigned int l_maxsize = io_generated_code->buffer_size; /*unsigned int l_maxsize = 1024;*/ int l_regbas0 = i_gp_reg_base % 8; int l_gp8 = ((i_gp_reg_base > 7)&&(i_gp_reg_base<=15)?1:0); int l_regidx = i_gp_reg_idx % 8; int l_ix8 = ((i_gp_reg_idx > 7)&&(i_gp_reg_idx<=15)?1:0); int l_vecval0 = i_vec_reg_number_0 % 8; int l_vecgrp0 = i_vec_reg_number_0 / 8; int l_oddgrp0 = ((l_vecgrp0 % 2)==1); int l_2or3grp0 = (l_vecgrp0>=2); int l_vecval1 = i_vec_reg_number_1 % 8; int l_vecgrp1 = i_vec_reg_number_1 / 8; int l_oddgrp1 = ((l_vecgrp1 % 2)==1); int l_2or3grp1 = (l_vecgrp1>=2); int l_scaleadj = 0; int l_place = i; int l_sizereg = 64; int l_forced_offset = 0; int l_second = 0; int l_third = 0; int l_fourth = 0; int l_fifth = 0; int l_sixth = 0; if ( l_maxsize - i < 20 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL ); return; } if ( (i_gp_reg_base == LIBXSMM_X86_GP_REG_UNDEF) || (((int)i_gp_reg_base < LIBXSMM_X86_GP_REG_RAX) || (i_gp_reg_base > LIBXSMM_X86_GP_REG_R15)) ) { fprintf(stderr,"libxsmm_instruction_vec_compute_mem_mask has invalid i_gp_reg_base input\n"); exit(-1); } if ( (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) && (((int)i_gp_reg_idx < LIBXSMM_X86_GP_REG_RAX) || (i_gp_reg_idx > LIBXSMM_X86_GP_REG_R15)) ) { fprintf(stderr,"libxsmm_instruction_vec_compute_mem_mask has invalid i_gp_reg_idx input\n"); exit(-1); } switch ( i_vector_name ) { case 'x': case 'y': fprintf(stderr, "libxsmm_instruction_vec_compute_mem_mask: xmm/ymm not enabled yet\n"); exit(-1); break; case 'z': break; default: fprintf(stderr, "libxsmm_instruction_vec_compute_mem_mask: Unknown sort of fp registers\n"); exit(-1); 
} switch ( i_vec_instr ) { case LIBXSMM_X86_INSTR_VCMPPS: l_place = i + 5; l_sizereg = 64; l_fifth = 0xA3; l_sixth = i_mask_reg_number*8; l_vecval1 = 0; l_vecgrp1 = 0; l_oddgrp1 = 0; l_2or3grp1 = 0; break; case LIBXSMM_X86_INSTR_VPCMPB: l_place = i + 5; l_sizereg = 64; l_second = 2; l_third = 1; l_fifth = 0x20; l_sixth = i_mask_reg_number*8; l_vecval1 = 0; l_vecgrp1 = 0; l_oddgrp1 = 0; l_2or3grp1 = 0; break; case LIBXSMM_X86_INSTR_VPCMPD: l_place = i + 5; l_sizereg = 64; l_second = 2; l_third = 1; l_sixth = i_mask_reg_number*8; l_vecval1 = 0; l_vecgrp1 = 0; l_oddgrp1 = 0; l_2or3grp1 = 0; break; case LIBXSMM_X86_INSTR_VPCMPW: l_place = i + 5; l_sizereg = 64; l_second = 2; l_third = 0x81; l_fifth = 0x20; l_sixth = i_mask_reg_number*8; l_vecval1 = 0; l_vecgrp1 = 0; l_oddgrp1 = 0; l_2or3grp1 = 0; break; case LIBXSMM_X86_INSTR_VPCMPUB: l_place = i + 5; l_sizereg = 64; l_second = 2; l_third = 1; l_fifth = 0x1F; l_sixth = i_mask_reg_number*8; l_vecval1 = 0; l_vecgrp1 = 0; l_oddgrp1 = 0; l_2or3grp1 = 0; break; case LIBXSMM_X86_INSTR_VPCMPUD: l_place = i + 5; l_sizereg = 64; l_second = 2; l_third = 1; l_fifth = -1; l_sixth = i_mask_reg_number*8; l_vecval1 = 0; l_vecgrp1 = 0; l_oddgrp1 = 0; l_2or3grp1 = 0; break; case LIBXSMM_X86_INSTR_VPCMPUW: l_place = i + 5; l_sizereg = 64; l_second = 2; l_third = 0x81; l_fifth = 0x1F; l_sixth = i_mask_reg_number*8; l_vecval1 = 0; l_vecgrp1 = 0; l_oddgrp1 = 0; l_2or3grp1 = 0; break; case LIBXSMM_X86_INSTR_VPADDD: if ( i_immediate != LIBXSMM_X86_IMM_UNDEF ) { fprintf(stderr,"libxsmm_instruction_vec_compute_mem_mask: vpaddd should not use an immediate. 
You passed %u not %i\n",i_immediate,LIBXSMM_X86_IMM_UNDEF); exit(-1); } l_place = i + 5; l_sizereg = 64; l_third = 1; l_fourth = i_mask_reg_number; l_fifth = 0xDF; break; default: fprintf(stderr, "libxsmm_instruction_vec_compute_mem_mask: Unknown instruction type: %u\n", i_vec_instr); exit(-1); } if ( (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) && ((int)i_gp_reg_idx >= LIBXSMM_X86_GP_REG_RAX) && (i_gp_reg_idx <= LIBXSMM_X86_GP_REG_R15) ) { switch ( i_scale ) { case 1: l_scaleadj=0; break; case 2: l_scaleadj=0x40; break; case 4: l_scaleadj=0x80; break; case 8: l_scaleadj=0xc0; break; default: fprintf(stderr, "libxsmm_instruction_vec_compute_mem_mask: cannot handle i_scale=%u parameter\n", i_scale); exit(-1); } } if ( i_use_broadcast ) { l_fourth += 0x10; l_sizereg = 4; } if ( i_use_zero_masking != 0 && i_mask_reg_number != 0 ) l_fourth += 0x80; if (i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) { buf[i++] = (unsigned char)(0x62); buf[i++] = (unsigned char)(0xf1 + l_second - l_gp8 * 0x20 - l_oddgrp1 * 0x80 - l_2or3grp1 * 0x10 ); buf[i++] = (unsigned char)(0x7c + l_third - l_oddgrp0 * 0x40 - l_vecval0*8); buf[i++] = (unsigned char)(0x48 + l_fourth - l_2or3grp0 * 0x08); buf[i++] = (unsigned char)(0x1F + l_fifth); buf[i++] = (unsigned char)(0x00 + l_sixth + l_regbas0 + l_vecval1*8 ); if ( l_regbas0 == 4 ) buf[i++]=(unsigned char)(0x24); } else { buf[i++] = (unsigned char)(0x62); buf[i++] = (unsigned char)(0xf1 + l_second - l_gp8 * 0x20 - l_ix8 * 0x40 - l_oddgrp1*0x80 - l_2or3grp1 * 0x10); buf[i++] = (unsigned char)(0x7c + l_third - l_oddgrp0 * 0x40 - l_vecval0*8); buf[i++] = (unsigned char)(0x48 + l_fourth - l_2or3grp0 * 0x08); buf[i++] = (unsigned char)(0x1F + l_fifth); buf[i++] = (unsigned char)(0x04 + l_sixth + l_vecval1*8 ); buf[i++] = (unsigned char)(0x00 + l_scaleadj + l_regbas0 + l_regidx*8); } if ( (l_regbas0 == 5) && (i_displacement==0) ) { /* Registers like rbp/r13 when you have a displacement of 0, we need force the single byte of zero to appear. 
*/ l_forced_offset = 1; } i += internal_x86_instructions_add_offset( l_place, i, i_displacement, l_forced_offset, l_sizereg, buf ); if ( i_immediate != LIBXSMM_X86_IMM_UNDEF ) { buf[i++] = (unsigned char)(i_immediate); } io_generated_code->code_size = i; /* *loc = i; */ } else { /* TODO: Debug. This code was just copy/pasted here */ char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; char l_gp_reg_base[4]; char l_gp_reg_idx[4]; char l_instr_name[16]; char l_broadcast[8]; char l_masking[16]; unsigned int l_single_precision = libxsmm_is_x86_vec_instr_single_precision( i_vec_instr ); libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_gp_reg_base, 3 ); libxsmm_get_x86_instr_name( i_vec_instr, l_instr_name, 15 ); if (l_single_precision == 0) { LIBXSMM_SNPRINTF( l_broadcast, 7, "1to8" ); } else { LIBXSMM_SNPRINTF( l_broadcast, 7, "1to16" ); } if ( i_mask_reg_number != 0 ) { /* avoid format-truncation warning due to unsigned int (theoretically) exceeding length of string (l_masking) */ LIBXSMM_ASSERT_MSG(i_mask_reg_number < 8, "Invalid mask register"); if ( i_use_zero_masking == 0) { if ( io_generated_code->code_type == 0 ) { LIBXSMM_SNPRINTF(l_masking, 16, "%%{k%hd%%}", (unsigned short)i_mask_reg_number); } else { LIBXSMM_SNPRINTF(l_masking, 16, "{k%hd}", (unsigned short)i_mask_reg_number); } } else { if ( io_generated_code->code_type == 0 ) { LIBXSMM_SNPRINTF(l_masking, 16, "%%{k%hd%%}%%{z%%}", (unsigned short)i_mask_reg_number); } else { LIBXSMM_SNPRINTF(l_masking, 16, "{k%hd}{z}", (unsigned short)i_mask_reg_number); } } } else l_masking[0] = (char)0; /* no mask */ /* build vXYZpd/ps/sd/ss instruction pure register use*/ if ( i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) { if ( io_generated_code->code_type == 0 ) { if (i_use_broadcast != 0) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s)%%{%s%%}, %%%%%cmm%u, %%%%%cmm%u%s\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base, l_broadcast, i_vector_name, 
i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, l_masking ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s), %%%%%cmm%u, %%%%%cmm%u%s\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, l_masking ); } } else { if (i_use_broadcast != 0) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s) {%s}, %%%cmm%u, %%%cmm%u%s\n", l_instr_name, i_displacement, l_gp_reg_base, l_broadcast, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, l_masking ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s), %%%cmm%u, %%%cmm%u%s\n", l_instr_name, i_displacement, l_gp_reg_base, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, l_masking ); } } } else { libxsmm_get_x86_gp_reg_name( i_gp_reg_idx, l_gp_reg_idx, 3 ); if ( io_generated_code->code_type == 0 ) { if (i_use_broadcast != 0) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s,%%%%%s,%u)%%{%s%%}, %%%%%cmm%u, %%%%%cmm%u%s\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale, l_broadcast, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, l_masking ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s,%%%%%s,%u), %%%%%cmm%u, %%%%%cmm%u%s\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, l_masking ); } } else { if (i_use_broadcast != 0) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s,%%%s,%u) {%s}, %%%cmm%u, %%%cmm%u%s\n", l_instr_name, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale, l_broadcast, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, l_masking ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s 
%i(%%%s,%%%s,%u), %%%cmm%u, %%%cmm%u%s\n", l_instr_name, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, l_masking ); } } } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_compute_qfma( libxsmm_generated_code* io_generated_code, const unsigned int i_instruction_set, const unsigned int i_vec_instr, const unsigned int i_gp_reg_base, const unsigned int i_gp_reg_idx, const unsigned int i_scale, const int i_displacement, const char i_vector_name, const unsigned int i_vec_reg_number_src, const unsigned int i_vec_reg_number_dest ) { /* @TODO add checks in debug mode */ if ( i_instruction_set != LIBXSMM_X86_AVX512_KNM ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_NO_AVX512_QFMA ); return; } if (libxsmm_is_x86_vec_instr_single_precision( i_vec_instr ) == 0) { fprintf( stderr, "LIBXSMM ERROR: QFMA is only supported for single precision\n" ); exit(-1); } if (i_vec_reg_number_src%4 != 0) { fprintf( stderr, "LIBXSMM ERROR: QFMA source register needs to be a multiple of 4\n" ); exit(-1); } if ( io_generated_code->code_type > 1 ) { unsigned char *buf = (unsigned char *) io_generated_code->generated_code; int i = io_generated_code->code_size; /*int i = *loc;*/ unsigned int l_maxsize = io_generated_code->buffer_size; /* unsigned int l_maxsize = 1024; */ int l_place, l_regc0=0, l_regc1=0, l_regc2=0, l_forced_offset=0; int l_sizereg= 1, l_iregnum=0, l_vregnum=0, l_idxnum=0, l_vregdes2=0; int l_scalemov = 0; int l_instr_off = 0; if ( l_maxsize - i < 20 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL ); return; } switch ( i_vec_instr ) { case LIBXSMM_X86_INSTR_V4FMADDPS: l_instr_off = 0; break; case LIBXSMM_X86_INSTR_V4FMADDSS: l_instr_off = 0x1; break; case LIBXSMM_X86_INSTR_V4FNMADDPS: l_instr_off = 0x10; break; case LIBXSMM_X86_INSTR_V4FNMADDSS: l_instr_off = 0x11; break; case 
LIBXSMM_X86_INSTR_VP4DPWSSD: l_instr_off = -0x48; break; case LIBXSMM_X86_INSTR_VP4DPWSSDS: l_instr_off = -0x47; break; default: fprintf(stderr, "Strange qmadd instruction\n"); exit(-1); } if ( i_gp_reg_base == LIBXSMM_X86_GP_REG_RSP ) { fprintf(stderr, "libxsmm_x86_instruction_vec_compute_qfma isn't designed to work with rsp. Base input off\n"); exit(-1); } if ( i_gp_reg_idx == LIBXSMM_X86_GP_REG_RSP ) { fprintf(stderr, "libxsmm_x86_instruction_vec_compute_qfma isn't designed to work with rsp. idx input off\n"); exit(-1); } if ( /*i_vec_reg_number_dest >= 0 &&*/ i_vec_reg_number_dest <= 7 ) l_regc0 = 0; else if ( i_vec_reg_number_dest >= 8 && i_vec_reg_number_dest <= 15 ) l_regc0 = 0x80; else if ( i_vec_reg_number_dest >=16 && i_vec_reg_number_dest <= 23 ) l_regc0 = 0x10; else if ( i_vec_reg_number_dest >=24 && i_vec_reg_number_dest <= 31 ) l_regc0 = 0x90; if ( /*i_vec_reg_number_src >= 0 &&*/ i_vec_reg_number_src <= 7 ) { l_regc1 = 0x40; l_regc2 = 0x08; } else if ( i_vec_reg_number_src >= 8 && i_vec_reg_number_src <=15 ) { l_regc1=0; l_regc2 = 0x08; } else if ( i_vec_reg_number_src >=16 && i_vec_reg_number_src <=23 ) { l_regc1 =0x40; } else if ( i_vec_reg_number_src >=24 && i_vec_reg_number_src <=31 ) { l_regc1 =0; } if ( (i_gp_reg_base != LIBXSMM_X86_GP_REG_UNDEF) && (i_gp_reg_base >= LIBXSMM_X86_GP_REG_R8) && (i_gp_reg_base <= LIBXSMM_X86_GP_REG_R15) ) { l_regc0 += 0x20; } if ( (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) && (i_gp_reg_idx >= LIBXSMM_X86_GP_REG_R8) && (i_gp_reg_idx <= LIBXSMM_X86_GP_REG_R15) ) { l_regc0 += 0x40; } l_iregnum = i_gp_reg_base % 8; l_idxnum = i_gp_reg_idx % 8; l_vregnum = (int)(i_vec_reg_number_src/4); l_vregnum *= 4; l_vregnum = l_vregnum % 8; l_vregdes2 = i_vec_reg_number_dest % 8; if ( (l_iregnum == 5) && (i_displacement==0) ) { /* Registers like rbp/r13 when you have a displacement of 0, we need */ /* force the single byte of zero to appear. 
*/ l_forced_offset=1; } if ( i_scale == 1 ) l_scalemov = 0x00; else if ( i_scale == 2 ) l_scalemov = 0x40; else if ( i_scale == 4 ) l_scalemov = 0x80; else if ( i_scale == 8 ) l_scalemov = 0xc0; else if ( (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) && /*(i_gp_reg_idx >= LIBXSMM_X86_GP_REG_RAX) &&*/ (i_gp_reg_idx <= LIBXSMM_X86_GP_REG_R15) ) { fprintf(stderr, "libxsmm_x86_instruction_vec_compute_qfma has a strange i_scale parameter\n"); exit(-1); } buf[i++] = 0x62; buf[i++] = (unsigned char)(0xf2 - l_regc0); buf[i++] = (unsigned char)(0x3f + l_regc1 - 8*l_vregnum); buf[i++] = (unsigned char)(0x40 + l_regc2); buf[i++] = (unsigned char)(0x9a + l_instr_off); if ( (i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF) || /*(i_gp_reg_idx < LIBXSMM_X86_GP_REG_RAX) || */ (i_gp_reg_idx > LIBXSMM_X86_GP_REG_R15) ) { l_place = i; l_sizereg = 16; buf[i++] = (unsigned char)(0x00 + l_iregnum + 8*l_vregdes2); } else { l_place = i; buf[i++] = (unsigned char)(0x04 + 8*l_vregdes2); l_sizereg = 16; buf[i++] = (unsigned char)(l_scalemov + l_iregnum + 8*l_idxnum); /* 0x00 + ... 
*/ } /* if ( (l_iregnum == LIBXSMM_X86_GP_REG_RSP) || (l_iregnum == LIBXSMM_X86_GP_REG_RBP) ) { buf[i++] = 0x20 + l_iregnum; } */ i += internal_x86_instructions_add_offset( l_place, i, i_displacement, l_forced_offset, l_sizereg, buf ); io_generated_code->code_size = i; /* *loc = i; */ } else { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; char l_gp_reg_base[4]; char l_gp_reg_idx[4]; char l_instr_name[16]; libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_gp_reg_base, 3 ); libxsmm_get_x86_instr_name( i_vec_instr, l_instr_name, 15 ); /* build vXYZpd/ps/sd/ss instruction pure register use*/ if ( i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) { if ( io_generated_code->code_type == 0 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s), %%%%%cmm%u, %%%%%cmm%u\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base, i_vector_name, i_vec_reg_number_src, i_vector_name, i_vec_reg_number_dest ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s), %%%cmm%u, %%%cmm%u\n", l_instr_name, i_displacement, l_gp_reg_base, i_vector_name, i_vec_reg_number_src, i_vector_name, i_vec_reg_number_dest ); } } else { libxsmm_get_x86_gp_reg_name( i_gp_reg_idx, l_gp_reg_idx, 3 ); if ( io_generated_code->code_type == 0 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s,%%%%%s,%u), %%%%%cmm%u, %%%%%cmm%u\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale, i_vector_name, i_vec_reg_number_src, i_vector_name, i_vec_reg_number_dest ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s,%%%s,%u), %%%cmm%u, %%%cmm%u\n", l_instr_name, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale, i_vector_name, i_vec_reg_number_src, i_vector_name, i_vec_reg_number_dest ); } } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_shuffle_reg( 
libxsmm_generated_code* io_generated_code, const unsigned int i_instruction_set, const unsigned int i_vec_instr, const char i_vector_name, const unsigned int i_vec_reg_number_0, const unsigned int i_vec_reg_number_1, const unsigned int i_vec_reg_number_2, const unsigned int i_shuffle_operand ) { /* @TODO add checks in debug mode */ if ( io_generated_code->code_type > 1 ) { /* @TODO-GREG call encoding here */ unsigned char *buf = (unsigned char *) io_generated_code->generated_code; int i = io_generated_code->code_size; /*int i = *loc;*/ unsigned int l_maxsize = io_generated_code->buffer_size; /*unsigned int l_maxsize = 1024;*/ int l_vecval0 = i_vec_reg_number_0 % 8; int l_vecgrp0 = i_vec_reg_number_0 / 8; int l_oddgrp0 = ((l_vecgrp0 % 2)==1); int l_vecval1 = i_vec_reg_number_1 % 8; int l_vecgrp1 = i_vec_reg_number_1 / 8; int l_oddgrp1 = ((l_vecgrp1 % 2)==1); int l_vecval2 = i_vec_reg_number_2 % 8; int l_vecgrp2 = i_vec_reg_number_2 / 8; int l_oddgrp2 = ((l_vecgrp2 % 2)==1); int l_extra_byte = 0; int l_extra_offset = 0; int l_2or3grp0; int l_2or3grp1; int l_2or3grp2; int l_third = 0, l_fifth = 0; if ( l_maxsize - i < 20 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL ); return; } switch ( i_vec_instr ) { case LIBXSMM_X86_INSTR_VPERM2F128: if ( (i_vector_name!='y') && (i_vector_name!='Y') ) { fprintf(stderr, "libxsmm_x86_instruction_vec_shuffle_reg: VPERM2F128 only works for ymm\n"); exit(-1); } buf[i++] = (unsigned char)(0xc4); buf[i++] = (unsigned char)(0xe3 - l_oddgrp0 * 0x20 - l_oddgrp2 * 0x80); buf[i++] = (unsigned char)(0x7d - l_oddgrp1 * 0x40 - l_vecval1*8); buf[i++] = (unsigned char)(0x06); buf[i++] = (unsigned char)(0xc0 + l_vecval0 + l_vecval2*8); break; case LIBXSMM_X86_INSTR_SHUFPS: if ( (i_vector_name!='x') && (i_vector_name!='X') ) { fprintf(stderr, "libxsmm_x86_instruction_vec_shuffle_reg: SHUFPS only works for xmm\n"); exit(-1); } l_vecgrp0 = 0; l_vecgrp1 = 0; if ( (i_vec_reg_number_0>=8) && (i_vec_reg_number_0<=15) ) 
l_vecgrp0 =1; if ( (i_vec_reg_number_1>=8) && (i_vec_reg_number_1<=15) ) l_vecgrp1 =1; if ( (l_vecgrp0 >= 1) || (l_vecgrp1 >= 1) ) { if ( l_vecgrp0 >= 1 ) l_extra_byte += 1; if ( l_vecgrp1 >= 1 ) l_extra_byte += 4; buf[i++] = (unsigned char)(0x40 + l_extra_byte); } buf[i++] = (unsigned char)(0x0f); buf[i++] = (unsigned char)(0xc6); buf[i++] = (unsigned char)(0xc0 + l_vecval0 + l_vecval1*8); break; case LIBXSMM_X86_INSTR_SHUFPD: if ( (i_vector_name!='x') && (i_vector_name!='X') ) { fprintf(stderr, "libxsmm_x86_instruction_vec_shuffle_reg: SHUFPD only works for xmm\n"); exit(-1); } l_vecgrp0 = 0; l_vecgrp1 = 0; if ( (i_vec_reg_number_0>=8) && (i_vec_reg_number_0<=15) ) l_vecgrp0 =1; if ( (i_vec_reg_number_1>=8) && (i_vec_reg_number_1<=15) ) l_vecgrp1 =1; if ( (l_vecgrp0 >= 1) || (l_vecgrp1 >= 1) ) { buf[i++] = (unsigned char)(0x66); l_extra_byte = 0x22; if ( l_vecgrp0 >= 1 ) l_extra_byte += 3; } buf[i++] = (unsigned char)(0x66 - l_extra_byte); buf[i++] = (unsigned char)(0x0f); buf[i++] = (unsigned char)(0xc6); buf[i++] = (unsigned char)(0xc0 + l_vecval0 + l_vecval1*8); break; case LIBXSMM_X86_INSTR_VSHUFPS: if ( (i_vector_name=='x') || (i_vector_name=='X') ) { fprintf(stderr, "libxsmm_x86_instruction_vec_shuffle_reg: VSHUFPS not working for xmm\n"); exit(-1); } if ( (i_vector_name=='y') || (i_vector_name=='Y') ) { if ( l_vecgrp0 >= 1 ) { buf[i++] = (unsigned char)(0xc4); if ( l_vecgrp2 >= 1 ) { l_extra_byte = 0x84; l_extra_offset = 0x80; } else { l_extra_byte = 0x04; } } buf[i++] = (unsigned char)(0xc5 - l_extra_byte); buf[i++] = (unsigned char)(0xfc - l_extra_offset - l_oddgrp0 * 0x80 - l_oddgrp1 * 0x40 - l_oddgrp2 * 0x80 - l_vecval1*8); buf[i++] = (unsigned char)(0xc6); buf[i++] = (unsigned char)(0xc0 + l_vecval0 + l_vecval2*8); } else if ( (i_vector_name=='z') || (i_vector_name=='Z') ) { l_2or3grp0 = (l_vecgrp0>=2); l_2or3grp1 = (l_vecgrp1>=2); l_2or3grp2 = (l_vecgrp2>=2); buf[i++] = (unsigned char)(0x62); buf[i++] = (unsigned char)(0xf1 - l_oddgrp0 * 0x20 - 
l_oddgrp2 * 0x80 - l_2or3grp0 * 0x40 - l_2or3grp2 * 0x10); buf[i++] = (unsigned char)(0x7c - l_oddgrp1 * 0x40 - l_vecval1*8); buf[i++] = (unsigned char)(0x48 - l_2or3grp1 * 0x08); buf[i++] = (unsigned char)(0xc6); buf[i++] = (unsigned char)(0xc0 + l_vecval0 + l_vecval2*8); } else { fprintf(stderr, "libxsmm_x86_instruction_vec_shuffle_reg: unknown i_vector_name=%c for VSHUFPS\n",i_vector_name); exit(-1); } break; case LIBXSMM_X86_INSTR_VSHUFPD: if ( (i_vector_name=='x') || (i_vector_name=='X') ) { fprintf(stderr, "libxsmm_x86_instruction_vec_shuffle_reg: VSHUFPD not working for xmm\n"); exit(-1); } if ( (i_vector_name=='y') || (i_vector_name=='Y') ) { if ( l_vecgrp0 >= 1 ) { buf[i++] = (unsigned char)(0xc4); if ( l_vecgrp2 >= 1 ) { l_extra_byte = 0x84; l_extra_offset = 0x80; } else { l_extra_byte = 0x04; } } buf[i++] = (unsigned char)(0xc5 - l_extra_byte); /* Only differs from VSHUFS on the 2nd byte here */ buf[i++] = (unsigned char)(0xfd - l_extra_offset - l_oddgrp0 * 0x80 - l_oddgrp1 * 0x40 - l_oddgrp2 * 0x80 - l_vecval1*8); buf[i++] = (unsigned char)(0xc6); buf[i++] = (unsigned char)(0xc0 + l_vecval0 + l_vecval2*8); } else if ( (i_vector_name=='z') || (i_vector_name=='Z') ) { l_2or3grp0 = (l_vecgrp0>=2); l_2or3grp1 = (l_vecgrp1>=2); l_2or3grp2 = (l_vecgrp2>=2); buf[i++] = (unsigned char)(0x62); buf[i++] = (unsigned char)(0xf1 - l_oddgrp0 * 0x20 - l_oddgrp2 * 0x80 - l_2or3grp0 * 0x40 - l_2or3grp2 * 0x10); /* Only differs from VSHUFS on the 3rd byte here */ buf[i++] = (unsigned char)(0xfd - l_oddgrp1 * 0x40 - l_vecval1*8); buf[i++] = (unsigned char)(0x48 - l_2or3grp1 * 0x08); buf[i++] = (unsigned char)(0xc6); buf[i++] = (unsigned char)(0xc0 + l_vecval0 + l_vecval2*8); } else { fprintf(stderr, "libxsmm_x86_instruction_vec_shuffle_reg: unknown i_vector_name=%c for VSHUFPD\n",i_vector_name); exit(-1); } break; case LIBXSMM_X86_INSTR_VPSRAD: if ( i_vec_reg_number_2 != LIBXSMM_X86_VEC_REG_UNDEF ) { fprintf(stderr,"libxsmm_x86_instruction_vec_shuffle_reg: shouldn't use 
vec reg 2 for VPSRAD\n"); exit(-1); } l_2or3grp0 = (l_vecgrp0>=2); l_2or3grp1 = (l_vecgrp1>=2); buf[i++] = (unsigned char)(0x62); buf[i++] = (unsigned char)(0xf1 - l_oddgrp0 * 0x20 - l_2or3grp0 * 0x40); buf[i++] = (unsigned char)(0x7d - l_oddgrp1 * 0x40 - l_vecval1*8); buf[i++] = (unsigned char)(0x48 - l_2or3grp1 * 0x08); buf[i++] = (unsigned char)(0x72); buf[i++] = (unsigned char)(0xe0 + l_vecval0); break; case LIBXSMM_X86_INSTR_VPSLLD: if ( i_vec_reg_number_2 != LIBXSMM_X86_VEC_REG_UNDEF ) { fprintf(stderr,"libxsmm_x86_instruction_vec_shuffle_reg: shouldn't use vec reg 2 for VPSLLD\n"); exit(-1); } l_2or3grp0 = (l_vecgrp0>=2); l_2or3grp1 = (l_vecgrp1>=2); buf[i++] = (unsigned char)(0x62); buf[i++] = (unsigned char)(0xf1 - l_oddgrp0 * 0x20 - l_2or3grp0 * 0x40); buf[i++] = (unsigned char)(0x7d - l_oddgrp1 * 0x40 - l_vecval1*8); buf[i++] = (unsigned char)(0x48 - l_2or3grp1 * 0x08); buf[i++] = (unsigned char)(0x72); buf[i++] = (unsigned char)(0xf0 + l_vecval0); break; case LIBXSMM_X86_INSTR_VPSRLD: if ( i_vec_reg_number_2 != LIBXSMM_X86_VEC_REG_UNDEF ) { fprintf(stderr,"libxsmm_x86_instruction_vec_shuffle_reg: VPSRLD requires vec2 be undef\n"); exit(-1); } if ( (i_vector_name!='z') && (i_vector_name!='Z') ) { fprintf(stderr, "libxsmm_x86_instruction_vec_shuffle_reg: VPSRLD only works for zmm\n"); exit(-1); } l_2or3grp0 = (l_vecgrp0>=2); l_2or3grp1 = (l_vecgrp1>=2); buf[i++] = (unsigned char)(0x62); buf[i++] = (unsigned char)(0xf1 - l_oddgrp0 * 0x20 - l_2or3grp0 * 0x40); buf[i++] = (unsigned char)(0x7d - l_oddgrp1 * 0x40 - l_vecval1*8); buf[i++] = (unsigned char)(0x48 - l_2or3grp1 * 0x08); buf[i++] = (unsigned char)(0x72); buf[i++] = (unsigned char)(0xd0 + l_vecval0); break; case LIBXSMM_X86_INSTR_VSHUFF64X2: case LIBXSMM_X86_INSTR_VSHUFF32X4: case LIBXSMM_X86_INSTR_VSHUFI32X4: case LIBXSMM_X86_INSTR_VSHUFI64X2: l_2or3grp0 = (l_vecgrp0>=2); l_2or3grp1 = (l_vecgrp1>=2); l_2or3grp2 = (l_vecgrp2>=2); if ( (i_vec_instr == LIBXSMM_X86_INSTR_VSHUFF32X4) || (i_vec_instr == 
LIBXSMM_X86_INSTR_VSHUFI32X4) ) l_third = -0x80;
      /* VSHUFI32X4/VSHUFI64X2 select the +0x20 opcode variant emitted below */
      if ( (i_vec_instr == LIBXSMM_X86_INSTR_VSHUFI32X4) || (i_vec_instr == LIBXSMM_X86_INSTR_VSHUFI64X2) ) l_fifth = 0x20;
      if ( (i_vector_name!='z') && (i_vector_name!='Z') ) {
        fprintf(stderr, "libxsmm_x86_instruction_vec_shuffle_reg: VSHUF[IF][36][24]X[24] only works for zmm\n");
        exit(-1);
      }
      /* 6-byte 0x62-prefixed encoding; register-group bits are folded into bytes 2-4 */
      buf[i++] = (unsigned char)(0x62);
      buf[i++] = (unsigned char)(0xf3 - l_oddgrp0 * 0x20 - l_oddgrp2 * 0x80 - l_2or3grp0 * 0x40 - l_2or3grp2 * 0x10);
      buf[i++] = (unsigned char)(0xfd + l_third - l_oddgrp1 * 0x40 - l_vecval1*8);
      buf[i++] = (unsigned char)(0x48 - l_2or3grp1 * 0x08);
      buf[i++] = (unsigned char)(0x23 + l_fifth);
      buf[i++] = (unsigned char)(0xc0 + l_vecval0 + l_vecval2*8);
      break;
    case LIBXSMM_X86_INSTR_VEXTRACTF32X8:
      l_2or3grp0 = (l_vecgrp0>=2);
      l_2or3grp1 = (l_vecgrp1>=2);
      /* extract is a two-operand form: the third vector register must be unused */
      if ( i_vec_reg_number_2 != LIBXSMM_X86_VEC_REG_UNDEF ) {
        fprintf(stderr,"libxsmm_x86_instruction_vec_shuffle_reg: VEXTRACTF32X8 requires vec2 be undef\n");
        exit(-1);
      }
      if ( (i_vector_name!='z') && (i_vector_name!='Z') ) {
        fprintf(stderr, "libxsmm_x86_instruction_vec_shuffle_reg: VEXTRACTF32X8 only works for zmm\n");
        exit(-1);
      }
      buf[i++] = (unsigned char)(0x62);
      buf[i++] = (unsigned char)(0xf3 - l_oddgrp0 * 0x80 - l_oddgrp1 * 0x20 - l_2or3grp0 * 0x10 - l_2or3grp1 * 0x40);
      buf[i++] = (unsigned char)(0x7d);
      buf[i++] = (unsigned char)(0x48);
      buf[i++] = (unsigned char)(0x1b);
      buf[i++] = (unsigned char)(0xc0 + l_vecval0*8 + l_vecval1);
      break;
    case LIBXSMM_X86_INSTR_VEXTRACTF64X4:
      l_2or3grp0 = (l_vecgrp0>=2);
      l_2or3grp1 = (l_vecgrp1>=2);
      if ( i_vec_reg_number_2 != LIBXSMM_X86_VEC_REG_UNDEF ) {
        fprintf(stderr,"libxsmm_x86_instruction_vec_shuffle_reg: VEXTRACTF64x4 requires vec2 be undef\n");
        exit(-1);
      }
      if ( (i_vector_name!='z') && (i_vector_name!='Z') ) {
        fprintf(stderr, "libxsmm_x86_instruction_vec_shuffle_reg: VEXTRACTF64x4 only works for zmm\n");
        exit(-1);
      }
      /* identical byte sequence to VEXTRACTF32X8 except 0xfd instead of 0x7d in byte 3 */
      buf[i++] = (unsigned char)(0x62);
      buf[i++] = (unsigned char)(0xf3 - l_oddgrp0 * 0x80 - l_oddgrp1 * 0x20 - l_2or3grp0 * 0x10 - l_2or3grp1 * 0x40);
      buf[i++] = (unsigned char)(0xfd);
      buf[i++] = (unsigned char)(0x48);
      buf[i++] = (unsigned char)(0x1b);
      buf[i++] = (unsigned char)(0xc0 + l_vecval0*8 + l_vecval1);
      break;
    default:
      fprintf(stderr, "libxsmm_x86_instruction_vec_shuffle_reg doesn't yet do this instruction\n");
      exit(-1);
  }
  /* Every instruction in this group has 1 byte at the end with the operand */
  buf[i++] = (unsigned char)(i_shuffle_operand);
  io_generated_code->code_size = i;
  /* *loc = i; */
} else {
  /* textual path: render the instruction as (inline-)assembly source */
  char l_new_code[512];
  int l_max_code_length = 511;
  int l_code_length = 0;
  char l_instr_name[16];
  libxsmm_get_x86_instr_name( i_vec_instr, l_instr_name, 15 );
  if ( i_instruction_set != LIBXSMM_X86_SSE3 ) {
    /* three-operand form (code_type 0 wraps the text as a C inline-asm string) */
    if ( io_generated_code->code_type == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s $%u, %%%%%cmm%u, %%%%%cmm%u, %%%%%cmm%u\\n\\t\"\n", l_instr_name, i_shuffle_operand, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, i_vector_name, i_vec_reg_number_2 );
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s $%u, %%%cmm%u, %%%cmm%u, %%%cmm%u\n", l_instr_name, i_shuffle_operand, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1, i_vector_name, i_vec_reg_number_2 );
    }
  } else {
    /* two-operand SSE form */
    if ( io_generated_code->code_type == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s $%u, %%%%%cmm%u, %%%%%cmm%u\\n\\t\"\n", l_instr_name, i_shuffle_operand, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s $%u, %%%cmm%u, %%%cmm%u\n", l_instr_name, i_shuffle_operand, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
    }
  }
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
}
}

/* Emit a gather instruction (VGATHERDPS/VGATHERDPD/VGATHERQPS/VGATHERQPD):
 * as raw bytes when code_type > 1 (zmm registers only, gathers only), otherwise
 * as assembly text. Scatters (i_is_gather == 0) are rejected in both paths. */
LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_move_gathscat( libxsmm_generated_code* io_generated_code,
                                                                   const unsigned int      i_instruction_set,
                                                                   const unsigned int      i_vmove_instr,
                                                                   const char              i_vector_name,
                                                                   const unsigned int      i_gp_reg_base,
                                                                   const unsigned int      i_vec_reg_idx,
                                                                   const unsigned int      i_scale,
                                                                   const int               i_displacement,
                                                                   const unsigned int      i_vec_reg_number,
                                                                   const unsigned int      i_mask_reg_number,
                                                                   const unsigned int      i_is_gather ) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size;
    /* int i = *loc; */
    unsigned int l_maxsize = io_generated_code->buffer_size;
    /* unsigned int l_maxsize = 1024; */
    int l_sizereg = 0;       /* element size used to scale the displacement */
    int l_instr_offset = 0;  /* +0x80 distinguishes the PD from the PS forms */
    int l_instr_offset2 = 0; /* +1 distinguishes the Q-index from the D-index forms */
    int l_forced_offset = 0;
    if ( l_maxsize - i < 20 ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL );
      return;
    }
    switch ( i_vmove_instr ) {
      case LIBXSMM_X86_INSTR_VGATHERDPS:
        l_sizereg = 4; l_instr_offset = 0; l_instr_offset2 = 0;
        break;
      case LIBXSMM_X86_INSTR_VGATHERDPD:
        l_sizereg = 8; l_instr_offset = 0x80; l_instr_offset2 = 0;
        break;
      case LIBXSMM_X86_INSTR_VGATHERQPS:
        l_sizereg = 4; l_instr_offset = 0; l_instr_offset2 = 1;
        break;
      case LIBXSMM_X86_INSTR_VGATHERQPD:
        l_sizereg = 8; l_instr_offset = 0x80; l_instr_offset2 = 1;
        break;
      default:
        fprintf(stderr, "libxsmm_x86_instruction_vec_move_gathscat: Strange gather/scatter instruction:%u\n",i_vmove_instr);
        exit(-1);
    }
    if ( i_vector_name != 'z' ) {
      fprintf(stderr, "libxsmm_x86_instruction_vec_move_gathscat: encoder only implemented for zmm registers, but notice that i_vector_name=%c\n",i_vector_name);
      exit(-1);
    }
    if ( i_is_gather == 0 ) {
      fprintf(stderr, "libxsmm_x86_instruction_vec_move_gathscat: encoder not implemented for scatters yet\n");
      exit(-1);
    }
    { /* open a new scope to avoid warning about mixed declaration and code (C89) */
      int l_regbas0 = i_gp_reg_base % 8;
      int l_gp8     = ((i_gp_reg_base > 7)&&(i_gp_reg_base<=15)?1:0);
      /* index vector register (the gather index) split into value/group bits */
      int l_vecval1 = i_vec_reg_number % 8;
      int l_vecgrp1 = i_vec_reg_number / 8;
      int l_oddgrp1 = ((l_vecgrp1 % 2)==1);
      int l_2or3grp1 = (l_vecgrp1>=2);
      /* destination vector register split into value/group bits */
      int l_vecval0 = i_vec_reg_idx % 8;
      int l_vecgrp0 = i_vec_reg_idx / 8;
      int l_oddgrp0 = ((l_vecgrp0 % 2)==1);
      int l_2or3grp0 = (l_vecgrp0>=2);
      int l_sca=0; /* SIB scale bits */
      if (i_scale==2) l_sca=0x40;
      else if (i_scale==4) l_sca=0x80;
      else if (i_scale==8) l_sca=0xc0;
      buf[i++] = (unsigned char)(0x62);
      buf[i++] = (unsigned char)(0xf2 - l_gp8 * 0x20 - l_oddgrp0 * 0x40 - l_oddgrp1 * 0x80 - l_2or3grp1 * 0x10);
      buf[i++] = (unsigned char)(0x7d + l_instr_offset);
      buf[i++] = (unsigned char)(0x48 - l_2or3grp0 * 0x08 + i_mask_reg_number);
      buf[i++] = (unsigned char)(0x92 + l_instr_offset2);
      buf[i++] = (unsigned char)(0x04 + l_vecval1 * 8);
      buf[i++] = (unsigned char)(0x00 + l_sca + l_regbas0 + l_vecval0 * 8);
      /* rbp/r13 as base with zero displacement still need an explicit offset byte */
      if ( (l_regbas0 == 5) && (i_displacement==0) ) {
        l_forced_offset = 1;
      }
      i += internal_x86_instructions_add_offset( i-2, i, i_displacement, l_forced_offset, l_sizereg, buf );
      io_generated_code->code_size = i;
      /* *loc = i; */
    }
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_instr_name[16];
    char l_gp_reg_base_name[4];
    libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_gp_reg_base_name, 3 );
    libxsmm_get_x86_instr_name( i_vmove_instr, l_instr_name, 15 );
    if ( i_is_gather == 0 ) {
      fprintf(stderr, "LIBXSMM ERROR: libxsmm_x86_instruction_vec_move_gathscat yet needs to be implemented for scatters!\n");
      exit(-1);
    } else {
      if ( i_instruction_set >= LIBXSMM_X86_AVX512 ) {
        /* masked gather: zmm destination written under mask register k<i_mask_reg_number> */
        if ( io_generated_code->code_type == 0 ) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s,%%%%zmm%u,%u), %%%%zmm%u%%{%%%%k%u%%}\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base_name, i_vec_reg_idx, i_scale, i_vec_reg_number, i_mask_reg_number);
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s,%%zmm%u,%u), %%zmm%u{%%k%u}\n", l_instr_name, i_displacement, l_gp_reg_base_name, i_vec_reg_idx, i_scale, i_vec_reg_number, i_mask_reg_number );
        }
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      } else {
        fprintf(stderr, "LIBXSMM ERROR: libxsmm_x86_instruction_vec_move_gathscat yet needs to be implemented for non-AVX512F!\n");
        exit(-1);
      }
    }
  }
}

/* Emit a prefetch instruction (PREFETCHT0/T1/T2/NTA) for [base + idx*scale + disp]:
 * raw 0x0f 0x18 encoding when code_type > 1, assembly text otherwise.
 * VPREFETCH0/1 are recognized but rejected. */
LIBXSMM_API_INTERN void libxsmm_x86_instruction_prefetch( libxsmm_generated_code* io_generated_code,
                                                          const unsigned int      i_prefetch_instr,
                                                          const unsigned int      i_gp_reg_base,
                                                          const unsigned int      i_gp_reg_idx,
                                                          const unsigned int      i_scale,
                                                          const int               i_displacement ) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size;
    /* int i = *loc; */
    unsigned int l_maxsize = io_generated_code->buffer_size;
    /* unsigned int l_maxsize = 1024; */
    int l_instype = 0;        /* adjusts the ModRM reg field to pick the hint (T0/T1/T2/NTA) */
    int l_forced_offset=0;
    int l_regbas0 = i_gp_reg_base % 8;
    int l_gp8 = ((i_gp_reg_base > 7) && (i_gp_reg_base <= 15) ? 1 : 0);
    int l_ix8 = ((i_gp_reg_idx > 7) && (i_gp_reg_idx <= 15) ? 1 : 0);
    int l_sse_preamble = 64;  /* optional REX byte (0x40 + base/idx extension bits) */
    int l_place1 = i + 2;     /* position of the ModRM byte for the displacement writer */
    if ( l_maxsize - i < 20 ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL );
      return;
    }
    if ( ((int)i_gp_reg_base < LIBXSMM_X86_GP_REG_RAX) ||
         ((int)i_gp_reg_base > LIBXSMM_X86_GP_REG_R15) ||
         (i_gp_reg_base > 15) ||
         ((int)i_gp_reg_base == LIBXSMM_X86_GP_REG_UNDEF) ) {
      fprintf(stderr, "libxsmm_instruction_prefetch: i_gp_reg_base error in libxsmm_instruction_prefetch\n");
      exit(-1);
    }
    switch ( i_prefetch_instr ) {
      case LIBXSMM_X86_INSTR_PREFETCHT0:
        l_instype -= 8;
        break;
      case LIBXSMM_X86_INSTR_PREFETCHT1:
        break;
      case LIBXSMM_X86_INSTR_PREFETCHT2:
        l_instype += 8;
        break;
      case LIBXSMM_X86_INSTR_PREFETCHNTA:
        l_instype -= 16;
        break;
      case LIBXSMM_X86_INSTR_VPREFETCH0:
        fprintf(stderr, "libxsmm_instruction_prefetch: don't yet do vprefetch0\n");
        exit(-1);
        break;
      case LIBXSMM_X86_INSTR_VPREFETCH1:
        fprintf(stderr, "libxsmm_instruction_prefetch: don't yet do vprefetch1\n");
        exit(-1);
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_prefetch: Strange prefetch instruction: %u\n",i_prefetch_instr);
        exit(-1);
    }
    /* only emit the extra prefix byte when an extended (r8-r15) register is involved */
    if ( l_gp8 || l_ix8 ) {
      if (l_gp8) l_sse_preamble += 1;
      if (l_ix8) l_sse_preamble += 2;
      buf[i++] = (unsigned char)l_sse_preamble;
      ++l_place1;
    }
    if (i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) {
      LIBXSMM_ASSERT(i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF);
      buf[i++] = 0x0f;
      buf[i++] = 0x18;
      buf[i++] = (unsigned char)(0x10 + l_instype + l_regbas0);
      /* rsp/r12 base requires an explicit SIB byte */
      if ( l_regbas0 == 4 ) buf[i++]=0x24;
    } else {
      const int l_regidx = i_gp_reg_idx % 8;
      int l_sca = 0;
      if (i_scale == 2) l_sca = 0x40;
      else if (i_scale == 4) l_sca = 0x80;
      else if (i_scale == 8) l_sca = 0xc0;
      buf[i++] = 0x0f;
      buf[i++] = 0x18;
      buf[i++] = (unsigned char)(0x14 + l_instype);
      buf[i++] = (unsigned char)(0x00 + l_sca + l_regbas0 + l_regidx*8);
    }
    if ( ( l_regbas0 == 5) && (i_displacement==0) ) {
      /* Registers like rbp/r13 when you have a displacement of 0, we need * force
       * the single byte of zero to appear. */
      l_forced_offset = 1;
    }
    i += internal_x86_instructions_add_offset( l_place1, i, i_displacement, l_forced_offset, 1, buf );
    io_generated_code->code_size = i;
    /* *loc = i; */
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_base_name[4];
    char l_gp_reg_idx_name[4];
    char l_instr_name[16];
    libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_gp_reg_base_name, 3 );
    libxsmm_get_x86_instr_name( i_prefetch_instr, l_instr_name, 15 );
    if ( io_generated_code->code_type == 0 ) {
      if (i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s)\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base_name );
      } else {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_idx, l_gp_reg_idx_name, 3 );
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s,%%%%%s,%u)\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base_name, l_gp_reg_idx_name, i_scale );
      }
    } else {
      if (i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s)\n", l_instr_name, i_displacement, l_gp_reg_base_name );
      } else {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_idx, l_gp_reg_idx_name, 3 );
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s,%%%s,%u)\n", l_instr_name, i_displacement, l_gp_reg_base_name, l_gp_reg_idx_name, i_scale );
      }
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}

/* Emit an ALU instruction with a memory operand (MOVSLQ/MOVQ/LEAQ/MOVL) between
 * [base + idx*scale + disp] and a GP register. Binary encoding only: this function
 * has no text-emission branch, so it is a no-op when code_type <= 1. */
LIBXSMM_API_INTERN void libxsmm_x86_instruction_alu_mem( libxsmm_generated_code* io_generated_code,
                                                         const unsigned int      i_alu_instr,
                                                         const unsigned int      i_gp_reg_base,
                                                         const unsigned int      i_gp_reg_idx,
                                                         const unsigned int      i_scale,
                                                         const int               i_displacement,
                                                         const unsigned int      i_gp_reg_number,
                                                         const unsigned int      i_is_store ) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size;
    int l_inst = 0x00, l_base = 0x00, l_place2 = i+2;
    int l_regbas0, l_gp8, l_regnum, l_nx8, l_sca = 0, l_forced_offset=0;
    switch ( i_alu_instr ) {
      case LIBXSMM_X86_INSTR_MOVSLQ:
        if ( i_is_store == 1 ) {
          fprintf(stderr, "libxsmm_instruction_alu_mem: only use LIBXSMM_X86_INSTR_MOVSLQ with loads\n");
          exit(-1);
        }
        break;
      case LIBXSMM_X86_INSTR_MOVQ:
        /* opcode offset depends on data direction (store vs. load) */
        if ( i_is_store == 1 ) {
          l_inst = 0x26;
        } else {
          l_inst = 0x28;
        }
        break;
      case LIBXSMM_X86_INSTR_LEAQ:
        l_inst = 0x2A;
        break;
      case LIBXSMM_X86_INSTR_MOVL:
        if ( i_is_store == 1 ) {
          l_inst = 0x26;
        } else {
          l_inst = 0x28;
        }
        l_base = -8; /* MOVL drops the REX.W-style 0x48 base byte value */
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_alu_mem: Unknown instruction: %u\n", i_alu_instr);
        exit(-1);
    }
    l_regbas0 = i_gp_reg_base % 8;
    l_gp8 = ((i_gp_reg_base > 7)&&(i_gp_reg_base<=15)?1:0);
    l_regnum = i_gp_reg_number % 8;
    l_nx8 = ((i_gp_reg_number>7)&&(i_gp_reg_number<=15)?1:0);
    if (i_scale==2) l_sca=0x40;
    else if (i_scale==4) l_sca=0x80;
    else if (i_scale==8) l_sca=0xc0;
    if (i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) {
      /* no index register: ModRM addressing without SIB (except rsp/r12) */
      if ((i_alu_instr != LIBXSMM_X86_INSTR_MOVL) || l_gp8 || l_nx8 ) {
        buf[i++] = (unsigned char)(0x48 + l_base + l_gp8 * 0x01 + l_nx8 * 0x04);
      } else {
        l_place2 = i+1; /* no prefix byte emitted, ModRM sits one byte earlier */
      }
      buf[i++] = (unsigned char)(0x63 + l_inst);
      buf[i++] = (unsigned char)(l_sca + l_regbas0 + l_regnum * 0x08);
      if ( l_regbas0 == 4 ) /* rsp or r12 */
      {
        buf[i++] = 0x24;
      }
    } else {
      int l_regidx = i_gp_reg_idx % 8;
      int l_ix8 = ((i_gp_reg_idx > 7)&&(i_gp_reg_idx<=15)?1:0);
      if ((i_alu_instr != LIBXSMM_X86_INSTR_MOVL) || l_gp8 || l_nx8 || l_ix8 ) {
        buf[i++] = (unsigned char)(0x48 + l_base + l_gp8 * 0x01 + l_ix8 * 0x02 + l_nx8 * 0x04);
      } else {
        l_place2 = i+1;
      }
      buf[i++] = (unsigned char)(0x63 + l_inst);
      buf[i++] = (unsigned char)(0x04 + l_regnum * 0x08);
      buf[i++] = (unsigned char)(l_sca + l_regbas0 + l_regidx*8);
    }
    /* rbp/r13 base with zero displacement still needs an explicit offset byte */
    if ( (l_regbas0 == 5) && (i_displacement==0) ) {
      l_forced_offset = 1;
    }
    i += internal_x86_instructions_add_offset( l_place2, i, i_displacement, l_forced_offset, 1, buf );
    io_generated_code->code_size = i;
  }
}

/* Emit an ALU instruction with an immediate (ADDQ/SALQ/IMUL/SUBQ/ANDQ/MOVQ/CMPQ).
 * Small immediates (-128..127) use the 1-byte immediate form except for MOVQ,
 * which always takes the 4-byte form. */
LIBXSMM_API_INTERN void libxsmm_x86_instruction_alu_imm( libxsmm_generated_code* io_generated_code,
                                                         const unsigned int      i_alu_instr,
                                                         const unsigned int      i_gp_reg_number,
                                                         const long long         i_immediate ) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size;
    int l_first = 0;
    int l_second = 0;
    int l_third = 0;
    int l_reg0 = 0;
    int l_extra = 0;
    int l_unsignedadj = 0;
    int l_r8adjment = 1;
    int l_reg0multiplier = 1;
    if (NULL == buf) {
      LIBXSMM_HANDLE_ERROR(io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL);
      return;
    }
    switch ( i_alu_instr ) {
      case LIBXSMM_X86_INSTR_ADDQ:
        break;
      case LIBXSMM_X86_INSTR_SALQ:
        if ( (i_immediate < 0) || (i_immediate > 127) ) {
          fprintf(stderr, "libxsmm_instruction_alu_imm is using an out-of-range immediate for salq.\n"
                          "because other immediates are signed but salq is unsigned. So this code\n"
                          "should be changed if you want an immediate in this range.\n");
          exit(-1);
        }
        l_unsignedadj = 0x3e;
        l_third += 0x20;
        break;
      case LIBXSMM_X86_INSTR_IMUL:
        /* Note: we assume that if you call imul in alu_imm you mean: something
         * like imul $3,%rax,%rax. That is, we assume that i_gp_reg_number is
         * used twice */
        l_unsignedadj = -0x18;
        l_extra -= 0x18;
        l_r8adjment = 0x05;
        l_reg0multiplier = 9; /* We are adjusting by 1 and 8 at the same time */
        break;
      case LIBXSMM_X86_INSTR_SUBQ:
        l_second += 0x28;
        l_third += 0x28;
        break;
      case LIBXSMM_X86_INSTR_ANDQ:
        l_second += 0x20;
        l_third += 0x20;
        break;
      case LIBXSMM_X86_INSTR_MOVQ:
        l_second += 0x46;
        l_extra += 0x46;
        break;
      case LIBXSMM_X86_INSTR_CMPQ:
        l_second += 0x38;
        l_third += 0x38;
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_alu_imm: Unknown instruction type: %u\n",i_alu_instr);
        exit(-1);
    }
    /* r8-r15 need the extension bit in the prefix byte */
    if ( (i_gp_reg_number > 7) && (i_gp_reg_number <= 15) ) {
      l_first += l_r8adjment;
      l_reg0 = i_gp_reg_number - 8;
    } else {
      l_reg0 = i_gp_reg_number;
    }
    if ( (i_immediate <= 127) && (i_immediate >= -128) &&
         (i_alu_instr!=LIBXSMM_X86_INSTR_MOVQ) ) {
      /* one byte (even for 0!) - but never for MOVQ */
      buf[i++] = (unsigned char)(0x48 + l_first);
      buf[i++] = (unsigned char)(0x83 + l_unsignedadj);
      buf[i++] = (unsigned char)(0xc0 + l_third + l_reg0*l_reg0multiplier);
      buf[i++] = (unsigned char)(i_immediate);
    } else {
      /* four bytes */
      unsigned char *l_cptr = (unsigned char *) &i_immediate;
      buf[i++] = (unsigned char)(0x48 + l_first);
      if ( i_gp_reg_number==0 && ((i_alu_instr==LIBXSMM_X86_INSTR_SUBQ) ||
                                  (i_alu_instr==LIBXSMM_X86_INSTR_CMPQ) ||
                                  (i_alu_instr==LIBXSMM_X86_INSTR_ADDQ) ||
                                  (i_alu_instr==LIBXSMM_X86_INSTR_ANDQ)) ) {
        /* special case for %rax! */
        buf[i++] = (unsigned char)(0x05 + l_second);
      } else {
        buf[i++] = (unsigned char)(0x81 + l_extra);
        buf[i++] = (unsigned char)(0xc0 + l_third + l_reg0*l_reg0multiplier);
      }
      /* little-endian copy of the low 32 immediate bits */
      buf[i++] = l_cptr[0];
      buf[i++] = l_cptr[1];
      buf[i++] = l_cptr[2];
      buf[i++] = l_cptr[3];
    }
    io_generated_code->code_size = i;
    /* *loc = i; */
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_name[4];
    char l_instr_name[16];
    libxsmm_get_x86_gp_reg_name( i_gp_reg_number, l_gp_reg_name, 3 );
    libxsmm_get_x86_instr_name( i_alu_instr, l_instr_name, 15 );
    if ( io_generated_code->code_type == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s $%lli, %%%%%s\\n\\t\"\n", l_instr_name, i_immediate, l_gp_reg_name );
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s $%lli, %%%s\n", l_instr_name, i_immediate, l_gp_reg_name );
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}

/* Emit MOVQ of a full 64-bit immediate into a GP register (0x48/0x49, 0xb8+reg,
 * followed by the 8 immediate bytes). Only MOVQ is supported. */
LIBXSMM_API_INTERN void libxsmm_x86_instruction_alu_imm_i64( libxsmm_generated_code* io_generated_code,
                                                             const unsigned int      i_alu_instr,
                                                             const unsigned int      i_gp_reg_number,
                                                             const size_t            i_immediate ) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    unsigned char *l_cptr = (unsigned char *) &i_immediate;
    int i = io_generated_code->code_size;
    int l_first = 0;
    int l_reg0 = 0;
    if ( i_alu_instr != LIBXSMM_X86_INSTR_MOVQ ) {
      fprintf(stderr,"How are you doing a 64-byte immediate on instruction: %u\n",i_alu_instr);
      exit(-1);
    }
    if ( /*i_gp_reg_number < 0 ||*/ i_gp_reg_number > 15 ) {
      fprintf(stderr,"libxsmm_x86_instruction_alu_imm_i64 strange gp reg=%u\n",i_gp_reg_number);
      exit(-1);
    }
    l_reg0 = i_gp_reg_number;
    if ( i_gp_reg_number >= 8 ) {
      l_first = 1;
      l_reg0 -= 8;
    }
    buf[i++]= (unsigned char)(0x48 + l_first);
    buf[i++]= (unsigned char)(0xb8 + l_reg0);
    /* little-endian copy of all 8 immediate bytes */
    buf[i++] = l_cptr[0];
    buf[i++] = l_cptr[1];
    buf[i++] = l_cptr[2];
    buf[i++] = l_cptr[3];
    buf[i++] = l_cptr[4];
    buf[i++] = l_cptr[5];
    buf[i++] = l_cptr[6];
    buf[i++] = l_cptr[7];
    io_generated_code->code_size = i;
    /* *loc = i; */
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_name[4];
    char l_instr_name[16];
    libxsmm_get_x86_gp_reg_name( i_gp_reg_number, l_gp_reg_name, 3 );
    libxsmm_get_x86_instr_name( i_alu_instr, l_instr_name, 15 );
    if ( io_generated_code->code_type == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s $%" PRIuPTR ", %%%%%s\\n\\t\"\n", l_instr_name, (uintptr_t)i_immediate, l_gp_reg_name );
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s $%" PRIuPTR ", %%%s\n", l_instr_name, (uintptr_t)i_immediate, l_gp_reg_name );
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}

/* Emit a register-register ALU instruction (ADDQ/SUBQ/MOVQ/CMPQ/ANDQ/CMOVZ/
 * CMOVNZ/POPCNT/TZCNT). For CMOV*/
/* and the count instructions the src/dest encoding
 * roles are swapped and an extra opcode byte (plus 0xf3 prefix for POPCNT/TZCNT)
 * is emitted. */
LIBXSMM_API_INTERN void libxsmm_x86_instruction_alu_reg( libxsmm_generated_code* io_generated_code,
                                                         const unsigned int      i_alu_instr,
                                                         const unsigned int      i_gp_reg_number_src,
                                                         const unsigned int      i_gp_reg_number_dest) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size;
    /* int i = *loc; */
    /* unsigned int l_maxsize = io_generated_code->buffer_size;*/
    /* unsigned int l_maxsize = 1024; */
    int l_second = 0;
    int l_third = 0;
    int l_extra_byte = 0;
    int l_reg1 = i_gp_reg_number_src;
    int l_reg0 = i_gp_reg_number_dest;
    switch ( i_alu_instr ) {
      case LIBXSMM_X86_INSTR_ADDQ:
        break;
      case LIBXSMM_X86_INSTR_SUBQ:
        l_second += 0x28;
        break;
      case LIBXSMM_X86_INSTR_MOVQ:
        l_second += 0x88;
        break;
      case LIBXSMM_X86_INSTR_CMPQ:
        l_second += 0x38;
        break;
      case LIBXSMM_X86_INSTR_ANDQ:
        l_second += 0x20;
        break;
      case LIBXSMM_X86_INSTR_CMOVZ:
        l_second += 0x0e;
        l_extra_byte = 1;
        l_reg1 = i_gp_reg_number_dest;
        l_reg0 = i_gp_reg_number_src;
        break;
      case LIBXSMM_X86_INSTR_CMOVNZ:
        l_second += 0x0e;
        l_third += 0x01;
        l_extra_byte = 1;
        l_reg1 = i_gp_reg_number_dest;
        l_reg0 = i_gp_reg_number_src;
        break;
      case LIBXSMM_X86_INSTR_POPCNT:
        l_second += 0x0e;
        l_third += 0x74;
        l_extra_byte = 1;
        l_reg1 = i_gp_reg_number_dest;
        l_reg0 = i_gp_reg_number_src;
        break;
      case LIBXSMM_X86_INSTR_TZCNT:
        l_second += 0x0e;
        l_third += 0x78;
        l_extra_byte = 1;
        l_reg1 = i_gp_reg_number_dest;
        l_reg0 = i_gp_reg_number_src;
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_alu_reg: Not sure what instruction you have in mind: %u\n",i_alu_instr);
        exit(-1);
    }
    {/* open new scope for additional variable declarations (C89) */
      int l_regbas0 = l_reg0 % 8;
      int l_gp8 = ((l_reg0 > 7)&&(l_reg0 <=15)?1:0);
      int l_regnum = l_reg1 % 8;
      int l_nx8 = ((l_reg1 >7)&&(l_reg1<=15)?1:0);
      if ( (i_alu_instr == LIBXSMM_X86_INSTR_POPCNT) || (i_alu_instr == LIBXSMM_X86_INSTR_TZCNT) ) {
        buf[i++] = (unsigned char)(0xf3);
      }
      buf[i++] = (unsigned char)(0x48 + l_gp8 * 0x01 + l_nx8 * 0x04);
      buf[i++] = (unsigned char)(0x01 + l_second);
      if ( l_extra_byte ) {
        buf[i++] = (unsigned char)(0x44 + l_third);
      }
      buf[i++] = (unsigned char)(0xc0 + l_regbas0 + 8*l_regnum);
      io_generated_code->code_size = i;
      /* *loc = i; */
    }
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_name_src[4];
    char l_gp_reg_name_dest[4];
    char l_instr_name[16];
    libxsmm_get_x86_gp_reg_name( i_gp_reg_number_src, l_gp_reg_name_src, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_number_dest, l_gp_reg_name_dest, 3 );
    libxsmm_get_x86_instr_name( i_alu_instr, l_instr_name, 15 );
    if ( io_generated_code->code_type == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %%%%%s, %%%%%s\\n\\t\"\n", l_instr_name, l_gp_reg_name_src, l_gp_reg_name_dest );
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %%%s, %%%s\n", l_instr_name, l_gp_reg_name_src, l_gp_reg_name_dest );
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}

/* Emit pushq of a GP register (0x50+reg with optional 0x41 prefix for r8-r15)
 * and book-keep the tracked stack-frame size (sf_size += 8) in both paths. */
LIBXSMM_API_INTERN void libxsmm_x86_instruction_push_reg( libxsmm_generated_code* io_generated_code,
                                                          const unsigned int      i_gp_reg_number ) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size;
    unsigned int l_maxsize = io_generated_code->buffer_size;
    int l_reg0 = 0;
    if ( l_maxsize - i < 2 ) {
      fprintf(stderr, "libxsmm_instruction_push_reg: push instructions need up to 2 bytes\n");
      exit(-1);
    }
    if ( /*i_gp_reg_number < 0 ||*/ i_gp_reg_number > 15 ) {
      fprintf(stderr, "libxsmm_instruction_push_reg: invalid register\n");
      exit(-1);
    }
    /* determine register encoding */
    if ( (i_gp_reg_number > 7) && (i_gp_reg_number <=15) ) {
      l_reg0 = i_gp_reg_number - 8;
      buf[i++] = (unsigned char)(0x41);
    } else {
      l_reg0 = i_gp_reg_number;
    }
    buf[i++] = (unsigned char)(0x50 + l_reg0);
    io_generated_code->code_size = i;
    io_generated_code->sf_size += 8;
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_name[4];
    libxsmm_get_x86_gp_reg_name( i_gp_reg_number, l_gp_reg_name, 3 );
    if ( io_generated_code->code_type == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"pushq %%%%%s\\n\\t\"\n", l_gp_reg_name );
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " pushq %%%s\n", l_gp_reg_name );
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    io_generated_code->sf_size += 8;
  }
}

/* Emit popq of a GP register (0x58+reg, written as 0x50+reg+8, with optional
 * 0x41 prefix) and book-keep the tracked stack-frame size (sf_size -= 8). */
LIBXSMM_API_INTERN void libxsmm_x86_instruction_pop_reg( libxsmm_generated_code* io_generated_code,
                                                         const unsigned int      i_gp_reg_number ) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size;
    unsigned int l_maxsize = io_generated_code->buffer_size;
    int l_reg0 = 0;
    if ( l_maxsize - i < 2 ) {
      fprintf(stderr, "libxsmm_instruction_pop_reg: pop instructions need up to 2 bytes\n");
      exit(-1);
    }
    if ( /*i_gp_reg_number < 0 ||*/ i_gp_reg_number > 15 ) {
      fprintf(stderr, "libxsmm_instruction_pop_reg: invalid register\n");
      exit(-1);
    }
    /* determine register encoding */
    if ( (i_gp_reg_number > 7) && (i_gp_reg_number <=15) ) {
      l_reg0 = i_gp_reg_number - 8;
      buf[i++] = (unsigned char)(0x41);
    } else {
      l_reg0 = i_gp_reg_number;
    }
    buf[i++] = (unsigned char)(0x50 + l_reg0 + 8);
    io_generated_code->code_size = i;
    io_generated_code->sf_size -= 8;
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_name[4];
    libxsmm_get_x86_gp_reg_name( i_gp_reg_number, l_gp_reg_name, 3 );
    if ( io_generated_code->code_type == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"popq %%%%%s\\n\\t\"\n", l_gp_reg_name );
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " popq %%%s\n", l_gp_reg_name );
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    io_generated_code->sf_size -= 8;
  }
}

/* Emit a KMOV (KMOVW/KMOVB/KMOVD/KMOVQ) between a GP register and a mask
 * register; i_is_store selects the direction (mask->GP when nonzero). */
LIBXSMM_API_INTERN void libxsmm_x86_instruction_mask_move( libxsmm_generated_code* io_generated_code,
                                                           const unsigned int      i_mask_instr,
                                                           const unsigned int      i_gp_reg_number,
                                                           const unsigned int      i_mask_reg_number,
                                                           const unsigned int      i_is_store ) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size;
    /* int i = *loc; */
    unsigned int l_maxsize = io_generated_code->buffer_size;
    /* unsigned int l_maxsize = 1024; */
    unsigned int l_case = 0; /* per-instruction adjustment of the prefix byte */
    int l_regnum0 = i_gp_reg_number % 8;
    int l_nx8 = ((i_gp_reg_number>7)&&(i_gp_reg_number<=15)?1:0);
    if ( l_maxsize - i < 20 ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL );
      return;
    }
    switch ( i_mask_instr ) {
      case LIBXSMM_X86_INSTR_KMOVW:
        break;
      case LIBXSMM_X86_INSTR_KMOVB:
        l_case += 1;
        break;
      case LIBXSMM_X86_INSTR_KMOVD:
        l_case += 3;
        break;
      case LIBXSMM_X86_INSTR_KMOVQ:
        l_case += 0x83;
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_mask_move: Strange kmov instruction\n");
        exit(-1);
    }
    if ( (i_is_store == 1) && (i_mask_instr != LIBXSMM_X86_INSTR_KMOVQ) && (l_nx8 > 0) ) l_case -= 0x80;
    if ( i_is_store != 0 ) l_nx8 *= 4;
    if ( i_mask_reg_number > 7 ) {
      fprintf(stderr, "libxsmm_instruction_mask_move: Strange mask number=%u\n",i_mask_reg_number);
      exit(-1);
    }
    /* choose the 3-byte (0xc4) or 2-byte (0xc5) prefix form */
    if ( (l_nx8&&i_is_store==0) || i_mask_instr==LIBXSMM_X86_INSTR_KMOVQ ) {
      buf[i++] = (unsigned char)(0xc4);
      buf[i++] = (unsigned char)(0xe1 - l_nx8*0x20);
      buf[i++] = (unsigned char)(0x78 + l_case);
    } else {
      buf[i++] = (unsigned char)(0xc5);
      buf[i++] = (unsigned char)(0xf8 + l_case);
    }
    if ( i_is_store == 0 ) {
      buf[i++] = (unsigned char)(0x92);
      buf[i++] = (unsigned char)(0xc0 + l_regnum0 + 8*i_mask_reg_number);
    } else {
      buf[i++] = (unsigned char)(0x93);
      buf[i++] = (unsigned char)(0xc0 + 8*l_regnum0 + i_mask_reg_number);
    }
    io_generated_code->code_size = i;
    /* *loc = i; */
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_name[4];
    char l_instr_name[16];
    char l_prefix = '\0';
    libxsmm_get_x86_gp_reg_name( i_gp_reg_number, l_gp_reg_name, 3 );
    libxsmm_get_x86_instr_name( i_mask_instr, l_instr_name, 15 );
    /* check if we need to add a prefix for accessing 32bit in a 64bit register */
    if ( (i_gp_reg_number == LIBXSMM_X86_GP_REG_R8  ||
          i_gp_reg_number == LIBXSMM_X86_GP_REG_R9  ||
          i_gp_reg_number == LIBXSMM_X86_GP_REG_R10 ||
          i_gp_reg_number == LIBXSMM_X86_GP_REG_R11 ||
          i_gp_reg_number == LIBXSMM_X86_GP_REG_R12 ||
          i_gp_reg_number == LIBXSMM_X86_GP_REG_R13 ||
          i_gp_reg_number == LIBXSMM_X86_GP_REG_R14 ||
          i_gp_reg_number == LIBXSMM_X86_GP_REG_R15) && (i_mask_instr != LIBXSMM_X86_INSTR_KMOVQ) ) {
      l_prefix = 'd';
    }
    if ( i_is_store != 0 ) {
      if ( io_generated_code->code_type == 0 ) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %%%%k%u, %%%%%s%c\\n\\t\"\n", l_instr_name, i_mask_reg_number, l_gp_reg_name, l_prefix );
      } else {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %%k%u, %%%s%c\n", l_instr_name, i_mask_reg_number, l_gp_reg_name, l_prefix );
      }
    } else {
      if ( io_generated_code->code_type == 0 ) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %%%%%s%c, %%%%k%u\\n\\t\"\n", l_instr_name, l_gp_reg_name, l_prefix, i_mask_reg_number );
      } else {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %%%s%c, %%k%u\n", l_instr_name, l_gp_reg_name, l_prefix, i_mask_reg_number );
      }
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}

/* Emit a KMOV between a mask register and memory [base + idx*scale + disp];
 * i_is_store selects the direction (mask->memory when nonzero). */
LIBXSMM_API_INTERN void libxsmm_x86_instruction_mask_move_mem( libxsmm_generated_code* io_generated_code,
                                                               const unsigned int      i_mask_instr,
                                                               const unsigned int      i_gp_reg_base,
                                                               const unsigned int      i_gp_reg_idx,
                                                               const unsigned int      i_scale,
                                                               const int               i_displacement,
                                                               const unsigned int      i_mask_reg_number,
                                                               const unsigned int      i_is_store ) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* @TODO needs to be implemented */
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size;
    unsigned int l_maxsize = io_generated_code->buffer_size;
    int l_regbas0 = i_gp_reg_base % 8;
    int l_gp8     = ((i_gp_reg_base > 7)&&(i_gp_reg_base<=15)?1:0);
    int l_regidx  = i_gp_reg_idx % 8;
    int l_ix8     = ((i_gp_reg_idx > 7)&&(i_gp_reg_idx<=15)?1:0);
    int l_sca=0;
    int l_place = i+4;   /* position of the ModRM byte for the displacement writer */
    int l_sizereg = 1;
    int l_forced_offset = 0;
    /*int l_second = 0;*/
    int l_third = 0;
    int l_fourth = 0;
    /*int l_fifth = 0;*/
    /*int l_sixth = 0;*/
    int l_bytes = 5;     /* 4- or 5-byte opcode sequence depending on prefix form */
    int l_tmp = 0;
    if ( l_maxsize - i < 20 ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL );
#if 0
      fprintf(stderr,"libxsmm_instruction_mask_move_mem Most instructions need at most 20 bytes\n");
#endif
      return;
    }
    if ( (i_gp_reg_base == LIBXSMM_X86_GP_REG_UNDEF) ||
         (((int)i_gp_reg_base < LIBXSMM_X86_GP_REG_RAX) || (i_gp_reg_base > LIBXSMM_X86_GP_REG_R15)) ) {
      fprintf(stderr,"libxsmm_instruction_mask_move_mem has invalid i_gp_reg_base input\n");
      exit(-1);
    }
    if ( (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) &&
         (((int)i_gp_reg_idx < LIBXSMM_X86_GP_REG_RAX) || (i_gp_reg_idx > LIBXSMM_X86_GP_REG_R15)) ) {
      fprintf(stderr,"libxsmm_instruction_mask_move_mem has invalid i_gp_reg_idx input\n");
      exit(-1);
    }
    switch ( i_mask_instr ) {
      case LIBXSMM_X86_INSTR_KMOVQ:
        break;
      case LIBXSMM_X86_INSTR_KMOVD:
        l_third += 1;
        break;
      case LIBXSMM_X86_INSTR_KMOVB:
        /* short 4-byte form unless an extended (r8-r15) register forces 5 bytes */
        l_place = i + 3;
        l_bytes = 4;
        l_third += 1;
        if ( l_gp8 || l_ix8 ) {
          l_bytes = 5;
          l_third -= 0x80;
          l_place=i+4;
        }
        break;
      case LIBXSMM_X86_INSTR_KMOVW:
        l_place = i + 3;
        l_bytes = 4;
        if ( l_gp8 || l_ix8 ) {
          l_bytes = 5;
          l_third -= 0x80;
          l_place=i+4;
        }
        break;
      default:
        fprintf(stderr, "libxsmm_instruction_mask_move_mem: Unknown instruction type: %u\n", i_mask_instr);
        exit(-1);
    }
    if ( i_is_store == 1 ) {
      if ( l_bytes == 5 ) l_fourth = 1;
      if ( i_mask_instr==LIBXSMM_X86_INSTR_KMOVW || i_mask_instr==LIBXSMM_X86_INSTR_KMOVB ) l_tmp = 1;
    }
    if ( (i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF) &&
         ((int)i_gp_reg_idx >= LIBXSMM_X86_GP_REG_RAX) &&
         (i_gp_reg_idx <= LIBXSMM_X86_GP_REG_R15) ) {
      switch ( i_scale ) {
        case 1:
          l_sca=0;
          break;
        case 2:
          l_sca=0x40;
          break;
        case 4:
          l_sca=0x80;
          break;
        case 8:
          l_sca=0xc0;
          break;
        default:
          fprintf(stderr, "libxsmm_instruction_mask_move_mem: cannot handle i_scale=%u parameter\n", i_scale);
          exit(-1);
      }
    }
    if ( l_bytes == 4 ) {
      buf[i++] = (unsigned char)(0xc5);
      if (i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) {
        buf[i++] = (unsigned char)(0xf8 + l_third);
        buf[i++] = (unsigned char)(0x90 + l_tmp);
        buf[i++] = (unsigned char)(0x00 + l_fourth + l_regbas0 + i_mask_reg_number*8);
        /* rsp/r12 base requires an explicit SIB byte */
        if ( l_regbas0 == 4 ) buf[i++]=(unsigned char)(0x24);
      } else {
        buf[i++] = (unsigned char)(0xf8 + l_third);
        buf[i++] = (unsigned char)(0x90 + l_tmp);
        buf[i++] = (unsigned char)(0x04 + l_fourth + i_mask_reg_number*8);
        buf[i++] = (unsigned char)(0x00 + l_sca + l_regbas0 + l_regidx*8);
      }
    } else {
      buf[i++] = (unsigned char)(0xc4);
      buf[i++] = (unsigned char)(0xe1 - l_gp8 * 0x20 - l_ix8 * 0x40);
      if (i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) {
        buf[i++] = (unsigned char)(0xf8 + l_third);
        buf[i++] = (unsigned char)(0x90 + l_fourth);
        buf[i++] = (unsigned char)(0x00 + l_regbas0 + i_mask_reg_number*8);
        if ( l_regbas0 == 4 ) buf[i++]=(unsigned char)(0x24);
      } else {
        buf[i++] = (unsigned char)(0xf8 + l_third);
        buf[i++] = (unsigned char)(0x90 + l_fourth);
        buf[i++] = (unsigned char)(0x04 + i_mask_reg_number*8);
        buf[i++] = (unsigned char)(0x00 + l_sca + l_regbas0 + l_regidx*8);
      }
    }
    if ( (l_regbas0 == 5) && (i_displacement==0) ) {
      /* Registers like rbp/r13 when you have a displacement of 0, we need * force
       * the single byte of zero to appear. */
      l_forced_offset = 1;
    }
    i += internal_x86_instructions_add_offset( l_place, i, i_displacement, l_forced_offset, l_sizereg, buf );
    io_generated_code->code_size = i;
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_base[4];
    char l_gp_reg_idx[4];
    char l_instr_name[16];
    libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_gp_reg_base, 3 );
    libxsmm_get_x86_instr_name( i_mask_instr, l_instr_name, 15 );
    if ( i_is_store != 0 ) {
      if ( i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) {
        if ( io_generated_code->code_type == 0 ) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %%%%k%u, %i(%%%%%s)\\n\\t\"\n", l_instr_name, i_mask_reg_number, i_displacement, l_gp_reg_base );
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %%k%u, %i(%%%s)\n", l_instr_name, i_mask_reg_number, i_displacement, l_gp_reg_base );
        }
      } else {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_idx, l_gp_reg_idx, 3 );
        if ( io_generated_code->code_type == 0 ) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %%%%k%u, %i(%%%%%s,%%%%%s,%u)\\n\\t\"\n", l_instr_name, i_mask_reg_number, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale );
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %%k%u, %i(%%%s,%%%s,%u)\n", l_instr_name, i_mask_reg_number, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale );
        }
      }
    } else {
      if ( i_gp_reg_idx == LIBXSMM_X86_GP_REG_UNDEF ) {
        if ( io_generated_code->code_type == 0 ) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s), %%%%k%u\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base, i_mask_reg_number );
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s), %%k%u\n", l_instr_name, i_displacement, l_gp_reg_base, i_mask_reg_number );
        }
      } else {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_idx, l_gp_reg_idx, 3 );
        if ( io_generated_code->code_type == 0 ) {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %i(%%%%%s,%%%%%s,%u), %%%%k%u\\n\\t\"\n", l_instr_name, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale, i_mask_reg_number );
        } else {
          l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %i(%%%s,%%%s,%u), %%k%u\n", l_instr_name, i_displacement, l_gp_reg_base, l_gp_reg_idx, i_scale, i_mask_reg_number );
        }
      }
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}

/* Emit a mask-register compute instruction; currently only KXNORW is encoded
 * (fixed 4-byte sequence 0xc5 .. 0x46 ..). */
LIBXSMM_API_INTERN void libxsmm_x86_instruction_mask_compute_reg( libxsmm_generated_code* io_generated_code,
                                                                  const unsigned int      i_mask_instr,
                                                                  const unsigned int      i_mask_reg_number_src_0,
                                                                  const unsigned int      i_mask_reg_number_src_1,
                                                                  const unsigned int      i_mask_reg_number_dest ) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    unsigned char *buf = (unsigned char *) io_generated_code->generated_code;
    int i = io_generated_code->code_size;
    /* int i = *loc; */
    unsigned int l_maxsize = io_generated_code->buffer_size;
    /* unsigned int l_maxsize = 1024; */
    if ( l_maxsize - i < 20 ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL );
      return;
    }
    switch ( i_mask_instr ) {
      case LIBXSMM_X86_INSTR_KXNORW:
        break;
      default:
        fprintf(stderr, "libxsmm_x86_instruction_mask_compute_reg: Strange kmov instruction\n");
        exit(-1);
    }
    buf[i++] = 0xc5;
    buf[i++] = (unsigned char)(0xfc - i_mask_reg_number_src_1 * 8);
    buf[i++] = 0x46;
    buf[i++] = (unsigned char)(0xc0 + i_mask_reg_number_src_0 + i_mask_reg_number_dest * 8);
    io_generated_code->code_size = i;
    /* *loc = i; */
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_instr_name[16];
    libxsmm_get_x86_instr_name( i_mask_instr, l_instr_name, 15 );
    if ( io_generated_code->code_type == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %%%%k%u, %%%%k%u, %%%%k%u\\n\\t\"\n", l_instr_name, i_mask_reg_number_src_0, i_mask_reg_number_src_1, i_mask_reg_number_dest );
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %%k%u, %%k%u, %%k%u\n", l_instr_name, i_mask_reg_number_src_0, i_mask_reg_number_src_1, i_mask_reg_number_dest );
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}

/* Register the current position as a jump-back label; at most 32 labels are
 * tracked per loop-label tracker. */
LIBXSMM_API_INTERN void libxsmm_x86_instruction_register_jump_back_label( libxsmm_generated_code*     io_generated_code,
                                                                          libxsmm_loop_label_tracker* io_loop_label_tracker ) {
  /* check if we still have label we can jump to */
  if ( io_loop_label_tracker->label_count == 32 ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_EXCEED_JMPLBL );
    return;
  }
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* binary path: remember the current byte offset as the label target */
    int l_lab = io_loop_label_tracker->label_count;
    io_loop_label_tracker->label_count++;
    io_loop_label_tracker->label_address[l_lab] = io_generated_code->code_size;
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    /* textual path: labels are numbered (count+32+1) to avoid clashes */
    io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] = io_loop_label_tracker->label_count+32+1;
    if ( io_generated_code->code_type == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%u:\\n\\t\"\n", io_loop_label_tracker->label_address[io_loop_label_tracker->label_count]
); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %u:\n", io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] ); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); io_loop_label_tracker->label_count++; } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_jump_back_to_label( libxsmm_generated_code* io_generated_code, const unsigned int i_jmp_instr, libxsmm_loop_label_tracker* io_loop_label_tracker ) { /* check that we just handle a valid jump */ switch ( i_jmp_instr ) { case LIBXSMM_X86_INSTR_JL: case LIBXSMM_X86_INSTR_JE: case LIBXSMM_X86_INSTR_JZ: case LIBXSMM_X86_INSTR_JG: case LIBXSMM_X86_INSTR_JNE: case LIBXSMM_X86_INSTR_JNZ: case LIBXSMM_X86_INSTR_JGE: case LIBXSMM_X86_INSTR_JLE: case LIBXSMM_X86_INSTR_JMP: break; default: LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_UNSUPPORTED_JUMP ); return; } /* check if we still have label we can jump to */ if ( io_loop_label_tracker->label_count == 0 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_NO_JMPLBL_AVAIL ); return; } /* @TODO add checks in debug mode */ if ( io_generated_code->code_type > 1 ) { /*unsigned char *buf = (unsigned char *) io_generated_code->generated_code;*/ int i = io_generated_code->code_size; unsigned int l_maxsize = io_generated_code->buffer_size; int l_lab = --io_loop_label_tracker->label_count; int l_val = io_loop_label_tracker->label_address[l_lab]; /*int l_jmptype, l_dist, l_tmp;*/ int l_tmp; if ( l_maxsize - i < 6 ) { fprintf(stderr, "libxsmm_instruction_jump_back_to_label: Our jump instructions need at most 6 bytes\n"); exit(-1); } l_tmp = internal_x86_jumping( io_generated_code, i, l_val, i_jmp_instr ); io_generated_code->code_size = i + l_tmp; } else { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; char l_instr_name[16]; libxsmm_get_x86_instr_name( i_jmp_instr, l_instr_name, 15 ); io_loop_label_tracker->label_count--; if ( io_generated_code->code_type == 0 ) { 
l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %ub\\n\\t\"\n", l_instr_name, io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %ub\n", l_instr_name, io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] ); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] = 0; } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_register_jump_label( libxsmm_generated_code* io_generated_code, const unsigned int i_label_no, libxsmm_jump_label_tracker* io_jump_label_tracker ) { /* check if the label we are trying to set inside of bounds */ if ( i_label_no >= 32 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_EXCEED_JMPLBL ); return; } /* check if the label we try to set is still available */ if ( io_jump_label_tracker->label_address[i_label_no] > 0 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_JMPLBL_USED ); return; } /* @TODO add checks in debug mode */ if ( io_generated_code->code_type > 1 ) { unsigned int l_ref = 0; libxsmm_jump_source l_source = io_jump_label_tracker->label_source[i_label_no]; /* first added label to tracker */ io_jump_label_tracker->label_address[i_label_no] = io_generated_code->code_size; /* patching all previous references */ for ( l_ref = 0; l_ref < l_source.ref_count; ++l_ref ) { unsigned int l_jmp_instr = l_source.instr_type[l_ref]; unsigned int l_position = l_source.instr_addr[l_ref]; #if 0 int l_tmp = #endif /* This routine just does everything related to jumping. 
In this case, we know the destination/target */ internal_x86_jumping ( io_generated_code, l_position, io_generated_code->code_size, l_jmp_instr ); /* We don't need to forward the bytes here */ } } else { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; io_jump_label_tracker->label_address[i_label_no] = i_label_no+1; if ( io_generated_code->code_type == 0 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%u:\\n\\t\"\n", io_jump_label_tracker->label_address[i_label_no] ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %u:\n", io_jump_label_tracker->label_address[i_label_no] ); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_jump_to_label( libxsmm_generated_code* io_generated_code, const unsigned int i_jmp_instr, const unsigned int i_label_no, libxsmm_jump_label_tracker* io_jump_label_tracker ) { unsigned int l_pos; /* check if the label we are trying to set inside of bounds */ if ( (i_label_no < 32) == 0 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_EXCEED_JMPLBL ); return; } /* check if we still have label we can jump to */ if ( io_jump_label_tracker->label_source[i_label_no].ref_count == 32-1 ) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_EXCEED_JMPLBL ); return; } /* add addr at current position and instruction to tracking structure */ l_pos = io_jump_label_tracker->label_source[i_label_no].ref_count; io_jump_label_tracker->label_source[i_label_no].instr_type[l_pos] = i_jmp_instr; io_jump_label_tracker->label_source[i_label_no].instr_addr[l_pos] = io_generated_code->code_size; io_jump_label_tracker->label_source[i_label_no].ref_count++; /* @TODO add checks in debug mode */ if ( io_generated_code->code_type > 1 ) { int l_dest_addr; int l_tmp; if ( io_jump_label_tracker->label_address[i_label_no] == 0 ) { l_dest_addr = -1; /* It's a forward jump to a location we haven't set yet. 
We'll assume 5-6 bytes */ } else { /* Destination/target address is known here. */ l_dest_addr = io_jump_label_tracker->label_address[i_label_no]; } l_tmp = internal_x86_jumping ( io_generated_code, io_generated_code->code_size, l_dest_addr, i_jmp_instr ); io_generated_code->code_size = io_generated_code->code_size + l_tmp; /* l_tmp is the # of bytes needed */ } else { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; char l_instr_name[16]; char l_jmp_dir; libxsmm_get_x86_instr_name( i_jmp_instr, l_instr_name, 15 ); if ( io_jump_label_tracker->label_address[i_label_no] == 0 ) { l_jmp_dir = 'f'; } else { l_jmp_dir = 'b'; } if ( io_generated_code->code_type == 0 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"%s %u%c\\n\\t\"\n", l_instr_name, i_label_no+1, l_jmp_dir ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " %s %u%c\n", l_instr_name, i_label_no+1, l_jmp_dir ); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_full_vec_load_of_constants ( libxsmm_generated_code *io_generated_code, const unsigned char *i_data, const char *i_id, const char i_vector_name, const unsigned int i_vec_reg_number ) { int number_of_bytes_to_load = 0; /*int l_regsize_adjustment = 0;*/ switch ( i_vector_name ) { case 'x': number_of_bytes_to_load = 16; /*l_regsize_adjustment = -4;*/ break; case 'y': number_of_bytes_to_load = 32; break; case 'z': number_of_bytes_to_load = 64; break; default: fprintf(stderr, "libxsmm_x86_instruction_full_vec_load_of_constants: strange input for i_vector_name: %c\n",i_vector_name); exit(-1); } if ( io_generated_code->code_type > 1 ) { unsigned char *buf = (unsigned char *) io_generated_code->generated_code; unsigned char *cval = (unsigned char *) &i_data[0]; int i = io_generated_code->code_size; unsigned int l_maxsize = io_generated_code->buffer_size; int j = 0; int l_stop = 0; int l_regsize_adjustment = 
0; int l_last_load_location = 0; int jmpval = 0; int vecval = 0; /* @TODO fix max. size error */ if ( l_maxsize - i < 139 ) { fprintf(stderr, "libxsmm_x86_instruction_full_vec_load_of_constants: Most constant jumps need at most 139 bytes\n"); exit(-1); } #define DISABLE_ALIGNMENT #ifdef DISABLE_ALIGNMENT l_stop = i + 2; #else /* Replace this code with real code to find the right offset "l_stop" so * buf[l_stop] has the right alignment, where l_stop >= i+2 */ for ( j = i+2, l_stop = -1; (j < i+number_of_bytes_to_load+2) && (l_stop==-1); j++ ) { if ( ((size_t)&buf[j])%number_of_bytes_to_load == 0 ) { l_stop = j; } } if ( (l_stop == -1) || (l_stop < i+2) ) { fprintf(stderr, "libxsmm_x86_instruction_full_vec_load_of_constants: never found correct alignment\n"); exit(-1); } j = l_stop; #endif jmpval = number_of_bytes_to_load + l_stop - (i + 2); buf[ i ] = 0xeb; buf[i+1] = (unsigned char)jmpval; /* Let's insert nops until we reach an aligned address */ for ( j = i+2; j < l_stop; j++ ) { buf[ j ] = 0x90; /* nop */ } i = l_stop; for ( j = 0; j < number_of_bytes_to_load; j++ ) { buf[ i ] = cval[j]; i++; } l_last_load_location = i; if ( i_vector_name == 'z' ) { buf[ i ] = 0x62; if ( i_vec_reg_number <= 7 ) { buf[i+1] = 0xf1; vecval = i_vec_reg_number; } else if ( i_vec_reg_number <= 15 ) { buf[i+1] = 0x71; vecval = i_vec_reg_number - 8; } else if ( i_vec_reg_number <= 23 ) { buf[i+1] = 0xe1; vecval = i_vec_reg_number - 16; } else { buf[i+1] = 0x61; vecval = i_vec_reg_number - 24; } buf[i+2] = 0x7c; buf[i+3] = 0x48; i += 4; } else { buf[i] = 0xc5; if ( i_vec_reg_number <= 7 ) { buf[i+1] = (unsigned char)(0xfc + l_regsize_adjustment); vecval = i_vec_reg_number; } else { buf[i+1] = (unsigned char)(0x7c + l_regsize_adjustment); vecval = i_vec_reg_number - 8; } i += 2; } buf[ i ] = 0x10; buf[i+1] = (unsigned char)(0x05 + (8*vecval)); /* 6 bytes is what we have left to encode in the last_load_location */ jmpval = -1*(number_of_bytes_to_load + 6 + (i-l_last_load_location) ); cval = 
(unsigned char *) &jmpval; buf[i+2] = cval[0]; buf[i+3] = cval[1]; buf[i+4] = cval[2]; buf[i+5] = cval[3]; /* 6 bytes is what we have left to encode in the last_load_location */ i += 6; io_generated_code->code_size = i; } else { unsigned char *cval = (unsigned char *) &i_data[0]; int j = 0; char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; if ( io_generated_code->code_type == 0 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"jmp .continued_%s\\n\\t\"\n", i_id ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \".data_%s:\\n\\t\"\n", i_id ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); for ( j = 0; j < number_of_bytes_to_load; j += 4 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\\n\\t\"\n", cval[0],cval[1],cval[2],cval[3] ); cval = cval + 4; libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \".continued_%s:\\n\\t\"\n", i_id ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " \"vmovups .data_%s(%%%%rip), %%%%%cmm%u\\n\\t\"\n", i_id, i_vector_name, i_vec_reg_number ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " jmp .continued_%s\n", i_id ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " .data_%s:\n", i_id ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); for ( j = 0; j < number_of_bytes_to_load; j += 4 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " .byte 0x%02x, 0x%02x, 0x%02x, 
0x%02x\n", cval[0],cval[1],cval[2],cval[3] ); cval = cval + 4; libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " .continued_%s:\n", i_id ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " vmovups .data_%s(%%rip), %%%cmm%u\n", i_id, i_vector_name, i_vec_reg_number ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_load_arg_to_reg( libxsmm_generated_code* io_generated_code, const unsigned int i_arg_number, const unsigned int i_gp_reg_number ) { libxsmm_x86_instruction_alu_mem( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, LIBXSMM_X86_GP_REG_RSP, LIBXSMM_X86_GP_REG_UNDEF, 0, io_generated_code->sf_size+8+(8*i_arg_number), i_gp_reg_number, 0 ); } LIBXSMM_API_INTERN void libxsmm_x86_instruction_open_stream( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, unsigned int i_prefetch) { /* @TODO add checks in debug mode */ if ( io_generated_code->code_type > 1 ) { /* @TODO this is currently System V AMD64 RTL(C) ABI only */ unsigned char* l_code_buffer = (unsigned char *) io_generated_code->generated_code; unsigned int l_code_size = io_generated_code->code_size; unsigned int l_max_size = io_generated_code->buffer_size; if (NULL == l_code_buffer || l_max_size < (l_code_size + 9)) { LIBXSMM_HANDLE_ERROR(io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL); return; } /* push callee save registers */ /* push rbx */ l_code_buffer[l_code_size++] = 0x53; /* push r12 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x54; /* push r13 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x55; /* push r14 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x56; /* push r15 */ l_code_buffer[l_code_size++] = 0x41; 
l_code_buffer[l_code_size++] = 0x57; /* update code length */ io_generated_code->code_size = l_code_size; /* adjust stack frame size */ io_generated_code->sf_size += 40; } else if ( io_generated_code->code_type == 1 ) { /* @TODO this is currently System V AMD64 RTL(C) ABI only */ char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; /* push callee save registers */ l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%rbx\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r12\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r13\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r14\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r15\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* adjust stack frame size */ io_generated_code->sf_size += 40; l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " retq\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; char l_gp_reg_name[4]; /* loading a pointer in assembly */ libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " __asm__ __volatile__(\"movq %%0, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loading b pointer in assembly */ libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b, l_gp_reg_name, 3 ); l_code_length = 
LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%1, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loading c pointer in assembly */ libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_c, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%2, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loading b prefetch pointer in assembly */ if ( i_prefetch == LIBXSMM_GEMM_PREFETCH_BL2_VIA_C || i_prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD) { libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b_prefetch, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%3, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loading a prefetch pointer in assembly */ } else if ( i_prefetch == LIBXSMM_GEMM_PREFETCH_AL2 ) { libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a_prefetch, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%3, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loading a and b prefetch pointer in assembly */ } else if ( i_prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C ) { libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a_prefetch, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%3, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b_prefetch, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%4, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else {} } /* reset 
loop counters */ libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_mloop, 0 ); libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_nloop, 0 ); libxsmm_x86_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_kloop, 0 ); } LIBXSMM_API_INTERN void libxsmm_x86_instruction_close_stream( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, unsigned int i_prefetch) { /* @TODO add checks in debug mode */ if ( io_generated_code->code_type > 1 ) { /* @TODO this is a very simple System V ABI 64 interface */ unsigned char *l_code_buffer = (unsigned char *) io_generated_code->generated_code; unsigned int l_code_size = io_generated_code->code_size; unsigned int l_max_size = io_generated_code->buffer_size; if (l_max_size < (l_code_size + 10)) { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL ); return; } /* pop callee save registers */ /* pop r15 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5f; /* pop r14 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5e; /* pop r13 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5d; /* pop r12 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5c; /* pop rbx */ l_code_buffer[l_code_size++] = 0x5b; /* adjust stack frame size */ io_generated_code->sf_size -= 40; /* retq */ /* @TODO: I don't know if this is the correct placement in the generation process */ l_code_buffer[l_code_size++] = 0xc3; /* update code length */ io_generated_code->code_size = l_code_size; } else if ( io_generated_code->code_type == 1 ) { /* @TODO this is currently System V AMD64 RTL(C) ABI only */ char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r15\n" ); libxsmm_append_code_as_string( 
io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r14\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r13\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r12\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%rbx\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* adjust stack frame size */ io_generated_code->sf_size -= 40; /* @TODO: I don't know if this is the correct placement in the generation process */ l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " retq\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { char l_new_code[1024]; int l_max_code_length = 1023; int l_code_length = 0; char l_gp_reg_a[4]; char l_gp_reg_b[4]; char l_gp_reg_c[4]; char l_gp_reg_pre_a[4]; char l_gp_reg_pre_b[4]; char l_gp_reg_mloop[4]; char l_gp_reg_nloop[4]; char l_gp_reg_kloop[4]; libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a, l_gp_reg_a, 3 ); libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b, l_gp_reg_b, 3 ); libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_c, l_gp_reg_c, 3 ); libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a_prefetch, l_gp_reg_pre_a, 3 ); libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b_prefetch, l_gp_reg_pre_b, 3 ); libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_mloop, l_gp_reg_mloop, 3 ); libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_nloop, l_gp_reg_nloop, 3 ); libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_kloop, l_gp_reg_kloop, 3 ); if ( i_prefetch == LIBXSMM_GEMM_PREFETCH_BL2_VIA_C || i_prefetch == 
LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD) { if ( io_generated_code->arch <= LIBXSMM_X86_AVX2 ) { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(A), \"m\"(B), \"m\"(C), \"m\"(B_prefetch) : \"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"xmm0\",\"xmm1\",\"xmm2\",\"xmm3\",\"xmm4\",\"xmm5\",\"xmm6\",\"xmm7\",\"xmm8\",\"xmm9\",\"xmm10\",\"xmm11\",\"xmm12\",\"xmm13\",\"xmm14\",\"xmm15\");\n", l_gp_reg_a, l_gp_reg_b, l_gp_reg_c, l_gp_reg_pre_b, l_gp_reg_mloop, l_gp_reg_nloop, l_gp_reg_kloop); } else { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(A), \"m\"(B), \"m\"(C), \"m\"(B_prefetch) : \"k1\",\"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r11\",\"r12\",\"r13\",\"r14\",\"r15\",\"zmm0\",\"zmm1\",\"zmm2\",\"zmm3\",\"zmm4\",\"zmm5\",\"zmm6\",\"zmm7\",\"zmm8\",\"zmm9\",\"zmm10\",\"zmm11\",\"zmm12\",\"zmm13\",\"zmm14\",\"zmm15\",\"zmm16\",\"zmm17\",\"zmm18\",\"zmm19\",\"zmm20\",\"zmm21\",\"zmm22\",\"zmm23\",\"zmm24\",\"zmm25\",\"zmm26\",\"zmm27\",\"zmm28\",\"zmm29\",\"zmm30\",\"zmm31\");\n"); } } else if ( i_prefetch == LIBXSMM_GEMM_PREFETCH_AL2 ) { if ( io_generated_code->arch <= LIBXSMM_X86_AVX2 ) { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(A), \"m\"(B), \"m\"(C), \"m\"(A_prefetch) : \"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"xmm0\",\"xmm1\",\"xmm2\",\"xmm3\",\"xmm4\",\"xmm5\",\"xmm6\",\"xmm7\",\"xmm8\",\"xmm9\",\"xmm10\",\"xmm11\",\"xmm12\",\"xmm13\",\"xmm14\",\"xmm15\");\n", l_gp_reg_a, l_gp_reg_b, l_gp_reg_c, l_gp_reg_pre_a, l_gp_reg_mloop, l_gp_reg_nloop, l_gp_reg_kloop); } else { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(A), \"m\"(B), \"m\"(C), \"m\"(A_prefetch) : 
\"k1\",\"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r11\",\"r12\",\"r13\",\"r14\",\"r15\",\"zmm0\",\"zmm1\",\"zmm2\",\"zmm3\",\"zmm4\",\"zmm5\",\"zmm6\",\"zmm7\",\"zmm8\",\"zmm9\",\"zmm10\",\"zmm11\",\"zmm12\",\"zmm13\",\"zmm14\",\"zmm15\",\"zmm16\",\"zmm17\",\"zmm18\",\"zmm19\",\"zmm20\",\"zmm21\",\"zmm22\",\"zmm23\",\"zmm24\",\"zmm25\",\"zmm26\",\"zmm27\",\"zmm28\",\"zmm29\",\"zmm30\",\"zmm31\");\n"); } } else if ( i_prefetch == LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C ) { if ( io_generated_code->arch <= LIBXSMM_X86_AVX2 ) { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(A), \"m\"(B), \"m\"(C), \"m\"(A_prefetch), \"m\"(B_prefetch) : \"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"xmm0\",\"xmm1\",\"xmm2\",\"xmm3\",\"xmm4\",\"xmm5\",\"xmm6\",\"xmm7\",\"xmm8\",\"xmm9\",\"xmm10\",\"xmm11\",\"xmm12\",\"xmm13\",\"xmm14\",\"xmm15\");\n", l_gp_reg_a, l_gp_reg_b, l_gp_reg_c, l_gp_reg_pre_a, l_gp_reg_pre_b, l_gp_reg_mloop, l_gp_reg_nloop, l_gp_reg_kloop); } else { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(A), \"m\"(B), \"m\"(C), \"m\"(A_prefetch), \"m\"(B_prefetch) : \"k1\",\"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r11\",\"r12\",\"r13\",\"r14\",\"r15\",\"zmm0\",\"zmm1\",\"zmm2\",\"zmm3\",\"zmm4\",\"zmm5\",\"zmm6\",\"zmm7\",\"zmm8\",\"zmm9\",\"zmm10\",\"zmm11\",\"zmm12\",\"zmm13\",\"zmm14\",\"zmm15\",\"zmm16\",\"zmm17\",\"zmm18\",\"zmm19\",\"zmm20\",\"zmm21\",\"zmm22\",\"zmm23\",\"zmm24\",\"zmm25\",\"zmm26\",\"zmm27\",\"zmm28\",\"zmm29\",\"zmm30\",\"zmm31\");\n"); } } else { if ( io_generated_code->arch <= LIBXSMM_X86_AVX2 ) { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(A), \"m\"(B), \"m\"(C) : \"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"xmm0\",\"xmm1\",\"xmm2\",\"xmm3\",\"xmm4\",\"xmm5\",\"xmm6\",\"xmm7\",\"xmm8\",\"xmm9\",\"xmm10\",\"xmm11\",\"xmm12\",\"xmm13\",\"xmm14\",\"xmm15\");\n", l_gp_reg_a, l_gp_reg_b, 
l_gp_reg_c, l_gp_reg_mloop, l_gp_reg_nloop, l_gp_reg_kloop); } else { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(A), \"m\"(B), \"m\"(C) : \"k1\",\"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r11\",\"r12\",\"r13\",\"r14\",\"r15\",\"zmm0\",\"zmm1\",\"zmm2\",\"zmm3\",\"zmm4\",\"zmm5\",\"zmm6\",\"zmm7\",\"zmm8\",\"zmm9\",\"zmm10\",\"zmm11\",\"zmm12\",\"zmm13\",\"zmm14\",\"zmm15\",\"zmm16\",\"zmm17\",\"zmm18\",\"zmm19\",\"zmm20\",\"zmm21\",\"zmm22\",\"zmm23\",\"zmm24\",\"zmm25\",\"zmm26\",\"zmm27\",\"zmm28\",\"zmm29\",\"zmm30\",\"zmm31\");\n"); } } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_open_stream_transpose( libxsmm_generated_code* io_generated_code, const unsigned int i_gp_reg_a, const unsigned int i_gp_reg_lda, const unsigned int i_gp_reg_b, const unsigned int i_gp_reg_ldb, const char* i_arch ) { LIBXSMM_UNUSED(i_arch); /* @TODO add checks in debug mode */ if ( io_generated_code->code_type > 1 ) { /* @TODO this is currently System V AMD64 RTL(C) ABI only */ unsigned char* l_code_buffer = (unsigned char *) io_generated_code->generated_code; unsigned int l_code_size = io_generated_code->code_size; unsigned int l_max_size = io_generated_code->buffer_size; if (NULL == l_code_buffer || l_max_size < (l_code_size + 9)) { LIBXSMM_HANDLE_ERROR(io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL); return; } /* push rbx */ l_code_buffer[l_code_size++] = 0x53; /* push rbp */ l_code_buffer[l_code_size++] = 0x55; /* push r12 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x54; /* push r13 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x55; /* push r14 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x56; /* push r15 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x57; /* update code length */ io_generated_code->code_size = l_code_size; /* 
adjust stack frame size */ io_generated_code->sf_size += 40; } else if ( io_generated_code->code_type == 1 ) { /* @TODO this is currently System V AMD64 RTL(C) ABI only */ char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%rbx\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%rbp\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r12\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r13\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r14\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r15\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* adjust stack frame size */ io_generated_code->sf_size += 40; /* @TODO: I don't know if this is the correct placement in the generation process */ l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " retq\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; char l_gp_reg_name[4]; /* loading input pointer in assembley */ libxsmm_get_x86_gp_reg_name( i_gp_reg_a, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " __asm__ __volatile__(\"movq %%0, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loading weight pointer in assembley 
*/ libxsmm_get_x86_gp_reg_name( i_gp_reg_lda, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%1, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loading output pointer in assembley */ libxsmm_get_x86_gp_reg_name( i_gp_reg_b, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%2, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loading input pf pointer in assembley */ libxsmm_get_x86_gp_reg_name( i_gp_reg_ldb, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%3, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_close_stream_transpose( libxsmm_generated_code* io_generated_code, const char* i_arch) { /* libxsmm_x86_instruction_close_stream_convolution(io_generated_code, i_arch); */ /* @TODO add checks in debug mode */ if ( io_generated_code->code_type > 1 ) { /* @TODO this is a very simple System V ABI 64 interface */ unsigned char *l_code_buffer = (unsigned char *) io_generated_code->generated_code; unsigned int l_code_size = io_generated_code->code_size; unsigned int l_max_size = io_generated_code->buffer_size; if (NULL == l_code_buffer || l_max_size < (l_code_size + 11)) { LIBXSMM_HANDLE_ERROR(io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL); return; } /* pop r15 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5f; /* pop r14 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5e; /* pop r13 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5d; /* pop r12 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5c; /* pop rbp */ l_code_buffer[l_code_size++] = 0x5d; /* pop rbx */ 
l_code_buffer[l_code_size++] = 0x5b; /* adjust stack frame size */ io_generated_code->sf_size -= 40; /* retq */ /* @TODO: I don't know if this is the correct placement in the generation process */ l_code_buffer[l_code_size++] = 0xc3; /* update code length */ io_generated_code->code_size = l_code_size; } else if ( io_generated_code->code_type == 1 ) { /* @TODO this is currently System V AMD64 RTL(C) ABI only */ char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r15\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r14\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r13\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r12\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%rbx\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%rbp\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* adjust stack frame size */ io_generated_code->sf_size -= 40; /* @TODO: I don't know if this is the correct placement in the generation process */ l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " retq\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { char l_new_code[1024]; int l_max_code_length = 1023; int l_code_length = 0; if ( (strcmp(i_arch, "wsm") == 0) || (strcmp(i_arch, "snb") == 0) || (strcmp(i_arch, "hsw") == 0) ) { l_code_length = LIBXSMM_SNPRINTF( 
l_new_code, l_max_code_length, " : : \"m\"(inputptr), \"m\"(weightptr), \"m\"(outputptr), \"m\"(inputpfptr), \"m\"(weightpfptr), \"m\"(outputpfptr) : \"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r11\",\"r12\",\"r13\",\"r14\",\"r15\",\"xmm0\",\"xmm1\",\"xmm2\",\"xmm3\",\"xmm4\",\"xmm5\",\"xmm6\",\"xmm7\",\"xmm8\",\"xmm9\",\"xmm10\",\"xmm11\",\"xmm12\",\"xmm13\",\"xmm14\",\"xmm15\");\n"); } else { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(inputptr), \"m\"(weightptr), \"m\"(outputptr), \"m\"(inputpfptr), \"m\"(weightpfptr), \"m\"(outputpfptr) : \"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r11\",\"r12\",\"r13\",\"r14\",\"r15\",\"zmm0\",\"zmm1\",\"zmm2\",\"zmm3\",\"zmm4\",\"zmm5\",\"zmm6\",\"zmm7\",\"zmm8\",\"zmm9\",\"zmm10\",\"zmm11\",\"zmm12\",\"zmm13\",\"zmm14\",\"zmm15\",\"zmm16\",\"zmm17\",\"zmm18\",\"zmm19\",\"zmm20\",\"zmm21\",\"zmm22\",\"zmm23\",\"zmm24\",\"zmm25\",\"zmm26\",\"zmm27\",\"zmm28\",\"zmm29\",\"zmm30\",\"zmm31\");\n"); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_open_stream_matcopy( libxsmm_generated_code* io_generated_code, const unsigned int i_gp_reg_a, const unsigned int i_gp_reg_lda, const unsigned int i_gp_reg_b, const unsigned int i_gp_reg_ldb, const unsigned int i_gp_reg_a_pf, const unsigned int i_gp_reg_b_pf, const char* i_arch ) { LIBXSMM_UNUSED(i_arch); /* @TODO add checks in debug mode */ if ( io_generated_code->code_type > 1 ) { /* @TODO this is currently System V AMD64 RTL(C) ABI only */ unsigned char* l_code_buffer = (unsigned char *) io_generated_code->generated_code; unsigned int l_code_size = io_generated_code->code_size; unsigned int l_max_size = io_generated_code->buffer_size; if (NULL == l_code_buffer || l_max_size < (l_code_size + 9)) { LIBXSMM_HANDLE_ERROR(io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL); return; } /* push rbx */ 
l_code_buffer[l_code_size++] = 0x53; /* push r12 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x54; /* push r13 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x55; /* push r14 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x56; /* push r15 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x57; /* update code length */ io_generated_code->code_size = l_code_size; /* adjust stack frame size */ io_generated_code->sf_size += 40; } else if ( io_generated_code->code_type == 1 ) { /* @TODO this is currently System V AMD64 RTL(C) ABI only */ char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%rbx\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r12\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r13\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r14\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r15\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* adjust stack frame size */ io_generated_code->sf_size += 40; } else { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; char l_gp_reg_name[4]; /* loading a pointer in assembly */ libxsmm_get_x86_gp_reg_name( i_gp_reg_a, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " __asm__ __volatile__(\"movq %%0, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, 
l_code_length ); /* loading lda pointer in assembly */ libxsmm_get_x86_gp_reg_name( i_gp_reg_lda, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%1, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loading b pointer in assembly */ libxsmm_get_x86_gp_reg_name( i_gp_reg_b, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%2, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loading ldb pointer in assembly */ libxsmm_get_x86_gp_reg_name( i_gp_reg_ldb, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%3, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loading a pf pointer in assembly */ libxsmm_get_x86_gp_reg_name( i_gp_reg_a_pf, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%4, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loading b pf pointer in assembly */ libxsmm_get_x86_gp_reg_name( i_gp_reg_b_pf, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " \"movq %%6, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_close_stream_matcopy( libxsmm_generated_code* io_generated_code, const char* i_arch) { /* @TODO add checks in debug mode */ if ( io_generated_code->code_type > 1 ) { /* @TODO this is a very simple System V ABI 64 interface */ unsigned char *l_code_buffer = (unsigned char *) io_generated_code->generated_code; unsigned int l_code_size = io_generated_code->code_size; unsigned int l_max_size = io_generated_code->buffer_size; if (NULL == l_code_buffer || 
l_max_size < (l_code_size + 10)) { LIBXSMM_HANDLE_ERROR(io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL); return; } /* pop r15 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5f; /* pop r14 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5e; /* pop r13 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5d; /* pop r12 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5c; /* pop rbx */ l_code_buffer[l_code_size++] = 0x5b; /* adjust stack frame size */ io_generated_code->sf_size -= 40; /* retq */ /* @TODO: I don't know if this is the correct placement in the generation process */ l_code_buffer[l_code_size++] = 0xc3; /* update code length */ io_generated_code->code_size = l_code_size; } else if ( io_generated_code->code_type == 1 ) { /* @TODO this is currently System V AMD64 RTL(C) ABI only */ char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r15\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r14\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r13\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r12\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%rbx\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* adjust stack frame size */ io_generated_code->sf_size -= 40; /* @TODO: I don't know if this is the correct placement in the generation process */ l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, 
" retq\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { char l_new_code[1024]; int l_max_code_length = 1023; int l_code_length = 0; if ( (strcmp(i_arch, "wsm") == 0) || (strcmp(i_arch, "snb") == 0) || (strcmp(i_arch, "hsw") == 0) ) { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(aptr), \"m\"(ldaptr), \"m\"(bptr), \"m\"(ldbptr), \"m\"(apfptr), \"m\"(bpfptr) : \"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r11\",\"r12\",\"r13\",\"r14\",\"r15\",\"xmm0\",\"xmm1\",\"xmm2\",\"xmm3\",\"xmm4\",\"xmm5\",\"xmm6\",\"xmm7\",\"xmm8\",\"xmm9\",\"xmm10\",\"xmm11\",\"xmm12\",\"xmm13\",\"xmm14\",\"xmm15\");\n"); } else { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(aptr), \"m\"(ldaptr), \"m\"(bptr), \"m\"(ldbptr), \"m\"(apfptr), \"m\"(bpfptr) : \"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r11\",\"r12\",\"r13\",\"r14\",\"r15\",\"zmm0\",\"zmm1\",\"zmm2\",\"zmm3\",\"zmm4\",\"zmm5\",\"zmm6\",\"zmm7\",\"zmm8\",\"zmm9\",\"zmm10\",\"zmm11\",\"zmm12\",\"zmm13\",\"zmm14\",\"zmm15\",\"zmm16\",\"zmm17\",\"zmm18\",\"zmm19\",\"zmm20\",\"zmm21\",\"zmm22\",\"zmm23\",\"zmm24\",\"zmm25\",\"zmm26\",\"zmm27\",\"zmm28\",\"zmm29\",\"zmm30\",\"zmm31\");\n"); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_open_stream_mateltwise( libxsmm_generated_code* io_generated_code, const unsigned int i_gp_struct_params, const char* i_arch ) { LIBXSMM_UNUSED(i_arch); /* @TODO add checks in debug mode */ if ( io_generated_code->code_type > 1 ) { /* @TODO this is currently System V AMD64 RTL(C) ABI only */ unsigned char* l_code_buffer = (unsigned char *) io_generated_code->generated_code; unsigned int l_code_size = io_generated_code->code_size; unsigned int l_max_size = io_generated_code->buffer_size; if (NULL == l_code_buffer || l_max_size < (l_code_size + 9)) { 
LIBXSMM_HANDLE_ERROR(io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL); return; } /* push rbx */ l_code_buffer[l_code_size++] = 0x53; /* push r12 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x54; /* push r13 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x55; /* push r14 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x56; /* push r15 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x57; /* update code length */ io_generated_code->code_size = l_code_size; /* adjust stack frame size */ io_generated_code->sf_size += 40; } else if ( io_generated_code->code_type == 1 ) { /* @TODO this is currently System V AMD64 RTL(C) ABI only */ char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%rbx\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r12\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r13\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r14\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " pushq %%r15\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* adjust stack frame size */ io_generated_code->sf_size += 40; } else { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; char l_gp_reg_name[4]; /* loading struct params pointer in assembly */ libxsmm_get_x86_gp_reg_name( i_gp_struct_params, l_gp_reg_name, 3 ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " __asm__ 
__volatile__(\"movq %%0, %%%%%s\\n\\t\"\n", l_gp_reg_name ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } LIBXSMM_API_INTERN void libxsmm_x86_instruction_close_stream_mateltwise( libxsmm_generated_code* io_generated_code, const char* i_arch) { /* @TODO add checks in debug mode */ LIBXSMM_ASSERT(NULL != i_arch); if ( io_generated_code->code_type > 1 ) { /* @TODO this is a very simple System V ABI 64 interface */ unsigned char *l_code_buffer = (unsigned char *) io_generated_code->generated_code; unsigned int l_code_size = io_generated_code->code_size; unsigned int l_max_size = io_generated_code->buffer_size; if (NULL == l_code_buffer || l_max_size < (l_code_size + 10)) { LIBXSMM_HANDLE_ERROR(io_generated_code, LIBXSMM_ERR_BUFFER_TOO_SMALL); return; } /* pop r15 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5f; /* pop r14 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5e; /* pop r13 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5d; /* pop r12 */ l_code_buffer[l_code_size++] = 0x41; l_code_buffer[l_code_size++] = 0x5c; /* pop rbx */ l_code_buffer[l_code_size++] = 0x5b; /* adjust stack frame size */ io_generated_code->sf_size -= 40; /* retq */ /* @TODO: I don't know if this is the correct placement in the generation process */ l_code_buffer[l_code_size++] = 0xc3; /* update code length */ io_generated_code->code_size = l_code_size; } else if ( io_generated_code->code_type == 1 ) { /* @TODO this is currently System V AMD64 RTL(C) ABI only */ char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r15\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r14\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = 
LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r13\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%r12\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " popq %%rbx\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* adjust stack frame size */ io_generated_code->sf_size -= 40; /* @TODO: I don't know if this is the correct placement in the generation process */ l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " retq\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { char l_new_code[1024]; int l_max_code_length = 1023; int l_code_length = 0; if ( (strcmp(i_arch, "wsm") == 0) || (strcmp(i_arch, "snb") == 0) || (strcmp(i_arch, "hsw") == 0) ) { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(aptr), \"m\"(ldaptr), \"m\"(bptr), \"m\"(ldbptr), \"m\"(apfptr), \"m\"(bpfptr) : \"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r11\",\"r12\",\"r13\",\"r14\",\"r15\",\"xmm0\",\"xmm1\",\"xmm2\",\"xmm3\",\"xmm4\",\"xmm5\",\"xmm6\",\"xmm7\",\"xmm8\",\"xmm9\",\"xmm10\",\"xmm11\",\"xmm12\",\"xmm13\",\"xmm14\",\"xmm15\");\n"); } else { l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, " : : \"m\"(aptr), \"m\"(ldaptr), \"m\"(bptr), \"m\"(ldbptr), \"m\"(apfptr), \"m\"(bpfptr) : \"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r11\",\"r12\",\"r13\",\"r14\",\"r15\",\"zmm0\",\"zmm1\",\"zmm2\",\"zmm3\",\"zmm4\",\"zmm5\",\"zmm6\",\"zmm7\",\"zmm8\",\"zmm9\",\"zmm10\",\"zmm11\",\"zmm12\",\"zmm13\",\"zmm14\",\"zmm15\",\"zmm16\",\"zmm17\",\"zmm18\",\"zmm19\",\"zmm20\",\"zmm21\",\"zmm22\",\"zmm23\",\"zmm24\",\"zmm25\",\"zmm26\",\"zmm27\",\"zmm28\",\"zmm29\",\"zmm30\",\"zmm31\");\n"); } 
libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } libxsmm-1.17/src/generator_x86_instructions.h000066400000000000000000001046431415223013700214020ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Greg Henry (Intel Corp.) ******************************************************************************/ #ifndef GENERATOR_X86_INSTRUCTIONS_H #define GENERATOR_X86_INSTRUCTIONS_H #include "generator_common.h" /** * Opens the inline assembly section / jit stream * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_gp_reg_mapping gp register mapping for initialization * @param i_prefetch prefetch mode which may result in additional gp reg inits */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_open_stream( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, unsigned int i_prefetch ); /** * Closes the inline assembly section / jit stream * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_gp_reg_mapping gp register mapping for clobbering * @param i_prefetch prefetch mode which may result in additional gp reg clobbers */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_close_stream( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, unsigned int i_prefetch ); /** * Generates vmaskmovps/vmaskmovpd with displacements for loads and stores. 
* Only works with i_vector_name='Y' */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_mask_move( libxsmm_generated_code* io_generated_code, const unsigned int i_vmove_instr, const unsigned int i_gp_reg_base, const unsigned int i_gp_reg_idx, const unsigned int i_scale, const int i_displacement, const char i_vector_name, const unsigned int i_vec_reg_number_0, const unsigned int i_vec_reg_mask_0, const unsigned int i_is_store ); /** * Generates vmovapd/vmovupd/vmovaps/vmovups/vmovsd/vmovss/vbroadcastsd/vbroastcastss/vmovddup instructions with displacements, explicit SIB addressing is not * supported by this function * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_instruction_set requested instruction set to encode * @param i_vmove_instr actual vmov variant * @param i_gp_reg_number the register number (rax=0,rcx=1,rdx=2,rbx=3,rsp=4,rbp=5,rsi=6,rdi=7,r8=8,r9=9,r10=10,r11=11,r12=12,r13=13,r14=14,r15=15) of the base address register * @param i_displacement the offset to the base address * @param i_vector_name the vector register name prefix (x, y or z) * @param i_vec_reg_number_0 the vector register number (xmm/ymm: 0-15, zmm: 0-31) * @param i_mask_reg_number the mask register to be used * @param i_use_zero_masking: 0: merge masking ; !=0: zero masking * @param i_is_store 0: load semantic, other: store semantic */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_move( libxsmm_generated_code* io_generated_code, const unsigned int i_instruction_set, const unsigned int i_vmove_instr, const unsigned int i_gp_reg_base, const unsigned int i_gp_reg_idx, const unsigned int i_scale, const int i_displacement, const char i_vector_name, const unsigned int i_vec_reg_number_0, const unsigned int i_mask_reg_number, const unsigned int i_use_zero_masking, const unsigned int i_is_store ); /** * Generates (v)XYZpd/(v)XYZps/(v)XYZsd/(v)XYZss instructions with 2 or 3 vector registers, memory operands are not supported as first operand * * 
@param io_generated_code pointer to the pointer of the generated code structure * @param i_instruction_set requested instruction set to encode * @param i_vec_instr actual operation variant * @param i_vector_name the vector register name prefix (x,y or z) * @param i_vec_reg_number_0 the first vector register number (xmm/ymm: 0-15, zmm: 0-31) * @param i_vec_reg_number_1 the second vector register number (xmm/ymm: 0-15, zmm: 0-31) * @param i_vec_reg_number_2 the third vector register number (xmm/ymm: 0-15, zmm: 0-31), if this operand equals LIBXSMM_X86_VEC_REG_UNDEF -> SSE3 code generation */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_compute_reg( libxsmm_generated_code* io_generated_code, const unsigned int i_instruction_set, const unsigned int i_vec_instr, const char i_vector_name, const unsigned int i_vec_reg_number_0, const unsigned int i_vec_reg_number_1, const unsigned int i_vec_reg_number_2 ); /** * Generates (v)XYZpd/(v)XYZps/(v)XYZsd/(v)XYZss convert instructions with 2 vector registers, memory operands are not supported as first operand * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_instruction_set requested instruction set to encode * @param i_vec_instr actual operation variant * @param i_vector_name the vector register name prefix (x,y or z) * @param i_vec_reg_src_0 the first source vector register number (xmm/ymm: 0-15, zmm: 0-31) * @param i_vec_reg_src_1 the second source vector register number (xmm/ymm: 0-15, zmm: 0-31) * @param i_vec_reg_dst the destination vector register number (xmm/ymm: 0-15, zmm: 0-31) * @param i_shuffle_operand is an immediate (only looked at when needed) */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_compute_convert ( libxsmm_generated_code* io_generated_code, const unsigned int i_instruction_set, const unsigned int i_vec_instr, const char i_vector_name, const unsigned int i_vec_reg_src_0, const unsigned int i_vec_reg_src_1, const unsigned int i_vec_reg_dst, const 
unsigned int i_shuffle_operand ); /** * Generates (v)XYZpd/(v)XYZps/(v)XYZsd/(v)XYZss instructions with 3 vector registers and masking * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_instruction_set requested instruction set to encode * @param i_vec_instr actual operation variant * @param i_vector_name the vector register name prefix (z) * @param i_vec_reg_number_0 the first vector register number (zmm: 0-31) * @param i_vec_reg_number_1 the second vector register number (zmm: 0-31) * @param i_vec_reg_number_3 the second vector register number (zmm: 0-31) * @param i_immediate immediate just as the compare value for a compare instruction * @param i_mask_reg_number the mask register to read/write * @param i_use_zero_masking 0: merge masking, !=0 zero masking */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_compute_reg_mask( libxsmm_generated_code* io_generated_code, const unsigned int i_instruction_set, const unsigned int i_vec_instr, const char i_vector_name, const unsigned int i_vec_reg_number_0, const unsigned int i_vec_reg_number_1, const unsigned int i_vec_reg_number_2, const unsigned int i_immediate, const unsigned int i_mask_reg_number, const unsigned int i_use_zero_masking ); /** * @param i_instruction_set requested instruction set to encode * @param i_vec_instr actual operation variant * @param i_vector_name the vector register name prefix (z) * @param i_vec_reg_number_0 the first vector register number (zmm: 0-31) * @param i_vec_reg_number_1 the second vector register number (zmm: 0-31) * @param i_vec_reg_number_2 the third vector register number (zmm: 0-31) * @param i_mask_reg_number the mask register (0-7) */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_compute_mem( libxsmm_generated_code* io_generated_code, const unsigned int i_instruction_set, const unsigned int i_vec_instr, const unsigned int i_use_broadcast, const unsigned int i_gp_reg_base, const unsigned int i_gp_reg_idx, const unsigned int 
i_scale, const int i_displacement, const char i_vector_name, const unsigned int i_vec_reg_number_0, const unsigned int i_vec_reg_number_1 ); /** * Generates vector instructions which require an immediate and mask. immediate is optional. * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_instruction_set requested instruction set to encode * @param i_vec_instr actual operation variant * @param i_use_broadcast if != 0 memory operand is interpreted as a scalar and broadcasted in fused fashion, only supported on AVX512 * @param i_gp_reg_base base address register for memory broadcast * @param i_gp_reg_idx index register for memory broadcast, can be LIBXSMM_X86_GP_REG_UNDEF -> then regular displacement version is generated * @param i_scale scale of index register, ignored if i_gp_reg_idx is LIBXSMM_X86_GP_REG_UNDEF * @param i_displacement displacement to SIB address * @param i_vector_name the vector register name prefix (z) * @param i_vec_reg_number_0 the first vector register number (zmm: 0-31) * @param i_vec_reg_number_1 the second vector register number (zmm: 0-31) * @param i_immediate immediate just as the compare value for a compare instruction * @param i_mask_reg_number the mask register to read/write * @param i_use_zero_masking 0: merge masking; !=0: zero masking */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_compute_mem_mask( libxsmm_generated_code* io_generated_code, const unsigned int i_instruction_set, const unsigned int i_vec_instr, const unsigned int i_use_broadcast, const unsigned int i_gp_reg_base, const unsigned int i_gp_reg_idx, const unsigned int i_scale, const int i_displacement, const char i_vector_name, const unsigned int i_vec_reg_number_0, const unsigned int i_vec_reg_number_1, const unsigned int i_immediate, const unsigned int i_mask_reg_number, const unsigned int i_use_zero_masking ); /** * Generates quadmadd instructions added in Knights Mill * * @param io_generated_code pointer to the pointer 
of the generated code structure * @param i_instruction_set requested instruction set to encode * @param i_vec_instr actual operation variant * @param i_gp_reg_base base address register for memory broadcast * @param i_gp_reg_idx index register for memory broadcast, can be LIBXSMM_X86_GP_REG_UNDEF -> then regular displacement version is generated * @param i_scale scale of index register, ignored if i_gp_reg_idx is LIBXSMM_X86_GP_REG_UNDEF * @param i_displacement displacement to SIB address * @param i_vector_name the vector register name prefix (z) * @param i_vec_reg_number_src the second vector register number (zmm: 0-31), this define a implicit register range * @param i_vec_reg_number_dest the first vector register number (zmm: 0-31) */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_compute_qfma( libxsmm_generated_code* io_generated_code, const unsigned int i_instruction_set, const unsigned int i_vec_instr, const unsigned int i_gp_reg_base, const unsigned int i_gp_reg_idx, const unsigned int i_scale, const int i_displacement, const char i_vector_name, const unsigned int i_vec_reg_number_src, const unsigned int i_vec_reg_number_dest ); /** * Generates shuffle instructions with 2 or 3 vector registers, memory operands are not supported as first operand * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_instruction_set requested instruction set to encode * @param i_vec_instr actual operation variant * @param i_vector_name the vector register name prefix (x,y or z) * @param i_vec_reg_number_0 the first vector register number (xmm/ymm: 0-15, zmm: 0-31) * @param i_vec_reg_number_1 the second vector register number (xmm/ymm: 0-15, zmm: 0-31) * @param i_vec_reg_number_2 the third vector register number (xmm/ymm: 0-15, zmm: 0-31), if this operand equals LIBXSMM_X86_VEC_REG_UNDEF -> SSE3 code generation */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_shuffle_reg( libxsmm_generated_code* io_generated_code, const unsigned 
int i_instruction_set, const unsigned int i_vec_instr, const char i_vector_name, const unsigned int i_vec_reg_number_0, const unsigned int i_vec_reg_number_1, const unsigned int i_vec_reg_number_2, const unsigned int i_shuffle_operand ); /** * Generates shuffle instructions with 2 or 3 vector registers, memory operands are not supported as first operand * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_instruction_set requested instruction set to encode * @param i_vmove_instr actual operation variant (gather/scatter and single/double) * @param i_vector_name the vector register name prefix (x,y or z) * @param i_gp_reg_number the register number (rax=0,rcx=1,rdx=2,rbx=3,rsp=4,rbp=5,rsi=6,rdi=7,r8=8,r9=9,r10=10,r11=11,r12=12,r13=13,r14=14,r15=15) of the base address register * @param i_vec_reg_idx the index vector registers (ymm0-15 AVX2) (zmm0-zmm32 AVX512) * @param i_scale the scaling of the indexes in i_vec_reg_idx * @param i_displacement the offset to the base address * @param i_vec_reg_number the destination(gather)/source(scatter) vec register (xmm/ymm: 0-15, zmm: 0-31) * @param i_mask_reg_number the mask register (xmm/ymm: 0-15 when using AVX2), (k1-k7 when using AVX512) * @param i_is_gather "true" generate a gather instruction, "false" generator a scatter instruction */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_vec_move_gathscat( libxsmm_generated_code* io_generated_code, const unsigned int i_instruction_set, const unsigned int i_vmove_instr, const char i_vector_name, const unsigned int i_gp_reg_base, const unsigned int i_vec_reg_idx, const unsigned int i_scale, const int i_displacement, const unsigned int i_vec_reg_number, const unsigned int i_mask_reg_number, const unsigned int i_is_gather ); /* @TODO check if we can merge this alu_imm */ /** * Generates prefetch instructions with displacements, SIB addressing is not * supported by this function * * @param io_generated_code pointer to the pointer of the 
generated code structure * @param i_prefetch_instr actual prefetch variant * @param i_gp_reg_number the register number (rax=0,rcx=1,rdx=2,rbx=3,rsp=4,rbp=5,rsi=6,rdi=7,r8=8,r9=9,r10=10,r11=11,r12=12,r13=13,r14=14,r15=15) of the base address register * @param i_displacement the offset to the base address */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_prefetch( libxsmm_generated_code* io_generated_code, const unsigned int i_prefetch_instr, const unsigned int i_gp_reg_base, const unsigned int i_gp_reg_idx, const unsigned int i_scale, const int i_displacement ); /** * Generates alu memory movements like movq 7(%rax,%rbx,2), %rcx * Takes 3 gp_registers (0-15 values) * i_is_store tells whether this is a store or load */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_alu_mem( libxsmm_generated_code* io_generated_code, const unsigned int i_alu_instr, const unsigned int i_gp_reg_base, const unsigned int i_gp_reg_idx, const unsigned int i_scale, const int i_displacement, const unsigned int i_gp_reg_number, const unsigned int i_is_store ); /** * Generates regular all instructions with immediates * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_alu_instr actual alu gpr instruction * @param i_gp_reg_number the register number (rax=0,rcx=1,rdx=2,rbx=3,rsp=4,rbp=5,rsi=6,rdi=7,r8=8,r9=9,r10=10,r11=11,r12=12,r13=13,r14=14,r15=15) of the base address register * @param i_immediate the immediate operand */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_alu_imm( libxsmm_generated_code* io_generated_code, const unsigned int i_alu_instr, const unsigned int i_gp_reg_number, const long long i_immediate ); /** * Generates regular all instructions with immediates, 64bit * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_alu_instr actual alu gpr instruction * @param i_gp_reg_number the register number 
(rax=0,rcx=1,rdx=2,rbx=3,rsp=4,rbp=5,rsi=6,rdi=7,r8=8,r9=9,r10=10,r11=11,r12=12,r13=13,r14=14,r15=15) of the base address register * @param i_immediate the immediate operand */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_alu_imm_i64( libxsmm_generated_code* io_generated_code, const unsigned int i_alu_instr, const unsigned int i_gp_reg_number, const size_t i_immediate ); /** * Generates regular all instructions with immediates * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_alu_instr actual alu gpr instruction * @param i_gp_reg_number_src the source register number (rax=0,rcx=1,rdx=2,rbx=3,rsp=4,rbp=5,rsi=6,rdi=7,r8=8,r9=9,r10=10,r11=11,r12=12,r13=13,r14=14,r15=15) of the base address register * @param i_gp_reg_number_dest the destination register number (rax=0,rcx=1,rdx=2,rbx=3,rsp=4,rbp=5,rsi=6,rdi=7,r8=8,r9=9,r10=10,r11=11,r12=12,r13=13,r14=14,r15=15) of the base address register */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_alu_reg( libxsmm_generated_code* io_generated_code, const unsigned int i_alu_instr, const unsigned int i_gp_reg_number_src, const unsigned int i_gp_reg_number_dest); /** * Generates push to the stack for a GPR * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_gp_reg_number the source register number (rax=0,rcx=1,rdx=2,rbx=3,rsp=4,rbp=5,rsi=6,rdi=7,r8=8,r9=9,r10=10,r11=11,r12=12,r13=13,r14=14,r15=15) */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_push_reg( libxsmm_generated_code* io_generated_code, const unsigned int i_gp_reg_number ); /** * Generates pop from the stack for a GPR * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_gp_reg_number the source register number (rax=0,rcx=1,rdx=2,rbx=3,rsp=4,rbp=5,rsi=6,rdi=7,r8=8,r9=9,r10=10,r11=11,r12=12,r13=13,r14=14,r15=15) */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_pop_reg( libxsmm_generated_code* io_generated_code, const unsigned int 
i_gp_reg_number ); /** * Allows for mask move instructions in AVX512 * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_mask_instr actual mask move instruction * @param i_gp_reg_number the register number (rax=0,rcx=1,rdx=2,rbx=3,rsp=4,rbp=5,rsi=6,rdi=7,r8=8,r9=9,r10=10,r11=11,r12=12,r13=13,r14=14,r15=15) of the base address register * @param i_mask_reg_number the register number (k1=1...k7=7) * @param i_is_store indicates if we wnat to move the mask to gpr */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_mask_move( libxsmm_generated_code* io_generated_code, const unsigned int i_mask_instr, const unsigned int i_gp_reg_number, const unsigned int i_mask_reg_number, const unsigned int i_is_store ); /** * Allows for mask move instructions in AVX512 * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_mask_instr actual mask move instruction * @param i_gp_reg_base base address register for memory broadcast * @param i_gp_reg_idx index register for memory broadcast, can be LIBXSMM_X86_GP_REG_UNDEF -> then regular displacement version is generated * @param i_scale scale of index register, ignored if i_gp_reg_idx is LIBXSMM_X86_GP_REG_UNDEF * @param i_displacement displacement to SIB address * @param i_mask_reg_number the register number (k1=1...k7=7) * @param i_is_store indicates if we wnat to move the mask to gpr */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_mask_move_mem( libxsmm_generated_code* io_generated_code, const unsigned int i_mask_instr, const unsigned int i_gp_reg_base, const unsigned int i_gp_reg_idx, const unsigned int i_scale, const int i_displacement, const unsigned int i_mask_reg_number, const unsigned int i_is_store ); /** * Allows for mask move instructions in AVX512 * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_mask_instr actual mask compute instruction * @param i_mask_reg_number_src_0 the first operand register 
number (att syntax) (k1=1...k7=7) * @param i_mask_reg_number_src_1 the second operand register number (att syntax) (k1=1...k7=7) * @param i_mask_reg_number_dest the third operand register number (att syntax) (k1=1...k7=7) */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_mask_compute_reg( libxsmm_generated_code* io_generated_code, const unsigned int i_mask_instr, const unsigned int i_mask_reg_number_src_0, const unsigned int i_mask_reg_number_src_1, const unsigned int i_mask_reg_number_dest ); /** * Generates a label to which one can jump back and pushes it on the loop label stack * * @param io_generated_code pointer to the pointer of the generated code structure * @param io_loop_label_tracker data structure to handle loop labels, nested loops are supported, but not overlapping loops */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_register_jump_back_label( libxsmm_generated_code* io_generated_code, libxsmm_loop_label_tracker* io_loop_label_tracker ); /** * Pops the latest from the loop label stack and jumps there * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_jmp_instr the particular jump instruction used * @param io_loop_label_tracker data structure to handle loop labels will jump to latest registered label */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_jump_back_to_label( libxsmm_generated_code* io_generated_code, const unsigned int i_jmp_instr, libxsmm_loop_label_tracker* io_loop_label_tracker ); /** * Generates a label to which one can jump back and pushes it on the loop label stack * * @param io_generated_code pointer to the pointer of the generated code structure * @parma i_labal_no position in the jump label tracker to set * @param io_jump_forward_label_tracker forward jump tracker structure for tracking the jump addresses/labels */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_register_jump_label( libxsmm_generated_code* io_generated_code, const unsigned int i_label_no, libxsmm_jump_label_tracker* 
io_jump_label_tracker ); /** * Jumps to the address/label stored a specific position * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_jmp_instr the particular jump instruction used * @param i_label_no position in the jump label tracker to jump to * @param io_jump_label_tracker data structures that tracks arbitrary jump labels */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_jump_to_label( libxsmm_generated_code* io_generated_code, const unsigned int i_jmp_instr, const unsigned int i_label_no, libxsmm_jump_label_tracker* io_jump_label_tracker ); /** * Generates an insertion of constants into the code stream and loads them into * into a vector register * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_data pointer to an array of bytes that should be loaded, length needs to match registerlength specified in i_vector_name (x=16, y=32, z=64) * @param i_id global identifier of constants to load. * @param i_vector_name the vector register name prefix (x,y or z) * @param i_vec_reg_number the destination(gather)/source(scatter) vec register (xmm/ymm: 0-15, zmm: 0-31) */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_full_vec_load_of_constants ( libxsmm_generated_code *io_generated_code, const unsigned char *i_data, const char *i_id, const char i_vector_name, const unsigned int i_vec_reg_number ); /** * Generates a sequence to load function arguments from the stack (arguments ) * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_arg_number the number of an argument which was passed on the stack * @param i_gp_reg_number the destination register number (rax=0,rcx=1,rdx=2,rbx=3,rsp=4,rbp=5,rsi=6,rdi=7,r8=8,r9=9,r10=10,r11=11,r12=12,r13=13,r14=14,r15=15) */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_load_arg_to_reg( libxsmm_generated_code* io_generated_code, const unsigned int i_arg_number, const unsigned int i_gp_reg_number ); /** * 
@TODO: clean-up * Opens the inline assembly section / jit stream for matcopy, this is hacked and should be cleaned up * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_arch architecture code was generated for (needed to build clobber) */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_open_stream_matcopy( libxsmm_generated_code* io_generated_code, const unsigned int i_gp_reg_a, const unsigned int i_gp_reg_lda, const unsigned int i_gp_reg_b, const unsigned int i_gp_reg_ldb, const unsigned int i_gp_reg_a_pf, const unsigned int i_gp_reg_b_pf, const char* i_arch ); /** * @TODO: clean-up * Closes the inline assembly section / jit stream for matcopy, this is hacked and should be cleaned up * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_arch architecture code was generated for (needed to build clobber) */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_close_stream_matcopy( libxsmm_generated_code* io_generated_code, const char* i_arch ); LIBXSMM_API_INTERN void libxsmm_x86_instruction_open_stream_mateltwise( libxsmm_generated_code* io_generated_code, const unsigned int i_gp_struct_params, const char* i_arch ); LIBXSMM_API_INTERN void libxsmm_x86_instruction_close_stream_mateltwise( libxsmm_generated_code* io_generated_code, const char* i_arch ); /** * @TODO: clean-up * Opens the inline assembly section / jit stream for transposes, this is hacked and should be cleaned up * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_arch architecture code was generated for (needed to build clobber) */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_open_stream_transpose( libxsmm_generated_code* io_generated_code, const unsigned int i_gp_reg_a, const unsigned int i_gp_reg_lda, const unsigned int i_gp_reg_b, const unsigned int i_gp_reg_ldb, const char* i_arch ); /** * @TODO: clean-up * Closes the inline assembly section / jit stream for transposes, 
this is hacked and should be cleaned up * * @param io_generated_code pointer to the pointer of the generated code structure * @param i_arch architecture code was generated for (needed to build clobber) */ LIBXSMM_API_INTERN void libxsmm_x86_instruction_close_stream_transpose( libxsmm_generated_code* io_generated_code, const char* i_arch ); #endif /* GENERATOR_X86_INSTRUCTIONS_H */ libxsmm-1.17/src/libxsmm_blocked_gemm.c000066400000000000000000000507301415223013700202160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.), Dheevatsa Mudigere (Intel Corp.) Alexander Heinecke (Intel Corp.), Hans Pabst (Intel Corp.) ******************************************************************************/ #include "libxsmm_blocked_gemm_types.h" #include LIBXSMM_API libxsmm_blocked_gemm_handle* libxsmm_blocked_gemm_handle_create(/*unsigned*/ int nthreads, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* bm, const libxsmm_blasint* bn, const libxsmm_blasint* bk, const libxsmm_blasint* b_m1, const libxsmm_blasint* b_n1, const libxsmm_blasint* b_k1, const libxsmm_blasint* b_k2, const void* alpha, const void* beta, const int* gemm_flags, const libxsmm_gemm_prefetch_type* prefetch, const libxsmm_blocked_gemm_order* order) { const char *const env_m = getenv("LIBXSMM_BLOCKED_GEMM_M"), *const env_n = getenv("LIBXSMM_BLOCKED_GEMM_N"), *const env_k = getenv("LIBXSMM_BLOCKED_GEMM_K"); const libxsmm_blasint mm = LIBXSMM_MIN(0 == bm ? ((NULL == env_m || 0 == *env_m) ? 
32 : atoi(env_m)) : *bm, m); const libxsmm_blasint kk = LIBXSMM_MIN(0 == bk ? ((NULL == env_k || 0 == *env_k) ? mm : atoi(env_k)) : *bk, k); const libxsmm_blasint nn = LIBXSMM_MIN(0 == bn ? ((NULL == env_n || 0 == *env_n) ? kk : atoi(env_n)) : *bn, n); libxsmm_blocked_gemm_handle* result = 0; static int error_once = 0; if (0 < m && 0 < n && 0 < k && 0 < mm && 0 < nn && 0 < kk && 0 < nthreads) { libxsmm_blocked_gemm_handle handle; memset(&handle, 0, sizeof(handle)); if (0 == (m % mm) && 0 == (n % nn) && 0 == (k % kk) && 0 == (m % *b_m1) && 0 == (n % *b_n1) && 0 == (k % *b_k1) && 0 == ((k / *b_k1 / *b_k2) % kk) && 0 == ((n / *b_n1) % nn) && 0 == ((m / *b_m1) % mm)) { /* check for valid block-size */ libxsmm_gemm_descriptor* desc; libxsmm_descriptor_blob blob; if (0 == prefetch) { /* auto-prefetch */ /* TODO: more sophisticated strategy perhaps according to CPUID */ const libxsmm_gemm_prefetch_type prefetch_default = LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C; const char *const env_p = getenv("LIBXSMM_BLOCKED_GEMM_PREFETCH"); desc = libxsmm_gemm_descriptor_init2(&blob, iprec, oprec, mm, nn, kk, mm/*lda*/, kk/*ldb*/, mm/*ldc*/, alpha, beta, 0 == gemm_flags ? LIBXSMM_GEMM_FLAG_NONE : *gemm_flags, (NULL == env_p || 0 == *env_p) ? prefetch_default : libxsmm_gemm_uid2prefetch(atoi(env_p))); } else { /* user-defined */ desc = libxsmm_gemm_descriptor_init2(&blob, iprec, oprec, mm, nn, kk, mm/*lda*/, kk/*ldb*/, mm/*ldc*/, alpha, beta, 0 == gemm_flags ? 
LIBXSMM_GEMM_FLAG_NONE : *gemm_flags, *prefetch); } if (0 != desc) { handle.mb = m / mm; handle.nb = n / nn; handle.kb = k / kk; if (LIBXSMM_GEMM_PREFETCH_NONE != desc->prefetch) { handle.kernel_pf = libxsmm_xmmdispatch(desc); desc->prefetch = LIBXSMM_GEMM_PREFETCH_NONE; handle.kernel = libxsmm_xmmdispatch(desc); } else { /* no prefetch */ handle.kernel = libxsmm_xmmdispatch(desc); handle.kernel_pf.xmm = 0; } } if (0 != handle.kernel.xmm) { const size_t tls_size = LIBXSMM_UP2((size_t)mm * nn * LIBXSMM_TYPESIZE(oprec), LIBXSMM_CACHELINE) * nthreads; const size_t size_locks = (size_t)handle.mb * (size_t)handle.nb * sizeof(libxsmm_blocked_gemm_lock); handle.locks = (libxsmm_blocked_gemm_lock*)libxsmm_aligned_malloc(size_locks, LIBXSMM_CACHELINE); handle.buffer = libxsmm_aligned_malloc(tls_size, LIBXSMM_CACHELINE); result = (libxsmm_blocked_gemm_handle*)malloc(sizeof(libxsmm_blocked_gemm_handle)); if (224 <= nthreads #if !defined(__MIC__) && LIBXSMM_X86_AVX512_MIC <= libxsmm_target_archid && LIBXSMM_X86_AVX512_CORE > libxsmm_target_archid #endif ) { handle.barrier = libxsmm_barrier_create(nthreads / 4, 4); } else { handle.barrier = libxsmm_barrier_create(nthreads / 2, 2); } if (0 != result && 0 != handle.barrier && 0 != handle.buffer && 0 != handle.locks) { handle.m = m; handle.n = n; handle.k = k; handle.bm = mm; handle.bn = nn; handle.bk = kk; handle.b_m1 = *b_m1; handle.b_n1 = *b_n1; handle.b_k1 = *b_k1; handle.b_k2 = *b_k2; handle.iprec = iprec; handle.oprec = oprec; memset(handle.locks, 0, size_locks); handle.order = (0 == order ? 
LIBXSMM_BLOCKED_GEMM_ORDER_JIK : *order); handle.nthreads = nthreads; *result = handle; } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM handle allocation failed!\n"); } libxsmm_barrier_release(handle.barrier); libxsmm_free(handle.buffer); libxsmm_free(handle.locks); free(result); result = 0; } } else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: unsupported BGEMM kernel requested!\n"); } } else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM block-size is invalid!\n"); } } else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_blocked_gemm_handle_create!\n"); } return result; } LIBXSMM_API void libxsmm_blocked_gemm_handle_destroy(const libxsmm_blocked_gemm_handle* handle) { if (0 != handle) { libxsmm_barrier_release(handle->barrier); libxsmm_free(handle->buffer); libxsmm_free(handle->locks); free((libxsmm_blocked_gemm_handle*)handle); } } LIBXSMM_API int libxsmm_blocked_gemm_copyin_a(const libxsmm_blocked_gemm_handle* handle, const void* src, const libxsmm_blasint* ld, void* dst) { int result = EXIT_SUCCESS; static int error_once = 0; if (0 != handle) { #if 0 /* TODO: support leading dimension for the source buffer */ const libxsmm_blasint ild = (0 == ld ? 
handle->m : *ld); assert(ild >= handle->m); #else LIBXSMM_UNUSED(ld); #endif switch (handle->iprec) { case LIBXSMM_GEMM_PRECISION_F64: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE double # include "template/libxsmm_blocked_gemm_copyin_a.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; case LIBXSMM_GEMM_PRECISION_F32: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE float # include "template/libxsmm_blocked_gemm_copyin_a.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; case LIBXSMM_GEMM_PRECISION_I16: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE short # include "template/libxsmm_blocked_gemm_copyin_a.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; default: { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM precision of matrix A is not supported!\n"); } result = EXIT_FAILURE; } } } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM-handle cannot be NULL!\n"); } result = EXIT_FAILURE; } return result; } LIBXSMM_API int libxsmm_blocked_gemm_copyin_b(const libxsmm_blocked_gemm_handle* handle, const void* src, const libxsmm_blasint* ld, void* dst) { int result = EXIT_SUCCESS; static int error_once = 0; if (0 != handle) { #if 0 /* TODO: support leading dimension for the source buffer */ const libxsmm_blasint ild = (0 == ld ? 
handle->k : *ld); assert(ild >= handle->k); #else LIBXSMM_UNUSED(ld); #endif switch (handle->iprec) { case LIBXSMM_GEMM_PRECISION_F64: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE double # include "template/libxsmm_blocked_gemm_copyin_b.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; case LIBXSMM_GEMM_PRECISION_F32: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE float # include "template/libxsmm_blocked_gemm_copyin_b.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; case LIBXSMM_GEMM_PRECISION_I16: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE short # include "template/libxsmm_blocked_gemm_copyin_b.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; default: { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM precision of matrix B is not supported!\n"); } result = EXIT_FAILURE; } } } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM-handle cannot be NULL!\n"); } result = EXIT_FAILURE; } return result; } LIBXSMM_API int libxsmm_blocked_gemm_copyin_c(const libxsmm_blocked_gemm_handle* handle, const void* src, const libxsmm_blasint* ld, void* dst) { int result = EXIT_SUCCESS; static int error_once = 0; if (0 != handle) { #if 0 /* TODO: support leading dimension for the source buffer */ const libxsmm_blasint ild = (0 == ld ? 
handle->m : *ld); assert(ild >= handle->m); #else LIBXSMM_UNUSED(ld); #endif switch (handle->oprec) { case LIBXSMM_GEMM_PRECISION_F64: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE double # include "template/libxsmm_blocked_gemm_copyin_c.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; case LIBXSMM_GEMM_PRECISION_F32: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE float # include "template/libxsmm_blocked_gemm_copyin_c.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; case LIBXSMM_GEMM_PRECISION_I16: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE int # include "template/libxsmm_blocked_gemm_copyin_c.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; default: { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM precision of matrix A is not supported!\n"); } result = EXIT_FAILURE; } } } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM-handle cannot be NULL!\n"); } result = EXIT_FAILURE; } return result; } LIBXSMM_API int libxsmm_blocked_gemm_copyout_c(const libxsmm_blocked_gemm_handle* handle, const void* src, const libxsmm_blasint* ld, void* dst) { int result = EXIT_SUCCESS; static int error_once = 0; if (0 != handle) { #if 0 /* TODO: support leading dimension for the source buffer */ const libxsmm_blasint ild = (0 == ld ? 
handle->m : *ld); assert(ild >= handle->m); #else LIBXSMM_UNUSED(ld); #endif switch (handle->oprec) { case LIBXSMM_GEMM_PRECISION_F64: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE double # include "template/libxsmm_blocked_gemm_copyout_c.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; case LIBXSMM_GEMM_PRECISION_F32: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE float # include "template/libxsmm_blocked_gemm_copyout_c.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; case LIBXSMM_GEMM_PRECISION_I16: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE int # include "template/libxsmm_blocked_gemm_copyout_c.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; default: { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM precision of matrix A is not supported!\n"); } result = EXIT_FAILURE; } } } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM-handle cannot be NULL!\n"); } result = EXIT_FAILURE; } return result; } LIBXSMM_API int libxsmm_blocked_gemm_convert_b_to_a(const libxsmm_blocked_gemm_handle* handle, const void* src, const libxsmm_blasint* ld, void* dst) { int result = EXIT_SUCCESS; static int error_once = 0; if (0 != handle) { #if 0 /* TODO: support leading dimension for the source buffer */ const libxsmm_blasint ild = (0 == ld ? 
handle->k : *ld); assert(ild >= handle->k); #else LIBXSMM_UNUSED(ld); #endif switch (handle->iprec) { case LIBXSMM_GEMM_PRECISION_F64: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE double # include "template/libxsmm_blocked_gemm_convert_b_to_a.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; case LIBXSMM_GEMM_PRECISION_F32: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE float # include "template/libxsmm_blocked_gemm_convert_b_to_a.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; case LIBXSMM_GEMM_PRECISION_I16: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE short # include "template/libxsmm_blocked_gemm_convert_b_to_a.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; default: { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM precision of matrix B is not supported!\n"); } result = EXIT_FAILURE; } } } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM-handle cannot be NULL!\n"); } result = EXIT_FAILURE; } return result; } LIBXSMM_API int libxsmm_blocked_gemm_transpose_b(const libxsmm_blocked_gemm_handle* handle, const void* src, const libxsmm_blasint* ld, void* dst) { int result = EXIT_SUCCESS; static int error_once = 0; if (0 != handle) { #if 0 /* TODO: support leading dimension for the source buffer */ const libxsmm_blasint ild = (0 == ld ? 
handle->k : *ld); assert(ild >= handle->k); #else LIBXSMM_UNUSED(ld); #endif switch (handle->iprec) { case LIBXSMM_GEMM_PRECISION_F64: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE double # include "template/libxsmm_blocked_gemm_transpose_b.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; case LIBXSMM_GEMM_PRECISION_F32: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE float # include "template/libxsmm_blocked_gemm_transpose_b.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; case LIBXSMM_GEMM_PRECISION_I16: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE short # include "template/libxsmm_blocked_gemm_transpose_b.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE } break; default: { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM precision of matrix B is not supported!\n"); } result = EXIT_FAILURE; } } } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM-handle cannot be NULL!\n"); } result = EXIT_FAILURE; } return result; } LIBXSMM_API_INLINE void internal_bgemm_order(libxsmm_blocked_gemm_order order, libxsmm_blasint w_i, libxsmm_blasint nw_i, libxsmm_blasint nw_j, libxsmm_blasint nw_k, libxsmm_blasint* i2, libxsmm_blasint* j2, libxsmm_blasint* k2) { switch (order) { case LIBXSMM_BLOCKED_GEMM_ORDER_JIK: { *j2 = (w_i / (nw_i * nw_k)); *i2 = (w_i - (*j2) * (nw_i * nw_k)) / nw_k; *k2 = (w_i % nw_k); } break; case LIBXSMM_BLOCKED_GEMM_ORDER_IJK: { *i2 = (w_i / (nw_j * nw_k)); *j2 = (w_i - (*i2) * (nw_j * nw_k)) / nw_k; *k2 = (w_i % nw_k); } break; case LIBXSMM_BLOCKED_GEMM_ORDER_JKI: { *j2 = (w_i / (nw_k * nw_i)); *k2 = (w_i - (*j2) * (nw_k * nw_i)) / nw_i; *i2 = (w_i % nw_i); } break; case LIBXSMM_BLOCKED_GEMM_ORDER_IKJ: { *i2 = (w_i / (nw_k * nw_j)); *k2 = (w_i - (*i2) * (nw_k * 
nw_j)) / nw_j; *j2 = (w_i % nw_j); } break; case LIBXSMM_BLOCKED_GEMM_ORDER_KJI: { *k2 = (w_i / (nw_j * nw_i)); *j2 = (w_i - (*k2) * (nw_j * nw_i)) / nw_i; *i2 = (w_i % nw_i); } break; case LIBXSMM_BLOCKED_GEMM_ORDER_KIJ: { *k2 = (w_i / (nw_i * nw_j)); *i2 = (w_i - (*k2) * (nw_i * nw_j)) / nw_j; *j2 = (w_i % nw_j); } break; default: assert(0/*should never happen*/); } } LIBXSMM_API void libxsmm_blocked_gemm_st(const libxsmm_blocked_gemm_handle* handle, const void* a, const void* b, void* c, /*unsigned*/int start_thread, /*unsigned*/int tid) { static int error_once = 0; #if defined(LIBXSMM_BLOCKED_GEMM_CHECKS) if (0 != handle && 0 != a && 0 != b && 0 != c && start_thread <= tid && 0 <= tid) #endif { const int ltid = tid - start_thread; if (handle->nthreads > 1) { libxsmm_barrier_init(handle->barrier, ltid); } switch (handle->iprec) { case LIBXSMM_GEMM_PRECISION_F64: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB double # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C double # include "template/libxsmm_blocked_gemm.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C } break; case LIBXSMM_GEMM_PRECISION_F32: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB float # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C float # include "template/libxsmm_blocked_gemm.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C } break; case LIBXSMM_GEMM_PRECISION_I16: { # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB short # define LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C int # include "template/libxsmm_blocked_gemm.tpl.c" # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C # undef LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB } break; default: if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: BGEMM precision is not supported!\n"); } } if (handle->nthreads > 1) { 
libxsmm_barrier_wait(handle->barrier, ltid); } } #if defined(LIBXSMM_BLOCKED_GEMM_CHECKS) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_blocked_gemm!\n"); } #endif } libxsmm-1.17/src/libxsmm_blocked_gemm_types.h000066400000000000000000000031721415223013700214450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_BLOCKED_GEMM_TYPES_H #define LIBXSMM_BLOCKED_GEMM_TYPES_H #include "libxsmm_gemm.h" #if !defined(LIBXSMM_BLOCKED_GEMM_CHECKS) && !defined(NDEBUG) # define LIBXSMM_BLOCKED_GEMM_CHECKS #endif LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_blocked_gemm_lock { char pad[LIBXSMM_CACHELINE]; volatile LIBXSMM_ATOMIC_LOCKTYPE state; } libxsmm_blocked_gemm_lock; LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_blocked_gemm_handle { union { double d; float s; int w; } alpha, beta; libxsmm_gemm_precision iprec, oprec; libxsmm_xmmfunction kernel_pf; libxsmm_xmmfunction kernel; libxsmm_barrier* barrier; libxsmm_blocked_gemm_lock* locks; libxsmm_blocked_gemm_order order; libxsmm_blasint m, n, k, bm, bn, bk; libxsmm_blasint b_m1, b_n1, b_k1, b_k2; libxsmm_blasint mb, nb, kb; void* buffer; int nthreads; }; #endif /*LIBXSMM_BLOCKED_GEMM_TYPES_H*/ 
libxsmm-1.17/src/libxsmm_cpuid_x86.c000066400000000000000000000260641415223013700174220ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include #include #if defined(LIBXSMM_PLATFORM_SUPPORTED) /* XGETBV: receive results (EAX, EDX) for eXtended Control Register (XCR). */ /* CPUID, receive results (EAX, EBX, ECX, EDX) for requested FUNCTION/SUBFN. */ #if defined(_MSC_VER) /*defined(_WIN32) && !defined(__GNUC__)*/ # define LIBXSMM_XGETBV(XCR, EAX, EDX) { \ unsigned long long libxsmm_xgetbv_ = _xgetbv(XCR); \ EAX = (int)libxsmm_xgetbv_; \ EDX = (int)(libxsmm_xgetbv_ >> 32); \ } # define LIBXSMM_CPUID_X86(FUNCTION, SUBFN, EAX, EBX, ECX, EDX) { \ int libxsmm_cpuid_x86_[/*4*/] = { 0, 0, 0, 0 }; \ __cpuidex(libxsmm_cpuid_x86_, FUNCTION, SUBFN); \ EAX = (unsigned int)libxsmm_cpuid_x86_[0]; \ EBX = (unsigned int)libxsmm_cpuid_x86_[1]; \ ECX = (unsigned int)libxsmm_cpuid_x86_[2]; \ EDX = (unsigned int)libxsmm_cpuid_x86_[3]; \ } # elif defined(__GNUC__) || !defined(_CRAYC) # if (64 > (LIBXSMM_BITS)) LIBXSMM_EXTERN LIBXSMM_RETARGETABLE int __get_cpuid( /* prototype */ unsigned int, unsigned int*, unsigned int*, unsigned int*, unsigned int*); # define LIBXSMM_XGETBV(XCR, EAX, EDX) EAX = (EDX) = 0xFFFFFFFF # define LIBXSMM_CPUID_X86(FUNCTION, SUBFN, EAX, EBX, ECX, EDX) \ EAX = (EBX) = (EDX) = 0; ECX = (SUBFN); \ __get_cpuid(FUNCTION, &(EAX), &(EBX), &(ECX), &(EDX)) # else /* 64-bit */ # define LIBXSMM_XGETBV(XCR, EAX, EDX) __asm__ __volatile__( \ ".byte 0x0f, 0x01, 
0xd0" /*xgetbv*/ : "=a"(EAX), "=d"(EDX) : "c"(XCR) \ ) # define LIBXSMM_CPUID_X86(FUNCTION, SUBFN, EAX, EBX, ECX, EDX) \ __asm__ __volatile__ (".byte 0x0f, 0xa2" /*cpuid*/ \ : "=a"(EAX), "=b"(EBX), "=c"(ECX), "=d"(EDX) \ : "a"(FUNCTION), "b"(0), "c"(SUBFN), "d"(0) \ ) # endif # else /* legacy Cray Compiler */ # define LIBXSMM_XGETBV(XCR, EAX, EDX) EAX = (EDX) = 0 # define LIBXSMM_CPUID_X86(FUNCTION, SUBFN, EAX, EBX, ECX, EDX) EAX = (EBX) = (ECX) = (EDX) = 0 # endif #endif #define LIBXSMM_CPUID_CHECK(VALUE, CHECK) ((CHECK) == ((CHECK) & (VALUE))) LIBXSMM_API int libxsmm_cpuid_x86(libxsmm_cpuid_x86_info* info) { static int result = LIBXSMM_TARGET_ARCH_UNKNOWN; #if !defined(LIBXSMM_PLATFORM_SUPPORTED) if (NULL != info) LIBXSMM_MEMZERO127(info); #else unsigned int eax, ebx, ecx, edx; LIBXSMM_CPUID_X86(0, 0/*ecx*/, eax, ebx, ecx, edx); if (1 <= eax) { /* CPUID max. leaf */ /* avoid redetecting features but redetect on request (info given) */ if (LIBXSMM_TARGET_ARCH_UNKNOWN == result || NULL != info) { int feature_cpu = LIBXSMM_X86_GENERIC, feature_os = LIBXSMM_X86_GENERIC, has_context = 0; unsigned int maxleaf = eax; # if defined(__linux__) if (0 == libxsmm_se) { FILE *const selinux = fopen("/sys/fs/selinux/enforce", "rb"); if (NULL != selinux) { if (1 == fread(&libxsmm_se, 1/*sizeof(char)*/, 1/*count*/, selinux)) { libxsmm_se = ('0' != libxsmm_se ? 1 : 0); } else { /* conservative assumption in case of read-error */ libxsmm_se = 1; } fclose(selinux); } } # endif LIBXSMM_CPUID_X86(1, 0/*ecx*/, eax, ebx, ecx, edx); /* Check for CRC32 (this is not a proper test for SSE 4.2 as a whole!) 
*/ if (LIBXSMM_CPUID_CHECK(ecx, 0x00100000)) { if (LIBXSMM_CPUID_CHECK(ecx, 0x10000000)) { /* AVX(0x10000000) */ if (LIBXSMM_CPUID_CHECK(ecx, 0x00001000)) { /* FMA(0x00001000) */ unsigned int ecx2; LIBXSMM_CPUID_X86(7, 0/*ecx*/, eax, ebx, ecx2, edx); /* AVX512F(0x00010000), AVX512CD(0x10000000) */ if (LIBXSMM_CPUID_CHECK(ebx, 0x10010000)) { /* Common */ /* AVX512DQ(0x00020000), AVX512BW(0x40000000), AVX512VL(0x80000000) */ if (LIBXSMM_CPUID_CHECK(ebx, 0xC0020000)) { /* AVX512-Core */ if (LIBXSMM_CPUID_CHECK(ecx2, 0x00000800)) { /* VNNI */ # if 0 /* no check required yet */ unsigned int ecx3; LIBXSMM_CPUID_X86(7, 1/*ecx*/, eax, ebx, ecx3, edx); # else LIBXSMM_CPUID_X86(7, 1/*ecx*/, eax, ebx, ecx2, edx); # endif if (LIBXSMM_CPUID_CHECK(eax, 0x00000020)) { /* BF16 */ feature_cpu = LIBXSMM_X86_AVX512_CPX; } else feature_cpu = LIBXSMM_X86_AVX512_CLX; /* CLX */ } else feature_cpu = LIBXSMM_X86_AVX512_CORE; /* SKX */ } /* AVX512PF(0x04000000), AVX512ER(0x08000000) */ else if (LIBXSMM_CPUID_CHECK(ebx, 0x0C000000)) { /* AVX512-MIC */ if (LIBXSMM_CPUID_CHECK(edx, 0x0000000C)) { /* KNM */ feature_cpu = LIBXSMM_X86_AVX512_KNM; } else feature_cpu = LIBXSMM_X86_AVX512_MIC; /* KNL */ } else feature_cpu = LIBXSMM_X86_AVX512; /* AVX512-Common */ } else feature_cpu = LIBXSMM_X86_AVX2; } else feature_cpu = LIBXSMM_X86_AVX; } else feature_cpu = LIBXSMM_X86_SSE4; } # if !defined(LIBXSMM_INTRINSICS_DEBUG) LIBXSMM_ASSERT_MSG(LIBXSMM_STATIC_TARGET_ARCH <= LIBXSMM_MAX(LIBXSMM_X86_SSE3, feature_cpu), /* TODO: confirm SSE3 */"missed detecting ISA extensions"); /* coverity[dead_error_line] */ if (LIBXSMM_STATIC_TARGET_ARCH > feature_cpu) feature_cpu = LIBXSMM_STATIC_TARGET_ARCH; # endif /* XSAVE/XGETBV(0x04000000), OSXSAVE(0x08000000) */ if (LIBXSMM_CPUID_CHECK(ecx, 0x0C000000)) { /* OS SSE support */ feature_os = LIBXSMM_MIN(LIBXSMM_X86_SSE4, feature_cpu); if (LIBXSMM_X86_AVX <= feature_cpu) { LIBXSMM_XGETBV(0, eax, edx); if (LIBXSMM_CPUID_CHECK(eax, 0x00000006)) { /* OS XSAVE 256-bit */ 
feature_os = LIBXSMM_MIN(LIBXSMM_X86_AVX2, feature_cpu); if (LIBXSMM_X86_AVX512 <= feature_cpu && 7 <= maxleaf && LIBXSMM_CPUID_CHECK(eax, 0x000000E0)) /* OS XSAVE 512-bit */ { feature_os = feature_cpu; /* unlimited */ } } } } else if (LIBXSMM_X86_GENERIC <= feature_cpu) { /* assume FXSAVE, which should be fine * 16 years after the first x86_64 OS */ feature_os = LIBXSMM_X86_SSE4; } else feature_os = LIBXSMM_TARGET_ARCH_GENERIC; has_context = (LIBXSMM_STATIC_TARGET_ARCH >= feature_cpu || feature_os >= feature_cpu) ? 1 : 0; if (LIBXSMM_TARGET_ARCH_UNKNOWN == result && 0 != libxsmm_verbosity) { /* library code is expected to be mute */ const int target_vlen32 = libxsmm_cpuid_vlen32(feature_cpu); const char *const compiler_support = (libxsmm_cpuid_vlen32(LIBXSMM_MAX_STATIC_TARGET_ARCH) < target_vlen32 ? "" : (((2 <= libxsmm_verbosity || 0 > libxsmm_verbosity) && LIBXSMM_MAX_STATIC_TARGET_ARCH < feature_cpu) ? "highly " : NULL)); # if !defined(NDEBUG) && defined(__OPTIMIZE__) fprintf(stderr, "LIBXSMM WARNING: library is optimized without -DNDEBUG and contains debug code!\n"); # endif if (NULL != compiler_support) { const char *const name = libxsmm_cpuid_name( /* exclude MIC when running on Core processors */ (((LIBXSMM_X86_AVX512_MIC == LIBXSMM_MAX_STATIC_TARGET_ARCH) || (LIBXSMM_X86_AVX512_KNM == LIBXSMM_MAX_STATIC_TARGET_ARCH)) && (LIBXSMM_X86_AVX512_CORE <= feature_cpu)) ? 
LIBXSMM_X86_AVX2 : LIBXSMM_MAX_STATIC_TARGET_ARCH); fprintf(stderr, "LIBXSMM WARNING: %soptimized non-JIT code paths are limited to \"%s\"!\n", compiler_support, name); } # if !defined(__APPLE__) || !defined(__MACH__) /* permitted features */ if (0 == has_context) { fprintf(stderr, "LIBXSMM WARNING: detected CPU features are not permitted by the OS!\n"); if (0 == libxsmm_se) { fprintf(stderr, "LIBXSMM WARNING: downgraded code generation to supported features!\n"); } } # endif } /* macOS is faulting AVX-512 (on-demand larger state) */ result = feature_cpu; # if !defined(__APPLE__) || !defined(__MACH__) # if 0 /* opportunistic */ if (0 == libxsmm_se) # endif { /* only permitted features */ result = LIBXSMM_MIN(feature_cpu, feature_os); } # endif if (NULL != info) { LIBXSMM_CPUID_X86(0x80000007, 0/*ecx*/, eax, ebx, ecx, edx); info->constant_tsc = LIBXSMM_CPUID_CHECK(edx, 0x00000100); info->has_context = has_context; } } } else { if (NULL != info) LIBXSMM_MEMZERO127(info); result = LIBXSMM_X86_GENERIC; } #endif return result; } LIBXSMM_API int libxsmm_cpuid(void) { return libxsmm_cpuid_x86(NULL/*info*/); } LIBXSMM_API const char* libxsmm_cpuid_name(int id) { const char* target_arch = NULL; switch (id) { case LIBXSMM_X86_AVX512_CPX: { target_arch = "cpx"; } break; case LIBXSMM_X86_AVX512_CLX: { target_arch = "clx"; } break; case LIBXSMM_X86_AVX512_CORE: { target_arch = "skx"; } break; case LIBXSMM_X86_AVX512_KNM: { target_arch = "knm"; } break; case LIBXSMM_X86_AVX512_MIC: { target_arch = "knl"; } break; case LIBXSMM_X86_AVX512: { /* TODO: rework BE to use target ID instead of set of strings (target_arch = "avx3") */ target_arch = "hsw"; } break; case LIBXSMM_X86_AVX2: { target_arch = "hsw"; } break; case LIBXSMM_X86_AVX: { target_arch = "snb"; } break; case LIBXSMM_X86_SSE4: { /* TODO: rework BE to use target ID instead of set of strings (target_arch = "sse4") */ target_arch = "wsm"; } break; case LIBXSMM_X86_SSE3: { /* WSM includes SSE4, but BE relies on SSE3 only, * 
hence we enter "wsm" path starting with SSE3. */ target_arch = "wsm"; } break; case LIBXSMM_TARGET_ARCH_GENERIC: { target_arch = "generic"; } break; default: if (LIBXSMM_X86_GENERIC <= id) { target_arch = "x86"; } else { target_arch = "unknown"; } } LIBXSMM_ASSERT(NULL != target_arch); return target_arch; } LIBXSMM_API int libxsmm_cpuid_vlen32(int id) { int result; if (LIBXSMM_X86_AVX512 <= id) { result = 16; } else if (LIBXSMM_X86_AVX <= id) { result = 8; } else if (LIBXSMM_X86_SSE3 <= id) { result = 4; } else { /* scalar */ result = 1; } return result; } libxsmm-1.17/src/libxsmm_diff.h000066400000000000000000000162151415223013700165230ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DIFF_H #define LIBXSMM_DIFF_H #include #if !defined(LIBXSMM_DIFF_AVX512_ENABLED) && 0 # define LIBXSMM_DIFF_AVX512_ENABLED #endif #define LIBXSMM_DIFF_SSE3_DECL(A) __m128i A #define LIBXSMM_DIFF_SSE3_ASSIGN(A, B) (A) = (B) #define LIBXSMM_DIFF_SSE3_LOAD(A, SRC) A = LIBXSMM_INTRINSICS_LDDQU_SI128((const __m128i*)(SRC)) #define LIBXSMM_DIFF_SSE3(A, B, ...) 
((unsigned char)(0xFFFF != _mm_movemask_epi8(_mm_cmpeq_epi8( \ A, LIBXSMM_INTRINSICS_LDDQU_SI128((const __m128i*)(B)))))) #if (LIBXSMM_X86_SSE3 <= LIBXSMM_STATIC_TARGET_ARCH) /*|| defined(LIBXSMM_INTRINSICS_TARGET)*/ # define LIBXSMM_DIFF_16_DECL LIBXSMM_DIFF_SSE3_DECL # define LIBXSMM_DIFF_16_ASSIGN LIBXSMM_DIFF_SSE3_ASSIGN # define LIBXSMM_DIFF_16_LOAD LIBXSMM_DIFF_SSE3_LOAD # define LIBXSMM_DIFF_16 LIBXSMM_DIFF_SSE3 #else # define LIBXSMM_DIFF_16_DECL(A) const uint64_t */*const*/ A # define LIBXSMM_DIFF_16_ASSIGN(A, B) (A) = (B) # define LIBXSMM_DIFF_16_LOAD(A, SRC) A = (const uint64_t*)(SRC) # define LIBXSMM_DIFF_16(A, B, ...) ((unsigned char)(0 != (((A)[0] ^ (*(const uint64_t*)(B))) | \ ((A)[1] ^ ((const uint64_t*)(B))[1])))) #endif #define LIBXSMM_DIFF_AVX2_DECL(A) __m256i A #define LIBXSMM_DIFF_AVX2_ASSIGN(A, B) (A) = (B) #define LIBXSMM_DIFF_AVX2_LOAD(A, SRC) A = _mm256_loadu_si256((const __m256i*)(SRC)) #define LIBXSMM_DIFF_AVX2(A, B, ...) ((unsigned char)(-1 != _mm256_movemask_epi8(_mm256_cmpeq_epi8( \ A, _mm256_loadu_si256((const __m256i*)(B)))))) #if (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_DIFF_32_DECL LIBXSMM_DIFF_AVX2_DECL # define LIBXSMM_DIFF_32_ASSIGN LIBXSMM_DIFF_AVX2_ASSIGN # define LIBXSMM_DIFF_32_LOAD LIBXSMM_DIFF_AVX2_LOAD # define LIBXSMM_DIFF_32 LIBXSMM_DIFF_AVX2 #else # define LIBXSMM_DIFF_32_DECL(A) LIBXSMM_DIFF_16_DECL(A); LIBXSMM_DIFF_16_DECL(LIBXSMM_CONCATENATE3(libxsmm_diff_32_, A, _)) # define LIBXSMM_DIFF_32_ASSIGN(A, B) LIBXSMM_DIFF_16_ASSIGN(A, B); LIBXSMM_DIFF_16_ASSIGN(LIBXSMM_CONCATENATE3(libxsmm_diff_32_, A, _), LIBXSMM_CONCATENATE3(libxsmm_diff_32_, B, _)) # define LIBXSMM_DIFF_32_LOAD(A, SRC) LIBXSMM_DIFF_16_LOAD(A, SRC); LIBXSMM_DIFF_16_LOAD(LIBXSMM_CONCATENATE3(libxsmm_diff_32_, A, _), (const uint64_t*)(SRC) + 2) # define LIBXSMM_DIFF_32(A, B, ...) ((unsigned char)(0 != LIBXSMM_DIFF_16(A, B, __VA_ARGS__) ? 
1 : LIBXSMM_DIFF_16(LIBXSMM_CONCATENATE3(libxsmm_diff_32_, A, _), (const uint64_t*)(B) + 2, __VA_ARGS__))) #endif #define LIBXSMM_DIFF_48_DECL(A) LIBXSMM_DIFF_16_DECL(A); LIBXSMM_DIFF_32_DECL(LIBXSMM_CONCATENATE3(libxsmm_diff_48_, A, _)) #define LIBXSMM_DIFF_48_ASSIGN(A, B) LIBXSMM_DIFF_16_ASSIGN(A, B); LIBXSMM_DIFF_32_ASSIGN(LIBXSMM_CONCATENATE3(libxsmm_diff_48_, A, _), LIBXSMM_CONCATENATE3(libxsmm_diff_48_, B, _)) #define LIBXSMM_DIFF_48_LOAD(A, SRC) LIBXSMM_DIFF_16_LOAD(A, SRC); LIBXSMM_DIFF_32_LOAD(LIBXSMM_CONCATENATE3(libxsmm_diff_48_, A, _), (const uint64_t*)(SRC) + 2) #define LIBXSMM_DIFF_48(A, B, ...) ((unsigned char)(0 != LIBXSMM_DIFF_16(A, B, __VA_ARGS__) ? 1 : LIBXSMM_DIFF_32(LIBXSMM_CONCATENATE3(libxsmm_diff_48_, A, _), (const uint64_t*)(B) + 2, __VA_ARGS__))) #define LIBXSMM_DIFF_64SW_DECL(A) LIBXSMM_DIFF_32_DECL(A); LIBXSMM_DIFF_32_DECL(LIBXSMM_CONCATENATE3(libxsmm_diff_64_, A, _)) #define LIBXSMM_DIFF_64SW_ASSIGN(A, B) LIBXSMM_DIFF_32_ASSIGN(A, B); LIBXSMM_DIFF_32_ASSIGN(LIBXSMM_CONCATENATE3(libxsmm_diff_64_, A, _), LIBXSMM_CONCATENATE3(libxsmm_diff_64_, B, _)) #define LIBXSMM_DIFF_64SW_LOAD(A, SRC) LIBXSMM_DIFF_32_LOAD(A, SRC); LIBXSMM_DIFF_32_LOAD(LIBXSMM_CONCATENATE3(libxsmm_diff_64_, A, _), (const uint64_t*)(SRC) + 4) #define LIBXSMM_DIFF_64SW(A, B, ...) ((unsigned char)(0 != LIBXSMM_DIFF_32(A, B, __VA_ARGS__) ? 1 : LIBXSMM_DIFF_32(LIBXSMM_CONCATENATE3(libxsmm_diff_64_, A, _), (const uint64_t*)(B) + 4, __VA_ARGS__))) #if defined(LIBXSMM_DIFF_AVX512_ENABLED) # define LIBXSMM_DIFF_AVX512_DECL(A) __m512i A # define LIBXSMM_DIFF_AVX512_ASSIGN(A, B) (A) = (B) # define LIBXSMM_DIFF_AVX512_LOAD(A, SRC) A = _mm512_loadu_si512((const __m512i*)(SRC)) # define LIBXSMM_DIFF_AVX512(A, B, ...) 
((unsigned char)(0xFFFF != (unsigned int)/*_cvtmask16_u32*/(_mm512_cmpeq_epi32_mask( \ A, _mm512_loadu_si512((const __m512i*)(B)))))) #else # define LIBXSMM_DIFF_AVX512_DECL LIBXSMM_DIFF_64SW_DECL # define LIBXSMM_DIFF_AVX512_ASSIGN LIBXSMM_DIFF_64SW_ASSIGN # define LIBXSMM_DIFF_AVX512_LOAD LIBXSMM_DIFF_64SW_LOAD # define LIBXSMM_DIFF_AVX512 LIBXSMM_DIFF_64SW #endif #if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) # define LIBXSMM_DIFF_64_DECL LIBXSMM_DIFF_AVX512_DECL # define LIBXSMM_DIFF_64_ASSIGN LIBXSMM_DIFF_AVX512_ASSIGN # define LIBXSMM_DIFF_64_LOAD LIBXSMM_DIFF_AVX512_LOAD # define LIBXSMM_DIFF_64 LIBXSMM_DIFF_AVX512 #else # define LIBXSMM_DIFF_64_DECL LIBXSMM_DIFF_64SW_DECL # define LIBXSMM_DIFF_64_ASSIGN LIBXSMM_DIFF_64SW_ASSIGN # define LIBXSMM_DIFF_64_LOAD LIBXSMM_DIFF_64SW_LOAD # define LIBXSMM_DIFF_64 LIBXSMM_DIFF_64SW #endif #define LIBXSMM_DIFF_DECL(N, A) LIBXSMM_CONCATENATE3(LIBXSMM_DIFF_, N, _DECL)(A) #define LIBXSMM_DIFF_LOAD(N, A, SRC) LIBXSMM_CONCATENATE3(LIBXSMM_DIFF_, N, _LOAD)(A, SRC) #define LIBXSMM_DIFF(N) LIBXSMM_CONCATENATE(LIBXSMM_DIFF_, N) #define LIBXSMM_DIFF_N(TYPE, RESULT, DIFF, A, BN, ELEMSIZE, STRIDE, HINT, N) { \ const char* libxsmm_diff_b_ = (const char*)(BN) + (size_t)(HINT) * (STRIDE); \ for (RESULT = (HINT); (RESULT) < (N); ++(RESULT)) { \ if (0 == DIFF(A, libxsmm_diff_b_, ELEMSIZE)) break; \ libxsmm_diff_b_ += (STRIDE); \ } \ if ((N) == (RESULT)) { /* wrong hint */ \ TYPE libxsmm_diff_r_ = 0; \ libxsmm_diff_b_ = (const char*)(BN); /* reset */ \ for (; libxsmm_diff_r_ < (HINT); ++libxsmm_diff_r_) { \ if (0 == DIFF(A, libxsmm_diff_b_, ELEMSIZE)) { \ RESULT = libxsmm_diff_r_; \ break; \ } \ libxsmm_diff_b_ += (STRIDE); \ } \ } \ } /** Function type representing the diff-functionality. */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE unsigned int (*libxsmm_diff_function)( const void* /*a*/, const void* /*b*/, ... /*size*/); /** Compare two data blocks of 16 Byte each. 
*/ LIBXSMM_API unsigned char libxsmm_diff_16(const void* a, const void* b, ...); /** Compare two data blocks of 32 Byte each. */ LIBXSMM_API unsigned char libxsmm_diff_32(const void* a, const void* b, ...); /** Compare two data blocks of 48 Byte each. */ LIBXSMM_API unsigned char libxsmm_diff_48(const void* a, const void* b, ...); /** Compare two data blocks of 64 Byte each. */ LIBXSMM_API unsigned char libxsmm_diff_64(const void* a, const void* b, ...); #endif /*LIBXSMM_DIFF_H*/ libxsmm-1.17/src/libxsmm_dnn.c000066400000000000000000000723571415223013700163760ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include #include "libxsmm_main.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif LIBXSMM_API_INTERN void libxsmm_dnn_init(int target_arch) { LIBXSMM_UNUSED(target_arch); } LIBXSMM_API_INTERN void libxsmm_dnn_finalize(void) { } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_get_feature_map_blocks( int C, int K, int* C_block, int* K_block, int* fm_lp_block, libxsmm_dnn_datatype datatype_in, libxsmm_dnn_datatype datatype_out ) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; int ifmblock = 0; int ofmblock = 0; int lp_block = 0; int tmp_max_c_block = 32; int tmp_max_k_block = 32; int tmp_block = 0; /* init libxsmm */ LIBXSMM_INIT /* C */ if (libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE) { tmp_max_c_block = 64; } if ( C < tmp_max_c_block ) { ifmblock = C; } else { for ( tmp_block = 1; tmp_block <= tmp_max_c_block; tmp_block *= 2 ) { if ( C % tmp_block == 0 ) ifmblock = tmp_block; } } /* K */ if (libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE) { tmp_max_k_block = 64; } if ( K < tmp_max_k_block ) { ofmblock = K; } else { for ( tmp_block = 1; tmp_block <= tmp_max_k_block; tmp_block *= 2 ) { if ( K % tmp_block == 0 ) ofmblock = tmp_block; } } /* when do we need VNNI format? 
*/ if ( (datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { lp_block = 1; } else if ( (datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { lp_block = 2; } else if ( (datatype_in == LIBXSMM_DNN_DATATYPE_I16) && ((datatype_out == LIBXSMM_DNN_DATATYPE_I32) || (datatype_out == LIBXSMM_DNN_DATATYPE_F32)) ) { lp_block = 2; } else if (datatype_in == LIBXSMM_DNN_DATATYPE_I8) { lp_block = 4; } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } *C_block = ifmblock; *K_block = ofmblock; *fm_lp_block = lp_block; return status; } LIBXSMM_API const char* libxsmm_dnn_get_error(libxsmm_dnn_err_t code) { switch (code) { case LIBXSMM_DNN_SUCCESS: return "LIBXSMM DNN Success!"; case LIBXSMM_DNN_WARN_FALLBACK: return "LIBXSMM DNN Warning: Falling back to naive code as target is currently not supported by LIBXSMM!"; case LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_N_BLOCKING: return "LIBXSMM DNN Warning: RNN cell suboptimal minibatch blocking!"; case LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_C_BLOCKING: return "LIBXSMM DNN Warning: RNN cell suboptimal input feature blocking!"; case LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_K_BLOCKING: return "LIBXSMM DNN Warning: RNN cell suboptimal output feature blocking!"; case LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_N_BLOCKING: return "LIBXSMM DNN Warning: FC layer suboptimal minibatch blocking!"; case LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_C_BLOCKING: return "LIBXSMM DNN Warning: FC layer suboptimal input feature blocking!"; case LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_K_BLOCKING: return "LIBXSMM DNN Warning: FC layer suboptimal output feature blocking!"; case LIBXSMM_DNN_ERR_GENERAL: return "LIBXSMM DNN Error: General error occurred!"; case LIBXSMM_DNN_ERR_CREATE_HANDLE: return "LIBXSMM DNN Error: Handle creation failed!"; case LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE: return "LIBXSMM DNN Error: Requested datatype is not available!"; case LIBXSMM_DNN_ERR_INVALID_BLOCKING: return "LIBXSMM DNN Error: Requested 
Input/Output buffer size cannot be blocked!"; case LIBXSMM_DNN_ERR_INVALID_HANDLE: return "LIBXSMM DNN Error: An invalid handle was provided!"; case LIBXSMM_DNN_ERR_DATA_NOT_BOUND: return "LIBXSMM DNN Error: Not all required sources and destinations have been bound to convolution!"; case LIBXSMM_DNN_ERR_CREATE_TENSOR: return "LIBXSMM DNN Error: Tensor creation failed!"; case LIBXSMM_DNN_ERR_INVALID_TENSOR: return "LIBXSMM DNN Error: Invalid tensor was specified!"; case LIBXSMM_DNN_ERR_MISMATCH_TENSOR: return "LIBXSMM DNN Error: Tensor doesn't match handle it should be bind to!"; case LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR: return "LIBXSMM DNN Error: Invalid handle or tensor!"; case LIBXSMM_DNN_ERR_INVALID_KIND: return "LIBXSMM DNN Error: Invalid convolution kind!"; case LIBXSMM_DNN_ERR_INVALID_FORMAT_NCHW: return "LIBXSMM DNN Error: NCHW format is currently not natively supported by LIBXSMM!"; case LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT: return "LIBXSMM DNN Error: Unsupported destination format when copying data!"; case LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT: return "LIBXSMM DNN Error: Unsupported source format when copying data!"; case LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE: return "LIBXSMM DNN Error: Unsupported format when requesting a convolution!"; case LIBXSMM_DNN_ERR_INVALID_FORMAT_KCRS: return "LIBXSMM DNN Error: KCRS format is currently not natively supported by LIBXSMM!"; case LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL: return "LIBXSMM DNN Error: Invalid format was specified!"; case LIBXSMM_DNN_ERR_CREATE_LAYOUT: return "LIBXSMM DNN Error: Layout creation failed!"; case LIBXSMM_DNN_ERR_INVALID_LAYOUT: return "LIBXSMM DNN Error: Invalid layout was specified!"; case LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH: return "LIBXSMM DNN Error: Unsupported architecture!"; case LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED: return "LIBXSMM DNN Error: scratch binding failed as scratch was not allocated!"; case LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE: return "LIBXSMM DNN Error: an unknown tensor 
type was provided!"; case LIBXSMM_DNN_ERR_INVALID_ALGO: return "LIBXSMM DNN Error: Invalid algorithm was specified!"; case LIBXSMM_DNN_ERR_INVALID_PADDING: return "LIBXSMM DNN Error: Invalid padding was specified!"; case LIBXSMM_DNN_ERR_TIME_STEPS_TOO_SMALL: return "LIBXSMM DNN Error: time steps should be >= 2 for RNN/LSTM!"; case LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS: return "LIBXSMM DNN Error: failed to create internal layout arrays!"; case LIBXSMM_DNN_ERR_NOT_IMPLEMENTED: return "LIBXSMM DNN Error: the requested functionality is right now not implemented!"; case LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER: return "LIBXSMM DNN Error: the requested order of fusion in batch norm is right now not implemented!"; case LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION: return "LIBXSMM DNN Error: the requested fusion in batch norm is right now not implemented!"; case LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN: return "LIBXSMM DNN Error: Unsupported format when requesting a fused batch norm!"; case LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING: return "LIBXSMM DNN Error: Unsupported pooling operations was requested!"; case LIBXSMM_DNN_ERR_INVALID_FORMAT_FC: return "LIBXSMM DNN Error: Unsupported format when requesting a fullyconnected layer!"; case LIBXSMM_DNN_ERR_RNN_INVALID_SEQ_LEN: return "LIBXSMM DNN Error: max sequence length is shorter than sequence length we attempt to set!"; case LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER: return "LIBXSMM DNN Error: the requested order of fusion in group norm is right now not implemented!"; case LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION: return "LIBXSMM DNN Error: the requested fusion in group norm is right now not implemented!"; case LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION: return "LIBXSMM DNN Error: the requested fusion in fullyconnected is right now not implemented!"; default: return "LIBXSMM DNN Error: Unknown error or warning occurred!"; } } LIBXSMM_API size_t libxsmm_dnn_typesize(libxsmm_dnn_datatype datatype) { switch (datatype) { case 
LIBXSMM_DNN_DATATYPE_F32: return 4; case LIBXSMM_DNN_DATATYPE_I32: return 4; case LIBXSMM_DNN_DATATYPE_BF16:return 2; case LIBXSMM_DNN_DATATYPE_I16: return 2; case LIBXSMM_DNN_DATATYPE_I8: return 1; /* no error expected as enumeration really arrives at an enum; compiler-checked */ default: return 1; } } LIBXSMM_API size_t libxsmm_dnn_get_simd_width(libxsmm_dnn_datatype datatype) { size_t l_cl_width_bytes; /* init libxsmm */ LIBXSMM_INIT if ( libxsmm_target_archid == LIBXSMM_X86_GENERIC ) { l_cl_width_bytes = libxsmm_dnn_typesize(datatype); } else if ( libxsmm_target_archid == LIBXSMM_X86_SSE3 || libxsmm_target_archid == LIBXSMM_X86_SSE4 ) { l_cl_width_bytes = 16; } else if ( libxsmm_target_archid == LIBXSMM_X86_AVX2 || libxsmm_target_archid == LIBXSMM_X86_AVX ) { l_cl_width_bytes = 32; } else { l_cl_width_bytes = 64; } return l_cl_width_bytes/libxsmm_dnn_typesize(datatype); } LIBXSMM_API_INLINE float libxsmm_internal_get_max( float* in_buffer, int length ) { float absmax_value = LIBXSMM_ABS(in_buffer[0]); int i = 0; #ifdef _OPENMP LIBXSMM_OMP_VAR(i); # pragma omp parallel private(i) { float my_absmax_value = absmax_value; # pragma omp for for (i = 0; i < length; ++i ) { if (LIBXSMM_ABS(in_buffer[i]) > my_absmax_value) { my_absmax_value = LIBXSMM_ABS(in_buffer[i]); } } # pragma omp critical { if (my_absmax_value > absmax_value) { absmax_value = my_absmax_value; } } } #else for (i = 1; i < length; ++i ) { if (LIBXSMM_ABS(in_buffer[i]) > absmax_value) { absmax_value = LIBXSMM_ABS(in_buffer[i]); } } #endif return absmax_value; } LIBXSMM_API_INLINE unsigned char libxsmm_internal_get_max_exp( float* in_buffer, int length ) { libxsmm_intfloat val_exp; unsigned char max_exp = 0; /* bit-wise conversion to int */ val_exp.f = libxsmm_internal_get_max( in_buffer, length ); /* shift by mantissa to the right and convert to char */ max_exp = (unsigned char)((val_exp.ui & LIBXSMM_DNN_MASK_ABS_F32) >> LIBXSMM_DNN_MANT_SZ_F32); return max_exp; } LIBXSMM_API_INLINE short 
libxsmm_internal_quantize_scalar_no_scf( float input, unsigned char max_exp, unsigned char add_shift, int round_mode ) { libxsmm_intfloat value; unsigned int qvalue = 0; unsigned int mant = 0; unsigned int sign = 0; unsigned char rhs = 0; unsigned char exp_off = 0; /* init libxsmm */ LIBXSMM_INIT /* in case of zero we don't need to do anything */ if (LIBXSMM_FEQ(input, 0)) { qvalue = 0; } else { /* let's get a float copy to work on */ /* vinp = LIBXSMM_INTRINSICS_MM512_LOAD_PS( in_buffer[i] ); */ value.f = input; /* let's compute the offset of the current exp at pos i from max offset, we need to mask the sign bit though */ /*__m512i vexp = _mm512_cvtps_epi32(_mm512_getexp_ps (vinp)); __m512i vexp_off = _mm512_sub_epi32(maxexpf, vexp);*/ exp_off = (unsigned char)(max_exp - ((value.ui & LIBXSMM_DNN_MASK_ABS_F32) >> LIBXSMM_DNN_MANT_SZ_F32)); /* cut out mantissa and set leading bit */ /*__m512i mmask = _mm512_set1_epi32(LIBXSMM_DNN_MASK_MANT_F32); __m512i vmant = _mm512_or_epi32(_mm512_set1_epi32(0x1 << LIBXSMM_DNN_MANT_SZ_F32), _mm512_and_epi32( _mm512_castps_si512( vinp ), mmask));*/ mant = ((0x1 << LIBXSMM_DNN_MANT_SZ_F32) | (value.ui & LIBXSMM_DNN_MASK_MANT_F32)); /* extract sign */ /* __mmask16 smask = _mm512_cmplt_ps_mask (inp, _mm512_set1_ps(0)); */ sign = ((value.ui & LIBXSNN_DNN_MASK_SIGN_F32) >> (LIBXSMM_DNN_SZ_F32-1)); /* calculate rhs, be aware of the now explicit leading bit, @TODO add DFP8/4 */ rhs = (unsigned char)((LIBXSMM_DNN_MANT_SZ_F32+1) - LIBXSMM_DNN_MANT_DFP16 + exp_off + add_shift); /* some safety, to generate 0 when we fall off quant region, @TODO issue a LIBXSMM Warning that we shifted out the entire mantissa */ if (rhs > (LIBXSMM_DNN_MANT_SZ_F32+1)) { rhs = (LIBXSMM_DNN_MANT_SZ_F32+1); } /* finally shift the value into the region we need, this is now a 15-add_rhs bit number for the max value in in_buffer */ qvalue = (mant >> rhs); /* handle sign, 2 complement */ if ( (sign > 0) && (qvalue > 0) ) { qvalue = (~qvalue + 1); } if (round_mode == 
LIBXSMM_DNN_QUANT_BIAS_ROUND) { /* biased rounding towards next bigger number */ /* first let's determine in the original number if we need a bias rounding, @TODO need fix for F64 */ int bias_needed = (mant & (0x3 << (rhs-2))); /* apply bias */ if (bias_needed > 0) { qvalue++; } } else if (round_mode == LIBXSMM_DNN_QUANT_NEAREST_ROUND) { int nearest_needed = (mant & (0x1 << (rhs-1))); /* apply rounding */ if ((nearest_needed > 0) && (rhs > 1)) { qvalue++; } } else if (round_mode == LIBXSMM_DNN_QUANT_STOCH_ROUND) { /* stochastic rounding, as implemented in the IBM paper from 2015, @TODO, fix F64 and DFP8 */ const float eps = LIXSMMM_DNN_RES_DFP16; /* coverity[dont_call] */ const float r = (float)rand(); libxsmm_intfloat fvalue; float p, q; /* masking all bits which will be shifted out */ fvalue.ui = value.ui & ((LIBXSMM_DNN_MASK_FULL_F32) << rhs); /* drawing a random number */ p = r/((float)RAND_MAX); q = (input - fvalue.f)/eps; /* apply rounding if needed */ if ((p + q) > 0.5f) { ++qvalue; } } else { /* do nothing about rounding, just chop */ } } return (short)qvalue; } /* @TODO make this routine aware of any int type */ LIBXSMM_API void libxsmm_dnn_quantize( float* in_buffer, short* out_buffer, int length, unsigned char add_shift, unsigned char* scf, int round_mode ) { int i = 0; /* init libxsmm */ LIBXSMM_INIT /* in case we are using FP-Mul based quantization we use a different path for now @TODO let's unify the paths by using the similar vectorization for both */ if ( round_mode == LIBXSMM_DNN_QUANT_FPHW_ROUND ) { const float max_value = libxsmm_internal_get_max( in_buffer, length ); int maxexp = 0; /* take return value of LIBXSMM_FREXPF to mute static analysis issue */ float scfq = LIBXSMM_FREXPF(max_value, &maxexp); maxexp -= (15/*LIBXSMM_DNN_MANT_DFP16?*/ - add_shift); scfq = libxsmm_sexp2_i8i(-maxexp); #if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) if ( length % 16 == 0 ) { __m512 vscfq = _mm512_set1_ps(scfq); #ifdef _OPENMP # pragma omp parallel for 
private(i) #endif for (i = 0; i < length; i+=16 ) { _mm256_stream_si256( (__m256i *)&(out_buffer[i]), LIBXSMM_INTRINSICS_MM512_QUANTIZE_NEAR_PS_EPI16( &(in_buffer[i]), vscfq ) ); } } else { #endif #ifdef _OPENMP # pragma omp parallel for private(i) #endif for (i = 0; i < length; ++i ) { out_buffer[i] = (short)LIBXSMM_ROUNDF(in_buffer[i] * scfq); } #if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) } #endif /* @TODO, we need to potentially fix this unsigned char problem */ #if !defined(NDEBUG) /* library code is expected to be mute */ if (maxexp > 0) { fprintf(stderr, "error quant fil\n"); } #endif *scf = (unsigned char)(-maxexp); } else { /* get max exponent */ unsigned char max_exp = libxsmm_internal_get_max_exp( in_buffer, length ); /* if we go for stochastic rounding, let's initialize random seed */ if ( round_mode == LIBXSMM_DNN_QUANT_STOCH_ROUND ) { srand(libxsmm_timer_tick() % ((unsigned int)-1)); } #ifdef _OPENMP # pragma omp parallel for private(i) #endif for (i = 0; i < length; ++i ) { out_buffer[i] = libxsmm_internal_quantize_scalar_no_scf( in_buffer[i], max_exp, add_shift, round_mode ); } *scf = (unsigned char)(14 - add_shift - (max_exp - 127)); } } LIBXSMM_API void libxsmm_dnn_quantize_act( float* in_buffer, short* out_buffer, unsigned int N, unsigned int C, unsigned int H, unsigned int W, unsigned int cblk_f32, unsigned int cblk_i16, unsigned int lp_blk, unsigned char add_shift, unsigned char* scf, int round_mode ) { LIBXSMM_VLA_DECL(5, const float, in, in_buffer, C/cblk_f32, H, W, cblk_f32); LIBXSMM_VLA_DECL(6, short, out, out_buffer, C/(cblk_i16*lp_blk), H, W, cblk_i16, lp_blk); const unsigned int cblk = C/(cblk_i16*lp_blk); int i1 = 0, i2 = 0, i3 = 0, i4 = 0, i5, i6; /* init libxsmm */ LIBXSMM_INIT /* some quick and dirty checks */ assert((C % cblk_f32) == 0); assert((C % cblk_i16) == 0); /* in case we are using FP-Mul based quantization we use a different path for now @TODO let's unify the paths by using the similar vectorization for both */ if 
( round_mode == LIBXSMM_DNN_QUANT_FPHW_ROUND ) { const float max_value = libxsmm_internal_get_max( in_buffer, N*C*H*W ); int maxexp = 0; /* take return value of LIBXSMM_FREXPF to mute static analysis issue */ float scfq = LIBXSMM_FREXPF(max_value, &maxexp); maxexp -= (15/*LIBXSMM_DNN_MANT_DFP16?*/ - add_shift); scfq = libxsmm_sexp2_i8i(-maxexp); #if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) if ( (cblk_f32 == 16) && (cblk_i16*lp_blk == 16) ) { __m512 vscfq = _mm512_set1_ps(scfq); #ifdef _OPENMP LIBXSMM_OMP_VAR(i1); # pragma omp parallel for private(i1) #endif for (i1 = 0; i1 < (int)(N*C*H*W); i1 += 16 ) { _mm256_stream_si256( (__m256i *)&(out_buffer[i1]), LIBXSMM_INTRINSICS_MM512_QUANTIZE_NEAR_PS_EPI16( &(in_buffer[i1]), vscfq ) ); } } else { #endif #ifdef _OPENMP LIBXSMM_OMP_VAR(i1); LIBXSMM_OMP_VAR(i2); LIBXSMM_OMP_VAR(i3); LIBXSMM_OMP_VAR(i4); LIBXSMM_OMP_VAR(i5); LIBXSMM_OMP_VAR(i6); # pragma omp parallel for private(i1, i2, i3, i4, i5, i6) LIBXSMM_OPENMP_COLLAPSE(4) #endif for (i1 = 0; i1 < (int)N; ++i1 ) { for (i2 = 0; i2 < (int)cblk; ++i2 ) { for (i3 = 0; i3 < (int)H; ++i3 ) { for (i4 = 0; i4 < (int)W; ++i4 ) { for (i5 = 0; i5 < (int)cblk_i16; ++i5 ) { for (i6 = 0; i6 < (int)lp_blk; ++i6 ) { const int fi1 = i1; const int fi2 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i6)/cblk_f32; const int fi3 = i3; const int fi4 = i4; const int fi5 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i6)%cblk_f32; LIBXSMM_VLA_ACCESS(6, out, i1, i2, i3, i4, i5, i6, cblk, H, W, cblk_i16, lp_blk) = (short)LIBXSMM_ROUNDF( LIBXSMM_VLA_ACCESS(5, in, fi1, fi2, fi3, fi4, fi5, C / cblk_f32, H, W, cblk_f32) * scfq); } } } } } } #if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) } #endif /* @TODO, we need to potentially fix this unsigned char problem */ #if !defined(NDEBUG) /* library code is expected to be mute */ if (maxexp > 0) { fprintf(stderr, "error quant act\n"); } #endif *scf = (unsigned char)(-maxexp); } else { /* get max exponent */ unsigned char max_exp = libxsmm_internal_get_max_exp( 
in_buffer, N*C*H*W ); /* if we go for stochastic rounding, let's initialize random seed */ if ( round_mode == LIBXSMM_DNN_QUANT_STOCH_ROUND ) { srand(libxsmm_timer_tick() % ((unsigned int)-1)); } #ifdef _OPENMP # pragma omp parallel for private(i1, i2, i3, i4, i5, i6) LIBXSMM_OPENMP_COLLAPSE(4) #endif for (i1 = 0; i1 < (int)N; ++i1 ) { for (i2 = 0; i2 < (int)cblk; ++i2 ) { for (i3 = 0; i3 < (int)H; ++i3 ) { for (i4 = 0; i4 < (int)W; ++i4 ) { for (i5 = 0; i5 < (int)cblk_i16; ++i5 ) { for (i6 = 0; i6 < (int)lp_blk; ++i6 ) { const int fi1 = i1; const int fi2 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i6)/cblk_f32; const int fi3 = i3; const int fi4 = i4; const int fi5 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i6)%cblk_f32; LIBXSMM_VLA_ACCESS(6, out, i1, i2, i3, i4, i5, i6, cblk, H, W, cblk_i16, lp_blk) = libxsmm_internal_quantize_scalar_no_scf( LIBXSMM_VLA_ACCESS(5, in, fi1, fi2, fi3, fi4, fi5, C / cblk_f32, H, W, cblk_f32), max_exp, add_shift, round_mode); } } } } } } *scf = (unsigned char)(14 - add_shift - (max_exp - 127)); } } LIBXSMM_API void libxsmm_dnn_quantize_fil( float* in_buffer, short* out_buffer, unsigned int K, unsigned int C, unsigned int R, unsigned int S, unsigned int cblk_f32, unsigned int cblk_i16, unsigned int kblk_f32, unsigned int kblk_i16, unsigned int lp_blk, unsigned char add_shift, unsigned char* scf, int round_mode ) { LIBXSMM_VLA_DECL(6, const float, in, in_buffer, C/cblk_f32, R, S, cblk_f32, kblk_f32); LIBXSMM_VLA_DECL(7, short, out, out_buffer, C/(cblk_i16*lp_blk), R, S, cblk_i16, kblk_i16, lp_blk); unsigned int cblk = C/(cblk_i16*lp_blk); unsigned int kblk = K/kblk_i16; int i1 = 0, i2 = 0, i3 = 0, i4 = 0, i5, i6, i7; /* some quick and dirty checks */ assert((C % cblk_f32) == 0); assert((C % (cblk_i16*lp_blk)) == 0); assert((K % kblk_f32) == 0); assert((K % kblk_i16) == 0); assert((lp_blk % 2) == 0); /* init libxsmm */ LIBXSMM_INIT /* in case we are using FP-Mul based quantization we use a different path for now @TODO let's unify the paths by using the 
similar vectorization for both */ if ( round_mode == LIBXSMM_DNN_QUANT_FPHW_ROUND ) { const float max_value = libxsmm_internal_get_max( in_buffer, K*C*R*S ); int maxexp = 0; /* take return value of LIBXSMM_FREXPF to mute static analysis issue */ float scfq = LIBXSMM_FREXPF(max_value, &maxexp); maxexp -= (15/*LIBXSMM_DNN_MANT_DFP16?*/ - add_shift); scfq = libxsmm_sexp2_i8i(-maxexp); #if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) if ( (kblk_f32 == 16) && (cblk_f32 == 16) && (kblk_i16 == 16) && (cblk_i16*lp_blk == 16) ) { const __m512 vscfq = _mm512_set1_ps(scfq); const __m512i permute_compact_idx = _mm512_set_epi32(15,14,13,12,7,6,5,4,11,10,9,8,3,2,1,0); #ifdef _OPENMP # pragma omp parallel for private(i1, i2, i3, i4, i5) LIBXSMM_OPENMP_COLLAPSE(4) #endif for (i1 = 0; i1 < (int)kblk; ++i1 ) { for (i2 = 0; i2 < (int)cblk; ++i2 ) { for (i3 = 0; i3 < (int)R; ++i3 ) { for (i4 = 0; i4 < (int)S; ++i4 ) { for (i5 = 0; i5 < 16; i5+=2 ) { __m256i even_ch = LIBXSMM_INTRINSICS_MM512_QUANTIZE_NEAR_PS_EPI16( &LIBXSMM_VLA_ACCESS(6, in, i1, i2, i3, i4, i5 + 0, 0, C / cblk_f32, R, S, cblk_f32, kblk_f32), vscfq); __m256i odd_ch = LIBXSMM_INTRINSICS_MM512_QUANTIZE_NEAR_PS_EPI16( &LIBXSMM_VLA_ACCESS(6, in, i1, i2, i3, i4, i5 + 1, 0, C / cblk_f32, R, S, cblk_f32, kblk_f32), vscfq); __m256i compressed_lo = _mm256_unpacklo_epi16(even_ch, odd_ch); __m256i compressed_hi = _mm256_unpackhi_epi16(even_ch, odd_ch); __m512i compact = _mm512_inserti64x4( _mm512_setzero_si512(), compressed_lo, 0); compact = _mm512_inserti64x4(compact, compressed_hi, 1); compact = _mm512_permutexvar_epi32(permute_compact_idx, compact); LIBXSMM_INTRINSICS_MM512_STREAM_SI512( (void*)&LIBXSMM_VLA_ACCESS(7, out, i1, i2, i3, i4, i5 / 2, 0, 0, cblk, R, S, cblk_i16, kblk_i16, lp_blk), compact); } } } } } } else { #endif #ifdef _OPENMP LIBXSMM_OMP_VAR(i1); LIBXSMM_OMP_VAR(i2); LIBXSMM_OMP_VAR(i3); LIBXSMM_OMP_VAR(i4); LIBXSMM_OMP_VAR(i5); LIBXSMM_OMP_VAR(i6); LIBXSMM_OMP_VAR(i7); # pragma omp parallel for 
private(i1, i2, i3, i4, i5, i6, i7) LIBXSMM_OPENMP_COLLAPSE(4) #endif for (i1 = 0; i1 < (int)kblk; ++i1 ) { for (i2 = 0; i2 < (int)cblk; ++i2 ) { for (i3 = 0; i3 < (int)R; ++i3 ) { for (i4 = 0; i4 < (int)S; ++i4 ) { for (i5 = 0; i5 < (int)cblk_i16; ++i5 ) { for (i6 = 0; i6 < (int)kblk_i16; ++i6 ) { for (i7 = 0; i7 < (int)lp_blk; ++i7 ) { const int fi1 = ((i1*kblk_i16)+i6)/kblk_f32; const int fi2 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i7)/cblk_f32; const int fi3 = i3; const int fi4 = i4; const int fi5 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i7)%cblk_f32; const int fi6 = ((i1*kblk_i16)+i6)%kblk_f32; LIBXSMM_VLA_ACCESS(7, out, i1, i2, i3, i4, i5, i6, i7, cblk, R, S, cblk_i16, kblk_i16, lp_blk) = (short)LIBXSMM_ROUNDF( LIBXSMM_VLA_ACCESS(6, in, fi1, fi2, fi3, fi4, fi5, fi6, C / cblk_f32, R, S, cblk_f32, kblk_f32) * scfq); } } } } } } } #if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) } #endif /* @TODO, we need to potentially fix this unsigned char problem */ #if !defined(NDEBUG) /* library code is expected to be mute */ if (maxexp > 0) { fprintf(stderr, "error quant fil\n"); } #endif *scf = (unsigned char)(-maxexp); } else { /* get max exponent */ unsigned char max_exp = libxsmm_internal_get_max_exp( in_buffer, K*C*R*S ); /* if we go for stochastic rounding, let's initialize random seed */ if ( round_mode == LIBXSMM_DNN_QUANT_STOCH_ROUND ) { srand(libxsmm_timer_tick() % ((unsigned int)-1)); } #ifdef _OPENMP # pragma omp parallel for private(i1, i2, i3, i4, i5, i6, i7) LIBXSMM_OPENMP_COLLAPSE(4) #endif for (i1 = 0; i1 < (int)kblk; ++i1 ) { for (i2 = 0; i2 < (int)cblk; ++i2 ) { for (i3 = 0; i3 < (int)R; ++i3 ) { for (i4 = 0; i4 < (int)S; ++i4 ) { for (i5 = 0; i5 < (int)cblk_i16; ++i5 ) { for (i6 = 0; i6 < (int)kblk_i16; ++i6 ) { for (i7 = 0; i7 < (int)lp_blk; ++i7 ) { const int fi1 = ((i1*kblk_i16)+i6)/kblk_f32; const int fi2 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i7)/cblk_f32; const int fi3 = i3; const int fi4 = i4; const int fi5 = 
((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i7)%cblk_f32; const int fi6 = ((i1*kblk_i16)+i6)%kblk_f32; LIBXSMM_VLA_ACCESS(7, out, i1, i2, i3, i4, i5, i6, i7, cblk, R, S, cblk_i16, kblk_i16, lp_blk) = libxsmm_internal_quantize_scalar_no_scf( LIBXSMM_VLA_ACCESS(6, in, fi1, fi2, fi3, fi4, fi5, fi6, C / cblk_f32, R, S, cblk_f32, kblk_f32), max_exp, add_shift, round_mode); } } } } } } } *scf = (unsigned char)(14 - add_shift - (max_exp - 127)); } } LIBXSMM_API void libxsmm_dnn_dequantize( short* in_buffer, float* out_buffer, int length, unsigned char scf ) { const float val_exp = libxsmm_sexp2_i8i(-scf); int i = 0; #ifdef _OPENMP # pragma omp parallel for private(i) #endif for ( i = 0; i < length; ++i ) { out_buffer[i] = ((float)in_buffer[i])*val_exp; } } LIBXSMM_API void libxsmm_truncate_convert_f32_bf16(const float* in, libxsmm_bfloat16* out, unsigned int length) { unsigned int i = 0; /* truncate buffer to bf16 */ for ( i = 0; i < length; ++i ) { libxsmm_bfloat16_hp t; t.f = in[i]; out[i] = t.i[1]; } } LIBXSMM_API void libxsmm_rnaz_convert_fp32_bf16(const float* in, libxsmm_bfloat16* out, unsigned int len) { unsigned int i = 0; /* truncate buffer to bf16 */ for ( i = 0; i < len; ++i ) { unsigned int int_round = 0; unsigned int do_round = 1; int_round = *((unsigned int*)&(in[i])); /* we don't round NaN and inf */ if ( (int_round & 0x7f800000) == 0x7f800000 ) { do_round = 0; } /* perform round nearest tie away from zero */ if ( do_round != 0 ) { int_round = int_round + 0x00008000; } /* create the bf16 value by shifting out the lower 16bits */ int_round = int_round >> 16; out[i] = (libxsmm_bfloat16)int_round; } } LIBXSMM_API void libxsmm_rne_convert_fp32_bf16(const float* in, libxsmm_bfloat16* out, unsigned int len) { unsigned int i = 0; /* truncate buffer to bf16 */ for ( i = 0; i < len; ++i ) { unsigned int int_round = 0; unsigned int do_round = 1; int_round = *((unsigned int*)&(in[i])); /* we don't round NaN and inf */ if ( (int_round & 0x7f800000) == 0x7f800000 ) { do_round = 
0; } /* perform round nearest tie even */ if ( do_round != 0 ) { unsigned int fixup = (int_round >> 16) & 1; int_round = int_round + 0x00007fff + fixup; } /* create the bf16 value by shifting out the lower 16bits */ int_round = int_round >> 16; out[i] = (unsigned short)int_round; } } LIBXSMM_API void libxsmm_convert_bf16_f32(const libxsmm_bfloat16* in, float* out, unsigned int length) { unsigned int i = 0; /* up-convert is super simple */ for ( i = 0; i < length; ++i ) { libxsmm_bfloat16_hp t; t.i[1] = in[i]; t.i[0] = 0; out[i] = t.f; } } libxsmm-1.17/src/libxsmm_dnn_convolution.c000066400000000000000000003231061415223013700210240ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst, Alexander Heinecke, Evangelos Georganas, Rajkishore Barik (Intel Corp.) 
******************************************************************************/ #include #include "libxsmm_main.h" #include "libxsmm_dnn_convolution_forward.h" #include "libxsmm_dnn_convolution_backward.h" #include "libxsmm_dnn_convolution_weight_update.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #define MIXED 0 #define KHWC 1 #define HWKC 2 #define CHWK 3 #define HWCK 4 /**********************************************************/ /* Helper functions for convolutions' general param setup */ /**********************************************************/ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_ifmblock( libxsmm_dnn_layer* handle ) { int result = 1; int ofm, lp; libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.K, &result, &ofm, &lp, handle->desc.datatype_in, handle->desc.datatype_out ); return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_ofmblock( libxsmm_dnn_layer* handle ) { int result = 1; int ifm, lp; libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.K, &ifm, &result, &lp, handle->desc.datatype_in, handle->desc.datatype_out ); return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fm_lp_block( libxsmm_dnn_layer* handle ) { int result = 1; int ifm, ofm; libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.K, &ifm, &ofm, &result, handle->desc.datatype_in, handle->desc.datatype_out ); return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fallback_loops_fwd( libxsmm_dnn_layer* handle ) { int result = 0; /* FIXME: For now fallback only if MB is not divisible by number of threads */ if (handle->desc.N % handle->desc.threads != 0) { result = 1; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_blocksifm( libxsmm_dnn_layer* handle ) { int result = handle->desc.C / 
handle->ifmblock; return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_blocksofm( libxsmm_dnn_layer* handle ) { int result = handle->desc.K / handle->ofmblock; return result; } /**********************************************************/ /* Helper functions for FWD convolutions' parameter setup */ /**********************************************************/ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fwd_ofw_rb( libxsmm_dnn_layer* handle ) { int result = 0; result = handle->ofw; if (handle->ofw == 56) { result = 28; } if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) { if (handle->ofw % 2 == 0) { result = handle->ofw/2; } } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_pack_input_fwd( libxsmm_dnn_layer* handle ) { int result = 0; /* Pack only for small images and when having large K to amortize, and we can only pack for 1x1 convolutions */ if ((handle->ofw <= 14) && (handle->desc.K > 512) && (handle->desc.R == 1) && (handle->desc.S == 1) && (handle->desc.u == 2) && (handle->desc.v == 2)) { result = 1; } /* Make sure we don't pack when minibatch is not divisible by number of threads since H is used potentially for parallelism */ if (handle->desc.N != handle->desc.threads) { result = 0; } /* we don't pack for int8 */ if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) { result = 0; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fwd_ofh_rb( libxsmm_dnn_layer* handle ) { int result = 1; /* Multiple rows for "small" images and 1x1 convolutions */ if ((handle->ofh <= 14) && (handle->desc.R == 1) && (handle->desc.S == 1)) { result = handle->ofh; } /* Make sure we don't use multiple rows when we don't pack input and convolutions are strided*/ if ((handle->pack_input == 0) && ((handle->desc.u !=1 ) || (handle->desc.v != 1))) { result = 1; } /* In this case we will be using fallback generic loops, thus ofh_rb should be 1 */ if ((handle->desc.N % handle->desc.threads != 0) || (handle->datatype_in == 
LIBXSMM_DNN_DATATYPE_I8)) { result = 1; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fwd_block_H( libxsmm_dnn_layer* handle ) { int result = 14; /* Block H only for large images */ if (handle->ofh >= 28) { result = 4; } if (handle->ofh == 28 && handle->desc.R == 3 ) { result = 14; } /* Make sure it is divisible bu the ofh_rb factor in the kernel */ while ( result % handle->fwd_ofh_rb != 0 ) { result--; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_blocksifm_blocking( libxsmm_dnn_layer* handle ) { int result = 1; /* For 1x1 Convolutions bring in kernel all IFMs unless filters are huge*/ if ((handle->desc.R == 1) && (handle->desc.S == 1) ) { result = handle->blocksifm; if ((handle->desc.C >= 2048) && (handle->desc.K >= 512)) { result = 1; } if ((libxsmm_target_archid < LIBXSMM_X86_AVX512) && (handle->desc.C >= 512) && (handle->desc.K >= 512) ) { result = 2; } } else { result = 1; /* If small image can bring in more IFMS even if NOT 1x1 convolution */ if (handle->ofw <= 7) { result = 2; } } if (handle->blocksifm % result != 0) { result = 1; } if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) { result = handle->blocksifm; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_loop_order_fwd( libxsmm_dnn_layer* handle ) { int result = 0; /* Switch to loop order 1 only if 1x1 convolution with "large" input image and "small" K */ if ((handle->desc.H >= 28) && (handle->desc.R == 1) && (handle->desc.S == 1) && (handle->desc.C >=512) && (handle->desc.K <=512)) { result = 1; } if (handle->ofw == 56 && handle->desc.R == 1 && handle->desc.C == 256 && handle->desc.K == 64 ) { result = 1; } if (handle->ofw == 28 && handle->desc.R == 1) { result = 1; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_block_fwd_IFM( libxsmm_dnn_layer* handle ) { int result = 8; if (handle->ofw == 7 && handle->desc.C == 2048 && handle->desc.K == 512) { result = 4; } /* Make sure it is divisible by ifms in 
the kernel */ while (result % handle->blocksifm_blocking != 0) { result++; } result = LIBXSMM_MIN(handle->blocksifm, result); return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_block_fwd_OFM( libxsmm_dnn_layer* handle ) { int result = 8; if (handle->ofw == 14 && handle->desc.K == 1024) { result = 16; } if (handle->ofw == 7) { result = 16; } result = LIBXSMM_MIN(handle->blocksofm, result); return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_use_ofm_parallelization( libxsmm_dnn_layer* handle ) { int result = 0; #if 0 /* Use "hybrid" minibatch/ofm parallelization if we have huge filters */ if ((handle->desc.R >= 3) && (handle->desc.S >= 3) && (handle->desc.C >= 512) && (handle->desc.K >= 512)) { result = 1; } #endif if ((handle->ofw <= 7) && (handle->desc.C == 1024) && (handle->desc.K == 512)) { result = 1; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_avoid_rim_fmas_fwd( libxsmm_dnn_layer* handle ) { int result = 0; /* Avoid rim FMA if the convolution is 3x3 (non-strided) and the image is "small" */ if ((handle->desc.R == 3) && (handle->desc.S == 3) && (handle->desc.u == 1) && (handle->desc.v == 1) && (handle->desc.pad_h_in == 1) && (handle->desc.pad_w_in == 1) && (handle->desc.H == handle->desc.W) ) { if (handle->ofw <= 28) { result = 1; } if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) { result = 0; } } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_shuffle_filter_accesses( libxsmm_dnn_layer* handle ) { int result = 0; /* Shuffle filter accesses only if "pure minibatch" parallelization and large filters are involved */ if ((handle->use_ofm_parallelization == 0) && (handle->desc.C > 512) && (handle->desc.K > 512)) { result = 1; } if (handle->ofw == 7 && handle->desc.R == 3 && handle->desc.C == 512) { result = 1; } if (handle->ofw == 7 && handle->desc.R == 1 && handle->desc.C == 512 && handle->desc.K == 2048) { result = 1; } if (handle->ofw == 7 && handle->desc.R == 1 && 
handle->desc.C == 2048 && handle->desc.K == 512) { result = 1; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_avoid_acc_load( libxsmm_dnn_layer* handle ) { int result = 0; if ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) { if ((handle->desc.R == 1) && (handle->desc.S == 1)) { if (handle->blocksifm_blocking == handle->blocksifm) { result = 1; } } else { if ((handle->blocksifm_blocking == handle->blocksifm) && (handle->avoid_fmas_in_rim == 0)) { result = 1; } } } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_init_fwd_gemm_flags( libxsmm_dnn_layer* handle ) { int result = 0; #if defined(LIBXSMM_DNN_CONVOLUTION_SETUP_USE_NTS) /* If large image and NOT already loaded in accumulators, tnen use streaming stores */ if ((handle->ofw >= 56) && (handle->desc.K >= 256) && (handle->avoid_acc_load == 1) && (handle->desc.R == 1) && (handle->desc.S == 1)) { result = LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT; } if (handle->ofw == 56 && handle->desc.C == 64 && handle->desc.K == 64 && handle->desc.R == 1) { result = LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT; } if (handle->ofw == 56 && handle->desc.C == 256 && handle->desc.K == 64 && handle->desc.R == 1) { result = LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT; } /* Disable since the GEMM output is going to f32 scratch */ if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16 || handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) { result = 0; } #else LIBXSMM_UNUSED(handle); #endif return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fwd_padding_copy( libxsmm_dnn_layer* handle ) { int result = 0; if ( (handle->desc.pad_h != handle->desc.pad_h_in) && (handle->desc.pad_w != handle->desc.pad_w_in) ) { result = 1; } return result; } LIBXSMM_API_INLINE void libxsmm_dnn_convolution_setup_fwd_scratch( libxsmm_dnn_layer* handle ) { handle->fwd_packing_padding_scratch_size = 0; /* packing of input */ if ( handle->pack_input != 0 ) { handle->fwd_packing_padding_scratch_size = 
(size_t)handle->desc.N * handle->desc.C * handle->desc.H/handle->desc.u * handle->desc.W/handle->desc.v * libxsmm_dnn_typesize(handle->datatype_in); } /* logical padding with copying in the fly */ if ( handle->fwd_padding_copy != 0 ) { handle->fwd_packing_padding_scratch_size = (size_t)handle->desc.N * handle->desc.C * (handle->desc.H + 2*handle->desc.pad_h) * (handle->desc.W + 2*handle->desc.pad_w) * libxsmm_dnn_typesize(handle->datatype_in); } /* output buffer in high precision when we use BF16 */ if ( ( handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16 ) || ( handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8 ) ) { handle->fwd_lp_output_full_scratch_size = (size_t)handle->desc.N * handle->desc.K * handle->ofwp * handle->ofhp * libxsmm_dnn_typesize(LIBXSMM_DNN_DATATYPE_F32); handle->fwd_lp_output_block_scratch_size = (size_t)handle->desc.threads * handle->fwd_ofw_rb * handle->fwd_ofh_rb * handle->ofmblock * libxsmm_dnn_typesize(LIBXSMM_DNN_DATATYPE_F32); } else { handle->fwd_lp_output_full_scratch_size = 0; handle->fwd_lp_output_block_scratch_size = 0; } /* align sizes to full cacheline */ handle->fwd_packing_padding_scratch_size += ( handle->fwd_packing_padding_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : LIBXSMM_CACHELINE - (handle->fwd_packing_padding_scratch_size % LIBXSMM_CACHELINE); handle->fwd_lp_output_full_scratch_size += ( handle->fwd_lp_output_full_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : LIBXSMM_CACHELINE - (handle->fwd_lp_output_full_scratch_size % LIBXSMM_CACHELINE); handle->fwd_lp_output_block_scratch_size += ( handle->fwd_lp_output_block_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 
0 : LIBXSMM_CACHELINE - (handle->fwd_lp_output_block_scratch_size % LIBXSMM_CACHELINE); /* set offsets */ handle->fwd_packing_padding_scratch_offset = 0; handle->fwd_lp_output_full_scratch_offset = handle->fwd_packing_padding_scratch_size; handle->fwd_lp_output_block_scratch_offset = handle->fwd_lp_output_full_scratch_offset + handle->fwd_lp_output_full_scratch_size; /* set overall scratch size for forward */ handle->fwd_scratch_size = handle->fwd_packing_padding_scratch_size + handle->fwd_lp_output_full_scratch_size + handle->fwd_lp_output_block_scratch_size; } /**********************************************************/ /* Helper functions for BWD convolutions' parameter setup */ /**********************************************************/ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fallback_loops_bwd( libxsmm_dnn_layer* handle ) { int result = 0; /* FIXME: Fallback if MB is not divisible by number of threads */ if (handle->desc.N % handle->desc.threads != 0) { result = 1; } if (handle->desc.R == 1 && handle->desc.S == 1 && (handle->desc.pad_h != 0 || handle->desc.pad_w != 0)) { result = 1; } if ((handle->desc.R > 1 && handle->desc.pad_h == 0) || (handle->desc.S > 1 && handle->desc.pad_w == 0)) { result = 1; } if ((handle->desc.R > 1 && (handle->desc.pad_h_out == 0 || handle->desc.pad_h_in == 0)) || (handle->desc.S > 1 && (handle->desc.pad_w_out == 0 || handle->desc.pad_w_in == 0)) ) { result = 1; } if ((handle->desc.R > 1 && handle->desc.u > 1) || (handle->desc.S > 1 && handle->desc.v > 1)) { result = 1; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_bwd_ofw_rb( libxsmm_dnn_layer* handle ) { int result = libxsmm_dnn_convolution_setup_fwd_ofw_rb(handle); return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_bwd_ofh_rb( libxsmm_dnn_layer* handle ) { int result = libxsmm_dnn_convolution_setup_fwd_ofh_rb(handle); return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_bwd_block_H( libxsmm_dnn_layer* 
handle ) { int result = 0; result = libxsmm_dnn_convolution_setup_fwd_block_H(handle); return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_loop_order_bwd( libxsmm_dnn_layer* handle ) { int result = 0; result = libxsmm_dnn_convolution_setup_loop_order_fwd(handle); return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_block_bwd_IFM( libxsmm_dnn_layer* handle ) { int result = 0; result = LIBXSMM_MIN(handle->blocksifm, 16); return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_block_bwd_OFM( libxsmm_dnn_layer* handle ) { int result = 8; while (result % handle->blocksofm_blocking != 0) { result++; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_pack_input_bwd( libxsmm_dnn_layer* handle ) { int result = 0; if ((handle->desc.u != 1) && (handle->bwd_ofh_rb != 1)) { result = 1; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_use_ifm_parallelization( libxsmm_dnn_layer* handle ) { int result = 0; if (handle->ofw <= 7) { result = 1; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_avoid_rim_fmas_bwd( libxsmm_dnn_layer* handle ) { int result = libxsmm_dnn_convolution_setup_avoid_rim_fmas_fwd(handle); return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_blocksofm_blocking( libxsmm_dnn_layer* handle ) { int result = 0; if (handle->desc.R == 1 && handle->desc.S == 1) { result = handle->blocksofm; } else { result = 1; if (handle->desc.R == 3 && handle->desc.S == 3 && handle->ofh == 7 && handle->ofw == 7) { result = 2; } } if (handle->blocksofm % result != 0) { result = 1; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_init_bwd_gemm_flags( libxsmm_dnn_layer* handle ) { int result = 0; /* TODO: May want to experiment with streaming stores */ LIBXSMM_UNUSED( handle ); return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_spread_input_bwd( libxsmm_dnn_layer* handle ) { int result = 0; LIBXSMM_UNUSED(handle); 
if (((handle->desc.u != 1) || (handle->desc.v != 1)) && (handle->bwd_ofh_rb == 1)) { result = 1; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_avoid_acc_load_bwd( libxsmm_dnn_layer* handle ) { int result = 0; if ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) { if ((handle->desc.R == 1) && (handle->desc.S == 1)) { if (handle->blocksofm_blocking == handle->blocksofm) { result = 1; } } else { if ((handle->blocksofm_blocking == handle->blocksofm) && (handle->avoid_fmas_in_rim == 0)) { result = 1; } } } return result; } LIBXSMM_API_INLINE void libxsmm_dnn_convolution_setup_bwd_scratch( libxsmm_dnn_layer* handle ) { /* transpose of weights */ handle->bwd_filter_trans_scratch_size = (size_t)handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S * libxsmm_dnn_typesize(handle->datatype_in); handle->bwd_packing_padding_scratch_size = 0; /* packing of input */ if ( handle->pack_input_bwd != 0 ) { handle->bwd_packing_padding_scratch_size = (size_t)handle->desc.N * handle->desc.C * handle->ofhp * handle->ofwp * libxsmm_dnn_typesize(handle->datatype_in); } /* logical padding with copying in the fly */ if ( handle->use_fallback_bwd_loops != 0 ) { handle->bwd_packing_padding_scratch_size = (size_t)handle->desc.threads * handle->ifmblock * (handle->desc.H + 2*handle->desc.pad_h) * (handle->desc.W + 2*handle->desc.pad_w) * libxsmm_dnn_typesize(handle->datatype_in); } /* input bufffer in high precision when we use BF16 */ if ( handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16 ) { handle->bwd_lp_input_full_scratch_size = (size_t)handle->desc.N * handle->desc.C * handle->ifwp * handle->ifhp * libxsmm_dnn_typesize(LIBXSMM_DNN_DATATYPE_F32); } else { handle->bwd_lp_input_full_scratch_size = 0; } /* align sizes to full cacheline */ handle->bwd_filter_trans_scratch_size += ( handle->bwd_filter_trans_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 
0 : LIBXSMM_CACHELINE - (handle->bwd_filter_trans_scratch_size % LIBXSMM_CACHELINE); handle->bwd_packing_padding_scratch_size += ( handle->bwd_packing_padding_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : LIBXSMM_CACHELINE - (handle->bwd_packing_padding_scratch_size % LIBXSMM_CACHELINE); handle->bwd_lp_input_full_scratch_size += ( handle->bwd_lp_input_full_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : LIBXSMM_CACHELINE - (handle->bwd_lp_input_full_scratch_size % LIBXSMM_CACHELINE); /* set offsets */ handle->bwd_filter_trans_scratch_offset = 0; handle->bwd_packing_padding_scratch_offset = handle->bwd_filter_trans_scratch_size; handle->bwd_lp_input_full_scratch_offset = handle->bwd_packing_padding_scratch_offset + handle->bwd_packing_padding_scratch_size; /* set overall scratch size for forward */ handle->bwd_scratch_size = handle->bwd_filter_trans_scratch_size + handle->bwd_packing_padding_scratch_size + handle->bwd_lp_input_full_scratch_size; } /**********************************************************/ /* Helper functions for UPD convolutions' parameter setup */ /**********************************************************/ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_loop_order_upd( libxsmm_dnn_layer* handle ) { int result = 1; if (handle->ofh == 28 && handle->desc.R == 1 && handle->desc.u == 1 && handle->desc.C == 128 && handle->desc.K == 512) { result = 0; } if (handle->ofh == 28 && handle->desc.R == 3 && handle->desc.u == 1 && handle->desc.C == 128 && handle->desc.K == 128) { result = 0; } if (handle->ofw == 28 && handle->desc.R == 1 && handle->desc.C == 256 && handle->desc.K == 512) { result = 0; } if (handle->ofw == 14 && !(handle->desc.R == 1 && handle->desc.C == 1024 && handle->desc.K == 256)) { result = 0; } if (handle->ofw == 7) { result = 0; } return result; } LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_pack_input_upd( libxsmm_dnn_layer* handle ) { int result = 0; /* Pack input only for very small images, 1x1 convs, with large K to 
amortize the relevant overhead */
  if ((handle->ofh <= 7) && (handle->desc.R == 1) && (handle->desc.S == 1) && (handle->desc.u != 1) && (handle->desc.v != 1) && (handle->desc.K >= 2048)) {
    result = 1;
  }
  return result;
}

/* Returns 1 when the UPD pass should use rim-FMA-avoiding kernels: only for
 * small images with 3x3 filters and 1-pixel padding, and only when each thread
 * owns exactly one image (N == threads). */
LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_avoid_rim_fmas_upd( libxsmm_dnn_layer* handle ) {
  int result = 0;
  /* Avoid rim FMAs only for small images */
  if ( (handle->ofh <= 7) && (handle->desc.R == 3) && (handle->desc.S == 3) && (handle->desc.pad_w == 1) && (handle->desc.pad_h == 1)) {
    result = 1;
  }
  if (handle->desc.N != handle->desc.threads) {
    result = 0;
  }
  return result;
}

/* Register-block width for UPD: always the full output width. */
LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_upd_ofw_rb( libxsmm_dnn_layer* handle ) {
  int result = 1;
  result = handle->ofw;
  return result;
}

/* Register-block height for UPD; caps the ofw_rb*ofh_rb reduction chain and
 * forces ofh_rb = 1 in configurations where the kernels require row-wise work. */
LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_upd_ofh_rb( libxsmm_dnn_layer* handle ) {
  int result = 1;
  /* Restrict the reduction chain which is ofw_rb*ofh_rb*/
  if (handle->ofh <= 28 ) {
    result = handle->ofh;
  }
  /* In the following scenario with strided convolutions and non batch reduce kernel make sure we have ofh_rb = 1 */
  if ((handle->desc.u != 1) && (handle->desc.v != 1) && (handle->upd_use_batchreduce == 0) && (handle->upd_pack_input == 0)) {
    result = 1;
  }
  /* If using linearized taskview and have strided convs, make sure ofh_rb is 1.. */
  if (handle->upd_linearized_tasklist == 1 && handle->upd_avoid_rim_fmas == 0 && handle->upd_pack_input == 0 && handle->desc.u != 1) {
    result = 1;
  }
  /* Non-batchreduce kernels with R/S > 1 need single-row blocking */
  if (handle->upd_linearized_tasklist == 1 && handle->upd_use_batchreduce == 0 && (handle->desc.R != 1 || handle->desc.S != 1)) {
    result = 1;
  }
  if (handle->upd_linearized_tasklist == 0 && handle->upd_use_batchreduce == 0 && (handle->desc.R != 1 || handle->desc.S != 1)) {
    result = 1;
  }
  if (handle->ofw == 56 && handle->desc.R == 1) {
    result = 2;
  }
  if (handle->upd_linearized_tasklist == 1 && handle->upd_use_batchreduce == 1 && handle->upd_avoid_rim_fmas == 1) {
    result = handle->ofh;
  }
  if ((handle->desc.N != handle->desc.threads) && (handle->desc.R > 1 || handle->desc.S > 1 ) && (handle->desc.u > 1 || handle->desc.v > 1 )) {
    result = 1;
  }
  return result;
}

/* IFM blocking factor for UPD; 4 only for the large 56x56 1x1 stride-1 case. */
LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_block_upd_IFM( libxsmm_dnn_layer* handle ) {
  int result = 1;
  if (handle->ofh == 56 && handle->desc.R == 1 && handle->desc.S == 1 && handle->desc.u == 1 && handle->desc.v == 1) {
    result = 4;
  }
  return result;
}

/* OFM blocking factor for UPD: currently always 1. */
LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_block_upd_OFM( libxsmm_dnn_layer* handle ) {
  int result = 1;
  LIBXSMM_UNUSED(handle);
  return result;
}

/* Image blocking for batch-reduce UPD: currently always 1. */
LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_img_batchreduce_block( libxsmm_dnn_layer* handle ) {
  int result = 1;
  LIBXSMM_UNUSED(handle);
  return result;
}

/* Returns 1 when the UPD pass should use batch-reduce GEMM kernels. */
LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_use_batchreduce_upd( libxsmm_dnn_layer* handle ) {
  int result = 1;
  /* If W is large, no need for batchreduce kernel */
  if (handle->ofw >= 56) {
    result = 0;
  }
  /* If we have packed the input, then disable batch-reduce GEMM */
  if (handle->upd_pack_input == 1) {
    result = 0;
  }
  if (handle->upd_linearized_tasklist == 1 && handle->upd_avoid_rim_fmas == 0) {
    result = 0;
  }
  if (handle->upd_linearized_tasklist == 1 && handle->upd_avoid_rim_fmas == 1) {
    result = 1;
  }
  return result;
}

/* Number of private weight-gradient copies to reduce over in the UPD pass. */
LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_weight_copies_upd(
libxsmm_dnn_layer* handle ) {
  int result = handle->desc.threads;
  if (handle->ofw <= 14) {
    result = 9;
  }
  if (handle->ofw == 14 && handle->desc.N == 92 && handle->desc.threads == 92) {
    result = 23;
  }
  if (handle->ofw == 7 && handle->desc.N == 92 && handle->desc.threads == 92 && handle->desc.R == 3 && handle->desc.S == 3 && handle->desc.u == 1 && handle->desc.v == 1) {
    result = 23;
  }
  /* shrink until the copy count divides the thread count evenly */
  while (handle->desc.threads % result != 0) {
    result--;
  }
  /* FIXME: Hardcoded logic for N=27, N=26 */
  if (handle->desc.N == 27 && handle->desc.threads == 27 && handle->desc.R == 1 && handle->ofw == 14 && handle->desc.u == 1) {
    result = 7;
  }
  if (handle->ofh == 14 && handle->desc.R == 3 && handle->desc.S == 3) {
    if (handle->desc.N == 26) {
      result = 13;
    }
  }
  if ((handle->desc.N != handle->desc.threads) && !(handle->upd_linearized_tasklist == 0 && handle->upd_use_batchreduce == 0)) {
    result = handle->desc.N;
  }
  /* Make sure a single copy when we use linearized-task view */
  if (handle->upd_linearized_tasklist == 1) {
    result = 1;
  }
  return result;
}

/* Decides whether the UPD pass uses a linearized task list (no cross-thread
 * weight reduction); driven by image/filter size and shape-specific overrides. */
LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_linearized_tasklist_upd( libxsmm_dnn_layer* handle ) {
  int result = 0;
  /* Use linearized task-list (i.e. no reduction) only if small images and large filters */
  if (handle->ofh <= 10 && handle->ofw <= 10) {
    result = 1;
  }
  if (handle->ofw == 7 && handle->desc.N == 92 && handle->desc.threads == 92 && handle->desc.R == 3 && handle->desc.S == 3 && handle->desc.u == 1 && handle->desc.v == 1) {
    result = 0;
  }
  if (handle->ofh == 14 && handle->ofw == 14 && handle->desc.N == 23 && handle->desc.threads == 23) {
    result = 1;
  }
#if 0
  if ((handle->blocksofm * handle->blocksifm * handle->desc.R * handle->desc.S > (handle->desc.threads * 4)) && (handle->ofh <= 56)) {
    result = 1;
  }
#endif
  if (handle->desc.u == 2 && handle->desc.v == 2 && handle->desc.K == 512) {
    result = 0;
  }
  return result;
}

/* Initial GEMM flags for UPD kernels: currently none. */
LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_init_upd_gemm_flags( libxsmm_dnn_layer* handle ) {
  int result = 0;
  LIBXSMM_UNUSED(handle);
  return result;
}

/* BF16-specific UPD configuration: decides pixel linearization, padding of the
 * accumulation chain to a multiple of 2, input packing, hybrid parallelization
 * and whether an intermediate FP32 weight tensor is needed. */
LIBXSMM_API_INLINE void libxsmm_dnn_convolution_setup_bf16_upd( libxsmm_dnn_layer* handle ) {
  int remainder_pixels, max_init_offset, max_compute_offset_input, input_compute_pad, accum_length_pixels, compute_pixels;
  const int multiple_target = 2;
  handle->upd_linearized_pixels = 1;
  if (handle->desc.S != 1 && handle->desc.v != 1) {
    handle->upd_linearized_pixels = 0;
    handle->upd_trans_w_only = 0;
  }
  /* For large images facilitate the "large" transposes by blocking the pixel/reduction domains */
  if (handle->ofw >= 56 && handle->ofh >=56 && handle->desc.R == 1 && handle->desc.S == 1 && handle->desc.u == 1 && handle->desc.v == 1) {
    handle->upd_linearized_pixels = 0;
    handle->upd_trans_w_only = 1;
  }
  handle->on_the_fly_input_packing = 0;
  handle->upd_pack_input_upfront = 0;
  handle->use_hybrid_imgofm_parallelization = 0;
  handle->upd_linearized_tasklist = 0;
  if (handle->upd_linearized_pixels == 1) {
    /* Logistics to pad accumulation chainlength */
    compute_pixels = handle->ofw * handle->ofh + 2 * handle->desc.pad_w * (handle->ofh-1);
    remainder_pixels = (compute_pixels % multiple_target == 0) ?
0 : (compute_pixels/multiple_target+1)*multiple_target - compute_pixels;
    accum_length_pixels = compute_pixels + remainder_pixels;
    /* In this case compact input upfront */
    if (handle->desc.R == 1 && handle->desc.S == 1 && (handle->desc.u != 1 || handle->desc.v != 1)) {
      handle->upd_pack_input_upfront = 1;
    }
    /* Logistics for input transpose and additional pixel padding */
    max_init_offset = 2 * handle->desc.pad_h * handle->ifwp + 2 * handle->desc.pad_w;
    max_compute_offset_input = max_init_offset + accum_length_pixels;
    input_compute_pad = (max_compute_offset_input > handle->ifwp*handle->ifhp) ? max_compute_offset_input - handle->ifwp*handle->ifhp : 0;
    handle->input_pixels = handle->ifwp * handle->ifhp + input_compute_pad;
    if (handle->upd_pack_input_upfront) {
      handle->input_pixels = accum_length_pixels;
    }
    handle->output_pixels = accum_length_pixels;
    handle->pixel_blocking = accum_length_pixels;
    handle->n_used_pixels = accum_length_pixels;
    handle->compute_pixels = compute_pixels;
    /* a partial pixel block forces accumulation into an FP32 weight tensor */
    handle->use_intermediate_f32_wt_tensor = (handle->pixel_blocking == handle->n_used_pixels) ? 0 : 1;
    if (handle->ofw <= 14) {
      handle->use_hybrid_imgofm_parallelization = 1;
      handle->weight_copies = libxsmm_dnn_convolution_setup_weight_copies_upd(handle);
      if (handle->ofw == 14 && handle->desc.K >= 1024) {
        handle->use_hybrid_imgofm_parallelization = 0;
        handle->weight_copies = handle->desc.threads;
      }
    } else {
      handle->weight_copies = handle->desc.threads;
    }
  }
  if (handle->upd_linearized_pixels == 0) {
    handle->weight_copies = handle->desc.threads;
    if (handle->desc.v !=1) {
      handle->on_the_fly_input_packing = 1;
    }
    remainder_pixels = (handle->ofw % multiple_target == 0) ? 0 : (handle->ofw/multiple_target+1)*multiple_target - handle->ofw;
    handle->ofwp_extended = handle->ofwp + remainder_pixels;
    handle->ifwp_extended = handle->ifwp + remainder_pixels;
    handle->output_pixels = handle->ofwp * handle->ofwp_extended;
    /* coverity[identical_branches] */
    handle->batchreduce_h_pixels = (handle->upd_trans_w_only) ? 1 : 1; /* TODO: identical_branches */
    handle->use_intermediate_f32_wt_tensor = (handle->batchreduce_h_pixels == handle->ofh) ? 0 : 1;
  }
  /* when threads do not map 1:1 to images, always go through FP32 and per-image copies */
  if (handle->desc.N != handle->desc.threads) {
    handle->use_intermediate_f32_wt_tensor = 1;
    handle->use_hybrid_imgofm_parallelization = 0;
    handle->weight_copies = LIBXSMM_MIN(handle->desc.N, handle->desc.threads);
  }
}

/* Returns 1 when UPD must copy-in for logical padding (physical input padding
 * differs from the requested padding in both dimensions). */
LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_upd_padding_copy( libxsmm_dnn_layer* handle ) {
  int result = 0;
  if ( (handle->desc.pad_h != handle->desc.pad_h_in) && (handle->desc.pad_w != handle->desc.pad_w_in) ) {
    result = 1;
  }
  return result;
}

/* Computes UPD scratch sub-buffer sizes (packing/padding, BF16 transpose
 * buffers, per-copy filter scratch), cacheline-aligns them, lays out offsets
 * and stores the total UPD scratch size. */
LIBXSMM_API_INLINE void libxsmm_dnn_convolution_setup_upd_scratch( libxsmm_dnn_layer* handle ) {
  handle->upd_packing_padding_scratch_size = 0;
  /* packing of input */
  if ( handle->upd_pack_input != 0 ) {
    handle->upd_packing_padding_scratch_size = (size_t)handle->desc.N * handle->desc.C *
      handle->desc.H/handle->desc.u *
      handle->desc.W/handle->desc.v *
      libxsmm_dnn_typesize(handle->datatype_in);
  }
  /* logical padding with copying on the fly */
  if ( handle->upd_padding_copy != 0 ) {
    handle->upd_packing_padding_scratch_size = (size_t)handle->desc.N * handle->desc.C *
      (handle->desc.H + 2*handle->desc.pad_h) *
      (handle->desc.W + 2*handle->desc.pad_w) *
      libxsmm_dnn_typesize(handle->datatype_in);
  }
  /* output/input buffer to transpose when we use bf16 */
  if ( handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16 ) {
    const int multiple_target = 2;
    if (handle->upd_linearized_pixels == 1) {
      /* mirrors the pixel-padding logistics in libxsmm_dnn_convolution_setup_bf16_upd */
      int compute_pixels = handle->ofw * handle->ofh + 2 * handle->desc.pad_w * (handle->ofh-1);
      int remainder_pixels = (compute_pixels % multiple_target == 0) ?
0 : (compute_pixels/multiple_target+1)*multiple_target - compute_pixels;
      int accum_length_pixels = compute_pixels + remainder_pixels;
      int max_init_offset = 2 * handle->desc.pad_h * (handle->desc.W + 2*handle->desc.pad_w) + 2 * handle->desc.pad_w;
      int max_compute_offset_input = max_init_offset + accum_length_pixels;
      int input_compute_pad = (max_compute_offset_input > (handle->desc.W+2*handle->desc.pad_w) * (handle->desc.H+2*handle->desc.pad_h)) ? max_compute_offset_input - (handle->desc.W+2*handle->desc.pad_w) * (handle->desc.H+2*handle->desc.pad_h) : 0;
      int input_pixels = (handle->desc.W+2*handle->desc.pad_w) * (handle->desc.H+2*handle->desc.pad_h) + input_compute_pad;
      if (handle->upd_pack_input_upfront == 1) {
        input_pixels = accum_length_pixels;
      }
      /* NOTE(review): sizeof(handle->datatype_in) is the size of the enum field
       * (typically 4 bytes), not the element size; other sites use
       * libxsmm_dnn_typesize. This over-allocates for BF16 (2-byte elements) --
       * harmless but presumably unintended; verify before changing. */
      handle->upd_lp_output_full_scratch_size = (size_t) (handle->desc.N * accum_length_pixels * handle->desc.K * sizeof(handle->datatype_in));
      handle->upd_lp_input_full_scratch_size = (size_t) (handle->desc.N * input_pixels * handle->desc.C * sizeof(handle->datatype_in));
    }
    if (handle->upd_linearized_pixels == 0) {
      int remainder_pixels = (handle->ofw % multiple_target == 0) ? 0 : (handle->ofw/multiple_target+1)*multiple_target - handle->ofw;
      int ofwp_extended = (handle->desc.W+2*handle->desc.pad_w) + remainder_pixels;
      int ifwp_extended = (handle->desc.W+2*handle->desc.pad_w) + remainder_pixels;
      handle->upd_lp_output_full_scratch_size = (size_t) (handle->desc.N * (handle->desc.H+2*handle->desc.pad_h) * ofwp_extended * handle->desc.K * sizeof(handle->datatype_in));
      handle->upd_lp_input_full_scratch_size = (size_t) (handle->desc.N * (handle->desc.H+2*handle->desc.pad_h) * ifwp_extended * handle->desc.C * sizeof(handle->datatype_in));
    }
    /* per-thread FP32 filter accumulation buffer */
    handle->upd_lp_filter_full_scratch_size = (size_t)handle->desc.R * handle->desc.S * handle->desc.C * handle->desc.K * handle->desc.threads *
      libxsmm_dnn_typesize(LIBXSMM_DNN_DATATYPE_F32);
  } else {
    handle->upd_lp_output_full_scratch_size = 0;
    handle->upd_lp_input_full_scratch_size = 0;
    handle->upd_lp_filter_full_scratch_size = 0;
  }
  /* filter scratch */
  handle->upd_filter_scratch_size = (size_t) handle->desc.R * handle->desc.S * handle->desc.C * handle->desc.K * LIBXSMM_MAX(handle->desc.threads, handle->desc.N) * sizeof(float);
  /* align sizes to full cacheline */
  handle->upd_packing_padding_scratch_size += ( handle->upd_packing_padding_scratch_size % LIBXSMM_CACHELINE == 0 ) ?
    0 : LIBXSMM_CACHELINE - (handle->upd_packing_padding_scratch_size % LIBXSMM_CACHELINE);
  handle->upd_lp_output_full_scratch_size += ( handle->upd_lp_output_full_scratch_size % LIBXSMM_CACHELINE == 0 ) ?
    0 : LIBXSMM_CACHELINE - (handle->upd_lp_output_full_scratch_size % LIBXSMM_CACHELINE);
  handle->upd_lp_input_full_scratch_size += ( handle->upd_lp_input_full_scratch_size % LIBXSMM_CACHELINE == 0 ) ?
    0 : LIBXSMM_CACHELINE - (handle->upd_lp_input_full_scratch_size % LIBXSMM_CACHELINE);
  handle->upd_filter_scratch_size += ( handle->upd_filter_scratch_size % LIBXSMM_CACHELINE == 0 ) ?
0 : LIBXSMM_CACHELINE - (handle->upd_filter_scratch_size % LIBXSMM_CACHELINE); handle->upd_lp_filter_full_scratch_size += ( handle->upd_lp_filter_full_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : LIBXSMM_CACHELINE - (handle->upd_lp_filter_full_scratch_size % LIBXSMM_CACHELINE); /* calculate offsets */ handle->upd_packing_padding_scratch_offset = 0; handle->upd_lp_output_full_scratch_offset = handle->upd_packing_padding_scratch_size; handle->upd_lp_input_full_scratch_offset = handle->upd_lp_output_full_scratch_offset + handle->upd_lp_output_full_scratch_size; handle->upd_filter_scratch_offset = handle->upd_lp_input_full_scratch_offset + handle->upd_lp_input_full_scratch_size; handle->upd_lp_filter_full_scratch_offset = handle->upd_filter_scratch_offset + handle->upd_filter_scratch_size; /* set overall scratch size for update */ handle->upd_scratch_size = handle->upd_packing_padding_scratch_size + handle->upd_lp_output_full_scratch_size + handle->upd_lp_input_full_scratch_size + handle->upd_filter_scratch_size + handle->upd_lp_filter_full_scratch_size; } LIBXSMM_API_INLINE libxsmm_dnn_err_t libxsmm_dnn_convolution_setup( libxsmm_dnn_layer* handle ) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; const libxsmm_trans_descriptor* tr_desc = 0; libxsmm_blasint _ldi = 64, _ldo = 64; libxsmm_descriptor_blob blob; /* init libxsmm */ LIBXSMM_INIT /* Generic parameter setup */ handle->ifmblock = libxsmm_dnn_convolution_setup_ifmblock(handle); handle->ofmblock = libxsmm_dnn_convolution_setup_ofmblock(handle); handle->fm_lp_block = libxsmm_dnn_convolution_setup_fm_lp_block(handle); handle->blocksifm = libxsmm_dnn_convolution_setup_blocksifm(handle); handle->blocksofm = libxsmm_dnn_convolution_setup_blocksofm(handle); /* FWD parameter setup */ handle->fwd_ofw_rb = libxsmm_dnn_convolution_setup_fwd_ofw_rb(handle); handle->pack_input = libxsmm_dnn_convolution_setup_pack_input_fwd(handle); handle->fwd_ofh_rb = libxsmm_dnn_convolution_setup_fwd_ofh_rb(handle); handle->block_fwd_oj 
= libxsmm_dnn_convolution_setup_fwd_block_H(handle); handle->loop_order = libxsmm_dnn_convolution_setup_loop_order_fwd(handle); handle->blocksifm_blocking = libxsmm_dnn_convolution_setup_blocksifm_blocking(handle); handle->block_fwd_ofm = libxsmm_dnn_convolution_setup_block_fwd_OFM(handle); handle->block_fwd_ifm = libxsmm_dnn_convolution_setup_block_fwd_IFM(handle);; handle->avoid_fmas_in_rim = libxsmm_dnn_convolution_setup_avoid_rim_fmas_fwd(handle); handle->use_ofm_parallelization = libxsmm_dnn_convolution_setup_use_ofm_parallelization(handle); handle->shuffle_filter_accesses = libxsmm_dnn_convolution_setup_shuffle_filter_accesses(handle); handle->avoid_acc_load = libxsmm_dnn_convolution_setup_avoid_acc_load(handle); handle->fwd_flags = libxsmm_dnn_convolution_setup_init_fwd_gemm_flags(handle); handle->use_fallback_fwd_loops = libxsmm_dnn_convolution_setup_fallback_loops_fwd(handle); handle->fwd_padding_copy = libxsmm_dnn_convolution_setup_fwd_padding_copy(handle); handle->code_fwd[0].ptr = 0; handle->code_fwd[1].ptr = 0; handle->code_fwd[2].ptr = 0; /* JIT cvt eltwise functions for fwd convolutions */ if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) { _ldi = handle->ofmblock * handle->ofwp; _ldo = handle->ofmblock * handle->ofwp; handle->fwd_cvtfp32bf16_kernel = libxsmm_dispatch_meltw_cvtfp32bf16(handle->ofmblock * handle->fwd_ofw_rb, handle->fwd_ofh_rb, &_ldi, &_ldo, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_BF16); } /* Create strided BRGEMMs for i8i32 convolutions */ if ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_I32)) { const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->ifmblock : (libxsmm_blasint)handle->desc.v*handle->ifmblock; const libxsmm_blasint ldA = handle->ofmblock; const libxsmm_blasint ldC = handle->ofmblock; const int beta = (handle->avoid_acc_load) ? 
0 : 1; int l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | handle->fwd_flags; if (handle->desc.R == 1 && handle->desc.S == 1) { const int IFW = (handle->pack_input == 1) ? handle->ofwp : handle->ifwp; const int IFH = (handle->pack_input == 1) ? handle->ofhp : handle->ifhp; libxsmm_blasint stride_A = handle->ifmblock * handle->ofmblock * sizeof(char); libxsmm_blasint stride_B = handle->ifmblock * IFW * IFH * sizeof(char) ; handle->gemm_fwd.xgemm.subimrs = libxsmm_subimmdispatch_reducebatch_strd(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, stride_A, stride_B, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); } else { const int IFW = (handle->pack_input == 1) ? handle->ofwp : handle->ifwp; const int IFH = (handle->pack_input == 1) ? handle->ofhp : handle->ifhp; if (handle->avoid_fmas_in_rim == 0) { int n_blocks = handle->desc.R * handle->desc.S * handle->blocksifm_blocking; int i = 0, ifm, ki, kj; handle->A_offsets = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); handle->B_offsets = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); for (ifm = 0; ifm < handle->blocksifm_blocking; ifm++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { handle->A_offsets[i] = (ifm * handle->desc.R * handle->desc.S * handle->ifmblock * handle->ofmblock + kj * handle->desc.S * handle->ifmblock * handle->ofmblock + ki * handle->ifmblock * handle->ofmblock) * sizeof(char); handle->B_offsets[i] = (ifm * IFH * IFW * handle->ifmblock + kj * IFW * handle->ifmblock + ki * handle->ifmblock) * sizeof(char); i++; } } } handle->gemm_fwd.xgemm.subimro = libxsmm_subimmdispatch_reducebatch_offs(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); } else { libxsmm_blasint stride_A = handle->ifmblock * handle->desc.R * handle->desc.S * handle->ofmblock * sizeof(char); libxsmm_blasint stride_B = handle->ifmblock * IFW * 
IFH * sizeof(char) ; handle->gemm_fwd.xgemm.subimrs = libxsmm_subimmdispatch_reducebatch_strd(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, stride_A, stride_B, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); handle->gemm_fwd2.xgemm.subimrs = libxsmm_subimmdispatch_reducebatch_strd(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, stride_A, stride_B, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); } } } else if ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_I8)) { const libxsmm_blasint ldx = (libxsmm_blasint)handle->desc.v*handle->ifmblock; const libxsmm_blasint ldA = handle->ofmblock; const libxsmm_blasint ldC = handle->ofmblock; const int beta = 0; int l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | handle->fwd_flags; if (handle->desc.R == 1 && handle->desc.S == 1) { const int IFW = handle->ifwp; const int IFH = handle->ifhp; libxsmm_blasint stride_A = handle->ifmblock * handle->ofmblock * sizeof(char); libxsmm_blasint stride_B = handle->ifmblock * IFW * IFH * sizeof(char) ; handle->gemm_fwd.xgemm.sububmrs = libxsmm_sububmmdispatch_reducebatch_strd(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, stride_A, stride_B, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); } else { const int IFW = handle->ifwp; const int IFH = handle->ifhp; int n_blocks = handle->desc.R * handle->desc.S * handle->blocksifm_blocking; int i = 0, ifm, ki, kj; handle->A_offsets = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); handle->B_offsets = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); for (ifm = 0; ifm < handle->blocksifm_blocking; ifm++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { handle->A_offsets[i] = (ifm * handle->desc.R * handle->desc.S * handle->ifmblock * handle->ofmblock + kj * handle->desc.S * handle->ifmblock * handle->ofmblock + ki * 
handle->ifmblock * handle->ofmblock) * sizeof(char); handle->B_offsets[i] = (ifm * IFH * IFW * handle->ifmblock + kj * IFW * handle->ifmblock + ki * handle->ifmblock) * sizeof(char); i++; } } } handle->gemm_fwd.xgemm.sububmro = libxsmm_sububmmdispatch_reducebatch_offs(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); } } #if 0 /* Spit out FWD parameters that are selected... */ printf("FWD params...\n"); printf("Fwd_ofw_rb = %d\n", handle->fwd_ofw_rb); printf("Fwd_ofh_rb = %d\n", handle->fwd_ofh_rb); printf("Pack input = %d\n", handle->pack_input); printf("Block oj = %d\n", handle->block_fwd_oj); printf("Loop order = %d\n", handle->loop_order); printf("Blocksifm_blocking = %d\n", handle->blocksifm_blocking); printf("Block fwd ofm = %d\n", handle->block_fwd_ofm); printf("Block fwd ifm = %d\n", handle->block_fwd_ifm); printf("Avoid rim fmas = %d\n", handle->avoid_fmas_in_rim); printf("Ofm parallelization = %d\n", handle->use_ofm_parallelization); printf("Shuffle filter accesses = %d\n", handle->shuffle_filter_accesses); printf("Avoid acc load = %d\n", handle->avoid_acc_load); printf("Fwd GEMM flags = %d\n", handle->fwd_flags); #endif /* BWD parameter setup */ handle->bwd_ofw_rb = libxsmm_dnn_convolution_setup_bwd_ofw_rb(handle); handle->bwd_ofh_rb = libxsmm_dnn_convolution_setup_bwd_ofh_rb(handle); handle->pack_input_bwd = libxsmm_dnn_convolution_setup_pack_input_bwd(handle); handle->spread_input_bwd = libxsmm_dnn_convolution_setup_spread_input_bwd(handle); handle->blocksofm_blocking = libxsmm_dnn_convolution_setup_blocksofm_blocking(handle); handle->avoid_acc_load_bwd = libxsmm_dnn_convolution_setup_avoid_acc_load_bwd(handle); handle->use_ifm_parallelization = libxsmm_dnn_convolution_setup_use_ifm_parallelization(handle); handle->block_bwd_ofm = libxsmm_dnn_convolution_setup_block_bwd_OFM(handle); handle->block_bwd_ifm = libxsmm_dnn_convolution_setup_block_bwd_IFM(handle); handle->block_bwd_oj = 
libxsmm_dnn_convolution_setup_bwd_block_H(handle); handle->use_fallback_bwd_loops = libxsmm_dnn_convolution_setup_fallback_loops_bwd(handle); #if 0 /* Spit out BWD parameters that are selected... */ printf("BWD params...\n"); printf("Bwd_ofw_rb = %d\n", handle->bwd_ofw_rb); printf("Bwd_ofh_rb = %d\n", handle->bwd_ofh_rb); printf("Pack input = %d\n", handle->pack_input_bwd); printf("Spread input = %d\n", handle->spread_input_bwd); printf("Blocksofm_blocking = %d\n", handle->blocksofm_blocking); printf("Avoid acc load = %d\n", handle->avoid_acc_load_bwd); printf("Ifm parallelization = %d\n", handle->use_ifm_parallelization); printf("Block bwd ofm = %d\n", handle->block_bwd_ofm); printf("Block bwd ifm = %d\n", handle->block_bwd_ifm); printf("Block oj = %d\n", handle->block_bwd_oj); #endif handle->code_bwd[0].ptr = 0; handle->code_bwd[1].ptr = 0; handle->code_bwd[2].ptr = 0; /* Transpose kernel used for filter transpose in bwd pass */ tr_desc = libxsmm_trans_descriptor_init(&blob, sizeof(float), 64, 16, 64); handle->tr_kernel = libxsmm_dispatch_trans(tr_desc); /* UPD parameter setup */ handle->upd_linearized_tasklist = libxsmm_dnn_convolution_setup_linearized_tasklist_upd(handle); handle->upd_avoid_rim_fmas = libxsmm_dnn_convolution_setup_avoid_rim_fmas_upd(handle); handle->upd_pack_input = libxsmm_dnn_convolution_setup_pack_input_upd(handle); handle->upd_use_batchreduce = libxsmm_dnn_convolution_setup_use_batchreduce_upd(handle); handle->upd_ofw_rb = libxsmm_dnn_convolution_setup_upd_ofw_rb(handle); handle->upd_ofh_rb = libxsmm_dnn_convolution_setup_upd_ofh_rb(handle); handle->upd_loop_order = libxsmm_dnn_convolution_setup_loop_order_upd(handle); handle->weight_copies = libxsmm_dnn_convolution_setup_weight_copies_upd(handle); handle->block_upd_ofm = libxsmm_dnn_convolution_setup_block_upd_OFM(handle); handle->block_upd_ifm = libxsmm_dnn_convolution_setup_block_upd_IFM(handle); handle->upd_loop_order = libxsmm_dnn_convolution_setup_loop_order_upd(handle); if 
(handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) { libxsmm_dnn_convolution_setup_bf16_upd(handle); } handle->upd_padding_copy = libxsmm_dnn_convolution_setup_upd_padding_copy(handle); #if 0 /* Spit out UPD parameters that are selected... */ printf("UPD params...\n"); if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) { printf("BF16 path...\n"); printf("UPD use_hybrid_imgofm_parallelization = %d\n", handle->use_hybrid_imgofm_parallelization); printf("UPD linearized_pixels = %d\n", handle->upd_linearized_pixels); printf("UPD upd_trans_w_only = %d\n", handle->upd_trans_w_only); printf("UPD on_the_fly_input_packing = %d\n", handle->on_the_fly_input_packing); printf("UPD use_intermediate_f32_wt_tensor = %d\n", handle->use_intermediate_f32_wt_tensor); } printf("UPD linearized tasks = %d\n", handle->upd_linearized_tasklist); printf("UPD avoid rim fmas = %d\n", handle->upd_avoid_rim_fmas); printf("UPD Pack input = %d\n", handle->upd_pack_input); printf("UPD use batch-reduce GEMM = %d\n", handle->upd_use_batchreduce); printf("Upd_ofw_rb = %d\n", handle->upd_ofw_rb); printf("Upd_ofh_rb = %d\n", handle->upd_ofh_rb); printf("UPD loop order = %d\n", handle->upd_loop_order); printf("UPD weight_copies = %d\n", handle->weight_copies); printf("Block upd ofm = %d\n", handle->block_upd_ofm); printf("Block upd ifm = %d\n", handle->block_upd_ifm); #endif handle->code_upd[0].ptr = 0; handle->code_upd[1].ptr = 0; /* prepare barrier */ handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); /* setup up scratch */ libxsmm_dnn_convolution_setup_fwd_scratch( handle ); libxsmm_dnn_convolution_setup_bwd_scratch( handle ); libxsmm_dnn_convolution_setup_upd_scratch( handle ); handle->scratch = 0; handle->scratch_size = LIBXSMM_MAX( handle->fwd_scratch_size, LIBXSMM_MAX( handle->bwd_scratch_size, handle->upd_scratch_size ) ); return status; } #undef MIXED #undef KHWC #undef HWKC #undef CHWK #undef HWCK LIBXSMM_API libxsmm_dnn_layer* libxsmm_dnn_create_conv_layer( 
libxsmm_dnn_conv_desc conv_desc, libxsmm_dnn_err_t* status) { libxsmm_dnn_layer* handle = 0; *status = LIBXSMM_DNN_SUCCESS; /* currently we don't support NCHW */ if ( (conv_desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCHW) > 0 ) { *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_NCHW; return 0; } /* currently we don't support KCRS */ if ( (conv_desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_KCRS) > 0 ) { *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_KCRS; return 0; } /* we only support physical paddind in these days */ /* @TODO: add logical padding support for other datatypes than FP32 */ if ( ( ( conv_desc.pad_h != conv_desc.pad_h_in ) || ( conv_desc.pad_w != conv_desc.pad_w_in ) || ( conv_desc.pad_h != conv_desc.pad_h_out ) || ( conv_desc.pad_w != conv_desc.pad_w_out ) ) && ( conv_desc.datatype_in != LIBXSMM_DNN_DATATYPE_F32 ) ) { *status = LIBXSMM_DNN_ERR_INVALID_PADDING; return 0; } handle = (libxsmm_dnn_layer*)malloc(sizeof(libxsmm_dnn_layer)); if (0 != handle) { /* zero entire content; not only safer but also sets data and code pointers to NULL */ memset(handle, 0, sizeof(*handle)); /* initialize known handle components */ handle->desc = conv_desc; handle->datatype_in = conv_desc.datatype_in; handle->datatype_out = conv_desc.datatype_out; /* select the intermediate format, only applicable for integer types */ if ( (conv_desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (conv_desc.datatype_out != LIBXSMM_DNN_DATATYPE_F32) ) { /* error */ } else if ( (conv_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (conv_desc.datatype_out != LIBXSMM_DNN_DATATYPE_BF16) ) { /* error */ } else if ( (conv_desc.datatype_in == LIBXSMM_DNN_DATATYPE_I16) && (conv_desc.datatype_out != LIBXSMM_DNN_DATATYPE_F32) ) { /* error */ } else if ( (conv_desc.datatype_in == LIBXSMM_DNN_DATATYPE_I8) && (conv_desc.datatype_out != LIBXSMM_DNN_DATATYPE_I32) ) { /* error */ } else if ( (conv_desc.datatype_in == LIBXSMM_DNN_DATATYPE_I8) && (conv_desc.datatype_out != LIBXSMM_DNN_DATATYPE_I8) ) { /* error */ 
} else if ( (conv_desc.datatype_in == LIBXSMM_DNN_DATATYPE_I8) && (conv_desc.datatype_out != LIBXSMM_DNN_DATATYPE_F32) ) { /* error */ } else { /* fine, no error */ } handle->buffer_format = conv_desc.buffer_format; handle->filter_format = conv_desc.filter_format; handle->fuse_ops = conv_desc.fuse_ops; handle->options = conv_desc.options; /* derive additional values */ handle->ifhp = conv_desc.H + 2*conv_desc.pad_h_in; handle->ifwp = conv_desc.W + 2*conv_desc.pad_w_in; handle->ofh = (conv_desc.H + 2*conv_desc.pad_h - conv_desc.R) / conv_desc.u + 1; handle->ofw = (conv_desc.W + 2*conv_desc.pad_w - conv_desc.S) / conv_desc.v + 1; handle->ofhp = handle->ofh + 2*conv_desc.pad_h_out; handle->ofwp = handle->ofw + 2*conv_desc.pad_w_out; handle->ifmblock = 1; handle->ofmblock = 1; handle->blocksifm = conv_desc.C; handle->blocksofm = conv_desc.K; handle->fwd_ofw_rb = 1; handle->fwd_ofh_rb = 1; handle->bwd_ofw_rb = 1; handle->bwd_ofh_rb = 1; handle->upd_ofw_rb = 1; handle->upd_ofh_rb = 1; handle->fm_lp_block = 1; handle->blocksifm_blocking = 1; handle->blocksofm_blocking = 1; /* Set algorithm to use */ if (conv_desc.algo == LIBXSMM_DNN_CONV_ALGO_AUTO) { handle->algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; } else { handle->algo = conv_desc.algo; } if ( handle->algo != LIBXSMM_DNN_CONV_ALGO_DIRECT ) { *status = LIBXSMM_DNN_ERR_INVALID_ALGO; free(handle); handle = 0; return 0; } *status = libxsmm_dnn_convolution_setup(handle); } else { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; } /* account for eventually deallocated handle */ if ( LIBXSMM_DNN_SUCCESS != *status ) { handle = 0; } return handle; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_conv_layer(const libxsmm_dnn_layer* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { /* Deallocate barrier */ if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } /* deallocate handle structure itself */ free(/*remove constness*/(libxsmm_dnn_layer*)handle); } return status; } 
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_create_tensor_datalayout(const libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor_datalayout* layout; *status = LIBXSMM_DNN_SUCCESS; layout = 0; if (handle != 0) { layout = (libxsmm_dnn_tensor_datalayout*) malloc(sizeof(libxsmm_dnn_tensor_datalayout)); if (layout != 0) { memset(layout, 0, sizeof(libxsmm_dnn_tensor_datalayout)); if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->format = handle->buffer_format; layout->tensor_type = LIBXSMM_DNN_ACTIVATION; if ((handle->buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { if ( ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->ifwp; layout->dim_size[2] = handle->ifhp; layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->ofmblock; 
layout->dim_size[1] = handle->ofwp; layout->dim_size[2] = handle->ofhp; layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } /* @TODO this need to change */ } else if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I16) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_I32) ) { if ( ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_INPUT) ) ) { layout->datatype = handle->datatype_in; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->datatype = handle->datatype_out; } layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->ifwp; layout->dim_size[2] = handle->ifhp; layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = handle->ofwp; layout->dim_size[2] = handle->ofhp; layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else { 
free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } } else if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(6*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(6*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->ifwp; layout->dim_size[2] = handle->ifhp; layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = handle->ofwp; layout->dim_size[2] = handle->ofhp; layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else { /* coverity[dead_error_begin] */ free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } } else if ( ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_I16) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) ) { if ( ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) ) ) { layout->datatype = 
handle->datatype_in; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) ) { layout->datatype = handle->datatype_out; } layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->ifwp; layout->dim_size[2] = handle->ifhp; layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.N; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = handle->ofwp; layout->dim_size[2] = handle->ofhp; layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = handle->ofwp; layout->dim_size[2] = handle->ofhp; 
layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->ifwp; layout->dim_size[2] = handle->ifhp; layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.N; } else { /* coverity[dead_error_begin] */ free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { if ( ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 4; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { layout->dim_size[0] = handle->ifmblock * handle->blocksifm; layout->dim_size[1] = handle->ifwp; layout->dim_size[2] = handle->ifhp; layout->dim_size[3] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == 
LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->ofmblock * handle->blocksofm; layout->dim_size[1] = handle->ofwp; layout->dim_size[2] = handle->ofhp; layout->dim_size[3] = handle->desc.N; } else { /* coverity[dead_error_begin] */ free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_GRADIENT_FILTER) || (type == LIBXSMM_DNN_FILTER) ) { layout->format = handle->filter_format; layout->tensor_type = LIBXSMM_DNN_FILTER; if ((handle->filter_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(6*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(6*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 6; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = handle->ifmblock; layout->dim_size[2] = handle->desc.S; layout->dim_size[3] = handle->desc.R; layout->dim_size[4] = handle->blocksifm; layout->dim_size[5] = handle->blocksofm; } } else if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) 
&& (handle->datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(7*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(7*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 7; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[6] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = handle->fm_lp_block; layout->dim_size[1] = handle->ofmblock; layout->dim_size[2] = handle->ifmblock/handle->fm_lp_block; layout->dim_size[3] = handle->desc.S; layout->dim_size[4] = handle->desc.R; layout->dim_size[5] = handle->blocksifm; layout->dim_size[6] = handle->blocksofm; } } else if ( ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_I16) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8 ) ) { if ( (type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_FILTER) ) { layout->datatype = handle->datatype_in; } else if (type == LIBXSMM_DNN_GRADIENT_FILTER) { layout->datatype = handle->datatype_out; } layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(7*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(7*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ if ((type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_FILTER)) { layout->num_dims = 7; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; layout->dim_type[4] = 
LIBXSMM_DNN_TENSOR_DIMTYPE_R; layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[6] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = handle->fm_lp_block; layout->dim_size[1] = handle->ofmblock; layout->dim_size[2] = handle->ifmblock/handle->fm_lp_block; layout->dim_size[3] = handle->desc.S; layout->dim_size[4] = handle->desc.R; layout->dim_size[5] = handle->blocksifm; layout->dim_size[6] = handle->blocksofm; } } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->filter_format & LIBXSMM_DNN_TENSOR_FORMAT_RSCK) > 0) { if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 4; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; layout->dim_size[0] = handle->ofmblock * handle->blocksofm; layout->dim_size[1] = handle->ifmblock * handle->blocksifm; layout->dim_size[2] = handle->desc.S; layout->dim_size[3] = handle->desc.R; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( type == LIBXSMM_DNN_REGULAR_FILTER_TRANS ) { layout->format = handle->filter_format; layout->tensor_type = LIBXSMM_DNN_REGULAR_FILTER_TRANS; if ((handle->filter_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32) && 
(handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(6*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(6*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 6; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->ofmblock; layout->dim_size[2] = handle->desc.S; layout->dim_size[3] = handle->desc.R; layout->dim_size[4] = handle->blocksofm; layout->dim_size[5] = handle->blocksifm; } } else if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(7*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(7*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 7; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[6] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_size[0] = handle->fm_lp_block; layout->dim_size[1] = handle->ifmblock; layout->dim_size[2] = handle->ofmblock/handle->fm_lp_block; layout->dim_size[3] = handle->desc.S; layout->dim_size[4] = handle->desc.R; layout->dim_size[5] = handle->blocksofm; layout->dim_size[6] = 
handle->blocksifm; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } #if 0 } else if ((handle->filter_format & LIBXSMM_DNN_TENSOR_FORMAT_RSCK) > 0) { if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 4; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; layout->dim_size[0] = handle->ofmblock * handle->blocksofm; layout->dim_size[1] = handle->ifmblock * handle->blocksifm; layout->dim_size[2] = handle->desc.S; layout->dim_size[3] = handle->desc.K; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } #endif } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) || (type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) || (type == LIBXSMM_DNN_CHANNEL_BIAS) ) { layout->format = handle->buffer_format; layout->tensor_type = LIBXSMM_DNN_CHANNEL_SCALAR; if ((handle->buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { if ( handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { layout->datatype = handle->datatype_out; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 2; layout->dim_type[0] = 
LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = handle->blocksofm; } #if 0 } else if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I16) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) ) { layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(3*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(3*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 3; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_size[0] = handle->fm_lp_block; layout->dim_size[1] = handle->ofmblock; layout->dim_size[2] = handle->blocksofm; } #endif } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { layout->datatype = handle->datatype_out; if ( handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 ) { layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(1*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(1*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 1; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_size[0] = handle->ofmblock*handle->blocksofm; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_BATCH_STATS) ) { layout->format = handle->buffer_format; layout->tensor_type = LIBXSMM_DNN_BATCH_STATS; if ((handle->buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { if ( (handle->datatype_out == 
LIBXSMM_DNN_DATATYPE_F32) || (handle->datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 2; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = handle->desc.N; layout->dim_size[2] = handle->blocksofm; layout->dim_size[3] = 2; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if (type == LIBXSMM_DNN_MAX_STATS_FWD) { layout->format = handle->buffer_format; layout->tensor_type = LIBXSMM_DNN_MAX_STATS_FWD; layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 2; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->desc.N; } } else if (type == LIBXSMM_DNN_MAX_STATS_BWD) { layout->format = handle->buffer_format; layout->tensor_type = LIBXSMM_DNN_MAX_STATS_BWD; layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); if (0 != layout->dim_type && 
0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 2; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->desc.N; } } else if (type == LIBXSMM_DNN_MAX_STATS_UPD) { layout->format = handle->buffer_format; layout->tensor_type = LIBXSMM_DNN_MAX_STATS_UPD; layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 2; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->desc.N; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return layout; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_trans_reg_bf16_filter(const libxsmm_dnn_layer* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (handle != 0) { if ( (handle->reg_filter != 0) && (handle->reg_filter_tr != 0) ) { /* TODO handle more datatypes */ int ifm1, ifm2, kj, ki, ofm1, ofm2; int ofmblock_lp = handle->ofmblock/handle->fm_lp_block; int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; int lpb = handle->fm_lp_block; LIBXSMM_VLA_DECL(7, libxsmm_bfloat16, wt, (libxsmm_bfloat16*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb); LIBXSMM_VLA_DECL(7, libxsmm_bfloat16, tr_wt, (libxsmm_bfloat16*)handle->reg_filter_tr->data, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); /* TODO we might want to do this in parallel.... 
*/ for ( ifm1 = 0; ifm1 < handle->blocksifm; ++ifm1 ) { for ( ofm1 = 0; ofm1 < handle->blocksofm; ++ofm1 ) { for (kj=0; kj < handle->desc.R; ++kj) { for (ki=0; ki < handle->desc.S; ++ki) { for ( ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2 ) { for ( ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2 ) { LIBXSMM_VLA_ACCESS(7, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2/lpb, ifm2, ofm2%lpb, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb) = LIBXSMM_VLA_ACCESS(7, wt, ofm1, ifm1, kj, ki, ifm2/lpb, ofm2, ifm2%lpb, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb); } } } } } } } else { status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_trans_reg_filter(const libxsmm_dnn_layer* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (handle != 0) { if ( (handle->reg_filter != 0) && (handle->reg_filter_tr != 0) ) { /* TODO handle more datatypes */ int ifm1, ifm2, kj, ki, ofm1, ofm2; LIBXSMM_VLA_DECL(6, float, wt, (float*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); LIBXSMM_VLA_DECL(6, float, tr_wt, (float*)handle->reg_filter_tr->data, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); /* TODO we might want to do this in parallel.... 
*/ for ( ifm1 = 0; ifm1 < handle->blocksifm; ++ifm1 ) { for ( ofm1 = 0; ofm1 < handle->blocksofm; ++ofm1 ) { for (kj=0; kj < handle->desc.R; ++kj) { for (ki=0; ki < handle->desc.S; ++ki) { for ( ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2 ) { for ( ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2 ) { LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); } } } } } } } else { status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_bind_tensor(libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) && (type != LIBXSMM_DNN_REGULAR_FILTER_TRANS) && (type != LIBXSMM_DNN_BATCH_STATS) && (type != LIBXSMM_DNN_MAX_STATS_FWD) && (type != LIBXSMM_DNN_MAX_STATS_BWD) && (type != LIBXSMM_DNN_MAX_STATS_UPD) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0 && tensor != 0) { libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_create_tensor_datalayout(handle, type, &status); if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { handle->reg_input = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { handle->grad_input = 
(libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { handle->reg_output = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { handle->grad_output = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { handle->reg_filter = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { handle->grad_filter = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) { handle->reg_bias = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) { handle->grad_bias = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_FILTER_TRANS ) { handle->reg_filter_tr = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_BATCH_STATS ) { handle->batch_stats = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_MAX_STATS_FWD ) { handle->maxstats_fwd = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_MAX_STATS_BWD ) { handle->maxstats_bwd = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_MAX_STATS_UPD ) { handle->maxstats_upd = (libxsmm_dnn_tensor*)tensor; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; } libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_get_tensor(libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor* return_tensor = 0; *status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) && (type != 
LIBXSMM_DNN_REGULAR_FILTER_TRANS) && (type != LIBXSMM_DNN_BATCH_STATS) && (type != LIBXSMM_DNN_MAX_STATS_FWD) && (type != LIBXSMM_DNN_MAX_STATS_BWD) && (type != LIBXSMM_DNN_MAX_STATS_UPD) ) { *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return return_tensor; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { return_tensor = handle->reg_input; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { return_tensor = handle->grad_input; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { return_tensor = handle->reg_output; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { return_tensor = handle->grad_output; } else if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { return_tensor = handle->reg_filter; } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { return_tensor = handle->grad_filter; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) { return_tensor = handle->reg_bias; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) { return_tensor = handle->grad_bias; } else if ( type == LIBXSMM_DNN_REGULAR_FILTER_TRANS ) { return_tensor = handle->reg_filter_tr; } else if ( type == LIBXSMM_DNN_BATCH_STATS ) { return_tensor = handle->batch_stats; } else if ( type == LIBXSMM_DNN_MAX_STATS_FWD ) { return_tensor = handle->maxstats_fwd; } else if ( type == LIBXSMM_DNN_MAX_STATS_BWD ) { return_tensor = handle->maxstats_bwd; } else if ( type == LIBXSMM_DNN_MAX_STATS_UPD ) { return_tensor = handle->maxstats_upd; } else { /* cannot happen */ } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; } return return_tensor; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_release_tensor(libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && (type != 
LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) && (type != LIBXSMM_DNN_REGULAR_FILTER_TRANS) && (type != LIBXSMM_DNN_BATCH_STATS) && (type != LIBXSMM_DNN_MAX_STATS_FWD) && (type != LIBXSMM_DNN_MAX_STATS_BWD) && (type != LIBXSMM_DNN_MAX_STATS_UPD) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { handle->reg_input = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { handle->grad_input = 0; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { handle->reg_output = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { handle->grad_output = 0; } else if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { handle->reg_filter = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { handle->grad_filter = 0; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) { handle->reg_bias = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) { handle->grad_bias = 0; } else if ( type == LIBXSMM_DNN_REGULAR_FILTER_TRANS ) { handle->reg_filter_tr = 0; } else if ( type == LIBXSMM_DNN_BATCH_STATS ) { handle->batch_stats = 0; } else if ( type == LIBXSMM_DNN_MAX_STATS_FWD ) { handle->maxstats_fwd = 0; } else if ( type == LIBXSMM_DNN_MAX_STATS_BWD ) { handle->maxstats_bwd = 0; } else if ( type == LIBXSMM_DNN_MAX_STATS_UPD ) { handle->maxstats_upd = 0; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; } return status; } LIBXSMM_API size_t libxsmm_dnn_get_scratch_size(const libxsmm_dnn_layer* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status) { size_t l_scratch_size = 0; *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: break; case LIBXSMM_DNN_COMPUTE_KIND_UPD: break; case LIBXSMM_DNN_COMPUTE_KIND_ALL: break; default: { *status = LIBXSMM_DNN_ERR_INVALID_KIND; } } l_scratch_size += handle->scratch_size + 64; } else { *status = 
LIBXSMM_DNN_ERR_INVALID_HANDLE; } return l_scratch_size; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_bind_scratch(libxsmm_dnn_layer* handle, const libxsmm_dnn_compute_kind kind, const void* scratch) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; uintptr_t address = (uintptr_t)scratch; size_t offset = 0; if (scratch == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } if (0 != handle) { if (address % 64 == 0) { handle->scratch = (void*)address; } else { offset = (64 - address % 64); handle->scratch = (void*)(address+offset); } address += handle->scratch_size + 64; switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: break; case LIBXSMM_DNN_COMPUTE_KIND_UPD: break; case LIBXSMM_DNN_COMPUTE_KIND_ALL: break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_release_scratch(libxsmm_dnn_layer* handle, const libxsmm_dnn_compute_kind kind) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { handle->scratch = 0; switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: break; case LIBXSMM_DNN_COMPUTE_KIND_UPD: break; case LIBXSMM_DNN_COMPUTE_KIND_ALL: break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API_INLINE libxsmm_dnn_err_t internal_execute_st(libxsmm_dnn_layer* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { switch (handle->algo) { case LIBXSMM_DNN_CONV_ALGO_DIRECT: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { switch (handle->buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { switch (handle->filter_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_convolve_st_fwd_custom_custom(handle, start_thread, tid); } break; default: { status = 
LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; } } } break; case LIBXSMM_DNN_TENSOR_FORMAT_NHWC: { switch (handle->filter_format) { case LIBXSMM_DNN_TENSOR_FORMAT_RSCK: { status = libxsmm_dnn_convolve_st_fwd_nhwc_rsck(handle, start_thread, tid); } break; case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_convolve_st_fwd_nhwc_custom(handle, start_thread, tid); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; } } } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: { switch (handle->buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { switch (handle->filter_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_convolve_st_bwd_custom_custom(handle, start_thread, tid); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; } } } break; case LIBXSMM_DNN_TENSOR_FORMAT_NHWC: { switch (handle->filter_format) { case LIBXSMM_DNN_TENSOR_FORMAT_RSCK: { status = libxsmm_dnn_convolve_st_bwd_nhwc_rsck(handle, start_thread, tid); } break; case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_convolve_st_bwd_nhwc_custom(handle, start_thread, tid); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; } } } break; case LIBXSMM_DNN_COMPUTE_KIND_UPD: { switch (handle->buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { switch (handle->filter_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_convolve_st_upd_custom_custom(handle, start_thread, tid); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; } } } break; case LIBXSMM_DNN_TENSOR_FORMAT_NHWC: { switch (handle->filter_format) { case LIBXSMM_DNN_TENSOR_FORMAT_RSCK: { status = libxsmm_dnn_convolve_st_upd_nhwc_rsck(handle, start_thread, tid); } break; case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_convolve_st_upd_nhwc_custom(handle, 
start_thread, tid); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; } } } break; case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: { switch (handle->buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { switch (handle->filter_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_convolve_st_upd_custom_custom(handle, start_thread, tid); status = libxsmm_dnn_convolve_st_bwd_custom_custom(handle, start_thread, tid); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; } } } break; case LIBXSMM_DNN_TENSOR_FORMAT_NHWC: { switch (handle->filter_format) { case LIBXSMM_DNN_TENSOR_FORMAT_RSCK: { status = libxsmm_dnn_convolve_st_upd_nhwc_rsck(handle, start_thread, tid); status = libxsmm_dnn_convolve_st_bwd_nhwc_rsck(handle, start_thread, tid); } break; case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_convolve_st_upd_nhwc_custom(handle, start_thread, tid); status = libxsmm_dnn_convolve_st_bwd_nhwc_custom(handle, start_thread, tid); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_ALGO; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_execute_st(libxsmm_dnn_layer* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid) { return internal_execute_st(handle, kind, start_thread, tid); } LIBXSMM_API void libxsmm_dnn_execute(libxsmm_dnn_layer* handle, libxsmm_dnn_compute_kind kind) { #if defined(_OPENMP) # pragma omp parallel num_threads(handle->desc.threads) { const int tid = omp_get_thread_num(); internal_execute_st(handle, kind, 0, tid); } #else internal_execute_st(handle, kind, 0/*start_thread*/, 0/*tid*/); #endif } 
libxsmm-1.17/src/libxsmm_dnn_convolution_backward.c000066400000000000000000000644561415223013700226740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Rajkishore Barik, Ankush Mandal, Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "libxsmm_dnn_convolution_backward.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_rsck_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if (handle->use_fallback_bwd_loops == 0) { const libxsmm_blasint ldB = (libxsmm_blasint)handle->ofmblock; const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; const libxsmm_blasint ldC = (handle->spread_input_bwd 
== 1) ? (libxsmm_blasint)(handle->ifmblock * handle->desc.v) : (libxsmm_blasint)handle->ifmblock; const float beta = (handle->avoid_acc_load_bwd ? 0.f : 1.f); typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); # include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic.tpl.c" } else { const libxsmm_blasint ldC = (libxsmm_blasint)(handle->desc.v*handle->ifmblock); typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ifmblock, handle->ofw, handle->ofmblock, NULL, NULL, &ldC, NULL, NULL, NULL, NULL); #include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic.tpl.c" } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) 
/*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ if (handle->use_fallback_bwd_loops == 0) { const libxsmm_blasint ldB = (libxsmm_blasint)handle->ofmblock; const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->ifmblock * handle->desc.v) : (libxsmm_blasint)handle->ifmblock; const float beta = (handle->avoid_acc_load_bwd ? 0.f : 1.f); typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; typedef libxsmm_bsmmfunction_reducebatch_addr gemm_br_function; typedef libxsmm_bmmfunction_reducebatch_addr gemm_br_function_bf16bf16; int l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_bsmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function_bf16bf16 br_gemm_kernel_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function_bf16bf16 br_gemm_kernel2_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); # include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16.tpl.c" # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return 
status; } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ if (handle->use_fallback_bwd_loops == 0) { const libxsmm_blasint ldB = (libxsmm_blasint)handle->ofmblock; const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->ifmblock * handle->desc.v) : (libxsmm_blasint)handle->ifmblock; const float beta = (handle->avoid_acc_load_bwd ? 0.f : 1.f); typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; typedef libxsmm_bsmmfunction_reducebatch_addr gemm_br_function; typedef libxsmm_bmmfunction_reducebatch_addr gemm_br_function_bf16bf16; int l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); #define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_bsmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function_bf16bf16 br_gemm_kernel_bf16bf16 = 
libxsmm_bmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function_bf16bf16 br_gemm_kernel2_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); # include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16.tpl.c" # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } #else LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid) { return libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu( handle, start_thread, tid ); } #endif LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if (handle->use_fallback_bwd_loops == 0) { const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v) : (libxsmm_blasint)(handle->blocksifm * handle->ifmblock); const float beta = (handle->avoid_acc_load_bwd ? 
0.f : 1.f); typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); #define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM # include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c" #undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM } else { const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; const libxsmm_blasint ldC = ( (handle->desc.pad_h != handle->desc.pad_h_in) || (handle->desc.pad_w != handle->desc.pad_w_in) ) ? 
(libxsmm_blasint)(handle->ifmblock * handle->desc.v) : (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v); typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ifmblock, handle->ofw, handle->ofmblock, &ldA, &ldB, &ldC, NULL, NULL, NULL, NULL); #define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM #include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c" #undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_rsck_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if (handle->use_fallback_bwd_loops == 0) { const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v) : (libxsmm_blasint)(handle->blocksifm * handle->ifmblock); const float beta = (handle->avoid_acc_load_bwd ? 
0.f : 1.f); typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); #define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK # include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c" #undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK } else { const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; const libxsmm_blasint ldC = ( (handle->desc.pad_h != handle->desc.pad_h_in) || (handle->desc.pad_w != handle->desc.pad_w_in) ) ? 
(libxsmm_blasint)(handle->ifmblock * handle->desc.v) : (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v); typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ifmblock, handle->ofw, handle->ofmblock, &ldA, &ldB, &ldC, NULL, NULL, NULL, NULL); #define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK #include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c" #undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ if (handle->grad_input == 0 || handle->grad_output == 0 || handle->reg_filter == 0 || handle->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_convolve_st_bwd_custom_custom_f32_f32( handle, start_thread, tid); } #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX ) { status = 
libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu( handle, start_thread, tid); } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16( handle, start_thread, tid); } #elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE ) { status = libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu( handle, start_thread, tid); } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { if (handle->use_fallback_bwd_loops == 0) { const libxsmm_blasint ldx = ((libxsmm_blasint)handle->ofmblock); const libxsmm_blasint ldA = handle->ifmblock; const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? handle->ifmblock * handle->desc.v : handle->ifmblock; const float beta = (handle->avoid_acc_load_bwd) ? 
0.f : 1.f; typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); # include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic.tpl.c" } else { const libxsmm_blasint ldx = ((libxsmm_blasint)handle->desc.v*handle->ifmblock); typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ifmblock, handle->ofw, handle->ofmblock, NULL, NULL, &ldx, NULL, NULL, NULL, NULL); #include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic.tpl.c" } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_rsck(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ if (handle->grad_input == 0 || handle->grad_output == 0 || handle->reg_filter == 0 || handle->scratch == 0) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { 
if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_convolve_st_bwd_nhwc_rsck_f32_f32( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { if (handle->use_fallback_bwd_loops == 0) { const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v) : (libxsmm_blasint)(handle->blocksifm * handle->ifmblock); const float beta = (handle->avoid_acc_load_bwd ? 0.f : 1.f); typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); #define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK # include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c" #undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK } else { const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; const libxsmm_blasint ldC = ( (handle->desc.pad_h != handle->desc.pad_h_in) || (handle->desc.pad_w 
!= handle->desc.pad_w_in) ) ? (libxsmm_blasint)(handle->ifmblock * handle->desc.v) : (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v); typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ifmblock, handle->ofw, handle->ofmblock, &ldA, &ldB, &ldC, NULL, NULL, NULL, NULL); #define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK #include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c" #undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_custom(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ if (handle->grad_input == 0 || handle->grad_output == 0 || handle->reg_filter == 0 || handle->scratch == 0) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_convolve_st_bwd_nhwc_custom_f32_f32( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { if (handle->use_fallback_bwd_loops == 0) { const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; const libxsmm_blasint ldC = 
(handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v) : (libxsmm_blasint)(handle->blocksifm * handle->ifmblock); const float beta = (handle->avoid_acc_load_bwd ? 0.f : 1.f); typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); #define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM # include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c" #undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM } else { const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; const libxsmm_blasint ldC = ( (handle->desc.pad_h != handle->desc.pad_h_in) || (handle->desc.pad_w != handle->desc.pad_w_in) ) ? 
(libxsmm_blasint)(handle->ifmblock * handle->desc.v) : (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v); typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ifmblock, handle->ofw, handle->ofmblock, &ldA, &ldB, &ldC, NULL, NULL, NULL, NULL); #define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM #include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c" #undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } libxsmm-1.17/src/libxsmm_dnn_convolution_backward.h000066400000000000000000000025021415223013700226610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Rajkishore Barik, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_CONVOLUTION_BACKWARD_H #define LIBXSMM_DNN_CONVOLUTION_BACKWARD_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_rsck(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_custom(libxsmm_dnn_layer* handle, int start_thread, int tid); #endif /* LIBXSMM_DNN_CONVOLUTION_BACKWARD_H */ libxsmm-1.17/src/libxsmm_dnn_convolution_forward.c000066400000000000000000000543631415223013700225560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas, Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_convolution_forward.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_rsck_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_i8_i32(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_i8_i8(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->ifmblock : (libxsmm_blasint)handle->desc.v*handle->ifmblock; const libxsmm_blasint ldA = handle->ofmblock; const libxsmm_blasint ldC = handle->ofmblock; const float beta = (handle->avoid_acc_load) ? 
0.f : 1.f; typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); # include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic.tpl.c" #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->ifmblock : (libxsmm_blasint)handle->desc.v*handle->ifmblock; const libxsmm_blasint ldA = handle->ofmblock; const libxsmm_blasint ldC = handle->ofmblock; const float beta = (handle->avoid_acc_load) ? 
0.f : 1.f; typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; typedef libxsmm_bsmmfunction_reducebatch_addr gemm_br_function; typedef libxsmm_bmmfunction_reducebatch_addr gemm_br_function_bf16bf16; int l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | handle->fwd_flags; /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function_bf16bf16 br_gemm_kernel_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function_bf16bf16 br_gemm_kernel2_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); # include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16.tpl.c" # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; 
#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->ifmblock : (libxsmm_blasint)handle->desc.v*handle->ifmblock; const libxsmm_blasint ldA = handle->ofmblock; const libxsmm_blasint ldC = handle->ofmblock; const float beta = (handle->avoid_acc_load) ? 0.f : 1.f; typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; typedef libxsmm_bsmmfunction_reducebatch_addr gemm_br_function; typedef libxsmm_bmmfunction_reducebatch_addr gemm_br_function_bf16bf16; int l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | handle->fwd_flags; #define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function_bf16bf16 br_gemm_kernel_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function_bf16bf16 br_gemm_kernel2_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); # include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16.tpl.c" # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #undef 
LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } #else LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid) { return libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu( handle, start_thread, tid ); } #endif LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_i8_i32(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef unsigned char element_input_type; typedef int element_output_type; typedef char element_filter_type; /* Basically we need only offset based and strided BRGEMMs */ libxsmm_subimmfunction_reducebatch_strd br_gemm_kernel_strided = handle->gemm_fwd.xgemm.subimrs; libxsmm_subimmfunction_reducebatch_strd br_gemm_kernel_strided2 = handle->gemm_fwd2.xgemm.subimrs; libxsmm_subimmfunction_reducebatch_offs br_gemm_kernel_offset = handle->gemm_fwd.xgemm.subimro; # include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i32.tpl.c" #else LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_i8_i8(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef unsigned char element_input_type; typedef unsigned char element_output_type; typedef char element_filter_type; /* Basically we need only offset based and strided BRGEMMs */ libxsmm_sububmmfunction_reducebatch_strd 
br_gemm_kernel_strided = handle->gemm_fwd.xgemm.sububmrs; libxsmm_sububmmfunction_reducebatch_offs br_gemm_kernel_offset = handle->gemm_fwd.xgemm.sububmro; # include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i8.tpl.c" #else LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->blocksifm*handle->ifmblock : (libxsmm_blasint)handle->blocksifm*handle->desc.v*handle->ifmblock; const libxsmm_blasint ldA = handle->ofmblock; const libxsmm_blasint ldC = handle->blocksofm*handle->ofmblock; const float beta = (handle->avoid_acc_load) ? 
0.f : 1.f; typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); #define LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM # include "template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c" #undef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_rsck_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->blocksifm*handle->ifmblock : (libxsmm_blasint)handle->blocksifm*handle->desc.v*handle->ifmblock; const libxsmm_blasint ldA = handle->blocksofm*handle->ofmblock; const libxsmm_blasint ldC = handle->blocksofm*handle->ofmblock; const float beta = (handle->avoid_acc_load) ? 
0.f : 1.f; typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); #define LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK # include "template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c" #undef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ if (handle->reg_input == 0 || handle->reg_output == 0 || handle->reg_filter == 0) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_convolve_st_fwd_custom_custom_f32_f32( handle, start_thread, tid); } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_I8 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_I32 ) { status = 
libxsmm_dnn_convolve_st_fwd_custom_custom_i8_i32( handle, start_thread, tid); } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_I8 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_I8 ) { status = libxsmm_dnn_convolve_st_fwd_custom_custom_i8_i8( handle, start_thread, tid); } #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX) { status = libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu( handle, start_thread, tid); } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16( handle, start_thread, tid); } #elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE ) { status = libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu( handle, start_thread, tid); } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->ifmblock : (libxsmm_blasint)handle->desc.v*handle->ifmblock; const libxsmm_blasint ldA = handle->ofmblock; const libxsmm_blasint ldC = handle->ofmblock; const float beta = (handle->avoid_acc_load) ? 
0.f : 1.f; typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); # include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic.tpl.c" } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_custom(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ if (handle->reg_input == 0 || handle->reg_output == 0 || handle->reg_filter == 0) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_convolve_st_fwd_nhwc_custom_f32_f32( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { const libxsmm_blasint ldx = (handle->pack_input == 1) ? 
(libxsmm_blasint)handle->blocksifm*handle->ifmblock : (libxsmm_blasint)handle->blocksifm*handle->desc.v*handle->ifmblock; const libxsmm_blasint ldA = handle->ofmblock; const libxsmm_blasint ldC = handle->blocksofm*handle->ofmblock; const float beta = (handle->avoid_acc_load) ? 0.f : 1.f; typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); #define LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM # include "template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c" #undef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_rsck(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ if (handle->reg_input == 0 || handle->reg_output == 0 || handle->reg_filter == 0) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_convolve_st_fwd_nhwc_rsck_f32_f32( handle, 
start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->blocksifm*handle->ifmblock : (libxsmm_blasint)handle->blocksifm*handle->desc.v*handle->ifmblock; const libxsmm_blasint ldA = handle->blocksofm*handle->ofmblock; const libxsmm_blasint ldC = handle->blocksofm*handle->ofmblock; const float beta = (handle->avoid_acc_load) ? 0.f : 1.f; typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); #define LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK # include "template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c" #undef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } libxsmm-1.17/src/libxsmm_dnn_convolution_forward.h000066400000000000000000000024551415223013700225560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DNN_CONVOLUTION_FORWARD_H #define LIBXSMM_DNN_CONVOLUTION_FORWARD_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_custom(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_rsck(libxsmm_dnn_layer* handle, int start_thread, int tid); #endif /* LIBXSMM_DNN_CONVOLUTION_FORWARD_H */ libxsmm-1.17/src/libxsmm_dnn_convolution_weight_update.c000066400000000000000000001206411415223013700237340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Rajkishore Barik, Alexander Heinecke, Ankush Mandal, Jason Sewall (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_convolution_weight_update.h" #include "libxsmm_main.h" /* function prototypes for below implementations */ LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_emu(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_rsck_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) void transpose_32x16(const libxsmm_bfloat16 *in, libxsmm_bfloat16 *out, int ld_in, int ld_out) { #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; __m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; const int in_width=ld_in, out_width=ld_out; const __m512i idx_lo = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); const __m512i idx_hi = _mm512_set_epi64(7, 6, 15, 14, 3, 2, 11, 10); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); ra = _mm512_loadu_si512(in + 10*in_width); rb = _mm512_loadu_si512(in + 11*in_width); rc = _mm512_loadu_si512(in + 12*in_width); 
rd = _mm512_loadu_si512(in + 13*in_width); re = _mm512_loadu_si512(in + 14*in_width); rf = _mm512_loadu_si512(in + 15*in_width); t0 = _mm512_unpacklo_epi16(r0,r1); t1 = _mm512_unpackhi_epi16(r0,r1); t2 = _mm512_unpacklo_epi16(r2,r3); t3 = _mm512_unpackhi_epi16(r2,r3); t4 = _mm512_unpacklo_epi16(r4,r5); t5 = _mm512_unpackhi_epi16(r4,r5); t6 = _mm512_unpacklo_epi16(r6,r7); t7 = _mm512_unpackhi_epi16(r6,r7); t8 = _mm512_unpacklo_epi16(r8,r9); t9 = _mm512_unpackhi_epi16(r8,r9); ta = _mm512_unpacklo_epi16(ra,rb); tb = _mm512_unpackhi_epi16(ra,rb); tc = _mm512_unpacklo_epi16(rc,rd); td = _mm512_unpackhi_epi16(rc,rd); te = _mm512_unpacklo_epi16(re,rf); tf = _mm512_unpackhi_epi16(re,rf); r0 = _mm512_unpacklo_epi32(t0,t2); r1 = _mm512_unpackhi_epi32(t0,t2); r2 = _mm512_unpacklo_epi32(t1,t3); r3 = _mm512_unpackhi_epi32(t1,t3); r4 = _mm512_unpacklo_epi32(t4,t6); r5 = _mm512_unpackhi_epi32(t4,t6); r6 = _mm512_unpacklo_epi32(t5,t7); r7 = _mm512_unpackhi_epi32(t5,t7); r8 = _mm512_unpacklo_epi32(t8,ta); r9 = _mm512_unpackhi_epi32(t8,ta); ra = _mm512_unpacklo_epi32(t9,tb); rb = _mm512_unpackhi_epi32(t9,tb); rc = _mm512_unpacklo_epi32(tc,te); rd = _mm512_unpackhi_epi32(tc,te); re = _mm512_unpacklo_epi32(td,tf); rf = _mm512_unpackhi_epi32(td,tf); t0 = _mm512_unpacklo_epi64(r0,r4); t1 = _mm512_unpackhi_epi64(r0,r4); t2 = _mm512_unpacklo_epi64(r1,r5); t3 = _mm512_unpackhi_epi64(r1,r5); t4 = _mm512_unpacklo_epi64(r2,r6); t5 = _mm512_unpackhi_epi64(r2,r6); t6 = _mm512_unpacklo_epi64(r3,r7); t7 = _mm512_unpackhi_epi64(r3,r7); t8 = _mm512_unpacklo_epi64(r8,rc); t9 = _mm512_unpackhi_epi64(r8,rc); ta = _mm512_unpacklo_epi64(r9,rd); tb = _mm512_unpackhi_epi64(r9,rd); tc = _mm512_unpacklo_epi64(ra,re); td = _mm512_unpackhi_epi64(ra,re); te = _mm512_unpacklo_epi64(rb,rf); tf = _mm512_unpackhi_epi64(rb,rf); r0 = _mm512_shuffle_i32x4(t0, t1, 0x88); r1 = _mm512_shuffle_i32x4(t2, t3, 0x88); r2 = _mm512_shuffle_i32x4(t4, t5, 0x88); r3 = _mm512_shuffle_i32x4(t6, t7, 0x88); r4 = 
_mm512_shuffle_i32x4(t0, t1, 0xdd); r5 = _mm512_shuffle_i32x4(t2, t3, 0xdd); r6 = _mm512_shuffle_i32x4(t4, t5, 0xdd); r7 = _mm512_shuffle_i32x4(t6, t7, 0xdd); r8 = _mm512_shuffle_i32x4(t8, t9, 0x88); r9 = _mm512_shuffle_i32x4(ta, tb, 0x88); ra = _mm512_shuffle_i32x4(tc, td, 0x88); rb = _mm512_shuffle_i32x4(te, tf, 0x88); rc = _mm512_shuffle_i32x4(t8, t9, 0xdd); rd = _mm512_shuffle_i32x4(ta, tb, 0xdd); re = _mm512_shuffle_i32x4(tc, td, 0xdd); rf = _mm512_shuffle_i32x4(te, tf, 0xdd); t0 = _mm512_permutex2var_epi64(r0, idx_lo, r8); t1 = _mm512_permutex2var_epi64(r1, idx_lo, r9); t2 = _mm512_permutex2var_epi64(r2, idx_lo, ra); t3 = _mm512_permutex2var_epi64(r3, idx_lo, rb); t4 = _mm512_permutex2var_epi64(r4, idx_lo, rc); t5 = _mm512_permutex2var_epi64(r5, idx_lo, rd); t6 = _mm512_permutex2var_epi64(r6, idx_lo, re); t7 = _mm512_permutex2var_epi64(r7, idx_lo, rf); t8 = _mm512_permutex2var_epi64(r8, idx_hi, r0); t9 = _mm512_permutex2var_epi64(r9, idx_hi, r1); ta = _mm512_permutex2var_epi64(ra, idx_hi, r2); tb = _mm512_permutex2var_epi64(rb, idx_hi, r3); tc = _mm512_permutex2var_epi64(rc, idx_hi, r4); td = _mm512_permutex2var_epi64(rd, idx_hi, r5); te = _mm512_permutex2var_epi64(re, idx_hi, r6); tf = _mm512_permutex2var_epi64(rf, idx_hi, r7); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 0*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 1*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 2*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 3*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 4*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 5*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 6*out_width, 
LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 7*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 8*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 9*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 10*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 11*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 12*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 13*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 14*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 15*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 16*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 17*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 18*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 19*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 20*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 21*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 22*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 23*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 24*out_width, 
LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 25*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 26*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 27*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 28*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 29*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 30*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 31*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 1)); #else LIBXSMM_UNUSED(in); LIBXSMM_UNUSED(out); LIBXSMM_UNUSED(ld_in); LIBXSMM_UNUSED(ld_out); #endif } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) void transpose_32xcols(const libxsmm_bfloat16 *in, libxsmm_bfloat16 *out, int col, int ld_in, int ld_out) { #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; __m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; const int in_width=ld_in, out_width=ld_out; const __m512i idx_lo = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); const __m512i idx_hi = _mm512_set_epi64(7, 6, 15, 14, 3, 2, 11, 10); __mmask16 store_mask = LIBXSMM_INTRINSICS_MM512_CVTU32_MASK16(((unsigned int)1 << col) - 1); rf = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); if (col == 15) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 
8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); ra = _mm512_loadu_si512(in + 10*in_width); rb = _mm512_loadu_si512(in + 11*in_width); rc = _mm512_loadu_si512(in + 12*in_width); rd = _mm512_loadu_si512(in + 13*in_width); re = _mm512_loadu_si512(in + 14*in_width); } else if (col == 14) { re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); ra = _mm512_loadu_si512(in + 10*in_width); rb = _mm512_loadu_si512(in + 11*in_width); rc = _mm512_loadu_si512(in + 12*in_width); rd = _mm512_loadu_si512(in + 13*in_width); } else if (col == 13) { rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); ra = _mm512_loadu_si512(in + 10*in_width); rb = _mm512_loadu_si512(in + 11*in_width); rc = _mm512_loadu_si512(in + 12*in_width); } else if (col == 12) { rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = 
_mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); ra = _mm512_loadu_si512(in + 10*in_width); rb = _mm512_loadu_si512(in + 11*in_width); } else if (col == 11) { rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); ra = _mm512_loadu_si512(in + 10*in_width); } else if (col == 10) { ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); } else if (col == 9) { r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = 
_mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); } else if (col == 8) { r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); } else if (col == 7) { r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); } else if (col == 6) { r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r9 = 
LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); } else if (col == 5) { r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); } else if (col == 4) { r4 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); } 
else if (col == 3) { r3 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r4 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); } else if (col == 2) { r2 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r3 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r4 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); } else if (col == 1) { r1 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r2 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r3 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r4 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rb = 
LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r0 = _mm512_loadu_si512(in + 0*in_width); } else { r0 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r1 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r2 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r3 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r4 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); } t0 = _mm512_unpacklo_epi16(r0,r1); t1 = _mm512_unpackhi_epi16(r0,r1); t2 = _mm512_unpacklo_epi16(r2,r3); t3 = _mm512_unpackhi_epi16(r2,r3); t4 = _mm512_unpacklo_epi16(r4,r5); t5 = _mm512_unpackhi_epi16(r4,r5); t6 = _mm512_unpacklo_epi16(r6,r7); t7 = _mm512_unpackhi_epi16(r6,r7); t8 = _mm512_unpacklo_epi16(r8,r9); t9 = _mm512_unpackhi_epi16(r8,r9); ta = _mm512_unpacklo_epi16(ra,rb); tb = _mm512_unpackhi_epi16(ra,rb); tc = _mm512_unpacklo_epi16(rc,rd); td = _mm512_unpackhi_epi16(rc,rd); te = _mm512_unpacklo_epi16(re,rf); tf = _mm512_unpackhi_epi16(re,rf); r0 = _mm512_unpacklo_epi32(t0,t2); r1 = _mm512_unpackhi_epi32(t0,t2); r2 = _mm512_unpacklo_epi32(t1,t3); r3 = _mm512_unpackhi_epi32(t1,t3); r4 = _mm512_unpacklo_epi32(t4,t6); r5 = _mm512_unpackhi_epi32(t4,t6); r6 = _mm512_unpacklo_epi32(t5,t7); r7 = _mm512_unpackhi_epi32(t5,t7); r8 = _mm512_unpacklo_epi32(t8,ta); r9 = _mm512_unpackhi_epi32(t8,ta); ra = _mm512_unpacklo_epi32(t9,tb); rb = _mm512_unpackhi_epi32(t9,tb); rc = _mm512_unpacklo_epi32(tc,te); rd = 
_mm512_unpackhi_epi32(tc,te); re = _mm512_unpacklo_epi32(td,tf); rf = _mm512_unpackhi_epi32(td,tf); t0 = _mm512_unpacklo_epi64(r0,r4); t1 = _mm512_unpackhi_epi64(r0,r4); t2 = _mm512_unpacklo_epi64(r1,r5); t3 = _mm512_unpackhi_epi64(r1,r5); t4 = _mm512_unpacklo_epi64(r2,r6); t5 = _mm512_unpackhi_epi64(r2,r6); t6 = _mm512_unpacklo_epi64(r3,r7); t7 = _mm512_unpackhi_epi64(r3,r7); t8 = _mm512_unpacklo_epi64(r8,rc); t9 = _mm512_unpackhi_epi64(r8,rc); ta = _mm512_unpacklo_epi64(r9,rd); tb = _mm512_unpackhi_epi64(r9,rd); tc = _mm512_unpacklo_epi64(ra,re); td = _mm512_unpackhi_epi64(ra,re); te = _mm512_unpacklo_epi64(rb,rf); tf = _mm512_unpackhi_epi64(rb,rf); r0 = _mm512_shuffle_i32x4(t0, t1, 0x88); r1 = _mm512_shuffle_i32x4(t2, t3, 0x88); r2 = _mm512_shuffle_i32x4(t4, t5, 0x88); r3 = _mm512_shuffle_i32x4(t6, t7, 0x88); r4 = _mm512_shuffle_i32x4(t0, t1, 0xdd); r5 = _mm512_shuffle_i32x4(t2, t3, 0xdd); r6 = _mm512_shuffle_i32x4(t4, t5, 0xdd); r7 = _mm512_shuffle_i32x4(t6, t7, 0xdd); r8 = _mm512_shuffle_i32x4(t8, t9, 0x88); r9 = _mm512_shuffle_i32x4(ta, tb, 0x88); ra = _mm512_shuffle_i32x4(tc, td, 0x88); rb = _mm512_shuffle_i32x4(te, tf, 0x88); rc = _mm512_shuffle_i32x4(t8, t9, 0xdd); rd = _mm512_shuffle_i32x4(ta, tb, 0xdd); re = _mm512_shuffle_i32x4(tc, td, 0xdd); rf = _mm512_shuffle_i32x4(te, tf, 0xdd); t0 = _mm512_permutex2var_epi64(r0, idx_lo, r8); t1 = _mm512_permutex2var_epi64(r1, idx_lo, r9); t2 = _mm512_permutex2var_epi64(r2, idx_lo, ra); t3 = _mm512_permutex2var_epi64(r3, idx_lo, rb); t4 = _mm512_permutex2var_epi64(r4, idx_lo, rc); t5 = _mm512_permutex2var_epi64(r5, idx_lo, rd); t6 = _mm512_permutex2var_epi64(r6, idx_lo, re); t7 = _mm512_permutex2var_epi64(r7, idx_lo, rf); t8 = _mm512_permutex2var_epi64(r8, idx_hi, r0); t9 = _mm512_permutex2var_epi64(r9, idx_hi, r1); ta = _mm512_permutex2var_epi64(ra, idx_hi, r2); tb = _mm512_permutex2var_epi64(rb, idx_hi, r3); tc = _mm512_permutex2var_epi64(rc, idx_hi, r4); td = _mm512_permutex2var_epi64(rd, idx_hi, r5); te = 
_mm512_permutex2var_epi64(re, idx_hi, r6); tf = _mm512_permutex2var_epi64(rf, idx_hi, r7); _mm256_mask_storeu_epi16(out + 0*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 0)); _mm256_mask_storeu_epi16(out + 1*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 1)); _mm256_mask_storeu_epi16(out + 2*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 0)); _mm256_mask_storeu_epi16(out + 3*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 1)); _mm256_mask_storeu_epi16(out + 4*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 0)); _mm256_mask_storeu_epi16(out + 5*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 1)); _mm256_mask_storeu_epi16(out + 6*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 0)); _mm256_mask_storeu_epi16(out + 7*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 1)); _mm256_mask_storeu_epi16(out + 8*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 0)); _mm256_mask_storeu_epi16(out + 9*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 1)); _mm256_mask_storeu_epi16(out + 10*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 0)); _mm256_mask_storeu_epi16(out + 11*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 1)); _mm256_mask_storeu_epi16(out + 12*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 0)); _mm256_mask_storeu_epi16(out + 13*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 1)); _mm256_mask_storeu_epi16(out + 14*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 0)); _mm256_mask_storeu_epi16(out + 15*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 1)); _mm256_mask_storeu_epi16(out + 16*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 0)); _mm256_mask_storeu_epi16(out + 17*out_width, 
store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 1)); _mm256_mask_storeu_epi16(out + 18*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 0)); _mm256_mask_storeu_epi16(out + 19*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 1)); _mm256_mask_storeu_epi16(out + 20*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 0)); _mm256_mask_storeu_epi16(out + 21*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 1)); _mm256_mask_storeu_epi16(out + 22*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 0)); _mm256_mask_storeu_epi16(out + 23*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 1)); _mm256_mask_storeu_epi16(out + 24*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 0)); _mm256_mask_storeu_epi16(out + 25*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 1)); _mm256_mask_storeu_epi16(out + 26*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 0)); _mm256_mask_storeu_epi16(out + 27*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 1)); _mm256_mask_storeu_epi16(out + 28*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 0)); _mm256_mask_storeu_epi16(out + 29*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 1)); _mm256_mask_storeu_epi16(out + 30*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 0)); _mm256_mask_storeu_epi16(out + 31*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 1)); #else LIBXSMM_UNUSED(in); LIBXSMM_UNUSED(out); LIBXSMM_UNUSED(col); LIBXSMM_UNUSED(ld_in); LIBXSMM_UNUSED(ld_out); #endif } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) void transpose_input_pixels_bf16(const libxsmm_bfloat16 *in, libxsmm_bfloat16 *out, int M, int N, int ld_in, int ld_out){ #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) int i, j; int full16_chunks = N/16; int 
remainder_cols = N%16; int _N = N - remainder_cols; if (full16_chunks) { for (i=0; i FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" # include "template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16.tpl.c" # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; typedef libxsmm_bsmmfunction gemm_function; typedef libxsmm_bsmmfunction_reducebatch_addr gemm_br_function; #define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" # include "template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16.tpl.c" # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } #else LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid) { return libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_emu( handle, start_thread, tid ); } #endif LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t 
libxsmm_dnn_convolve_st_upd_nhwc_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; #define LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM # include "template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c" #undef LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_rsck_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; #define LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK # include "template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c" #undef LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ if (handle->reg_input == 0 || handle->grad_output == 0 || handle->grad_filter == 0 || handle->scratch == 0) { status = 
LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_convolve_st_upd_custom_custom_f32_f32( handle, start_thread, tid); } #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_emu( handle, start_thread, tid); } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16( handle, start_thread, tid); } #elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE ) { status = libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_emu( handle, start_thread, tid); } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; # include "template/libxsmm_dnn_convolve_st_upd_custom_custom_generic.tpl.c" } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; 
return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_custom(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ if (handle->reg_input == 0 || handle->grad_output == 0 || handle->grad_filter == 0 || handle->scratch == 0) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_convolve_st_upd_nhwc_custom_f32_f32( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; #define LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM # include "template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c" #undef LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_rsck(libxsmm_dnn_layer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ if (handle->reg_input == 0 || handle->grad_output == 0 || handle->grad_filter == 0 || handle->scratch == 0) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) 
{ if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_convolve_st_upd_nhwc_rsck_f32_f32( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; #define LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK # include "template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c" #undef LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } libxsmm-1.17/src/libxsmm_dnn_convolution_weight_update.h000066400000000000000000000025211415223013700237350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Rajkishore Barik, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_CONVOLUTION_WEIGHT_UPDATE_H #define LIBXSMM_DNN_CONVOLUTION_WEIGHT_UPDATE_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_rsck(libxsmm_dnn_layer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_custom(libxsmm_dnn_layer* handle, int start_thread, int tid); #endif /* LIBXSMM_DNN_CONVOLUTION_WEIGHT_UPDATE_H */ libxsmm-1.17/src/libxsmm_dnn_elementwise.c000066400000000000000000000641271415223013700207730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee, Evangelos Georganas (Intel Corp.) ******************************************************************************/ #include "libxsmm_dnn_elementwise.h" #include "libxsmm_blocked_gemm_types.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif LIBXSMM_API_INTERN void libxsmm_internal_matrix_zero(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? 
(ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { src[i] = (LIBXSMM_DNN_ELTWISE_FTYPE)0; } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_add(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *a, LIBXSMM_DNN_ELTWISE_FTYPE *b, LIBXSMM_DNN_ELTWISE_FTYPE *c, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { c[i] = a[i] + b[i]; } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_eltwise_mult(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *a, LIBXSMM_DNN_ELTWISE_FTYPE *b, LIBXSMM_DNN_ELTWISE_FTYPE *c, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { c[i] = a[i] * b[i]; } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? 
(ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { const LIBXSMM_DNN_ELTWISE_FTYPE exp_value = (LIBXSMM_DNN_ELTWISE_FTYPE)exp((double) -src[i]); dst[i] = 1 / (1 + exp_value); } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { dst[i] = (LIBXSMM_DNN_ELTWISE_FTYPE)tanh((double)src[i]); } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { dst[i] = (src[i] > 0.0f) ? src[i] : 0.0f; } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? 
(size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { const LIBXSMM_DNN_ELTWISE_FTYPE exp_value = (LIBXSMM_DNN_ELTWISE_FTYPE)exp((double) -src[i]); const LIBXSMM_DNN_ELTWISE_FTYPE sig_exp = 1 / (1 + exp_value); dst[i] = (1 - sig_exp)*sig_exp; } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { const LIBXSMM_DNN_ELTWISE_FTYPE tanh_value = (LIBXSMM_DNN_ELTWISE_FTYPE)tanh((double)src[i]); dst[i] = 1 - (tanh_value * tanh_value); } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { dst[i] = (LIBXSMM_DNN_ELTWISE_FTYPE)(src[i] > 0.0f ? 
1.0f : 0.0f); } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_transpose(libxsmm_blasint rows, libxsmm_blasint cols, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const libxsmm_blasint size = rows * cols; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); LIBXSMM_VLA_DECL(2, LIBXSMM_DNN_ELTWISE_FTYPE, src2D, src, cols); LIBXSMM_VLA_DECL(2, LIBXSMM_DNN_ELTWISE_FTYPE, dst2D, dst, rows); libxsmm_blasint job; for (job = thr_begin; job < thr_end; ++job) { const libxsmm_blasint i = job / cols; const libxsmm_blasint j = job % cols; LIBXSMM_VLA_ACCESS(2, dst2D, j, i, rows) = LIBXSMM_VLA_ACCESS(2, src2D, i, j, cols); } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_copy(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { dst[i] = src[i]; } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? 
(size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { dst[i] = 1 - src[i]; } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement_square(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { dst[i] = 1 - (src[i] * src[i]); } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); libxsmm_blasint i; for (i = thr_begin; i < thr_end; i++) { dst[i] = -src[i]; } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_1D_2D(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint bm, libxsmm_blasint bn, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) { const int ltid = tid - start_thread; /* compute chunk size */ const libxsmm_blasint chunksize = (m % nthreads == 0) ? 
(m / nthreads) : (m / nthreads) + 1; /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < m) ? (ltid * chunksize) : m; const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, m); libxsmm_blasint i, j; LIBXSMM_VLA_DECL(4, LIBXSMM_DNN_ELTWISE_FTYPE, real_dst, (LIBXSMM_DNN_ELTWISE_FTYPE*)dst, m/bm, bn, bm); for (i = thr_begin; i < thr_end; i++) { const libxsmm_blasint mb = i/bm; const libxsmm_blasint ibm = i%bm; for (j = 0; j < n; j++) { const libxsmm_blasint nb = j/bn; const libxsmm_blasint ibn = j%bn; LIBXSMM_VLA_ACCESS(4, real_dst, nb, mb, ibn, ibm, m/bm, bn, bm) = src[i]; } } } /* #define LSTM_TIMING */ #if defined(LSTM_TIMING) extern double Gbl_t_input_total, Gbl_t_recur_total, Gbl_t_eltwise_total, Gbl_t_nonlin_total; extern unsigned long long Gbl_t_input, Gbl_t_recur, Gbl_t_eltwise, Gbl_t_nonlin; extern double Gbl_duration_input, Gbl_duration_recur, Gbl_duration_eltwise, Gbl_duration_nonlin; #endif LIBXSMM_API_INTERN void libxsmm_internal_recursive_step(libxsmm_blocked_gemm_handle* handle, LIBXSMM_DNN_ELTWISE_FTYPE* u, LIBXSMM_DNN_ELTWISE_FTYPE* h, LIBXSMM_DNN_ELTWISE_FTYPE* op1, LIBXSMM_DNN_ELTWISE_FTYPE *op2, LIBXSMM_DNN_ELTWISE_FTYPE *temp, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int act, libxsmm_blasint size, int start_thread, int tid) { const int ltid = tid - start_thread; #if defined(LSTM_TIMING) if (ltid == 0) { Gbl_t_recur = libxsmm_timer_tick(); } #endif libxsmm_blocked_gemm_st(handle, u, h, op1, start_thread, ltid); #if defined(LSTM_TIMING) if (ltid == 0) { Gbl_duration_recur = libxsmm_timer_duration(Gbl_t_recur, libxsmm_timer_tick()); Gbl_t_recur_total += Gbl_duration_recur; Gbl_t_eltwise = libxsmm_timer_tick(); } #endif libxsmm_internal_matrix_add(size, op1, op2, temp, start_thread, ltid, handle->nthreads); #if defined(LSTM_TIMING) libxsmm_barrier_wait(handle->barrier, ltid); /* Additional barrier introduced to measure time */ if (ltid == 0) { Gbl_duration_eltwise = libxsmm_timer_duration(Gbl_t_eltwise, 
libxsmm_timer_tick()); Gbl_t_eltwise_total += Gbl_duration_eltwise; Gbl_t_nonlin = libxsmm_timer_tick(); } #endif switch (act) { case 0: /* do nothing */ dst = temp; break; case 1: libxsmm_internal_matrix_relu(size, temp, dst, start_thread, tid, handle->nthreads); break; case 2: libxsmm_internal_matrix_sigmoid(size, temp, dst, start_thread, tid, handle->nthreads); break; case 3: libxsmm_internal_matrix_tanh(size, temp, dst, start_thread, tid, handle->nthreads); break; default: /* fprintf(stdout, "Unsupported activation function: %d\n", act); */ dst = temp; } #if defined(LSTM_TIMING) libxsmm_barrier_wait(handle->barrier, ltid); /* Additional barrier introduced to measure time */ if (ltid == 0) { Gbl_duration_nonlin = libxsmm_timer_duration(Gbl_t_nonlin, libxsmm_timer_tick()); Gbl_t_nonlin_total += Gbl_duration_nonlin; } #endif } LIBXSMM_API_INTERN void libxsmm_internal_matrix_zero_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { srcdst[(j*ld)+i] = (LIBXSMM_DNN_ELTWISE_FTYPE)0; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_copy_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { dst[(j*ld)+i] = src[(j*ld)+i]; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_add_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { dst[(j*ld)+i] = src0[(j*ld)+i] + src1[(j*ld)+i]; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_sub_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, 
LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { dst[(j*ld)+i] = src0[(j*ld)+i] - src1[(j*ld)+i]; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { dst[(j*ld)+i] = src0[(j*ld)+i] * src1[(j*ld)+i]; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { srcdst[(j*ld)+i] *= src0[(j*ld)+i]; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_eltwise_fma_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { dst[(j*ld)+i] += src0[(j*ld)+i] * src1[(j*ld)+i]; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_add_colvector_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, LIBXSMM_DNN_ELTWISE_FTYPE *colv) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { srcdst[(j*ld)+i] += colv[i]; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_colvector_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, LIBXSMM_DNN_ELTWISE_FTYPE *colv) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { srcdst[(j*ld)+i] = colv[i]; } } } 
LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_cvt_bf16_fp32_colvector_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, libxsmm_bfloat16 *colv) { libxsmm_blasint i, j; libxsmm_bfloat16_hp t; t.i[0] = 0; for ( j = 0; j < n; ++j ) { for ( i = 0; i < m; ++i ) { t.i[1] = colv[i]; srcdst[(j*ld)+i] = t.f; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_colvector_const_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, LIBXSMM_DNN_ELTWISE_FTYPE *colv, LIBXSMM_DNN_ELTWISE_FTYPE const_bias) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { srcdst[(j*ld)+i] = colv[i] + const_bias; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_cvt_bf16_fp32_colvector_const_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, libxsmm_bfloat16 *colv, LIBXSMM_DNN_ELTWISE_FTYPE const_bias) { libxsmm_blasint i, j; libxsmm_bfloat16_hp t; t.i[0] = 0; for ( j = 0; j < n; ++j ) { for ( i = 0; i < m; ++i ) { t.i[1] = colv[i]; srcdst[(j*ld)+i] = t.f + const_bias; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { const LIBXSMM_DNN_ELTWISE_FTYPE mid_value = (LIBXSMM_DNN_ELTWISE_FTYPE)exp((double) -src[(j*ld)+i]); dst[(j*ld)+i] = (LIBXSMM_DNN_ELTWISE_FTYPE)1 / ((LIBXSMM_DNN_ELTWISE_FTYPE)1 + mid_value); } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { dst[(j*ld)+i] = (LIBXSMM_DNN_ELTWISE_FTYPE)tanh((double) 
src[(j*ld)+i]); } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { dst[(j*ld)+i] = (src[(j*ld)+i] < 0) ? (LIBXSMM_DNN_ELTWISE_FTYPE)0 : src[(j*ld)+i]; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_inverse_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { LIBXSMM_DNN_ELTWISE_FTYPE exp_value = (LIBXSMM_DNN_ELTWISE_FTYPE)exp((double) -src[(j*ld)+i]); LIBXSMM_DNN_ELTWISE_FTYPE mid_value = (LIBXSMM_DNN_ELTWISE_FTYPE)1 / ((LIBXSMM_DNN_ELTWISE_FTYPE)1 + exp_value); dst[(j*ld)+i] = ((LIBXSMM_DNN_ELTWISE_FTYPE)1 - mid_value) * mid_value; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_inverse_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { LIBXSMM_DNN_ELTWISE_FTYPE tanh_value = (LIBXSMM_DNN_ELTWISE_FTYPE)tanh((double) src[(j*ld)+i]); dst[(j*ld)+i] = (LIBXSMM_DNN_ELTWISE_FTYPE)1 - (tanh_value * tanh_value); } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_inverse_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { dst[(j*ld)+i] = (src[(j*ld)+i] < 0) ? 
(LIBXSMM_DNN_ELTWISE_FTYPE)0 : (LIBXSMM_DNN_ELTWISE_FTYPE)1; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_inverse_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { LIBXSMM_DNN_ELTWISE_FTYPE exp_value = (LIBXSMM_DNN_ELTWISE_FTYPE)exp((double) -src[(j*ld)+i]); LIBXSMM_DNN_ELTWISE_FTYPE mid_value = (LIBXSMM_DNN_ELTWISE_FTYPE)1 / ((LIBXSMM_DNN_ELTWISE_FTYPE)1 + exp_value); dst[(j*ld)+i] *= ((LIBXSMM_DNN_ELTWISE_FTYPE)1 - mid_value) * mid_value; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_inverse_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { LIBXSMM_DNN_ELTWISE_FTYPE tanh_value = (LIBXSMM_DNN_ELTWISE_FTYPE)tanh((double) src[(j*ld)+i]); dst[(j*ld)+i] *= (LIBXSMM_DNN_ELTWISE_FTYPE)1 - (tanh_value * tanh_value); } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_inverse_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { dst[(j*ld)+i] *= (src[(j*ld)+i] < 0) ? 
(LIBXSMM_DNN_ELTWISE_FTYPE)0 : (LIBXSMM_DNN_ELTWISE_FTYPE)1; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { dst[(j*ld)+i] = (LIBXSMM_DNN_ELTWISE_FTYPE)1 - src[(j*ld)+i]; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement_square_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i = 0, j; for ( j = 0; j < n; ++j ) { LIBXSMM_PRAGMA_SIMD for ( i = 0; i < m; ++i ) { dst[(j*ld)+i] = (LIBXSMM_DNN_ELTWISE_FTYPE)1 - (src[(j*ld)+i] * src[(j*ld)+i]); } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_rne_mask_fp32_bfp16_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, float* src, float* dst) { libxsmm_blasint i,j; /* rnaz buffer to bfp16 */ for ( j = 0; j < n; ++j ) { for ( i = 0; i < m; ++i ) { unsigned int int_round = 0; unsigned int do_round = 1; const void *const ptr = &int_round; int_round = *((unsigned int*)&(src[(j*ld)+i])); /* we don't round NaN and inf */ if ( (int_round & 0x7f800000) == 0x7f800000 ) { do_round = 0; } /* perform round nearest tie even */ if ( do_round != 0 ) { unsigned int fixup = (int_round >> 16) & 1; int_round = int_round + 0x00007fff + fixup; } /* chop bits to create BFP16 in FP32 */ int_round = int_round & 0xffff0000; dst[(j*ld)+i] = *((float*)ptr); } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_rne_cvt_fp32_bfp16_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, float* src, libxsmm_bfloat16* dst) { libxsmm_blasint i,j; /* truncate buffer to bfp16 */ for ( j = 0; j < n; ++j ) { for ( i = 0; i < m; ++i ) { unsigned int int_round = 0; unsigned int do_round = 1; int_round = *((unsigned int*)&(src[(j*ld)+i])); /* we don't round NaN and inf */ if ( (int_round & 
0x7f800000) == 0x7f800000 ) { do_round = 0; } /* perform round nearest tie even */ if ( do_round != 0 ) { unsigned int fixup = (int_round >> 16) & 1; int_round = int_round + 0x00007fff + fixup; } /* create the bfp16 value by shifting out the lower 16bits */ int_round = int_round >> 16; dst[(j*ld)+i] = (unsigned short)int_round; } } } LIBXSMM_API_INTERN void libxsmm_internal_matrix_cvt_bf16_fp32_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, libxsmm_bfloat16 *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) { libxsmm_blasint i, j; libxsmm_bfloat16_hp t; t.i[0] = 0; for ( j = 0; j < n; ++j ) { for ( i = 0; i < m; ++i ) { t.i[1] = src[(j*ld)+i]; dst[(j*ld)+i] = t.f; } } } libxsmm-1.17/src/libxsmm_dnn_elementwise.h000066400000000000000000000221101415223013700207620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee, Evangelos Georganas (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_ELEMENTWISE_H #define LIBXSMM_DNN_ELEMENTWISE_H #include #if !defined(LIBXSMM_DNN_ELTWISE_FTYPE) # define LIBXSMM_DNN_ELTWISE_FTYPE float #endif LIBXSMM_API_INTERN void libxsmm_internal_matrix_zero(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_add(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *a, LIBXSMM_DNN_ELTWISE_FTYPE *b, LIBXSMM_DNN_ELTWISE_FTYPE *c, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_eltwise_mult(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *a, LIBXSMM_DNN_ELTWISE_FTYPE *b, LIBXSMM_DNN_ELTWISE_FTYPE *c, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void 
libxsmm_internal_matrix_transpose(libxsmm_blasint rows, libxsmm_blasint cols, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_copy(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement_square(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_matrix_1D_2D(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint bm, libxsmm_blasint bn, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_internal_recursive_step(libxsmm_blocked_gemm_handle* handle, LIBXSMM_DNN_ELTWISE_FTYPE* u, LIBXSMM_DNN_ELTWISE_FTYPE* h, LIBXSMM_DNN_ELTWISE_FTYPE* op1, LIBXSMM_DNN_ELTWISE_FTYPE *op2, LIBXSMM_DNN_ELTWISE_FTYPE *temp, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int act, libxsmm_blasint size, int start_thread, int tid); LIBXSMM_API_INTERN void libxsmm_internal_matrix_zero_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_add_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_sub_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, 
LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_copy_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_eltwise_fma_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_add_colvector_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, LIBXSMM_DNN_ELTWISE_FTYPE *colv); LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_colvector_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, LIBXSMM_DNN_ELTWISE_FTYPE *colv); LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_colvector_const_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, LIBXSMM_DNN_ELTWISE_FTYPE *colv, LIBXSMM_DNN_ELTWISE_FTYPE const_bias); LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_cvt_bf16_fp32_colvector_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, libxsmm_bfloat16 *colv); LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_cvt_bf16_fp32_colvector_const_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, libxsmm_bfloat16 *colv, LIBXSMM_DNN_ELTWISE_FTYPE const_bias); LIBXSMM_API_INTERN 
void libxsmm_internal_matrix_sigmoid_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_inverse_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_inverse_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_inverse_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_inverse_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_inverse_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_inverse_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement_square_ld(libxsmm_blasint m, 
libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_rne_mask_fp32_bfp16_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, float* src, float* dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_rne_cvt_fp32_bfp16_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, float* src, libxsmm_bfloat16* dst); LIBXSMM_API_INTERN void libxsmm_internal_matrix_cvt_bf16_fp32_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, libxsmm_bfloat16 *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst); #endif /*LIBXSMM_DNN_ELEMENTWISE_H*/ libxsmm-1.17/src/libxsmm_dnn_fullyconnected.c000066400000000000000000002134571415223013700214720ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_fullyconnected_backward_weight_update.h" #include "libxsmm_dnn_fullyconnected_forward.h" #include "libxsmm_main.h" LIBXSMM_API libxsmm_dnn_fullyconnected* libxsmm_dnn_create_fullyconnected(libxsmm_dnn_fullyconnected_desc fullyconnected_desc, libxsmm_dnn_err_t* status) { libxsmm_dnn_fullyconnected* handle = 0; const libxsmm_trans_descriptor* tr_desc = 0; libxsmm_descriptor_blob blob; /* init libxsmm */ LIBXSMM_INIT if ( ((fullyconnected_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (fullyconnected_desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) || ((fullyconnected_desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (fullyconnected_desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((fullyconnected_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (fullyconnected_desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) ) { handle = (libxsmm_dnn_fullyconnected*)malloc(sizeof(libxsmm_dnn_fullyconnected)); if (0 != handle) { *status = LIBXSMM_DNN_SUCCESS; /* zero entire content; not only safer but also sets data and code pointers to NULL */ memset(handle, 0, sizeof(*handle)); /* let's make the description persistent */ handle->desc = fullyconnected_desc; /* @TODO perhaps we need a better switch here */ if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { handle->bk = handle->desc.bk; handle->bn = handle->desc.bn; handle->bc = handle->desc.bc; if ( handle->desc.N % handle->bn != 0 ) { handle->bn = handle->desc.N; *status = LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_N_BLOCKING; } if ( handle->desc.C % handle->bc != 0 ) { handle->bc = handle->desc.C; *status = LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_C_BLOCKING; } if ( handle->desc.K % handle->bk != 0 ) { handle->bk = handle->desc.K; *status = LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_K_BLOCKING; } if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && 
(handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { #if 0 handle->fwd_bf = atoi(getenv("FWD_BF")); handle->bwd_bf = atoi(getenv("BWD_BF")); handle->upd_bf = atoi(getenv("UPD_BF")); handle->fwd_2d_blocking = atoi(getenv("FWD_2D_BLOCKING")); handle->bwd_2d_blocking = atoi(getenv("BWD_2D_BLOCKING")); handle->upd_2d_blocking = atoi(getenv("UPD_2D_BLOCKING")); handle->fwd_row_teams = atoi(getenv("FWD_ROW_TEAMS")); handle->fwd_column_teams = atoi(getenv("FWD_COLUMN_TEAMS")); handle->bwd_row_teams = atoi(getenv("BWD_ROW_TEAMS")); handle->bwd_column_teams = atoi(getenv("BWD_COLUMN_TEAMS")); handle->upd_row_teams = atoi(getenv("UPD_ROW_TEAMS")); handle->upd_column_teams = atoi(getenv("UPD_COLUMN_TEAMS")); handle->ifm_subtasks = atoi(getenv("IFM_SUBTASKS")); handle->ofm_subtasks = atoi(getenv("OFM_SUBTASKS")); #else /* Initialize with default values */ handle->fwd_bf = 1; handle->bwd_bf = 1; handle->upd_bf = 1; handle->fwd_2d_blocking = 0; handle->bwd_2d_blocking = 0; handle->upd_2d_blocking = 0; handle->fwd_row_teams = 1; handle->fwd_column_teams = 1; handle->bwd_row_teams = 1; handle->bwd_column_teams = 1; handle->upd_row_teams = 1; handle->upd_column_teams = 1; handle->ifm_subtasks = 1; handle->ofm_subtasks = 1; if (handle->desc.C == 100 && handle->desc.K == 1024 && handle->desc.threads == 28) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 1; handle->fwd_row_teams = 14; handle->fwd_column_teams = 2; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 1; handle->bwd_column_teams = 1; handle->upd_bf = ((handle->desc.N/handle->bn) % 14 == 0) ? 14 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 1; handle->upd_column_teams = 1; handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 
1 : 1*/; } if (handle->desc.C == 1024 && handle->desc.K == 1024 && handle->desc.threads == 28) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 1; handle->fwd_row_teams = 7; handle->fwd_column_teams = 4; handle->bwd_bf = ((handle->desc.K/handle->bk) % 8 == 0) ? 8 : 1; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 7; handle->bwd_column_teams = 4; handle->upd_bf = ((handle->desc.N/handle->bn) % 14 == 0) ? 14 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 7; handle->upd_column_teams = 4; handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 512 && handle->desc.K == 512 && handle->desc.threads == 28) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 1; handle->fwd_column_teams = 1; handle->bwd_bf = ((handle->desc.K/handle->bk) % 4 == 0) ? 4 : 1; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 1; handle->bwd_column_teams = 1; handle->upd_bf = ((handle->desc.N/handle->bn) % 14 == 0) ? 14 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 1; handle->upd_column_teams = 1; handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 1024 && handle->desc.K == 1 && handle->desc.threads == 28) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 1; handle->fwd_column_teams = 1; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 1; handle->bwd_row_teams = 14; handle->bwd_column_teams = 2; handle->upd_bf = ((handle->desc.N/handle->bn) % 2 == 0) ? 
2 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 1; handle->upd_column_teams = 1; handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 1024 && handle->desc.K == 1024 && handle->desc.threads == 20) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 5; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 1; handle->bwd_row_teams = 5; handle->bwd_column_teams = 4; handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 5; handle->upd_column_teams = 4; handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 100 && handle->desc.K == 1024 && handle->desc.threads == 20) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 1; handle->fwd_row_teams = 5; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 1; handle->bwd_column_teams = 1; handle->upd_bf = ((handle->desc.N/handle->bn) % 9 == 0) ? 9 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 1; handle->upd_column_teams = 1; handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; handle->ofm_subtasks = ((handle->bk % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; } if (handle->desc.C == 1024 && handle->desc.K == 1024 && handle->desc.threads == 24) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 
1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 6; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 6; handle->bwd_column_teams = 4; handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 6; handle->upd_column_teams = 4; handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 100 && handle->desc.K == 1024 && handle->desc.threads == 24) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 5; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 1; handle->bwd_row_teams = 12; handle->bwd_column_teams = 2; handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 5; handle->upd_column_teams = 4; handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 512 && handle->desc.K == 512 && handle->desc.threads == 24) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 5; handle->fwd_column_teams = 4; handle->bwd_bf = ((handle->desc.K/handle->bk) % 4 == 0) ? 4 : 1; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 5; handle->bwd_column_teams = 4; handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 5; handle->upd_column_teams = 4; handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 
2 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 512 && handle->desc.K == 512 && handle->desc.threads == 20) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 1; handle->fwd_row_teams = 5; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 1; handle->bwd_column_teams = 1; handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 1; handle->upd_column_teams = 1; handle->ifm_subtasks = ((handle->bc % 4 == 0) && (handle->upd_2d_blocking == 0)) ? 4 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 1024 && handle->desc.K == 1 && handle->desc.threads == 24) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 5; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 5; handle->bwd_column_teams = 4; handle->upd_bf = 1/*((handle->desc.N/handle->bn) % 1 == 0) ? 1 : 1*/; handle->upd_2d_blocking = 0; handle->upd_row_teams = 5; handle->upd_column_teams = 4; handle->ifm_subtasks = ((handle->bc % 4 == 0) && (handle->upd_2d_blocking == 0)) ? 4 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 1024 && handle->desc.K == 1 && handle->desc.threads == 20) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 6; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 
1 : 1*/; handle->bwd_2d_blocking = 1; handle->bwd_row_teams = 5; handle->bwd_column_teams = 4; handle->upd_bf = 1/*((handle->desc.N/handle->bn) % 1 == 0) ? 1 : 1*/; handle->upd_2d_blocking = 0; handle->upd_row_teams = 6; handle->upd_column_teams = 4; handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } #endif } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { #if 0 handle->fwd_bf = atoi(getenv("FWD_BF")); handle->bwd_bf = atoi(getenv("BWD_BF")); handle->upd_bf = atoi(getenv("UPD_BF")); handle->fwd_2d_blocking = atoi(getenv("FWD_2D_BLOCKING")); handle->bwd_2d_blocking = atoi(getenv("BWD_2D_BLOCKING")); handle->upd_2d_blocking = atoi(getenv("UPD_2D_BLOCKING")); handle->fwd_row_teams = atoi(getenv("FWD_ROW_TEAMS")); handle->fwd_column_teams = atoi(getenv("FWD_COLUMN_TEAMS")); handle->bwd_row_teams = atoi(getenv("BWD_ROW_TEAMS")); handle->bwd_column_teams = atoi(getenv("BWD_COLUMN_TEAMS")); handle->upd_row_teams = atoi(getenv("UPD_ROW_TEAMS")); handle->upd_column_teams = atoi(getenv("UPD_COLUMN_TEAMS")); handle->ifm_subtasks = atoi(getenv("IFM_SUBTASKS")); handle->ofm_subtasks = atoi(getenv("OFM_SUBTASKS")); #else /* Initialize with default values */ handle->fwd_bf = 1; handle->bwd_bf = 1; handle->upd_bf = 1; handle->fwd_2d_blocking = 0; handle->bwd_2d_blocking = 0; handle->upd_2d_blocking = 0; handle->fwd_row_teams = 1; handle->fwd_column_teams = 1; handle->bwd_row_teams = 1; handle->bwd_column_teams = 1; handle->upd_row_teams = 1; handle->upd_column_teams = 1; handle->ifm_subtasks = 1; handle->ofm_subtasks = 1; if (handle->desc.C == 100 && handle->desc.K == 1024 && handle->desc.threads == 28) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 
1 : 1*/; handle->fwd_2d_blocking = 1; handle->fwd_row_teams = 14; handle->fwd_column_teams = 2; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 1; handle->bwd_column_teams = 1; handle->upd_bf = ((handle->desc.N/handle->bn) % 14 == 0) ? 14 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 1; handle->upd_column_teams = 1; handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 1024 && handle->desc.K == 1024 && handle->desc.threads == 28) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 1; handle->fwd_row_teams = 7; handle->fwd_column_teams = 4; handle->bwd_bf = ((handle->desc.K/handle->bk) % 8 == 0) ? 8 : 1; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 7; handle->bwd_column_teams = 4; handle->upd_bf = ((handle->desc.N/handle->bn) % 14 == 0) ? 14 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 7; handle->upd_column_teams = 4; handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 512 && handle->desc.K == 512 && handle->desc.threads == 28) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 1; handle->fwd_column_teams = 1; handle->bwd_bf = ((handle->desc.K/handle->bk) % 4 == 0) ? 4 : 1; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 1; handle->bwd_column_teams = 1; handle->upd_bf = ((handle->desc.N/handle->bn) % 14 == 0) ? 14 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 1; handle->upd_column_teams = 1; handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 
2 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 1024 && handle->desc.K == 1 && handle->desc.threads == 28) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 1; handle->fwd_column_teams = 1; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 1; handle->bwd_row_teams = 14; handle->bwd_column_teams = 2; handle->upd_bf = ((handle->desc.N/handle->bn) % 2 == 0) ? 2 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 1; handle->upd_column_teams = 1; handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 1024 && handle->desc.K == 1024 && handle->desc.threads == 20) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 5; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 1; handle->bwd_row_teams = 5; handle->bwd_column_teams = 4; handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 5; handle->upd_column_teams = 4; handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 100 && handle->desc.K == 1024 && handle->desc.threads == 20) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 1; handle->fwd_row_teams = 5; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 
1 : 1*/; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 1; handle->bwd_column_teams = 1; handle->upd_bf = ((handle->desc.N/handle->bn) % 9 == 0) ? 9 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 1; handle->upd_column_teams = 1; handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; handle->ofm_subtasks = ((handle->bk % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; } if (handle->desc.C == 1024 && handle->desc.K == 1024 && handle->desc.threads == 24) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 6; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 6; handle->bwd_column_teams = 4; handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 6; handle->upd_column_teams = 4; handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 100 && handle->desc.K == 1024 && handle->desc.threads == 24) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 5; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 1; handle->bwd_row_teams = 12; handle->bwd_column_teams = 2; handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 5; handle->upd_column_teams = 4; handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 
1 : 1*/; } if (handle->desc.C == 512 && handle->desc.K == 512 && handle->desc.threads == 24) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 5; handle->fwd_column_teams = 4; handle->bwd_bf = ((handle->desc.K/handle->bk) % 4 == 0) ? 4 : 1; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 5; handle->bwd_column_teams = 4; handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 5; handle->upd_column_teams = 4; handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 512 && handle->desc.K == 512 && handle->desc.threads == 20) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 1; handle->fwd_row_teams = 5; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 1; handle->bwd_column_teams = 1; handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; handle->upd_2d_blocking = 0; handle->upd_row_teams = 1; handle->upd_column_teams = 1; handle->ifm_subtasks = ((handle->bc % 4 == 0) && (handle->upd_2d_blocking == 0)) ? 4 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 1024 && handle->desc.K == 1 && handle->desc.threads == 24) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 5; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 0; handle->bwd_row_teams = 5; handle->bwd_column_teams = 4; handle->upd_bf = 1/*((handle->desc.N/handle->bn) % 1 == 0) ? 
1 : 1*/; handle->upd_2d_blocking = 0; handle->upd_row_teams = 5; handle->upd_column_teams = 4; handle->ifm_subtasks = ((handle->bc % 4 == 0) && (handle->upd_2d_blocking == 0)) ? 4 : 1; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; } if (handle->desc.C == 1024 && handle->desc.K == 1 && handle->desc.threads == 20) { handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; handle->fwd_2d_blocking = 0; handle->fwd_row_teams = 6; handle->fwd_column_teams = 4; handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; handle->bwd_2d_blocking = 1; handle->bwd_row_teams = 5; handle->bwd_column_teams = 4; handle->upd_bf = 1/*((handle->desc.N/handle->bn) % 1 == 0) ? 1 : 1*/; handle->upd_2d_blocking = 0; handle->upd_row_teams = 6; handle->upd_column_teams = 4; handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 
1 : 1*/; } #endif } } else { /* check that we cannot fuse */ if ( handle->desc.fuse_ops != LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { free( handle ); *status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; return 0; } /* we need to compute the memory layout given the */ if ( (handle->desc.C % 16 == 0) && (handle->desc.K % 16 == 0) ) { if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.K, &(handle->ifmblock), &(handle->ofmblock), &(handle->fm_lp_block), LIBXSMM_DNN_DATATYPE_F32, LIBXSMM_DNN_DATATYPE_F32 ); } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.K, &(handle->ifmblock), &(handle->ofmblock), &(handle->fm_lp_block), handle->desc.datatype_in, handle->desc.datatype_out ); } else { /* should not happen, not implemented */ } } else if ( (handle->desc.C % 64 == 0) && (handle->desc.K == 1000) ) { /* @TODO this a hack for the last FC layer */ handle->ifmblock = 64; handle->fm_lp_block = 1; handle->ofmblock = 10; } else if ( (handle->desc.C % 16 == 0) && (handle->desc.K == 1000) ) { /* @TODO this a hack for the last FC layer */ handle->ifmblock = 16; handle->fm_lp_block = 1; handle->ofmblock = 10; } else { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; free( handle ); return 0; } /* compute the outer blocks */ handle->blocksifm = handle->desc.C / handle->ifmblock; handle->blocksofm = handle->desc.K / handle->ofmblock; } /* create barrier */ handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); /* calculate scratch size */ if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { handle->scratch_size = sizeof(float) * ( ( (size_t)handle->desc.C * (size_t)handle->desc.N ) + ( (size_t)handle->desc.C * 
(size_t)handle->desc.K ) ); } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { /* Let's allocate maximum required scratch */ size_t size_fwd = sizeof(float) * handle->desc.K * handle->desc.N; /* In case of K = 1 we pad A and B to "bk=2" */ size_t size_bwd = (handle->desc.K != 1) ? ( sizeof(float) * handle->desc.C * handle->desc.N + sizeof(libxsmm_bfloat16) * handle->desc.C * handle->desc.K ) : ( sizeof(float) * handle->desc.C * handle->desc.N + sizeof(libxsmm_bfloat16) * handle->desc.C * 2 + sizeof(libxsmm_bfloat16) * 2 * handle->desc.N ); size_t size_upd = sizeof(float) * handle->desc.C * handle->desc.K + sizeof(libxsmm_bfloat16) * handle->desc.threads * handle->bk * handle->bc + sizeof(libxsmm_bfloat16) * (handle->desc.N * (handle->desc.C + handle->desc.K)); handle->scratch_size = LIBXSMM_MAX(LIBXSMM_MAX(size_fwd, size_bwd), size_upd); handle->doutput_scratch_mark = handle->scratch_size; handle->scratch_size += 2 * sizeof(libxsmm_bfloat16) * handle->desc.N * handle->desc.K; } else { handle->scratch_size = sizeof(float) * ( (((size_t)handle->desc.C + (size_t)handle->desc.K) * (size_t)handle->desc.N) + ((size_t)handle->desc.C * (size_t)handle->desc.K) ); } /* create code pointers in some special cases */ if ( ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) && ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) > 0) ) { if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { float alpha = 1.0f; /* beta is set to 1 for ncnc kcck format because ifm is split into 2 blocks */ float beta = 1.0f; float zerobeta = 0.0f; int updflags = LIBXSMM_GEMM_FLAGS( 'N', 'T' ); /* For UPD kernels we consider subtasking... 
*/ libxsmm_blasint M = handle->bk/handle->ofm_subtasks; libxsmm_blasint N = handle->bc/handle->ifm_subtasks; libxsmm_blasint lda = (libxsmm_blasint)handle->bk; libxsmm_blasint ldb = (libxsmm_blasint)handle->bc; libxsmm_blasint ldc = (libxsmm_blasint)handle->bk; handle->gemm_fwd.xgemm.smrs = libxsmm_smmdispatch_reducebatch_strd(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(float), handle->bc*handle->bn*sizeof(float), &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL); handle->gemm_fwd2.xgemm.smrs = libxsmm_smmdispatch_reducebatch_strd(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(float), handle->bc*handle->bn*sizeof(float), &lda, &ldb, &ldc, &alpha, &zerobeta, NULL, NULL); handle->gemm_bwd.xgemm.smrs = libxsmm_smmdispatch_reducebatch_strd(handle->bc, handle->bn, handle->bk, handle->bk*handle->bc*sizeof(float), handle->bk*handle->bn*sizeof(float), &ldb, &lda, &ldb, &alpha, &beta, NULL, NULL); handle->gemm_bwd2.xgemm.smrs = libxsmm_smmdispatch_reducebatch_strd(handle->bc, handle->bn, handle->bk, handle->bk*handle->bc*sizeof(float), handle->bk*handle->bn*sizeof(float), &ldb, &lda, &ldb, &alpha, &zerobeta, NULL, NULL); /* Transpose kernel used for weight transpose in bwd pass */ tr_desc = libxsmm_trans_descriptor_init(&blob, sizeof(float), handle->bk, handle->bc, handle->bc); handle->tr_kernel = libxsmm_dispatch_trans(tr_desc); /* update has different LDs */ lda = (libxsmm_blasint)handle->bk; ldb = (libxsmm_blasint)handle->bc; ldc = (libxsmm_blasint)handle->bk; handle->gemm_upd.xgemm.smrs = libxsmm_smmdispatch_reducebatch_strd(M, N, handle->bn, handle->desc.K*handle->bn*sizeof(float), handle->desc.C*handle->bn*sizeof(float), &lda, &ldb, &ldc, &alpha, &beta, &updflags, NULL); handle->gemm_upd2.xgemm.smrs = libxsmm_smmdispatch_reducebatch_strd(M, N, handle->bn, handle->desc.K*handle->bn*sizeof(float), handle->desc.C*handle->bn*sizeof(float), &lda, &ldb, &ldc, &alpha, &zerobeta, &updflags, NULL); } else if ( (handle->desc.datatype_in == 
LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { float alpha = 1.0f; float beta = 1.0f; float zerobeta = 0.0f; /* For UPD kernels we consider subtasking... */ libxsmm_blasint M = handle->bk/handle->ofm_subtasks; libxsmm_blasint N = handle->bc/handle->ifm_subtasks; libxsmm_blasint lda = (libxsmm_blasint)handle->bk; libxsmm_blasint ldb = (libxsmm_blasint)handle->bc; libxsmm_blasint ldc = (libxsmm_blasint)handle->bk; handle->gemm_fwd.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL); handle->gemm_fwd2.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), &lda, &ldb, &ldc, &alpha, &zerobeta, NULL, NULL); handle->gemm_fwd3.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL); /* Special bwd kernels for K == 1 */ if (handle->desc.K == 1) { libxsmm_blasint _bk = 2; handle->gemm_bwd.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd(handle->bc, handle->bn, _bk, _bk*handle->bc*sizeof(libxsmm_bfloat16), _bk*handle->bn*sizeof(libxsmm_bfloat16), &ldb, &_bk, &ldb, &alpha, &beta, NULL, NULL); handle->gemm_bwd2.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd(handle->bc, handle->bn, _bk, _bk*handle->bc*sizeof(libxsmm_bfloat16), _bk*handle->bn*sizeof(libxsmm_bfloat16), &ldb, &_bk, &ldb, &alpha, &zerobeta, NULL, NULL); } else { handle->gemm_bwd.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd(handle->bc, handle->bn, handle->bk, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bk*handle->bn*sizeof(libxsmm_bfloat16), &ldb, &lda, &ldb, &alpha, &beta, NULL, 
NULL); handle->gemm_bwd2.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd(handle->bc, handle->bn, handle->bk, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bk*handle->bn*sizeof(libxsmm_bfloat16), &ldb, &lda, &ldb, &alpha, &zerobeta, NULL, NULL); } lda = (libxsmm_blasint)handle->bk; ldb = (libxsmm_blasint)handle->bn; ldc = (libxsmm_blasint)handle->bk; handle->gemm_upd.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd(M, N, handle->bn, handle->bk*handle->bn*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL); handle->gemm_upd2.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd(M, N, handle->bn, handle->bk*handle->bn*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), &lda, &ldb, &ldc, &alpha, &zerobeta, NULL, NULL); } else { } } } else { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; } } else { *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } return handle; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fullyconnected(const libxsmm_dnn_fullyconnected* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { /* Deallocate barrier */ if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } /* deallocate handle structure */ free(/*remove constness*/(libxsmm_dnn_fullyconnected*)handle); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fullyconnected_create_tensor_datalayout(const libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor_datalayout* layout; *status = LIBXSMM_DNN_SUCCESS; layout = 0; if (handle != 0) { layout = (libxsmm_dnn_tensor_datalayout*) malloc(sizeof(libxsmm_dnn_tensor_datalayout)); if (layout != 0) { memset(layout, 0, sizeof(libxsmm_dnn_tensor_datalayout)); if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == 
LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->format = handle->desc.buffer_format; if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = 1; layout->dim_size[2] = 1; layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = 1; layout->dim_size[2] = 1; layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else { /* coverity[dead_error_begin] */ free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || 
(type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = 1; layout->dim_size[2] = 1; layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.N; } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->datatype = handle->desc.datatype_out; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = 1; layout->dim_size[2] = 1; layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* 
make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 4; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { layout->dim_size[0] = handle->desc.C; layout->dim_size[1] = 1; layout->dim_size[2] = 1; layout->dim_size[3] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->desc.K; layout->dim_size[1] = 1; layout->dim_size[2] = 1; layout->dim_size[3] = handle->desc.N; } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->desc.buffer_format & 
LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 4; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = (unsigned int)handle->bc; layout->dim_size[1] = (unsigned int)handle->bn; layout->dim_size[2] = (unsigned int)(handle->desc.C / handle->bc); layout->dim_size[3] = (unsigned int)(handle->desc.N / handle->bn); } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = (unsigned int)handle->bk; layout->dim_size[1] = (unsigned int)handle->bn; layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[3] = (unsigned int)(handle->desc.N / handle->bn); } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is 
returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_GRADIENT_FILTER) || (type == LIBXSMM_DNN_FILTER) ) { layout->format = handle->desc.filter_format; layout->tensor_type = LIBXSMM_DNN_FILTER; if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(6*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(6*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 6; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = handle->ifmblock; layout->dim_size[2] = 1; layout->dim_size[3] = 1; layout->dim_size[4] = handle->blocksifm; layout->dim_size[5] = handle->blocksofm; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else if ( ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) || ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(7*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) 
malloc(7*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 7; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[6] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = handle->fm_lp_block; layout->dim_size[1] = handle->ofmblock; layout->dim_size[2] = handle->ifmblock/handle->fm_lp_block; layout->dim_size[3] = 1; layout->dim_size[4] = 1; layout->dim_size[5] = handle->blocksifm; layout->dim_size[6] = handle->blocksofm; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_RSCK) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 4; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; layout->dim_size[0] = handle->ofmblock * 
handle->blocksofm; layout->dim_size[1] = handle->ifmblock * handle->blocksifm; layout->dim_size[2] = 1; layout->dim_size[3] = 1; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 4; if ( (type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_GRADIENT_FILTER) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = (unsigned int)handle->bk; layout->dim_size[1] = (unsigned int)handle->bc; layout->dim_size[2] = (unsigned int)(handle->desc.C / handle->bc); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = 
(unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 5; if ( (type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_GRADIENT_FILTER) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = (unsigned int)2; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned int)handle->bc/2; layout->dim_size[3] = (unsigned int)(handle->desc.C / handle->bc); layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) || (type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) || (type == LIBXSMM_DNN_CHANNEL_BIAS) ) { layout->format = handle->desc.buffer_format; layout->tensor_type = LIBXSMM_DNN_CHANNEL_SCALAR; if ( ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) ) { if ( (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) || (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { layout->datatype = handle->desc.datatype_out; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != 
layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 2; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_size[0] = (unsigned int)handle->bk; layout->dim_size[1] = (unsigned int)(handle->desc.K / handle->bk); } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else if ( (type == LIBXSMM_DNN_RELU_MASK) ) { layout->format = handle->desc.buffer_format; layout->tensor_type = LIBXSMM_DNN_RELU_MASK; if ( ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_I8; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(1*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(1*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 1; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; layout->dim_size[0] = handle->desc.N * handle->desc.K; } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return layout; } LIBXSMM_API size_t libxsmm_dnn_fullyconnected_get_scratch_size(const libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_err_t* status) { size_t l_scratch_size = 0; *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { l_scratch_size = handle->scratch_size + 64; /* 64 byte extra in case the 
user code does not care about alignment */ } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return l_scratch_size; } LIBXSMM_API void* libxsmm_dnn_fullyconnected_get_scratch_ptr(const libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_err_t* status) { *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { return handle->scratch; } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return 0; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_bind_scratch(libxsmm_dnn_fullyconnected* handle, const void* scratch) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; uintptr_t address = (uintptr_t)scratch; size_t offset = 0; if (scratch == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } if (0 != handle) { /* align the internal scratch buffer if needed */ if (address % 64 == 0) { handle->scratch = (void*)address; } else { offset = (64 - address % 64); handle->scratch = (void*)(address+offset); } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_release_scratch(libxsmm_dnn_fullyconnected* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { handle->scratch = 0; } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_bind_tensor(libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) && (type != LIBXSMM_DNN_RELU_MASK) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0 && tensor != 0) { 
libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout(handle, type, &status); if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { handle->reg_input = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { handle->grad_input = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { handle->reg_output = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { handle->grad_output = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { handle->reg_filter = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { handle->grad_filter = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) { handle->reg_bias = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) { handle->grad_bias = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RELU_MASK ) { handle->relumask = (libxsmm_dnn_tensor*)tensor; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; } libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fullyconnected_get_tensor(libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor* return_tensor = 0; *status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) && (type != LIBXSMM_DNN_RELU_MASK) ) { *status = 
LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return return_tensor; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { return_tensor = handle->reg_input; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { return_tensor = handle->grad_input; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { return_tensor = handle->reg_output; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { return_tensor = handle->grad_output; } else if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { return_tensor = handle->reg_filter; } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { return_tensor = handle->grad_filter; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) { return_tensor = handle->reg_bias; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) { return_tensor = handle->grad_bias; } else if ( type == LIBXSMM_DNN_RELU_MASK ) { return_tensor = handle->relumask; } else { /* cannot happen */ } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return return_tensor; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_release_tensor(libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) && (type != LIBXSMM_DNN_RELU_MASK) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { handle->reg_input = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { handle->grad_input = 0; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { handle->reg_output = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { handle->grad_output = 0; } else if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { 
handle->reg_filter = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { handle->grad_filter = 0; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) { handle->reg_bias = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) { handle->grad_bias = 0; } else if ( type == LIBXSMM_DNN_RELU_MASK ) { handle->relumask = 0; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_execute_st(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; LIBXSMM_UNUSED( start_thread ); LIBXSMM_UNUSED( tid ); if (0 != handle) { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) ) { status = libxsmm_dnn_fullyconnected_st_fwd_custom( handle, start_thread, tid ); } else if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FC; } } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: { if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) ) { status = libxsmm_dnn_fullyconnected_st_bwdupd_custom( handle, kind, start_thread, tid ); } else if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck( handle, kind, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FC; } } break; default: { status = 
LIBXSMM_DNN_ERR_INVALID_KIND; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } libxsmm-1.17/src/libxsmm_dnn_fullyconnected_backward_weight_update.c000066400000000000000000001521461415223013700262360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas (Intel Corp.) ******************************************************************************/ #include "libxsmm_dnn_fullyconnected_backward_weight_update.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_custom_f32_f32(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_f32_f32(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_custom_bf16_f32(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_emu(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) void bf16_vnni_transpose_16x16(void* source_void, void* dest_void, int source_stride, int 
dest_stride) { #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) libxsmm_bfloat16 *source = (libxsmm_bfloat16*)source_void; libxsmm_bfloat16 *dest = (libxsmm_bfloat16*)dest_void; __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7; __m512i tmp0, tmp1, tmp2, tmp3; const __m512i abcdefgh_to_abefcdgh = _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100); zmm0 = _mm512_loadu_si512(source); zmm1 = _mm512_loadu_si512(source + source_stride); zmm2 = _mm512_loadu_si512(source + source_stride*2); zmm3 = _mm512_loadu_si512(source + source_stride*3); zmm4 = _mm512_loadu_si512(source + source_stride*4); zmm5 = _mm512_loadu_si512(source + source_stride*5); zmm6 = _mm512_loadu_si512(source + source_stride*6); zmm7 = _mm512_loadu_si512(source + source_stride*7); zmm0 = _mm512_shuffle_epi8(zmm0, abcdefgh_to_abefcdgh); zmm1 = _mm512_shuffle_epi8(zmm1, abcdefgh_to_abefcdgh); zmm2 = _mm512_shuffle_epi8(zmm2, abcdefgh_to_abefcdgh); zmm3 = _mm512_shuffle_epi8(zmm3, abcdefgh_to_abefcdgh); zmm4 = _mm512_shuffle_epi8(zmm4, abcdefgh_to_abefcdgh); zmm5 = _mm512_shuffle_epi8(zmm5, abcdefgh_to_abefcdgh); zmm6 = _mm512_shuffle_epi8(zmm6, abcdefgh_to_abefcdgh); zmm7 = _mm512_shuffle_epi8(zmm7, abcdefgh_to_abefcdgh); tmp0 = _mm512_unpacklo_epi64(zmm0, zmm1); tmp1 = _mm512_unpackhi_epi64(zmm0, zmm1); tmp2 = _mm512_unpacklo_epi64(zmm2, zmm3); tmp3 = _mm512_unpackhi_epi64(zmm2, zmm3); zmm0 = _mm512_unpacklo_epi64(zmm4, zmm5); zmm1 = _mm512_unpackhi_epi64(zmm4, zmm5); zmm2 = _mm512_unpacklo_epi64(zmm6, zmm7); zmm3 = _mm512_unpackhi_epi64(zmm6, zmm7); zmm4 = _mm512_shuffle_i32x4(tmp0, tmp2, 0x88); zmm6 = _mm512_shuffle_i32x4(tmp0, tmp2, 0xdd); zmm5 = _mm512_shuffle_i32x4(tmp1, tmp3, 0x88); zmm7 = _mm512_shuffle_i32x4(tmp1, tmp3, 0xdd); tmp0 = _mm512_shuffle_i32x4(zmm0, zmm2, 0x88); tmp1 = _mm512_shuffle_i32x4(zmm0, zmm2, 0xdd); tmp2 = _mm512_shuffle_i32x4(zmm1, zmm3, 0x88); tmp3 = _mm512_shuffle_i32x4(zmm1, zmm3, 0xdd); zmm0 = _mm512_shuffle_i32x4(zmm4, tmp0, 0x88); zmm1 = 
_mm512_shuffle_i32x4(zmm5, tmp2, 0x88); zmm2 = _mm512_shuffle_i32x4(zmm6, tmp1, 0x88); zmm3 = _mm512_shuffle_i32x4(zmm7, tmp3, 0x88); zmm4 = _mm512_shuffle_i32x4(zmm4, tmp0, 0xdd); zmm5 = _mm512_shuffle_i32x4(zmm5, tmp2, 0xdd); zmm6 = _mm512_shuffle_i32x4(zmm6, tmp1, 0xdd); zmm7 = _mm512_shuffle_i32x4(zmm7, tmp3, 0xdd); _mm512_storeu_si512(dest, zmm0); _mm512_storeu_si512(dest + dest_stride, zmm1); _mm512_storeu_si512(dest + dest_stride * 2, zmm2); _mm512_storeu_si512(dest + dest_stride * 3, zmm3); _mm512_storeu_si512(dest + dest_stride * 4, zmm4); _mm512_storeu_si512(dest + dest_stride * 5, zmm5); _mm512_storeu_si512(dest + dest_stride * 6, zmm6); _mm512_storeu_si512(dest + dest_stride * 7, zmm7); #else LIBXSMM_UNUSED(source_void); LIBXSMM_UNUSED(dest_void); LIBXSMM_UNUSED(source_stride); LIBXSMM_UNUSED(dest_stride); #endif } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) void bf16_vnni_transpose(libxsmm_bfloat16* src, libxsmm_bfloat16* dst, int M, int N, int ld_in, int ld_out) { #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) const int _M = M/16, _N = N/16; int i = 0, j = 0; for (i = 0; i < _N; i++) { for (j = 0; j < _M; j++) { bf16_vnni_transpose_16x16((libxsmm_bfloat16*) src+i*16*ld_in+j*32, (libxsmm_bfloat16*) dst+j*16*ld_out+i*32, ld_in*2, ld_out*2); } } #else LIBXSMM_UNUSED(src); LIBXSMM_UNUSED(dst); LIBXSMM_UNUSED(M); LIBXSMM_UNUSED(N); LIBXSMM_UNUSED(ld_in); LIBXSMM_UNUSED(ld_out); #endif } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) void bf16_transpose_32x16(libxsmm_bfloat16 *in, libxsmm_bfloat16 *out, int ld_in, int ld_out) { #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; __m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; const int in_width=ld_in, out_width=ld_out; const __m512i idx_lo = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); const __m512i idx_hi = _mm512_set_epi64(7, 6, 15, 14, 3, 2, 11, 10); r0 = _mm512_loadu_si512(in + 
0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); ra = _mm512_loadu_si512(in + 10*in_width); rb = _mm512_loadu_si512(in + 11*in_width); rc = _mm512_loadu_si512(in + 12*in_width); rd = _mm512_loadu_si512(in + 13*in_width); re = _mm512_loadu_si512(in + 14*in_width); rf = _mm512_loadu_si512(in + 15*in_width); t0 = _mm512_unpacklo_epi16(r0,r1); t1 = _mm512_unpackhi_epi16(r0,r1); t2 = _mm512_unpacklo_epi16(r2,r3); t3 = _mm512_unpackhi_epi16(r2,r3); t4 = _mm512_unpacklo_epi16(r4,r5); t5 = _mm512_unpackhi_epi16(r4,r5); t6 = _mm512_unpacklo_epi16(r6,r7); t7 = _mm512_unpackhi_epi16(r6,r7); t8 = _mm512_unpacklo_epi16(r8,r9); t9 = _mm512_unpackhi_epi16(r8,r9); ta = _mm512_unpacklo_epi16(ra,rb); tb = _mm512_unpackhi_epi16(ra,rb); tc = _mm512_unpacklo_epi16(rc,rd); td = _mm512_unpackhi_epi16(rc,rd); te = _mm512_unpacklo_epi16(re,rf); tf = _mm512_unpackhi_epi16(re,rf); r0 = _mm512_unpacklo_epi32(t0,t2); r1 = _mm512_unpackhi_epi32(t0,t2); r2 = _mm512_unpacklo_epi32(t1,t3); r3 = _mm512_unpackhi_epi32(t1,t3); r4 = _mm512_unpacklo_epi32(t4,t6); r5 = _mm512_unpackhi_epi32(t4,t6); r6 = _mm512_unpacklo_epi32(t5,t7); r7 = _mm512_unpackhi_epi32(t5,t7); r8 = _mm512_unpacklo_epi32(t8,ta); r9 = _mm512_unpackhi_epi32(t8,ta); ra = _mm512_unpacklo_epi32(t9,tb); rb = _mm512_unpackhi_epi32(t9,tb); rc = _mm512_unpacklo_epi32(tc,te); rd = _mm512_unpackhi_epi32(tc,te); re = _mm512_unpacklo_epi32(td,tf); rf = _mm512_unpackhi_epi32(td,tf); t0 = _mm512_unpacklo_epi64(r0,r4); t1 = _mm512_unpackhi_epi64(r0,r4); t2 = _mm512_unpacklo_epi64(r1,r5); t3 = _mm512_unpackhi_epi64(r1,r5); t4 = _mm512_unpacklo_epi64(r2,r6); t5 = _mm512_unpackhi_epi64(r2,r6); t6 = 
_mm512_unpacklo_epi64(r3,r7); t7 = _mm512_unpackhi_epi64(r3,r7); t8 = _mm512_unpacklo_epi64(r8,rc); t9 = _mm512_unpackhi_epi64(r8,rc); ta = _mm512_unpacklo_epi64(r9,rd); tb = _mm512_unpackhi_epi64(r9,rd); tc = _mm512_unpacklo_epi64(ra,re); td = _mm512_unpackhi_epi64(ra,re); te = _mm512_unpacklo_epi64(rb,rf); tf = _mm512_unpackhi_epi64(rb,rf); r0 = _mm512_shuffle_i32x4(t0, t1, 0x88); r1 = _mm512_shuffle_i32x4(t2, t3, 0x88); r2 = _mm512_shuffle_i32x4(t4, t5, 0x88); r3 = _mm512_shuffle_i32x4(t6, t7, 0x88); r4 = _mm512_shuffle_i32x4(t0, t1, 0xdd); r5 = _mm512_shuffle_i32x4(t2, t3, 0xdd); r6 = _mm512_shuffle_i32x4(t4, t5, 0xdd); r7 = _mm512_shuffle_i32x4(t6, t7, 0xdd); r8 = _mm512_shuffle_i32x4(t8, t9, 0x88); r9 = _mm512_shuffle_i32x4(ta, tb, 0x88); ra = _mm512_shuffle_i32x4(tc, td, 0x88); rb = _mm512_shuffle_i32x4(te, tf, 0x88); rc = _mm512_shuffle_i32x4(t8, t9, 0xdd); rd = _mm512_shuffle_i32x4(ta, tb, 0xdd); re = _mm512_shuffle_i32x4(tc, td, 0xdd); rf = _mm512_shuffle_i32x4(te, tf, 0xdd); t0 = _mm512_permutex2var_epi64(r0, idx_lo, r8); t1 = _mm512_permutex2var_epi64(r1, idx_lo, r9); t2 = _mm512_permutex2var_epi64(r2, idx_lo, ra); t3 = _mm512_permutex2var_epi64(r3, idx_lo, rb); t4 = _mm512_permutex2var_epi64(r4, idx_lo, rc); t5 = _mm512_permutex2var_epi64(r5, idx_lo, rd); t6 = _mm512_permutex2var_epi64(r6, idx_lo, re); t7 = _mm512_permutex2var_epi64(r7, idx_lo, rf); t8 = _mm512_permutex2var_epi64(r8, idx_hi, r0); t9 = _mm512_permutex2var_epi64(r9, idx_hi, r1); ta = _mm512_permutex2var_epi64(ra, idx_hi, r2); tb = _mm512_permutex2var_epi64(rb, idx_hi, r3); tc = _mm512_permutex2var_epi64(rc, idx_hi, r4); td = _mm512_permutex2var_epi64(rd, idx_hi, r5); te = _mm512_permutex2var_epi64(re, idx_hi, r6); tf = _mm512_permutex2var_epi64(rf, idx_hi, r7); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 0*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 1*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 1)); 
LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 2*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 3*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 4*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 5*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 6*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 7*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 8*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 9*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 10*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 11*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 12*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 13*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 14*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 15*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 16*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 17*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 18*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 19*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 1)); 
LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 20*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 21*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 22*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 23*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 24*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 25*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 26*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 27*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 28*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 29*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 1)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 30*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 0)); LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 31*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 1)); #else LIBXSMM_UNUSED(in); LIBXSMM_UNUSED(out); LIBXSMM_UNUSED(ld_in); LIBXSMM_UNUSED(ld_out); #endif } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) void bf16_transpose_32xcols(libxsmm_bfloat16 *in, libxsmm_bfloat16 *out, int col, int ld_in, int ld_out) { #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) __m512i r0 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r1 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r2 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r3 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r4 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r7 = 
LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), rf = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); __m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; const int in_width=ld_in, out_width=ld_out; const __m512i idx_lo = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); const __m512i idx_hi = _mm512_set_epi64(7, 6, 15, 14, 3, 2, 11, 10); __mmask16 store_mask = LIBXSMM_INTRINSICS_MM512_CVTU32_MASK16(((unsigned int)1 << col) - 1); if (col == 15) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); ra = _mm512_loadu_si512(in + 10*in_width); rb = _mm512_loadu_si512(in + 11*in_width); rc = _mm512_loadu_si512(in + 12*in_width); rd = _mm512_loadu_si512(in + 13*in_width); re = _mm512_loadu_si512(in + 14*in_width); } else if (col == 14) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); ra = _mm512_loadu_si512(in + 10*in_width); rb = _mm512_loadu_si512(in + 11*in_width); rc = _mm512_loadu_si512(in + 
12*in_width); rd = _mm512_loadu_si512(in + 13*in_width); } else if (col == 13) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); ra = _mm512_loadu_si512(in + 10*in_width); rb = _mm512_loadu_si512(in + 11*in_width); rc = _mm512_loadu_si512(in + 12*in_width); } else if (col == 12) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); ra = _mm512_loadu_si512(in + 10*in_width); rb = _mm512_loadu_si512(in + 11*in_width); } else if (col == 11) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); ra = _mm512_loadu_si512(in + 10*in_width); } else if (col == 10) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 
7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); r9 = _mm512_loadu_si512(in + 9*in_width); } else if (col == 9) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); r8 = _mm512_loadu_si512(in + 8*in_width); } else if (col == 8) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); r7 = _mm512_loadu_si512(in + 7*in_width); } else if (col == 7) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); r6 = _mm512_loadu_si512(in + 6*in_width); } else if (col == 6) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); r5 = _mm512_loadu_si512(in + 5*in_width); } else if (col == 5) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); r4 = _mm512_loadu_si512(in + 4*in_width); } else if (col == 4) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = _mm512_loadu_si512(in + 2*in_width); r3 = _mm512_loadu_si512(in + 3*in_width); } else if (col == 3) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); r2 = 
_mm512_loadu_si512(in + 2*in_width); } else if (col == 2) { r0 = _mm512_loadu_si512(in + 0*in_width); r1 = _mm512_loadu_si512(in + 1*in_width); } else if (col == 1) { r0 = _mm512_loadu_si512(in + 0*in_width); } t0 = _mm512_unpacklo_epi16(r0,r1); t1 = _mm512_unpackhi_epi16(r0,r1); t2 = _mm512_unpacklo_epi16(r2,r3); t3 = _mm512_unpackhi_epi16(r2,r3); t4 = _mm512_unpacklo_epi16(r4,r5); t5 = _mm512_unpackhi_epi16(r4,r5); t6 = _mm512_unpacklo_epi16(r6,r7); t7 = _mm512_unpackhi_epi16(r6,r7); t8 = _mm512_unpacklo_epi16(r8,r9); t9 = _mm512_unpackhi_epi16(r8,r9); ta = _mm512_unpacklo_epi16(ra,rb); tb = _mm512_unpackhi_epi16(ra,rb); tc = _mm512_unpacklo_epi16(rc,rd); td = _mm512_unpackhi_epi16(rc,rd); te = _mm512_unpacklo_epi16(re,rf); tf = _mm512_unpackhi_epi16(re,rf); r0 = _mm512_unpacklo_epi32(t0,t2); r1 = _mm512_unpackhi_epi32(t0,t2); r2 = _mm512_unpacklo_epi32(t1,t3); r3 = _mm512_unpackhi_epi32(t1,t3); r4 = _mm512_unpacklo_epi32(t4,t6); r5 = _mm512_unpackhi_epi32(t4,t6); r6 = _mm512_unpacklo_epi32(t5,t7); r7 = _mm512_unpackhi_epi32(t5,t7); r8 = _mm512_unpacklo_epi32(t8,ta); r9 = _mm512_unpackhi_epi32(t8,ta); ra = _mm512_unpacklo_epi32(t9,tb); rb = _mm512_unpackhi_epi32(t9,tb); rc = _mm512_unpacklo_epi32(tc,te); rd = _mm512_unpackhi_epi32(tc,te); re = _mm512_unpacklo_epi32(td,tf); rf = _mm512_unpackhi_epi32(td,tf); t0 = _mm512_unpacklo_epi64(r0,r4); t1 = _mm512_unpackhi_epi64(r0,r4); t2 = _mm512_unpacklo_epi64(r1,r5); t3 = _mm512_unpackhi_epi64(r1,r5); t4 = _mm512_unpacklo_epi64(r2,r6); t5 = _mm512_unpackhi_epi64(r2,r6); t6 = _mm512_unpacklo_epi64(r3,r7); t7 = _mm512_unpackhi_epi64(r3,r7); t8 = _mm512_unpacklo_epi64(r8,rc); t9 = _mm512_unpackhi_epi64(r8,rc); ta = _mm512_unpacklo_epi64(r9,rd); tb = _mm512_unpackhi_epi64(r9,rd); tc = _mm512_unpacklo_epi64(ra,re); td = _mm512_unpackhi_epi64(ra,re); te = _mm512_unpacklo_epi64(rb,rf); tf = _mm512_unpackhi_epi64(rb,rf); r0 = _mm512_shuffle_i32x4(t0, t1, 0x88); r1 = _mm512_shuffle_i32x4(t2, t3, 0x88); r2 = 
_mm512_shuffle_i32x4(t4, t5, 0x88); r3 = _mm512_shuffle_i32x4(t6, t7, 0x88); r4 = _mm512_shuffle_i32x4(t0, t1, 0xdd); r5 = _mm512_shuffle_i32x4(t2, t3, 0xdd); r6 = _mm512_shuffle_i32x4(t4, t5, 0xdd); r7 = _mm512_shuffle_i32x4(t6, t7, 0xdd); r8 = _mm512_shuffle_i32x4(t8, t9, 0x88); r9 = _mm512_shuffle_i32x4(ta, tb, 0x88); ra = _mm512_shuffle_i32x4(tc, td, 0x88); rb = _mm512_shuffle_i32x4(te, tf, 0x88); rc = _mm512_shuffle_i32x4(t8, t9, 0xdd); rd = _mm512_shuffle_i32x4(ta, tb, 0xdd); re = _mm512_shuffle_i32x4(tc, td, 0xdd); rf = _mm512_shuffle_i32x4(te, tf, 0xdd); t0 = _mm512_permutex2var_epi64(r0, idx_lo, r8); t1 = _mm512_permutex2var_epi64(r1, idx_lo, r9); t2 = _mm512_permutex2var_epi64(r2, idx_lo, ra); t3 = _mm512_permutex2var_epi64(r3, idx_lo, rb); t4 = _mm512_permutex2var_epi64(r4, idx_lo, rc); t5 = _mm512_permutex2var_epi64(r5, idx_lo, rd); t6 = _mm512_permutex2var_epi64(r6, idx_lo, re); t7 = _mm512_permutex2var_epi64(r7, idx_lo, rf); t8 = _mm512_permutex2var_epi64(r8, idx_hi, r0); t9 = _mm512_permutex2var_epi64(r9, idx_hi, r1); ta = _mm512_permutex2var_epi64(ra, idx_hi, r2); tb = _mm512_permutex2var_epi64(rb, idx_hi, r3); tc = _mm512_permutex2var_epi64(rc, idx_hi, r4); td = _mm512_permutex2var_epi64(rd, idx_hi, r5); te = _mm512_permutex2var_epi64(re, idx_hi, r6); tf = _mm512_permutex2var_epi64(rf, idx_hi, r7); _mm256_mask_storeu_epi16(out + 0*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 0)); _mm256_mask_storeu_epi16(out + 1*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 1)); _mm256_mask_storeu_epi16(out + 2*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 0)); _mm256_mask_storeu_epi16(out + 3*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 1)); _mm256_mask_storeu_epi16(out + 4*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 0)); _mm256_mask_storeu_epi16(out + 5*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 1)); 
_mm256_mask_storeu_epi16(out + 6*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 0)); _mm256_mask_storeu_epi16(out + 7*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 1)); _mm256_mask_storeu_epi16(out + 8*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 0)); _mm256_mask_storeu_epi16(out + 9*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 1)); _mm256_mask_storeu_epi16(out + 10*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 0)); _mm256_mask_storeu_epi16(out + 11*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 1)); _mm256_mask_storeu_epi16(out + 12*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 0)); _mm256_mask_storeu_epi16(out + 13*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 1)); _mm256_mask_storeu_epi16(out + 14*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 0)); _mm256_mask_storeu_epi16(out + 15*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 1)); _mm256_mask_storeu_epi16(out + 16*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 0)); _mm256_mask_storeu_epi16(out + 17*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 1)); _mm256_mask_storeu_epi16(out + 18*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 0)); _mm256_mask_storeu_epi16(out + 19*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 1)); _mm256_mask_storeu_epi16(out + 20*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 0)); _mm256_mask_storeu_epi16(out + 21*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 1)); _mm256_mask_storeu_epi16(out + 22*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 0)); _mm256_mask_storeu_epi16(out + 23*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 1)); 
_mm256_mask_storeu_epi16(out + 24*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 0)); _mm256_mask_storeu_epi16(out + 25*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 1)); _mm256_mask_storeu_epi16(out + 26*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 0)); _mm256_mask_storeu_epi16(out + 27*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 1)); _mm256_mask_storeu_epi16(out + 28*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 0)); _mm256_mask_storeu_epi16(out + 29*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 1)); _mm256_mask_storeu_epi16(out + 30*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 0)); _mm256_mask_storeu_epi16(out + 31*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 1)); #else LIBXSMM_UNUSED(in); LIBXSMM_UNUSED(out); LIBXSMM_UNUSED(ld_in); LIBXSMM_UNUSED(ld_out); LIBXSMM_UNUSED(col); #endif } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) void bf16_transpose(libxsmm_bfloat16 *in, libxsmm_bfloat16 *out, int M, int N, int ld_in, int ld_out){ #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) int i, j; int full16_chunks = N/16; int remainder_cols = N%16; int _N = N - remainder_cols; if (full16_chunks) { for (i=0; iifmblock; libxsmm_blasint ldb_bwd = (libxsmm_blasint)handle->desc.K; libxsmm_blasint ldc_bwd = (libxsmm_blasint)handle->desc.C; libxsmm_blasint lda_upd = (libxsmm_blasint)handle->desc.K; libxsmm_blasint ldb_upd = (libxsmm_blasint)handle->desc.N; libxsmm_blasint ldc_upd = (libxsmm_blasint)handle->ofmblock; element_input_type alpha = (element_input_type)1; element_input_type beta = (element_input_type)0; if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { typedef libxsmm_smmfunction gemm_function; gemm_function gemm_kernel_bwd = libxsmm_smmdispatch(handle->ifmblock, handle->desc.N, handle->desc.K, &lda_bwd, &ldb_bwd, &ldc_bwd, &alpha, 
&beta, NULL, NULL); gemm_function gemm_kernel_upd = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->desc.N, &lda_upd, &ldb_upd, &ldc_upd, &alpha, &beta, NULL, NULL); # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c" } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_custom_bf16_f32(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef float element_output_type; typedef libxsmm_bfloat16 element_filter_type; typedef libxsmm_smmfunction gemm_function; libxsmm_blasint lda_bwd = (libxsmm_blasint)handle->ifmblock; libxsmm_blasint ldb_bwd = (libxsmm_blasint)handle->desc.K; libxsmm_blasint ldc_bwd = (libxsmm_blasint)handle->desc.C; libxsmm_blasint lda_upd = (libxsmm_blasint)handle->desc.K; libxsmm_blasint ldb_upd = (libxsmm_blasint)handle->desc.N; libxsmm_blasint ldc_upd = (libxsmm_blasint)handle->ofmblock; float alpha = (element_input_type)1; float beta = (element_input_type)0; if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { gemm_function gemm_kernel_bwd = libxsmm_smmdispatch(handle->ifmblock, handle->desc.N, handle->desc.K, &lda_bwd, &ldb_bwd, &ldc_bwd, &alpha, &beta, NULL, NULL); gemm_function gemm_kernel_upd = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->desc.N, &lda_upd, &ldb_upd, &ldc_upd, &alpha, &beta, NULL, NULL); # define LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32 # define LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32 # include 
"template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32 # undef LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32 } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_f32_f32(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_bwd = handle->gemm_bwd.xgemm.smrs; libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_bwd_zerobeta = handle->gemm_bwd2.xgemm.smrs; libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_upd = handle->gemm_upd.xgemm.smrs; libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_upd_zerobeta = handle->gemm_upd2.xgemm.smrs; #define LIBXSMM_DNN_FC_BWD_USE_AVX512 if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { #define LIBXSMM_DNN_FC_BWD_FUSE_BIAS # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { #define LIBXSMM_DNN_FC_BWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_RELU } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { #define 
LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { #define LIBXSMM_DNN_FC_BWD_FUSE_BIAS #define LIBXSMM_DNN_FC_BWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_RELU #undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { #define LIBXSMM_DNN_FC_BWD_FUSE_BIAS #define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID #undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } #undef LIBXSMM_DNN_FC_BWD_USE_AVX512 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_emu(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel_bwd = handle->gemm_bwd.xgemm.bsmrs; libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_bwd_zerobeta = handle->gemm_bwd2.xgemm.bmrs; libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel_upd = handle->gemm_upd.xgemm.bsmrs; libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_upd_zerobeta = handle->gemm_upd2.xgemm.bmrs; /* some portable macrros fof 
BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { #define LIBXSMM_DNN_FC_BWD_FUSE_BIAS # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { #define LIBXSMM_DNN_FC_BWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_RELU } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { #define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { #define LIBXSMM_DNN_FC_BWD_FUSE_BIAS #define LIBXSMM_DNN_FC_BWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_RELU #undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { #define LIBXSMM_DNN_FC_BWD_FUSE_BIAS #define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID #undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) LIBXSMM_API_INTERN 
LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel_bwd = handle->gemm_bwd.xgemm.bsmrs; libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_bwd_zerobeta = handle->gemm_bwd2.xgemm.bmrs; libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel_upd = handle->gemm_upd.xgemm.bsmrs; libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_upd_zerobeta = handle->gemm_upd2.xgemm.bmrs; #define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { #define LIBXSMM_DNN_FC_BWD_FUSE_BIAS # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { #define LIBXSMM_DNN_FC_BWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_RELU } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { #define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { 
#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS #define LIBXSMM_DNN_FC_BWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_RELU #undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { #define LIBXSMM_DNN_FC_BWD_FUSE_BIAS #define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID #undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } #else LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { return libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_emu( handle, kind, start_thread, tid ); } #endif LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_custom(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if all required tensors are bound */ if ( kind == LIBXSMM_DNN_COMPUTE_KIND_BWD ) { if (handle->grad_input == 0 || handle->grad_output == 0 || handle->reg_filter == 0 || handle->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } else if ( kind == LIBXSMM_DNN_COMPUTE_KIND_UPD ) { if (handle->reg_input == 0 || handle->grad_output == 0 || handle->grad_filter == 0 || handle->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } else { if 
(handle->grad_input == 0 || handle->grad_output == 0 || handle->reg_input == 0 || handle->grad_filter == 0 || handle->reg_filter == 0 || handle->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } /* check if we are on an AVX512 platform */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fullyconnected_st_bwdupd_custom_f32_f32( handle, kind, start_thread, tid); } #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__*/ else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fullyconnected_st_bwdupd_custom_bf16_f32( handle, kind, start_thread, tid); } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; libxsmm_blasint lda_bwd = (libxsmm_blasint)handle->ifmblock; libxsmm_blasint ldb_bwd = (libxsmm_blasint)handle->desc.K; libxsmm_blasint ldc_bwd = (libxsmm_blasint)handle->desc.C; libxsmm_blasint lda_upd = (libxsmm_blasint)handle->desc.K; libxsmm_blasint ldb_upd = (libxsmm_blasint)handle->desc.N; libxsmm_blasint ldc_upd = (libxsmm_blasint)handle->ofmblock; element_input_type alpha = (element_input_type)1; element_input_type beta = (element_input_type)0; if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { gemm_function gemm_kernel_bwd = libxsmm_smmdispatch(handle->ifmblock, handle->desc.N, handle->desc.K, &lda_bwd, &ldb_bwd, &ldc_bwd, &alpha, &beta, NULL, NULL); gemm_function gemm_kernel_upd = libxsmm_smmdispatch(handle->ofmblock, 
handle->ifmblock, handle->desc.N, &lda_upd, &ldb_upd, &ldc_upd, &alpha, &beta, NULL, NULL); # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c" } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef libxsmm_bfloat16 element_input_type; typedef float element_output_type; typedef libxsmm_bfloat16 element_filter_type; typedef libxsmm_smmfunction gemm_function; libxsmm_blasint lda_bwd = (libxsmm_blasint)handle->ifmblock; libxsmm_blasint ldb_bwd = (libxsmm_blasint)handle->desc.K; libxsmm_blasint ldc_bwd = (libxsmm_blasint)handle->desc.C; libxsmm_blasint lda_upd = (libxsmm_blasint)handle->desc.K; libxsmm_blasint ldb_upd = (libxsmm_blasint)handle->desc.N; libxsmm_blasint ldc_upd = (libxsmm_blasint)handle->ofmblock; float alpha = (element_input_type)1; float beta = (element_input_type)0; if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { gemm_function gemm_kernel_bwd = libxsmm_smmdispatch(handle->ifmblock, handle->desc.N, handle->desc.K, &lda_bwd, &ldb_bwd, &ldc_bwd, &alpha, &beta, NULL, NULL); gemm_function gemm_kernel_upd = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->desc.N, &lda_upd, &ldb_upd, &ldc_upd, &alpha, &beta, NULL, NULL); # define LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32 # define LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32 # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32 # undef LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32 } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = 
LIBXSMM_DNN_SUCCESS; /* check if all required tensors are bound */ if ( kind == LIBXSMM_DNN_COMPUTE_KIND_BWD ) { if (handle->grad_input == 0 || handle->grad_output == 0 || handle->reg_filter == 0 || handle->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } else if ( kind == LIBXSMM_DNN_COMPUTE_KIND_UPD ) { if (handle->reg_input == 0 || handle->grad_output == 0 || handle->grad_filter == 0 || handle->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } else { if (handle->grad_input == 0 || handle->grad_output == 0 || handle->reg_input == 0 || handle->grad_filter == 0 || handle->reg_filter == 0 || handle->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) != 0) && ( handle->grad_bias == 0 ) ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) != 0) && ( handle->relumask == 0 ) ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on an AVX512 platform */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_f32_f32( handle, kind, start_thread, tid); } #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX) { status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_emu( handle, kind, start_thread, tid); } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 
&& libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16( handle, kind, start_thread, tid); } #elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE ) { status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_emu( handle, kind, start_thread, tid); } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_bwd = handle->gemm_bwd.xgemm.smrs; libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_bwd_zerobeta = handle->gemm_bwd2.xgemm.smrs; libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_upd = handle->gemm_upd.xgemm.smrs; libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_upd_zerobeta = handle->gemm_upd2.xgemm.smrs; if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { #define LIBXSMM_DNN_FC_BWD_FUSE_BIAS # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { #define LIBXSMM_DNN_FC_BWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_RELU } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { #define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID # include 
"template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { #define LIBXSMM_DNN_FC_BWD_FUSE_BIAS #define LIBXSMM_DNN_FC_BWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_RELU #undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { #define LIBXSMM_DNN_FC_BWD_FUSE_BIAS #define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID #undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_nhwc(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; LIBXSMM_UNUSED( handle ); LIBXSMM_UNUSED( kind ); LIBXSMM_UNUSED( start_thread ); LIBXSMM_UNUSED( tid ); return status; } libxsmm-1.17/src/libxsmm_dnn_fullyconnected_backward_weight_update.h000066400000000000000000000027531415223013700262410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_FULLYCONNECTED_BACKWARD_WEIGHT_UPDATE_H #define LIBXSMM_DNN_FULLYCONNECTED_BACKWARD_WEIGHT_UPDATE_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_custom(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_nhwc(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); #endif /* LIBXSMM_DNN_FULLYCONNECTED_BACKWARD_WEIGHT_UPDATE_H */ libxsmm-1.17/src/libxsmm_dnn_fullyconnected_forward.c000066400000000000000000000471361415223013700232150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_fullyconnected_forward.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom_f32_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom_bf16_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_f32_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_emu(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom_f32_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; element_input_type alpha = (element_input_type)1; element_input_type beta = (element_input_type)0; libxsmm_blasint lda = (libxsmm_blasint)handle->ofmblock; libxsmm_blasint ldb = (libxsmm_blasint)handle->desc.C; libxsmm_blasint ldc = (libxsmm_blasint)handle->desc.K; if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->desc.N, handle->desc.C, &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL); # include "template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c" } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } #else /* should not 
happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom_bf16_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef float element_output_type; typedef libxsmm_bfloat16 element_filter_type; typedef libxsmm_smmfunction gemm_function; libxsmm_blasint lda = (libxsmm_blasint)handle->ofmblock; libxsmm_blasint ldb = (libxsmm_blasint)handle->desc.C; libxsmm_blasint ldc = (libxsmm_blasint)handle->desc.K; float alpha = (element_input_type)1; float beta = (element_input_type)0; if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->desc.N, handle->desc.C, &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL); # define LIBXSMM_DNN_FULLYCONNECTED_FWD_BF16_F32 # include "template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FULLYCONNECTED_FWD_BF16_F32 } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_f32_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_beta = handle->gemm_fwd.xgemm.smrs; 
libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_zerobeta = handle->gemm_fwd2.xgemm.smrs; #define LIBXSMM_DNN_FC_FWD_USE_AVX512 if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { #define LIBXSMM_DNN_FC_FWD_FUSE_NONE # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_NONE } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { #define LIBXSMM_DNN_FC_FWD_FUSE_BIAS # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { #define LIBXSMM_DNN_FC_FWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_RELU } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { #define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { #define LIBXSMM_DNN_FC_FWD_FUSE_BIAS #define LIBXSMM_DNN_FC_FWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_RELU #undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { #define LIBXSMM_DNN_FC_FWD_FUSE_BIAS #define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID #undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } #undef LIBXSMM_DNN_FC_FWD_USE_AVX512 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN 
LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_emu(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel = handle->gemm_fwd.xgemm.bsmrs; libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_zerobeta = handle->gemm_fwd2.xgemm.bmrs; libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_beta = handle->gemm_fwd3.xgemm.bmrs; /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { #define LIBXSMM_DNN_FC_FWD_FUSE_NONE # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_NONE } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { #define LIBXSMM_DNN_FC_FWD_FUSE_BIAS # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { #define LIBXSMM_DNN_FC_FWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_RELU } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { #define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { #define LIBXSMM_DNN_FC_FWD_FUSE_BIAS #define LIBXSMM_DNN_FC_FWD_FUSE_RELU # include 
"template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_RELU #undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { #define LIBXSMM_DNN_FC_FWD_FUSE_BIAS #define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID #undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel = handle->gemm_fwd.xgemm.bsmrs; libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_zerobeta = handle->gemm_fwd2.xgemm.bmrs; libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_beta = handle->gemm_fwd3.xgemm.bmrs; #define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { #define LIBXSMM_DNN_FC_FWD_FUSE_NONE # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_NONE } else if ( handle->desc.fuse_ops == 
LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { #define LIBXSMM_DNN_FC_FWD_FUSE_BIAS # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { #define LIBXSMM_DNN_FC_FWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_RELU } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { #define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { #define LIBXSMM_DNN_FC_FWD_FUSE_BIAS #define LIBXSMM_DNN_FC_FWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_RELU #undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { #define LIBXSMM_DNN_FC_FWD_FUSE_BIAS #define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID #undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } #else LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) { return libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_emu( handle, start_thread, tid ); } #endif LIBXSMM_API_INTERN libxsmm_dnn_err_t 
libxsmm_dnn_fullyconnected_st_fwd_custom(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if all required tensors are bound */ if (handle->reg_input == 0 || handle->reg_output == 0 || handle->reg_filter == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on an AVX512 platform */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fullyconnected_st_fwd_custom_f32_f32( handle, start_thread, tid); } #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE ) { status = libxsmm_dnn_fullyconnected_st_fwd_custom_bf16_f32( handle, start_thread, tid); } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; typedef libxsmm_smmfunction gemm_function; libxsmm_blasint lda = (libxsmm_blasint)handle->ofmblock; libxsmm_blasint ldb = (libxsmm_blasint)handle->desc.C; libxsmm_blasint ldc = (libxsmm_blasint)handle->desc.K; element_input_type beta = (element_input_type)0; element_input_type alpha = (element_input_type)1; if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->desc.N, handle->desc.C, &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL); # include "template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c" } else { status = 
LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if all required tensors are bound */ if (handle->reg_input == 0 || handle->reg_output == 0 || handle->reg_filter == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) != 0) && ( handle->reg_bias == 0 ) ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) != 0) && ( handle->relumask == 0 ) ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on an AVX512 platform */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_f32_f32( handle, start_thread, tid); } #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX) { status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_emu( handle, start_thread, tid); } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16( handle, start_thread, tid); } #elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ 
else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE ) { status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_emu( handle, start_thread, tid); } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_beta = handle->gemm_fwd.xgemm.smrs; libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_zerobeta = handle->gemm_fwd2.xgemm.smrs; if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { #define LIBXSMM_DNN_FC_FWD_FUSE_NONE # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_NONE } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { #define LIBXSMM_DNN_FC_FWD_FUSE_BIAS # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { #define LIBXSMM_DNN_FC_FWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_RELU } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { #define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { #define LIBXSMM_DNN_FC_FWD_FUSE_BIAS #define LIBXSMM_DNN_FC_FWD_FUSE_RELU # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_RELU #undef 
LIBXSMM_DNN_FC_FWD_FUSE_BIAS } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { #define LIBXSMM_DNN_FC_FWD_FUSE_BIAS #define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID # include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" #undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID #undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS } else { status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_nhwc(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; LIBXSMM_UNUSED( handle ); LIBXSMM_UNUSED( start_thread ); LIBXSMM_UNUSED( tid ); return status; } libxsmm-1.17/src/libxsmm_dnn_fullyconnected_forward.h000066400000000000000000000025301415223013700232070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_FULLYCONNECTED_FORWARD_H #define LIBXSMM_DNN_FULLYCONNECTED_FORWARD_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_nhwc(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); #endif /* LIBXSMM_DNN_FULLYCONNECTED_FORWARD_H */ libxsmm-1.17/src/libxsmm_dnn_fusedbatchnorm.c000066400000000000000000000730541415223013700214550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_fusedbatchnorm_backward.h" #include "libxsmm_dnn_fusedbatchnorm_forward.h" #include "libxsmm_main.h" LIBXSMM_API libxsmm_dnn_fusedbatchnorm* libxsmm_dnn_create_fusedbatchnorm(libxsmm_dnn_fusedbatchnorm_desc fusedbatchnorm_desc, libxsmm_dnn_err_t* status) { libxsmm_dnn_fusedbatchnorm* handle = 0; int lpb; /* init libxsmm */ LIBXSMM_INIT if ( fusedbatchnorm_desc.partN > fusedbatchnorm_desc.fullN ) { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; return handle; } else if ( (fusedbatchnorm_desc.partN != fusedbatchnorm_desc.fullN) && ((fusedbatchnorm_desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) == 0 ) && ((fusedbatchnorm_desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) == 0 ) ) { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; return handle; } else { } if ( ((fusedbatchnorm_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (fusedbatchnorm_desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) || ((fusedbatchnorm_desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (fusedbatchnorm_desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) ) { handle = (libxsmm_dnn_fusedbatchnorm*)malloc(sizeof(libxsmm_dnn_fusedbatchnorm)); if (0 != handle) { *status = LIBXSMM_DNN_SUCCESS; /* zero entire content; not only safer but also sets data and code pointers to NULL */ memset(handle, 0, sizeof(*handle)); /* let's make the description persistent */ handle->desc = fusedbatchnorm_desc; /* we need to compute the memory layout given the */ *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.C, &(handle->ifmblock), &(handle->ofmblock), &lpb, handle->desc.datatype_in, handle->desc.datatype_out ); /* compute the outer blocks */ handle->blocksifm = handle->desc.C / handle->ifmblock; handle->blocksofm = handle->desc.C / handle->ofmblock; /* create barrier */ handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); /* calculate scratch size for batchstats */ handle->scratch_size = 
(sizeof(float) * 2 * handle->desc.C * handle->desc.partN); } else { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; } } else { *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } return handle; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fusedbatchnorm(const libxsmm_dnn_fusedbatchnorm* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { /* Deallocate barrier */ if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } /* deallocate handle structure */ free(/*remove constness*/(libxsmm_dnn_fusedbatchnorm*)handle); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(const libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor_datalayout* layout; *status = LIBXSMM_DNN_SUCCESS; layout = 0; if (handle != 0) { layout = (libxsmm_dnn_tensor_datalayout*) malloc(sizeof(libxsmm_dnn_tensor_datalayout)); if (layout != 0) { memset(layout, 0, sizeof(libxsmm_dnn_tensor_datalayout)); layout->format = handle->desc.buffer_format; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) || (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 5; layout->dim_type[0] = 
LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.partN; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.partN; } else { /* coverity[dead_error_begin] */ free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = 
LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.partN; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.partN; } else { /* coverity[dead_error_begin] */ free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != 
layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 4; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { layout->dim_size[0] = handle->desc.C; layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); layout->dim_size[3] = handle->desc.partN; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->desc.C; layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->desc.partN; } else { /* coverity[dead_error_begin] */ free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA) || (type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) || (type == LIBXSMM_DNN_CHANNEL_BETA) || (type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) || (type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) || (type == LIBXSMM_DNN_CHANNEL_GAMMA) || (type == LIBXSMM_DNN_CHANNEL_EXPECTVAL) || (type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV) || (type == LIBXSMM_DNN_CHANNEL_VARIANCE) ) { layout->tensor_type = LIBXSMM_DNN_CHANNEL_SCALAR; if 
((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { if ( handle->desc.datatype_stats == LIBXSMM_DNN_DATATYPE_F32 ) { layout->datatype = handle->desc.datatype_stats; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 2; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->blocksifm; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { if ( handle->desc.datatype_stats == LIBXSMM_DNN_DATATYPE_F32 ) { layout->datatype = handle->desc.datatype_stats; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(1*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(1*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 1; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_size[0] = handle->desc.C; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_RELU_MASK) ) { layout->tensor_type = LIBXSMM_DNN_RELU_MASK; if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { layout->datatype = LIBXSMM_DNN_DATATYPE_I8; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) 
malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.partN; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { layout->datatype = LIBXSMM_DNN_DATATYPE_I8; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 6; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ofmblock*handle->blocksofm; layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->desc.partN; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else { free(layout); layout = 0; /* make sure a NULL is 
returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return layout; } LIBXSMM_API size_t libxsmm_dnn_fusedbatchnorm_get_scratch_size(const libxsmm_dnn_fusedbatchnorm* handle, libxsmm_dnn_err_t* status) { size_t l_scratch_size = 0; *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { l_scratch_size = handle->scratch_size + 64; /* 64 byte extra in case the user code does not care about alignment */ } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return l_scratch_size; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_bind_scratch(libxsmm_dnn_fusedbatchnorm* handle, const void* scratch) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; uintptr_t address = (uintptr_t)scratch; size_t offset = 0; if (scratch == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } if (0 != handle) { /* align the internal scratch buffer if needed */ if (address % 64 == 0) { handle->scratch = (void*)address; } else { offset = (64 - address % 64); handle->scratch = (void*)(address+offset); } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_release_scratch(libxsmm_dnn_fusedbatchnorm* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { handle->scratch = 0; } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_REGULAR_INPUT_ADD) && (type != LIBXSMM_DNN_GRADIENT_INPUT_ADD) && (type != 
LIBXSMM_DNN_REGULAR_CHANNEL_BETA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_CHANNEL_EXPECTVAL) && (type != LIBXSMM_DNN_CHANNEL_RCPSTDDEV) && (type != LIBXSMM_DNN_CHANNEL_VARIANCE) && (type != LIBXSMM_DNN_RELU_MASK) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0 && tensor != 0) { libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(handle, type, &status); if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { handle->reg_input = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { handle->grad_input = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { handle->reg_output = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { handle->grad_output = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_INPUT_ADD ) { handle->reg_add = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT_ADD ) { handle->grad_add = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) { handle->reg_beta = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) { handle->grad_beta = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) { handle->reg_gamma = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) { handle->grad_gamma = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_CHANNEL_EXPECTVAL ) { handle->expvalue = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) { handle->rcpstddev = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_CHANNEL_VARIANCE ) { handle->variance = (libxsmm_dnn_tensor*)tensor; } else if 
( type == LIBXSMM_DNN_RELU_MASK ) { handle->relumask = (libxsmm_dnn_tensor*)tensor; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; } libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fusedbatchnorm_get_tensor(libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor* return_tensor = 0; *status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_REGULAR_INPUT_ADD) && (type != LIBXSMM_DNN_GRADIENT_INPUT_ADD) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_BETA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_CHANNEL_EXPECTVAL) && (type != LIBXSMM_DNN_CHANNEL_RCPSTDDEV) && (type != LIBXSMM_DNN_CHANNEL_VARIANCE) && (type != LIBXSMM_DNN_RELU_MASK) ) { *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return return_tensor; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { return_tensor = handle->reg_input; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { return_tensor = handle->grad_input; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { return_tensor = handle->reg_output; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { return_tensor = handle->grad_output; } else if ( type == LIBXSMM_DNN_REGULAR_INPUT_ADD ) { return_tensor = handle->reg_add; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT_ADD ) { return_tensor = handle->grad_add; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) { return_tensor = handle->reg_beta; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) { return_tensor = handle->grad_beta; } else if ( type == 
LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) { return_tensor = handle->reg_gamma; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) { return_tensor = handle->grad_gamma; } else if ( type == LIBXSMM_DNN_CHANNEL_EXPECTVAL ) { return_tensor = handle->expvalue; } else if ( type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) { return_tensor = handle->rcpstddev; } else if ( type == LIBXSMM_DNN_CHANNEL_VARIANCE ) { return_tensor = handle->variance; } else if ( type == LIBXSMM_DNN_RELU_MASK ) { return_tensor = handle->relumask; } else { /* cannot happen */ } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return return_tensor; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_release_tensor(libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_REGULAR_INPUT_ADD) && (type != LIBXSMM_DNN_GRADIENT_INPUT_ADD) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_BETA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_CHANNEL_EXPECTVAL) && (type != LIBXSMM_DNN_CHANNEL_RCPSTDDEV) && (type != LIBXSMM_DNN_CHANNEL_VARIANCE) && (type != LIBXSMM_DNN_RELU_MASK) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { handle->reg_input = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { handle->grad_input = 0; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { handle->reg_output = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { handle->grad_output = 0; } else if ( type == LIBXSMM_DNN_REGULAR_INPUT_ADD ) { handle->reg_add = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT_ADD ) { handle->grad_add = 0; } else if ( type == 
LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) { handle->reg_beta = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) { handle->grad_beta = 0; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) { handle->reg_gamma = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) { handle->grad_gamma = 0; } else if ( type == LIBXSMM_DNN_CHANNEL_EXPECTVAL ) { handle->expvalue = 0; } else if ( type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) { handle->rcpstddev = 0; } else if ( type == LIBXSMM_DNN_CHANNEL_VARIANCE ) { handle->variance = 0; } else if ( type == LIBXSMM_DNN_RELU_MASK ) { handle->relumask = 0; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_execute_st(libxsmm_dnn_fusedbatchnorm* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { switch (handle->desc.buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom( handle, start_thread, tid ); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; } } } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: { switch (handle->desc.buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_fusedbatchnorm_st_bwd_custom( handle, start_thread, tid ); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_reduce_stats_st(libxsmm_dnn_fusedbatchnorm** handles, int num_handles, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handles && num_handles > 0) { switch (kind) { case 
LIBXSMM_DNN_COMPUTE_KIND_FWD: { switch (handles[0]->desc.buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_fusedbatchnorm_reduce_stats_st_fwd_custom( handles, num_handles, start_thread, tid ); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; } } } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: { switch (handles[0]->desc.buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_fusedbatchnorm_reduce_stats_st_bwd_custom( handles, num_handles, start_thread, tid ); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } libxsmm-1.17/src/libxsmm_dnn_fusedbatchnorm_backward.c000066400000000000000000001006031415223013700233020ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_fusedbatchnorm_backward.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" } else if ( 
(handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) 
libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & 
LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef 
LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef 
libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDBN_BWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU } else if ( 
(handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDBN_BWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDBN_BWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & 
LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDBN_BWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define 
LIBXSMM_DNN_FUSEDBN_BWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == 
LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDBN_BWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if all required tensors are bound */ if ( handle->reg_input == 0 || handle->reg_gamma == 0 || handle->grad_input == 0 || handle->grad_output == 0 || handle->grad_beta == 0 || handle->grad_gamma == 0 || handle->expvalue == 0 || handle->rcpstddev == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0 ) { if ( handle->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) > 0 ) { if ( handle->grad_add == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) > 0 ) { if ( handle->reg_output == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) > 0 ) { if ( handle->relumask == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } /* check if we are on an AVX512 platform */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 16) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = 
libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c16( handle, start_thread, tid ); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c16( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 32) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c32( handle, start_thread, tid ); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c32( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 64) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c64( handle, start_thread, tid ); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c64( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == 
LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK } else { status = 
LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDBN_BWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define 
LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) { # define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDBN_BWD_BF16 } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_nhwc(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; LIBXSMM_UNUSED( handle ); LIBXSMM_UNUSED( start_thread ); LIBXSMM_UNUSED( tid ); return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_reduce_stats_st_bwd_custom(libxsmm_dnn_fusedbatchnorm** handles, int num_handles, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; int l_count; /* check if all required tensors are bound */ for ( l_count = 0; l_count < num_handles; ++l_count ) { if ( handles[l_count]->grad_beta == 0 || handles[l_count]->grad_gamma == 0 || handles[l_count]->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } #if 0 /* check if we are on an AVX512 platform */ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { status = libxsmm_dnn_fusedbatchnorm_reduce_stats_st_bwd_custom_avx512( handles, num_handles, start_thread, tid ); } else #endif { const int nImg = handles[0]->desc.partN; const int nBlocksFm = handles[0]->blocksifm; const int nFmBlock = handles[0]->ifmblock; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work2 = nBlocksFm; /* compute 
chunk size */ const int chunksize2 = (work2 % handles[0]->desc.threads == 0) ? (work2 / handles[0]->desc.threads) : ((work2 / handles[0]->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; int v = 0, fm; LIBXSMM_VLA_DECL(2, float, dgamma0, (float*)handles[0]->grad_gamma->data, nFmBlock); LIBXSMM_VLA_DECL(2, float, dbeta0, (float*)handles[0]->grad_beta->data, nFmBlock); LIBXSMM_VLA_DECL(3, float, dgamma_img0, (float*)handles[0]->scratch, nImg, nFmBlock); LIBXSMM_VLA_DECL(3, float, dbeta_img0, ((float*)handles[0]->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock); /* lazy barrier init */ libxsmm_barrier_init(handles[0]->barrier, ltid); for ( fm = thr_begin2; fm < thr_end2; ++fm ) { float* dgamma0_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma0, fm, 0, nFmBlock); float* dbeta0_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta0, fm, 0, nFmBlock); float* dgamma_img0_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img0, fm, 0, 0, nImg, nFmBlock); float* dbeta_img0_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img0, fm, 0, 0, nImg, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { dgamma0_ptr[v] = dgamma_img0_ptr[v]; dbeta0_ptr[v] = dbeta_img0_ptr[v]; } } /* now we need to reduce the dgamma and dbeta */ for ( l_count = 1; l_count < num_handles; ++l_count ) { LIBXSMM_VLA_DECL(3, float, dgamma_imgr, (float*)handles[l_count]->scratch, nImg, nFmBlock); LIBXSMM_VLA_DECL(3, float, dbeta_imgr, ((float*)handles[l_count]->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock); for ( fm = thr_begin2; fm < thr_end2; ++fm ) { float* dgamma0_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma0, fm, 0, nFmBlock); float* dbeta0_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta0, fm, 0, nFmBlock); float* dgamma_imgr_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_imgr, fm, 0, 0, nImg, nFmBlock); float* dbeta_imgr_ptr = 
&LIBXSMM_VLA_ACCESS(3, dbeta_imgr, fm, 0, 0, nImg, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { dgamma0_ptr[v] += dgamma_imgr_ptr[v]; dbeta0_ptr[v] += dbeta_imgr_ptr[v]; } } } for ( l_count = 1; l_count < num_handles; ++l_count ) { LIBXSMM_VLA_DECL(2, float, dgammar, (float*)handles[l_count]->grad_gamma->data, nFmBlock); LIBXSMM_VLA_DECL(2, float, dbetar, (float*)handles[l_count]->grad_beta->data, nFmBlock); for ( fm = thr_begin2; fm < thr_end2; ++fm ) { float* dgamma0_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma0, fm, 0, nFmBlock); float* dbeta0_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta0, fm, 0, nFmBlock); float* dgammar_ptr = &LIBXSMM_VLA_ACCESS(2, dgammar, fm, 0, nFmBlock); float* dbetar_ptr = &LIBXSMM_VLA_ACCESS(2, dbetar, fm, 0, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { dgammar_ptr[v] = dgamma0_ptr[v]; dbetar_ptr[v] = dbeta0_ptr[v]; } } } libxsmm_barrier_wait(handles[0]->barrier, ltid); } return status; } libxsmm-1.17/src/libxsmm_dnn_fusedbatchnorm_backward.h000066400000000000000000000025701415223013700233130ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_FUSEDBATCHNORM_BACKWARD_H #define LIBXSMM_DNN_FUSEDBATCHNORM_BACKWARD_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_nhwc(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_reduce_stats_st_bwd_custom(libxsmm_dnn_fusedbatchnorm** handles, int num_handles, int start_thread, int tid); #endif /* LIBXSMM_DNN_FUSEDBATCHNORM_BACKWARD_H */ libxsmm-1.17/src/libxsmm_dnn_fusedbatchnorm_forward.c000066400000000000000000001026231415223013700231740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_fusedbatchnorm_forward.h" #include "libxsmm_main.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || 
(handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } #else /* should not happen */ LIBXSMM_UNUSED(handle); 
LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # include 
"template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define 
LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedbatchnorm* handle, int 
start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDBN_FWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define 
LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDBN_FWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDBN_FWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include 
"template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDBN_FWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = 
LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDBN_FWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include 
"template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDBN_FWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if all required tensors are bound */ if ( handle->reg_input == 0 || handle->reg_output == 0 || handle->reg_beta == 0 || handle->reg_gamma == 0 || handle->expvalue == 0 || handle->rcpstddev == 0 || handle->variance == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0 ) { if ( handle->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) > 0 ) { if ( handle->reg_add == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) > 0 ) { if ( handle->relumask == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } /* check if we are on an AVX512 platform */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 16) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = 
libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c16( handle, start_thread, tid ); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c16( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 32) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c32( handle, start_thread, tid ); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c32( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 64) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c64( handle, start_thread, tid ); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c64( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == 
LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK } else { status = 
LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDBN_FWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { # define 
LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDBN_FWD_BF16 } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_nhwc(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; LIBXSMM_UNUSED( handle ); LIBXSMM_UNUSED( start_thread ); LIBXSMM_UNUSED( tid ); return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_reduce_stats_st_fwd_custom(libxsmm_dnn_fusedbatchnorm** handles, int num_handles, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; int l_count; /* check if all required tensors are bound */ for ( l_count = 0; l_count < num_handles; ++l_count ) { if ( handles[l_count]->expvalue == 0 || handles[l_count]->rcpstddev == 0 || handles[l_count]->variance == 0 || handles[l_count]->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } #if 0 /* check if we are on an AVX512 platform */ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { status = libxsmm_dnn_fusedbatchnorm_reduce_stats_st_fwd_custom_avx512( handles, num_handles, start_thread, tid ); } else #endif { const int nImg = handles[0]->desc.partN; const int nBlocksFm = handles[0]->blocksifm; const int nFmBlock = handles[0]->ifmblock; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int 
work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handles[0]->desc.threads == 0) ? (work2 / handles[0]->desc.threads) : ((work2 / handles[0]->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; int v = 0, fm; const float sqrt_eps = 1e-7f; const float nhw = (float)(handles[0]->desc.fullN * handles[0]->desc.H * handles[0]->desc.W); const float recp_nhw = 1.0f/nhw; LIBXSMM_VLA_DECL(2, float, bmean0, (float*)handles[0]->expvalue->data, nFmBlock); LIBXSMM_VLA_DECL(2, float, brstd0, (float*)handles[0]->rcpstddev->data, nFmBlock); LIBXSMM_VLA_DECL(2, float, variance0, (float*)handles[0]->variance->data, nFmBlock); LIBXSMM_VLA_DECL(3, float, sum_img0, (float*)handles[0]->scratch, nImg, nFmBlock); LIBXSMM_VLA_DECL(3, float, sumsq_img0, ((float*)handles[0]->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock); /* lazy barrier init */ libxsmm_barrier_init(handles[0]->barrier, ltid); /* now we need to reduce the sum and sum^2, we use the final */ for ( l_count = 1; l_count < num_handles; ++l_count ) { LIBXSMM_VLA_DECL(3, float, sum_imgr, (float*)handles[l_count]->scratch, nImg, nFmBlock); LIBXSMM_VLA_DECL(3, float, sumsq_imgr, ((float*)handles[l_count]->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock); for ( fm = thr_begin2; fm < thr_end2; ++fm ) { float* sum_img0_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img0, fm, 0, 0, nImg, nFmBlock); float* sumsq_img0_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img0, fm, 0, 0, nImg, nFmBlock); float* sum_imgr_ptr = &LIBXSMM_VLA_ACCESS(3, sum_imgr, fm, 0, 0, nImg, nFmBlock); float* sumsq_imgr_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_imgr, fm, 0, 0, nImg, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { sum_img0_ptr[v] += sum_imgr_ptr[v]; sumsq_img0_ptr[v] += sumsq_imgr_ptr[v]; } } } for ( 
fm = thr_begin2; fm < thr_end2; ++fm ) { float* bmean0_ptr = &LIBXSMM_VLA_ACCESS(2, bmean0, fm, 0, nFmBlock); float* brstd0_ptr = &LIBXSMM_VLA_ACCESS(2, brstd0, fm, 0, nFmBlock); float* tvar0_ptr = &LIBXSMM_VLA_ACCESS(2, variance0, fm, 0, nFmBlock); float* sum_img0_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img0, fm, 0, 0, nImg, nFmBlock); float* sumsq_img0_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img0, fm, 0, 0, nImg, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { const float tbmean = (recp_nhw * sum_img0_ptr[v]); const float tbmeansq = tbmean * tbmean; const float tsqbmean = recp_nhw * sumsq_img0_ptr[v]; const float tvar = tsqbmean - tbmeansq; const float tbrstd = (float)(1.0/sqrt((double)tvar + sqrt_eps)); bmean0_ptr[v] = tbmean; brstd0_ptr[v] = tbrstd; tvar0_ptr[v] = tvar; } } for ( l_count = 1; l_count < num_handles; ++l_count ) { LIBXSMM_VLA_DECL(2, float, bmeanr, (float*)handles[l_count]->expvalue->data, nFmBlock); LIBXSMM_VLA_DECL(2, float, brstdr, (float*)handles[l_count]->rcpstddev->data, nFmBlock); LIBXSMM_VLA_DECL(2, float, variancer, (float*)handles[l_count]->variance->data, nFmBlock); for ( fm = thr_begin2; fm < thr_end2; ++fm ) { float* bmean0_ptr = &LIBXSMM_VLA_ACCESS(2, bmean0, fm, 0, nFmBlock); float* brstd0_ptr = &LIBXSMM_VLA_ACCESS(2, brstd0, fm, 0, nFmBlock); float* tvar0_ptr = &LIBXSMM_VLA_ACCESS(2, variance0, fm, 0, nFmBlock); float* bmeanr_ptr = &LIBXSMM_VLA_ACCESS(2, bmeanr, fm, 0, nFmBlock); float* brstdr_ptr = &LIBXSMM_VLA_ACCESS(2, brstdr, fm, 0, nFmBlock); float* tvarr_ptr = &LIBXSMM_VLA_ACCESS(2, variancer, fm, 0, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { bmeanr_ptr[v] = bmean0_ptr[v]; brstdr_ptr[v] = brstd0_ptr[v]; tvarr_ptr[v] = tvar0_ptr[v]; } } } libxsmm_barrier_wait(handles[0]->barrier, ltid); } return status; } 
libxsmm-1.17/src/libxsmm_dnn_fusedbatchnorm_forward.h000066400000000000000000000025651415223013700232050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DNN_FUSEDBATCHNORM_FORWARD_H #define LIBXSMM_DNN_FUSEDBATCHNORM_FORWARD_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_nhwc(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_reduce_stats_st_fwd_custom(libxsmm_dnn_fusedbatchnorm** handles, int num_handles, int start_thread, int tid); #endif /* LIBXSMM_DNN_FUSEDBATCHNORM_FORWARD_H */ libxsmm-1.17/src/libxsmm_dnn_fusedgroupnorm.c000066400000000000000000000740251415223013700215270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_fusedgroupnorm_backward.h" #include "libxsmm_dnn_fusedgroupnorm_forward.h" #include "libxsmm_main.h" LIBXSMM_API libxsmm_dnn_fusedgroupnorm* libxsmm_dnn_create_fusedgroupnorm(libxsmm_dnn_fusedgroupnorm_desc fusedgroupnorm_desc, libxsmm_dnn_err_t* status) { libxsmm_dnn_fusedgroupnorm* handle = 0; int lpb; /* init libxsmm */ LIBXSMM_INIT if ( ((fusedgroupnorm_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (fusedgroupnorm_desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) || ((fusedgroupnorm_desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (fusedgroupnorm_desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) ) { handle = (libxsmm_dnn_fusedgroupnorm*)malloc(sizeof(libxsmm_dnn_fusedgroupnorm)); if (0 != handle) { *status = LIBXSMM_DNN_SUCCESS; /* zero entire content; not only safer but also sets data and code pointers to NULL */ memset(handle, 0, sizeof(*handle)); /* let's make the description persistent */ handle->desc = fusedgroupnorm_desc; /* we need to compute the memory layout given the */ *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.C, &(handle->ifmblock), &(handle->ofmblock), &lpb, handle->desc.datatype_in, handle->desc.datatype_out ); /* compute the outer blocks */ handle->blocksifm = handle->desc.C / handle->ifmblock; handle->blocksofm = handle->desc.C / handle->ofmblock; /* create barrier */ handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); /* calculate scratch size for batchstats */ handle->scratch_size = (sizeof(float) * 2 * ((handle->desc.C * handle->desc.N) + (handle->desc.G * handle->desc.N))); } else { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; } } else { *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } return handle; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fusedgroupnorm(const libxsmm_dnn_fusedgroupnorm* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { /* Deallocate barrier 
*/ if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } /* deallocate handle structure */ free(/*remove constness*/(libxsmm_dnn_fusedgroupnorm*)handle); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout(const libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor_datalayout* layout; *status = LIBXSMM_DNN_SUCCESS; layout = 0; if (handle != 0) { layout = (libxsmm_dnn_tensor_datalayout*) malloc(sizeof(libxsmm_dnn_tensor_datalayout)); if (layout != 0) { memset(layout, 0, sizeof(libxsmm_dnn_tensor_datalayout)); layout->format = handle->desc.buffer_format; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) || (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_F32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || 
(type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else { /* coverity[dead_error_begin] */ free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->desc.W + 
(2*handle->desc.pad_w_in); layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 4; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || 
(type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { layout->dim_size[0] = handle->desc.C; layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); layout->dim_size[3] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->desc.C; layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->desc.N; } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA) || (type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) || (type == LIBXSMM_DNN_CHANNEL_BETA) || (type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) || (type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) || (type == LIBXSMM_DNN_CHANNEL_GAMMA) ) { layout->tensor_type = LIBXSMM_DNN_CHANNEL_SCALAR; if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { if ( handle->desc.datatype_stats == LIBXSMM_DNN_DATATYPE_F32 ) { layout->datatype = handle->desc.datatype_stats; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 2; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->blocksifm; } else { free(layout); 
layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { if ( handle->desc.datatype_stats == LIBXSMM_DNN_DATATYPE_F32 ) { layout->datatype = handle->desc.datatype_stats; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(1*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(1*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 1; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_size[0] = handle->desc.C; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_CHANNEL_EXPECTVAL) || (type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV) || (type == LIBXSMM_DNN_CHANNEL_VARIANCE) ) { layout->tensor_type = LIBXSMM_DNN_CHANNEL_SCALAR; if ( ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) || ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) ) { if ( handle->desc.datatype_stats == LIBXSMM_DNN_DATATYPE_F32 ) { layout->datatype = handle->desc.datatype_stats; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 2; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_G; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->desc.G; layout->dim_size[1] = handle->desc.N; } else { free(layout); 
layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_RELU_MASK) ) { layout->tensor_type = LIBXSMM_DNN_RELU_MASK; if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { layout->datatype = LIBXSMM_DNN_DATATYPE_I8; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { layout->datatype = LIBXSMM_DNN_DATATYPE_I8; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 6; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = 
LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->ofmblock*handle->blocksofm; layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->desc.N; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return layout; } LIBXSMM_API size_t libxsmm_dnn_fusedgroupnorm_get_scratch_size(const libxsmm_dnn_fusedgroupnorm* handle, libxsmm_dnn_err_t* status) { size_t l_scratch_size = 0; *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { l_scratch_size = handle->scratch_size + 64; /* 64 byte extra in case the user code does not care about alignment */ } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return l_scratch_size; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_bind_scratch(libxsmm_dnn_fusedgroupnorm* handle, const void* scratch) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; uintptr_t address = (uintptr_t)scratch; size_t offset = 0; if (scratch == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } if (0 != handle) { /* align the internal scratch buffer if needed */ if (address % 64 == 0) { handle->scratch = (void*)address; } else { offset = (64 - address % 64); handle->scratch = (void*)(address+offset); } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_release_scratch(libxsmm_dnn_fusedgroupnorm* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { handle->scratch = 0; } else { status = 
LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_bind_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_REGULAR_INPUT_ADD) && (type != LIBXSMM_DNN_GRADIENT_INPUT_ADD) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_BETA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_CHANNEL_EXPECTVAL) && (type != LIBXSMM_DNN_CHANNEL_RCPSTDDEV) && (type != LIBXSMM_DNN_CHANNEL_VARIANCE) && (type != LIBXSMM_DNN_RELU_MASK) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0 && tensor != 0) { libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout(handle, type, &status); if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { handle->reg_input = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { handle->grad_input = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { handle->reg_output = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { handle->grad_output = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_INPUT_ADD ) { handle->reg_add = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT_ADD ) { handle->grad_add = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) { handle->reg_beta = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) { 
handle->grad_beta = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) { handle->reg_gamma = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) { handle->grad_gamma = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_CHANNEL_EXPECTVAL ) { handle->expvalue = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) { handle->rcpstddev = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_CHANNEL_VARIANCE ) { handle->variance = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RELU_MASK ) { handle->relumask = (libxsmm_dnn_tensor*)tensor; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; } libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fusedgroupnorm_get_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor* return_tensor = 0; *status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_REGULAR_INPUT_ADD) && (type != LIBXSMM_DNN_GRADIENT_INPUT_ADD) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_BETA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_CHANNEL_EXPECTVAL) && (type != LIBXSMM_DNN_CHANNEL_RCPSTDDEV) && (type != LIBXSMM_DNN_CHANNEL_VARIANCE) && (type != LIBXSMM_DNN_RELU_MASK) ) { *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return return_tensor; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { return_tensor = handle->reg_input; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { return_tensor = 
handle->grad_input; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { return_tensor = handle->reg_output; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { return_tensor = handle->grad_output; } else if ( type == LIBXSMM_DNN_REGULAR_INPUT_ADD ) { return_tensor = handle->reg_add; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT_ADD ) { return_tensor = handle->grad_add; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) { return_tensor = handle->reg_beta; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) { return_tensor = handle->grad_beta; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) { return_tensor = handle->reg_gamma; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) { return_tensor = handle->grad_gamma; } else if ( type == LIBXSMM_DNN_CHANNEL_EXPECTVAL ) { return_tensor = handle->expvalue; } else if ( type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) { return_tensor = handle->rcpstddev; } else if ( type == LIBXSMM_DNN_CHANNEL_VARIANCE ) { return_tensor = handle->variance; } else if ( type == LIBXSMM_DNN_RELU_MASK ) { return_tensor = handle->relumask; } else { /* cannot happen */ } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return return_tensor; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_release_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_REGULAR_INPUT_ADD) && (type != LIBXSMM_DNN_GRADIENT_INPUT_ADD) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_BETA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) && (type != LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_CHANNEL_EXPECTVAL) && (type != LIBXSMM_DNN_CHANNEL_RCPSTDDEV) && (type != LIBXSMM_DNN_CHANNEL_VARIANCE) && (type != 
LIBXSMM_DNN_RELU_MASK) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { handle->reg_input = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { handle->grad_input = 0; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { handle->reg_output = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { handle->grad_output = 0; } else if ( type == LIBXSMM_DNN_REGULAR_INPUT_ADD ) { handle->reg_add = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT_ADD ) { handle->grad_add = 0; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) { handle->reg_beta = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) { handle->grad_beta = 0; } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) { handle->reg_gamma = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) { handle->grad_gamma = 0; } else if ( type == LIBXSMM_DNN_CHANNEL_EXPECTVAL ) { handle->expvalue = 0; } else if ( type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) { handle->rcpstddev = 0; } else if ( type == LIBXSMM_DNN_CHANNEL_VARIANCE ) { handle->variance = 0; } else if ( type == LIBXSMM_DNN_RELU_MASK ) { handle->relumask = 0; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_execute_st(libxsmm_dnn_fusedgroupnorm* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { switch (handle->desc.buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom( handle, start_thread, tid ); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; } } } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: { switch (handle->desc.buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom( handle, 
start_thread, tid ); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_reduce_stats_st(libxsmm_dnn_fusedgroupnorm** handles, int num_handles, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handles && num_handles > 0) { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_BWD: { switch (handles[0]->desc.buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_fusedgroupnorm_reduce_stats_st_bwd_custom( handles, num_handles, start_thread, tid ); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } libxsmm-1.17/src/libxsmm_dnn_fusedgroupnorm_backward.c000066400000000000000000000711401415223013700233600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_fusedgroupnorm_backward.h" #include "libxsmm_main.h" #if 0 LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include 
"template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( 
handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = 
LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef 
LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDGN_BWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef 
LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDGN_BWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDGN_BWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { # 
define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDGN_BWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if 
defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDGN_BWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef 
LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDGN_BWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } #endif LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if all required tensors are bound */ if ( handle->reg_input == 0 || handle->reg_gamma == 0 || handle->grad_input == 0 || handle->grad_output == 0 || handle->grad_beta == 0 || handle->grad_gamma == 0 || handle->expvalue == 0 || handle->rcpstddev == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_GN) > 0 ) { if ( handle->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { if ( handle->grad_add == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { if ( handle->reg_output == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { if ( handle->relumask == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } /* check if we are on an AVX512 platform */ #if 0 #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 16) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c16( handle, start_thread, tid ); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == 
LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c16( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 32) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c32( handle, start_thread, tid ); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c32( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 64) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c64( handle, start_thread, tid ); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c64( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) == 
LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDGN_BWD_BF16 if ( handle->desc.fuse_order != 
LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } # undef 
LIBXSMM_DNN_FUSEDGN_BWD_BF16 } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_nhwc(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; LIBXSMM_UNUSED( handle ); LIBXSMM_UNUSED( start_thread ); LIBXSMM_UNUSED( tid ); return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_reduce_stats_st_bwd_custom(libxsmm_dnn_fusedgroupnorm** handles, int num_handles, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; int l_count; /* check if all required tensors are bound */ for ( l_count = 0; l_count < num_handles; ++l_count ) { if ( handles[l_count]->grad_beta == 0 || handles[l_count]->grad_gamma == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } #if 0 /* check if we are on an AVX512 platform */ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { status = libxsmm_dnn_fusedgroupnorm_reduce_stats_st_bwd_custom_avx512( handles, num_handles, start_thread, tid ); } else #endif { const int nBlocksFm = handles[0]->blocksifm; const int nFmBlock = handles[0]->ifmblock; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handles[0]->desc.threads == 0) ? (work2 / handles[0]->desc.threads) : ((work2 / handles[0]->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; int v = 0, fm; LIBXSMM_VLA_DECL(2, float, dgamma0, (float*)handles[0]->grad_gamma->data, nFmBlock); LIBXSMM_VLA_DECL(2, float, dbeta0, (float*)handles[0]->grad_beta->data, nFmBlock); /* lazy barrier init */ libxsmm_barrier_init(handles[0]->barrier, ltid); /* now we need to reduce the dgamma and dbeta */ for ( l_count = 1; l_count < num_handles; ++l_count ) { LIBXSMM_VLA_DECL(2, float, dgammar, (float*)handles[l_count]->grad_gamma->data, nFmBlock); LIBXSMM_VLA_DECL(2, float, dbetar, (float*)handles[l_count]->grad_beta->data, nFmBlock); for ( fm = thr_begin2; fm < thr_end2; ++fm ) { float* dgamma0_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma0, fm, 0, nFmBlock); float* dbeta0_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta0, fm, 0, nFmBlock); float* dgammar_ptr = &LIBXSMM_VLA_ACCESS(2, dgammar, fm, 0, nFmBlock); float* dbetar_ptr = &LIBXSMM_VLA_ACCESS(2, dbetar, fm, 0, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { dgamma0_ptr[v] += dgammar_ptr[v]; dbeta0_ptr[v] += dbetar_ptr[v]; } } } for ( l_count = 1; l_count < num_handles; ++l_count ) { LIBXSMM_VLA_DECL(2, float, dgammar, (float*)handles[l_count]->grad_gamma->data, nFmBlock); LIBXSMM_VLA_DECL(2, float, dbetar, (float*)handles[l_count]->grad_beta->data, nFmBlock); for ( fm = thr_begin2; fm < thr_end2; ++fm ) { float* dgamma0_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma0, fm, 0, nFmBlock); float* dbeta0_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta0, fm, 0, nFmBlock); float* dgammar_ptr = &LIBXSMM_VLA_ACCESS(2, dgammar, fm, 0, nFmBlock); float* dbetar_ptr = &LIBXSMM_VLA_ACCESS(2, dbetar, fm, 0, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { dgammar_ptr[v] = dgamma0_ptr[v]; dbetar_ptr[v] = dbeta0_ptr[v]; } } } libxsmm_barrier_wait(handles[0]->barrier, ltid); } return status; } libxsmm-1.17/src/libxsmm_dnn_fusedgroupnorm_backward.h000066400000000000000000000025701415223013700233660ustar00rootroot00000000000000/****************************************************************************** * 
Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DNN_FUSEDGROUPNORM_BACKWARD_H #define LIBXSMM_DNN_FUSEDGROUPNORM_BACKWARD_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_nhwc(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_reduce_stats_st_bwd_custom(libxsmm_dnn_fusedgroupnorm** handles, int num_handles, int start_thread, int tid); #endif /* LIBXSMM_DNN_FUSEDGROUPNORM_BACKWARD_H */ libxsmm-1.17/src/libxsmm_dnn_fusedgroupnorm_forward.c000066400000000000000000000623011415223013700232450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_fusedgroupnorm_forward.h" #include "libxsmm_main.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if 0 LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" } else if ( 
(handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = 
LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef 
LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( 
(handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDGN_FWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { # define 
LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDGN_FWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDGN_FWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == 
LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDGN_FWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) 
libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDGN_FWD_BF16 if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & 
LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } # undef LIBXSMM_DNN_FUSEDGN_FWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } #endif LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if all required tensors are bound */ if ( handle->reg_input == 0 || handle->reg_output == 0 || handle->reg_beta == 0 || handle->reg_gamma == 0 || handle->expvalue == 0 || handle->rcpstddev == 0 || handle->variance == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_GN) > 0 ) { if ( handle->scratch == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { if ( handle->reg_add == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { if ( handle->relumask == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } } /* check if we are on an AVX512 platform */ #if 0 #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 16) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c16( handle, start_thread, tid ); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == 
LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c16( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 32) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c32( handle, start_thread, tid ); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c32( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 64) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c64( handle, start_thread, tid ); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c64( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_stats_type; if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) == 
LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef float element_stats_type; # define LIBXSMM_DNN_FUSEDGN_FWD_BF16 if ( handle->desc.fuse_order != 
LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; } else { if ( handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN ) { # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_RELU ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK ) { # define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK # include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK } else { status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; } } # undef 
LIBXSMM_DNN_FUSEDGN_FWD_BF16 } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_nhwc(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; LIBXSMM_UNUSED( handle ); LIBXSMM_UNUSED( start_thread ); LIBXSMM_UNUSED( tid ); return status; } libxsmm-1.17/src/libxsmm_dnn_fusedgroupnorm_forward.h000066400000000000000000000023061415223013700232510ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DNN_FUSEDGROUPNORM_FORWARD_H #define LIBXSMM_DNN_FUSEDGROUPNORM_FORWARD_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_nhwc(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); #endif /* LIBXSMM_DNN_FUSEDGROUPNORM_FORWARD_H */ libxsmm-1.17/src/libxsmm_dnn_optimizer.c000066400000000000000000000302211415223013700204600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #include "libxsmm_dnn_optimizer_sgd.h" #include "libxsmm_main.h" LIBXSMM_API libxsmm_dnn_optimizer* libxsmm_dnn_create_optimizer(libxsmm_dnn_optimizer_desc optimizer_desc, libxsmm_dnn_err_t* status) { libxsmm_dnn_optimizer* handle = 0; /* init libxsmm */ LIBXSMM_INIT if ( (optimizer_desc.datatype == LIBXSMM_DNN_DATATYPE_F32) || (optimizer_desc.datatype == LIBXSMM_DNN_DATATYPE_BF16) ) { handle = (libxsmm_dnn_optimizer*)malloc(sizeof(libxsmm_dnn_optimizer)); if (0 != handle) { *status = LIBXSMM_DNN_SUCCESS; /* zero entire content; not only safer but also sets data and code pointers to NULL */ memset(handle, 0, sizeof(*handle)); /* let's make the description persistent */ handle->desc = optimizer_desc; if ( (handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { /* we need to compute the memory layout given the */ *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.K, &(handle->bc), &(handle->bk), &(handle->fm_lp_block), handle->desc.datatype, handle->desc.datatype ); /* compute the outer blocks */ handle->Bc = handle->desc.C / handle->bc; handle->Bk = handle->desc.K / handle->bk; } else if ( (handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) > 0 ) { if ( optimizer_desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { handle->fm_lp_block = 1; } else if ( optimizer_desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { handle->fm_lp_block = 2; } else { } handle->bc = handle->desc.bc; handle->bk = handle->desc.bk; handle->Bc = handle->desc.C / handle->bc; handle->Bk = handle->desc.K / handle->bk; } else { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; free( handle ); handle = 0; return handle; } /* create barrier */ 
handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); /* calculate scratch size for local optimizer copies of one feature map block per thread */ handle->scratch_size = 1; } else { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; } } else { *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } return handle; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_optimizer(const libxsmm_dnn_optimizer* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { /* Deallocate barrier */ if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } /* deallocate handle structure */ free(/*remove constness*/(libxsmm_dnn_optimizer*)handle); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_optimizer_create_tensor_datalayout(const libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor_datalayout* layout; *status = LIBXSMM_DNN_SUCCESS; layout = 0; if (handle != 0) { layout = (libxsmm_dnn_tensor_datalayout*) malloc(sizeof(libxsmm_dnn_tensor_datalayout)); if (layout != 0) { memset(layout, 0, sizeof(libxsmm_dnn_tensor_datalayout)); layout->format = handle->desc.filter_format; if ( (type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_GRADIENT_FILTER) || (type == LIBXSMM_DNN_MASTER_FILTER) ) { if ( ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) || ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) > 0) ) { if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { layout->datatype = handle->desc.datatype; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 4; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; 
layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = handle->bk; layout->dim_size[1] = handle->bc; layout->dim_size[2] = handle->Bc; layout->dim_size[3] = handle->Bk; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { layout->datatype = handle->desc.datatype; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = handle->fm_lp_block; layout->dim_size[1] = handle->bk; layout->dim_size[2] = handle->bc/handle->fm_lp_block; layout->dim_size[3] = handle->Bc; layout->dim_size[4] = handle->Bk; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return layout; } LIBXSMM_API size_t libxsmm_dnn_optimizer_get_scratch_size(const libxsmm_dnn_optimizer* handle, libxsmm_dnn_err_t* status) { size_t l_scratch_size = 0; *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { l_scratch_size = 
handle->scratch_size + 64; /* 64 byte extra in case the user code does not care about alignment */ } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return l_scratch_size; } LIBXSMM_API void* libxsmm_dnn_optimizer_get_scratch_ptr(const libxsmm_dnn_optimizer* handle, libxsmm_dnn_err_t* status) { *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { return handle->scratch; } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return 0; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_bind_scratch(libxsmm_dnn_optimizer* handle, const void* scratch) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; uintptr_t address = (uintptr_t)scratch; size_t offset = 0; if (scratch == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } if (0 != handle) { /* align the internal scratch buffer if needed */ if (address % 64 == 0) { handle->scratch = (void*)address; } else { offset = (64 - address % 64); handle->scratch = (void*)(address+offset); } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_release_scratch(libxsmm_dnn_optimizer* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { handle->scratch = 0; } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_bind_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && (type != LIBXSMM_DNN_MASTER_FILTER) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0 && tensor != 0) { libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_optimizer_create_tensor_datalayout(handle, type, &status); if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { if ( type == 
LIBXSMM_DNN_REGULAR_FILTER ) { handle->reg_filter = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { handle->grad_filter = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_MASTER_FILTER ) { handle->master_filter = (libxsmm_dnn_tensor*)tensor; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; } libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_optimizer_get_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor* return_tensor = 0; *status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && (type != LIBXSMM_DNN_MASTER_FILTER) ) { *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return return_tensor; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { return_tensor = handle->reg_filter; } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { return_tensor = handle->grad_filter; } else if ( type == LIBXSMM_DNN_MASTER_FILTER ) { return_tensor = handle->master_filter; } else { /* cannot happen */ } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return return_tensor; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_release_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && (type != LIBXSMM_DNN_MASTER_FILTER) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { handle->reg_filter = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { handle->grad_filter = 0; } else if ( type == LIBXSMM_DNN_MASTER_FILTER ) { handle->master_filter = 0; } else { /* 
cannot happen */ } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_execute_st(libxsmm_dnn_optimizer* handle, /*unsigned*/int start_thread, /*unsigned*/int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } libxsmm-1.17/src/libxsmm_dnn_optimizer_sgd.c000066400000000000000000000100651415223013700213210ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_optimizer_sgd.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_optimizer_sgd_st_f32_f32(libxsmm_dnn_optimizer* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_optimizer_sgd_st_bf16_bf16(libxsmm_dnn_optimizer* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_optimizer_sgd_st_f32_f32(libxsmm_dnn_optimizer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_filter_type; # define LIBXSMM_DNN_OPTIMIZER_SGD_F32_AVX512 # include "template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c" # undef LIBXSMM_DNN_OPTIMIZER_SGD_F32_AVX512 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_optimizer_sgd_st_bf16_bf16(libxsmm_dnn_optimizer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_filter_type; typedef float element_master_type; # define LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512 # include "template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c" # undef LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_optimizer_sgd_st(libxsmm_dnn_optimizer* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have filter, grad_filter */ if ( handle->reg_filter == 0 || 
handle->grad_filter == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } if ( (handle->master_filter == 0) && (handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16) ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on an AVX512 platform */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_optimizer_sgd_st_f32_f32( handle, start_thread, tid); } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_optimizer_sgd_st_bf16_bf16( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_filter_type; # include "template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c" } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { typedef libxsmm_bfloat16 element_filter_type; typedef float element_master_type; # define LIBXSMM_DNN_OPTIMIZER_SGD_BF16 # include "template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c" # undef LIBXSMM_DNN_OPTIMIZER_SGD_BF16 } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } libxsmm-1.17/src/libxsmm_dnn_optimizer_sgd.h000066400000000000000000000020101415223013700213150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_OPTIMIZER_SGD_H #define LIBXSMM_DNN_OPTIMIZER_SGD_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_optimizer_sgd_st(libxsmm_dnn_optimizer* handle, int start_thread, int tid); #endif /* LIBXSMM_DNN_OPTIMIZER_SGD_H */ libxsmm-1.17/src/libxsmm_dnn_pooling.c000066400000000000000000000470151415223013700201160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #include "libxsmm_dnn_pooling_backward.h" #include "libxsmm_dnn_pooling_forward.h" #include "libxsmm_main.h" LIBXSMM_API libxsmm_dnn_pooling* libxsmm_dnn_create_pooling(libxsmm_dnn_pooling_desc pooling_desc, libxsmm_dnn_err_t* status) { libxsmm_dnn_pooling* handle = 0; int lpb; /* init libxsmm */ LIBXSMM_INIT if ( ((pooling_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (pooling_desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) || ((pooling_desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (pooling_desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) ) { handle = (libxsmm_dnn_pooling*)malloc(sizeof(libxsmm_dnn_pooling)); if (0 != handle) { *status = LIBXSMM_DNN_SUCCESS; /* zero entire content; not only safer but also sets data and code pointers to NULL */ memset(handle, 0, sizeof(*handle)); /* let's make the description persistent */ handle->desc = pooling_desc; /* we need to compute the memory layout given the */ *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, 
handle->desc.C, &(handle->ifmblock), &(handle->ofmblock), &lpb, handle->desc.datatype_in, handle->desc.datatype_out ); /* compute the outer blocks */ handle->blocksifm = handle->desc.C / handle->ifmblock; handle->blocksofm = handle->desc.C / handle->ofmblock; /* setting ofh and ofw */ handle->ofh = (handle->desc.H + 2 * handle->desc.pad_h - handle->desc.R) / handle->desc.u + 1; handle->ofw = (handle->desc.W + 2 * handle->desc.pad_w - handle->desc.S) / handle->desc.v + 1; /* create barrier */ handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); /* calculate scratch size for local pooling copies of one feature map block per thread */ handle->scratch_size = (sizeof(float) * ( (size_t)handle->desc.H + (size_t)LIBXSMM_MAX(handle->desc.pad_h_in, handle->desc.pad_h_out)*2 ) * ( (size_t)handle->desc.W + (size_t)LIBXSMM_MAX(handle->desc.pad_w_in, handle->desc.pad_w_out)*2 ) * LIBXSMM_MAX( handle->ofmblock, handle->ifmblock ) * handle->desc.threads ); } else { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; } } else { *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } return handle; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_pooling(const libxsmm_dnn_pooling* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { /* Deallocate barrier */ if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } /* deallocate handle structure */ free(/*remove constness*/(libxsmm_dnn_pooling*)handle); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_pooling_create_tensor_datalayout(const libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor_datalayout* layout; *status = LIBXSMM_DNN_SUCCESS; layout = 0; if (handle != 0) { layout = (libxsmm_dnn_tensor_datalayout*) malloc(sizeof(libxsmm_dnn_tensor_datalayout)); if (layout != 0) { memset(layout, 0, sizeof(libxsmm_dnn_tensor_datalayout)); 
layout->format = handle->desc.buffer_format; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) || (type == LIBXSMM_DNN_POOLING_MASK) ) { if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { if ( type == LIBXSMM_DNN_POOLING_MASK ) { layout->datatype = handle->desc.datatype_mask; } else { layout->datatype = LIBXSMM_DNN_DATATYPE_F32; } layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = (handle->ofw) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->ofh) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_POOLING_MASK) ) { layout->dim_size[0] = handle->ofmblock; 
layout->dim_size[1] = handle->ofw; layout->dim_size[2] = handle->ofh; layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else { /* coverity[dead_error_begin] */ free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { if ( type == LIBXSMM_DNN_POOLING_MASK ) { layout->datatype = handle->desc.datatype_mask; } else { layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; } layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 5; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { layout->dim_size[0] = handle->ifmblock; layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); layout->dim_size[3] = handle->blocksifm; layout->dim_size[4] = handle->desc.N; } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = (handle->ofw) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->ofh) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else if 
( (type == LIBXSMM_DNN_POOLING_MASK) ) { layout->dim_size[0] = handle->ofmblock; layout->dim_size[1] = handle->ofw; layout->dim_size[2] = handle->ofh; layout->dim_size[3] = handle->blocksofm; layout->dim_size[4] = handle->desc.N; } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { if ( type == LIBXSMM_DNN_POOLING_MASK ) { layout->datatype = handle->desc.datatype_mask; } else { layout->datatype = handle->desc.datatype_in; } layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 4; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { layout->dim_size[0] = handle->desc.C; layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); layout->dim_size[3] = handle->desc.N; } else if ( (type == 
LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { layout->dim_size[0] = handle->desc.C; layout->dim_size[1] = (handle->ofw) + (2*handle->desc.pad_w_out); layout->dim_size[2] = (handle->ofh) + (2*handle->desc.pad_h_out); layout->dim_size[3] = handle->desc.N; } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return layout; } LIBXSMM_API size_t libxsmm_dnn_pooling_get_scratch_size(const libxsmm_dnn_pooling* handle, libxsmm_dnn_err_t* status) { size_t l_scratch_size = 0; *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { l_scratch_size = handle->scratch_size + 64; /* 64 byte extra in case the user code does not care about alignment */ } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return l_scratch_size; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_bind_scratch(libxsmm_dnn_pooling* handle, const void* scratch) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; uintptr_t address = (uintptr_t)scratch; size_t offset = 0; if (scratch == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } if (0 != handle) { /* align the internal scratch buffer if needed */ if (address % 64 == 0) { handle->scratch = (void*)address; } else { offset = (64 - address % 64); handle->scratch = (void*)(address+offset); } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t 
libxsmm_dnn_pooling_release_scratch(libxsmm_dnn_pooling* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { handle->scratch = 0; } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_bind_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_POOLING_MASK) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0 && tensor != 0) { libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_pooling_create_tensor_datalayout(handle, type, &status); if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { handle->reg_input = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { handle->grad_input = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { handle->reg_output = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { handle->grad_output = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_POOLING_MASK ) { handle->mask = (libxsmm_dnn_tensor*)tensor; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; } libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_pooling_get_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor* return_tensor = 0; *status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != 
LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_POOLING_MASK) ) { *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return return_tensor; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { return_tensor = handle->reg_input; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { return_tensor = handle->grad_input; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { return_tensor = handle->reg_output; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { return_tensor = handle->grad_output; } else if ( type == LIBXSMM_DNN_POOLING_MASK ) { return_tensor = handle->mask; } else { /* cannot happen */ } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return return_tensor; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_release_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && (type != LIBXSMM_DNN_POOLING_MASK) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { handle->reg_input = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { handle->grad_input = 0; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { handle->reg_output = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { handle->grad_output = 0; } else if ( type == LIBXSMM_DNN_POOLING_MASK ) { handle->mask = 0; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_execute_st(libxsmm_dnn_pooling* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { switch (kind) { case 
LIBXSMM_DNN_COMPUTE_KIND_FWD: { switch (handle->desc.buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_pooling_st_fwd_custom( handle, start_thread, tid ); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; } } } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: { switch (handle->desc.buffer_format) { case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { status = libxsmm_dnn_pooling_st_bwd_custom( handle, start_thread, tid ); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } libxsmm-1.17/src/libxsmm_dnn_pooling_backward.c000066400000000000000000000316131415223013700217510ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_pooling_backward.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_BWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_BWD_AVG # include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = 
LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_BWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_BWD_AVG # include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_BWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_BWD_AVG # include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } #else /* 
should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; # define LIBXSMM_DNN_POOLING_BWD_BF16 if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_BWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_BWD_AVG # include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } # undef LIBXSMM_DNN_POOLING_BWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; # define LIBXSMM_DNN_POOLING_BWD_BF16 if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_BWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_MAX } else if ( 
handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_BWD_AVG # include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } # undef LIBXSMM_DNN_POOLING_BWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; # define LIBXSMM_DNN_POOLING_BWD_BF16 if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_BWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_BWD_AVG # include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } # undef LIBXSMM_DNN_POOLING_BWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and mask */ if ( handle->grad_input == 0 || handle->grad_output == 0 || ( (handle->mask == 0) && (handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX) ) ) { 
status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on an AVX512 platform */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 16) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { LIBXSMM_ASSERT(NULL != handle->mask); status = libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c16( handle, start_thread, tid); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { LIBXSMM_ASSERT(NULL != handle->mask); status = libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c16( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 32) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { LIBXSMM_ASSERT(NULL != handle->mask); status = libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c32( handle, start_thread, tid); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { LIBXSMM_ASSERT(NULL != handle->mask); status = libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c32( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 64) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { LIBXSMM_ASSERT(NULL != handle->mask); status = libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c64( handle, start_thread, tid); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { LIBXSMM_ASSERT(NULL != handle->mask); status = 
libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c64( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_BWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_BWD_AVG # include "template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; # define LIBXSMM_DNN_POOLING_BWD_BF16 if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_BWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_BWD_AVG # include "template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_POOLING_BWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } # undef LIBXSMM_DNN_POOLING_BWD_BF16 } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_nhwc(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; LIBXSMM_UNUSED( handle ); LIBXSMM_UNUSED( start_thread ); 
LIBXSMM_UNUSED( tid ); return status; } libxsmm-1.17/src/libxsmm_dnn_pooling_backward.h000066400000000000000000000022211415223013700217470ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DNN_POOLING_BACKWARD_H #define LIBXSMM_DNN_POOLING_BACKWARD_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_nhwc(libxsmm_dnn_pooling* handle, int start_thread, int tid); #endif /* LIBXSMM_DNN_POOLING_BACKWARD_H */ libxsmm-1.17/src/libxsmm_dnn_pooling_forward.c000066400000000000000000000316101415223013700216340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_pooling_forward.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_FWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_FWD_AVG # include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = 
LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_FWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_FWD_AVG # include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_FWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_FWD_AVG # include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } #else /* 
should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; # define LIBXSMM_DNN_POOLING_FWD_BF16 if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_FWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_FWD_AVG # include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } # undef LIBXSMM_DNN_POOLING_FWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; # define LIBXSMM_DNN_POOLING_FWD_BF16 if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_FWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_MAX } else if ( 
handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_FWD_AVG # include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } # undef LIBXSMM_DNN_POOLING_FWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; # define LIBXSMM_DNN_POOLING_FWD_BF16 if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_FWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_FWD_AVG # include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } # undef LIBXSMM_DNN_POOLING_FWD_BF16 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and mask */ if ( handle->reg_input == 0 || handle->reg_output == 0 || ( (handle->mask == 0) && (handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX) ) ) { 
status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on an AVX512 platform */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 16) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { LIBXSMM_ASSERT(NULL != handle->mask); status = libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c16( handle, start_thread, tid); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { LIBXSMM_ASSERT(NULL != handle->mask); status = libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c16( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 32) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { LIBXSMM_ASSERT(NULL != handle->mask); status = libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c32( handle, start_thread, tid); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { LIBXSMM_ASSERT(NULL != handle->mask); status = libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c32( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && (handle->ofmblock == 64) ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { LIBXSMM_ASSERT(NULL != handle->mask); status = libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c64( handle, start_thread, tid); } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { LIBXSMM_ASSERT(NULL != handle->mask); status = 
libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c64( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_FWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_FWD_AVG # include "template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; # define LIBXSMM_DNN_POOLING_FWD_BF16 if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { # define LIBXSMM_DNN_POOLING_FWD_MAX typedef int element_mask_type; # include "template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_MAX } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { # define LIBXSMM_DNN_POOLING_FWD_AVG # include "template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c" # undef LIBXSMM_DNN_POOLING_FWD_AVG } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; } # undef LIBXSMM_DNN_POOLING_FWD_BF16 } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_nhwc(libxsmm_dnn_pooling* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; LIBXSMM_UNUSED( handle ); LIBXSMM_UNUSED( start_thread ); 
LIBXSMM_UNUSED( tid ); return status; } libxsmm-1.17/src/libxsmm_dnn_pooling_forward.h000066400000000000000000000022161415223013700216410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DNN_POOLING_FORWARD_H #define LIBXSMM_DNN_POOLING_FORWARD_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom(libxsmm_dnn_pooling* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_nhwc(libxsmm_dnn_pooling* handle, int start_thread, int tid); #endif /* LIBXSMM_DNN_POOLING_FORWARD_H */ libxsmm-1.17/src/libxsmm_dnn_rnncell.c000066400000000000000000003313001415223013700200750ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_rnncell_forward.h" #include "libxsmm_dnn_rnncell_backward_weight_update.h" #include "libxsmm_dnn_elementwise.h" #include "libxsmm_main.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif LIBXSMM_API libxsmm_dnn_rnncell* libxsmm_dnn_create_rnncell(libxsmm_dnn_rnncell_desc rnncell_desc, libxsmm_dnn_err_t* status) { libxsmm_dnn_rnncell* handle = 0; /* init libxsmm */ LIBXSMM_INIT /* some check we can do before allocating the handle */ if ( (rnncell_desc.datatype_in != rnncell_desc.datatype_out) || ( (rnncell_desc.datatype_in != LIBXSMM_DNN_DATATYPE_BF16) && (rnncell_desc.datatype_in != LIBXSMM_DNN_DATATYPE_F32) ) ) { *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return NULL; } /* let's do some simple checks for BF16 as this limits the cell and architecture */ if ( (rnncell_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || (rnncell_desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { if ( (LIBXSMM_X86_AVX512_CORE > libxsmm_target_archid) || (rnncell_desc.C % 16 != 0) || (rnncell_desc.K % 16 != 0) ) { *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return NULL; } } /* we need at least one timestep */ if (rnncell_desc.max_T < 1) { *status = LIBXSMM_DNN_ERR_TIME_STEPS_TOO_SMALL; return NULL; } handle = (libxsmm_dnn_rnncell*)malloc(sizeof(libxsmm_dnn_rnncell)); if (0 != handle) { *status = LIBXSMM_DNN_SUCCESS; /* zero entire content; not only safer but also sets data and code pointers to NULL */ memset(handle, 0, sizeof(*handle)); /* initialize known handle components */ handle->desc = rnncell_desc; /* set current seq length to max length */ handle->T = rnncell_desc.max_T; /* set blocking factors */ handle->bk = (handle->desc.bk == 0) ? 64 : handle->desc.bk; handle->bn = (handle->desc.bn == 0) ? 
64 : handle->desc.bn; handle->bc = (handle->desc.bc == 0) ? 64 : handle->desc.bc; if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { handle->lpb = 2; } else { handle->lpb = 1; } /* validate blocking factors */ if ( handle->desc.N % handle->bn != 0 ) { handle->bn = handle->desc.N; *status = LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_N_BLOCKING; } if ( handle->desc.C % handle->bc != 0 ) { handle->bc = handle->desc.C; *status = LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_C_BLOCKING; } if ( handle->desc.K % handle->bk != 0 ) { handle->bk = handle->desc.K; *status = LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_K_BLOCKING; } /* In case of BF16 for now hoist the BRGEMM and make them to use STRIDED variant by default */ if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { const int typesize_in = (int)libxsmm_dnn_typesize(handle->desc.datatype_in); const libxsmm_blasint K = handle->desc.K; const libxsmm_blasint N = handle->desc.N; const libxsmm_blasint C = handle->desc.C; const libxsmm_blasint bk = handle->bk; const libxsmm_blasint bn = handle->bn; const libxsmm_blasint bc = handle->bc; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; const libxsmm_blasint nBlocks = N/bn; libxsmm_blasint BF, CB_BLOCKS, KB_BLOCKS; libxsmm_blasint stride_a, stride_b; int kernel_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); /* Blocking reduction domain if it is too large */ BF = 1; if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { BF = 8; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C > 2048 || K > 2048) { BF = 16; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C == 2048 && K == 1024) { BF = 2; } CB_BLOCKS = cBlocks/BF; KB_BLOCKS = kBlocks/BF; /* define batch-reduce gemm kernels */ stride_a = bc * bk * typesize_in; stride_b = bc * typesize_in; handle->fwd_kernela = 
libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bn, bc, stride_a, stride_b, CB_BLOCKS, &bk, &C, &K, NULL, NULL, &kernel_flags, NULL ); stride_a = bk * bk * typesize_in; stride_b = bk * typesize_in; handle->fwd_kernelb = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bn, bk, stride_a, stride_b, KB_BLOCKS, &bk, &K, &K, NULL, NULL, &kernel_flags, NULL ); KB_BLOCKS = kBlocks/BF; stride_a = bc * bk * typesize_in; stride_b = bk * typesize_in; handle->bwdupd_kernela = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bc, bn, bk, stride_a, stride_b, KB_BLOCKS, &bc, &K, &C, NULL, NULL, &kernel_flags, NULL); stride_a = bn * bk * typesize_in; stride_b = bn * typesize_in; handle->bwdupd_kernelb = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bk, bn, stride_a, stride_b, nBlocks, &bk, &N, &bk, NULL, NULL, &kernel_flags, NULL); stride_a = bn * bk * typesize_in; stride_b = bn * typesize_in; handle->bwdupd_kernelc = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bc, bn, stride_a, stride_b, nBlocks, &bk, &N, &bk, NULL, NULL, &kernel_flags, NULL); stride_a = bk * bk * typesize_in; stride_b = bk * typesize_in; handle->bwdupd_kerneld = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bn, bk, stride_a, stride_b, KB_BLOCKS, &bk, &K, &K, NULL, NULL, &kernel_flags, NULL); } /* Need to allocate space for scratch libxsmm_dnn_tensor's, let's set all pointers to zero */ handle->internal_z = 0; handle->scratch_wT = 0; handle->scratch_rT = 0; handle->scratch_xT = 0; handle->scratch_hT = 0; handle->scratch_deltat = 0; handle->scratch_di = 0; handle->scratch_df = 0; handle->scratch_do = 0; handle->scratch_dci = 0; handle->scratch_diB = 0; handle->scratch_dfB = 0; handle->scratch_dpB = 0; handle->scratch_dciB = 0; /* initialize a high-performant barrier */ handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); if (NULL == handle->barrier) { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; free(handle); return NULL; } } else { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; } return handle; } 
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_rnncell(const libxsmm_dnn_rnncell* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { /* Deallocate barrier */ if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } /* deallocate handle structure */ free(/*remove constness*/(libxsmm_dnn_rnncell*)handle); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_rnncell_create_tensor_datalayout(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor_datalayout* layout; *status = LIBXSMM_DNN_SUCCESS; layout = 0; if (handle != 0) { layout = (libxsmm_dnn_tensor_datalayout*) malloc(sizeof(libxsmm_dnn_tensor_datalayout)); if (layout != 0) { memset(layout, 0, sizeof(libxsmm_dnn_tensor_datalayout)); if ( (type == LIBXSMM_DNN_RNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_RNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_RNN_REGULAR_CS_PREV) || (type == LIBXSMM_DNN_RNN_GRADIENT_CS_PREV) || (type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV) || (type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV) || (type == LIBXSMM_DNN_RNN_REGULAR_CS) || (type == LIBXSMM_DNN_RNN_GRADIENT_CS) || (type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE) || (type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE) || (type == LIBXSMM_DNN_RNN_INTERNAL_I) || (type == LIBXSMM_DNN_RNN_INTERNAL_F) || (type == LIBXSMM_DNN_RNN_INTERNAL_O) || (type == LIBXSMM_DNN_RNN_INTERNAL_CI) || (type == LIBXSMM_DNN_RNN_INTERNAL_CO) ) { layout->format = handle->desc.buffer_format; layout->tensor_type = LIBXSMM_DNN_ACTIVATION; if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { 
/* NC-packed activation layout (F32 or BF16, in==out): five blocked dims.
 * dim_size[] is filled as (block, batch-block, #blocks, #batch-blocks, T);
 * presumably C/K and N are exact multiples of bc/bk and bn -- TODO confirm
 * the handle constructor validates this, otherwise the divisions truncate. */
layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 5; if ( (type == LIBXSMM_DNN_RNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_RNN_GRADIENT_INPUT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_T; layout->dim_size[0] = (unsigned int)handle->bc; layout->dim_size[1] = (unsigned int)handle->bn; layout->dim_size[2] = (unsigned int)(handle->desc.C / handle->bc); layout->dim_size[3] = (unsigned int)(handle->desc.N / handle->bn); layout->dim_size[4] = (unsigned int)handle->desc.max_T; } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_CS_PREV) || (type == LIBXSMM_DNN_RNN_GRADIENT_CS_PREV) || (type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV) || (type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV) || (type == LIBXSMM_DNN_RNN_REGULAR_CS) || (type == LIBXSMM_DNN_RNN_GRADIENT_CS) || (type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE) || (type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE) || (type == LIBXSMM_DNN_RNN_INTERNAL_I) || (type == LIBXSMM_DNN_RNN_INTERNAL_F) || (type == LIBXSMM_DNN_RNN_INTERNAL_O) || (type == LIBXSMM_DNN_RNN_INTERNAL_CI) || (type == LIBXSMM_DNN_RNN_INTERNAL_CO) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_T; layout->dim_size[0] = (unsigned int)handle->bk; layout->dim_size[1] = (unsigned int)handle->bn; layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[3] =
(unsigned int)(handle->desc.N / handle->bn); layout->dim_size[4] = (unsigned int)handle->desc.max_T; } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NC) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(3*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(3*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 3; if ( (type == LIBXSMM_DNN_RNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_RNN_GRADIENT_INPUT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_T; layout->dim_size[0] = (unsigned int)handle->desc.C; layout->dim_size[1] = (unsigned int)handle->desc.N; layout->dim_size[2] = (unsigned int)handle->desc.max_T; } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_CS_PREV) || (type == LIBXSMM_DNN_RNN_GRADIENT_CS_PREV) || (type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV) || (type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV) || (type == LIBXSMM_DNN_RNN_REGULAR_CS) || (type == LIBXSMM_DNN_RNN_GRADIENT_CS) || (type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE) || (type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE) || (type == LIBXSMM_DNN_RNN_INTERNAL_I) || (type ==
LIBXSMM_DNN_RNN_INTERNAL_F) || (type == LIBXSMM_DNN_RNN_INTERNAL_O) || (type == LIBXSMM_DNN_RNN_INTERNAL_CI) || (type == LIBXSMM_DNN_RNN_INTERNAL_CO) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_T; layout->dim_size[0] = (unsigned int)handle->desc.K; layout->dim_size[1] = (unsigned int)handle->desc.N; layout->dim_size[2] = (unsigned int)handle->desc.max_T; } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) || (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) ) { layout->format = handle->desc.filter_format; layout->tensor_type = LIBXSMM_DNN_FILTER; if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) > 0) { if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { layout->datatype = handle->desc.datatype_in; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM || handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 5; if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT) || (type ==
LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; layout->dim_size[0] = (unsigned int)handle->bk; layout->dim_size[1] = (unsigned int)handle->bc; layout->dim_size[2] = (unsigned int)(handle->desc.C / handle->bc); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[4] = 4; } else { layout->dim_size[4] = 3; } } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; layout->dim_size[0] = (unsigned int)handle->bk; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[4] = 4; } else { layout->dim_size[4] = 3; } } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 4; if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT) || (type ==
LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = (unsigned int)handle->bk; layout->dim_size[1] = (unsigned int)handle->bc; layout->dim_size[2] = (unsigned int)(handle->desc.C / handle->bc); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = (unsigned int)handle->bk; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { layout->datatype = handle->desc.datatype_in; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM || handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(6*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(6*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 6; if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) ) { layout->dim_type[0] =
LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; layout->dim_size[0] = (unsigned int)handle->lpb; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned int)(handle->bc / handle->lpb); layout->dim_size[3] = (unsigned int)(handle->desc.C / handle->bc); layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[5] = 4; } else { layout->dim_size[5] = 3; } } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; layout->dim_size[0] = (unsigned int)handle->lpb; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned int)(handle->bk / handle->lpb); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[5] = 4; } else { layout->dim_size[5] = 3; } } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned
int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 5; if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = (unsigned int)handle->lpb; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned int)(handle->bc / handle->lpb); layout->dim_size[3] = (unsigned int)(handle->desc.C / handle->bc); layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = (unsigned int)handle->lpb; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned int)(handle->bk / handle->lpb); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CK) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) &&
(handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 2; if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[0] = (unsigned int)(handle->desc.K * 4); layout->dim_size[1] = (unsigned int)handle->desc.C; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { layout->dim_size[0] = (unsigned int)(handle->desc.K * 3); layout->dim_size[1] = (unsigned int)handle->desc.C; } else { layout->dim_size[0] = (unsigned int)handle->desc.K; layout->dim_size[1] = (unsigned int)handle->desc.C; } } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[0] = (unsigned int)(handle->desc.K * 4); layout->dim_size[1] = (unsigned int)handle->desc.K; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { layout->dim_size[0] = (unsigned int)(handle->desc.K * 3); layout->dim_size[1] = (unsigned int)handle->desc.K; } else { layout->dim_size[0] = (unsigned int)handle->desc.K; layout->dim_size[1] = (unsigned int)handle->desc.K; } } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; }
/* remaining error paths of the CK weight branch follow, then the layouts
 * for the transposed weights (LIBXSMM_DNN_RNN_*_WEIGHT_TRANS) */
} else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) || (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) ) { layout->format = handle->desc.filter_format; layout->tensor_type = LIBXSMM_DNN_FILTER; if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) > 0) { if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { layout->datatype = handle->desc.datatype_in; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM || handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 5; if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; layout->dim_size[0] = (unsigned int)handle->bc; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[3] = (unsigned int)(handle->desc.C / handle->bc); if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[4] = 4; } else { layout->dim_size[4] = 3; } } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1]
= LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; layout->dim_size[0] = (unsigned int)handle->bk; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[4] = 4; } else { layout->dim_size[4] = 3; } } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 4; if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_size[0] = (unsigned int)handle->bc; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[3] = (unsigned int)(handle->desc.C / handle->bc); } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = (unsigned int)handle->bk; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned
int)(handle->desc.K / handle->bk); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { layout->datatype = handle->desc.datatype_in; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM || handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(6*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(6*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 6; if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; layout->dim_size[0] = (unsigned int)handle->lpb; layout->dim_size[1] = (unsigned int)handle->bc; layout->dim_size[2] = (unsigned int)(handle->bk / handle->lpb); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[4] = (unsigned int)(handle->desc.C / handle->bc); if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[5] = 4; } else { layout->dim_size[5] = 3; } } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] =
/* BF16 transposed recurrent weight (K x K): lpb is the low-precision
 * pairing block (presumably VNNI-style packing -- TODO confirm) */
LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; layout->dim_size[0] = (unsigned int)handle->lpb; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned int)(handle->bk / handle->lpb); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[5] = 4; } else { layout->dim_size[5] = 3; } } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 5; if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_size[0] = (unsigned int)handle->lpb; layout->dim_size[1] = (unsigned int)handle->bc; layout->dim_size[2] = (unsigned int)(handle->bk / handle->lpb); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[4] = (unsigned int)(handle->desc.C / handle->bc); } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K;
layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_size[0] = (unsigned int)handle->lpb; layout->dim_size[1] = (unsigned int)handle->bk; layout->dim_size[2] = (unsigned int)(handle->bk / handle->lpb); layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CK) > 0) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 2; if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[0] = (unsigned int)handle->desc.C; layout->dim_size[1] = (unsigned int)(handle->desc.K * 4); } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { layout->dim_size[0] = (unsigned int)handle->desc.C; layout->dim_size[1] = (unsigned int)(handle->desc.K * 3); } else { layout->dim_size[0] = (unsigned int)handle->desc.C; layout->dim_size[1] = (unsigned
int)handle->desc.K; } } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[0] = (unsigned int)handle->desc.K; layout->dim_size[1] = (unsigned int)(handle->desc.K * 4); } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { layout->dim_size[0] = (unsigned int)handle->desc.K; layout->dim_size[1] = (unsigned int)(handle->desc.K * 3); } else { layout->dim_size[0] = (unsigned int)handle->desc.K; layout->dim_size[1] = (unsigned int)handle->desc.K; } } else { free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_BIAS) || (type == LIBXSMM_DNN_RNN_GRADIENT_BIAS) ) { layout->format = handle->desc.buffer_format; layout->tensor_type = LIBXSMM_DNN_CHANNEL_SCALAR; if ( ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NC) > 0) || ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) ) { if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { layout->datatype = handle->desc.datatype_in; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(1*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(1*sizeof(unsigned int)); if (0 !=
layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ layout->num_dims = 1; if ( (type == LIBXSMM_DNN_RNN_REGULAR_BIAS) || (type == LIBXSMM_DNN_RNN_GRADIENT_BIAS) ) { layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { layout->dim_size[0] = (unsigned int)(handle->desc.K * 4); } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { layout->dim_size[0] = (unsigned int)(handle->desc.K * 3); } else { layout->dim_size[0] = (unsigned int)handle->desc.K; } } else { /* coverity[dead_error_begin] */ free(layout->dim_type); free(layout->dim_size); free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return layout; }
/* Computes the scratch-buffer size (in bytes) needed by the given compute
 * kind; each sub-buffer adds 64 extra bytes so it can later be realigned to
 * a 64-byte boundary when the scratch is bound. Weight-gradient buffers use
 * FP32 elements (dwdr_typesize) when the output datatype is BF16. */
LIBXSMM_API size_t libxsmm_dnn_rnncell_get_scratch_size(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status) { size_t size = 0; *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { const size_t typesize_in = libxsmm_dnn_typesize(handle->desc.datatype_in); const size_t dwdr_typesize = (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ?
/* continuation of libxsmm_dnn_rnncell_get_scratch_size: per-cell-type and
 * per-compute-kind accounting of every scratch sub-buffer (see the trailing
 * comments on each line for which buffer the term belongs to) */
sizeof(float) : typesize_in; switch (handle->desc.cell_type) { case LIBXSMM_DNN_RNNCELL_RNN_RELU: case LIBXSMM_DNN_RNNCELL_RNN_SIGMOID: case LIBXSMM_DNN_RNNCELL_RNN_TANH: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { size += 0; } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { size += (size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in + 64; /* wT */ size += (size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in + 64; /* rT */ size += (size_t)handle->desc.C * (size_t)handle->desc.N * typesize_in + 64; /* xT */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* hT */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) * (size_t)handle->desc.max_T + 64; /* deltat */ } break; default: { *status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; case LIBXSMM_DNN_RNNCELL_LSTM: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { size += (size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in * 4 + 4 * 64; /* w */ size += (size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in * 4 + 4 * 64; /* r */ /* The scratches below are needed only for BF16 code for the intermediate results */ if (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) { size += (size_t)7 *((size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64); /* intermediate scratches */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) + 64; /* intermediate scratches */ } } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { size += (size_t)handle->desc.C * (size_t)handle->desc.K * dwdr_typesize * 4 + 4 * 64; /* w */ size += (size_t)handle->desc.K * (size_t)handle->desc.K * dwdr_typesize * 4 + 4 *
64; /* r */ size += (size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in * 4 + 4 * 64; /* wT */ size += (size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in * 4 + 4 * 64; /* rT */ size += (size_t)handle->desc.C * (size_t)handle->desc.N * typesize_in + 64; /* xT */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* hT */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * dwdr_typesize + 64; /* deltat */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* di */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* df */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* do */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dci */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* diB */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dfB */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dpB */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dciB */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* t1 */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* t2 */ /* The scratches below are needed only for BF16 code for the intermediate results */ if (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) { size += (size_t)4 *((size_t)handle->desc.K * sizeof(float) + 64); /* intermediate db scratch */ size += (size_t)handle->desc.C *
(size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* intermediate dx scratches */ size += (size_t)7 *((size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64); /* intermediate scratches */ size += (size_t)2 *((size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) + 64); /* intermediate scratches */ } } break; default: { *status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; case LIBXSMM_DNN_RNNCELL_GRU: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { size += (size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in * 3 + 3 * 64; /* w */ size += (size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in * 3 + 3 * 64; /* r */ } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { size += (size_t)handle->desc.C * (size_t)handle->desc.K * dwdr_typesize * 3 + 3 * 64; /* w */ size += (size_t)handle->desc.K * (size_t)handle->desc.K * dwdr_typesize * 3 + 3 * 64; /* r */ size += (size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in * 3 + 3 * 64; /* wT */ size += (size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in * 3 + 3 * 64; /* rT */ size += (size_t)handle->desc.C * (size_t)handle->desc.N * typesize_in + 64; /* xT */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* hT */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * dwdr_typesize + 64; /* deltat */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* di */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dc */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* df */ size += (size_t)handle->desc.K * (size_t)handle->desc.N *
libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* do */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* diB */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dcB */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dfB */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* oT */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* t1 */ size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* t2 */ } break; default: { *status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; default: { *status = LIBXSMM_DNN_ERR_INVALID_RNN_TYPE; } } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return size; }
/* Returns the base pointer of the scratch region previously bound to the
 * handle; sets *status to LIBXSMM_DNN_ERR_INVALID_HANDLE and returns NULL
 * when the handle is NULL. */
LIBXSMM_API void* libxsmm_dnn_rnncell_get_scratch_ptr(const libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status) { *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { return handle->scratch_base; } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return NULL; }
/* Carves the caller-provided scratch buffer into the individual work
 * buffers (wT/rT/xT/hT/delta, ...), rounding each sub-buffer start up to a
 * 64-byte boundary -- matching the "+ 64" slack counted by
 * libxsmm_dnn_rnncell_get_scratch_size above. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_scratch(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, const void* scratch) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (NULL != handle) { const size_t typesize_in = libxsmm_dnn_typesize(handle->desc.datatype_in); const size_t dwdr_typesize = (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ?
sizeof(float) : typesize_in; uintptr_t address = (uintptr_t)scratch; size_t offset = 0; switch (handle->desc.cell_type) { case LIBXSMM_DNN_RNNCELL_RNN_RELU: case LIBXSMM_DNN_RNNCELL_RNN_SIGMOID: case LIBXSMM_DNN_RNNCELL_RNN_TANH: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { /* forward only has no scratch need */ } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { if (scratch == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } handle->scratch_base = (void*)address; /* wT */ if (address % 64 == 0) { handle->scratch_wT = (void*)address; } else { offset = (64 - address % 64); handle->scratch_wT = (void*)(address+offset); } address += ((size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in) + 64; /* rT */ if (address % 64 == 0) { handle->scratch_rT = (void*)address; } else { offset = (64 - address % 64); handle->scratch_rT = (void*)(address+offset); } address += ((size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in) + 64; /* xT */ if (address % 64 == 0) { handle->scratch_xT = (void*)address; } else { offset = (64 - address % 64); handle->scratch_xT = (void*)(address+offset); } address += ((size_t)handle->desc.C * (size_t)handle->desc.N * typesize_in) + 64; /* hT */ if (address % 64 == 0) { handle->scratch_hT = (void*)address; } else { offset = (64 - address % 64); handle->scratch_hT = (void*)(address+offset); } address += ((size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out)) + 64; /* deltat */ if (address % 64 == 0) { handle->scratch_deltat = (void*)address; } else { offset = (64 - address % 64); handle->scratch_deltat = (void*)(address+offset); } address += ((size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) * (size_t)handle->desc.max_T) + 64; } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; case 
LIBXSMM_DNN_RNNCELL_LSTM: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { if (scratch == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } handle->scratch_base = (void*)address; /* w scratch */ if (address % 64 == 0) { handle->scratch_w = (void*)address; } else { offset = (64 - address % 64); handle->scratch_w = (void*)(address+offset); } address += ((size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in) * 4 + 64; /* r scratch */ if (address % 64 == 0) { handle->scratch_r = (void*)address; } else { offset = (64 - address % 64); handle->scratch_r = (void*)(address+offset); } address += ((size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in) * 4 + 64; /* The scratches below are needed only for BF16 code for the intermediate results */ if (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) { /* cst scratch */ if (address % 64 == 0) { handle->cst_scratch = (void*)address; } else { offset = (64 - address % 64); handle->cst_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* ht scratch */ if (address % 64 == 0) { handle->ht_scratch = (void*)address; } else { offset = (64 - address % 64); handle->ht_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* it scratch */ if (address % 64 == 0) { handle->it_scratch = (void*)address; } else { offset = (64 - address % 64); handle->it_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* ft scratch */ if (address % 64 == 0) { handle->ft_scratch = (void*)address; } else { offset = (64 - address % 64); handle->ft_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* ot scratch */ if (address % 64 == 0) { 
handle->ot_scratch = (void*)address; } else { offset = (64 - address % 64); handle->ot_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* cit scratch */ if (address % 64 == 0) { handle->cit_scratch = (void*)address; } else { offset = (64 - address % 64); handle->cit_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* cot scratch */ if (address % 64 == 0) { handle->cot_scratch = (void*)address; } else { offset = (64 - address % 64); handle->cot_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* csp scratch */ if (address % 64 == 0) { handle->csp_scratch = (void*)address; } else { offset = (64 - address % 64); handle->csp_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) + 64; } } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { if (scratch == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } handle->scratch_base = (void*)address; /* w scratch */ if (address % 64 == 0) { handle->scratch_w = (void*)address; } else { offset = (64 - address % 64); handle->scratch_w = (void*)(address+offset); } address += ((size_t)handle->desc.C * (size_t)handle->desc.K * dwdr_typesize) * 4 + 64; /* r scratch */ if (address % 64 == 0) { handle->scratch_r = (void*)address; } else { offset = (64 - address % 64); handle->scratch_r = (void*)(address+offset); } address += ((size_t)handle->desc.K * (size_t)handle->desc.K * dwdr_typesize) * 4 + 64; /* wT */ if (address % 64 == 0) { handle->scratch_wT = (void*)address; } else { offset = (64 - address % 64); handle->scratch_wT = (void*)(address+offset); } address += 
((size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in) * 4 + 64; /* rT */ if (address % 64 == 0) { handle->scratch_rT = (void*)address; } else { offset = (64 - address % 64); handle->scratch_rT = (void*)(address+offset); } address += ((size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in) * 4 + 64; /* xT */ if (address % 64 == 0) { handle->scratch_xT = (void*)address; } else { offset = (64 - address % 64); handle->scratch_xT = (void*)(address+offset); } address += (size_t)handle->desc.C * (size_t)handle->desc.N * typesize_in + 64; /* hT */ if (address % 64 == 0) { handle->scratch_hT = (void*)address; } else { offset = (64 - address % 64); handle->scratch_hT = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* deltat */ if (address % 64 == 0) { handle->scratch_deltat = (void*)address; } else { offset = (64 - address % 64); handle->scratch_deltat = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * dwdr_typesize + 64; /* di */ if (address % 64 == 0) { handle->scratch_di = (void*)address; } else { offset = (64 - address % 64); handle->scratch_di = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* df */ if (address % 64 == 0) { handle->scratch_df = (void*)address; } else { offset = (64 - address % 64); handle->scratch_df = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* do */ if (address % 64 == 0) { handle->scratch_do = (void*)address; } else { offset = (64 - address % 64); handle->scratch_do = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dci */ if (address % 64 == 0) { handle->scratch_dci = (void*)address; } else { offset = (64 - 
address % 64); handle->scratch_dci = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* diB */ if (address % 64 == 0) { handle->scratch_diB = (void*)address; } else { offset = (64 - address % 64); handle->scratch_diB = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dfB */ if (address % 64 == 0) { handle->scratch_dfB = (void*)address; } else { offset = (64 - address % 64); handle->scratch_dfB = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dpB */ if (address % 64 == 0) { handle->scratch_dpB = (void*)address; } else { offset = (64 - address % 64); handle->scratch_dpB = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dciB */ if (address % 64 == 0) { handle->scratch_dciB = (void*)address; } else { offset = (64 - address % 64); handle->scratch_dciB = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* t1 */ if (address % 64 == 0) { handle->scratch_t1 = (void*)address; } else { offset = (64 - address % 64); handle->scratch_t1 = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* t2 */ if (address % 64 == 0) { handle->scratch_t2 = (void*)address; } else { offset = (64 - address % 64); handle->scratch_t2 = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* The scratches below are needed only for BF16 code for the intermediate results */ if (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) { /* dx scratch */ if 
(address % 64 == 0) { handle->scratch_dx = (void*)address; } else { offset = (64 - address % 64); handle->scratch_dx = (void*)(address+offset); } address += (size_t)handle->desc.C * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* dhp scratch */ if (address % 64 == 0) { handle->scratch_dhp = (void*)address; } else { offset = (64 - address % 64); handle->scratch_dhp = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) + 64; /* db scratch */ if (address % 64 == 0) { handle->scratch_db = (void*)address; } else { offset = (64 - address % 64); handle->scratch_db = (void*)(address+offset); } address += (size_t)handle->desc.K * 4 * sizeof(float) + 64; /* cst scratch */ if (address % 64 == 0) { handle->cst_scratch = (void*)address; } else { offset = (64 - address % 64); handle->cst_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* ht scratch */ if (address % 64 == 0) { handle->ht_scratch = (void*)address; } else { offset = (64 - address % 64); handle->ht_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* it scratch */ if (address % 64 == 0) { handle->it_scratch = (void*)address; } else { offset = (64 - address % 64); handle->it_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* ft scratch */ if (address % 64 == 0) { handle->ft_scratch = (void*)address; } else { offset = (64 - address % 64); handle->ft_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* ot scratch */ if (address % 64 == 0) { handle->ot_scratch = (void*)address; } else { offset = (64 - address % 64); handle->ot_scratch = 
(void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* cit scratch */ if (address % 64 == 0) { handle->cit_scratch = (void*)address; } else { offset = (64 - address % 64); handle->cit_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* cot scratch */ if (address % 64 == 0) { handle->cot_scratch = (void*)address; } else { offset = (64 - address % 64); handle->cot_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* csp scratch */ if (address % 64 == 0) { handle->csp_scratch = (void*)address; } else { offset = (64 - address % 64); handle->csp_scratch = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) + 64; } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; case LIBXSMM_DNN_RNNCELL_GRU: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { if (scratch == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } handle->scratch_base = (void*)address; /* w scratch */ if (address % 64 == 0) { handle->scratch_w = (void*)address; } else { offset = (64 - address % 64); handle->scratch_w = (void*)(address+offset); } address += ((size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in) * 3 + 64; /* r scratch */ if (address % 64 == 0) { handle->scratch_r = (void*)address; } else { offset = (64 - address % 64); handle->scratch_r = (void*)(address+offset); } address += ((size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in) * 3 + 64; } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { if (scratch == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } handle->scratch_base = (void*)address; 
/* w scratch */ if (address % 64 == 0) { handle->scratch_w = (void*)address; } else { offset = (64 - address % 64); handle->scratch_w = (void*)(address+offset); } address += ((size_t)handle->desc.C * (size_t)handle->desc.K * dwdr_typesize) * 3 + 64; /* r scratch */ if (address % 64 == 0) { handle->scratch_r = (void*)address; } else { offset = (64 - address % 64); handle->scratch_r = (void*)(address+offset); } address += ((size_t)handle->desc.K * (size_t)handle->desc.K * dwdr_typesize) * 3 + 64; /* wT */ if (address % 64 == 0) { handle->scratch_wT = (void*)address; } else { offset = (64 - address % 64); handle->scratch_wT = (void*)(address+offset); } address += ((size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in) * 3 + 64; /* rT */ if (address % 64 == 0) { handle->scratch_rT = (void*)address; } else { offset = (64 - address % 64); handle->scratch_rT = (void*)(address+offset); } address += ((size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in) * 3 + 64; /* xT */ if (address % 64 == 0) { handle->scratch_xT = (void*)address; } else { offset = (64 - address % 64); handle->scratch_xT = (void*)(address+offset); } address += (size_t)handle->desc.C * (size_t)handle->desc.N * typesize_in + 64; /* hT */ if (address % 64 == 0) { handle->scratch_hT = (void*)address; } else { offset = (64 - address % 64); handle->scratch_hT = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* deltat */ if (address % 64 == 0) { handle->scratch_deltat = (void*)address; } else { offset = (64 - address % 64); handle->scratch_deltat = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * dwdr_typesize + 64; /* di */ if (address % 64 == 0) { handle->scratch_di = (void*)address; } else { offset = (64 - address % 64); handle->scratch_di = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * 
libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dc */ if (address % 64 == 0) { handle->scratch_dci = (void*)address; } else { offset = (64 - address % 64); handle->scratch_dci = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* df */ if (address % 64 == 0) { handle->scratch_df = (void*)address; } else { offset = (64 - address % 64); handle->scratch_df = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* do */ if (address % 64 == 0) { handle->scratch_do = (void*)address; } else { offset = (64 - address % 64); handle->scratch_do = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* diB */ if (address % 64 == 0) { handle->scratch_diB = (void*)address; } else { offset = (64 - address % 64); handle->scratch_diB = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dcB */ if (address % 64 == 0) { handle->scratch_dciB = (void*)address; } else { offset = (64 - address % 64); handle->scratch_dciB = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dfB */ if (address % 64 == 0) { handle->scratch_dfB = (void*)address; } else { offset = (64 - address % 64); handle->scratch_dfB = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* doB (repurposed for oT) */ if (address % 64 == 0) { handle->scratch_dpB = (void*)address; } else { offset = (64 - address % 64); handle->scratch_dpB = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 
64; /* t1 */ if (address % 64 == 0) { handle->scratch_t1 = (void*)address; } else { offset = (64 - address % 64); handle->scratch_t1 = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* t2 */ if (address % 64 == 0) { handle->scratch_t2 = (void*)address; } else { offset = (64 - address % 64); handle->scratch_t2 = (void*)(address+offset); } address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_RNN_TYPE; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_scratch(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { switch (handle->desc.cell_type) { case LIBXSMM_DNN_RNNCELL_RNN_RELU: case LIBXSMM_DNN_RNNCELL_RNN_SIGMOID: case LIBXSMM_DNN_RNNCELL_RNN_TANH: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { /* forward only has no scratch need */ } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { handle->scratch_wT = 0; handle->scratch_rT = 0; handle->scratch_xT = 0; handle->scratch_hT = 0; handle->scratch_deltat = 0; } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; case LIBXSMM_DNN_RNNCELL_LSTM: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { handle->scratch_w = 0; handle->scratch_r = 0; handle->csp_scratch = 0; handle->cst_scratch = 0; handle->ht_scratch = 0; handle->it_scratch = 0; handle->ft_scratch = 0; handle->ot_scratch = 0; handle->cit_scratch = 0; handle->cot_scratch = 0; } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case 
LIBXSMM_DNN_COMPUTE_KIND_ALL: { handle->scratch_w = 0; handle->scratch_r = 0; handle->scratch_wT = 0; handle->scratch_rT = 0; handle->scratch_xT = 0; handle->scratch_hT = 0; handle->scratch_deltat = 0; handle->scratch_di = 0; handle->scratch_df = 0; handle->scratch_do = 0; handle->scratch_dci = 0; handle->scratch_diB = 0; handle->scratch_dfB = 0; handle->scratch_dpB = 0; handle->scratch_dciB = 0; handle->scratch_t1 = 0; handle->scratch_t2 = 0; handle->csp_scratch = 0; handle->cst_scratch = 0; handle->ht_scratch = 0; handle->it_scratch = 0; handle->ft_scratch = 0; handle->ot_scratch = 0; handle->cit_scratch = 0; handle->cot_scratch = 0; } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; case LIBXSMM_DNN_RNNCELL_GRU: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { handle->scratch_w = 0; handle->scratch_r = 0; handle->ht_scratch = 0; handle->it_scratch = 0; handle->cit_scratch = 0; handle->ft_scratch = 0; handle->ot_scratch = 0; } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { handle->scratch_w = 0; handle->scratch_r = 0; handle->scratch_wT = 0; handle->scratch_rT = 0; handle->scratch_xT = 0; handle->scratch_hT = 0; handle->scratch_deltat = 0; handle->scratch_di = 0; handle->scratch_dci = 0; handle->scratch_df = 0; handle->scratch_do = 0; handle->scratch_diB = 0; handle->scratch_dciB = 0; handle->scratch_dfB = 0; handle->scratch_dpB = 0; handle->scratch_t1 = 0; handle->scratch_t2 = 0; handle->ht_scratch = 0; handle->it_scratch = 0; handle->ft_scratch = 0; handle->ot_scratch = 0; handle->cit_scratch = 0; } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_RNN_TYPE; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API size_t libxsmm_dnn_rnncell_get_internalstate_size(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, 
libxsmm_dnn_err_t* status) { size_t size = 0; *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { const size_t sizeof_datatype = sizeof(float); switch (handle->desc.cell_type) { case LIBXSMM_DNN_RNNCELL_RNN_RELU: case LIBXSMM_DNN_RNNCELL_RNN_SIGMOID: case LIBXSMM_DNN_RNNCELL_RNN_TANH: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { size += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof_datatype * (size_t)handle->desc.max_T + 64; /* zt */ } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { size += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof_datatype * (size_t)handle->desc.max_T + 64; /* zt */ } break; default: { *status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; case LIBXSMM_DNN_RNNCELL_LSTM: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { /* with i, f, o, ci, co, cs exposed as i/o, there is currently no need for internal state */ } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { /* with i, f, o, ci, co, cs exposed as i/o, there is currently no need for internal state */ } break; default: { *status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; case LIBXSMM_DNN_RNNCELL_GRU: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { /* with i, f, c, o exposed as i/o, there is currently no need for internal state */ } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { /* with i, f, c, o exposed as i/o, there is currently no need for internal state */ } break; default: { *status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; default: { *status = LIBXSMM_DNN_ERR_INVALID_RNN_TYPE; } } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return size; } LIBXSMM_API void* libxsmm_dnn_rnncell_get_internalstate_ptr(const libxsmm_dnn_rnncell* 
handle, libxsmm_dnn_err_t* status) { *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { return handle->internal_z; } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return NULL; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_internalstate(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, const void* internalstate) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; uintptr_t address = (uintptr_t)internalstate; size_t offset = 0; if (0 != handle) { switch (handle->desc.cell_type) { case LIBXSMM_DNN_RNNCELL_RNN_RELU: case LIBXSMM_DNN_RNNCELL_RNN_SIGMOID: case LIBXSMM_DNN_RNNCELL_RNN_TANH: { if (internalstate == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { if (address % 64 == 0) { handle->internal_z = (void*)address; } else { offset = (64 - address % 64); handle->internal_z = (void*)(address+offset); } } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { if (address % 64 == 0) { handle->internal_z = (void*)address; } else { offset = (64 - address % 64); handle->internal_z = (void*)(address+offset); } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; case LIBXSMM_DNN_RNNCELL_LSTM: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; case LIBXSMM_DNN_RNNCELL_GRU: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_RNN_TYPE; } } } else { status = 
LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_internalstate(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { switch (handle->desc.cell_type) { case LIBXSMM_DNN_RNNCELL_RNN_RELU: case LIBXSMM_DNN_RNNCELL_RNN_SIGMOID: case LIBXSMM_DNN_RNNCELL_RNN_TANH: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { handle->internal_z = 0; } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { handle->internal_z = 0; } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; case LIBXSMM_DNN_RNNCELL_LSTM: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; case LIBXSMM_DNN_RNNCELL_GRU: { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: case LIBXSMM_DNN_COMPUTE_KIND_ALL: { } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_RNN_TYPE; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_allocate_forget_bias(libxsmm_dnn_rnncell* handle, const float forget_bias) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (handle != 0) { handle->forget_bias = forget_bias; } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* 
check for tensor type */ if ( (type != LIBXSMM_DNN_RNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_RNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_RNN_REGULAR_CS_PREV) && (type != LIBXSMM_DNN_RNN_GRADIENT_CS_PREV) && (type != LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV) && (type != LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV) && (type != LIBXSMM_DNN_RNN_REGULAR_WEIGHT) && (type != LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) && (type != LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) && (type != LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) && (type != LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) && (type != LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) && (type != LIBXSMM_DNN_RNN_REGULAR_BIAS) && (type != LIBXSMM_DNN_RNN_GRADIENT_BIAS) && (type != LIBXSMM_DNN_RNN_REGULAR_CS) && (type != LIBXSMM_DNN_RNN_GRADIENT_CS) && (type != LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE) && (type != LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE) && (type != LIBXSMM_DNN_RNN_INTERNAL_I) && (type != LIBXSMM_DNN_RNN_INTERNAL_F) && (type != LIBXSMM_DNN_RNN_INTERNAL_O) && (type != LIBXSMM_DNN_RNN_INTERNAL_CI) && (type != LIBXSMM_DNN_RNN_INTERNAL_CO) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0 && tensor != 0) { libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_rnncell_create_tensor_datalayout(handle, type, &status); if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { if ( type == LIBXSMM_DNN_RNN_REGULAR_INPUT ) { handle->xt = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_INPUT ) { handle->dxt = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) { handle->csp = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) { handle->dcsp = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) { handle->hp = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) { handle->dhp = (libxsmm_dnn_tensor*)tensor; } 
else if ( type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) { handle->w = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) { handle->wt = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) { handle->dw = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) { handle->r = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) { handle->rt = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) { handle->dr = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_BIAS ) { handle->b = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_BIAS ) { handle->db = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_CS ) { handle->cst = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_CS ) { handle->dcs = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) { handle->ht = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) { handle->dht = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_I ) { handle->it = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_F ) { handle->ft = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_O ) { handle->ot = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_CI ) { handle->cit = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_CO ) { handle->cot = (libxsmm_dnn_tensor*)tensor; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; } libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_rnncell_get_tensor(libxsmm_dnn_rnncell* handle, const 
libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor* tensor = 0; LIBXSMM_UNUSED(status/*TODO*/); /* check for tensor type */ if ( (type != LIBXSMM_DNN_RNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_RNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_RNN_REGULAR_CS_PREV) && (type != LIBXSMM_DNN_RNN_GRADIENT_CS_PREV) && (type != LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV) && (type != LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV) && (type != LIBXSMM_DNN_RNN_REGULAR_WEIGHT) && (type != LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) && (type != LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) && (type != LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) && (type != LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) && (type != LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) && (type != LIBXSMM_DNN_RNN_REGULAR_BIAS) && (type != LIBXSMM_DNN_RNN_GRADIENT_BIAS) && (type != LIBXSMM_DNN_RNN_REGULAR_CS) && (type != LIBXSMM_DNN_RNN_GRADIENT_CS) && (type != LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE) && (type != LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE) && (type != LIBXSMM_DNN_RNN_INTERNAL_I) && (type != LIBXSMM_DNN_RNN_INTERNAL_F) && (type != LIBXSMM_DNN_RNN_INTERNAL_O) && (type != LIBXSMM_DNN_RNN_INTERNAL_CI) && (type != LIBXSMM_DNN_RNN_INTERNAL_CO) ) { return tensor; } if (handle != 0) { if ( type == LIBXSMM_DNN_RNN_REGULAR_INPUT ) { tensor = handle->xt; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_INPUT ) { tensor = handle->dxt; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) { tensor = handle->csp; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) { tensor = handle->dcsp; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) { tensor = handle->hp; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) { tensor = handle->dhp; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) { tensor = handle->w; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) { tensor = handle->wt; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) { tensor = handle->dw; } else if ( type == 
LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) { tensor = handle->r; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) { tensor = handle->rt; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) { tensor = handle->dr; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_BIAS ) { tensor = handle->b; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_BIAS ) { tensor = handle->db; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_CS ) { tensor = handle->cst; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_CS ) { tensor = handle->dcs; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) { tensor = handle->ht; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) { tensor = handle->dht; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_I ) { tensor = handle->it; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_F ) { tensor = handle->ft; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_O ) { tensor = handle->ot; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_CI ) { tensor = handle->cit; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_CO ) { tensor = handle->cot; } else { /* cannot happen */ } } return tensor; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_RNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_RNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_RNN_REGULAR_CS_PREV) && (type != LIBXSMM_DNN_RNN_GRADIENT_CS_PREV) && (type != LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV) && (type != LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV) && (type != LIBXSMM_DNN_RNN_REGULAR_WEIGHT) && (type != LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) && (type != LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) && (type != LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) && (type != LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) && (type != LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) && (type != LIBXSMM_DNN_RNN_REGULAR_BIAS) && (type != LIBXSMM_DNN_RNN_GRADIENT_BIAS) && (type != 
LIBXSMM_DNN_RNN_REGULAR_CS) && (type != LIBXSMM_DNN_RNN_GRADIENT_CS) && (type != LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE) && (type != LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE) && (type != LIBXSMM_DNN_RNN_INTERNAL_I) && (type != LIBXSMM_DNN_RNN_INTERNAL_F) && (type != LIBXSMM_DNN_RNN_INTERNAL_O) && (type != LIBXSMM_DNN_RNN_INTERNAL_CI) && (type != LIBXSMM_DNN_RNN_INTERNAL_CO) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0) { if ( type == LIBXSMM_DNN_RNN_REGULAR_INPUT ) { handle->xt = 0; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_INPUT ) { handle->dxt = 0; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) { handle->csp = 0; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) { handle->dcsp = 0; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) { handle->hp = 0; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) { handle->dhp = 0; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) { handle->w = 0; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) { handle->wt = 0; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) { handle->dw = 0; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) { handle->r = 0; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) { handle->rt = 0; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) { handle->dr = 0; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_BIAS ) { handle->b = 0; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_BIAS ) { handle->db = 0; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_CS ) { handle->cst = 0; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_CS ) { handle->dcs = 0; } else if ( type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) { handle->ht = 0; } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) { handle->dht = 0; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_I ) { handle->it = 0; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_F ) { handle->ft = 0; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_O ) { handle->ot = 0; } 
else if ( type == LIBXSMM_DNN_RNN_INTERNAL_CI ) { handle->cit = 0; } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_CO ) { handle->cot = 0; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_set_sequence_length( libxsmm_dnn_rnncell* handle, const libxsmm_blasint T ) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { if ( handle->desc.max_T < T ) { status = LIBXSMM_DNN_ERR_RNN_INVALID_SEQ_LEN; } else { handle->T = T; } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_blasint libxsmm_dnn_rnncell_get_sequence_length( libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status ) { *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { return handle->T; } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return 0; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_execute_st(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NC) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CK) ) { status = libxsmm_dnn_rnncell_st_fwd_nc_ck( handle, start_thread, tid ); } else if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { status = libxsmm_dnn_rnncell_st_fwd_ncnc_kcck( handle, start_thread, tid ); } else if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NC) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { status = libxsmm_dnn_rnncell_st_fwd_nc_kcck( handle, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: case LIBXSMM_DNN_COMPUTE_KIND_UPD: case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: 
{ if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NC) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CK) ) { status = libxsmm_dnn_rnncell_st_bwdupd_nc_ck( handle, kind, start_thread, tid ); } else if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NC) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { status = libxsmm_dnn_rnncell_st_bwdupd_nc_kcck( handle, kind, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } libxsmm-1.17/src/libxsmm_dnn_rnncell_backward_weight_update.c000066400000000000000000000506621415223013700246550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Kunal Banerjee, Evangelos Georganas (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_rnncell_backward_weight_update.h" #include "libxsmm_dnn_elementwise.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_f32_f32(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_f32_f32(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ #define LIBXSMM_RNN_CELL_AVX512 typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { # define LIBXSMM_DNN_RNN_RELU_BWDUPD # include 
"template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c" # undef LIBXSMM_DNN_RNN_RELU_BWDUPD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { # define LIBXSMM_DNN_RNN_SIGMOID_BWDUPD # include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c" # undef LIBXSMM_DNN_RNN_SIGMOID_BWDUPD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { # define LIBXSMM_DNN_RNN_TANH_BWDUPD # include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c" # undef LIBXSMM_DNN_RNN_TANH_BWDUPD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { # include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic.tpl.c" } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { # include "template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_ck_generic.tpl.c" } else { /* should not happen */ } #undef LIBXSMM_RNN_CELL_AVX512 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ #define LIBXSMM_RNN_CELL_AVX512 typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == 
LIBXSMM_DNN_RNNCELL_RNN_TANH ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { # include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16.tpl.c" } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { /* should not happen */ } # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #undef LIBXSMM_RNN_CELL_AVX512 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ #define LIBXSMM_RNN_CELL_AVX512 typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; #define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { # include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16.tpl.c" } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { /* should not happen */ } # include 
"template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI #undef LIBXSMM_RNN_CELL_AVX512 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ #define LIBXSMM_RNN_CELL_AVX512 typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { # include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16.tpl.c" } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { /* should not happen */ } # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #undef LIBXSMM_RNN_CELL_AVX512 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16(libxsmm_dnn_rnncell* 
handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ #define LIBXSMM_RNN_CELL_AVX512 typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; #define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { # include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16.tpl.c" } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { /* should not happen */ } # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI #undef LIBXSMM_RNN_CELL_AVX512 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ #define LIBXSMM_RNN_CELL_AVX512 typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { # 
define LIBXSMM_DNN_RNN_RELU_BWDUPD # include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c" # undef LIBXSMM_DNN_RNN_RELU_BWDUPD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { # define LIBXSMM_DNN_RNN_SIGMOID_BWDUPD # include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c" # undef LIBXSMM_DNN_RNN_SIGMOID_BWDUPD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { # define LIBXSMM_DNN_RNN_TANH_BWDUPD # include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c" # undef LIBXSMM_DNN_RNN_TANH_BWDUPD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { # include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck.tpl.c" } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { # include "template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_kcck.tpl.c" } else { /* should not happen */ } #undef LIBXSMM_RNN_CELL_AVX512 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; #if 0 typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; # include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_ncnc_kcck_generic.tpl.c" #endif LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); status = 
LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ #if 0 if (handle->? == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } #endif /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_rnncell_st_bwdupd_nc_ck_f32_f32( handle, kind, start_thread, tid ); } #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) { if ( handle->desc.N % 2 != 0 ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_emu( handle, kind, start_thread, tid ); } else if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16( handle, kind, start_thread, tid ); } #else if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE ) { status = libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_emu( handle, kind, start_thread, tid ); } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float 
element_input_type; typedef float element_output_type; typedef float element_filter_type; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { #define LIBXSMM_DNN_RNN_RELU_BWDUPD # include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c" #undef LIBXSMM_DNN_RNN_RELU_BWDUPD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { #define LIBXSMM_DNN_RNN_SIGMOID_BWDUPD # include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c" #undef LIBXSMM_DNN_RNN_SIGMOID_BWDUPD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { #define LIBXSMM_DNN_RNN_TANH_BWDUPD # include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c" #undef LIBXSMM_DNN_RNN_TANH_BWDUPD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { # include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic.tpl.c" } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { # include "template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_ck_generic.tpl.c" } else { /* should not happen */ } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ #if 0 if (handle->? 
== 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } #endif /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_f32_f32( handle, kind, start_thread, tid ); } #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { if ( handle->desc.N % 2 != 0 ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_emu( handle, kind, start_thread, tid ); } else if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16( handle, kind, start_thread, tid ); } #else if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE ) { status = libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_emu( handle, kind, start_thread, tid ); } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { #define LIBXSMM_DNN_RNN_RELU_BWDUPD # include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c" #undef LIBXSMM_DNN_RNN_RELU_BWDUPD } else if ( handle->desc.cell_type == 
LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { #define LIBXSMM_DNN_RNN_SIGMOID_BWDUPD # include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c" #undef LIBXSMM_DNN_RNN_SIGMOID_BWDUPD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { #define LIBXSMM_DNN_RNN_TANH_BWDUPD # include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c" #undef LIBXSMM_DNN_RNN_TANH_BWDUPD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { # include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck.tpl.c" } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { # include "template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_kcck.tpl.c" } else { /* should not happen */ } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ #if 0 if (handle->? 
== 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } #endif /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck_f32_f32( handle, kind, start_thread, tid ); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } libxsmm-1.17/src/libxsmm_dnn_rnncell_backward_weight_update.h000066400000000000000000000027231415223013700246550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_RNNCELL_BACKWARD_WEIGHT_UPDATE_H #define LIBXSMM_DNN_RNNCELL_BACKWARD_WEIGHT_UPDATE_H #include #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); #endif /* LIBXSMM_DNN_RNNCELL_BACKWARD_WEIGHT_UPDATE_H */ libxsmm-1.17/src/libxsmm_dnn_rnncell_forward.c000066400000000000000000000515361415223013700216330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_rnncell_forward.h" #include "libxsmm_dnn_elementwise.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_f32_f32(libxsmm_dnn_rnncell* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16(libxsmm_dnn_rnncell* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_ncnc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16(libxsmm_dnn_rnncell* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_f32_f32(libxsmm_dnn_rnncell* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { # define LIBXSMM_DNN_RNN_RELU_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c" # undef LIBXSMM_DNN_RNN_RELU_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { # define LIBXSMM_DNN_RNN_SIGMOID_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c" # undef LIBXSMM_DNN_RNN_SIGMOID_FWD } else if ( handle->desc.cell_type == 
LIBXSMM_DNN_RNNCELL_RNN_TANH ) { # define LIBXSMM_DNN_RNN_TANH_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c" # undef LIBXSMM_DNN_RNN_TANH_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { #define LIBXSMM_RNN_CELL_AVX512 # include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic.tpl.c" #undef LIBXSMM_RNN_CELL_AVX512 } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { # include "template/libxsmm_dnn_rnncell_st_gru_fwd_nc_ck_generic.tpl.c" } else { /* should not happen */ } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__, __AVX512BW__, __AVX512DQ__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { #define LIBXSMM_RNN_CELL_AVX512 # include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16.tpl.c" #undef LIBXSMM_RNN_CELL_AVX512 } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { /* should not happen */ } # include 
"template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16(libxsmm_dnn_rnncell* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__, __AVX512BW__, __AVX512DQ__, __AVX512BF16__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; #define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { #define LIBXSMM_RNN_CELL_AVX512 # include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16.tpl.c" #undef LIBXSMM_RNN_CELL_AVX512 } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { /* should not happen */ } # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_ncnc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, int start_thread, int tid) { libxsmm_dnn_err_t 
status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { # define LIBXSMM_DNN_RNN_RELU_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c" # undef LIBXSMM_DNN_RNN_RELU_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { # define LIBXSMM_DNN_RNN_SIGMOID_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c" # undef LIBXSMM_DNN_RNN_SIGMOID_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { # define LIBXSMM_DNN_RNN_TANH_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c" # undef LIBXSMM_DNN_RNN_TANH_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { /* should not happen */ } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { # define LIBXSMM_DNN_RNN_RELU_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c" # undef LIBXSMM_DNN_RNN_RELU_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { # define LIBXSMM_DNN_RNN_SIGMOID_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c" # undef 
LIBXSMM_DNN_RNN_SIGMOID_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { # define LIBXSMM_DNN_RNN_TANH_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c" # undef LIBXSMM_DNN_RNN_TANH_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { #define LIBXSMM_RNN_CELL_AVX512 # include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck.tpl.c" #undef LIBXSMM_RNN_CELL_AVX512 } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { # include "template/libxsmm_dnn_rnncell_st_gru_fwd_nc_kcck.tpl.c" } else { /* should not happen */ } #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { #define LIBXSMM_RNN_CELL_AVX512 # include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16.tpl.c" #undef LIBXSMM_RNN_CELL_AVX512 } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { /* should not 
happen */ } # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16(libxsmm_dnn_rnncell* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef libxsmm_bfloat16 element_filter_type; #define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI /* some portable macrros fof BF16 <-> FP32 */ # include "template/libxsmm_dnn_bf16_macros_define.tpl.c" if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { #define LIBXSMM_RNN_CELL_AVX512 # include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16.tpl.c" #undef LIBXSMM_RNN_CELL_AVX512 } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { /* should not happen */ } # include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" #undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck(libxsmm_dnn_rnncell* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check 
if we have input, output and filter */ #if 0 if (handle->? == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } #endif /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_rnncell_st_fwd_nc_ck_f32_f32( handle, start_thread, tid); } #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_emu( handle, start_thread, tid); } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16( handle, start_thread, tid); } #elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE ) { status = libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_emu( handle, start_thread, tid); } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { #define LIBXSMM_DNN_RNN_RELU_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c" #undef 
LIBXSMM_DNN_RNN_RELU_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { #define LIBXSMM_DNN_RNN_SIGMOID_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c" #undef LIBXSMM_DNN_RNN_SIGMOID_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { #define LIBXSMM_DNN_RNN_TANH_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c" #undef LIBXSMM_DNN_RNN_TANH_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { # include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic.tpl.c" } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { # include "template/libxsmm_dnn_rnncell_st_gru_fwd_nc_ck_generic.tpl.c" } else { /* should not happen */ } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_ncnc_kcck(libxsmm_dnn_rnncell* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ #if 0 if (handle->? 
== 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } #endif /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_rnncell_st_fwd_ncnc_kcck_f32_f32( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { #define LIBXSMM_DNN_RNN_RELU_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c" #undef LIBXSMM_DNN_RNN_RELU_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { #define LIBXSMM_DNN_RNN_SIGMOID_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c" #undef LIBXSMM_DNN_RNN_SIGMOID_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { #define LIBXSMM_DNN_RNN_TANH_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c" #undef LIBXSMM_DNN_RNN_TANH_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; } else { /* should not happen */ } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck(libxsmm_dnn_rnncell* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and filter */ #if 0 if (handle->? 
== 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } #endif /* check if we are on AVX512 */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_rnncell_st_fwd_nc_kcck_f32_f32( handle, start_thread, tid); } #if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_emu( handle, start_thread, tid); } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX ) { status = libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16( handle, start_thread, tid); } #elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE ) { status = libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_emu( handle, start_thread, tid); } #endif else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef float element_filter_type; if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { #define LIBXSMM_DNN_RNN_RELU_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c" #undef LIBXSMM_DNN_RNN_RELU_FWD } else if ( handle->desc.cell_type == 
LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { #define LIBXSMM_DNN_RNN_SIGMOID_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c" #undef LIBXSMM_DNN_RNN_SIGMOID_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { #define LIBXSMM_DNN_RNN_TANH_FWD # include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c" #undef LIBXSMM_DNN_RNN_TANH_FWD } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { # include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck.tpl.c" } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { # include "template/libxsmm_dnn_rnncell_st_gru_fwd_nc_kcck.tpl.c" } else { /* should not happen */ } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } libxsmm-1.17/src/libxsmm_dnn_rnncell_forward.h000066400000000000000000000025001415223013700216230ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_RNNCELL_FORWARD_H #define LIBXSMM_DNN_RNNCELL_FORWARD_H #include #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck(libxsmm_dnn_rnncell* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_ncnc_kcck(libxsmm_dnn_rnncell* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck(libxsmm_dnn_rnncell* handle, int start_thread, int tid); #endif /* LIBXSMM_DNN_RNNCELL_FORWARD_H */ libxsmm-1.17/src/libxsmm_dnn_softmaxloss.c000066400000000000000000000326001415223013700210230ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_softmaxloss_backward.h" #include "libxsmm_dnn_softmaxloss_forward.h" #include "libxsmm_main.h" LIBXSMM_API libxsmm_dnn_softmaxloss* libxsmm_dnn_create_softmaxloss(libxsmm_dnn_softmaxloss_desc softmaxloss_desc, libxsmm_dnn_err_t* status) { libxsmm_dnn_softmaxloss* handle = 0; int lpb; /* init libxsmm */ LIBXSMM_INIT if ( (softmaxloss_desc.datatype == LIBXSMM_DNN_DATATYPE_F32) || (softmaxloss_desc.datatype == LIBXSMM_DNN_DATATYPE_BF16) ) { handle = (libxsmm_dnn_softmaxloss*)malloc(sizeof(libxsmm_dnn_softmaxloss)); if (0 != handle) { *status = LIBXSMM_DNN_SUCCESS; /* zero entire content; not only safer but also sets data and code pointers to NULL */ memset(handle, 0, sizeof(*handle)); /* let's make the description persistent */ handle->desc = softmaxloss_desc; /* cnn */ if ( (handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { int bk; /* we need to compute the memory layout given the */ *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.C, &(handle->bc), &bk, &lpb, handle->desc.datatype, handle->desc.datatype ); /* compute the outer blocks */ handle->Bc = handle->desc.C / handle->bc; handle->bn = 1; handle->Bn = handle->desc.N; } else if ( (handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0 ) { handle->bc = handle->desc.bc; handle->bn = handle->desc.bn; handle->Bc = handle->desc.C / handle->bc; handle->Bn = handle->desc.N / handle->bn; } else { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; free( handle ); handle = 0; return handle; } /* create barrier */ handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); /* calculate scratch size for local softmaxloss copies of one feature map block per thread */ if ( softmaxloss_desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { handle->scratch_size = (sizeof(float)*handle->desc.C*handle->desc.N*2); } else { handle->scratch_size = 1; } } else { *status = 
LIBXSMM_DNN_ERR_CREATE_HANDLE; } } else { *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } return handle; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_softmaxloss(const libxsmm_dnn_softmaxloss* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { /* Deallocate barrier */ if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } /* deallocate handle structure */ free(/*remove constness*/(libxsmm_dnn_softmaxloss*)handle); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_softmaxloss_create_tensor_datalayout(const libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor_datalayout* layout; *status = LIBXSMM_DNN_SUCCESS; layout = 0; if (handle != 0) { layout = (libxsmm_dnn_tensor_datalayout*) malloc(sizeof(libxsmm_dnn_tensor_datalayout)); if (layout != 0) { memset(layout, 0, sizeof(libxsmm_dnn_tensor_datalayout)); layout->format = handle->desc.buffer_format; if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { layout->datatype = handle->desc.datatype; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(3*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(3*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 3; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->bc; layout->dim_size[1] = handle->Bc; layout->dim_size[2] = handle->desc.N; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } 
else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) { layout->datatype = handle->desc.datatype; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 4; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->bc; layout->dim_size[1] = handle->bn; layout->dim_size[2] = handle->Bc; layout->dim_size[3] = handle->Bn; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; } } else if ( type == LIBXSMM_DNN_LABEL ) { layout->datatype = LIBXSMM_DNN_DATATYPE_I32; layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(1*sizeof(libxsmm_dnn_tensor_dimtype)); layout->dim_size = (unsigned int*) malloc(1*sizeof(unsigned int)); if (0 != layout->dim_type && 0 != layout->dim_size) { layout->num_dims = 1; layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; layout->dim_size[0] = handle->desc.N; } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; } } else { free(layout); layout = 0; /* make sure a NULL is returned */ *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; } } else { *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return layout; } LIBXSMM_API size_t libxsmm_dnn_softmaxloss_get_scratch_size(const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status) { size_t l_scratch_size = 0; *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { l_scratch_size = handle->scratch_size + 64; /* 64 byte extra in case 
the user code does not care about alignment */ } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return l_scratch_size; } LIBXSMM_API void* libxsmm_dnn_softmaxloss_get_scratch_ptr(const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status) { *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { return handle->scratch; } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return 0; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_bind_scratch(libxsmm_dnn_softmaxloss* handle, const void* scratch) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; uintptr_t address = (uintptr_t)scratch; size_t offset = 0; if (scratch == 0) { status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; return status; } if (0 != handle) { /* align the internal scratch buffer if needed */ if (address % 64 == 0) { handle->scratch = (void*)address; } else { offset = (64 - address % 64); handle->scratch = (void*)(address+offset); } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_release_scratch(libxsmm_dnn_softmaxloss* handle) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { handle->scratch = 0; } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_bind_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_LABEL) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0 && tensor != 0) { libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_softmaxloss_create_tensor_datalayout(handle, type, &status); if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) 
{ handle->reg_input = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { handle->grad_input = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { handle->reg_output = (libxsmm_dnn_tensor*)tensor; } else if ( type == LIBXSMM_DNN_LABEL ) { handle->label = (libxsmm_dnn_tensor*)tensor; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; } libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_softmaxloss_get_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor* return_tensor = 0; *status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_LABEL) ) { *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return return_tensor; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { return_tensor = handle->reg_input; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { return_tensor = handle->grad_input; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { return_tensor = handle->reg_output; } else if ( type == LIBXSMM_DNN_LABEL ) { return_tensor = handle->label; } else { /* cannot happen */ } } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return return_tensor; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_release_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check for tensor type */ if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_LABEL) ) { status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; return status; } if (handle != 0) { if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { 
handle->reg_input = 0; } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { handle->grad_input = 0; } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { handle->reg_output = 0; } else if ( type == LIBXSMM_DNN_LABEL ) { handle->label = 0; } else { /* cannot happen */ } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_execute_st(libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { switch (kind) { case LIBXSMM_DNN_COMPUTE_KIND_FWD: { status = libxsmm_dnn_softmaxloss_st_fwd_ncnc( handle, start_thread, tid ); } break; case LIBXSMM_DNN_COMPUTE_KIND_BWD: { status = libxsmm_dnn_softmaxloss_st_bwd_ncnc( handle, start_thread, tid ); } break; default: { status = LIBXSMM_DNN_ERR_INVALID_KIND; } } } else { status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return status; } LIBXSMM_API float libxsmm_dnn_softmaxloss_get_loss(const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status) { float l_loss = 0.0f; *status = LIBXSMM_DNN_SUCCESS; if (0 != handle) { l_loss = handle->loss; } else { *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; } return l_loss; } libxsmm-1.17/src/libxsmm_dnn_softmaxloss_backward.c000066400000000000000000000102541415223013700226620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include "libxsmm_dnn_softmaxloss_backward.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_bwd_ncnc_f32_f32(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_bwd_ncnc_bf16_bf16(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_bwd_ncnc_f32_f32(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef int element_label_type; # include "template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c" #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_bwd_ncnc_bf16_bf16(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef int element_label_type; # define LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16_AVX512 # include "template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c" # undef LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16_AVX512 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_bwd_ncnc(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; 
/* check if we have input, output and mask */ if ( handle->grad_input == 0 || handle->reg_output == 0 || handle->label == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on an AVX512 platform */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_softmaxloss_st_bwd_ncnc_f32_f32( handle, start_thread, tid); } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_softmaxloss_st_bwd_ncnc_bf16_bf16( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef int element_label_type; # include "template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c" } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef int element_label_type; # define LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16 # include "template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c" # undef LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16 } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } libxsmm-1.17/src/libxsmm_dnn_softmaxloss_backward.h000066400000000000000000000020501415223013700226620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_DNN_SOFTMAXLOSS_BACKWARD_H #define LIBXSMM_DNN_SOFTMAXLOSS_BACKWARD_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_bwd_ncnc(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid); #endif /* LIBXSMM_DNN_SOFTMAXLOSS_BACKWARD_H */ libxsmm-1.17/src/libxsmm_dnn_softmaxloss_forward.c000066400000000000000000000102411415223013700225440ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "libxsmm_dnn_softmaxloss_forward.h" #include "libxsmm_main.h" LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_fwd_ncnc_f32_f32(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid); LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_fwd_ncnc_bf16_bf16(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_fwd_ncnc_f32_f32(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef float element_input_type; typedef float element_output_type; typedef int element_label_type; # include "template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c" #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif 
return status; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_fwd_ncnc_bf16_bf16(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ typedef libxsmm_bfloat16 element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef int element_label_type; # define LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16_AVX512 # include "template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c" # undef LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16_AVX512 #else /* should not happen */ LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; #endif return status; } LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_fwd_ncnc(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* check if we have input, output and mask */ if ( handle->reg_input == 0 || handle->reg_output == 0 || handle->label == 0 ) { status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; return status; } /* check if we are on an AVX512 platform */ #if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { status = libxsmm_dnn_softmaxloss_st_fwd_ncnc_f32_f32( handle, start_thread, tid); } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { status = libxsmm_dnn_softmaxloss_st_fwd_ncnc_bf16_bf16( handle, start_thread, tid); } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } else #endif { if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { typedef float element_input_type; typedef float element_output_type; typedef int element_label_type; # include "template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c" } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { typedef libxsmm_bfloat16 
element_input_type; typedef libxsmm_bfloat16 element_output_type; typedef int element_label_type; # define LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16 # include "template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c" # undef LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16 } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; return status; } } return status; } libxsmm-1.17/src/libxsmm_dnn_softmaxloss_forward.h000066400000000000000000000020451415223013700225540ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_DNN_SOFTMAXLOSS_FORWARD_H #define LIBXSMM_DNN_SOFTMAXLOSS_FORWARD_H #include LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_fwd_ncnc(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid); #endif /* LIBXSMM_DNN_SOFTMAXLOSS_FORWARD_H */ libxsmm-1.17/src/libxsmm_dnn_tensor.c000066400000000000000000001025121415223013700177530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include #include "libxsmm_main.h" #include "libxsmm_dnn_tensor.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #if defined(_OPENMP) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_link_tensor(const libxsmm_dnn_tensor_datalayout* layout, const void* data, libxsmm_dnn_err_t* status) { return libxsmm_dnn_link_qtensor(layout, data, 0, status); } LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_link_qtensor(const libxsmm_dnn_tensor_datalayout* layout, const void* data, const unsigned char scf, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor* tensor = (libxsmm_dnn_tensor*)malloc(sizeof(libxsmm_dnn_tensor)); *status = LIBXSMM_DNN_SUCCESS; if (layout != 0 && tensor != 0 && data != 0) { memset(tensor, 0, sizeof(libxsmm_dnn_tensor)); tensor->layout = libxsmm_dnn_duplicate_tensor_datalayout(layout, status); tensor->data = (void*)data; tensor->scf = scf; /* when layout copy failed, free layout */ if (*status != LIBXSMM_DNN_SUCCESS) { libxsmm_dnn_destroy_tensor_datalayout(tensor->layout); } } else { *status = LIBXSMM_DNN_ERR_CREATE_TENSOR; } if (*status != LIBXSMM_DNN_SUCCESS) { free((libxsmm_dnn_tensor*)tensor); tensor = 0; } return tensor; } LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_duplicate_tensor_datalayout(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor_datalayout* dst_layout; *status = LIBXSMM_DNN_SUCCESS; dst_layout = 0; if (layout != 0 && layout->num_dims != 0) { unsigned int dim = 0; dst_layout = (libxsmm_dnn_tensor_datalayout*)malloc(sizeof(libxsmm_dnn_tensor_datalayout)); if (0 != dst_layout) { memset(dst_layout, 0, sizeof(libxsmm_dnn_tensor_datalayout)); dst_layout->dim_type = (libxsmm_dnn_tensor_dimtype*)malloc(layout->num_dims * sizeof(libxsmm_dnn_tensor_dimtype)); 
dst_layout->dim_size = (unsigned int*)malloc(layout->num_dims * sizeof(unsigned int)); dst_layout->num_dims = layout->num_dims; dst_layout->format = layout->format; dst_layout->datatype = layout->datatype; dst_layout->tensor_type = layout->tensor_type; if (0 != dst_layout->dim_type && 0 != dst_layout->dim_size) { for (dim = 0; dim < layout->num_dims; ++dim) { dst_layout->dim_type[dim] = layout->dim_type[dim]; dst_layout->dim_size[dim] = layout->dim_size[dim]; } } else { *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; } } else { *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; } } else { *status = LIBXSMM_DNN_ERR_INVALID_LAYOUT; } return dst_layout; } LIBXSMM_API unsigned int libxsmm_dnn_compare_tensor_datalayout(const libxsmm_dnn_tensor_datalayout* layout_a, const libxsmm_dnn_tensor_datalayout* layout_b, libxsmm_dnn_err_t* status) { unsigned int result = 0; *status = LIBXSMM_DNN_SUCCESS; if (layout_a != 0 && layout_b != 0) { unsigned int dim = 0; if (layout_a->num_dims != layout_b->num_dims) { result = 1; } if (layout_a->format != layout_b->format) { result = 1; } if (layout_a->datatype != layout_b->datatype) { result = 1; } if (result == 0) { for ( dim = 0; dim < layout_a->num_dims; ++dim ) { if ( layout_a->dim_type[dim] != layout_b->dim_type[dim] ) { result = 1; } if ( layout_a->dim_size[dim] != layout_b->dim_size[dim] ) { result = 1; } } } } else { *status = LIBXSMM_DNN_ERR_INVALID_LAYOUT; result = 100; } return result; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_tensor_datalayout(libxsmm_dnn_tensor_datalayout* layout) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != layout) { free(layout->dim_type); free(layout->dim_size); free(layout); } else { status = LIBXSMM_DNN_ERR_INVALID_LAYOUT; } return status; } LIBXSMM_API unsigned int libxsmm_dnn_get_tensor_size(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status) { unsigned int size = 0; *status = LIBXSMM_DNN_SUCCESS; if (0 != layout) { unsigned int dim = 0; size = (unsigned 
int)libxsmm_dnn_typesize(layout->datatype); for (dim = 0; dim < layout->num_dims; ++dim) { size *= layout->dim_size[dim]; } } else { *status = LIBXSMM_DNN_ERR_INVALID_LAYOUT; } return size; } LIBXSMM_API unsigned int libxsmm_dnn_get_tensor_elements(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status) { unsigned int elements = 1; *status = LIBXSMM_DNN_SUCCESS; if (0 != layout) { unsigned int dim = 0; for ( dim = 0; dim < layout->num_dims; ++dim ) { elements *= layout->dim_size[dim]; } } else { *status = LIBXSMM_DNN_ERR_INVALID_LAYOUT; elements = 0; } return elements; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_tensor* tensor, const void* data) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if ((0 != tensor) && (0 != data)) { if (0 != tensor->layout) { if (0 < tensor->layout->num_dims) { tensor->data = (void*)data; } else { status = LIBXSMM_DNN_ERR_INVALID_LAYOUT; } } else { status = LIBXSMM_DNN_ERR_INVALID_LAYOUT; } } else { status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } return status; } LIBXSMM_API void* libxsmm_dnn_get_tensor_data_ptr(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status) { *status = LIBXSMM_DNN_SUCCESS; if (0 != tensor) { return tensor->data; } else { *status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } return 0; } LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_get_tensor_datalayout(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status) { libxsmm_dnn_tensor_datalayout* dst_layout = NULL; *status = LIBXSMM_DNN_SUCCESS; if (0 != tensor) { dst_layout = libxsmm_dnn_duplicate_tensor_datalayout( tensor->layout, status ); } else { *status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } return dst_layout; } LIBXSMM_API unsigned char libxsmm_dnn_get_qtensor_scf(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status) { *status = LIBXSMM_DNN_SUCCESS; if (0 != tensor) { return tensor->scf; } else { *status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } return 0; } LIBXSMM_API libxsmm_dnn_err_t 
libxsmm_dnn_set_qtensor_scf(libxsmm_dnn_tensor* tensor, const unsigned char scf) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != tensor) { tensor->scf = scf; } else { status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_tensor(const libxsmm_dnn_tensor* tensor) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != tensor) { /* it is not an error attempting to destroy a NULL-handle */ /* free layout information stored in tensor */ if (0 != tensor->layout) { libxsmm_dnn_destroy_tensor_datalayout( (libxsmm_dnn_tensor_datalayout*)tensor->layout ); } /* deallocate handle structure */ free(/*remove constness*/(libxsmm_dnn_tensor*)tensor); } #if 0 /* releasing a NULL-buffer should be not an error (similar to freeing a NULL pointer) */ else { status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } #endif return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_copyin_tensor(const libxsmm_dnn_tensor* tensor, const void* data, const libxsmm_dnn_tensor_format in_format) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* @TODO check for valid combination */ if (0 != tensor) { switch (tensor->layout->tensor_type) { case LIBXSMM_DNN_REGULAR_INPUT: case LIBXSMM_DNN_GRADIENT_INPUT: case LIBXSMM_DNN_REGULAR_OUTPUT: case LIBXSMM_DNN_GRADIENT_OUTPUT: case LIBXSMM_DNN_INPUT: case LIBXSMM_DNN_OUTPUT: case LIBXSMM_DNN_ACTIVATION: { switch (in_format) { case LIBXSMM_DNN_TENSOR_FORMAT_NCHW: { if ( (tensor->layout->format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { switch (tensor->layout->datatype) { case LIBXSMM_DNN_DATATYPE_F32: { typedef float element_type; #include "template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_BF16: { typedef libxsmm_bfloat16 element_type; #define LIBXSMM_DNN_COPY_LOW_PRECISION #include "template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c" #undef LIBXSMM_DNN_COPY_LOW_PRECISION } break; case LIBXSMM_DNN_DATATYPE_I32: { typedef int element_type; #include 
"template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_I16: { typedef short element_type; #define LIBXSMM_DNN_COPY_LOW_PRECISION #include "template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c" #undef LIBXSMM_DNN_COPY_LOW_PRECISION } break; case LIBXSMM_DNN_DATATYPE_I8: { typedef unsigned char element_type; #define LIBXSMM_DNN_COPY_LOW_PRECISION #include "template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c" #undef LIBXSMM_DNN_COPY_LOW_PRECISION } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT; } } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT; } } } break; case LIBXSMM_DNN_REGULAR_FILTER: case LIBXSMM_DNN_GRADIENT_FILTER: case LIBXSMM_DNN_FILTER: { switch (in_format) { case LIBXSMM_DNN_TENSOR_FORMAT_KCRS: { if ( (tensor->layout->format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { switch (tensor->layout->datatype) { case LIBXSMM_DNN_DATATYPE_F32: { typedef float element_type; #include "template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_BF16: { typedef libxsmm_bfloat16 element_type; #include "template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_I16: { typedef short element_type; #include "template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_I8: { typedef char element_type; #include "template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c" } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT; } } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT; } } } break; case LIBXSMM_DNN_REGULAR_CHANNEL_BIAS: case LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS: case LIBXSMM_DNN_CHANNEL_BIAS: case LIBXSMM_DNN_REGULAR_CHANNEL_BETA: case LIBXSMM_DNN_GRADIENT_CHANNEL_BETA: case LIBXSMM_DNN_CHANNEL_BETA: case LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA: case 
LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA: case LIBXSMM_DNN_CHANNEL_GAMMA: case LIBXSMM_DNN_CHANNEL_EXPECTVAL: case LIBXSMM_DNN_CHANNEL_RCPSTDDEV: case LIBXSMM_DNN_CHANNEL_VARIANCE: case LIBXSMM_DNN_CHANNEL_SCALAR: { switch (in_format) { case LIBXSMM_DNN_TENSOR_FORMAT_NCHW: { if ( (tensor->layout->format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { switch (tensor->layout->datatype) { case LIBXSMM_DNN_DATATYPE_F32: { typedef float element_type; #include "template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_BF16: { typedef libxsmm_bfloat16 element_type; #define LIBXSMM_DNN_COPY_LOW_PRECISION #include "template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c" #undef LIBXSMM_DNN_COPY_LOW_PRECISION } break; case LIBXSMM_DNN_DATATYPE_I16: { typedef short element_type; #define LIBXSMM_DNN_COPY_LOW_PRECISION #include "template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c" #undef LIBXSMM_DNN_COPY_LOW_PRECISION } break; case LIBXSMM_DNN_DATATYPE_I8: { typedef char element_type; #define LIBXSMM_DNN_COPY_LOW_PRECISION #include "template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c" #undef LIBXSMM_DNN_COPY_LOW_PRECISION } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT; } } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } } } else { status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_zero_tensor(const libxsmm_dnn_tensor* tensor) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; if (0 != tensor) { const size_t size = libxsmm_dnn_get_tensor_elements( tensor->layout, &status ); size_t i; /* use for-loops to potentially leverage NUMA in the future */ switch (tensor->layout->datatype) { case LIBXSMM_DNN_DATATYPE_F32: { float* fp32_data = (float*)tensor->data; for (i = 0; i < size; ++i) fp32_data[i] = 0.0f; } break; case LIBXSMM_DNN_DATATYPE_BF16: { 
libxsmm_bfloat16* bfp16_data = (libxsmm_bfloat16*)tensor->data; for (i = 0; i < size; ++i) bfp16_data[i] = 0; } break; case LIBXSMM_DNN_DATATYPE_I32: { int* int32_data = (int*)tensor->data; for (i = 0; i < size; ++i) int32_data[i] = 0; } break; case LIBXSMM_DNN_DATATYPE_I16: { short* int16_data = (short*)tensor->data; for (i = 0; i < size; ++i) int16_data[i] = 0; } break; case LIBXSMM_DNN_DATATYPE_I8: { char* int8_data = (char*)tensor->data; for (i = 0; i < size; ++i) int8_data[i] = 0; } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } } else { status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } return status; } LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_copyout_tensor(const libxsmm_dnn_tensor* tensor, void* data, const libxsmm_dnn_tensor_format out_format) { libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; /* @TODO check for valid combination */ if (0 != tensor) { switch (tensor->layout->tensor_type) { case LIBXSMM_DNN_REGULAR_INPUT: case LIBXSMM_DNN_GRADIENT_INPUT: case LIBXSMM_DNN_REGULAR_OUTPUT: case LIBXSMM_DNN_GRADIENT_OUTPUT: case LIBXSMM_DNN_INPUT: case LIBXSMM_DNN_OUTPUT: case LIBXSMM_DNN_ACTIVATION: { switch (out_format) { case LIBXSMM_DNN_TENSOR_FORMAT_NCHW: { if ( (tensor->layout->format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { switch (tensor->layout->datatype) { case LIBXSMM_DNN_DATATYPE_F32: { typedef float element_type; #include "template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_BF16: { typedef libxsmm_bfloat16 element_type; #define LIBXSMM_DNN_COPY_LOW_PRECISION #include "template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c" #undef LIBXSMM_DNN_COPY_LOW_PRECISION } break; case LIBXSMM_DNN_DATATYPE_I32: { typedef int element_type; #include "template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_I16: { typedef short element_type; #define LIBXSMM_DNN_COPY_LOW_PRECISION #include "template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c" #undef 
LIBXSMM_DNN_COPY_LOW_PRECISION } break; case LIBXSMM_DNN_DATATYPE_I8: { typedef unsigned char element_type; #define LIBXSMM_DNN_COPY_LOW_PRECISION #include "template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c" #undef LIBXSMM_DNN_COPY_LOW_PRECISION } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT; } } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT; } } } break; case LIBXSMM_DNN_REGULAR_FILTER: case LIBXSMM_DNN_GRADIENT_FILTER: case LIBXSMM_DNN_FILTER: { switch (out_format) { case LIBXSMM_DNN_TENSOR_FORMAT_KCRS: { if ( (tensor->layout->format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { switch (tensor->layout->datatype) { case LIBXSMM_DNN_DATATYPE_F32: { typedef float element_type; #include "template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_BF16: { typedef libxsmm_bfloat16 element_type; #include "template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_I32: { typedef int element_type; #include "template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_I16: { typedef short element_type; #include "template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_I8: { typedef char element_type; #include "template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c" } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT; } } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT; } } } break; case LIBXSMM_DNN_REGULAR_CHANNEL_BIAS: case LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS: case LIBXSMM_DNN_CHANNEL_BIAS: case LIBXSMM_DNN_REGULAR_CHANNEL_BETA: case LIBXSMM_DNN_GRADIENT_CHANNEL_BETA: case LIBXSMM_DNN_CHANNEL_BETA: case LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA: case LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA: case LIBXSMM_DNN_CHANNEL_GAMMA: case LIBXSMM_DNN_CHANNEL_EXPECTVAL: case 
LIBXSMM_DNN_CHANNEL_RCPSTDDEV: case LIBXSMM_DNN_CHANNEL_VARIANCE: case LIBXSMM_DNN_CHANNEL_SCALAR: { switch (out_format) { case LIBXSMM_DNN_TENSOR_FORMAT_NCHW: { if ( (tensor->layout->format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { switch (tensor->layout->datatype) { case LIBXSMM_DNN_DATATYPE_F32: { typedef float element_type; #include "template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c" } break; case LIBXSMM_DNN_DATATYPE_BF16: { typedef libxsmm_bfloat16 element_type; #define LIBXSMM_DNN_COPY_LOW_PRECISION #include "template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c" #undef LIBXSMM_DNN_COPY_LOW_PRECISION } break; case LIBXSMM_DNN_DATATYPE_I16: { typedef short element_type; #define LIBXSMM_DNN_COPY_LOW_PRECISION #include "template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c" #undef LIBXSMM_DNN_COPY_LOW_PRECISION } break; case LIBXSMM_DNN_DATATYPE_I8: { typedef char element_type; #define LIBXSMM_DNN_COPY_LOW_PRECISION #include "template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c" #undef LIBXSMM_DNN_COPY_LOW_PRECISION } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; } } } else { status = LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT; } } break; default: { status = LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT; } } } break; default: { status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } } } else { status = LIBXSMM_DNN_ERR_INVALID_TENSOR; } return status; } libxsmm-1.17/src/libxsmm_ext.c000066400000000000000000000340361415223013700164070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "libxsmm_ext.h" #include "libxsmm_gemm.h" #include #if defined(LIBXSMM_BUILD) #if defined(LIBXSMM_BUILD_EXT) && !defined(__STATIC) LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(dgemm_batch)(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) { if (LIBXSMM_FSYMBOL(__real_dgemm_batch) != libxsmm_original_dgemm_batch_function) { LIBXSMM_FSYMBOL(__wrap_dgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } else { libxsmm_blas_error("dgemm_batch")(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(sgemm_batch)(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) { if (LIBXSMM_FSYMBOL(__real_sgemm_batch) != libxsmm_original_sgemm_batch_function) { 
LIBXSMM_FSYMBOL(__wrap_sgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } else { libxsmm_blas_error("sgemm_batch")(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(dgemm)(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) LIBXSMM_BLAS_NOEXCEPT(gemm) { if (LIBXSMM_FSYMBOL(__real_dgemm) != libxsmm_original_dgemm_function) { LIBXSMM_FSYMBOL(__wrap_dgemm)(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } else { libxsmm_blas_error("dgemm")(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); LIBXSMM_INLINE_XGEMM(double, double, /* try producing a result even if LIBXSMM_INLINE_XGEMM is limited */ transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(sgemm)(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) LIBXSMM_BLAS_NOEXCEPT(gemm) { if (LIBXSMM_FSYMBOL(__real_sgemm) != libxsmm_original_sgemm_function) { LIBXSMM_FSYMBOL(__wrap_sgemm)(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } else { libxsmm_blas_error("sgemm")(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); LIBXSMM_INLINE_XGEMM(float, float, /* try producing a result even if 
LIBXSMM_INLINE_XGEMM is limited */ transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(dgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* x, const libxsmm_blasint* incx, const double* beta, double* y, const libxsmm_blasint* incy) LIBXSMM_BLAS_NOEXCEPT(gemv) { if (LIBXSMM_FSYMBOL(__real_dgemv) != libxsmm_original_dgemv_function) { LIBXSMM_FSYMBOL(__wrap_dgemv)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } else { libxsmm_blas_error("dgemv")(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(sgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* x, const libxsmm_blasint* incx, const float* beta, float* y, const libxsmm_blasint* incy) LIBXSMM_BLAS_NOEXCEPT(gemv) { if (LIBXSMM_FSYMBOL(__real_sgemv) != libxsmm_original_sgemv_function) { LIBXSMM_FSYMBOL(__wrap_sgemv)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } else { libxsmm_blas_error("sgemv")(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK void dgemm_batch(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) { LIBXSMM_FSYMBOL(dgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, 
ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK void sgemm_batch(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) { LIBXSMM_FSYMBOL(sgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } #elif (0 != LIBXSMM_NO_BLAS) /* no-BLAS library */ LIBXSMM_APIVAR_PUBLIC_DEF(LIBXSMM_ATTRIBUTE_COMMON unsigned int libxsmm_intrinsics_mm512_rng_state0[16]); LIBXSMM_APIVAR_PUBLIC_DEF(LIBXSMM_ATTRIBUTE_COMMON unsigned int libxsmm_intrinsics_mm512_rng_state1[16]); LIBXSMM_APIVAR_PUBLIC_DEF(LIBXSMM_ATTRIBUTE_COMMON unsigned int libxsmm_intrinsics_mm512_rng_state2[16]); LIBXSMM_APIVAR_PUBLIC_DEF(LIBXSMM_ATTRIBUTE_COMMON unsigned int libxsmm_intrinsics_mm512_rng_state3[16]); LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_NO_TRACE void internal_noblas_sink(LIBXSMM_VARIADIC); LIBXSMM_API_INTERN void internal_noblas_sink(LIBXSMM_VARIADIC) { /* does nothing else but sinking given arguments */ } LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_NO_TRACE libxsmm_sink_function internal_noblas_error(const char* /*symbol*/); LIBXSMM_API_INTERN libxsmm_sink_function internal_noblas_error(const char* symbol) { static int internal_noblas_nerror = 0; LIBXSMM_BLAS_ERROR(symbol, &internal_noblas_nerror); return internal_noblas_sink; } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE /*LIBXSMM_ATTRIBUTE_WEAK*/ void LIBXSMM_FSYMBOL(dgemm_batch)(const char transa_array[], const char 
transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) { internal_noblas_error("dgemm_batch")(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE /*LIBXSMM_ATTRIBUTE_WEAK*/ void LIBXSMM_FSYMBOL(sgemm_batch)(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) { internal_noblas_error("sgemm_batch")(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE /*LIBXSMM_ATTRIBUTE_WEAK*/ void LIBXSMM_FSYMBOL(dgemm)(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) LIBXSMM_BLAS_NOEXCEPT(gemm) { internal_noblas_error("dgemm")(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); 
LIBXSMM_INLINE_XGEMM(double, double, /* try producing a result even if LIBXSMM_INLINE_XGEMM is limited */ transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE /*LIBXSMM_ATTRIBUTE_WEAK*/ void LIBXSMM_FSYMBOL(sgemm)(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) LIBXSMM_BLAS_NOEXCEPT(gemm) { internal_noblas_error("sgemm")(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); LIBXSMM_INLINE_XGEMM(float, float, /* try producing a result even if LIBXSMM_INLINE_XGEMM is limited */ transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE /*LIBXSMM_ATTRIBUTE_WEAK*/ void LIBXSMM_FSYMBOL(dgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* x, const libxsmm_blasint* incx, const double* beta, double* y, const libxsmm_blasint* incy) LIBXSMM_BLAS_NOEXCEPT(gemv) { internal_noblas_error("dgemv")(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE /*LIBXSMM_ATTRIBUTE_WEAK*/ void LIBXSMM_FSYMBOL(sgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* x, const libxsmm_blasint* incx, const float* beta, float* y, const libxsmm_blasint* incy) LIBXSMM_BLAS_NOEXCEPT(gemv) { internal_noblas_error("sgemv")(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE void dgemm_batch(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const 
libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) { LIBXSMM_FSYMBOL(dgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE void sgemm_batch(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) { LIBXSMM_FSYMBOL(sgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } #endif #endif /*defined(LIBXSMM_BUILD)*/ libxsmm-1.17/src/libxsmm_ext.h000066400000000000000000000034601415223013700164110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_EXT_H #define LIBXSMM_EXT_H #include "libxsmm_main.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if defined(_OPENMP) # if !defined(__INTEL_COMPILER) # if defined(__clang__) # pragma clang diagnostic push # elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 6) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) # pragma GCC diagnostic push # endif # if defined(__clang__) # pragma clang diagnostic ignored "-Wpedantic" # elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 6) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) # pragma GCC diagnostic ignored "-Wpedantic" # endif # endif # include # if defined(LIBXSMM_TRACE_CALLERID_GCCBUILTIN) && !defined(__INTEL_COMPILER) # if defined(__clang__) # pragma clang diagnostic pop # elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 6) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) # pragma GCC diagnostic pop # endif # endif #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #endif /*LIBXSMM_EXT_H*/ libxsmm-1.17/src/libxsmm_ext_blocked_gemm.c000066400000000000000000000034111415223013700210700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.), Dheevatsa Mudigere (Intel Corp.) Alexander Heinecke (Intel Corp.), Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "libxsmm_blocked_gemm_types.h" #include "libxsmm_main.h" #include "libxsmm_ext.h" LIBXSMM_APIEXT void libxsmm_blocked_gemm_omp(const libxsmm_blocked_gemm_handle* handle, const void* a, const void* b, void* c, /*unsigned*/int count) { static int error_once = 0; if (0 != handle && 0 != a && 0 != b && 0 != c && 0 < count) { #if defined(_OPENMP) # pragma omp parallel num_threads(handle->nthreads) #endif /*defined(_OPENMP)*/ { int i; #if defined(_OPENMP) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif for (i = 0; i < count; ++i) { libxsmm_blocked_gemm_st(handle, a, b, c, 0/*start_thread*/, tid); } } } else if (0 != libxsmm_get_verbosity() /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_blocked_gemm_omp!\n"); } } libxsmm-1.17/src/libxsmm_ext_gemm.c000066400000000000000000001712061415223013700174150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #include "libxsmm_gemm.h" #include "libxsmm_ext.h" #if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) # include "libxsmm_trace.h" #endif #if !defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) && 0 # define LIBXSMM_EXT_GEMM_PARGROUPS_INFO #endif #if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) # if !defined(LIBXSMM_EXT_GEMM_MMBATCH_PREFETCH) # define LIBXSMM_EXT_GEMM_MMBATCH_PREFETCH libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO) # endif # if !defined(LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH) # define LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH 8/*POT*/ # endif LIBXSMM_APIVAR_DEFINE(libxsmm_gemm_descriptor internal_ext_gemm_batchdesc[LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH]); LIBXSMM_APIVAR_DEFINE(unsigned int internal_ext_gemm_batchdepth); LIBXSMM_APIVAR_DEFINE(unsigned int internal_ext_gemm_batchsize); #endif #if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) LIBXSMM_API_INLINE int internal_mmbatch_sortrev(const void* stat_a, const void* stat_b) { const libxsmm_mmbatch_item *const a = (const libxsmm_mmbatch_item*)stat_a; const libxsmm_mmbatch_item *const b = (const libxsmm_mmbatch_item*)stat_b; LIBXSMM_ASSERT(NULL != stat_a && NULL != stat_b); return a->stat.count < b->stat.count ? 1 : (b->stat.count < a->stat.count ? 
-1 : 0); } #endif /*defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT)*/ LIBXSMM_API_INLINE int internal_mmbatch_flush(const libxsmm_gemm_descriptor* batchdesc, libxsmm_blasint batchsize, libxsmm_mmbatch_item* batcharray) { int result = EXIT_SUCCESS; #if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) if (0 != batchsize) { /* recorded/lazy multiplications */ const libxsmm_blasint itemsize = sizeof(libxsmm_mmbatch_item); LIBXSMM_ASSERT(NULL != batchdesc && 0 < batchsize); if (0 == (LIBXSMM_MMBATCH_FLAG_STATISTIC & batchdesc->flags)) { /* process batch */ const libxsmm_xmmfunction kernel = libxsmm_xmmdispatch(batchdesc); if (NULL != kernel.xmm) { const unsigned char itypesize = libxsmm_typesize((libxsmm_datatype)LIBXSMM_GETENUM_INP(batchdesc->datatype)); const unsigned char otypesize = libxsmm_typesize((libxsmm_datatype)LIBXSMM_GETENUM_OUT(batchdesc->datatype)); #if defined(_OPENMP) if (0 == (LIBXSMM_MMBATCH_FLAG_SEQUENTIAL & batchdesc->flags)) { /* parallelized */ const int nchunks = (int)LIBXSMM_UPDIV(batchsize, libxsmm_gemm_taskgrain); # if defined(LIBXSMM_EXT_TASKS) if (0 == omp_get_active_level()) { const int max_nthreads = omp_get_max_threads(); const int nthreads = LIBXSMM_MIN(max_nthreads, nchunks); if (0 == libxsmm_gemm_tasks) # else if (0 == omp_in_parallel()) { const int max_nthreads = omp_get_max_threads(); const int nthreads = LIBXSMM_MIN(max_nthreads, nchunks); # endif { /* classic internal parallelization */ # pragma omp parallel num_threads(nthreads) /*check*/libxsmm_mmbatch_kernel( kernel, 0/*index_base*/, 0/*index_stride*/, &itemsize, &itemsize, &itemsize, &batcharray->value.a, &batcharray->value.b, &batcharray->value.c, 0 == (LIBXSMM_MMBATCH_FLAG_SYNCHRONIZED & batchdesc->flags) ? 
batchsize : -batchsize, omp_get_thread_num(), nthreads, itypesize, otypesize, batchdesc->flags); } # if defined(LIBXSMM_EXT_TASKS) else { /* internal parallelization with tasks */ # pragma omp parallel num_threads(nthreads) { /* first thread discovering work will launch all tasks */ # pragma omp single nowait /* anyone is good */ { int tid; for (tid = 0; tid < nchunks/*ntasks*/; ++tid) { # pragma omp task untied /*check*/libxsmm_mmbatch_kernel( kernel, 0/*index_base*/, 0/*index_stride*/, &itemsize, &itemsize, &itemsize, &batcharray->value.a, &batcharray->value.b, &batcharray->value.c, 0 == (LIBXSMM_MMBATCH_FLAG_SYNCHRONIZED & batchdesc->flags) ? batchsize : -batchsize, tid, nchunks/*ntasks*/, itypesize, otypesize, batchdesc->flags); } } } /* implicit synchronization (barrier) */ } # endif } else { /* assume external parallelization */ int tid; for (tid = 0; tid < nchunks/*ntasks*/; ++tid) { # if defined(LIBXSMM_EXT_TASKS) # pragma omp task untied #endif /*check*/libxsmm_mmbatch_kernel( kernel, 0/*index_base*/, 0/*index_stride*/, &itemsize, &itemsize, &itemsize, &batcharray->value.a, &batcharray->value.b, &batcharray->value.c, 0 == (LIBXSMM_MMBATCH_FLAG_SYNCHRONIZED & batchdesc->flags) ? batchsize : -batchsize, tid, nchunks/*ntasks*/, itypesize, otypesize, batchdesc->flags); } # if defined(LIBXSMM_EXT_TASKS) if (0 == libxsmm_nosync) { /* allow to omit synchronization */ # pragma omp taskwait } # endif } } else #endif { /* sequential */ result = libxsmm_mmbatch_kernel( kernel, 0/*index_base*/, 0/*index_stride*/, &itemsize, &itemsize, &itemsize, &batcharray->value.a, &batcharray->value.b, &batcharray->value.c, batchsize, 0/*tid*/, 1/*nthreads*/, itypesize, otypesize, batchdesc->flags); } } else { /* no fall-back */ /* several reasons to arrive here: try-lock, unsuitable SMM, etc. 
*/ result = EXIT_FAILURE; } memset(batcharray, 0, (size_t)batchsize * (size_t)itemsize); /* clear */ } else { /* print statistic */ const libxsmm_blasint limit = (LIBXSMM_GEMM_MMBATCH_VERBOSITY < libxsmm_verbosity ? batchsize/*unlimited*/ : 7/*limited*/); unsigned int threshold, batchcount; libxsmm_blasint count = 0, i; LIBXSMM_ASSERT(NULL != batcharray); qsort(batcharray, (size_t)batchsize, (size_t)itemsize, internal_mmbatch_sortrev); batchcount = batcharray[0].stat.count; threshold = ((LIBXSMM_GEMM_MMBATCH_VERBOSITY < libxsmm_verbosity || 3 >= batchsize) ? 0 : (batchcount / 2)); for (i = 1; i < batchsize; ++i) batchcount += batcharray[i].stat.count; LIBXSMM_STDIO_ACQUIRE(); for (i = 0; i < batchsize; ++i) { const libxsmm_gemm_descriptor descriptor = batcharray[i].stat.desc; const libxsmm_blasint lda = descriptor.lda, ldb = descriptor.ldb, ldc = descriptor.ldc; const libxsmm_blasint m = descriptor.m, n = descriptor.n, k = descriptor.k; const char *const symbol = batcharray[i].stat.symbol; const unsigned int ci = batcharray[i].stat.count; LIBXSMM_MEMZERO127(batcharray + i); /* clear */ if (threshold < ci && count < limit /* limit printed statistic */ && 0 < m && 0 < n && 0 < k) { const unsigned int ciperc = (unsigned int)(100.0 * ci / batchcount + 0.5); if (0 != ciperc) { LIBXSMM_ASSERT(0 != ci); if (0 == count) { fprintf(stderr, "\nLIBXSMM STATISTIC: %u multiplication%c\n", batchcount, 1 < batchcount ? 's' : ' '); } LIBXSMM_GEMM_PRINT2(stderr, LIBXSMM_GETENUM_INP(descriptor.datatype), LIBXSMM_GETENUM_OUT(descriptor.datatype), descriptor.flags, m, n, k, /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & descriptor.flags) ? 0 : */1, NULL/*a*/, lda, NULL/*b*/, ldb, 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & descriptor.flags) ? 
0 : 1, NULL/*c*/, ldc); if (NULL != symbol && 0 != *symbol) { fprintf(stderr, ": %u%% [%s]\n", ciperc, symbol); } else { fprintf(stderr, ": %u%%\n", ciperc); } ++count; } else break; } } LIBXSMM_STDIO_RELEASE(); } } #else LIBXSMM_UNUSED(batchdesc); LIBXSMM_UNUSED(batchsize); LIBXSMM_UNUSED(batcharray); #endif return result; } #if defined(LIBXSMM_BUILD) && defined(LIBXSMM_BUILD_EXT) #if defined(LIBXSMM_BLAS_WRAP_DYNAMIC) LIBXSMM_API libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch(void) { # if (0 != LIBXSMM_BLAS) LIBXSMM_BLAS_WRAPPER(1, double, gemm_batch, libxsmm_original_dgemm_batch_function, libxsmm_original_dgemm_batch/*self*/); /*LIBXSMM_ASSERT(NULL != libxsmm_original_dgemm_batch_function);*/ # else LIBXSMM_BLAS_WRAPPER(0, double, gemm_batch, libxsmm_original_dgemm_batch_function, libxsmm_original_dgemm_batch/*self*/); # endif return libxsmm_original_dgemm_batch_function; } LIBXSMM_API libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch(void) { # if (0 != LIBXSMM_BLAS) LIBXSMM_BLAS_WRAPPER(1, float, gemm_batch, libxsmm_original_sgemm_batch_function, libxsmm_original_sgemm_batch/*self*/); /*LIBXSMM_ASSERT(NULL != libxsmm_original_sgemm_batch_function);*/ # else LIBXSMM_BLAS_WRAPPER(0, float, gemm_batch, libxsmm_original_sgemm_batch_function, libxsmm_original_sgemm_batch/*self*/); # endif return libxsmm_original_sgemm_batch_function; } LIBXSMM_API libxsmm_dgemm_function libxsmm_original_dgemm(void) { # if (0 != LIBXSMM_BLAS) LIBXSMM_BLAS_WRAPPER(1, double, gemm, libxsmm_original_dgemm_function, libxsmm_original_dgemm/*self*/); LIBXSMM_ASSERT(NULL != libxsmm_original_dgemm_function); # else LIBXSMM_BLAS_WRAPPER(0, double, gemm, libxsmm_original_dgemm_function, libxsmm_original_dgemm/*self*/); # endif return libxsmm_original_dgemm_function; } LIBXSMM_API libxsmm_sgemm_function libxsmm_original_sgemm(void) { # if (0 != LIBXSMM_BLAS) LIBXSMM_BLAS_WRAPPER(1, float, gemm, libxsmm_original_sgemm_function, libxsmm_original_sgemm/*self*/); 
LIBXSMM_ASSERT(NULL != libxsmm_original_sgemm_function); # else LIBXSMM_BLAS_WRAPPER(0, float, gemm, libxsmm_original_sgemm_function, libxsmm_original_sgemm/*self*/); # endif return libxsmm_original_sgemm_function; } LIBXSMM_API libxsmm_dgemv_function libxsmm_original_dgemv(void) { # if (0 != LIBXSMM_BLAS) LIBXSMM_BLAS_WRAPPER(1, double, gemv, libxsmm_original_dgemv_function, libxsmm_original_dgemv/*self*/); LIBXSMM_ASSERT(NULL != libxsmm_original_dgemv_function); # else LIBXSMM_BLAS_WRAPPER(0, double, gemv, libxsmm_original_dgemv_function, libxsmm_original_dgemv/*self*/); # endif return libxsmm_original_dgemv_function; } LIBXSMM_API libxsmm_sgemv_function libxsmm_original_sgemv(void) { # if (0 != LIBXSMM_BLAS) LIBXSMM_BLAS_WRAPPER(1, float, gemv, libxsmm_original_sgemv_function, libxsmm_original_sgemv/*self*/); LIBXSMM_ASSERT(NULL != libxsmm_original_sgemv_function); # else LIBXSMM_BLAS_WRAPPER(0, float, gemv, libxsmm_original_sgemv_function, libxsmm_original_sgemv/*self*/); # endif return libxsmm_original_sgemv_function; } #endif /*defined(LIBXSMM_BLAS_WRAP_DYNAMIC)*/ LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void LIBXSMM_FSYMBOL(__wrap_dgemm_batch)( const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) { LIBXSMM_ASSERT(NULL != lda_array && NULL != ldb_array && NULL != ldc_array && NULL != m_array && NULL != n_array && NULL != k_array); LIBXSMM_ASSERT(NULL != transa_array && NULL != transb_array && NULL != alpha_array && NULL != beta_array); LIBXSMM_ASSERT(NULL != group_count && NULL != group_size); LIBXSMM_INIT if (0 != libxsmm_gemm_wrap) { if (0 != (libxsmm_gemm_wrap & 1)) 
{ /* sequential */ libxsmm_dgemm_batch(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } else { /* parallelized */ libxsmm_dgemm_batch_omp(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } } else { LIBXSMM_GEMM_BATCH_SYMBOL(double)(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } } LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void LIBXSMM_FSYMBOL(__wrap_sgemm_batch)( const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) { LIBXSMM_ASSERT(NULL != lda_array && NULL != ldb_array && NULL != ldc_array && NULL != m_array && NULL != n_array && NULL != k_array); LIBXSMM_ASSERT(NULL != transa_array && NULL != transb_array && NULL != alpha_array && NULL != beta_array); LIBXSMM_ASSERT(NULL != group_count && NULL != group_size); LIBXSMM_INIT if (0 != libxsmm_gemm_wrap) { if (0 != (libxsmm_gemm_wrap & 1)) { /* sequential */ libxsmm_sgemm_batch(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } else { /* parallelized */ libxsmm_sgemm_batch_omp(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } } else { 
LIBXSMM_GEMM_BATCH_SYMBOL(float)(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } } LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void LIBXSMM_FSYMBOL(__wrap_dgemm)( const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { LIBXSMM_ASSERT(NULL != lda && NULL != ldb && NULL != ldc && NULL != m && NULL != n && NULL != k); LIBXSMM_ASSERT(NULL != transa && NULL != transb && NULL != alpha && NULL != beta); { #if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) unsigned int i = 0; /* no flush */ int flags = -1; # if !defined(NDEBUG) static int error_once = 0; int result = EXIT_SUCCESS; # endif LIBXSMM_INIT if (0 != libxsmm_gemm_wrap && (NULL == libxsmm_mmbatch_array || LIBXSMM_GEMM_PRECISION_F64 != libxsmm_mmbatch_desc.datatype || ((unsigned int)*lda) != libxsmm_mmbatch_desc.lda || ((unsigned int)*ldb) != libxsmm_mmbatch_desc.ldb || ((unsigned int)*ldc) != libxsmm_mmbatch_desc.ldc || ((unsigned int)*m) != libxsmm_mmbatch_desc.m || ((unsigned int)*n) != libxsmm_mmbatch_desc.n || ((unsigned int)*k) != libxsmm_mmbatch_desc.k || (flags = LIBXSMM_GEMM_FLAGS(*transa, *transb)) != (int)(LIBXSMM_GEMM_FLAG_TRANS_AB & libxsmm_mmbatch_desc.flags) || LIBXSMM_NEQ(/*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & libxsmm_mmbatch_desc.flags) ? 0 : */1, *alpha) || LIBXSMM_NEQ(0 != (LIBXSMM_GEMM_FLAG_BETA_0 & libxsmm_mmbatch_desc.flags) ? 0 : 1, *beta))) #endif { #if defined(_DEBUG) const char *const env_check = getenv("LIBXSMM_GEMM_CHECK"); const double check = LIBXSMM_ABS(NULL == env_check ? 
0 : atof(env_check)); void* d = NULL; if (LIBXSMM_NEQ(0, check)) { const size_t size = (size_t)(*ldc) * (size_t)(*n) * sizeof(double); d = libxsmm_scratch_malloc(size, 0/*auto*/, LIBXSMM_MALLOC_INTERNAL_CALLER); if (NULL != d && LIBXSMM_NEQ(0, *beta)) memcpy(d, c, size); /* copy destination */ } #endif if (0 != (libxsmm_gemm_wrap & 1)) { /* sequential */ libxsmm_dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } else { /* parallelized */ libxsmm_dgemm_omp(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } #if defined(_DEBUG) if (NULL != d) { libxsmm_matdiff_info diff; libxsmm_blas_dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, d, ldc); if (EXIT_SUCCESS == libxsmm_matdiff(&diff, LIBXSMM_DATATYPE_F64, *m, *n, d, c, ldc, ldc) && check < 100.0 * diff.normf_rel) { LIBXSMM_STDIO_ACQUIRE(); fprintf(stderr, "LIBXSMM: "); libxsmm_gemm_print(stderr, LIBXSMM_GEMM_PRECISION_F64, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); fprintf(stderr, " => %f%% ERROR\n", 100.0 * diff.normf_rel); LIBXSMM_STDIO_RELEASE(); } libxsmm_free(d); } #endif #if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) if (0 != (LIBXSMM_MMBATCH_FLAG_STATISTIC & libxsmm_mmbatch_desc.flags)) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const descriptor = libxsmm_dgemm_descriptor_init(&blob, *m, *n, *k, *lda, *ldb, *ldc, *alpha, *beta, LIBXSMM_GEMM_FLAGS(*transa, *transb), LIBXSMM_EXT_GEMM_MMBATCH_PREFETCH); LIBXSMM_ASSERT(0 != libxsmm_mmbatch_size); if (NULL != descriptor) { const unsigned int max_batchsize = (unsigned int)((LIBXSMM_GEMM_MMBATCH_SCALE) * libxsmm_mmbatch_size); const unsigned int batchsize = LIBXSMM_ATOMIC_LOAD(&internal_ext_gemm_batchsize, LIBXSMM_ATOMIC_RELAXED); const unsigned int max_size = (0 != batchsize ? 
(((batchsize - 1) % max_batchsize) + 1) : 0); libxsmm_mmbatch_item *const batcharray = (libxsmm_mmbatch_item*)libxsmm_mmbatch_array; libxsmm_mmbatch_item* batcharray_cur = batcharray; unsigned int size = max_size; if (libxsmm_mmbatch_size < max_size) { size = max_size - libxsmm_mmbatch_size; batcharray_cur += libxsmm_mmbatch_size; } i = libxsmm_diff_n(descriptor, batcharray_cur, sizeof(libxsmm_gemm_descriptor), sizeof(libxsmm_mmbatch_item)/*stride*/, 0/*hint*/, size); if (i < size) { /* update existing entry */ LIBXSMM_ATOMIC_ADD_FETCH(&batcharray_cur[i].stat.count, 1, LIBXSMM_ATOMIC_RELAXED); } else { /* new entry needed */ const int all = -1, shift = 0; void* extra = 0; i = ((LIBXSMM_ATOMIC_ADD_FETCH(&internal_ext_gemm_batchsize, 1, LIBXSMM_ATOMIC_RELAXED) - 1) % max_batchsize) + 1; batcharray[i-1].stat.desc = *descriptor; batcharray[i-1].stat.count = 1; batcharray[i-1].stat.symbol = libxsmm_trace_info(NULL/*depth*/, NULL/*tid*/, &all, LIBXSMM_FUNCNAME, &shift, &all); if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(libxsmm_mmbatch_array, NULL/*size*/, NULL/*flags*/, &extra)) { *(libxsmm_mmbatch_flush_function*)extra = libxsmm_mmbatch_end; } # if !defined(NDEBUG) else { result = EXIT_FAILURE; } # endif } } } #endif } #if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) else { libxsmm_mmbatch_item *const batcharray = (libxsmm_mmbatch_item*)libxsmm_mmbatch_array; const unsigned int max_batchsize = (unsigned int)((LIBXSMM_GEMM_MMBATCH_SCALE) * libxsmm_mmbatch_size); i = ((LIBXSMM_ATOMIC_ADD_FETCH(&internal_ext_gemm_batchsize, 1, LIBXSMM_ATOMIC_RELAXED) - 1) % max_batchsize) + 1; batcharray[i-1].value.a = a; batcharray[i-1].value.b = b; batcharray[i-1].value.c = c; LIBXSMM_ASSERT(0 <= flags); } if (libxsmm_mmbatch_size == (i - 1)) { /* condition ensure to flush once (first discovery) */ # if !defined(NDEBUG) result = # endif internal_mmbatch_flush(&libxsmm_mmbatch_desc, libxsmm_mmbatch_size, (libxsmm_mmbatch_item*)libxsmm_mmbatch_array); } # if !defined(NDEBUG) /* 
library code is expected to be mute */ if (EXIT_SUCCESS != result && 0 != libxsmm_verbosity && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: DGEMM batch recording failed!\n"); } # endif #endif } } LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void LIBXSMM_FSYMBOL(__wrap_sgemm)( const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { LIBXSMM_ASSERT(NULL != lda && NULL != ldb && NULL != ldc && NULL != m && NULL != n && NULL != k); LIBXSMM_ASSERT(NULL != transa && NULL != transb && NULL != alpha && NULL != beta); { #if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) unsigned int i = 0; /* no flush */ int flags = -1; # if !defined(NDEBUG) static int error_once = 0; int result = EXIT_SUCCESS; # endif LIBXSMM_INIT if (0 != libxsmm_gemm_wrap && (NULL == libxsmm_mmbatch_array || LIBXSMM_GEMM_PRECISION_F32 != libxsmm_mmbatch_desc.datatype || ((unsigned int)*lda) != libxsmm_mmbatch_desc.lda || ((unsigned int)*ldb) != libxsmm_mmbatch_desc.ldb || ((unsigned int)*ldc) != libxsmm_mmbatch_desc.ldc || ((unsigned int)*m) != libxsmm_mmbatch_desc.m || ((unsigned int)*n) != libxsmm_mmbatch_desc.n || ((unsigned int)*k) != libxsmm_mmbatch_desc.k || (flags = LIBXSMM_GEMM_FLAGS(*transa, *transb)) != (int)(LIBXSMM_GEMM_FLAG_TRANS_AB & libxsmm_mmbatch_desc.flags) || LIBXSMM_NEQ(/*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & libxsmm_mmbatch_desc.flags) ? 0 : */1, *alpha) || LIBXSMM_NEQ(0 != (LIBXSMM_GEMM_FLAG_BETA_0 & libxsmm_mmbatch_desc.flags) ? 0 : 1, *beta))) #endif { #if defined(_DEBUG) const char *const env_check = getenv("LIBXSMM_GEMM_CHECK"); const double check = LIBXSMM_ABS(NULL == env_check ? 
0 : atof(env_check)); void* d = NULL; if (LIBXSMM_NEQ(0, check)) { const size_t size = (size_t)(*ldc) * (size_t)(*n) * sizeof(float); d = libxsmm_scratch_malloc(size, 0/*auto*/, LIBXSMM_MALLOC_INTERNAL_CALLER); if (NULL != d && LIBXSMM_NEQ(0, *beta)) memcpy(d, c, size); /* copy destination */ } #endif if (0 != (libxsmm_gemm_wrap & 1)) { /* sequential */ libxsmm_sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } else { /* parallelized */ libxsmm_sgemm_omp(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } #if defined(_DEBUG) if (NULL != d) { libxsmm_matdiff_info diff; libxsmm_blas_sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, d, ldc); if (EXIT_SUCCESS == libxsmm_matdiff(&diff, LIBXSMM_DATATYPE_F32, *m, *n, d, c, ldc, ldc) && check < 100.0 * diff.normf_rel) { LIBXSMM_STDIO_ACQUIRE(); fprintf(stderr, "LIBXSMM: "); libxsmm_gemm_print(stderr, LIBXSMM_GEMM_PRECISION_F32, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); fprintf(stderr, " => %f%% ERROR\n", 100.0 * diff.normf_rel); LIBXSMM_STDIO_RELEASE(); } libxsmm_free(d); } #endif #if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) if (0 != (LIBXSMM_MMBATCH_FLAG_STATISTIC & libxsmm_mmbatch_desc.flags)) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const descriptor = libxsmm_sgemm_descriptor_init(&blob, *m, *n, *k, *lda, *ldb, *ldc, *alpha, *beta, LIBXSMM_GEMM_FLAGS(*transa, *transb), LIBXSMM_EXT_GEMM_MMBATCH_PREFETCH); LIBXSMM_ASSERT(0 != libxsmm_mmbatch_size); if (NULL != descriptor) { const unsigned int max_batchsize = (unsigned int)((LIBXSMM_GEMM_MMBATCH_SCALE) * libxsmm_mmbatch_size); const unsigned int batchsize = LIBXSMM_ATOMIC_LOAD(&internal_ext_gemm_batchsize, LIBXSMM_ATOMIC_RELAXED); const unsigned int max_size = (0 != batchsize ? 
(((batchsize - 1) % max_batchsize) + 1) : 0); libxsmm_mmbatch_item *const batcharray = (libxsmm_mmbatch_item*)libxsmm_mmbatch_array; libxsmm_mmbatch_item* batcharray_cur = batcharray; unsigned int size = max_size; if (libxsmm_mmbatch_size < max_size) { size = max_size - libxsmm_mmbatch_size; batcharray_cur += libxsmm_mmbatch_size; } i = libxsmm_diff_n(descriptor, batcharray_cur, sizeof(libxsmm_gemm_descriptor), sizeof(libxsmm_mmbatch_item)/*stride*/, 0/*hint*/, size); if (i < size) { /* update existing entry */ LIBXSMM_ATOMIC_ADD_FETCH(&batcharray_cur[i].stat.count, 1, LIBXSMM_ATOMIC_RELAXED); } else { /* new entry needed */ const int all = -1, shift = 0; void* extra = 0; i = ((LIBXSMM_ATOMIC_ADD_FETCH(&internal_ext_gemm_batchsize, 1, LIBXSMM_ATOMIC_RELAXED) - 1) % max_batchsize) + 1; batcharray[i-1].stat.desc = *descriptor; batcharray[i-1].stat.count = 1; batcharray[i-1].stat.symbol = libxsmm_trace_info(NULL/*depth*/, NULL/*tid*/, &all, LIBXSMM_FUNCNAME, &shift, &all); if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(libxsmm_mmbatch_array, NULL/*size*/, NULL/*flags*/, &extra)) { *(libxsmm_mmbatch_flush_function*)extra = libxsmm_mmbatch_end; } # if !defined(NDEBUG) else { result = EXIT_FAILURE; } # endif } } } #endif } #if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) else { libxsmm_mmbatch_item *const batcharray = (libxsmm_mmbatch_item*)libxsmm_mmbatch_array; const unsigned int max_batchsize = (unsigned int)((LIBXSMM_GEMM_MMBATCH_SCALE) * libxsmm_mmbatch_size); i = ((LIBXSMM_ATOMIC_ADD_FETCH(&internal_ext_gemm_batchsize, 1, LIBXSMM_ATOMIC_RELAXED) - 1) % max_batchsize) + 1; batcharray[i-1].value.a = a; batcharray[i-1].value.b = b; batcharray[i-1].value.c = c; LIBXSMM_ASSERT(0 <= flags); } if (libxsmm_mmbatch_size == (i - 1)) { /* condition ensure to flush once (first discovery) */ # if !defined(NDEBUG) result = # endif internal_mmbatch_flush(&libxsmm_mmbatch_desc, libxsmm_mmbatch_size, (libxsmm_mmbatch_item*)libxsmm_mmbatch_array); } # if !defined(NDEBUG) /* 
library code is expected to be mute */
  if (EXIT_SUCCESS != result && 0 != libxsmm_verbosity
    && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
  {
    fprintf(stderr, "LIBXSMM ERROR: SGEMM batch recording failed!\n");
  }
# endif
#endif
  }
}


/* DGEMV wrapper: for unit strides and a size below the SMM threshold, dispatches a
 * JIT GEMM kernel with n=1 (y = alpha*op(A)*x + beta*y); otherwise, or when no
 * kernel is available, forwards to the original BLAS dgemv symbol. */
LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void LIBXSMM_FSYMBOL(__wrap_dgemv)(const char* trans,
  const libxsmm_blasint* m, const libxsmm_blasint* n,
  const double* alpha, const double* a, const libxsmm_blasint* lda,
  const double* x, const libxsmm_blasint* incx,
  const double* beta, double* y, const libxsmm_blasint* incy)
{
  LIBXSMM_ASSERT(NULL != trans && NULL != m && NULL != n && NULL != lda && NULL != incx && NULL != incy && NULL != alpha && NULL != beta);
  LIBXSMM_INIT
  if ((2 < libxsmm_gemm_wrap || 2 > libxsmm_gemm_wrap) && 1 == *incx && 1 == *incy
    && LIBXSMM_SMM(*m, 1, *n, 2/*RFO*/, sizeof(double)))
  {
    if (0 != (libxsmm_gemm_wrap & 1)) { /* sequential */
      const int flags = LIBXSMM_GEMM_FLAGS(*trans, 'N');
      const libxsmm_dmmfunction xgemv = libxsmm_dmmdispatch(*m, 1, *n, lda, n/*ldb*/, m/*ldc*/, alpha, beta, &flags, NULL);
      if (NULL != xgemv) {
        LIBXSMM_MMCALL_LDX(xgemv, a, x, y, *m, 1, *n, *lda, *n/*ldb*/, *m/*ldc*/);
      }
      else { /* no JIT kernel: call the original BLAS symbol */
        LIBXSMM_GEMV_SYMBOL(double)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
      }
    }
    else { /* TODO: parallelized */
      LIBXSMM_GEMV_SYMBOL(double)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
    }
  }
  else {
    LIBXSMM_GEMV_SYMBOL(double)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
  }
}


/* SGEMV wrapper: single-precision counterpart of __wrap_dgemv (same dispatch logic). */
LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void LIBXSMM_FSYMBOL(__wrap_sgemv)(const char* trans,
  const libxsmm_blasint* m, const libxsmm_blasint* n,
  const float* alpha, const float* a, const libxsmm_blasint* lda,
  const float* x, const libxsmm_blasint* incx,
  const float* beta, float* y, const libxsmm_blasint* incy)
{
  LIBXSMM_ASSERT(NULL != trans && NULL != m && NULL != n && NULL != lda && NULL != incx && NULL != incy && NULL != alpha && NULL != beta);
  LIBXSMM_INIT
  if ((2 < libxsmm_gemm_wrap || 2 > libxsmm_gemm_wrap) &&
    1 == *incx && 1 == *incy && LIBXSMM_SMM(*m, 1, *n, 2/*RFO*/, sizeof(float)))
  {
    if (0 != (libxsmm_gemm_wrap & 1)) { /* sequential */
      const int flags = LIBXSMM_GEMM_FLAGS(*trans, 'N');
      const libxsmm_smmfunction xgemv = libxsmm_smmdispatch(*m, 1, *n, lda, n/*ldb*/, m/*ldc*/, alpha, beta, &flags, NULL);
      if (NULL != xgemv) {
        LIBXSMM_MMCALL_LDX(xgemv, a, x, y, *m, 1, *n, *lda, *n/*ldb*/, *m/*ldc*/);
      }
      else {
        LIBXSMM_GEMV_SYMBOL(float)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
      }
    }
    else { /* TODO: parallelized */
      LIBXSMM_GEMV_SYMBOL(float)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
    }
  }
  else {
    LIBXSMM_GEMV_SYMBOL(float)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
  }
}


/* C-callable dgemm_batch wrapper: forwards to the Fortran-symbol variant. */
LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void __wrap_dgemm_batch(
  const char transa_array[], const char transb_array[],
  const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[],
  const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[],
  const double* b_array[], const libxsmm_blasint ldb_array[],
  const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[],
  const libxsmm_blasint* group_count, const libxsmm_blasint group_size[])
{
  LIBXSMM_FSYMBOL(__wrap_dgemm_batch)(transa_array, transb_array, m_array, n_array, k_array,
    alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array,
    group_count, group_size);
}


/* C-callable sgemm_batch wrapper: forwards to the Fortran-symbol variant. */
LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void __wrap_sgemm_batch(
  const char transa_array[], const char transb_array[],
  const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[],
  const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[],
  const float* b_array[], const libxsmm_blasint ldb_array[],
  const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[],
  const libxsmm_blasint* group_count, const libxsmm_blasint group_size[])
{
  LIBXSMM_FSYMBOL(__wrap_sgemm_batch)(transa_array, transb_array,
m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array,
    beta_array, c_array, ldc_array, group_count, group_size);
}

#endif /*defined(LIBXSMM_BUILD) && defined(LIBXSMM_BUILD_EXT)*/


/* Parallelized (OpenMP) GEMM front-end: plans a tiled GEMM handle for the available
 * parallelism and executes it with threads or tasks; falls back to BLAS when no
 * handle can be formed or scratch memory cannot be allocated. */
LIBXSMM_APIEXT void libxsmm_xgemm_omp(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec,
  const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb,
  const void* beta, void* c, const libxsmm_blasint* ldc)
{
  libxsmm_gemm_blob blob;
#if defined(LIBXSMM_EXT_TASKS) /* implies _OPENMP */
  const int outerpar = omp_get_active_level(), nthreads = (0 == outerpar ? omp_get_max_threads() : omp_get_num_threads());
#elif defined(_OPENMP)
  const int outerpar = omp_in_parallel(), nthreads = (0 == outerpar ? omp_get_max_threads() : 1);
#else
  const int nthreads = 1;
#endif
  /* plan the tiled multiplication for the detected number of workers */
  const libxsmm_gemm_handle *const handle = libxsmm_gemm_handle_init(&blob, iprec, oprec, transa, transb,
    m, n, k, lda, ldb, ldc, alpha, beta, LIBXSMM_GEMM_HANDLE_FLAG_AUTO, nthreads);
  const size_t scratch_size = libxsmm_gemm_handle_get_scratch_size(handle);
  void* scratch = NULL;
  if (NULL != handle && (0 == scratch_size
    || NULL != (scratch = libxsmm_scratch_malloc(scratch_size, LIBXSMM_CACHELINE, LIBXSMM_MALLOC_INTERNAL_CALLER))))
  {
#if defined(_OPENMP)
    if (0 == outerpar) { /* enable internal parallelization */
# if defined(LIBXSMM_EXT_TASKS)
      if (0 == libxsmm_gemm_tasks)
# endif
      {
# pragma omp parallel num_threads(nthreads)
        libxsmm_gemm_thread(handle, scratch, a, b, c, omp_get_thread_num(), nthreads);
      }
# if defined(LIBXSMM_EXT_TASKS)
      else { /* tasks requested */
        const int ntasks = nthreads; /* TODO: apply grain-size */
# pragma omp parallel num_threads(nthreads)
        { /* first thread discovering work will launch all tasks */
# pragma omp single nowait /* anyone is good */
          {
            int tid;
            for (tid = 0; tid < ntasks; ++tid) {
# pragma omp task untied
              libxsmm_gemm_thread(handle, scratch, a, b, c, tid, ntasks);
            }
          }
        } /* implicit synchronization (barrier) */
      }
# endif
    }
    else { /* assume external parallelization */
# if defined(LIBXSMM_EXT_TASKS) /* implies _OPENMP */
      const int ntasks = nthreads; /* TODO: apply grain-size */
      int tid;
      for (tid = 0; tid < ntasks; ++tid) {
# pragma omp task untied
        libxsmm_gemm_thread(handle, scratch, a, b, c, tid, ntasks);
      }
      if (0 == libxsmm_nosync) { /* allow to omit synchronization */
# pragma omp taskwait
      }
# else
      libxsmm_gemm_thread(handle, scratch, a, b, c, 0/*tid*/, 1/*nthreads*/);
# endif
    }
    if (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) { /* library code is expected to be mute */
      /* report (once per growing level) when the tile-decomposition underutilizes the workers */
      const unsigned int ntasks = handle->mt * handle->nt * handle->kt;
      const double imbalance = 100.0 * LIBXSMM_DELTA((unsigned int)nthreads, ntasks) / nthreads;
      static double max_imbalance = 50.0;
      if (max_imbalance < imbalance) {
        fprintf(stderr, "LIBXSMM WARNING (XGEMM): %.0f%% imbalance (%u of %i workers utilized)!\n",
          imbalance, ntasks, nthreads);
        max_imbalance = imbalance;
      }
    }
#else
    libxsmm_gemm_thread(handle, scratch, a, b, c, 0/*tid*/, 1/*nthreads*/);
#endif /*defined(_OPENMP)*/
    libxsmm_free(scratch);
  }
  else { /* fall-back or error */
    static int error_once = 0;
    if (NULL == handle) { /* fall-back */
      if ((LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */
        && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
      {
        fprintf(stderr, "LIBXSMM WARNING (XGEMM): fall-back code path triggered!\n");
      }
    }
    else if (0 != libxsmm_verbosity && /* library code is expected to be mute */
      1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
    {
      fprintf(stderr, "LIBXSMM ERROR: failed to allocate GEMM-scratch memory!\n");
    }
    libxsmm_blas_xgemm(iprec, oprec, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
  }
}


/* Core of the grouped/batched GEMM (OpenMP): used by libxsmm_gemm_batch_omp and the
 * typed *gemm_batch_omp entry points below. */
LIBXSMM_API_INLINE void internal_gemm_batch_omp(libxsmm_gemm_precision iprec,
libxsmm_gemm_precision oprec, const char transa[], const char transb[],
  const libxsmm_blasint m[], const libxsmm_blasint n[], const libxsmm_blasint k[],
  const void* alpha, const void* a[], const libxsmm_blasint lda[],
  const void* b[], const libxsmm_blasint ldb[],
  const void* beta, void* c[], const libxsmm_blasint ldc[],
  libxsmm_blasint index_base, libxsmm_blasint index_stride,
  const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[],
  const libxsmm_blasint batchsize[], libxsmm_blasint group_count)
{
  static int error_once = 0;
  LIBXSMM_INIT
  if ( /* check for sensible arguments */
#if defined(LIBXSMM_GEMM_CHECK)
    NULL != a && NULL != b && NULL != c
    && (1 == group_count || -1 == group_count
      || (0 == index_stride
        && (NULL == stride_a || 0 != *stride_a)
        && (NULL == stride_b || 0 != *stride_b)
        && (NULL == stride_c || 0 != *stride_c))) &&
#endif
    0 != group_count)
  {
    int result = EXIT_SUCCESS;
    /* number of groups processed together is capped by the configured limit */
    const int max_npargroups = (int)(0 < libxsmm_gemm_npargroups
      ? LIBXSMM_MIN(libxsmm_gemm_npargroups, LIBXSMM_GEMM_NPARGROUPS)
      : LIBXSMM_GEMM_NPARGROUPS);
    const libxsmm_gemm_prefetch_type prefetch = libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
    /* strides default to pointer-size when not given (array-of-pointers layout) */
    const size_t sa = (NULL != stride_a ? (size_t)(*stride_a) : sizeof(void*));
    const size_t sb = (NULL != stride_b ? (size_t)(*stride_b) : sizeof(void*));
    const size_t sc = (NULL != stride_c ?
      (size_t)(*stride_c) : sizeof(void*));
    const unsigned char otypesize = libxsmm_typesize((libxsmm_datatype)oprec);
    const int ngroups = (int)LIBXSMM_ABS(group_count);
    int group = 0, group_next = LIBXSMM_GEMM_NPARGROUPS;
    libxsmm_code_pointer kernel[LIBXSMM_GEMM_NPARGROUPS];
    libxsmm_blasint base[LIBXSMM_GEMM_NPARGROUPS], i;
#if !defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO)
    int kflags[LIBXSMM_GEMM_NPARGROUPS];
#endif
    int max_nthreads = 1;
#if defined(_OPENMP)
# if defined(LIBXSMM_EXT_TASKS)
    const int outerpar = omp_get_active_level();
# else
    const int outerpar = omp_in_parallel();
# endif
    if (0 == outerpar) max_nthreads = omp_get_max_threads();
#endif
    for (i = 0; i < max_npargroups; ++i) {
#if !defined(NDEBUG)
      kernel[i].ptr = NULL;
# if !defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO)
      kflags[i] = 0;
# endif
#endif
      base[i] = 0;
    }
    /* process the groups in chunks of up to max_npargroups */
    for (group = 0; group < ngroups; group = group_next, group_next += max_npargroups) {
      const int npargroups = LIBXSMM_MIN(group_next, ngroups);
      libxsmm_blasint size = 0;
      int suitable = 0;
      if (0 < group) { /* base is maintained even if par-group is not suitable */
        for (i = 0; i < npargroups; ++i) {
          const libxsmm_blasint isize = batchsize[group+i-1], asize = LIBXSMM_ABS(isize);
          base[i] += asize;
        }
      }
      /* dispatch one kernel per group of the current chunk */
      for (i = 0; i < npargroups; ++i) {
        const libxsmm_blasint g = group + i, im = m[g], in = n[g], ik = k[g];
        suitable = LIBXSMM_SMM_AI(im, in, ik, 2/*RFO*/, otypesize);
        if (0 != suitable) {
          const libxsmm_blasint isize = batchsize[g], asize = LIBXSMM_ABS(isize);
          const char *const ta = (NULL != transa ? (transa + g) : NULL);
          const char *const tb = (NULL != transb ? (transb + g) : NULL);
          const int flags = LIBXSMM_GEMM_PFLAGS(ta, tb, LIBXSMM_FLAGS);
          const void **const galpha = &alpha, **const gbeta = &beta;
          libxsmm_descriptor_blob blob;
          /* coverity[ptr_arith] */
          libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, iprec, oprec, im, in, ik,
            NULL != lda ? lda[g] : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & flags) ? im : ik),
            NULL != ldb ?
ldb[g] : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & flags) ? ik : in),
            NULL != ldc ? ldc[g] : im,
            NULL != alpha ? galpha[g] : NULL,
            NULL != beta ? gbeta[g] : NULL,
            flags, prefetch);
          if (NULL != desc) {
            /* mark batch-kind and parallelism on the descriptor, then JIT-dispatch */
            libxsmm_gemm_internal_set_batchflag(desc, c, index_stride, 0 < group_count ? isize : -asize, 1 != max_nthreads);
            kernel[i].xgemm = libxsmm_xmmdispatch(desc);
          }
          else kernel[i].ptr = NULL;
          if (NULL != kernel[i].ptr_const) { /* kernel available for this group */
            if (size < asize) size = asize;
#if !defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO)
            LIBXSMM_ASSERT(NULL != desc);
            /* coverity[var_deref_op] */
            kflags[i] = desc->flags;
#endif
          }
          else { /* no kernel: the whole chunk of groups takes the fall-back */
            suitable = 0;
            break;
          }
        }
        else break;
      }
      if (0 != suitable) { /* check if an SMM is suitable */
        const unsigned char itypesize = libxsmm_typesize((libxsmm_datatype)iprec);
#if defined(_OPENMP)
        const int nchunks = (int)LIBXSMM_UPDIV(size, libxsmm_gemm_taskgrain);
        const int ntasks = nchunks * npargroups, nthreads = LIBXSMM_MIN(max_nthreads, ntasks);
        if (1 < nthreads) {
          if (0 == outerpar) { /* enable internal parallelization */
# if defined(LIBXSMM_EXT_TASKS)
            if (0 == libxsmm_gemm_tasks)
# endif
            {
# pragma omp parallel for num_threads(nthreads) private(i)
              for (i = 0; i < ntasks; ++i) {
                /* map the flat task-index onto (group u, chunk-offset v) */
                const libxsmm_blasint j = i * libxsmm_gemm_taskgrain, u = j / size, v = j - u * size, g = group + u;
                const libxsmm_blasint isize = batchsize[g], asize = LIBXSMM_ABS(isize);
                if (v < asize) {
#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO)
                  libxsmm_mmkernel_info kernel_info;
#endif
                  /*check*/libxsmm_mmbatch_kernel(kernel[g].xgemm, index_base, index_stride, stride_a, stride_b, stride_c,
                    (const char*)a + sa * base[u], (const char*)b + sb * base[u], (char*)c + sc * base[u],
                    0 < group_count ? isize : -asize, (int)i, nchunks, itypesize, otypesize,
#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO)
                    EXIT_SUCCESS == libxsmm_get_mmkernel_info(kernel[g].xgemm, &kernel_info) ?
                      kernel_info.flags : 0);
#else
                    kflags[g]);
#endif
                }
              }
            }
# if defined(LIBXSMM_EXT_TASKS)
            else { /* tasks requested */
# pragma omp parallel num_threads(nthreads) private(i)
              { /* first thread discovering work will launch all tasks */
# pragma omp single nowait /* anyone is good */
                for (i = 0; i < ntasks; ++i) {
                  const libxsmm_blasint j = i * libxsmm_gemm_taskgrain, u = j / size, v = j - u * size, g = group + u;
                  const libxsmm_blasint isize = batchsize[g], asize = LIBXSMM_ABS(isize);
                  if (v < asize) {
# pragma omp task
                    {
#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO)
                      libxsmm_mmkernel_info kernel_info;
#endif
                      /*check*/libxsmm_mmbatch_kernel(kernel[g].xgemm, index_base, index_stride, stride_a, stride_b, stride_c,
                        (const char*)a + sa * base[u], (const char*)b + sb * base[u], (char*)c + sc * base[u],
                        0 < group_count ? isize : -asize, (int)i, nchunks, itypesize, otypesize,
#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO)
                        EXIT_SUCCESS == libxsmm_get_mmkernel_info(kernel[g].xgemm, &kernel_info) ?
                          kernel_info.flags : 0);
#else
                        kflags[g]);
#endif
                    }
                  }
                }
              } /* implicit synchronization (barrier) */
            }
# endif
          }
          else { /* assume external parallelization */
            for (i = 0; i < (libxsmm_blasint)ntasks; ++i) {
              const libxsmm_blasint j = i * libxsmm_gemm_taskgrain, u = j / size, v = j - u * size, g = group + u;
              const libxsmm_blasint isize = batchsize[g], asize = LIBXSMM_ABS(isize);
              if (v < asize) {
# if defined(LIBXSMM_EXT_TASKS) /* OpenMP-tasks */
# pragma omp task
#endif
                {
#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO)
                  libxsmm_mmkernel_info kernel_info;
#endif
                  /*check*/libxsmm_mmbatch_kernel(kernel[g].xgemm, index_base, index_stride, stride_a, stride_b, stride_c,
                    (const char*)a + sa * base[u], (const char*)b + sb * base[u], (char*)c + sc * base[u],
                    0 < group_count ? isize : -asize, (int)i, nchunks, itypesize, otypesize,
#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO)
                    EXIT_SUCCESS == libxsmm_get_mmkernel_info(kernel[g].xgemm, &kernel_info) ?
                      kernel_info.flags : 0);
#else
                    kflags[g]);
#endif
                }
              }
            }
# if defined(LIBXSMM_EXT_TASKS) /* OpenMP-tasks */
            if (0 == libxsmm_nosync) { /* allow to omit synchronization */
# pragma omp taskwait
            }
# endif
          }
        }
        else
#endif /*defined(_OPENMP)*/
        { /* sequential */
          for (i = 0; i < npargroups; ++i) {
            const libxsmm_blasint g = group + i;
#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO)
            libxsmm_mmkernel_info kernel_info;
#endif
            libxsmm_mmbatch_kernel(kernel[i].xgemm, index_base, index_stride, stride_a, stride_b, stride_c,
              (const char*)a + sa * base[i], (const char*)b + sb * base[i], (char*)c + sc * base[i],
              batchsize[g], 0/*tid*/, 1/*nthreads*/, itypesize, otypesize,
#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO)
              EXIT_SUCCESS == libxsmm_get_mmkernel_info(kernel[i].xgemm, &kernel_info) ?
                kernel_info.flags : 0);
#else
              kflags[i]);
#endif
          }
        }
      }
      else { /* trigger fall-back */
        result = EXIT_FAILURE;
      }
      if (EXIT_SUCCESS != result) { /* execute the groups of this chunk via BLAS */
        for (i = 0; i < npargroups; ++i) {
          const libxsmm_blasint g = group + i;
          const char *const ta = (NULL != transa ? (transa + g) : NULL);
          const char *const tb = (NULL != transb ? (transb + g) : NULL);
          const int flags = LIBXSMM_GEMM_PFLAGS(ta, tb, LIBXSMM_FLAGS);
          const libxsmm_blasint im = m[g], in = n[g], ik = k[g];
          const libxsmm_blasint ilda = (NULL != lda ? lda[g] : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & flags) ? im : ik));
          const libxsmm_blasint ildb = (NULL != ldb ? ldb[g] : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & flags) ? ik : in));
          const libxsmm_blasint ildc = (NULL != ldc ? ldc[g] : im);
          /* NOTE(review): galpha[g]/gbeta[g] index past the single local pointer for g > 0;
           * the coverity[overrun-local] suppressions below indicate this is known — verify
           * the multi-group (ngroups > 1) path against the callers. */
          const void **const galpha = &alpha, **const gbeta = &beta;
          /* coverity[overrun-local] */
          const void *const ialpha = (NULL != alpha ? galpha[g] : NULL);
          /* coverity[overrun-local] */
          const void *const ibeta = (NULL != beta ?
gbeta[g] : NULL);
          if (EXIT_SUCCESS == libxsmm_mmbatch_blas(iprec, oprec, ta, tb, im, in, ik, ialpha,
            (const char*)a + sa * base[i], &ilda, (const char*)b + sb * base[i], &ildb,
            ibeta, (char*)c + sc * base[i], &ildc,
            index_base, index_stride, stride_a, stride_b, stride_c, batchsize[g]))
          {
            if (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) {
              /* warn only once per growing problem-size threshold */
              const size_t threshold = LIBXSMM_MNK_SIZE(im, in, im);
              static size_t threshold_max = 0;
              if (threshold_max < threshold) {
                LIBXSMM_STDIO_ACQUIRE();
                fprintf(stderr, "LIBXSMM WARNING: ");
                libxsmm_gemm_print2(stderr, iprec, oprec, ta, tb, &im, &in, &ik,
                  ialpha, NULL/*a*/, &ilda, NULL/*b*/, &ildb, ibeta, NULL/*c*/, &ildc);
                fprintf(stderr, " => batched GEMM/omp was falling back to BLAS!\n");
                LIBXSMM_STDIO_RELEASE();
                threshold_max = threshold;
              }
            }
          }
          else {
            if (0 != libxsmm_verbosity /* library code is expected to be mute */
              && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
            {
              fprintf(stderr, "LIBXSMM ERROR: libxsmm_gemm_batch_omp failed!\n");
            }
            return; /* exit routine */
          }
        }
      }
    }
  }
#if defined(LIBXSMM_GEMM_CHECK)
  else if (0 != group_count && 0 != libxsmm_verbosity /* library code is expected to be mute */
    && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
  {
    fprintf(stderr, "LIBXSMM ERROR: incorrect arguments (libxsmm_gemm_batch_omp)!\n");
  }
#endif
}


/* OpenMP-parallelized batched GEMM (single group): forwards to internal_gemm_batch_omp. */
LIBXSMM_APIEXT void libxsmm_gemm_batch_omp(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec,
  const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb,
  const void* beta, void* c, const libxsmm_blasint* ldc,
  libxsmm_blasint index_base, libxsmm_blasint index_stride,
  const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[],
  libxsmm_blasint batchsize)
{
  internal_gemm_batch_omp(iprec, oprec, transa, transb, &m, &n, &k,
    alpha, (const void**)a, lda, (const void**)b, ldb, beta, (void**)c, ldc,
    index_base, index_stride, stride_a, stride_b, stride_c, &batchsize, 1);
}


/* OpenMP-parallelized grouped DGEMM batch (gemm_batch-style signature). */
LIBXSMM_APIEXT void libxsmm_dgemm_batch_omp(
  const char transa_array[], const char transb_array[],
  const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[],
  const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[],
  const double* b_array[], const libxsmm_blasint ldb_array[],
  const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[],
  const libxsmm_blasint* group_count, const libxsmm_blasint group_size[])
{
  if (NULL != group_count) {
    const libxsmm_blasint ptrsize = sizeof(void*);
    internal_gemm_batch_omp(LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64, transa_array, transb_array,
      m_array, n_array, k_array, alpha_array, (const void**)a_array, lda_array,
      (const void**)b_array, ldb_array, beta_array, (void**)c_array, ldc_array,
      0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, group_size, *group_count);
  }
}


/* OpenMP-parallelized grouped SGEMM batch (gemm_batch-style signature). */
LIBXSMM_APIEXT void libxsmm_sgemm_batch_omp(
  const char transa_array[], const char transb_array[],
  const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[],
  const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[],
  const float* b_array[], const libxsmm_blasint ldb_array[],
  const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[],
  const libxsmm_blasint* group_count, const libxsmm_blasint group_size[])
{
  if (NULL != group_count) {
    const libxsmm_blasint ptrsize = sizeof(void*);
    internal_gemm_batch_omp(LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32, transa_array, transb_array,
      m_array, n_array, k_array, alpha_array, (const void**)a_array, lda_array,
      (const void**)b_array, ldb_array, beta_array, (void**)c_array, ldc_array,
      0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, group_size, *group_count);
  }
}


LIBXSMM_APIEXT
void libxsmm_mmbatch_begin(libxsmm_gemm_precision precision, const int* flags,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const void* alpha, const void* beta)
{
#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT)
# if defined(_MSC_VER)
# pragma warning(push)
# pragma warning(disable: 26115) /* try-lock is treated incorrectly by static analysis */
# endif
  LIBXSMM_INIT
  if (NULL != libxsmm_mmbatch_array /* batch-recording available, but not yet running */
    /* currently, batch recording is only enabled if all values are present (no complex filtering) */
    && NULL != flags && NULL != alpha && NULL != beta
    && NULL != lda && NULL != ldb && NULL != ldc
    && NULL != m && NULL != n && NULL != k
    && LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_DEFAULT) == LIBXSMM_LOCK_TRYLOCK(LIBXSMM_LOCK_DEFAULT, &libxsmm_mmbatch_lock))
  {
    libxsmm_descriptor_blob blob;
    const libxsmm_gemm_descriptor *const descriptor = libxsmm_gemm_descriptor_init(&blob,
      precision, *m, *n, *k, *lda, *ldb, *ldc, alpha, beta, *flags,
      libxsmm_get_gemm_prefetch(LIBXSMM_EXT_GEMM_MMBATCH_PREFETCH));
    static int error_once = 0;
    int result = EXIT_SUCCESS;
    if (NULL != descriptor) {
      const unsigned int max_batchsize = (unsigned int)((LIBXSMM_GEMM_MMBATCH_SCALE) * libxsmm_mmbatch_size);
      unsigned int i;
#if !defined(NDEBUG)
      const unsigned int mmbatch_maxdepth = LIBXSMM_UP2POT(LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH);
      LIBXSMM_ASSERT((LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH) == mmbatch_maxdepth/*is pot*/);
#endif
      /* eventually overwrite the oldest entry */
      i = LIBXSMM_MOD2(internal_ext_gemm_batchdepth, LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH);
      internal_ext_gemm_batchdesc[i] = libxsmm_mmbatch_desc; /* backup */
      ++internal_ext_gemm_batchdepth;
      /* ensure descriptor does not match any GEMM such that...
 */
      LIBXSMM_MEMZERO127(&libxsmm_mmbatch_desc); /* ...the batch stops and completely flushes */
      if (0 != internal_ext_gemm_batchsize) { /* flush anything recorded so far */
        result = internal_mmbatch_flush(internal_ext_gemm_batchdesc + i,
          (((libxsmm_blasint)internal_ext_gemm_batchsize - 1) % max_batchsize) + 1,
          (libxsmm_mmbatch_item*)libxsmm_mmbatch_array);
      }
      if (EXIT_SUCCESS == result) { /* enable descriptor */
        internal_ext_gemm_batchsize = 0; /* reset */
        if (0 == (LIBXSMM_MMBATCH_FLAG_STATISTIC & *flags)) {
          libxsmm_mmbatch_desc = *descriptor;
        }
        else {
          libxsmm_mmbatch_desc.flags = LIBXSMM_MMBATCH_FLAG_STATISTIC;
        }
      }
    }
    else {
      result = EXIT_FAILURE;
    }
    if (EXIT_SUCCESS != result && 0 != libxsmm_verbosity /* library code is expected to be mute */
      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
    {
      fprintf(stderr, "LIBXSMM ERROR: GEMM batch enabling failed!\n");
    }
    LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK_DEFAULT, &libxsmm_mmbatch_lock);
  }
# if defined(_MSC_VER)
# pragma warning(pop)
# endif
#else
  LIBXSMM_UNUSED(precision); LIBXSMM_UNUSED(flags);
  LIBXSMM_UNUSED(m); LIBXSMM_UNUSED(n); LIBXSMM_UNUSED(k);
  LIBXSMM_UNUSED(lda); LIBXSMM_UNUSED(ldb); LIBXSMM_UNUSED(ldc);
  LIBXSMM_UNUSED(alpha); LIBXSMM_UNUSED(beta);
#endif
}


/* Ends batch-recording: flushes pending records and restores the previously saved
 * recording descriptor (stack of depth LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH). */
LIBXSMM_APIEXT void libxsmm_mmbatch_end(void)
{
#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT)
# if defined(_MSC_VER)
# pragma warning(push)
# pragma warning(disable: 26115) /* try-lock is treated incorrectly by static analysis */
# endif
  /*const*/ int trystate = LIBXSMM_LOCK_TRYLOCK(LIBXSMM_LOCK_DEFAULT, &libxsmm_mmbatch_lock);
  if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_DEFAULT) == trystate) {
    const unsigned int max_batchsize = (unsigned int)((LIBXSMM_GEMM_MMBATCH_SCALE) * libxsmm_mmbatch_size);
    const libxsmm_gemm_descriptor flushdesc = libxsmm_mmbatch_desc;
    static int error_once = 0;
#if !defined(NDEBUG)
    const unsigned int mmbatch_maxdepth = LIBXSMM_UP2POT(LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH);
#endif
    /* ensure descriptor does not match any GEMM such that...
*/
    LIBXSMM_MEMZERO127(&libxsmm_mmbatch_desc); /* ...the batch stops and completely flushes */
    if (EXIT_SUCCESS == internal_mmbatch_flush(&flushdesc,
      0 != internal_ext_gemm_batchsize ? (((internal_ext_gemm_batchsize - 1) % max_batchsize) + 1) : 0,
      (libxsmm_mmbatch_item*)libxsmm_mmbatch_array))
    {
      internal_ext_gemm_batchsize = 0; /* reset */
      --internal_ext_gemm_batchdepth; /* restore the previous descriptor */
      assert((LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH) == mmbatch_maxdepth/*is pot*/); /* no LIBXSMM_ASSERT! */
      libxsmm_mmbatch_desc = internal_ext_gemm_batchdesc[LIBXSMM_MOD2(internal_ext_gemm_batchdepth, LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH)];
    }
    else if (0 != libxsmm_verbosity /* library code is expected to be mute */
      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
    {
      fprintf(stderr, "LIBXSMM ERROR: GEMM batch processing failed!\n");
    }
    LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK_DEFAULT, &libxsmm_mmbatch_lock);
  }
# if defined(_MSC_VER)
# pragma warning(pop)
# endif
#endif
}


#if defined(LIBXSMM_BUILD) && defined(LIBXSMM_BUILD_EXT) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))

/* implementation provided for Fortran 77 compatibility */
LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_xgemm_omp)(const libxsmm_gemm_precision*, const libxsmm_gemm_precision*,
  const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*,
  const double*, const double*, const libxsmm_blasint*, const double*, const libxsmm_blasint*,
  const double*, double*, const libxsmm_blasint*);
LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_xgemm_omp)(const libxsmm_gemm_precision* iprec, const libxsmm_gemm_precision* oprec,
  const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb,
  const double* beta, double* c, const libxsmm_blasint* ldc)
{
  LIBXSMM_ASSERT(NULL != iprec && NULL != oprec);
  libxsmm_xgemm_omp(*iprec, *oprec, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

/* implementation provided for Fortran 77 compatibility */
LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_dgemm_omp)(const char*, const char*,
  const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*,
  const double*, const double*, const libxsmm_blasint*, const double*, const libxsmm_blasint*,
  const double*, double*, const libxsmm_blasint*);
LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_dgemm_omp)(const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb,
  const double* beta, double* c, const libxsmm_blasint* ldc)
{
  libxsmm_dgemm_omp(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

/* implementation provided for Fortran 77 compatibility */
LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_sgemm_omp)(const char*, const char*,
  const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*,
  const float*, const float*, const libxsmm_blasint*, const float*, const libxsmm_blasint*,
  const float*, float*, const libxsmm_blasint*);
LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_sgemm_omp)(const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb,
  const float* beta, float* c, const libxsmm_blasint* ldc)
{
  libxsmm_sgemm_omp(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

/* implementation provided for Fortran 77 compatibility */
LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_gemm_batch_omp)(const libxsmm_gemm_precision*, const libxsmm_gemm_precision*,
  const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*,
  const void*, const void*, const libxsmm_blasint*, const void*,
const libxsmm_blasint*, const void*, void*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint[], const libxsmm_blasint[], const libxsmm_blasint[], const libxsmm_blasint*); LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_gemm_batch_omp)(const libxsmm_gemm_precision* iprec, const libxsmm_gemm_precision* oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, const libxsmm_blasint* index_base, const libxsmm_blasint* index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], const libxsmm_blasint* batchsize) { LIBXSMM_ASSERT(NULL != iprec && NULL != oprec && NULL != m && NULL != n && NULL != k); LIBXSMM_ASSERT(NULL != index_base && NULL != index_stride && NULL != batchsize); libxsmm_gemm_batch_omp(*iprec, *oprec, transa, transb, *m, *n, *k, alpha, a, lda, b, ldb, beta, c, ldc, *index_base, *index_stride, stride_a, stride_b, stride_c, *batchsize); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_mmbatch_begin)(const libxsmm_gemm_precision*, const int*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const void*, const void*); LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_mmbatch_begin)(const libxsmm_gemm_precision* precision, const int* flags, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const void* alpha, const void* beta) { LIBXSMM_ASSERT(NULL != precision); libxsmm_mmbatch_begin(*precision, flags, m, n, k, lda, ldb, ldc, alpha, beta); } /* implementation provided for Fortran 77 
compatibility */ LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_mmbatch_end)(void); LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_mmbatch_end)(void) { libxsmm_mmbatch_end(); } #endif /*defined(LIBXSMM_BUILD) && defined(LIBXSMM_BUILD_EXT) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ libxsmm-1.17/src/libxsmm_ext_xcopy.c000066400000000000000000000356141415223013700176340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include "libxsmm_xcopy.h" #include "libxsmm_ext.h" #define LIBXSMM_MCOPY_MT(MT, NT, M, N) ((MT) <= (M) && (NT) <= (N) && (64U * 64U) <= (((unsigned int)(M)) * (N))) LIBXSMM_APIEXT void libxsmm_matcopy_omp(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) { LIBXSMM_INIT if (0 < typesize && 256 > typesize && m <= ldi && m <= ldo && out != in && ((NULL != out && 0 < m && 0 < n) || (0 == m && 0 == n))) { if (0 < m && 0 < n) { #if defined(_OPENMP) # if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) && !defined(LIBXSMM_XCOPY_MELTW) int prefetch = 0; # endif unsigned int tm, tn, ts; if (NULL != in) { /* mcopy */ # if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) && !defined(LIBXSMM_XCOPY_MELTW) prefetch = libxsmm_mcopy_prefetch; # endif tm = LIBXSMM_UPDIV(libxsmm_mcopy_mbytes, typesize); tn = (unsigned int)(libxsmm_mcopy_nscale * tm); ts = libxsmm_mcopy_mbytes; } else { /* mzero */ tm = LIBXSMM_UPDIV(libxsmm_mzero_mbytes, typesize); tn = 
(unsigned int)(libxsmm_mzero_nscale * tm); ts = libxsmm_mzero_mbytes; } if (0 == tm) tm = m; if (0 == tn) tn = LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n); if (0 != ts && ts < (tm * tn * typesize)) { tm = LIBXSMM_MAX(ts / (tn * typesize), LIBXSMM_XCOPY_TILE_MIN); } if (LIBXSMM_MCOPY_MT(tm, tn, (unsigned int)m, (unsigned int)n)) { /* consider problem-size */ libxsmm_xcopykernel kernel; kernel.ptr = NULL; # if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) if (0 != (2 & libxsmm_xcopy_jit)) { /* JIT'ted matrix-copy permitted? */ # if defined(LIBXSMM_XCOPY_MELTW) const libxsmm_blasint sldi = ldi * typesize, sldo = ldo * typesize; if (NULL != in) { /* mcopy */ kernel.meltw_copy = libxsmm_dispatch_meltw_copy( (libxsmm_blasint)tm * typesize, (libxsmm_blasint)tn * typesize, &sldi, &sldo, LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8); } else { /* mzero */ kernel.meltw_zero = libxsmm_dispatch_meltw_zero( (libxsmm_blasint)tm * typesize, (libxsmm_blasint)tn * typesize, &sldi, &sldo, LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8); } # else const libxsmm_mcopy_descriptor* desc; libxsmm_descriptor_blob blob; if (NULL != (desc = libxsmm_mcopy_descriptor_init(&blob, typesize, tm, tn, (unsigned int)ldo, (unsigned int)ldi, NULL != in ? 
0 : LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE, prefetch, NULL/*default unroll*/))) { kernel.xmcopy = libxsmm_dispatch_mcopy(desc); } # endif } # endif # if defined(LIBXSMM_EXT_TASKS) && 0/* implies _OPENMP */ if (0 == omp_get_active_level()) # else if (0 == omp_in_parallel()) # endif { /* enable internal parallelization */ const int nthreads = omp_get_max_threads(); # if defined(LIBXSMM_EXT_TASKS) if (0 >= libxsmm_xcopy_taskscale) # endif { # pragma omp parallel num_threads(nthreads) # if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) libxsmm_matcopy_thread_internal(out, in, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, tm, tn, kernel, omp_get_thread_num(), nthreads); #else libxsmm_matcopy_thread_internal(out, in, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, tm, tn, kernel, omp_get_thread_num(), nthreads); #endif } # if defined(LIBXSMM_EXT_TASKS) else { /* tasks requested */ const int ntasks = nthreads * libxsmm_xcopy_taskscale; # pragma omp parallel num_threads(nthreads) { /* first thread discovering work will launch all tasks */ # pragma omp single nowait /* anyone is good */ { int tid; for (tid = 0; tid < ntasks; ++tid) { # pragma omp task untied libxsmm_matcopy_thread_internal(out, in, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, tm, tn, kernel, tid, ntasks); } } } } # endif } else { /* assume external parallelization */ # if defined(LIBXSMM_EXT_TASKS) /* implies _OPENMP */ const int nthreads = omp_get_num_threads(); const int ntasks = (0 == libxsmm_xcopy_taskscale ? 
(LIBXSMM_XCOPY_TASKSCALE) : libxsmm_xcopy_taskscale) * nthreads; int tid; for (tid = 0; tid < ntasks; ++tid) { # pragma omp task untied libxsmm_matcopy_thread_internal(out, in, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, tm, tn, kernel, tid, ntasks); } if (0 == libxsmm_nosync) { /* allow to omit synchronization */ # pragma omp taskwait } # elif (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) libxsmm_matcopy_thread_internal(out, in, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, tm, tn, kernel, 0/*tid*/, 1/*nthreads*/); # else libxsmm_matcopy_thread_internal(out, in, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, tm, tn, kernel, 0/*tid*/, 1/*nthreads*/); # endif } } else #endif /*defined(_OPENMP)*/ if (NULL != in) { /* no MT, or small problem-size */ LIBXSMM_XCOPY_NONJIT(LIBXSMM_MCOPY_KERNEL, typesize, out, in, ldi, ldo, 0, m, 0, n); } else { /* no MT, or small problem-size */ /* coverity[ptr_arith] */ LIBXSMM_XCOPY_NONJIT(LIBXSMM_MZERO_KERNEL, typesize, out, in, ldi, ldo, 0, m, 0, n); } } } else { static int error_once = 0; if ( 0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { if (NULL == out) { fprintf(stderr, "LIBXSMM ERROR: the matrix-copy input and/or output is NULL!\n"); } else if (out == in) { fprintf(stderr, "LIBXSMM ERROR: output and input of the matrix-copy must be different!\n"); } else if (0 == typesize || 256 <= typesize) { fprintf(stderr, "LIBXSMM ERROR: invalid type-size for matrix-copy specified!\n"); } else if (ldi < m || ldo < m) { fprintf(stderr, "LIBXSMM ERROR: the leading dimension(s) of the matrix-copy is/are too small!\n"); } else if (0 > m || 0 > n) { fprintf(stderr, "LIBXSMM ERROR: the matrix extent(s) of the matrix-copy is/are negative!\n"); } } } } LIBXSMM_APIEXT void libxsmm_otrans_omp(void* out, const void* in, unsigned int 
typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) { static int error_once = 0; LIBXSMM_INIT if (0 < typesize && 256 > typesize && m <= ldi && n <= ldo && ((NULL != out && NULL != in && 0 < m && 0 < n) || (0 == m && 0 == n))) { if (0 < m && 0 < n) { if (out != in) { #if defined(_OPENMP) unsigned int tm = LIBXSMM_UPDIV(libxsmm_tcopy_mbytes, typesize); unsigned int tn = (unsigned int)(libxsmm_tcopy_nscale * tm); if (0 == tm) tm = m; if (0 == tn) tn = LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n); if (0 != libxsmm_tcopy_mbytes && libxsmm_tcopy_mbytes < (tm * tn * typesize)) { tm = LIBXSMM_MAX(libxsmm_tcopy_mbytes / (tn * typesize), LIBXSMM_XCOPY_TILE_MIN); } if (tm <= (unsigned int)m && tn <= (unsigned int)n) { /* consider problem-size */ libxsmm_xcopykernel kernel; kernel.ptr = NULL; # if defined(LIBXSMM_EXT_TASKS) /* implies _OPENMP */ if (0 == omp_get_active_level()) # else if (0 == omp_in_parallel()) # endif { /* enable internal parallelization */ const int nthreads = omp_get_max_threads(); # if defined(LIBXSMM_EXT_TASKS) if (0 >= libxsmm_xcopy_taskscale) # endif { # pragma omp parallel num_threads(nthreads) { /* coverity[divide_by_zero] */ libxsmm_otrans_thread_internal(out, in, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, tm, tn, kernel, omp_get_thread_num(), nthreads); } } # if defined(LIBXSMM_EXT_TASKS) else { /* tasks requested */ const int ntasks = nthreads * libxsmm_xcopy_taskscale; # pragma omp parallel num_threads(nthreads) { /* first thread discovering work will launch all tasks */ # pragma omp single nowait /* anyone is good */ { int tid; for (tid = 0; tid < ntasks; ++tid) { # pragma omp task untied libxsmm_otrans_thread_internal(out, in, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, tm, tn, kernel, tid, ntasks); } } } } # endif } else { /* assume external parallelization */ # if defined(LIBXSMM_EXT_TASKS) /* implies _OPENMP */ const int nthreads = 
omp_get_num_threads(); const int ntasks = (0 == libxsmm_xcopy_taskscale ? (LIBXSMM_XCOPY_TASKSCALE) : libxsmm_xcopy_taskscale) * nthreads; int tid; for (tid = 0; tid < ntasks; ++tid) { # pragma omp task untied libxsmm_otrans_thread_internal(out, in, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, tm, tn, kernel, tid, ntasks); } if (0 == libxsmm_nosync) { /* allow to omit synchronization */ # pragma omp taskwait } # else /* coverity[divide_by_zero] */ libxsmm_otrans_thread_internal(out, in, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, tm, tn, kernel, 0/*tid*/, 1/*nthreads*/); # endif } } else #endif /*defined(_OPENMP)*/ { /* no MT, or small problem-size */ #if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) libxsmm_xcopykernel kernel; const libxsmm_trans_descriptor* desc; libxsmm_descriptor_blob blob; kernel.ptr = NULL; if (0 != (1 & libxsmm_xcopy_jit) /* JIT'ted transpose permitted? */ && NULL != (desc = libxsmm_trans_descriptor_init(&blob, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldo)) && NULL != (kernel.xtrans = libxsmm_dispatch_trans(desc))) /* JIT-kernel available */ { LIBXSMM_TCOPY_CALL(kernel, typesize, in, ldi, out, ldo); } else #endif { LIBXSMM_XCOPY_NONJIT(LIBXSMM_TCOPY_KERNEL, typesize, out, in, ldi, ldo, 0, m, 0, n); } } } else if (ldi == ldo) { libxsmm_itrans/*TODO: omp*/(out, typesize, m, n, ldi); } else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: output and input of the transpose must be different!\n"); } } } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { if (NULL == out || NULL == in) { fprintf(stderr, "LIBXSMM ERROR: the transpose input and/or output is NULL!\n"); } else if (out == in) { fprintf(stderr, "LIBXSMM ERROR: 
output and input of the transpose must be different!\n"); } else if (0 == typesize || 256 <= typesize) { fprintf(stderr, "LIBXSMM ERROR: invalid type-size for matrix-transpose specified!\n"); } else if (ldi < m || ldo < n) { fprintf(stderr, "LIBXSMM ERROR: the leading dimension(s) of the transpose is/are too small!\n"); } else if (0 > m || 0 > n) { fprintf(stderr, "LIBXSMM ERROR: the matrix extent(s) of the transpose is/are negative!\n"); } } } } #if defined(LIBXSMM_BUILD) && defined(LIBXSMM_BUILD_EXT) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) /* implementation provided for Fortran 77 compatibility */ LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_matcopy_omp)(void* /*out*/, const void* /*in*/, const int* /*typesize*/, const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*ldi*/, const libxsmm_blasint* /*ldo*/); LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_matcopy_omp)(void* out, const void* in, const int* typesize, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo) { libxsmm_blasint ldx; LIBXSMM_ASSERT(NULL != typesize && 0 < *typesize && NULL != m); ldx = *(NULL != ldi ? ldi : m); libxsmm_matcopy_omp(out, in, (unsigned int)*typesize, *m, *(NULL != n ? n : m), ldx, NULL != ldo ? *ldo : ldx); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_otrans_omp)(void* /*out*/, const void* /*in*/, const int* /*typesize*/, const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*ldi*/, const libxsmm_blasint* /*ldo*/); LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_otrans_omp)(void* out, const void* in, const int* typesize, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo) { libxsmm_blasint ldx; LIBXSMM_ASSERT(NULL != typesize && 0 < *typesize && NULL != m); ldx = *(NULL != ldi ? 
ldi : m); libxsmm_otrans_omp(out, in, (unsigned int)*typesize, *m, *(NULL != n ? n : m), ldx, NULL != ldo ? *ldo : ldx); } #endif /*defined(LIBXSMM_BUILD) && defined(LIBXSMM_BUILD_EXT) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ libxsmm-1.17/src/libxsmm_fsspmdm.c000066400000000000000000000247341415223013700172640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "generator_spgemm_csr_asparse_reg.h" #include #include "libxsmm_main.h" LIBXSMM_API libxsmm_dfsspmdm* libxsmm_dfsspmdm_create( libxsmm_blasint M, libxsmm_blasint N, libxsmm_blasint K, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const double alpha, const double beta, libxsmm_blasint c_is_nt, const double* a_dense) { double* a_csr_values = 0; unsigned int* a_csr_rowptr = 0; unsigned int* a_csr_colidx = 0; int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const libxsmm_gemm_descriptor* xgemm_desc; libxsmm_descriptor_blob xgemm_blob; libxsmm_dfsspmdm* new_handle = 0; int i, j, a_nnz; /* some checks... 
*/ assert(N % 16 == 0); assert(N >= 16); assert(LIBXSMM_FEQ(alpha, 1.0)); assert(LIBXSMM_FEQ(beta, 1.0) || LIBXSMM_FEQ(beta, 0.0)); assert(K <= lda); assert(N <= ldc); assert(N <= ldb); /* allocate handle */ new_handle = (libxsmm_dfsspmdm*)malloc(sizeof(libxsmm_dfsspmdm)); if (0 == new_handle) return 0; /* initialize the handle */ LIBXSMM_MEMZERO127(new_handle); /* TODO: in case of ILP64, check value ranges */ new_handle->N = (int)N; new_handle->M = (int)M; new_handle->K = (int)K; new_handle->ldb = (int)ldb; new_handle->ldc = (int)ldc; /* get number of non-zeros */ a_nnz = 0; for (i = 0; i < M; ++i) { for (j = 0; j < K; j++) { if (LIBXSMM_NEQ(a_dense[(i*lda) + j], 0.0)) { a_nnz++; } } } if (0 < a_nnz) { /* allocate CSR structure */ a_csr_values = (double*)malloc((size_t)a_nnz * sizeof(double)); a_csr_rowptr = (unsigned int*)malloc(((size_t)M + 1) * sizeof(unsigned int)); a_csr_colidx = (unsigned int*)malloc((size_t)a_nnz * sizeof(unsigned int)); } /* update flags */ if ( (beta == 0.0f) && (c_is_nt != 0) ) { flags |= LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT; } if (0 != a_csr_values && 0 != a_csr_rowptr && 0 != a_csr_colidx) { int n = 0; /* populate CSR structure */ for (i = 0; i < M; i++) { a_csr_rowptr[i] = n; for (j = 0; j < K; j++) { if (LIBXSMM_NEQ(a_dense[(i*lda) + j], 0.0)) { a_csr_values[n] = a_dense[(i*lda) + j]; a_csr_colidx[n] = j; n++; } } } a_csr_rowptr[M] = a_nnz; /* attempt to JIT a sparse_reg */ new_handle->N_chunksize = 8; xgemm_desc = libxsmm_dgemm_descriptor_init(&xgemm_blob, M, new_handle->N_chunksize, K, 0, ldb, ldc, alpha, beta, flags, prefetch); if (0 != xgemm_desc) { new_handle->kernel = libxsmm_create_dcsr_reg(xgemm_desc, a_csr_rowptr, a_csr_colidx, a_csr_values); } } /* continue with sparse A */ if (new_handle->kernel != 0) { assert( a_nnz <= LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_L1_DP ); /* allocate 8 * 512-bit permute operands if not stored in registers */ if (a_nnz > LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_REG_DP) { new_handle->permute_operands = 
(unsigned int*)libxsmm_aligned_malloc(8*16*sizeof(unsigned int), 64); /* store permute operands */ for (i = 0; i < 8; i++) { j = 0; /* repeat pattern to select 64-bits using vpermd */ while (j < 16) { new_handle->permute_operands[i*16+(j)] = i*2; j++; new_handle->permute_operands[i*16+(j)] = i*2 + 1; j++; } } } /* attempt to JIT dense kernel as sparse_reg failed */ } else { new_handle->N_chunksize = 16; new_handle->kernel = libxsmm_dmmdispatch(new_handle->N_chunksize, M, K, &ldb, &K, &ldc, &alpha, &beta, &flags, (const int*)LIBXSMM_GEMM_PREFETCH_NONE); /* copy A over */ new_handle->a_dense = (double*)libxsmm_aligned_malloc((size_t)M * (size_t)K * sizeof(double), 64); for ( i = 0; i < M; ++i ) { for ( j = 0; j < K; ++j ) { new_handle->a_dense[(i*K)+j] = a_dense[(i*lda)+j]; } } } /* free CSR */ free( a_csr_values ); free( a_csr_rowptr ); free( a_csr_colidx ); return new_handle; } LIBXSMM_API libxsmm_sfsspmdm* libxsmm_sfsspmdm_create( libxsmm_blasint M, libxsmm_blasint N, libxsmm_blasint K, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const float alpha, const float beta, libxsmm_blasint c_is_nt, const float* a_dense) { float* a_csr_values = 0; unsigned int* a_csr_rowptr = 0; unsigned int* a_csr_colidx = 0; int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const libxsmm_gemm_descriptor* xgemm_desc; libxsmm_descriptor_blob xgemm_blob; libxsmm_sfsspmdm* new_handle = 0; int i, j, a_nnz; /* some checks... 
*/ assert(N % 16 == 0); assert(N >= 16); assert(LIBXSMM_FEQ(alpha, 1.0f)); assert(LIBXSMM_FEQ(beta, 1.0f) || LIBXSMM_FEQ(beta, 0.0f)); assert(K <= lda); assert(N <= ldc); assert(N <= ldb); /* allocate handle */ new_handle = (libxsmm_sfsspmdm*)malloc(sizeof(libxsmm_sfsspmdm)); if (0 == new_handle) return 0; /* initialize the handle */ LIBXSMM_MEMZERO127(new_handle); /* TODO: in case of ILP64, check value ranges */ new_handle->N = (int)N; new_handle->M = (int)M; new_handle->K = (int)K; new_handle->ldb = (int)ldb; new_handle->ldc = (int)ldc; /* get number of non-zeros */ a_nnz = 0; for (i = 0; i < M; ++i) { for (j = 0; j < K; j++) { if (LIBXSMM_NEQ(a_dense[(i*lda) + j], 0.0f)) { a_nnz++; } } } if (0 < a_nnz) { /* allocate CSR structure */ a_csr_values = (float*)malloc((size_t)a_nnz * sizeof(float)); a_csr_rowptr = (unsigned int*)malloc(((size_t)M + 1) * sizeof(unsigned int)); a_csr_colidx = (unsigned int*)malloc((size_t)a_nnz * sizeof(unsigned int)); } /* update flags */ if ( (beta == 0.0f) && (c_is_nt != 0) ) { flags |= LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT; } if (0 != a_csr_values && 0 != a_csr_rowptr && 0 != a_csr_colidx) { int n = 0; /* populate CSR structure */ for (i = 0; i < M; i++) { a_csr_rowptr[i] = n; for (j = 0; j < K; j++) { if (LIBXSMM_NEQ(a_dense[(i*lda) + j], 0.0f)) { a_csr_values[n] = a_dense[(i*lda) + j]; a_csr_colidx[n] = j; n++; } } } a_csr_rowptr[M] = a_nnz; /* attempt to JIT a sparse_reg */ new_handle->N_chunksize = 16; xgemm_desc = libxsmm_sgemm_descriptor_init(&xgemm_blob, M, new_handle->N_chunksize, K, 0, ldb, ldc, alpha, beta, flags, prefetch); if (0 != xgemm_desc) { new_handle->kernel = libxsmm_create_scsr_reg(xgemm_desc, a_csr_rowptr, a_csr_colidx, a_csr_values); } } /* continue with sparse A */ if (new_handle->kernel != 0) { assert( a_nnz <= LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_L1_SP ); /* allocate 16 * 512-bit permute operands if not stored in registers */ if (a_nnz > LIBXSMM_SPGEMM_ASPARSE_REG_MAX_UNIQUE_REG_SP) { 
new_handle->permute_operands = (unsigned int*)libxsmm_aligned_malloc(16*16*sizeof(unsigned int), 64); /* store permute operands */ for (i = 0; i < 16; i++) { j = 0; /* repeat pattern to select 32-bits using vpermd */ while (j < 16) { new_handle->permute_operands[i*16+j] = i; j++; } } } /* attempt to JIT dense kernel as sparse_reg failed */ } else { new_handle->N_chunksize = 16; new_handle->kernel = libxsmm_smmdispatch(new_handle->N_chunksize, M, K, &ldb, &K, &ldc, &alpha, &beta, &flags, (const int*)LIBXSMM_GEMM_PREFETCH_NONE); /* copy A over */ new_handle->a_dense = (float*)libxsmm_aligned_malloc((size_t)M * (size_t)K * sizeof(float), 64); for ( i = 0; i < M; ++i ) { for ( j = 0; j < K; ++j ) { new_handle->a_dense[(i*K)+j] = a_dense[(i*lda)+j]; } } } /* free CSR */ free( a_csr_values ); free( a_csr_rowptr ); free( a_csr_colidx ); return new_handle; } LIBXSMM_API void libxsmm_dfsspmdm_execute( const libxsmm_dfsspmdm* handle, const double* B, double* C ) { int i; assert( handle != 0 ); if ( handle->a_dense == 0 ) { for ( i = 0; i < handle->N; i+=handle->N_chunksize ) { handle->kernel( (double*)handle->permute_operands, B+i, C+i ); } } else { for ( i = 0; i < handle->N; i+=handle->N_chunksize ) { handle->kernel( B+i, handle->a_dense, C+i ); } } } LIBXSMM_API void libxsmm_sfsspmdm_execute( const libxsmm_sfsspmdm* handle, const float* B, float* C ) { int i; assert( handle != 0 ); if ( handle->a_dense == 0 ) { for ( i = 0; i < handle->N; i+=handle->N_chunksize ) { handle->kernel( (float*)handle->permute_operands, B+i, C+i ); } } else { for ( i = 0; i < handle->N; i+=handle->N_chunksize ) { handle->kernel( B+i, handle->a_dense, C+i ); } } } LIBXSMM_API void libxsmm_dfsspmdm_destroy( libxsmm_dfsspmdm* handle ) { assert( handle != 0 ); if (handle->a_dense != 0) { libxsmm_free(handle->a_dense); } else { /* deallocate code known to be not registered; no index attached do not use libxsmm_release_kernel here! 
We also need to work around pointer-to-function to pointer-to-object conversion */ void* fp; if (handle->permute_operands != 0) { libxsmm_free(handle->permute_operands); } LIBXSMM_ASSIGN127(&fp, &handle->kernel); libxsmm_free(fp); } free(handle); } LIBXSMM_API void libxsmm_sfsspmdm_destroy( libxsmm_sfsspmdm* handle ) { assert( handle != 0 ); if (handle->a_dense != 0) { libxsmm_free(handle->a_dense); } else { /* deallocate code known to be not registered; no index attached do not use libxsmm_release_kernel here! We also need to work around pointer-to-function to pointer-to-object conversion */ void* fp; if (handle->permute_operands != 0) { libxsmm_free(handle->permute_operands); } LIBXSMM_ASSIGN127(&fp, &handle->kernel); libxsmm_free(fp); } free(handle); } libxsmm-1.17/src/libxsmm_gemm.c000066400000000000000000003171051415223013700165350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "libxsmm_gemm.h" #include "libxsmm_xcopy.h" #include "libxsmm_hash.h" #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if !defined(LIBXSMM_NO_LIBM) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if !defined(LIBXSMM_GEMM_NOJIT_TRANS) && \ /* TODO: fully support calling convention */ \ (defined(_WIN32) || defined(__CYGWIN__)) # define LIBXSMM_GEMM_NOJIT_TRANS #endif #if !defined(LIBXSMM_GEMM_KPARALLEL) && 0 # define LIBXSMM_GEMM_KPARALLEL #endif #if !defined(LIBXSMM_GEMM_BATCHSIZE) # define LIBXSMM_GEMM_BATCHSIZE 1024 #endif #if !defined(LIBXSMM_GEMM_TASKGRAIN) # define LIBXSMM_GEMM_TASKGRAIN 128 #endif #if !defined(LIBXSMM_GEMM_BATCHREDUCE) && !defined(_WIN32) && !defined(__CYGWIN__) /* not supported */ # define LIBXSMM_GEMM_BATCHREDUCE #endif #if !defined(LIBXSMM_GEMM_BATCHSCALE) && (defined(LIBXSMM_GEMM_BATCHREDUCE) || defined(LIBXSMM_WRAP)) #define LIBXSMM_GEMM_BATCHSCALE ((unsigned int)LIBXSMM_ROUND(sizeof(libxsmm_mmbatch_item) * (LIBXSMM_GEMM_MMBATCH_SCALE))) #endif #if defined(LIBXSMM_BUILD) # define LIBXSMM_GEMM_WEAK LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK #else # define LIBXSMM_GEMM_WEAK LIBXSMM_API #endif #if (0 != LIBXSMM_SYNC) /** Locks for the batch interface (duplicated C indexes). 
*/ # define LIBXSMM_GEMM_LOCKIDX(IDX, NPOT) LIBXSMM_MOD2(LIBXSMM_CRC32U(LIBXSMM_BLASINT_NBITS)(2507/*seed*/, &(IDX)), NPOT) # define LIBXSMM_GEMM_LOCKPTR(PTR, NPOT) LIBXSMM_MOD2(LIBXSMM_CRC32U(LIBXSMM_BITS)(1975/*seed*/, &(PTR)), NPOT) # if !defined(LIBXSMM_GEMM_MAXNLOCKS) # define LIBXSMM_GEMM_MAXNLOCKS 1024 # endif # if !defined(LIBXSMM_GEMM_LOCKFWD) # define LIBXSMM_GEMM_LOCKFWD # endif # if LIBXSMM_LOCK_TYPE_ISPOD(LIBXSMM_GEMM_LOCK) LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_gemm_locktype { char pad[LIBXSMM_CACHELINE]; LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) state; } internal_gemm_locktype; # else LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_gemm_locktype { LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) state; } internal_gemm_locktype; # endif LIBXSMM_APIVAR_DEFINE(internal_gemm_locktype internal_gemm_lock[LIBXSMM_GEMM_MAXNLOCKS]); LIBXSMM_APIVAR_DEFINE(unsigned int internal_gemm_nlocks); /* populated number of locks */ #endif /* definition of corresponding variables */ LIBXSMM_APIVAR_PUBLIC_DEF(/*volatile*/libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch_function); LIBXSMM_APIVAR_PUBLIC_DEF(/*volatile*/libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch_function); LIBXSMM_APIVAR_PUBLIC_DEF(/*volatile*/libxsmm_dgemm_function libxsmm_original_dgemm_function); LIBXSMM_APIVAR_PUBLIC_DEF(/*volatile*/libxsmm_sgemm_function libxsmm_original_sgemm_function); LIBXSMM_APIVAR_PUBLIC_DEF(/*volatile*/libxsmm_dgemv_function libxsmm_original_dgemv_function); LIBXSMM_APIVAR_PUBLIC_DEF(/*volatile*/libxsmm_sgemv_function libxsmm_original_sgemv_function); /* definition of corresponding variables */ LIBXSMM_APIVAR_PUBLIC_DEF(libxsmm_gemm_descriptor libxsmm_mmbatch_desc); LIBXSMM_APIVAR_PUBLIC_DEF(void* libxsmm_mmbatch_array); LIBXSMM_APIVAR_PUBLIC_DEF(LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) libxsmm_mmbatch_lock); LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_mmbatch_size); LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_gemm_npargroups); 
LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_gemm_taskgrain); LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_gemm_tasks); LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_gemm_wrap); LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_gemm_prefetch_type libxsmm_gemm_auto_prefetch_default); /** Determines the prefetch strategy, which is used in case of LIBXSMM_PREFETCH_AUTO. */ LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_gemm_prefetch_type libxsmm_gemm_auto_prefetch); /** Prefetch strategy for tiled GEMM. */ LIBXSMM_APIVAR_DEFINE(libxsmm_gemm_prefetch_type internal_gemm_tiled_prefetch); /** Vector width used for GEMM. */ LIBXSMM_APIVAR_DEFINE(unsigned int internal_gemm_vwidth); /** Limit the M-extent of the tile. */ LIBXSMM_APIVAR_DEFINE(unsigned int internal_gemm_mlimit); /** Table of M-extents per type-size (tile shape). */ LIBXSMM_APIVAR_DEFINE(float internal_gemm_nstretch); /** Table of M-extents per type-size (tile shape). */ LIBXSMM_APIVAR_DEFINE(float internal_gemm_kstretch); /** Determines if batch-reduce is enabled */ LIBXSMM_APIVAR_DEFINE(int internal_gemm_batchreduce); #if defined(LIBXSMM_BUILD) LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(__real_dgemm_batch)( const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) { #if (0 != LIBXSMM_BLAS) # if defined(LIBXSMM_WRAP) && (0 > LIBXSMM_WRAP) if (0 > libxsmm_gemm_wrap) { LIBXSMM_FSYMBOL(dgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } else # endif { const libxsmm_blasint ptrsize = sizeof(void*); libxsmm_blasint i, j = 0; 
LIBXSMM_ASSERT(NULL != transa_array && NULL != transb_array && NULL != group_count && NULL != group_size); LIBXSMM_ASSERT(NULL != m_array && NULL != n_array && NULL != k_array && NULL != lda_array && NULL != ldb_array && NULL != ldc_array); LIBXSMM_ASSERT(NULL != a_array && NULL != b_array && NULL != c_array && NULL != alpha_array && NULL != beta_array); for (i = 0; i < *group_count; ++i) { const libxsmm_blasint size = group_size[i]; libxsmm_dmmbatch_blas(transa_array + i, transb_array + i, m_array[i], n_array[i], k_array[i], alpha_array + i, a_array + j, lda_array + i, b_array + j, ldb_array + i, beta_array + i, c_array + j, ldc_array + i, 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, size); j += size; } } #else libxsmm_blas_error("dgemm_batch")(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); #endif } LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(__real_sgemm_batch)( const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) { #if (0 != LIBXSMM_BLAS) # if defined(LIBXSMM_WRAP) && (0 > LIBXSMM_WRAP) if (0 > libxsmm_gemm_wrap) { LIBXSMM_FSYMBOL(sgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } else # endif { const libxsmm_blasint ptrsize = sizeof(void*); libxsmm_blasint i; LIBXSMM_ASSERT(NULL != transa_array && NULL != transb_array && NULL != group_count && NULL != group_size); LIBXSMM_ASSERT(NULL != m_array 
&& NULL != n_array && NULL != k_array && NULL != lda_array && NULL != ldb_array && NULL != ldc_array); LIBXSMM_ASSERT(NULL != a_array && NULL != b_array && NULL != c_array && NULL != alpha_array && NULL != beta_array); for (i = 0; i < *group_count; ++i) { const libxsmm_blasint size = group_size[i]; libxsmm_smmbatch_blas(transa_array + i, transb_array + i, m_array[i], n_array[i], k_array[i], alpha_array + i, a_array + i, lda_array + i, b_array + i, ldb_array + i, beta_array + i, c_array + i, ldc_array + i, 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, size); } } #else libxsmm_blas_error("sgemm_batch")(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); #endif } LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(__real_dgemm)(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { #if (0 != LIBXSMM_BLAS) LIBXSMM_FSYMBOL(dgemm)((LIBXSMM_BLAS_CONST char*)transa, (LIBXSMM_BLAS_CONST char*)transb, (LIBXSMM_BLAS_CONST libxsmm_blasint*)m, (LIBXSMM_BLAS_CONST libxsmm_blasint*)n, (LIBXSMM_BLAS_CONST libxsmm_blasint*)k, (LIBXSMM_BLAS_CONST double*)alpha, (LIBXSMM_BLAS_CONST double*)a, (LIBXSMM_BLAS_CONST libxsmm_blasint*)lda, (LIBXSMM_BLAS_CONST double*)b, (LIBXSMM_BLAS_CONST libxsmm_blasint*)ldb, (LIBXSMM_BLAS_CONST double*) beta, c, (LIBXSMM_BLAS_CONST libxsmm_blasint*)ldc); #else libxsmm_blas_error("dgemm")(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); #endif } LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(__real_sgemm)(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const 
libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { #if (0 != LIBXSMM_BLAS) LIBXSMM_FSYMBOL(sgemm)((LIBXSMM_BLAS_CONST char*)transa, (LIBXSMM_BLAS_CONST char*)transb, (LIBXSMM_BLAS_CONST libxsmm_blasint*)m, (LIBXSMM_BLAS_CONST libxsmm_blasint*)n, (LIBXSMM_BLAS_CONST libxsmm_blasint*)k, (LIBXSMM_BLAS_CONST float*)alpha, (LIBXSMM_BLAS_CONST float*)a, (LIBXSMM_BLAS_CONST libxsmm_blasint*)lda, (LIBXSMM_BLAS_CONST float*)b, (LIBXSMM_BLAS_CONST libxsmm_blasint*)ldb, (LIBXSMM_BLAS_CONST float*) beta, c, (LIBXSMM_BLAS_CONST libxsmm_blasint*)ldc); #else libxsmm_blas_error("sgemm")(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); #endif } LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(__real_dgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* x, const libxsmm_blasint* incx, const double* beta, double* y, const libxsmm_blasint* incy) { #if (0 != LIBXSMM_BLAS) LIBXSMM_FSYMBOL(dgemv)((LIBXSMM_BLAS_CONST char*)trans, (LIBXSMM_BLAS_CONST libxsmm_blasint*)m, (LIBXSMM_BLAS_CONST libxsmm_blasint*)n, (LIBXSMM_BLAS_CONST double*)alpha, (LIBXSMM_BLAS_CONST double*)a, (LIBXSMM_BLAS_CONST libxsmm_blasint*)lda, (LIBXSMM_BLAS_CONST double*)x, (LIBXSMM_BLAS_CONST libxsmm_blasint*)incx, (LIBXSMM_BLAS_CONST double*) beta, y, (LIBXSMM_BLAS_CONST libxsmm_blasint*)incy); #else libxsmm_blas_error("dgemv")(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); #endif } LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(__real_sgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* x, const libxsmm_blasint* incx, const float* beta, float* y, const libxsmm_blasint* incy) { #if (0 != LIBXSMM_BLAS) LIBXSMM_FSYMBOL(sgemv)((LIBXSMM_BLAS_CONST char*)trans, (LIBXSMM_BLAS_CONST libxsmm_blasint*)m, 
(LIBXSMM_BLAS_CONST libxsmm_blasint*)n, (LIBXSMM_BLAS_CONST float*)alpha, (LIBXSMM_BLAS_CONST float*)a, (LIBXSMM_BLAS_CONST libxsmm_blasint*)lda, (LIBXSMM_BLAS_CONST float*)x, (LIBXSMM_BLAS_CONST libxsmm_blasint*)incx, (LIBXSMM_BLAS_CONST float*) beta, y, (LIBXSMM_BLAS_CONST libxsmm_blasint*)incy); #else libxsmm_blas_error("sgemv")(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); #endif } LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void __real_dgemm_batch( const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) { LIBXSMM_FSYMBOL(__real_dgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void __real_sgemm_batch( const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) { LIBXSMM_FSYMBOL(__real_sgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } #endif /*defined(LIBXSMM_BUILD)*/ LIBXSMM_GEMM_WEAK libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch(void) { #if (0 != LIBXSMM_BLAS) && defined(LIBXSMM_WRAP) 
&& (0 > LIBXSMM_WRAP) LIBXSMM_BLAS_WRAPPER(1, double, gemm_batch, libxsmm_original_dgemm_batch_function, NULL/*unknown*/); /*LIBXSMM_ASSERT(NULL != libxsmm_original_dgemm_batch_function);*/ #else LIBXSMM_BLAS_WRAPPER(0, double, gemm_batch, libxsmm_original_dgemm_batch_function, NULL/*unknown*/); #endif return libxsmm_original_dgemm_batch_function; } LIBXSMM_GEMM_WEAK libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch(void) { #if (0 != LIBXSMM_BLAS) && defined(LIBXSMM_WRAP) && (0 > LIBXSMM_WRAP) LIBXSMM_BLAS_WRAPPER(1, float, gemm_batch, libxsmm_original_sgemm_batch_function, NULL/*unknown*/); /*LIBXSMM_ASSERT(NULL != libxsmm_original_sgemm_batch_function);*/ #else LIBXSMM_BLAS_WRAPPER(0, float, gemm_batch, libxsmm_original_sgemm_batch_function, NULL/*unknown*/); #endif return libxsmm_original_sgemm_batch_function; } LIBXSMM_GEMM_WEAK libxsmm_dgemm_function libxsmm_original_dgemm(void) { #if (0 != LIBXSMM_BLAS) LIBXSMM_BLAS_WRAPPER(1, double, gemm, libxsmm_original_dgemm_function, NULL/*unknown*/); LIBXSMM_ASSERT(NULL != libxsmm_original_dgemm_function); #else LIBXSMM_BLAS_WRAPPER(0, double, gemm, libxsmm_original_dgemm_function, NULL/*unknown*/); #endif return libxsmm_original_dgemm_function; } LIBXSMM_GEMM_WEAK libxsmm_sgemm_function libxsmm_original_sgemm(void) { #if (0 != LIBXSMM_BLAS) LIBXSMM_BLAS_WRAPPER(1, float, gemm, libxsmm_original_sgemm_function, NULL/*unknown*/); LIBXSMM_ASSERT(NULL != libxsmm_original_sgemm_function); #else LIBXSMM_BLAS_WRAPPER(0, float, gemm, libxsmm_original_sgemm_function, NULL/*unknown*/); #endif return libxsmm_original_sgemm_function; } LIBXSMM_GEMM_WEAK libxsmm_dgemv_function libxsmm_original_dgemv(void) { #if (0 != LIBXSMM_BLAS) LIBXSMM_BLAS_WRAPPER(1, double, gemv, libxsmm_original_dgemv_function, NULL/*unknown*/); LIBXSMM_ASSERT(NULL != libxsmm_original_dgemv_function); #else LIBXSMM_BLAS_WRAPPER(0, double, gemv, libxsmm_original_dgemv_function, NULL/*unknown*/); #endif return libxsmm_original_dgemv_function; } 
/** Resolves (and caches) the function-pointer of the original BLAS sgemv. */
LIBXSMM_GEMM_WEAK libxsmm_sgemv_function libxsmm_original_sgemv(void)
{
#if (0 != LIBXSMM_BLAS)
  LIBXSMM_BLAS_WRAPPER(1, float, gemv, libxsmm_original_sgemv_function, NULL/*unknown*/);
  LIBXSMM_ASSERT(NULL != libxsmm_original_sgemv_function);
#else
  LIBXSMM_BLAS_WRAPPER(0, float, gemv, libxsmm_original_sgemv_function, NULL/*unknown*/);
#endif
  return libxsmm_original_sgemv_function;
}


/** Records a once-only error for a missing BLAS symbol and returns a no-op sink function. */
LIBXSMM_API libxsmm_sink_function libxsmm_blas_error(const char* symbol)
{
  static int error_once = 0;
  LIBXSMM_BLAS_ERROR(symbol, &error_once);
  return libxsmm_sink;
}


/**
 * One-time initialization of the GEMM subsystem: reads LIBXSMM_GEMM_* and LIBXSMM_TGEMM_*
 * environment variables, initializes locks and the batch-array, and selects architecture
 * dependent tile parameters (vector width, M-limit, N/K stretch factors) based on archid.
 * Finally resolves all original BLAS function-pointers.
 */
LIBXSMM_API_INTERN void libxsmm_gemm_init(int archid)
{
  const char* env_w = getenv("LIBXSMM_GEMM_WRAP");
  LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_GEMM_LOCK) attr;
  LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_GEMM_LOCK, &attr);
#if defined(LIBXSMM_WRAP) /* determines if wrap is considered */
  { /* intercepted GEMMs (1: sequential and non-tiled, 2: parallelized and tiled) */
# if defined(__STATIC) /* with static library the user controls interceptor already */
    libxsmm_gemm_wrap = ((NULL == env_w || 0 == *env_w) /* LIBXSMM_WRAP=0: no promotion */
      ? (0 < (LIBXSMM_WRAP) ? (LIBXSMM_WRAP + 2) : (LIBXSMM_WRAP - 2)) : atoi(env_w));
# else
    libxsmm_gemm_wrap = ((NULL == env_w || 0 == *env_w) ? (LIBXSMM_WRAP) : atoi(env_w));
# endif
  }
#endif
  { /* setup prefetch strategy for tiled GEMMs */
    const char *const env_p = getenv("LIBXSMM_TGEMM_PREFETCH");
    const libxsmm_gemm_prefetch_type tiled_prefetch_default = LIBXSMM_GEMM_PREFETCH_AL2_AHEAD;
    const int uid = ((NULL == env_p || 0 == *env_p) ? LIBXSMM_PREFETCH_AUTO/*default*/ : atoi(env_p));
    internal_gemm_tiled_prefetch = (0 <= uid ? libxsmm_gemm_uid2prefetch(uid) : tiled_prefetch_default);
  }
#if (0 != LIBXSMM_SYNC)
  { /* initialize locks for the batch interface */
    const char *const env_locks = getenv("LIBXSMM_GEMM_NLOCKS");
    const int nlocks = ((NULL == env_locks || 0 == *env_locks) ? -1/*default*/ : atoi(env_locks));
    unsigned int i;
    /* number of locks is rounded up to a power of two and capped at LIBXSMM_GEMM_MAXNLOCKS */
    internal_gemm_nlocks = LIBXSMM_UP2POT(0 > nlocks ? (LIBXSMM_GEMM_MAXNLOCKS) : LIBXSMM_MIN(nlocks, LIBXSMM_GEMM_MAXNLOCKS));
    for (i = 0; i < internal_gemm_nlocks; ++i) LIBXSMM_LOCK_INIT(LIBXSMM_GEMM_LOCK, &internal_gemm_lock[i].state, &attr);
  }
#endif
#if defined(LIBXSMM_GEMM_BATCHREDUCE) || defined(LIBXSMM_WRAP)
  { /* determines if batch-reduce kernel or batch-wrap is considered */
    const char *const env_r = getenv("LIBXSMM_GEMM_BATCHREDUCE");
    internal_gemm_batchreduce = (NULL == env_r || 0 == *env_r) ? 0 : atoi(env_r);
    if ((NULL == env_w || 0 == *env_w) && ((LIBXSMM_GEMM_MMBATCH_VERBOSITY <= libxsmm_verbosity && INT_MAX != libxsmm_verbosity) || 0 > libxsmm_verbosity)) {
      libxsmm_mmbatch_desc.flags = LIBXSMM_MMBATCH_FLAG_STATISTIC; /* enable auto-batch statistic */
      internal_gemm_batchreduce = 0;
    }
    if (0 != internal_gemm_batchreduce || 0 != libxsmm_gemm_wrap) {
      /* allocate the mmbatch-array sized from LIBXSMM_GEMM_BATCHSIZE (or its default) */
      const char *const env_b = getenv("LIBXSMM_GEMM_BATCHSIZE");
      const int env_bi = (NULL == env_b || 0 == *env_b) ? -1/*auto*/ : atoi(env_b);
      const unsigned int env_bu = (unsigned int)(0 >= env_bi ? (LIBXSMM_GEMM_BATCHSIZE) : env_bi);
      const unsigned int batchscale = LIBXSMM_ABS(internal_gemm_batchreduce) * 2048/*arbitrary*/ * 2/*A and B-matrices*/ * sizeof(void*);
      const unsigned int minsize = LIBXSMM_UPDIV(batchscale * env_bu, LIBXSMM_GEMM_BATCHSCALE);
      const unsigned int batchsize = LIBXSMM_MAX(env_bu, minsize);
      const void *const extra = NULL;
      LIBXSMM_ASSERT(1 < (LIBXSMM_GEMM_MMBATCH_SCALE) && NULL == libxsmm_mmbatch_array);
      if (EXIT_SUCCESS == libxsmm_xmalloc(&libxsmm_mmbatch_array, (size_t)batchsize * (LIBXSMM_GEMM_BATCHSCALE), 0/*auto-alignment*/,
        LIBXSMM_MALLOC_FLAG_PRIVATE /*| LIBXSMM_MALLOC_FLAG_SCRATCH*/, &extra, sizeof(extra)))
      {
        LIBXSMM_LOCK_INIT(LIBXSMM_GEMM_LOCK, &libxsmm_mmbatch_lock, &attr);
        LIBXSMM_ASSERT(NULL != libxsmm_mmbatch_array);
        libxsmm_mmbatch_size = batchsize;
      }
    }
  }
#else
  LIBXSMM_UNUSED(env_w);
#endif
  { /* determines the number of parallel groups (LIBXSMM_GEMM_NPARGROUPS) */
    const char *const env_s = getenv("LIBXSMM_GEMM_NPARGROUPS");
    libxsmm_gemm_npargroups = ((NULL == env_s || 0 == *env_s || 0 >= atoi(env_s)) ? (LIBXSMM_GEMM_NPARGROUPS) : atoi(env_s));
  }
  /* architecture-dependent tiling defaults (vector width in Bytes, M-limit, N/K stretch) */
  if (LIBXSMM_X86_AVX512_CORE <= archid) {
    internal_gemm_vwidth = 64;
    internal_gemm_mlimit = 48;
    internal_gemm_nstretch = 3.0f;
    internal_gemm_kstretch = 2.0f;
  }
  else if (LIBXSMM_X86_AVX512_MIC <= archid) {
    internal_gemm_vwidth = 64;
    internal_gemm_mlimit = 64;
    internal_gemm_nstretch = 1.0f;
    internal_gemm_kstretch = 1.0f;
  }
  else if (LIBXSMM_X86_AVX2 <= archid) {
    internal_gemm_vwidth = 32;
    internal_gemm_mlimit = 48;
    internal_gemm_nstretch = 3.0f;
    internal_gemm_kstretch = 2.0f;
  }
  else if (LIBXSMM_X86_AVX <= archid) {
    internal_gemm_vwidth = 32;
    internal_gemm_mlimit = 48;
    internal_gemm_nstretch = 5.0f;
    internal_gemm_kstretch = 1.0f;
  }
  else {
    internal_gemm_vwidth = 16;
    internal_gemm_mlimit = 48;
    internal_gemm_nstretch = 7.0f;
    internal_gemm_kstretch = 5.0f;
  }
  { /* setup tile sizes according to environment (LIBXSMM_TGEMM_M, LIBXSMM_TGEMM_N, LIBXSMM_TGEMM_K) */
    const char *const env_m = getenv("LIBXSMM_TGEMM_M"), *const env_n = getenv("LIBXSMM_TGEMM_N"), *const env_k = getenv("LIBXSMM_TGEMM_K");
    const int m = ((NULL == env_m || 0 == *env_m) ? 0 : atoi(env_m));
    const int n = ((NULL == env_n || 0 == *env_n) ? 0 : atoi(env_n));
    const int k = ((NULL == env_k || 0 == *env_k) ? 0 : atoi(env_k));
    if (0 < m) { /* stretch factors are derived relative to the given M-extent */
      if (0 < n) internal_gemm_nstretch = ((float)n) / m;
      if (0 < k) internal_gemm_kstretch = ((float)k) / m;
    }
  }
  { /* setup tile sizes according to environment (LIBXSMM_TGEMM_NS, LIBXSMM_TGEMM_KS) */
    const char *const env_ns = getenv("LIBXSMM_TGEMM_NS"), *const env_ks = getenv("LIBXSMM_TGEMM_KS");
    const double ns = ((NULL == env_ns || 0 == *env_ns) ? 0 : atof(env_ns));
    const double ks = ((NULL == env_ks || 0 == *env_ks) ? 0 : atof(env_ks));
    if (0 < ns) internal_gemm_nstretch = (float)LIBXSMM_MIN(24, ns); /* stretch is clamped to 24 */
    if (0 < ks) internal_gemm_kstretch = (float)LIBXSMM_MIN(24, ks); /* stretch is clamped to 24 */
  }
  { /* determines if OpenMP tasks are used (when available) */
    const char *const env_t = getenv("LIBXSMM_GEMM_TASKS");
    const int gemm_tasks = ((NULL == env_t || 0 == *env_t) ? 0/*disabled*/ : atoi(env_t));
    libxsmm_gemm_tasks = (0 <= gemm_tasks ? LIBXSMM_ABS(gemm_tasks) : 1/*enabled*/);
  }
  { /* determines grain-size of tasks (when available) */
    const char *const env_g = getenv("LIBXSMM_GEMM_TASKGRAIN");
    const int gemm_taskgrain = ((NULL == env_g || 0 == *env_g || 0 >= atoi(env_g)) ? (LIBXSMM_GEMM_TASKGRAIN) : atoi(env_g));
    /* adjust grain-size or scale beyond the number of threads */
    libxsmm_gemm_taskgrain = LIBXSMM_MAX(0 < libxsmm_gemm_tasks ? (gemm_taskgrain / libxsmm_gemm_tasks) : gemm_taskgrain, 1);
  }
  LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_GEMM_LOCK, &attr);
  /* determine BLAS function-pointers */
  libxsmm_original_dgemm_batch();
  libxsmm_original_sgemm_batch();
  libxsmm_original_dgemm();
  libxsmm_original_sgemm();
  libxsmm_original_dgemv();
  libxsmm_original_sgemv();
}


/**
 * Tears down the GEMM subsystem: destroys the batch locks, flushes any pending
 * auto-batch work (flush-function stored in the allocation's extra-info), and
 * releases the mmbatch-array.
 */
LIBXSMM_API_INTERN void libxsmm_gemm_finalize(void)
{
#if (0 != LIBXSMM_SYNC)
  unsigned int i;
  for (i = 0; i < internal_gemm_nlocks; ++i) LIBXSMM_LOCK_DESTROY(LIBXSMM_GEMM_LOCK, &internal_gemm_lock[i].state);
#endif
#if defined(LIBXSMM_GEMM_BATCHREDUCE) || defined(LIBXSMM_WRAP)
  if (NULL != libxsmm_mmbatch_array) {
    void *extra = NULL, *const mmbatch_array = libxsmm_mmbatch_array;
    if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(mmbatch_array, NULL/*size*/, NULL/*flags*/, &extra) && NULL != extra) {
      const libxsmm_mmbatch_flush_function flush = *(libxsmm_mmbatch_flush_function*)extra;
      if (NULL != flush) flush();
    }
#if !defined(NDEBUG)
    libxsmm_mmbatch_array = NULL;
#endif
    libxsmm_xfree(mmbatch_array, 0/*no check*/);
    LIBXSMM_LOCK_DESTROY(LIBXSMM_GEMM_LOCK, &libxsmm_mmbatch_lock);
  }
#endif
}


/** Returns the effective prefetch strategy; NULL requests the configured auto-prefetch. */
LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_xprefetch(const int* prefetch)
{
  LIBXSMM_INIT /* load configuration */
  return libxsmm_get_gemm_prefetch(NULL == prefetch ? ((int)libxsmm_gemm_auto_prefetch) : *prefetch);
}


/** Maps a requested prefetch value to a strategy; negative values select the auto-default. */
LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_prefetch(int prefetch)
{
  libxsmm_gemm_prefetch_type result;
#if !defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MINGW32__)
  if (0 > prefetch) {
    LIBXSMM_INIT /* load configuration */
    result = libxsmm_gemm_auto_prefetch_default;
  }
  else {
    result = (libxsmm_gemm_prefetch_type)prefetch;
  }
#else /* TODO: full support for Windows calling convention */
  result = LIBXSMM_GEMM_PREFETCH_NONE;
  LIBXSMM_UNUSED(prefetch);
#endif
  return result;
}


/** Translates a prefetch strategy into its numeric UID (inverse of libxsmm_gemm_uid2prefetch). */
LIBXSMM_API_INTERN int libxsmm_gemm_prefetch2uid(libxsmm_gemm_prefetch_type prefetch)
{
  switch (prefetch) {
    case LIBXSMM_GEMM_PREFETCH_SIGONLY: return 2;
    case LIBXSMM_GEMM_PREFETCH_BL2_VIA_C: return 3;
    case LIBXSMM_GEMM_PREFETCH_AL2_AHEAD: return 4;
    case LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD: return 5;
    case LIBXSMM_GEMM_PREFETCH_AL2: return 6;
    case LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C: return 7;
    default: {
      LIBXSMM_ASSERT(LIBXSMM_GEMM_PREFETCH_NONE == prefetch);
      return 0;
    }
  }
}


/** Translates a numeric UID into a prefetch strategy; warns once on an invalid UID. */
LIBXSMM_API_INTERN libxsmm_gemm_prefetch_type libxsmm_gemm_uid2prefetch(int uid)
{
  switch (uid) {
    case 1: return LIBXSMM_GEMM_PREFETCH_NONE; /* nopf */
    case 2: return LIBXSMM_GEMM_PREFETCH_SIGONLY; /* pfsigonly */
    case 3: return LIBXSMM_GEMM_PREFETCH_BL2_VIA_C; /* BL2viaC */
    case 4: return LIBXSMM_GEMM_PREFETCH_AL2_AHEAD; /* curAL2 */
    case 5: return LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD; /* curAL2_BL2viaC */
    case 6: return LIBXSMM_GEMM_PREFETCH_AL2; /* AL2 */
    case 7: return LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C; /* AL2_BL2viaC */
    default: {
      if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
        static int error_once = 0;
        if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) {
          fprintf(stderr, "LIBXSMM WARNING: invalid prefetch strategy requested!\n");
        }
      }
      return LIBXSMM_GEMM_PREFETCH_NONE;
    }
  }
}


/** Prints a GEMM call (single input/output precision); convenience wrapper of libxsmm_gemm_print2. */
LIBXSMM_API void libxsmm_gemm_print(void* ostream,
  libxsmm_gemm_precision precision, const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const void* alpha, const void* a, const libxsmm_blasint* lda,
  const void* b, const libxsmm_blasint* ldb,
  const void* beta, void* c, const libxsmm_blasint* ldc)
{
  libxsmm_gemm_print2(ostream, precision, precision, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}


/**
 * Prints a GEMM call to the given stream, or (when ostream is NULL) dumps the
 * A, B, and C matrices as MHD files. NULL arguments fall back to LIBXSMM defaults
 * (LIBXSMM_FLAGS, LIBXSMM_ALPHA/BETA, m substituting n/k/ldc).
 */
LIBXSMM_API void libxsmm_gemm_print2(void* ostream,
  libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const void* alpha, const void* a, const libxsmm_blasint* lda,
  const void* b, const libxsmm_blasint* ldb,
  const void* beta, void* c, const libxsmm_blasint* ldc)
{
  const libxsmm_blasint nn = *(n ? n : m), kk = *(k ? k : m);
  const char ctransa = (char)(NULL != transa ? (*transa) : (0 == (LIBXSMM_FLAGS & LIBXSMM_GEMM_FLAG_TRANS_A) ? 'n' : 't'));
  const char ctransb = (char)(NULL != transb ? (*transb) : (0 == (LIBXSMM_FLAGS & LIBXSMM_GEMM_FLAG_TRANS_B) ? 'n' : 't'));
  const libxsmm_blasint ilda = (NULL != lda ? *lda : (('n' == ctransa || 'N' == ctransa) ? *m : kk));
  const libxsmm_blasint ildb = (NULL != ldb ? *ldb : (('n' == ctransb || 'N' == ctransb) ? kk : nn));
  const libxsmm_blasint ildc = *(NULL != ldc ? ldc : m);
  libxsmm_mhd_elemtype mhd_elemtype = LIBXSMM_MHD_ELEMTYPE_UNKNOWN;
  char string_a[128], string_b[128], typeprefix = 0;
  switch (iprec | oprec) {
    case LIBXSMM_GEMM_PRECISION_F64: {
      LIBXSMM_ASSERT(iprec == oprec);
      LIBXSMM_SNPRINTF(string_a, sizeof(string_a), "%g", NULL != alpha ? *((const double*)alpha) : LIBXSMM_ALPHA);
      LIBXSMM_SNPRINTF(string_b, sizeof(string_b), "%g", NULL != beta ? *((const double*)beta) : LIBXSMM_BETA);
      mhd_elemtype = LIBXSMM_MHD_ELEMTYPE_F64;
      typeprefix = 'd';
    } break;
    case LIBXSMM_GEMM_PRECISION_F32: {
      LIBXSMM_ASSERT(iprec == oprec);
      LIBXSMM_SNPRINTF(string_a, sizeof(string_a), "%g", NULL != alpha ? *((const float*)alpha) : LIBXSMM_ALPHA);
      LIBXSMM_SNPRINTF(string_b, sizeof(string_b), "%g", NULL != beta ? *((const float*)beta) : LIBXSMM_BETA);
      mhd_elemtype = LIBXSMM_MHD_ELEMTYPE_F32;
      typeprefix = 's';
    } break;
    default: if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
      static int error_once = 0;
      if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { /* TODO: support I16, etc. */
        fprintf(stderr, "LIBXSMM ERROR: unsupported data-type requested!\n");
      }
    }
  }
  if (0 != typeprefix) {
    if (NULL != ostream) { /* print information about GEMM call */
      if (NULL != a && NULL != b && NULL != c) {
        fprintf((FILE*)ostream, "%cgemm('%c', '%c', %" PRIuPTR "/*m*/, %" PRIuPTR "/*n*/, %" PRIuPTR "/*k*/,\n"
                                " %s/*alpha*/, %p/*a*/, %" PRIuPTR "/*lda*/,\n"
                                " %p/*b*/, %" PRIuPTR "/*ldb*/,\n"
                                " %s/*beta*/, %p/*c*/, %" PRIuPTR "/*ldc*/)",
          typeprefix, ctransa, ctransb, (uintptr_t)*m, (uintptr_t)nn, (uintptr_t)kk,
          string_a, a, (uintptr_t)ilda, b, (uintptr_t)ildb, string_b, c, (uintptr_t)ildc);
      }
      else {
        fprintf((FILE*)ostream, "%cgemm(trans=%c%c mnk=%" PRIuPTR ",%" PRIuPTR ",%" PRIuPTR
                                " ldx=%" PRIuPTR ",%" PRIuPTR ",%" PRIuPTR " a,b=%s,%s)",
          typeprefix, ctransa, ctransb, (uintptr_t)*m, (uintptr_t)nn, (uintptr_t)kk,
          (uintptr_t)ilda, (uintptr_t)ildb, (uintptr_t)ildc, string_a, string_b);
      }
    }
    else { /* dump A, B, and C matrices into MHD files */
      char extension_header[256];
      size_t data_size[2], size[2];
      if (NULL != a) {
        LIBXSMM_SNPRINTF(extension_header, sizeof(extension_header), "TRANS = %c\nALPHA = %s", ctransa, string_a);
        LIBXSMM_SNPRINTF(string_a, sizeof(string_a), "libxsmm_a_%p.mhd", a); /* string_a is reused as the file name */
        data_size[0] = (size_t)ilda; data_size[1] = (size_t)kk; size[0] = (size_t)(*m); size[1] = (size_t)kk;
        libxsmm_mhd_write(string_a, NULL/*offset*/, size, data_size, 2/*ndims*/, 1/*ncomponents*/, mhd_elemtype,
          NULL/*conversion*/, a, NULL/*header_size*/, extension_header, NULL/*extension*/, 0/*extension_size*/);
      }
      if (NULL != b) {
        LIBXSMM_SNPRINTF(extension_header, sizeof(extension_header), "\nTRANS = %c", ctransb);
        LIBXSMM_SNPRINTF(string_a, sizeof(string_a), "libxsmm_b_%p.mhd", b);
        data_size[0] = (size_t)ildb; data_size[1] = (size_t)nn; size[0] = (size_t)kk; size[1] = (size_t)nn;
        libxsmm_mhd_write(string_a, NULL/*offset*/, size, data_size, 2/*ndims*/, 1/*ncomponents*/, mhd_elemtype,
          NULL/*conversion*/, b, NULL/*header_size*/, extension_header, NULL/*extension*/, 0/*extension_size*/);
      }
      if (NULL != c) {
        LIBXSMM_SNPRINTF(extension_header, sizeof(extension_header), "BETA = %s", string_b);
        LIBXSMM_SNPRINTF(string_a, sizeof(string_a), "libxsmm_c_%p.mhd", c);
        data_size[0] = (size_t)ildc; data_size[1] = (size_t)nn; size[0] = (size_t)(*m); size[1] = (size_t)nn;
        libxsmm_mhd_write(string_a, NULL/*offset*/, size, data_size, 2/*ndims*/, 1/*ncomponents*/, mhd_elemtype,
          NULL/*conversion*/, c, NULL/*header_size*/, extension_header, NULL/*extension*/, 0/*extension_size*/);
      }
    }
  }
}


/** Prints a GEMM call given by-value arguments (single precision enum); wrapper of libxsmm_gemm_dprint2. */
LIBXSMM_API void libxsmm_gemm_dprint(
  void* ostream, libxsmm_gemm_precision precision, char transa, char transb,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, double dalpha, const void* a, libxsmm_blasint lda,
  const void* b, libxsmm_blasint ldb, double dbeta, void* c, libxsmm_blasint ldc)
{
  libxsmm_gemm_dprint2(ostream, precision, precision, transa, transb, m, n, k, dalpha, a, lda, b, ldb, dbeta, c, ldc);
}


/** Prints a GEMM call given by-value arguments; narrows alpha/beta to float for F32 input precision. */
LIBXSMM_API void libxsmm_gemm_dprint2(
  void* ostream, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, char transa, char transb,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, double dalpha, const void* a, libxsmm_blasint lda,
  const void* b, libxsmm_blasint ldb, double dbeta, void* c, libxsmm_blasint ldc)
{
  switch (iprec) {
    case LIBXSMM_GEMM_PRECISION_F64: {
      libxsmm_gemm_print2(ostream, LIBXSMM_GEMM_PRECISION_F64, oprec, &transa, &transb,
        &m, &n, &k, &dalpha, a, &lda, b, &ldb, &dbeta, c, &ldc);
    } break;
    case LIBXSMM_GEMM_PRECISION_F32: {
      const float alpha = (float)dalpha, beta = (float)dbeta;
      libxsmm_gemm_print2(ostream, LIBXSMM_GEMM_PRECISION_F32, oprec, &transa, &transb,
        &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
    } break;
    default: {
      libxsmm_gemm_print2(ostream, iprec, oprec, &transa, &transb,
        &m, &n, &k, &dalpha, a, &lda, b, &ldb, &dbeta, c, &ldc);
    }
  }
}


/**
 * Prints the GEMM call described by a JIT'ted kernel: recovers the descriptor from the
 * kernel's registry info and forwards to libxsmm_gemm_dprint2, appending code address/size.
 */
LIBXSMM_API void libxsmm_gemm_xprint(void* ostream,
  libxsmm_xmmfunction kernel, const void* a, const void* b, void* c)
{
  const libxsmm_descriptor* desc;
  libxsmm_code_pointer code;
  size_t code_size;
  code.xgemm = kernel;
  if (NULL != libxsmm_get_kernel_xinfo(code, &desc, &code_size) &&
      NULL != desc && LIBXSMM_KERNEL_KIND_MATMUL == desc->kind)
  {
    libxsmm_gemm_dprint2(ostream, (libxsmm_gemm_precision)LIBXSMM_GETENUM_INP(desc->gemm.desc.datatype),
      (libxsmm_gemm_precision)LIBXSMM_GETENUM_OUT(desc->gemm.desc.datatype),
      (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & desc->gemm.desc.flags) ? 'N' : 'T'),
      (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & desc->gemm.desc.flags) ? 'N' : 'T'),
      (libxsmm_blasint)desc->gemm.desc.m, (libxsmm_blasint)desc->gemm.desc.n, (libxsmm_blasint)desc->gemm.desc.k,
      /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & libxsmm_mmbatch_desc.flags) ? 0 : */1, a, (libxsmm_blasint)desc->gemm.desc.lda,
      b, (libxsmm_blasint)desc->gemm.desc.ldb,
      0 != (LIBXSMM_GEMM_FLAG_BETA_0 & libxsmm_mmbatch_desc.flags) ? 0 : 1, c, (libxsmm_blasint)desc->gemm.desc.ldc);
    fprintf((FILE*)ostream, " = %p+%u", code.ptr_const, (unsigned int)code_size);
  }
}


/** Dispatches to the (original) BLAS xGEMM according to the input precision (F64/F32 only). */
LIBXSMM_API void libxsmm_blas_xgemm(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec,
  const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const void* alpha, const void* a, const libxsmm_blasint* lda,
  const void* b, const libxsmm_blasint* ldb,
  const void* beta, void* c, const libxsmm_blasint* ldc)
{
  LIBXSMM_INIT
  switch (iprec) {
    case LIBXSMM_GEMM_PRECISION_F64: {
      LIBXSMM_ASSERT(iprec == oprec);
      LIBXSMM_BLAS_XGEMM(double, double, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
    } break;
    case LIBXSMM_GEMM_PRECISION_F32: {
      LIBXSMM_ASSERT(iprec == oprec);
      LIBXSMM_BLAS_XGEMM(float, float, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
    } break;
    default: if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
      static int error_once = 0;
      LIBXSMM_UNUSED(oprec);
      if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { /* TODO: support I16, etc. */
        fprintf(stderr, "LIBXSMM ERROR: unsupported data-type requested!\n");
      }
    }
  }
}


/**
 * Plans the task decomposition of a tiled GEMM: given the problem size (m,n,k),
 * the tile size (tm,tn,tk), and the number of tasks, computes the number of tiles
 * per dimension (nmt,nnt,nkt) and the task counts (mt,nt,kt). Prefers M-parallelism,
 * then MN, then (if enabled at compile-time) MNK; returns EXIT_FAILURE if no valid
 * decomposition was found.
 */
LIBXSMM_API_INLINE int libxsmm_gemm_plan_internal(unsigned int ntasks,
  unsigned int m, unsigned int n, unsigned int k, /* whole problem size */
  unsigned int tm, unsigned int tn, unsigned int tk, /* tile size (kernel) */
  unsigned int* nmt, unsigned int* nnt, unsigned int* nkt, /* number of tiles */
  unsigned int* mt, unsigned int* nt, unsigned int* kt) /* number of tasks */
{
  unsigned int result = EXIT_SUCCESS, replan = 0;
  LIBXSMM_ASSERT(NULL != nmt && NULL != nnt && NULL != nkt);
  LIBXSMM_ASSERT(NULL != mt && NULL != nt && NULL != kt);
  LIBXSMM_ASSERT(0 < ntasks);
  *nmt = (m + tm - 1) / LIBXSMM_MAX(tm, 1);
  *nnt = (n + tn - 1) / LIBXSMM_MAX(tn, 1);
  *nkt = (k + tk - 1) / LIBXSMM_MAX(tk, 1);
#if !defined(NDEBUG)
  *mt = *nt = *kt = 0;
#endif
  do {
    if (1 >= replan) *mt = libxsmm_product_limit(*nmt, ntasks, 0);
    if (1 == replan || ntasks <= *mt) { /* M-parallelism */
      *nt = 1;
      *kt = 1;
      replan = 0;
    }
    else {
      const unsigned int mntasks = libxsmm_product_limit((*nmt) * (*nnt), ntasks, 0);
      if (0 == replan && *mt >= mntasks) replan = 1;
      if (2 == replan || (0 == replan && ntasks <= mntasks)) { /* MN-parallelism */
        *nt = mntasks / *mt;
        *kt = 1;
        replan = 0;
      }
      else { /* MNK-parallelism */
        const unsigned int mnktasks = libxsmm_product_limit((*nmt) * (*nnt) * (*nkt), ntasks, 0);
        if (mntasks < mnktasks) {
#if defined(LIBXSMM_GEMM_KPARALLEL)
          *nt = mntasks / *mt;
          *kt = mnktasks / mntasks;
          replan = 0;
#else
          static int error_once = 0;
          if ((LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */
            && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
          {
            fprintf(stderr, "LIBXSMM WARNING (XGEMM): K-parallelism triggered!\n");
          }
#endif
        }
#if defined(LIBXSMM_GEMM_KPARALLEL)
        else
#endif
        if (0 == replan) replan = 2;
      }
    }
  } while (0 != replan);
  if (0 == *mt || 0 == *nt || 0 == *kt) {
    result = EXIT_FAILURE;
  }
  return result;
}


/* NOTE(review): this declaration specifier belongs to the next definition
 * (libxsmm_gemm_handle_init), which continues beyond this chunk. */
LIBXSMM_API
libxsmm_gemm_handle* libxsmm_gemm_handle_init(libxsmm_gemm_blob* blob, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const void* alpha, const void* beta, int flags, /*unsigned*/int ntasks) { unsigned int ulda, uldb, um, un, uk, tm = 0, tn = 0, tk = 0, max_ntasks = 0; libxsmm_descriptor_blob desc_blob; union { libxsmm_gemm_handle* ptr; libxsmm_gemm_blob* blob; } result; LIBXSMM_ASSERT(sizeof(libxsmm_gemm_handle) <= sizeof(libxsmm_gemm_blob)); if (NULL != blob && NULL != m && 0 < ntasks) { unsigned int ntm = 0, ntn = 0, ntk = 0, mt = 1, nt = 1, kt = 1; const char *const env_tm = getenv("LIBXSMM_TGEMM_M"); libxsmm_blasint klda, kldb, kldc, km, kn; libxsmm_gemm_descriptor* desc; const int prf_copy = 0; double dbeta; LIBXSMM_INIT result.blob = blob; #if defined(NDEBUG) result.ptr->copy_a.ptr = result.ptr->copy_b.ptr = result.ptr->copy_i.ptr = result.ptr->copy_o.ptr = NULL; #else memset(blob, 0, sizeof(libxsmm_gemm_blob)); #endif if (EXIT_SUCCESS != libxsmm_dvalue((libxsmm_datatype)oprec, beta, &dbeta)) dbeta = LIBXSMM_BETA; /* fuse beta into flags */ result.ptr->gemm_flags = LIBXSMM_GEMM_PFLAGS(transa, transb, LIBXSMM_FLAGS) | (LIBXSMM_NEQ(0, dbeta) ? 0 : LIBXSMM_GEMM_FLAG_BETA_0); /* TODO: check that arguments fit into handle (unsigned int vs. libxsmm_blasint) */ um = (unsigned int)(*m); uk = (NULL != k ? ((unsigned int)(*k)) : um); un = (NULL != n ? ((unsigned int)(*n)) : uk); result.ptr->otypesize = libxsmm_typesize((libxsmm_datatype)oprec); if (NULL == env_tm || 0 >= atoi(env_tm)) { const unsigned int vwidth = LIBXSMM_MAX(internal_gemm_vwidth / result.ptr->otypesize, 1); const double s2 = (double)internal_gemm_nstretch * internal_gemm_kstretch; /* LIBXSMM_INIT! */ unsigned int tmi = libxsmm_product_limit(um, internal_gemm_mlimit, 0); /* LIBXSMM_INIT! 
*/ for (; vwidth <= tmi; tmi = libxsmm_product_limit(um, tmi - 1, 0)) { const double si = (double)(LIBXSMM_CONFIG_MAX_MNK) / ((double)tmi * tmi * tmi), s = (s2 <= si ? 1 : (s2 / si)); unsigned int tni = libxsmm_product_limit(un, LIBXSMM_MAX((unsigned int)(tmi * (s * internal_gemm_nstretch)), 1), 0); unsigned int tki = libxsmm_product_limit(uk, LIBXSMM_MAX((unsigned int)(tmi * (s * internal_gemm_kstretch)), 1), 0); unsigned int ntmi, ntni, ntki, mti = 1, nti = 1, kti = 1; LIBXSMM_ASSERT(tmi <= um && tni <= un && tki <= uk); if (LIBXSMM_GEMM_FLAG_TRANS_AB == (LIBXSMM_GEMM_FLAG_TRANS_AB & result.ptr->gemm_flags)) { const unsigned int ttm = (unsigned int)libxsmm_product_limit(tmi, (unsigned int)ntasks, 0); const unsigned int ttn = (unsigned int)libxsmm_product_limit(tni, (unsigned int)ntasks, 0); tmi = tni = LIBXSMM_MIN(ttm, ttn); /* prefer threads over larger tile */ } if (EXIT_SUCCESS == libxsmm_gemm_plan_internal((unsigned int)ntasks, um, un, uk, tmi, tni, tki, &ntmi, &ntni, &ntki, &mti, &nti, &kti)) { const int exit_plan = ((tmi < um && tni < un && tki < uk && (tm != tmi || tn != tni || tk != tki)) ? 0 : 1); const unsigned itasks = mti * nti * kti; LIBXSMM_ASSERT(1 <= itasks); if (max_ntasks < itasks) { ntm = ntmi; ntn = ntni; ntk = ntki; mt = mti; nt = nti; kt = kti; tm = tmi; tn = tni; tk = tki; max_ntasks = itasks; } if (itasks == (unsigned int)ntasks || 0 != exit_plan) break; } } } else { const unsigned int tmi = atoi(env_tm); const double s2 = (double)internal_gemm_nstretch * internal_gemm_kstretch; /* LIBXSMM_INIT! */ double si, s; tm = libxsmm_product_limit(um, LIBXSMM_MIN(tmi, internal_gemm_mlimit), 0); /* LIBXSMM_INIT! */ si = (double)(LIBXSMM_CONFIG_MAX_MNK) / ((double)tm * tm * tm); s = (s2 <= si ? 
1 : (s2 / si)); tn = libxsmm_product_limit(un, LIBXSMM_MAX((unsigned int)(tm * (s * internal_gemm_nstretch)), 1), 0); tk = libxsmm_product_limit(uk, LIBXSMM_MAX((unsigned int)(tm * (s * internal_gemm_kstretch)), 1), 0); if (LIBXSMM_GEMM_FLAG_TRANS_AB == (LIBXSMM_GEMM_FLAG_TRANS_AB & result.ptr->gemm_flags)) { const unsigned int ttm = (unsigned int)libxsmm_product_limit(tm, (unsigned int)ntasks, 0); const unsigned int ttn = (unsigned int)libxsmm_product_limit(tn, (unsigned int)ntasks, 0); tm = tn = LIBXSMM_MIN(ttm, ttn); /* prefer threads over larger tile */ } if (EXIT_SUCCESS == libxsmm_gemm_plan_internal((unsigned int)ntasks, um, un, uk, tm, tn, tk, &ntm, &ntn, &ntk, &mt, &nt, &kt)) { #if defined(NDEBUG) max_ntasks = 2; /* only need something unequal to zero to pass below condition */ #else max_ntasks = mt * nt * kt; #endif } } LIBXSMM_ASSERT(LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & result.ptr->gemm_flags) || tm == tn); /* check for non-conforming GEMM parameters (error), and conforming GEMM parameters (fast-path, fall-back) */ if (0 == max_ntasks || 0 == tm || 0 == tn || 0 == tk || 0 != (um % tm) || 0 != (un % tn) || 0 != (uk % tk)) { return NULL; } result.ptr->flags = flags; if (LIBXSMM_GEMM_HANDLE_FLAG_AUTO == flags && 0 == LIBXSMM_SMM_AI(um, un, uk, 0 == (result.ptr->gemm_flags & LIBXSMM_GEMM_FLAG_BETA_0) ? 1 : 2/*RFO*/, result.ptr->otypesize)) { if (um == LIBXSMM_UP2POT(um) || un == LIBXSMM_UP2POT(un)) { /* power-of-two (POT) extent(s) */ result.ptr->flags |= LIBXSMM_GEMM_HANDLE_FLAG_COPY_C; if (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & result.ptr->gemm_flags)) { result.ptr->flags |= LIBXSMM_GEMM_HANDLE_FLAG_COPY_A; } } } result.ptr->itypesize = libxsmm_typesize((libxsmm_datatype)iprec); result.ptr->ldc = (unsigned int)(NULL != ldc ? *ldc : *m); ulda = (NULL != lda ? ((unsigned int)(*lda)) : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & result.ptr->gemm_flags) ? ((unsigned int)(*m)) : uk)); uldb = (NULL != ldb ? 
((unsigned int)(*ldb)) : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & result.ptr->gemm_flags) ? uk : un)); if (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & result.ptr->gemm_flags)) { /* NN, NT, or TN */ kldc = (libxsmm_blasint)result.ptr->ldc; klda = (libxsmm_blasint)ulda; kldb = (libxsmm_blasint)uldb; if (0 != (LIBXSMM_GEMM_FLAG_TRANS_A & result.ptr->gemm_flags)) { /* TN */ #if !defined(LIBXSMM_GEMM_NOJIT_TRANS) result.ptr->copy_a.xtrans = libxsmm_dispatch_trans(libxsmm_trans_descriptor_init(&desc_blob, result.ptr->itypesize, tk, tm, tm/*ldo*/)); #endif klda = (libxsmm_blasint)tm; } else if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_A & result.ptr->flags)) { result.ptr->copy_a.xmatcopy = libxsmm_dispatch_mcopy(libxsmm_mcopy_descriptor_init(&desc_blob, result.ptr->itypesize, tm, tk, tm/*ldo*/, ulda/*ldi*/, 0/*flags*/, prf_copy, NULL/*unroll*/)); klda = (libxsmm_blasint)tm; } if (0 != (LIBXSMM_GEMM_FLAG_TRANS_B & result.ptr->gemm_flags)) { /* NT */ #if !defined(LIBXSMM_GEMM_NOJIT_TRANS) result.ptr->copy_b.xtrans = libxsmm_dispatch_trans(libxsmm_trans_descriptor_init(&desc_blob, result.ptr->itypesize, tn, tk, tk/*ldo*/)); #endif kldb = (libxsmm_blasint)tk; } else if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_B & result.ptr->flags)) { result.ptr->copy_b.xmatcopy = libxsmm_dispatch_mcopy(libxsmm_mcopy_descriptor_init(&desc_blob, result.ptr->itypesize, tk, tn, tk/*ldo*/, uldb/*ldi*/, 0/*flags*/, prf_copy, NULL/*unroll*/)); kldb = (libxsmm_blasint)tk; } if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_C & result.ptr->flags)) { result.ptr->copy_o.xmatcopy = libxsmm_dispatch_mcopy(libxsmm_mcopy_descriptor_init(&desc_blob, result.ptr->otypesize, tm, tn, result.ptr->ldc/*ldo*/, tm/*ldi*/, 0/*flags*/, prf_copy, NULL/*unroll*/)); if (0 == (result.ptr->gemm_flags & LIBXSMM_GEMM_FLAG_BETA_0)) { /* copy-in only if beta!=0 */ result.ptr->copy_i.xmatcopy = libxsmm_dispatch_mcopy(libxsmm_mcopy_descriptor_init(&desc_blob, result.ptr->otypesize, tm, tn, tm/*ldo*/, result.ptr->ldc/*ldi*/, 0/*flags*/, 
prf_copy, NULL/*unroll*/)); } kldc = (libxsmm_blasint)tm; } result.ptr->lda = ulda; result.ptr->ldb = uldb; result.ptr->km = tm; result.ptr->kn = tn; result.ptr->mt = mt; result.ptr->nt = nt; result.ptr->m = um; result.ptr->n = un; result.ptr->dm = LIBXSMM_UPDIV(ntm, mt) * tm; result.ptr->dn = LIBXSMM_UPDIV(ntn, nt) * tn; km = tm; kn = tn; } else { /* TT */ const unsigned int tt = tm; klda = (libxsmm_blasint)uldb; kldb = (libxsmm_blasint)ulda; kldc = (libxsmm_blasint)tt; LIBXSMM_ASSERT(tt == tn); #if !defined(LIBXSMM_GEMM_NOJIT_TRANS) result.ptr->copy_o.xtrans = libxsmm_dispatch_trans(libxsmm_trans_descriptor_init(&desc_blob, result.ptr->otypesize, tt, tt, result.ptr->ldc/*ldo*/)); if (0 == (result.ptr->gemm_flags & LIBXSMM_GEMM_FLAG_BETA_0)) { /* copy-in only if beta!=0 */ result.ptr->copy_i.xtrans = libxsmm_dispatch_trans(libxsmm_trans_descriptor_init(&desc_blob, result.ptr->otypesize, tt, tt, tt/*ldo*/)); } #endif if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_A & result.ptr->flags)) { result.ptr->copy_a.xmatcopy = libxsmm_dispatch_mcopy(libxsmm_mcopy_descriptor_init(&desc_blob, result.ptr->itypesize, tt, tk, tk/*ldo*/, uldb/*ldi*/, 0/*flags*/, prf_copy, NULL/*unroll*/)); klda = (libxsmm_blasint)tt; } if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_B & result.ptr->flags)) { result.ptr->copy_b.xmatcopy = libxsmm_dispatch_mcopy(libxsmm_mcopy_descriptor_init(&desc_blob, result.ptr->itypesize, tk, tn, tk/*ldo*/, ulda/*ldi*/, 0/*flags*/, prf_copy, NULL/*unroll*/)); kldb = (libxsmm_blasint)tk; } result.ptr->lda = uldb; result.ptr->ldb = ulda; result.ptr->km = tn; result.ptr->kn = tm; result.ptr->mt = nt; result.ptr->nt = mt; result.ptr->m = un; result.ptr->n = um; result.ptr->dm = LIBXSMM_UPDIV(ntn, nt) * tn; result.ptr->dn = LIBXSMM_UPDIV(ntm, mt) * tm; km = kn = tt; } result.ptr->dk = ntk / kt * tk; result.ptr->kk = tk; result.ptr->kt = kt; result.ptr->k = uk; desc = libxsmm_gemm_descriptor_init2( /* remove transpose flags from kernel request */ &desc_blob, iprec, oprec, km, kn, 
result.ptr->kk, klda, kldb, kldc, alpha, beta, result.ptr->gemm_flags & ~LIBXSMM_GEMM_FLAG_TRANS_AB, internal_gemm_tiled_prefetch); result.ptr->kernel[0] = libxsmm_xmmdispatch(desc); if (NULL != result.ptr->kernel[0].xmm) { if (0 == (desc->flags & LIBXSMM_GEMM_FLAG_BETA_0)) { /* beta!=0 */ result.ptr->kernel[1] = result.ptr->kernel[0]; } else { /* generate kernel with beta=1 */ desc->flags &= ~LIBXSMM_GEMM_FLAG_BETA_0; result.ptr->kernel[1] = libxsmm_xmmdispatch(desc); if (NULL == result.ptr->kernel[1].xmm) result.ptr = NULL; } } else result.ptr = NULL; } else { result.ptr = NULL; } return result.ptr; } LIBXSMM_API_INLINE size_t libxsmm_gemm_handle_get_scratch_size_a(const libxsmm_gemm_handle* handle) { size_t result; if (NULL == handle || (0 == (handle->flags & LIBXSMM_GEMM_HANDLE_FLAG_COPY_A) && (LIBXSMM_GEMM_FLAG_TRANS_AB == (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags) || (LIBXSMM_GEMM_FLAG_TRANS_A & handle->gemm_flags) == 0))) { result = 0; } else { const size_t size = (size_t)handle->km * handle->kk * handle->itypesize; result = LIBXSMM_UP2(size, LIBXSMM_CACHELINE); } return result; } LIBXSMM_API_INLINE size_t libxsmm_gemm_handle_get_scratch_size_b(const libxsmm_gemm_handle* handle) { size_t result; if (NULL == handle || (0 == (handle->flags & LIBXSMM_GEMM_HANDLE_FLAG_COPY_B) && (LIBXSMM_GEMM_FLAG_TRANS_AB == (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags) || (LIBXSMM_GEMM_FLAG_TRANS_B & handle->gemm_flags) == 0))) { result = 0; } else { const size_t size = (size_t)handle->kk * handle->kn * handle->itypesize; result = LIBXSMM_UP2(size, LIBXSMM_CACHELINE); } return result; } LIBXSMM_API_INLINE size_t libxsmm_gemm_handle_get_scratch_size_c(const libxsmm_gemm_handle* handle) { size_t result; if (NULL == handle || (0 == (handle->flags & LIBXSMM_GEMM_HANDLE_FLAG_COPY_C) && LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags))) { result = 0; } else { const size_t size = (size_t)handle->km * handle->kn * handle->otypesize; result = 
LIBXSMM_UP2(size, LIBXSMM_CACHELINE); } return result; } LIBXSMM_API size_t libxsmm_gemm_handle_get_scratch_size(const libxsmm_gemm_handle* handle) { size_t result; if (NULL != handle) { /* thread-local scratch buffer for GEMM */ const size_t size_a = libxsmm_gemm_handle_get_scratch_size_a(handle); const size_t size_b = libxsmm_gemm_handle_get_scratch_size_b(handle); const size_t size_c = libxsmm_gemm_handle_get_scratch_size_c(handle); result = (size_a + size_b + size_c) * handle->mt * handle->nt * handle->kt; } else { result = 0; } return result; } LIBXSMM_API void libxsmm_gemm_thread(const libxsmm_gemm_handle* handle, void* scratch, const void* a, const void* b, void* c, /*unsigned*/int tid, /*unsigned*/int nthreads) { #if !defined(NDEBUG) if (NULL != handle && 0 <= tid && tid < nthreads) #endif { const unsigned int uthreads = (unsigned int)nthreads; const unsigned int ntasks = handle->mt * handle->nt * handle->kt; const unsigned int spread = (ntasks <= uthreads ? (uthreads / ntasks) : 1); const unsigned int utid = (unsigned int)tid, vtid = utid / spread; if (utid < (spread * ntasks) && 0 == (utid - vtid * spread)) { const int excess = (uthreads << 1) <= (vtid + ntasks); const unsigned int rtid = vtid / handle->mt, mtid = vtid - rtid * handle->mt, ntid = rtid % handle->nt, ktid = vtid / (handle->mt * handle->nt); const unsigned int m0 = mtid * handle->dm, m1 = (0 == excess ? LIBXSMM_MIN(m0 + handle->dm, handle->m) : handle->m); const unsigned int n0 = ntid * handle->dn, n1 = (0 == excess ? LIBXSMM_MIN(n0 + handle->dn, handle->n) : handle->n); const unsigned int k0 = ktid * handle->dk, k1 = (0 == excess ? LIBXSMM_MIN(k0 + handle->dk, handle->k) : handle->k); const unsigned int ldo = (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags) ? 
handle->km : handle->kk); /* calculate increments to simplify address calculations */ const unsigned int dom = handle->km * handle->otypesize; const unsigned int don = handle->kn * handle->otypesize; const unsigned int dik = handle->kk * handle->itypesize; const unsigned int on = handle->otypesize * n0; /* calculate base address of thread-local storage */ const size_t size_a = libxsmm_gemm_handle_get_scratch_size_a(handle); const size_t size_b = libxsmm_gemm_handle_get_scratch_size_b(handle); const size_t size_c = libxsmm_gemm_handle_get_scratch_size_c(handle); char *const at = (char*)scratch + (size_a + size_b + size_c) * vtid; char *const bt = at + size_a, *const ct = bt + size_b; const libxsmm_xcopykernel kernel = { NULL }; /* loop induction variables and other variables */ unsigned int om = handle->otypesize * m0, im = m0, in = n0, ik = k0, im1, in1, ik1; LIBXSMM_ASSERT_MSG(mtid < handle->mt && ntid < handle->nt && ktid < handle->kt, "Invalid task-ID"); LIBXSMM_ASSERT_MSG(m1 <= handle->m && n1 <= handle->n && k1 <= handle->k, "Invalid task size"); for (im1 = im + handle->km; (im1 - 1) < m1; im = im1, im1 += handle->km, om += dom) { unsigned int dn = don, dka = dik, dkb = dik; char *c0 = (char*)c, *ci; const char *aa; if (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags)) { if (0 != (LIBXSMM_GEMM_FLAG_TRANS_A & handle->gemm_flags)) { /* TN */ aa = (const char*)a + ((size_t)im * handle->lda + k0) * handle->itypesize; } else if (0 != (LIBXSMM_GEMM_FLAG_TRANS_B & handle->gemm_flags)) { /* NT */ aa = (const char*)a + ((size_t)k0 * handle->lda + im) * handle->itypesize; dka *= handle->lda; dkb *= handle->ldb; } else { /* NN */ aa = (const char*)a + ((size_t)k0 * handle->lda + im) * handle->itypesize; dka *= handle->lda; } c0 += (size_t)on * handle->ldc + om; dn *= handle->ldc; } else { /* TT */ aa = (const char*)b + ((size_t)k0 * handle->lda + im) * handle->itypesize; c0 += (size_t)on + handle->ldc * (size_t)om; dka *= handle->lda; } for 
(in = n0, in1 = in + handle->kn; (in1 - 1) < n1; in = in1, in1 += handle->kn, c0 += dn) { const char *a0 = aa, *b0 = (const char*)b; if (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags)) { if (0 != (LIBXSMM_GEMM_FLAG_TRANS_B & handle->gemm_flags)) { /* NT */ b0 += ((size_t)k0 * handle->ldb + in) * handle->itypesize; } else { /* NN or TN */ b0 += ((size_t)in * handle->ldb + k0) * handle->itypesize; } } else { /* TT */ b0 = (const char*)a + ((size_t)in * handle->ldb + k0) * handle->itypesize; } if (NULL == handle->copy_i.ptr_const) { ci = (NULL == handle->copy_o.ptr_const ? c0 : ct); if (LIBXSMM_GEMM_FLAG_TRANS_AB == (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags)) { const unsigned int km = handle->kn, kn = handle->km; libxsmm_otrans_internal(ct/*out*/, c0/*in*/, handle->otypesize, handle->ldc/*ldi*/, kn/*ldo*/, 0, km, 0, kn, km/*tile*/, kn/*tile*/, kernel); ci = ct; } else if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_C & handle->flags)) { if (0 == (handle->gemm_flags & LIBXSMM_GEMM_FLAG_BETA_0)) { /* copy-in only if beta!=0 */ libxsmm_matcopy_internal(ct/*out*/, c0/*in*/, handle->otypesize, handle->ldc/*ldi*/, handle->km/*ldo*/, 0, handle->km, 0, handle->kn, handle->km/*tile*/, handle->kn/*tile*/, kernel); } ci = ct; } } else { /* MCOPY/TCOPY kernel */ handle->copy_i.xmatcopy(c0, &handle->ldc, ct, &handle->km); ci = ct; } for (ik = k0, ik1 = ik + handle->kk; (ik1 - 1) < k1; ik = ik1, ik1 += handle->kk) { const char *const a1 = a0 + dka, *const b1 = b0 + dkb, *ai = a0, *bi = b0; if (NULL == handle->copy_a.ptr_const) { if (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags) && (LIBXSMM_GEMM_FLAG_TRANS_A & handle->gemm_flags) != 0) /* pure A-transpose */ { LIBXSMM_ASSERT(ldo == handle->km); libxsmm_otrans_internal(at/*out*/, a0/*in*/, handle->itypesize, handle->lda/*ldi*/, ldo, 0, handle->kk, 0, handle->km, handle->kk/*tile*/, handle->km/*tile*/, kernel); ai = at; } else if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_A & 
handle->flags)) { libxsmm_matcopy_internal(at/*out*/, a0/*in*/, handle->itypesize, handle->lda/*ldi*/, ldo, 0, handle->km, 0, handle->kk, handle->km/*tile*/, handle->kk/*tile*/, kernel); ai = at; } } else { /* MCOPY/TCOPY kernel */ handle->copy_a.xmatcopy(a0, &handle->lda, at, &ldo); ai = at; } if (NULL == handle->copy_b.ptr_const) { if (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags) && (LIBXSMM_GEMM_FLAG_TRANS_B & handle->gemm_flags) != 0) /* pure B-transpose */ { libxsmm_otrans_internal(bt/*out*/, b0/*in*/, handle->itypesize, handle->ldb/*ldi*/, handle->kk/*ldo*/, 0, handle->kn, 0, handle->kk, handle->kn/*tile*/, handle->kk/*tile*/, kernel); bi = bt; } else if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_B & handle->flags)) { libxsmm_matcopy_internal(bt/*out*/, b0/*in*/, handle->itypesize, handle->ldb/*ldi*/, handle->kk/*ldo*/, 0, handle->kk, 0, handle->kn, handle->kk/*tile*/, handle->kn/*tile*/, kernel); bi = bt; } } else { /* MCOPY/TCOPY kernel */ handle->copy_b.xmatcopy(b0, &handle->ldb, bt, &handle->kk); bi = bt; } /* beta0-kernel on first-touch, beta1-kernel otherwise (beta0/beta1 are identical if beta=1) */ LIBXSMM_MMCALL_PRF(handle->kernel[k0!=ik?1:0].xmm, ai, bi, ci, a1, b1, c0); a0 = a1; b0 = b1; } /* TODO: synchronize */ if (NULL == handle->copy_o.ptr_const) { if (LIBXSMM_GEMM_FLAG_TRANS_AB == (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags)) { libxsmm_otrans_internal(c0/*out*/, ct/*in*/, handle->otypesize, handle->km/*ldi*/, handle->ldc/*ldo*/, 0, handle->km, 0, handle->kn, handle->km/*tile*/, handle->kn/*tile*/, kernel); } else if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_C & handle->flags)) { libxsmm_matcopy_internal(c0/*out*/, ct/*in*/, handle->otypesize, handle->km/*ldi*/, handle->ldc/*ldo*/, 0, handle->km, 0, handle->kn, handle->km/*tile*/, handle->kn/*tile*/, kernel); } } else { /* MCOPY/TCOPY kernel */ handle->copy_o.xmatcopy(ct, &handle->km, c0, &handle->ldc); } } } } } #if !defined(NDEBUG) else if (/*implies 
LIBXSMM_INIT*/0 != libxsmm_get_verbosity()) { /* library code is expected to be mute */ static int error_once = 0; if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: libxsmm_gemm_thread - invalid handle!\n"); } } #endif } LIBXSMM_API void libxsmm_xgemm(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc) { libxsmm_gemm_blob blob; const libxsmm_gemm_handle *const handle = libxsmm_gemm_handle_init(&blob, iprec, oprec, transa, transb, m, n, k, lda, ldb, ldc, alpha, beta, LIBXSMM_GEMM_HANDLE_FLAG_AUTO, 1/*ntasks*/); const size_t scratch_size = libxsmm_gemm_handle_get_scratch_size(handle); void* scratch = NULL; if (NULL != handle && (0 == scratch_size || NULL != (scratch = libxsmm_scratch_malloc(scratch_size, LIBXSMM_CACHELINE, LIBXSMM_MALLOC_INTERNAL_CALLER)))) { libxsmm_gemm_thread(handle, scratch, a, b, c, 0/*tid*/, 1/*ntasks*/); libxsmm_free(scratch); } else { /* fall-back or error */ static int error_once = 0; if (NULL == handle) { /* fall-back */ if ((LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM WARNING (XGEMM): fall-back code path triggered!\n"); } } else if (0 != libxsmm_verbosity && /* library code is expected to be mute */ 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: failed to allocate GEMM-scratch memory!\n"); } libxsmm_blas_xgemm(iprec, oprec, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } } LIBXSMM_API void libxsmm_dgemm_batch( const char transa_array[], const char 
transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) { const libxsmm_blasint ngroups = LIBXSMM_ABS(*group_count), ptrsize = sizeof(void*); libxsmm_blasint i, j = 0; for (i = 0; i < ngroups; ++i) { const libxsmm_blasint size = group_size[i]; libxsmm_gemm_batch(LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64, transa_array + i, transb_array + i, m_array[i], n_array[i], k_array[i], alpha_array + i, a_array + j, lda_array + i, b_array + j, ldb_array + i, beta_array + i, c_array + j, ldc_array + i, 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, size); j += LIBXSMM_ABS(size); } } LIBXSMM_API void libxsmm_sgemm_batch( const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) { const libxsmm_blasint ngroups = LIBXSMM_ABS(*group_count), ptrsize = sizeof(void*); libxsmm_blasint i, j = 0; for (i = 0; i < ngroups; ++i) { const libxsmm_blasint size = group_size[i]; libxsmm_gemm_batch(LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32, transa_array + i, transb_array + i, m_array[i], n_array[i], k_array[i], alpha_array + i, a_array + j, lda_array + i, b_array + j, ldb_array + i, beta_array + i, c_array + j, ldc_array + i, 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, size); j += 
LIBXSMM_ABS(size); } } LIBXSMM_API void libxsmm_dgemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { LIBXSMM_XGEMM(double, double, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } LIBXSMM_API void libxsmm_sgemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { LIBXSMM_XGEMM(float, float, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } LIBXSMM_API void libxsmm_wigemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const int* alpha, const short* a, const libxsmm_blasint* lda, const short* b, const libxsmm_blasint* ldb, const int* beta, int* c, const libxsmm_blasint* ldc) { LIBXSMM_XGEMM(short, int, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } LIBXSMM_API void libxsmm_bsgemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const libxsmm_bfloat16* a, const libxsmm_blasint* lda, const libxsmm_bfloat16* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { LIBXSMM_XGEMM(libxsmm_bfloat16, float, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } LIBXSMM_API int libxsmm_mmbatch_kernel(libxsmm_xmmfunction kernel, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], const void* a, const void* b, void* c, libxsmm_blasint batchsize, 
/*unsigned*/int tid, /*unsigned*/int ntasks, unsigned char itypesize, unsigned char otypesize, int flags) { int result = EXIT_SUCCESS; const libxsmm_blasint size = LIBXSMM_ABS(batchsize); const libxsmm_blasint tasksize = LIBXSMM_UPDIV(size, ntasks); const libxsmm_blasint begin = tid * tasksize, span = begin + tasksize; const libxsmm_blasint end = LIBXSMM_MIN(span, size); LIBXSMM_ASSERT(NULL != kernel.xmm); if (begin < end) { const char *const a0 = (const char*)a, *const b0 = (const char*)b; char *const c0 = (char*)c; LIBXSMM_ASSERT(0 < itypesize && 0 < otypesize); if (0 == (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS & flags)) { if (0 != index_stride) { /* stride arrays contain indexes */ libxsmm_blasint i = begin * index_stride, ic = (NULL != stride_c ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) : 0); const char* ai = &a0[NULL != stride_a ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) * itypesize) : 0]; const char* bi = &b0[NULL != stride_b ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) * itypesize) : 0]; char* ci = &c0[ic * otypesize]; const libxsmm_blasint end1 = (end != size ? 
end : (end - 1)) * index_stride; #if (0 != LIBXSMM_SYNC) if (1 == ntasks || 0 == internal_gemm_nlocks || 0 > batchsize || 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & flags)) #endif { /* no locking */ if (NULL != stride_a && NULL != stride_b && NULL != stride_c) { const unsigned char ibits = (unsigned char)LIBXSMM_INTRINSICS_BITSCANBWD32(itypesize); const unsigned char obits = (unsigned char)LIBXSMM_INTRINSICS_BITSCANBWD32(otypesize); if (itypesize == (1 << ibits) && otypesize == (1 << obits)) { for (i += index_stride; i <= end1; i += index_stride) { const char *const an = &a0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) << ibits]; const char *const bn = &b0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) << ibits]; char *const cn = &c0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) << obits]; kernel.xmm(ai, bi, ci, an, bn, cn); /* with prefetch */ ai = an; bi = bn; ci = cn; } } else { /* non-pot type sizes */ for (i += index_stride; i <= end1; i += index_stride) { const char *const an = &a0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) * itypesize]; const char *const bn = &b0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) * itypesize]; char *const cn = &c0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) * otypesize]; kernel.xmm(ai, bi, ci, an, bn, cn); /* with prefetch */ ai = an; bi = bn; ci = cn; } } } else { /* mixed specification of strides */ for (i += index_stride; i <= end1; i += index_stride) { const char *const an = &a0[NULL != stride_a ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) * itypesize) : 0]; const char *const bn = &b0[NULL != stride_b ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) * itypesize) : 0]; char *const cn = &c0[NULL != stride_c ? 
((LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) * otypesize) : 0]; kernel.xmm(ai, bi, ci, an, bn, cn); /* with prefetch */ ai = an; bi = bn; ci = cn; } } if (end == size) { /* remainder multiplication */ kernel.xmm(ai, bi, ci, ai, bi, ci); /* pseudo-prefetch */ } } #if (0 != LIBXSMM_SYNC) else { /* synchronize among C-indexes */ LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK)* lock = &internal_gemm_lock[LIBXSMM_GEMM_LOCKIDX(ic, internal_gemm_nlocks)].state; # if defined(LIBXSMM_GEMM_LOCKFWD) LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK)* lock0 = NULL; # endif LIBXSMM_ASSERT(NULL != lock); if (NULL != stride_a && NULL != stride_b && NULL != stride_c) { for (i += index_stride; i <= end1; i += index_stride) { ic = LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base; { const char *const an = &a0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) * itypesize]; const char *const bn = &b0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) * itypesize]; char *const cn = &c0[ic * otypesize]; LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) *const lock1 = &internal_gemm_lock[LIBXSMM_GEMM_LOCKIDX(ic, internal_gemm_nlocks)].state; # if defined(LIBXSMM_GEMM_LOCKFWD) if (lock != lock0) { lock0 = lock; LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); } # else LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); # endif kernel.xmm(ai, bi, ci, an, bn, cn); /* with prefetch */ # if defined(LIBXSMM_GEMM_LOCKFWD) if (lock != lock1 || i == end1) { LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); lock = lock1; } # else LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); lock = lock1; # endif ai = an; bi = bn; ci = cn; /* next */ } } } else { for (i += index_stride; i <= end1; i += index_stride) { ic = (NULL != stride_c ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) : 0); { const char *const an = &a0[NULL != stride_a ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) * itypesize) : 0]; const char *const bn = &b0[NULL != stride_b ? 
((LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) * itypesize) : 0]; char *const cn = &c0[ic * otypesize]; LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) *const lock1 = &internal_gemm_lock[LIBXSMM_GEMM_LOCKIDX(ic, internal_gemm_nlocks)].state; # if defined(LIBXSMM_GEMM_LOCKFWD) if (lock != lock0) { lock0 = lock; LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); } # else LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); # endif kernel.xmm(ai, bi, ci, an, bn, cn); /* with prefetch */ # if defined(LIBXSMM_GEMM_LOCKFWD) if (lock != lock1 || i == end1) { LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); lock = lock1; } # else LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); lock = lock1; # endif ai = an; bi = bn; ci = cn; /* next */ } } } if (end == size) { /* remainder multiplication */ LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); kernel.xmm(ai, bi, ci, ai, bi, ci); /* pseudo-prefetch */ LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); } } #endif /*(0 != LIBXSMM_SYNC)*/ } else { /* singular strides are measured in Bytes */ const libxsmm_blasint da = (NULL != stride_a ? (*stride_a - index_base * sizeof(void*)) : 0); const libxsmm_blasint db = (NULL != stride_b ? (*stride_b - index_base * sizeof(void*)) : 0); const libxsmm_blasint dc = (NULL != stride_c ? (*stride_c - index_base * sizeof(void*)) : 0); libxsmm_blasint i; const libxsmm_blasint end1 = (end != size ? 
end : (end - 1)); const char *ai = a0 + (size_t)da * begin, *bi = b0 + (size_t)db * begin; char* ci = c0 + (size_t)dc * begin; #if (0 != LIBXSMM_SYNC) if (1 == ntasks || 0 == internal_gemm_nlocks || 0 > batchsize || 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & flags)) #endif { /* no locking */ for (i = begin; i < end1; ++i) { const char *const an = ai + da, *const bn = bi + db; char *const cn = ci + dc; #if defined(LIBXSMM_GEMM_CHECK) if (NULL != *((const void**)ai) && NULL != *((const void**)bi) && NULL != *((const void**)ci)) #endif { kernel.xmm( /* with prefetch */ *((const void**)ai), *((const void**)bi), *((void**)ci), *((const void**)an), *((const void**)bn), *((const void**)cn)); } ai = an; bi = bn; ci = cn; /* next */ } if ( /* remainder multiplication */ #if defined(LIBXSMM_GEMM_CHECK) NULL != *((const void**)ai) && NULL != *((const void**)bi) && NULL != *((const void**)ci) && #endif end == size) { kernel.xmm( /* pseudo-prefetch */ *((const void**)ai), *((const void**)bi), *((void**)ci), *((const void**)ai), *((const void**)bi), *((const void**)ci)); } } #if (0 != LIBXSMM_SYNC) else { /* synchronize among C-indexes */ void* cc = *((void**)ci); LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK)* lock = &internal_gemm_lock[LIBXSMM_GEMM_LOCKPTR(cc, internal_gemm_nlocks)].state; # if defined(LIBXSMM_GEMM_LOCKFWD) LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK)* lock0 = NULL; # endif LIBXSMM_ASSERT(NULL != lock); for (i = begin + 1; i <= end1; ++i) { const char *const an = ai + da, *const bn = bi + db; char *const cn = ci + dc; void *const nc = *((void**)cn); # if defined(LIBXSMM_GEMM_CHECK) if (NULL != *((const void**)ai) && NULL != *((const void**)bi) && NULL != cc) # endif { LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) *const lock1 = &internal_gemm_lock[LIBXSMM_GEMM_LOCKPTR(nc, internal_gemm_nlocks)].state; # if defined(LIBXSMM_GEMM_LOCKFWD) if (lock != lock0) { lock0 = lock; LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); } # else LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); # endif kernel.xmm( /* 
with prefetch */ *((const void**)ai), *((const void**)bi), cc, *((const void**)an), *((const void**)bn), *((const void**)cn)); # if defined(LIBXSMM_GEMM_LOCKFWD) if (lock != lock1 || i == end1) { LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); lock = lock1; } # else LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); lock = lock1; # endif } ai = an; bi = bn; ci = cn; cc = nc; /* next */ } if ( /* remainder multiplication */ # if defined(LIBXSMM_GEMM_CHECK) NULL != *((const void**)ai) && NULL != *((const void**)bi) && NULL != cc && # endif end == size) { LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); kernel.xmm( /* pseudo-prefetch */ *((const void**)ai), *((const void**)bi), cc, *((const void**)ai), *((const void**)bi), cc); LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); } } #endif /*(0 != LIBXSMM_SYNC)*/ } } #if defined(LIBXSMM_GEMM_BATCHREDUCE) else /* LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS */ # if defined(LIBXSMM_GEMM_CHECK) if ( # if (0 != LIBXSMM_SYNC) (1 == ntasks || 0 == internal_gemm_nlocks || 0 > batchsize) && # endif (0 == (LIBXSMM_GEMM_FLAG_BETA_0 & flags)) && (0 != internal_gemm_batchreduce)) # endif { const unsigned int n = libxsmm_mmbatch_size * (LIBXSMM_GEMM_BATCHSCALE) / ((unsigned int)sizeof(void*)); LIBXSMM_ASSERT(NULL != libxsmm_mmbatch_array && 0 != libxsmm_mmbatch_size); if ((2U/*A and B matrices*/ * tasksize) <= n) { const void **ai = (const void**)libxsmm_mmbatch_array + begin, **bi = ai + size; unsigned long long count; if (0 != index_stride) { /* stride arrays contain indexes */ const size_t end_stride = (size_t)end * index_stride; size_t i = (size_t)begin * index_stride; char *ci = &c0[NULL != stride_c ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) * otypesize) : 0], *cn = ci; do { for (count = 0; i < end_stride && ci == cn; ++count) { const size_t j = i + index_stride; *ai++ = &a0[NULL != stride_a ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) * itypesize) : 0]; *bi++ = &b0[NULL != stride_b ? 
((LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) * itypesize) : 0]; cn = &c0[NULL != stride_c ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, j) - index_base) * otypesize) : 0]; i = j; } ai = (const void**)libxsmm_mmbatch_array + begin; bi = ai + size; kernel.xbm(ai, bi, ci, &count); ci = cn; } while (i < end_stride); } else { /* singular strides are measured in Bytes */ const libxsmm_blasint da = (NULL != stride_a ? (*stride_a - index_base * sizeof(void*)) : 0); const libxsmm_blasint db = (NULL != stride_b ? (*stride_b - index_base * sizeof(void*)) : 0); const libxsmm_blasint dc = (NULL != stride_c ? (*stride_c - index_base * sizeof(void*)) : 0); const char *ia = a0 + (size_t)da * begin, *ib = b0 + (size_t)db * begin; char* ic = c0 + (size_t)dc * begin; if ( # if defined(LIBXSMM_GEMM_CHECK) NULL != *((const void**)ia) && NULL != *((const void**)ib) && NULL != *((const void**)ic) && # endif sizeof(void*) == da && sizeof(void*) == db) /* fast path */ { if (0 != dc) { libxsmm_blasint i = begin; char* jc = ic; do { for (count = 0; i < end && *((const void**)ic) == *((const void**)jc); ++i) { # if defined(LIBXSMM_GEMM_CHECK) if (NULL != *((const void**)jc)) # endif ++count; jc += dc; /* next */ } memcpy((void*)ai, ia, count * sizeof(void*)); memcpy((void*)bi, ib, count * sizeof(void*)); kernel.xbm(ai, bi, *((void**)ic), &count); ic = jc; } while (i < end); } else { /* fastest path */ count = (unsigned long long)end - begin; memcpy((void*)ai, ia, count * sizeof(void*)); memcpy((void*)bi, ib, count * sizeof(void*)); kernel.xbm(ai, bi, *((void**)ic), &count); } } else { /* custom-copy required */ libxsmm_blasint i = begin; char* jc = ic; do { for (count = 0; i < end && *((const void**)ic) == *((const void**)jc); ++i) { # if defined(LIBXSMM_GEMM_CHECK) if (NULL != *((const void**)ia) && NULL != *((const void**)ib) && NULL != *((const void**)jc)) # endif { *ai++ = *((const void**)ia); *bi++ = *((const void**)ib); ++count; } ia += da; ib += db; jc += 
dc; /* next */ } ai = (const void**)libxsmm_mmbatch_array + begin; bi = ai + size; kernel.xbm(ai, bi, *((void**)ic), &count); ic = jc; } while (i < end); } } } else { /* fall-back */ result = EXIT_FAILURE; } } #endif /*defined(LIBXSMM_GEMM_BATCHREDUCE)*/ } /* coverity[missing_unlock] */ return result; } LIBXSMM_API void libxsmm_gemm_internal_set_batchflag(libxsmm_gemm_descriptor* descriptor, void* c, libxsmm_blasint index_stride, libxsmm_blasint batchsize, int multithreaded) { LIBXSMM_ASSERT(NULL != descriptor); if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & descriptor->flags)) { const uintptr_t vw = (LIBXSMM_X86_AVX512 <= libxsmm_target_archid ? 64 : 32); /* assume that all C-matrices are aligned eventually */ if (0 == LIBXSMM_MOD2((uintptr_t)c, vw) #if 0 /* should fall-back in BE */ && LIBXSMM_X86_AVX <= libxsmm_target_archid #endif && 0 != index_stride) { const int oprec = LIBXSMM_GETENUM_OUT(descriptor->datatype); const libxsmm_blasint typesize = LIBXSMM_TYPESIZE(oprec); const libxsmm_blasint csize = (libxsmm_blasint)descriptor->ldc * descriptor->n * typesize; /* finalize assumption if matrix-size is a multiple of the vector-width */ descriptor->flags |= (unsigned short)(0 == LIBXSMM_MOD2(csize, vw) ? 
LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT : 0); } } #if defined(LIBXSMM_GEMM_BATCHREDUCE) else if (0 != internal_gemm_batchreduce) { /* check if reduce-batch kernel can be used */ static int error_once = 0; LIBXSMM_ASSERT(NULL != libxsmm_mmbatch_array); # if (0 != LIBXSMM_SYNC) if (0 == multithreaded || 0 == internal_gemm_nlocks || 0 > batchsize) # endif { int result = EXIT_FAILURE; switch (LIBXSMM_GETENUM_INP(descriptor->datatype)) { case LIBXSMM_GEMM_PRECISION_F64: { if (LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_OUT(descriptor->datatype)) { result = EXIT_SUCCESS; } } break; case LIBXSMM_GEMM_PRECISION_F32: { if (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT(descriptor->datatype)) { result = EXIT_SUCCESS; } } break; } if (EXIT_SUCCESS == result) { descriptor->flags |= LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS; descriptor->prefetch = 0; /* omit decision */ } else { if ((LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) && /* library code is expected to be mute */ 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM WARNING: data type not supported in batch-reduce!\n"); } } } # if (0 != LIBXSMM_SYNC) else if ((LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) && /* library code is expected to be mute */ 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM: potential data races prevent batch-reduce.\n"); } # endif } #endif /*defined(LIBXSMM_GEMM_BATCHREDUCE)*/ #if !defined(LIBXSMM_GEMM_BATCHREDUCE) || (0 == LIBXSMM_SYNC) LIBXSMM_UNUSED(batchsize); LIBXSMM_UNUSED(multithreaded); #endif } LIBXSMM_API_INTERN void libxsmm_dmmbatch_blas(const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const double* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const double* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint 
index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize) { const libxsmm_blasint end = LIBXSMM_ABS(batchsize); libxsmm_blasint i; if (0 != index_stride) { /* stride arrays contain indexes */ const libxsmm_blasint da = (NULL != stride_a ? (*stride_a - index_base) : 0); const libxsmm_blasint db = (NULL != stride_b ? (*stride_b - index_base) : 0); const libxsmm_blasint dc = (NULL != stride_c ? (*stride_c - index_base) : 0); const libxsmm_blasint end1 = end * index_stride; const double *const a0 = (const double*)a, *const b0 = (const double*)b, *ai = a0 + da, *bi = b0 + db; double *const c0 = (double*)c, *ci = c0 + dc; for (i = index_stride; i <= end1; i += index_stride) { const double *const an = &a0[NULL != stride_a ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) : 0]; const double *const bn = &b0[NULL != stride_b ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) : 0]; double *const cn = &c0[NULL != stride_c ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) : 0]; #if defined(LIBXSMM_GEMM_CHECK) if (NULL != ai && NULL != bi && NULL != ci) #endif { libxsmm_blas_dgemm(transa, transb, &m, &n, &k, alpha, ai, lda, bi, ldb, beta, ci, ldc); } ai = an; bi = bn; ci = cn; } } else { /* singular strides are measured in Bytes */ const libxsmm_blasint da = (NULL != stride_a ? (*stride_a - index_base * sizeof(void*)) : 0); const libxsmm_blasint db = (NULL != stride_b ? (*stride_b - index_base * sizeof(void*)) : 0); const libxsmm_blasint dc = (NULL != stride_c ? 
(*stride_c - index_base * sizeof(void*)) : 0); const char *const a0 = (const char*)a, *const b0 = (const char*)b, *ai = a0, *bi = b0; char *const c0 = (char*)c, *ci = c0; for (i = 0; i < end; ++i) { const char *const an = ai + da, *const bn = bi + db; char *const cn = ci + dc; #if defined(LIBXSMM_GEMM_CHECK) if (NULL != *((const double**)ai) && NULL != *((const double**)bi) && NULL != *((const double**)ci)) #endif { libxsmm_blas_dgemm(transa, transb, &m, &n, &k, alpha, *((const double**)ai), lda, *((const double**)bi), ldb, beta, *((double**)ci), ldc); } ai = an; bi = bn; ci = cn; /* next */ } } } LIBXSMM_API_INTERN void libxsmm_smmbatch_blas(const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const float* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const float* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize) { const libxsmm_blasint end = LIBXSMM_ABS(batchsize); libxsmm_blasint i; if (0 != index_stride) { /* stride arrays contain indexes */ const libxsmm_blasint da = (NULL != stride_a ? (*stride_a - index_base) : 0); const libxsmm_blasint db = (NULL != stride_b ? (*stride_b - index_base) : 0); const libxsmm_blasint dc = (NULL != stride_c ? (*stride_c - index_base) : 0); const libxsmm_blasint end1 = end * index_stride; const float *a0 = (const float*)a, *b0 = (const float*)b, *ai = a0 + da, *bi = b0 + db; float *c0 = (float*)c, *ci = c0 + dc; for (i = index_stride; i <= end1; i += index_stride) { const float *const an = &a0[NULL != stride_a ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) : 0]; const float *const bn = &b0[NULL != stride_b ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) : 0]; float *const cn = &c0[NULL != stride_c ? 
(LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) : 0]; #if defined(LIBXSMM_GEMM_CHECK) if (NULL != ai && NULL != bi && NULL != ci) #endif { libxsmm_blas_sgemm(transa, transb, &m, &n, &k, alpha, ai, lda, bi, ldb, beta, ci, ldc); } ai = an; bi = bn; ci = cn; } } else { /* singular strides are measured in Bytes */ const libxsmm_blasint da = (NULL != stride_a ? (*stride_a - index_base * sizeof(void*)) : 0); const libxsmm_blasint db = (NULL != stride_b ? (*stride_b - index_base * sizeof(void*)) : 0); const libxsmm_blasint dc = (NULL != stride_c ? (*stride_c - index_base * sizeof(void*)) : 0); const char *a0 = (const char*)a, *b0 = (const char*)b, *ai = a0, *bi = b0; char *c0 = (char*)c, *ci = c0; for (i = 0; i < end; ++i) { const char *const an = ai + da; const char *const bn = bi + db; char *const cn = ci + dc; #if defined(LIBXSMM_GEMM_CHECK) if (NULL != *((const float**)ai) && NULL != *((const float**)bi) && NULL != *((const float**)ci)) #endif { libxsmm_blas_sgemm(transa, transb, &m, &n, &k, alpha, *((const float**)ai), lda, *((const float**)bi), ldb, beta, *((float**)ci), ldc); } ai = an; bi = bn; ci = cn; /* next */ } } } LIBXSMM_API int libxsmm_mmbatch_blas( libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize) { int result; if (NULL != a && NULL != b && NULL != c) { switch (LIBXSMM_GETENUM(iprec, oprec)) { case LIBXSMM_GEMM_PRECISION_F64: { libxsmm_dmmbatch_blas(transa, transb, m, n, k, (const double*)alpha, a, lda, b, ldb, (const double*)beta, c, ldc, index_base, index_stride, stride_a, stride_b, 
stride_c, batchsize); result = EXIT_SUCCESS; } break; case LIBXSMM_GEMM_PRECISION_F32: { libxsmm_smmbatch_blas(transa, transb, m, n, k, (const float*)alpha, a, lda, b, ldb, (const float*)beta, c, ldc, index_base, index_stride, stride_a, stride_b, stride_c, batchsize); result = EXIT_SUCCESS; } break; default: result = EXIT_FAILURE; } } else { result = EXIT_FAILURE; } return result; } LIBXSMM_API void libxsmm_mmbatch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize, /*unsigned*/int tid, /*unsigned*/int nthreads) { static int error_once = 0; #if defined(LIBXSMM_GEMM_CHECK) if (NULL != a && NULL != b && NULL != c && 0 <= tid && tid < nthreads) #endif { const unsigned char otypesize = libxsmm_typesize((libxsmm_datatype)oprec); int result = EXIT_FAILURE; LIBXSMM_INIT if (LIBXSMM_SMM_AI(m, n, k, 2/*RFO*/, otypesize)) { /* check if an SMM is suitable */ const int gemm_flags = LIBXSMM_GEMM_PFLAGS(transa, transb, LIBXSMM_FLAGS); libxsmm_descriptor_blob blob; libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, iprec, oprec, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? 
*ldc : m, alpha, beta, gemm_flags, libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO)); if (NULL != desc) { libxsmm_xmmfunction kernel; libxsmm_gemm_internal_set_batchflag(desc, c, index_stride, batchsize, 0/*multi-threaded*/); kernel = libxsmm_xmmdispatch(desc); if (NULL != kernel.xmm) { result = libxsmm_mmbatch_kernel(kernel, index_base, index_stride, stride_a, stride_b, stride_c, a, b, c, batchsize, tid, nthreads, libxsmm_typesize((libxsmm_datatype)iprec), otypesize, desc->flags); } } } if (EXIT_SUCCESS != result) { /* quiet fall-back */ if (EXIT_SUCCESS == libxsmm_mmbatch_blas(iprec, oprec, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, index_base, index_stride, stride_a, stride_b, stride_c, batchsize)) { if (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) { const size_t threshold = LIBXSMM_MNK_SIZE(m, n, m); static size_t threshold_max = 0; if (threshold_max < threshold) { LIBXSMM_STDIO_ACQUIRE(); fprintf(stderr, "LIBXSMM WARNING: "); libxsmm_gemm_print2(stderr, iprec, oprec, transa, transb, &m, &n, &k, alpha, NULL/*a*/, lda, NULL/*b*/, ldb, beta, NULL/*c*/, ldc); fprintf(stderr, " => batched GEMM was falling back to BLAS!\n"); LIBXSMM_STDIO_RELEASE(); threshold_max = threshold; } } } else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: libxsmm_mmbatch failed!\n"); } } } #if defined(LIBXSMM_GEMM_CHECK) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: incorrect arguments (libxsmm_mmbatch)!\n"); } #endif } LIBXSMM_API void libxsmm_gemm_batch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, 
const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize) { libxsmm_mmbatch(iprec, oprec, transa, transb, m, n, k, alpha,a, lda, b, ldb, beta, c, ldc, index_base, index_stride, stride_a, stride_b, stride_c, batchsize, 0/*tid*/, 1/*nthreads*/); } #if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_dgemm)(const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const double*, const double*, const libxsmm_blasint*, const double*, const libxsmm_blasint*, const double*, double*, const libxsmm_blasint*); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_dgemm)(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { libxsmm_dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_sgemm)(const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const float*, const float*, const libxsmm_blasint*, const float*, const libxsmm_blasint*, const float*, float*, const libxsmm_blasint*); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_sgemm)(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { 
libxsmm_sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_wigemm)(const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const int*, const short*, const libxsmm_blasint*, const short*, const libxsmm_blasint*, const int*, int*, const libxsmm_blasint*); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_wigemm)(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const int* alpha, const short* a, const libxsmm_blasint* lda, const short* b, const libxsmm_blasint* ldb, const int* beta, int* c, const libxsmm_blasint* ldc) { libxsmm_wigemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_bsgemm)(const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const float*, const libxsmm_bfloat16*, const libxsmm_blasint*, const libxsmm_bfloat16*, const libxsmm_blasint*, const float*, float*, const libxsmm_blasint*); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_bsgemm)(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const libxsmm_bfloat16* a, const libxsmm_blasint* lda, const libxsmm_bfloat16* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_bsgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_blas_xgemm)(const libxsmm_gemm_precision*, const libxsmm_gemm_precision*, const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const float*, const float*, const libxsmm_blasint*, const float*, const libxsmm_blasint*, const 
float*, float*, const libxsmm_blasint*); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_blas_xgemm)(const libxsmm_gemm_precision* iprec, const libxsmm_gemm_precision* oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { LIBXSMM_ASSERT(NULL != iprec && NULL != oprec); libxsmm_blas_xgemm(*iprec, *oprec, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_blas_dgemm)(const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const double*, const double*, const libxsmm_blasint*, const double*, const libxsmm_blasint*, const double*, double*, const libxsmm_blasint*); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_blas_dgemm)(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { libxsmm_blas_dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_blas_sgemm)(const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const float*, const float*, const libxsmm_blasint*, const float*, const libxsmm_blasint*, const float*, float*, const libxsmm_blasint*); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_blas_sgemm)(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const 
libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_blas_sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_mmbatch)(const libxsmm_gemm_precision*, const libxsmm_gemm_precision*, const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const void*, const void*, const libxsmm_blasint*, const void*, const libxsmm_blasint*, const void*, void*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint[], const libxsmm_blasint[], const libxsmm_blasint[], const libxsmm_blasint*, const /*unsigned*/int*, const /*unsigned*/int*); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_mmbatch)(const libxsmm_gemm_precision* iprec, const libxsmm_gemm_precision* oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, const libxsmm_blasint* index_base, const libxsmm_blasint* index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], const libxsmm_blasint* batchsize, const /*unsigned*/int* tid, const /*unsigned*/int* nthreads) { LIBXSMM_ASSERT(NULL != iprec && NULL != oprec && NULL != m && NULL != n && NULL != k); LIBXSMM_ASSERT(NULL != index_base && NULL != index_stride && NULL != batchsize); LIBXSMM_ASSERT(NULL != tid && NULL != nthreads); libxsmm_mmbatch(*iprec, *oprec, transa, transb, *m, *n, *k, alpha, a, lda, b, ldb, beta, c, ldc, *index_base, *index_stride, stride_a, stride_b, stride_c, *batchsize, *tid, *nthreads); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_gemm_batch)(const libxsmm_gemm_precision*, const 
libxsmm_gemm_precision*, const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const void*, const void*, const libxsmm_blasint*, const void*, const libxsmm_blasint*, const void*, void*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint[], const libxsmm_blasint[], const libxsmm_blasint[], const libxsmm_blasint*); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_gemm_batch)(const libxsmm_gemm_precision* iprec, const libxsmm_gemm_precision* oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, const libxsmm_blasint* index_base, const libxsmm_blasint* index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], const libxsmm_blasint* batchsize) { LIBXSMM_ASSERT(NULL != iprec && NULL != oprec && NULL != m && NULL != n && NULL != k); LIBXSMM_ASSERT(NULL != index_base && NULL != index_stride && NULL != batchsize); libxsmm_gemm_batch(*iprec, *oprec, transa, transb, *m, *n, *k, alpha, a, lda, b, ldb, beta, c, ldc, *index_base, *index_stride, stride_a, stride_b, stride_c, *batchsize); } #endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ libxsmm-1.17/src/libxsmm_gemm.h000066400000000000000000000263221415223013700165400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_GEMM_H #define LIBXSMM_GEMM_H #include "libxsmm_main.h" #if !defined(LIBXSMM_BLAS_WRAP_DYNAMIC) && defined(LIBXSMM_INTERCEPT_DYNAMIC) && (!defined(__BLAS) || (0 != __BLAS)) # define LIBXSMM_BLAS_WRAP_DYNAMIC #endif #if !defined(LIBXSMM_GEMM_CHECK) && !defined(NDEBUG) # define LIBXSMM_GEMM_CHECK #endif #if !defined(LIBXSMM_GEMM_LOCK) # define LIBXSMM_GEMM_LOCK LIBXSMM_LOCK_DEFAULT #endif #if !defined(LIBXSMM_GEMM_MMBATCH_SCALE) # define LIBXSMM_GEMM_MMBATCH_SCALE 1.5 #endif #if !defined(LIBXSMM_GEMM_MMBATCH_VERBOSITY) # define LIBXSMM_GEMM_MMBATCH_VERBOSITY ((LIBXSMM_VERBOSITY_HIGH) + 1) #endif #if !defined(LIBXSMM_GEMM_NPARGROUPS) # define LIBXSMM_GEMM_NPARGROUPS 128 #endif #if !defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD) && \ (defined(LIBXSMM_CONFIG_WRAP) && 0 != (LIBXSMM_CONFIG_WRAP)) && \ (defined(LIBXSMM_BLAS_WRAP_DYNAMIC) || !defined(NDEBUG) || defined(_WIN32)) /* debug */ # define LIBXSMM_WRAP LIBXSMM_CONFIG_WRAP #endif /** Undefine (disarm) MKL's DIRECT_CALL macros. 
*/ #if (defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) # if defined(sgemm_) # undef sgemm_ # endif # if defined(dgemm_) # undef dgemm_ # endif #endif #if !defined(LIBXSMM_BLAS_ERROR) #define LIBXSMM_BLAS_ERROR(SYMBOL, PCOUNTER) do { \ if (1 == LIBXSMM_ATOMIC_ADD_FETCH(PCOUNTER, 1, LIBXSMM_ATOMIC_RELAXED)) { \ fprintf(stderr, "LIBXSMM ERROR: application must be linked against LAPACK/BLAS %s!\n", SYMBOL); \ } \ } while(0) #endif #if defined(LIBXSMM_BUILD) # define LIBXSMM_BLAS_WRAPPER_STATIC1(TYPE, KIND, ORIGINAL) if (NULL == (ORIGINAL)) { \ ORIGINAL = LIBXSMM_FSYMBOL(LIBXSMM_CONCATENATE(__real_, LIBXSMM_TPREFIX(TYPE, KIND))); \ } # define LIBXSMM_BLAS_WRAPPER_STATIC0 LIBXSMM_BLAS_WRAPPER_STATIC1 #else # define LIBXSMM_BLAS_WRAPPER_STATIC1(TYPE, KIND, ORIGINAL) if (NULL == (ORIGINAL)) { \ ORIGINAL = (LIBXSMM_BLAS_FNTYPE(TYPE, KIND))LIBXSMM_BLAS_SYMBOL(TYPE, KIND); \ } # define LIBXSMM_BLAS_WRAPPER_STATIC0(TYPE, KIND, ORIGINAL) #endif #define LIBXSMM_BLAS_WRAPPER_STATIC(CONDITION, TYPE, KIND, ORIGINAL) \ LIBXSMM_CONCATENATE(LIBXSMM_BLAS_WRAPPER_STATIC, CONDITION)(TYPE, KIND, ORIGINAL) #if defined(LIBXSMM_BLAS_WRAP_DYNAMIC) # define LIBXSMM_BLAS_WRAPPER_DYNAMIC(TYPE, KIND, ORIGINAL, NEXT) { \ union { const void* pfin; \ LIBXSMM_BLAS_FNTYPE(TYPE, KIND) (*chain)(void); /* chain */ \ LIBXSMM_BLAS_FNTYPE(TYPE, KIND) pfout; \ } libxsmm_blas_wrapper_dynamic_ /*= { 0 }*/; \ dlerror(); /* clear an eventual error status */ \ libxsmm_blas_wrapper_dynamic_.chain = NEXT; \ libxsmm_blas_wrapper_dynamic_.pfin = ((NULL == libxsmm_blas_wrapper_dynamic_.pfin) ? 
\ dlsym(LIBXSMM_RTLD_NEXT, "libxsmm_original_" LIBXSMM_STRINGIFY(LIBXSMM_TPREFIX(TYPE, KIND))) : NULL); \ if (NULL == libxsmm_blas_wrapper_dynamic_.pfout || NULL != dlerror() || NULL == libxsmm_blas_wrapper_dynamic_.chain()) { \ libxsmm_blas_wrapper_dynamic_.pfin = dlsym(LIBXSMM_RTLD_NEXT, LIBXSMM_STRINGIFY(LIBXSMM_BLAS_SYMBOL(TYPE, KIND))); \ /*LIBXSMM_ATOMIC_STORE(&(ORIGINAL), libxsmm_blas_wrapper_dynamic_.pfout, LIBXSMM_ATOMIC_RELAXED);*/ \ ORIGINAL = (NULL == dlerror() ? libxsmm_blas_wrapper_dynamic_.pfout : NULL); \ } \ } #else # define LIBXSMM_BLAS_WRAPPER_DYNAMIC(TYPE, KIND, ORIGINAL, NEXT) #endif #define LIBXSMM_BLAS_WRAPPER(CONDITION, TYPE, KIND, ORIGINAL, NEXT) if (NULL == (ORIGINAL)) { \ LIBXSMM_BLAS_WRAPPER_DYNAMIC(TYPE, KIND, ORIGINAL, NEXT); \ LIBXSMM_BLAS_WRAPPER_STATIC(CONDITION, TYPE, KIND, ORIGINAL); \ } /** Provides GEMM functions available via BLAS; NOT thread-safe. */ LIBXSMM_API_INTERN void libxsmm_gemm_init(int archid); /** Finalizes the GEMM facility; NOT thread-safe. 
*/ LIBXSMM_API_INTERN void libxsmm_gemm_finalize(void); LIBXSMM_API_INTERN int libxsmm_gemm_prefetch2uid(libxsmm_gemm_prefetch_type prefetch); LIBXSMM_API_INTERN libxsmm_gemm_prefetch_type libxsmm_gemm_uid2prefetch(int uid); #if defined(LIBXSMM_BUILD) #if defined(LIBXSMM_BUILD_EXT) LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(__wrap_dgemm_batch)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm_batch)); LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(__wrap_sgemm_batch)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm_batch)); LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(__wrap_dgemm)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm)); LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(__wrap_sgemm)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm)); LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(__wrap_dgemv)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemv)); LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(__wrap_sgemv)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemv)); LIBXSMM_APIEXT void __wrap_dgemm_batch(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm_batch)); LIBXSMM_APIEXT void __wrap_sgemm_batch(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm_batch)); #endif LIBXSMM_API void LIBXSMM_FSYMBOL(__real_dgemm_batch)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm_batch)); LIBXSMM_API void LIBXSMM_FSYMBOL(__real_sgemm_batch)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm_batch)); LIBXSMM_API void LIBXSMM_FSYMBOL(__real_dgemm)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm)); LIBXSMM_API void LIBXSMM_FSYMBOL(__real_sgemm)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm)); LIBXSMM_API void LIBXSMM_FSYMBOL(__real_dgemv)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemv)); LIBXSMM_API void LIBXSMM_FSYMBOL(__real_sgemv)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemv)); LIBXSMM_API void __real_dgemm_batch(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm_batch)); LIBXSMM_API void __real_sgemm_batch(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm_batch)); 
#endif LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, double, gemm_batch); LIBXSMM_BLAS_SYMBOL_CDECL(LIBXSMM_BLAS_CONST*, *, double, gemm_batch); LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, float, gemm_batch); LIBXSMM_BLAS_SYMBOL_CDECL(LIBXSMM_BLAS_CONST*, *, float, gemm_batch); LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, double, gemm); LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, float, gemm); LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, double, gemv); LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, float, gemv); LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_gemm_handle { libxsmm_code_pointer copy_a, copy_b, copy_i, copy_o; libxsmm_xmmfunction kernel[2]; unsigned int m, n, k, lda, ldb, ldc; /* kernel size (tile) */ unsigned int km, kn, kk; /* tile size per task */ unsigned int dm, dn, dk; unsigned int itypesize, otypesize; /* number of tasks per direction */ unsigned int mt, nt, kt; int gemm_flags, flags; }; LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_mmbatch_item { struct { const void *a, *b; void *c; } value; struct { libxsmm_gemm_descriptor desc; unsigned int count; const char* symbol; } stat; /* TODO: consider padding */ } libxsmm_mmbatch_item; LIBXSMM_API void libxsmm_gemm_internal_set_batchflag(libxsmm_gemm_descriptor* descriptor, void* c, libxsmm_blasint index_stride, libxsmm_blasint batchsize, int multithreaded); LIBXSMM_API int libxsmm_mmbatch_kernel(libxsmm_xmmfunction kernel, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], const void* a, const void* b, void* c, libxsmm_blasint batchsize, /*unsigned*/int tid, /*unsigned*/int ntasks, unsigned char itypesize, unsigned char otypesize, int flags); LIBXSMM_API int libxsmm_mmbatch_blas( libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const 
void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize); LIBXSMM_API_INTERN void libxsmm_dmmbatch_blas(const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const double* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const double* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize); LIBXSMM_API_INTERN void libxsmm_smmbatch_blas(const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const float* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const float* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize); LIBXSMM_EXTERN_C typedef void (*libxsmm_mmbatch_flush_function)(void); /** auto-batch descriptor (filter). */ LIBXSMM_APIVAR_PUBLIC(libxsmm_gemm_descriptor libxsmm_mmbatch_desc); /** Records a batch of SMMs or is used for batch-reduce. */ LIBXSMM_APIVAR_PUBLIC(void* libxsmm_mmbatch_array); /** Lock: libxsmm_mmbatch_begin, libxsmm_mmbatch_end, internal_mmbatch_flush. */ LIBXSMM_APIVAR_PUBLIC(LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) libxsmm_mmbatch_lock); /** Maximum size of the recorded batch. */ LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_mmbatch_size); /** Maximum number of parallelized batch-groups. 
*/ LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_gemm_npargroups); /** Minimum batchsize per thread/task. */ LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_gemm_taskgrain); /** Determines if OpenMP tasks are used. */ LIBXSMM_APIVAR_PUBLIC(int libxsmm_gemm_tasks); /** * Intercepted GEMM * - [>=1 and odd]: sequential and non-tiled (small problem sizes only) * - [>=2 and even]: parallelized and tiled (all problem sizes) * - [>=3 and odd]: GEMV is intercepted; small problem sizes * - [>=4 and even]: GEMV is intercepted; all problem sizes * - [0]: disabled */ LIBXSMM_APIVAR_PUBLIC(int libxsmm_gemm_wrap); /** Determines the default prefetch strategy, which is used in case of LIBXSMM_PREFETCH_AUTO. */ LIBXSMM_APIVAR_PRIVATE(libxsmm_gemm_prefetch_type libxsmm_gemm_auto_prefetch_default); /** Determines the prefetch strategy, which is used in case of LIBXSMM_PREFETCH_AUTO. */ LIBXSMM_APIVAR_PRIVATE(libxsmm_gemm_prefetch_type libxsmm_gemm_auto_prefetch); #endif /*LIBXSMM_GEMM_H*/ libxsmm-1.17/src/libxsmm_generator.c000066400000000000000000000632671415223013700176050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "libxsmm_main.h" #if !defined(LIBXSMM_PRODUCT_LIMIT) # define LIBXSMM_PRODUCT_LIMIT 1024 #endif #if defined(LIBXSMM_INTRINSICS_AVX512) LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_intrinsics_mm512_rng_state0[16]); LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_intrinsics_mm512_rng_state1[16]); LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_intrinsics_mm512_rng_state2[16]); LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_intrinsics_mm512_rng_state3[16]); #endif /* definition of corresponding variables */ LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_ninit); LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_target_archid); LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_verbosity); LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_se); LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_dgemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, double alpha, double beta, int flags, int prefetch) { union { libxsmm_gemm_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta) && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc) && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k)) { result.blob = blob; LIBXSMM_GEMM_DESCRIPTOR(*result.ptr, LIBXSMM_GEMM_PRECISION(double), flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch); } else { /* quiet error (unsupported) */ result.ptr = NULL; } return result.ptr; } LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_sgemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, float alpha, float beta, int flags, int prefetch) { union { libxsmm_gemm_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta) && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc) && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k)) { 
result.blob = blob; LIBXSMM_GEMM_DESCRIPTOR(*result.ptr, LIBXSMM_GEMM_PRECISION(float), flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch); } else { /* unsupported */ result.ptr = NULL; } return result.ptr; } LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_wigemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, int alpha, int beta, int flags, int prefetch) { union { libxsmm_gemm_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta) && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc) && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k)) { result.blob = blob; LIBXSMM_GEMM_DESCRIPTOR2(*result.ptr, LIBXSMM_GEMM_PRECISION(short), LIBXSMM_GEMM_PRECISION(int), flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch); } else { /* unsupported */ result.ptr = NULL; } return result.ptr; } LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bsgemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, float alpha, float beta, int flags, int prefetch) { union { libxsmm_gemm_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta) && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc) && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k)) { result.blob = blob; LIBXSMM_GEMM_DESCRIPTOR2(*result.ptr, LIBXSMM_GEMM_PRECISION(libxsmm_bfloat16), LIBXSMM_GEMM_PRECISION(float), flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch); } else { /* unsupported */ result.ptr = NULL; } return result.ptr; } LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bgemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, float alpha, float beta, int flags, int prefetch) { union { libxsmm_gemm_descriptor* ptr; 
libxsmm_descriptor_blob* blob; } result; if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta) && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc) && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k)) { result.blob = blob; LIBXSMM_GEMM_DESCRIPTOR2(*result.ptr, LIBXSMM_GEMM_PRECISION(libxsmm_bfloat16), LIBXSMM_GEMM_PRECISION(libxsmm_bfloat16), flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch); } else { /* unsupported */ result.ptr = NULL; } return result.ptr; } LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bigemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, int alpha, int beta, int flags, int prefetch) { union { libxsmm_gemm_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta) && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc) && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k)) { result.blob = blob; LIBXSMM_GEMM_DESCRIPTOR2(*result.ptr, LIBXSMM_GEMM_PRECISION(char), LIBXSMM_GEMM_PRECISION(int), flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch); } else { /* unsupported */ result.ptr = NULL; } return result.ptr; } LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bbgemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, int alpha, int beta, int flags, int prefetch) { union { libxsmm_gemm_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta) && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc) && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k)) { result.blob = blob; LIBXSMM_GEMM_DESCRIPTOR2(*result.ptr, LIBXSMM_GEMM_PRECISION(char), LIBXSMM_GEMM_PRECISION(char), flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch); } else { /* unsupported */ result.ptr = NULL; } return result.ptr; } LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_dinit(libxsmm_descriptor_blob* blob, 
libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, double alpha, double beta, int flags, int prefetch) { return libxsmm_gemm_descriptor_dinit2(blob, precision, precision, m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch); } LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_dinit2(libxsmm_descriptor_blob* blob, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, double alpha, double beta, int flags, int prefetch) { /* avoid warning about potentially uninitialized variable (initialize outside of control flow) */ libxsmm_gemm_descriptor* result = NULL; switch (iprec) { case LIBXSMM_GEMM_PRECISION_F64: { LIBXSMM_ASSERT(iprec == oprec); result = libxsmm_dgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch); } break; case LIBXSMM_GEMM_PRECISION_F32: { LIBXSMM_ASSERT(iprec == oprec); result = libxsmm_sgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, (float)alpha, (float)beta, flags, prefetch); } break; case LIBXSMM_GEMM_PRECISION_I16: { result = libxsmm_wigemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, (int)alpha, (int)beta, flags, prefetch); } break; case LIBXSMM_GEMM_PRECISION_I8: { if (LIBXSMM_GEMM_PRECISION_I32 == oprec) { result = libxsmm_bigemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, (int)alpha, (int)beta, flags, prefetch); } else if (LIBXSMM_GEMM_PRECISION_I8 == oprec) { result = libxsmm_bbgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, (int)alpha, (int)beta, flags, prefetch); } } break; case LIBXSMM_GEMM_PRECISION_BF16: { if (LIBXSMM_GEMM_PRECISION_F32 == oprec) { result = libxsmm_bsgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, (float)alpha, (float)beta, flags, prefetch); } else if (LIBXSMM_GEMM_PRECISION_BF16 == oprec) { result = libxsmm_bgemm_descriptor_init(blob, m, n, k, 
lda, ldb, ldc, (float)alpha, (float)beta, flags, prefetch); } } break; default: { static int error_once = 0; if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: GEMM precision is not supported!\n"); } } } return result; } LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta, int flags, int prefetch) { return libxsmm_gemm_descriptor_init2(blob, precision, precision, m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch); } LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init2(libxsmm_descriptor_blob* blob, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta, int flags, int prefetch) { return libxsmm_gemm_descriptor_init3(blob, iprec, oprec, m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch, NULL/*dalpha*/, NULL/*dbeta*/); } LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init3(libxsmm_descriptor_blob* blob, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta, int flags, int prefetch, double* dalpha, double* dbeta) { /* avoid warning about potentially uninitialized variable (initialize outside of control flow) */ libxsmm_gemm_descriptor* result = NULL; switch (iprec) { case LIBXSMM_GEMM_PRECISION_F64: { const double aa = (NULL != alpha ? *((const double*)alpha) : (LIBXSMM_ALPHA)); const double bb = (NULL != beta ? 
*((const double*)beta) : (LIBXSMM_BETA)); LIBXSMM_ASSERT(LIBXSMM_GEMM_PRECISION_F64 == oprec); result = libxsmm_dgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch); if (NULL != dalpha) *dalpha = aa; if (NULL != dbeta) *dbeta = bb; } break; case LIBXSMM_GEMM_PRECISION_F32: { const float aa = (NULL != alpha ? *((const float*)alpha) : (LIBXSMM_ALPHA)); const float bb = (NULL != beta ? *((const float*)beta) : (LIBXSMM_BETA)); LIBXSMM_ASSERT(LIBXSMM_GEMM_PRECISION_F32 == oprec); result = libxsmm_sgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch); if (NULL != dalpha) *dalpha = (double)aa; if (NULL != dbeta) *dbeta = (double)bb; } break; case LIBXSMM_GEMM_PRECISION_I16: { /** * Take alpha and beta as short data although wgemm works on integers. * However, alpha and beta are only JIT-supported for certain values, * and the call-side may not distinct different input and output types * (integer/short), hence it is safer to only read short data. */ const short aa = (short)(NULL != alpha ? *((const short*)alpha) : (LIBXSMM_ALPHA)); const short bb = (short)(NULL != beta ? *((const short*)beta) : (LIBXSMM_BETA)); result = libxsmm_wigemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch); if (NULL != dalpha) *dalpha = (double)aa; if (NULL != dbeta) *dbeta = (double)bb; } break; case LIBXSMM_GEMM_PRECISION_I8: { /** * Take alpha and beta as short data although wgemm works on integers. * However, alpha and beta are only JIT-supported for certain values, * and the call-side may not distinct different input and output types * (integer/short), hence it is safer to only read short data. */ if (LIBXSMM_GEMM_PRECISION_I32 == oprec) { const short aa = (short)(NULL != alpha ? *((const short*)alpha) : (LIBXSMM_ALPHA)); const short bb = (short)(NULL != beta ? 
*((const short*)beta) : (LIBXSMM_BETA)); result = libxsmm_bigemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch); if (NULL != dalpha) *dalpha = (double)aa; if (NULL != dbeta) *dbeta = (double)bb; } else if (LIBXSMM_GEMM_PRECISION_I8 == oprec) { const short aa = (short)(NULL != alpha ? *((const short*)alpha) : (LIBXSMM_ALPHA)); const short bb = (short)(NULL != beta ? *((const short*)beta) : (LIBXSMM_BETA)); result = libxsmm_bbgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch); if (NULL != dalpha) *dalpha = (double)aa; if (NULL != dbeta) *dbeta = (double)bb; } } break; case LIBXSMM_GEMM_PRECISION_BF16: { if (LIBXSMM_GEMM_PRECISION_F32 == oprec) { const float aa = (NULL != alpha ? *((const float*)alpha) : (LIBXSMM_ALPHA)); const float bb = (NULL != beta ? *((const float*)beta) : (LIBXSMM_BETA)); result = libxsmm_bsgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch); if (NULL != dalpha) *dalpha = (double)aa; if (NULL != dbeta) *dbeta = (double)bb; } else if (LIBXSMM_GEMM_PRECISION_BF16 == oprec) { const float aa = (NULL != alpha ? *((const float*)alpha) : (LIBXSMM_ALPHA)); const float bb = (NULL != beta ? 
*((const float*)beta) : (LIBXSMM_BETA)); result = libxsmm_bgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch); if (NULL != dalpha) *dalpha = (double)aa; if (NULL != dbeta) *dbeta = (double)bb; } } break; default: { static int error_once = 0; if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: GEMM precision is not supported!\n"); } } } return result; } LIBXSMM_API libxsmm_trans_descriptor* libxsmm_trans_descriptor_init(libxsmm_descriptor_blob* blob, unsigned int typesize, unsigned int m, unsigned int n, unsigned int ldo) { union { libxsmm_trans_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; LIBXSMM_DESCRIPTOR_CLEAR(blob); result.blob = blob; result.ptr->typesize = (unsigned char)typesize; result.ptr->ldo = ldo; result.ptr->m = m; result.ptr->n = n; return result.ptr; } LIBXSMM_API libxsmm_mcopy_descriptor* libxsmm_mcopy_descriptor_init(libxsmm_descriptor_blob* blob, unsigned int typesize, unsigned int m, unsigned int n, unsigned int ldo, unsigned int ldi, int flags, int prefetch, const int* unroll) { union { libxsmm_mcopy_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; LIBXSMM_DESCRIPTOR_CLEAR(blob); result.blob = blob; result.ptr->prefetch = (unsigned char)prefetch; result.ptr->flags = (unsigned char)flags; /* TODO: backend supports typesize <= 4, but certain AVX1/AVX2-kernels are incorrect */ if (4 >= typesize && (LIBXSMM_X86_AVX512 <= libxsmm_target_archid || 32 <= (typesize * m) || ldi == ldo)) { result.ptr->typesize = (unsigned char)typesize; result.ptr->unroll_level = (unsigned char)((NULL == unroll || 0 >= *unroll) ? LIBXSMM_MAX(8 / result.ptr->typesize, 1) : LIBXSMM_MIN(*unroll, 64)); result.ptr->ldi = ldi; result.ptr->ldo = ldo; result.ptr->m = m; result.ptr->n = n; } else { /* fix-up incl. 
DP-support */ result.ptr->typesize = 4; result.ptr->unroll_level = 2; result.ptr->ldi = ldi * typesize / 4; /* scale */ result.ptr->ldo = ldo * typesize / 4; /* scale */ result.ptr->m = m * typesize / 4; /* scale */ result.ptr->n = n; if (((typesize * ldi) != (4 * result.ptr->ldi) || (typesize * ldo) != (4 * result.ptr->ldo) || (typesize * m) != (4 * result.ptr->m))) { result.ptr = NULL; } } return result.ptr; } LIBXSMM_API libxsmm_meltw_descriptor* libxsmm_meltw_descriptor_init(libxsmm_descriptor_blob* blob, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo, int flags, int operation) { union { libxsmm_meltw_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; LIBXSMM_DESCRIPTOR_CLEAR(blob); result.blob = blob; result.ptr->datatype = (unsigned char)LIBXSMM_GETENUM(in_type, out_type); result.ptr->datatype2 = 0; result.ptr->flags = (unsigned char)flags; result.ptr->operation = (unsigned char)operation; result.ptr->ldi = ldi; result.ptr->ldo = ldo; result.ptr->ldx = 0; result.ptr->ldy = 0; result.ptr->m = m; result.ptr->n = n; return result.ptr; } LIBXSMM_API libxsmm_meltw_descriptor* libxsmm_meltw_descriptor_init2(libxsmm_descriptor_blob* blob, libxsmm_datatype in_type, libxsmm_datatype in2_type, libxsmm_datatype out_type, libxsmm_datatype out2_type, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo, libxsmm_blasint ldx, libxsmm_blasint ldy, int flags, int operation) { union { libxsmm_meltw_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; LIBXSMM_DESCRIPTOR_CLEAR(blob); result.blob = blob; result.ptr->datatype = (unsigned char)LIBXSMM_GETENUM(in_type, out_type); result.ptr->datatype2 = (unsigned char)LIBXSMM_GETENUM(in2_type, out2_type); result.ptr->flags = (unsigned char)flags; result.ptr->operation = (unsigned char)operation; result.ptr->ldi = ldi; result.ptr->ldo = ldo; result.ptr->ldx = ldx; result.ptr->ldy = ldy; result.ptr->m = m; 
result.ptr->n = n; return result.ptr; } LIBXSMM_API libxsmm_trsm_descriptor* libxsmm_trsm_descriptor_init(libxsmm_descriptor_blob* blob, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint lda, libxsmm_blasint ldb, const void* alpha, char transa, char diag, char side, char uplo, int layout) { union { libxsmm_trsm_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; LIBXSMM_DESCRIPTOR_CLEAR(blob); result.blob = blob; result.ptr->typesize = (unsigned char)typesize; result.ptr->lda = (unsigned char)lda; result.ptr->ldb = (unsigned char)ldb; result.ptr->m = (unsigned char)m; result.ptr->n = (unsigned char)n; result.ptr->transa = transa; result.ptr->diag = diag; result.ptr->side = side; result.ptr->uplo = uplo; result.ptr->layout = (unsigned char)layout; switch (typesize) { case 4: { result.ptr->alpha.s = (NULL != alpha ? (*(const float*)alpha) : ((float)LIBXSMM_ALPHA)); } break; case 8: { result.ptr->alpha.d = (NULL != alpha ? (*(const double*)alpha) : ((double)LIBXSMM_ALPHA)); } break; default: /* TODO: generate warning */; } return result.ptr; } LIBXSMM_API libxsmm_trmm_descriptor* libxsmm_trmm_descriptor_init(libxsmm_descriptor_blob* blob, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint lda, libxsmm_blasint ldb, const void* alpha, char transa, char diag, char side, char uplo, int layout) { union { libxsmm_trmm_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; LIBXSMM_DESCRIPTOR_CLEAR(blob); result.blob = blob; result.ptr->typesize = (unsigned char)typesize; result.ptr->lda = (unsigned char)lda; result.ptr->ldb = (unsigned char)ldb; result.ptr->m = (unsigned char)m; result.ptr->n = (unsigned char)n; result.ptr->transa = transa; result.ptr->diag = diag; result.ptr->side = side; result.ptr->uplo = uplo; result.ptr->layout = (unsigned char)layout; switch (typesize) { case 4: { result.ptr->alpha.s = (NULL != alpha ? 
(*(const float*)alpha) : ((float)LIBXSMM_ALPHA)); } break; case 8: { result.ptr->alpha.d = (NULL != alpha ? (*(const double*)alpha) : ((double)LIBXSMM_ALPHA)); } break; default: /* TODO: generate warning */; } return result.ptr; } LIBXSMM_API libxsmm_pgemm_descriptor* libxsmm_pgemm_descriptor_init(libxsmm_descriptor_blob* blob, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, char transa, char transb, int layout) { union { libxsmm_pgemm_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; LIBXSMM_DESCRIPTOR_CLEAR(blob); result.blob = blob; result.ptr->typesize = (unsigned char)typesize; result.ptr->lda = (unsigned char)lda; result.ptr->ldb = (unsigned char)ldb; result.ptr->ldc = (unsigned char)ldc; result.ptr->m = (unsigned char)m; result.ptr->n = (unsigned char)n; result.ptr->k = (unsigned char)k; result.ptr->transa = transa; result.ptr->transb = transb; result.ptr->layout = (unsigned char)layout; if ( typesize == 4 ) { float *alpha_val = (float*)alpha; if ( *alpha_val == 1.0 ) result.ptr->alpha_val = 0; else if ( *alpha_val == -1.0 ) result.ptr->alpha_val = 1; else { printf("Warning: real*4 alpha value should be 1.0 or -1.0\n"); exit(-1); } } else { double *alpha_val = (double*)alpha; if ( *alpha_val == 1.0 ) result.ptr->alpha_val = 0; else if ( *alpha_val == -1.0 ) result.ptr->alpha_val = 1; else { printf("Warning: real*8 alpha value should be 1.0 or -1.0\n"); exit(-1); } } return result.ptr; } LIBXSMM_API libxsmm_getrf_descriptor* libxsmm_getrf_descriptor_init(libxsmm_descriptor_blob* blob, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint lda, int layout) { union { libxsmm_getrf_descriptor* ptr; libxsmm_descriptor_blob* blob; } result; LIBXSMM_DESCRIPTOR_CLEAR(blob); result.blob = blob; result.ptr->typesize = (unsigned char)typesize; result.ptr->lda = (unsigned char)lda; result.ptr->m = (unsigned char)m; result.ptr->n 
= (unsigned char)n; result.ptr->layout = (unsigned char)layout; return result.ptr; } LIBXSMM_API size_t libxsmm_gcd(size_t a, size_t b) { while (0 != b) { const size_t r = a % b; a = b; b = r; } return 0 != a ? a : 1; } LIBXSMM_API size_t libxsmm_lcm(size_t a, size_t b) { const size_t gcd = libxsmm_gcd(a, b); return 0 != gcd ? ((a / gcd) * b) : 0; } LIBXSMM_API int libxsmm_primes_u32(unsigned int num, unsigned int num_factors_n32[]) { unsigned int c = num, i; int n = 0; if (0 < c && 0 == (c & 1)) { /* non-zero even */ unsigned int j = c / 2; while (c == (2 * j)) { num_factors_n32[n++] = 2; c = j; j /= 2; } } for (i = 3; i <= c; i += 2) { unsigned int j = c / i; while (c == (i * j)) { num_factors_n32[n++] = i; c = j; j /= i; } if ((i * i) > num) { break; } } if (1 < c && 0 != n) { num_factors_n32[n++] = c; } return n; } LIBXSMM_API_INLINE unsigned int internal_product_limit(unsigned int product, unsigned int limit) { unsigned int fact[32], maxp = limit, result = 1; int i, n; /* attempt to lower the memory requirement for DP; can miss best solution */ if (LIBXSMM_PRODUCT_LIMIT < limit) { const unsigned int minfct = (limit + limit - 1) / LIBXSMM_PRODUCT_LIMIT; const unsigned int maxfct = (unsigned int)libxsmm_gcd(product, limit); result = maxfct; if (minfct < maxfct) { n = libxsmm_primes_u32(result, fact); for (i = 0; i < n; ++i) { if (minfct < fact[i]) { result = fact[i]; break; } } } maxp /= result; } if (LIBXSMM_PRODUCT_LIMIT >= maxp) { unsigned int k[2][LIBXSMM_PRODUCT_LIMIT], *k0 = k[0], *k1 = k[1], *kt, p; n = libxsmm_primes_u32(product / result, fact); /* initialize table with trivial factor */ for (p = 0; p <= maxp; ++p) k[0][p] = 1; k[0][0] = k[1][0] = 1; for (i = 1; i <= n; ++i) { for (p = 1; p <= maxp; ++p) { const unsigned int f = fact[i - 1], h = k0[p]; if (p < f) { k1[p] = h; } else { const unsigned int g = f * k0[p / f]; k1[p] = LIBXSMM_MAX(g, h); } } kt = k0; k0 = k1; k1 = kt; } result *= k0[maxp]; } else { /* trivial approximation */ n = 
libxsmm_primes_u32(product, fact); for (i = 0; i < n; ++i) { const unsigned int f = result * fact[i]; if (f <= limit) { result = f; } else break; } } return result; } LIBXSMM_API unsigned int libxsmm_product_limit(unsigned int product, unsigned int limit, int is_lower) { unsigned int result; if (1 < limit) { /* check for fast-path */ result = internal_product_limit(product, limit); } else { result = limit; } if (0 != is_lower && limit < product) { if (result < limit) { result = internal_product_limit(product, 2 * limit - 1); } if (result < limit) { result = product; } LIBXSMM_ASSERT(limit <= result); } if (product < result) { result = product; } LIBXSMM_ASSERT(result <= product); return result; } libxsmm-1.17/src/libxsmm_generator_gemm_driver.c000066400000000000000000000214151415223013700221520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #include LIBXSMM_INLINE void print_help(void) { printf("\nwrong usage -> exit!\n\n\n"); printf("Usage (sparse*dense=dense, dense*sparse=dense):\n"); printf(" sparse, sparse_csr, sparse_csr_reg, sparse_csr_soa\n"); printf(" filename to append\n"); printf(" routine name\n"); printf(" M\n"); printf(" N\n"); printf(" K\n"); printf(" LDA (if < 1 --> A sparse)\n"); printf(" LDB (if < 1 --> B sparse)\n"); printf(" LDC\n"); printf(" alpha: 1\n"); printf(" beta: 0 or 1\n"); printf(" 0: unaligned A, otherwise aligned (ignored for sparse)\n"); printf(" 0: unaligned C, otherwise aligned (ignored for sparse)\n"); printf(" ARCH: noarch, wsm, snb, hsw, knl, knm, skx, clx, cpx\n"); printf(" PREFETCH: nopf (none), pfsigonly, other dense options fall-back to pfsigonly\n"); printf(" PRECISION: SP, DP\n"); printf(" matrix input (CSC mtx file)\n"); printf("\n\n"); printf("Usage (dense*dense=dense):\n"); printf(" dense, dense_asm\n"); printf(" filename to append\n"); printf(" routine name\n"); printf(" M\n"); printf(" N\n"); printf(" K\n"); printf(" LDA\n"); printf(" LDB\n"); printf(" LDC\n"); printf(" alpha: -1 or 1\n"); printf(" beta: 0 or 1\n"); printf(" 0: unaligned A, otherwise aligned\n"); printf(" 0: unaligned C, otherwise aligned\n"); printf(" ARCH: noarch, wsm, snb, hsw, knl, knm, skx, clx, cpx\n"); printf(" PREFETCH: nopf (none), pfsigonly, BL2viaC, AL2, curAL2,\n" " AL2_BL2viaC, curAL2_BL2viaC,\n"); printf(" PRECISION: I16, SP, DP\n"); printf("\n\n\n\n"); } int main(int argc, char* argv []) { const libxsmm_gemm_descriptor* l_xgemm_desc = 0; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); libxsmm_gemm_prefetch_type l_prefetch; libxsmm_descriptor_blob l_xgemm_blob; char* l_type; char* l_file_out; char* l_matrix_file_in; char* l_routine_name; char* l_arch; char* l_precision; int l_m = 0; int l_n = 0; int l_k = 0; int l_lda = 0; int l_ldb = 0; int l_ldc = 0; int l_aligned_a = 0; int l_aligned_c = 0; 
double l_alpha = 0; double l_beta = 0; int l_single_precision = 0; int l_is_csr = 0; /* check argument count for a valid range */ if (argc != 17 && argc != 18) { print_help(); return EXIT_FAILURE; } /* names of files and routines */ l_type = argv[1]; l_file_out = argv[2]; l_routine_name = argv[3]; /* xgemm sizes */ l_m = atoi(argv[4]); l_n = atoi(argv[5]); l_k = atoi(argv[6]); l_lda = atoi(argv[7]); l_ldb = atoi(argv[8]); l_ldc = atoi(argv[9]); /* condense < 1 to 0 for lda and ldb */ if ( l_lda < 1 ) l_lda = 0; if ( l_ldb < 1 ) l_ldb = 0; /* some sugar */ l_alpha = atof(argv[10]); l_beta = atof(argv[11]); l_aligned_a = atoi(argv[12]); l_aligned_c = atoi(argv[13]); l_flags |= (0 != l_aligned_a ? LIBXSMM_GEMM_FLAG_ALIGN_A : 0); l_flags |= (0 != l_aligned_c ? LIBXSMM_GEMM_FLAG_ALIGN_C : 0); /* arch specific stuff */ l_arch = argv[14]; l_precision = argv[16]; /* some initial parameters checks */ /* check for sparse / dense only */ if ( (strcmp(l_type, "sparse") != 0) && (strcmp(l_type, "sparse_csr") != 0) && (strcmp(l_type, "sparse_csr_reg") != 0) && (strcmp(l_type, "sparse_csr_soa") != 0) && (strcmp(l_type, "dense") != 0) && (strcmp(l_type, "dense_asm") != 0) ) { print_help(); return EXIT_FAILURE; } /* check for the right number of arguments depending on type */ if ( ( (strcmp(l_type, "sparse") == 0) && (argc != 18) ) || ( (strcmp(l_type, "sparse_csr") == 0) && (argc != 18) ) || ( (strcmp(l_type, "sparse_csr_reg") == 0) && (argc != 18) ) || ( (strcmp(l_type, "sparse_csr_soa") == 0) && (argc != 18) ) || ( (strcmp(l_type, "dense") == 0) && (argc != 17) ) || ( (strcmp(l_type, "dense_asm") == 0) && (argc != 17) ) ) { print_help(); return EXIT_FAILURE; } /* set value of prefetch flag */ if (strcmp("nopf", argv[15]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_NONE; } else if (strcmp("pfsigonly", argv[15]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_SIGONLY; } else if (strcmp("BL2viaC", argv[15]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_BL2_VIA_C; } else if (strcmp("curAL2", 
argv[15]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_AL2_AHEAD; } else if (strcmp("curAL2_BL2viaC", argv[15]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD; } else if (strcmp("AL2", argv[15]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_AL2; } else if (strcmp("AL2_BL2viaC", argv[15]) == 0) { l_prefetch = LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C; } else { print_help(); return EXIT_FAILURE; } /* check value of arch flag */ if ( (strcmp(l_arch, "wsm") != 0) && (strcmp(l_arch, "snb") != 0) && (strcmp(l_arch, "hsw") != 0) && (strcmp(l_arch, "knl") != 0) && (strcmp(l_arch, "knm") != 0) && (strcmp(l_arch, "skx") != 0) && (strcmp(l_arch, "clx") != 0) && (strcmp(l_arch, "cpx") != 0) && (strcmp(l_arch, "noarch") != 0) ) { print_help(); return EXIT_FAILURE; } /* check and evaluate precision flag */ if ( strcmp(l_precision, "SP") == 0 ) { l_single_precision = 1; } else if ( strcmp(l_precision, "DP") == 0 ) { l_single_precision = 0; } else if ( strcmp(l_precision, "I16") == 0 ) { l_single_precision = 2; } else { print_help(); return EXIT_FAILURE; } /* check alpha */ if ((l_alpha < -1 || -1 < l_alpha) && (l_alpha < 1 || 1 < l_alpha)) { print_help(); return EXIT_FAILURE; } /* check beta */ if ((l_beta < 0 || 0 < l_beta) && (l_beta < 1 || 1 < l_beta)) { print_help(); return EXIT_FAILURE; } switch (l_single_precision) { case 0: { l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION_F64, l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_alpha, l_beta, l_flags, l_prefetch); } break; case 1: { l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION_F32, l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_alpha, l_beta, l_flags, l_prefetch); } break; case 2: { l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION_I16, l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_alpha, l_beta, l_flags, l_prefetch); } break; default: { print_help(); return EXIT_FAILURE; } } if (NULL == l_xgemm_desc) { print_help(); return EXIT_FAILURE; } if ( 
strcmp(l_type, "sparse") == 0 || strcmp(l_type, "sparse_csr") == 0 || strcmp(l_type, "sparse_csr_reg") == 0 || strcmp(l_type, "sparse_csr_soa") == 0 ) { /* read additional parameter for CSC/CSR description */ l_matrix_file_in = argv[17]; /* some more restrictive checks are needed in case of sparse */ if ( (l_alpha < 1) || (1 < l_alpha) ) { print_help(); return EXIT_FAILURE; } if (l_lda < 1 && l_ldb < 1) { print_help(); return EXIT_FAILURE; } if (l_ldc < 1) { print_help(); return EXIT_FAILURE; } if ( l_single_precision > 1 ) { print_help(); return EXIT_FAILURE; } if ( strcmp(l_type, "sparse_csr") == 0 ) { l_is_csr = 1; } if ( strcmp(l_type, "sparse_csr_soa") == 0 ) { l_is_csr = 2; } if ( strcmp(l_type, "sparse_csr_reg") == 0 ) { l_is_csr = 3; } libxsmm_generator_spgemm( l_file_out, l_routine_name, l_xgemm_desc, l_arch, l_matrix_file_in, l_is_csr ); } if ( (strcmp(l_type, "dense") == 0) || (strcmp(l_type, "dense_asm") == 0) ) { if (l_lda < 1 || l_ldb < 1 || l_ldc < 1) { print_help(); return EXIT_FAILURE; } if ( strcmp(l_type, "dense") == 0 ) { libxsmm_generator_gemm_inlineasm( l_file_out, l_routine_name, l_xgemm_desc, l_arch ); } else { libxsmm_generator_gemm_directasm( l_file_out, l_routine_name, l_xgemm_desc, l_arch ); } } return EXIT_SUCCESS; } libxsmm-1.17/src/libxsmm_hash.c000066400000000000000000000737441415223013700165430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "libxsmm_hash.h" #include "libxsmm_main.h" #if !defined(LIBXSMM_HASH_ALIGNMENT) # define LIBXSMM_HASH_ALIGNMENT 8 #endif #define LIBXSMM_HASH_U64(FN, SEED, BEGIN, END) { \ const uint8_t *const end = (NULL != (END) ? ((END) - 7) : NULL); \ for (; (BEGIN) < end; (BEGIN) += 8) { LIBXSMM_ASSERT(NULL != (BEGIN) || NULL == (END)); \ SEED = (uint32_t)FN(SEED, BEGIN); \ } \ } #define LIBXSMM_HASH_U32(FN, SEED, BEGIN, END) { \ const uint8_t *const next = (BEGIN) + 4; \ if (next <= (END)) { LIBXSMM_ASSERT(NULL != (BEGIN) || NULL == (END)); \ SEED = FN(SEED, BEGIN); BEGIN = next; \ } \ } #define LIBXSMM_HASH_U16(FN, SEED, BEGIN, END) { \ const uint8_t *const next = (BEGIN) + 2; \ if (next <= (END)) { LIBXSMM_ASSERT(NULL != (BEGIN) || NULL == (END)); \ SEED = FN(SEED, BEGIN); BEGIN = next; \ } \ } #define LIBXSMM_HASH_U8(FN, SEED, BEGIN, END) { \ if ((BEGIN) < (END)) { LIBXSMM_ASSERT(NULL != (BEGIN) || NULL == (END)); \ SEED = FN(SEED, BEGIN); ++(BEGIN); \ } \ } #define LIBXSMM_HASH_CRC32_U8(SEED, PVALUE) _mm_crc32_u8(SEED, *(const uint8_t*)(PVALUE)) #define LIBXSMM_HASH_CRC32_U16(SEED, PVALUE) _mm_crc32_u16(SEED, *(const uint16_t*)(PVALUE)) #define LIBXSMM_HASH_CRC32_U32(SEED, PVALUE) _mm_crc32_u32(SEED, *(const uint32_t*)(PVALUE)) #if (64 > (LIBXSMM_BITS)) || defined(__PGI) # define LIBXSMM_HASH_CRC32_U64(SEED, PVALUE) \ LIBXSMM_HASH_CRC32_U32(LIBXSMM_HASH_CRC32_U32((uint32_t)(SEED), PVALUE), (const uint32_t*)(PVALUE) + 1) #else # define LIBXSMM_HASH_CRC32_U64(SEED, PVALUE) _mm_crc32_u64(SEED, *(const uint64_t*)(PVALUE)) #endif #define LIBXSMM_HASH_UNALIGNED(FN64, FN32, FN16, FN8, SEED, DATA, SIZE) { \ const uint8_t *begin = (const uint8_t*)(DATA); \ const uint8_t *const endb = begin + (SIZE); \ LIBXSMM_HASH_U64(FN64, SEED, begin, endb); \ LIBXSMM_HASH_U32(FN32, SEED, begin, endb); \ LIBXSMM_HASH_U16(FN16, SEED, begin, endb); \ return begin == endb ? 
(SEED) : FN8(SEED, begin); \ } #if defined(LIBXSMM_HASH_ALIGNMENT) && 8 < (LIBXSMM_HASH_ALIGNMENT) # define LIBXSMM_HASH(FN64, FN32, FN16, FN8, SEED, DATA, SIZE) { \ const uint8_t *begin = (const uint8_t*)(DATA); \ const uint8_t *const endb = begin + (SIZE); \ const uint8_t *const enda = LIBXSMM_ALIGN(begin, LIBXSMM_HASH_ALIGNMENT); \ if ((SIZE) > (size_t)(endb - enda)) { \ LIBXSMM_HASH_U64(FN64, SEED, begin, enda); \ LIBXSMM_HASH_U32(FN32, SEED, begin, enda); \ LIBXSMM_HASH_U16(FN16, SEED, begin, enda); \ LIBXSMM_HASH_U8(FN8, SEED, begin, enda); \ } \ LIBXSMM_ASSUME_ALIGNED(begin, LIBXSMM_HASH_ALIGNMENT); \ LIBXSMM_HASH_U64(FN64, SEED, begin, endb); \ LIBXSMM_HASH_U32(FN32, SEED, begin, endb); \ LIBXSMM_HASH_U16(FN16, SEED, begin, endb); \ return begin == endb ? (SEED) : FN8(SEED, begin); \ } #elif defined(LIBXSMM_HASH_ALIGNMENT) && 1 < (LIBXSMM_HASH_ALIGNMENT) # define LIBXSMM_HASH(FN64, FN32, FN16, FN8, SEED, DATA, SIZE) { \ const uint8_t *begin = (const uint8_t*)(DATA); \ const uint8_t *const endb = begin + (SIZE); \ const uint8_t *const enda = LIBXSMM_ALIGN(begin, LIBXSMM_HASH_ALIGNMENT); \ if ((SIZE) > (size_t)(endb - enda)) { \ LIBXSMM_HASH_U32(FN32, SEED, begin, enda); \ LIBXSMM_HASH_U16(FN16, SEED, begin, enda); \ LIBXSMM_HASH_U8(FN8, SEED, begin, enda); \ } \ LIBXSMM_ASSUME_ALIGNED(begin, LIBXSMM_HASH_ALIGNMENT); \ LIBXSMM_HASH_U64(FN64, SEED, begin, endb); \ LIBXSMM_HASH_U32(FN32, SEED, begin, endb); \ LIBXSMM_HASH_U16(FN16, SEED, begin, endb); \ return begin == endb ? 
(SEED) : FN8(SEED, begin); \ } #else # define LIBXSMM_HASH LIBXSMM_HASH_UNALIGNED #endif typedef uint32_t internal_crc32_entry_type[256]; LIBXSMM_APIVAR_DEFINE(const internal_crc32_entry_type* internal_crc32_table); LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_u32_function); LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_u64_function); LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_u128_function); LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_u256_function); LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_u384_function); LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_u512_function); LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_function); LIBXSMM_API_INLINE unsigned int internal_crc32_u8(unsigned int seed, const void* value) { const uint8_t *const pu8 = (const uint8_t*)value; LIBXSMM_ASSERT(NULL != pu8 && NULL != internal_crc32_table); return internal_crc32_table[0][(seed^(*pu8)) & 0xFF] ^ (seed >> 8); } LIBXSMM_API_INLINE unsigned int internal_crc32_u16(unsigned int seed, const void* value) { const uint8_t *const pu8 = (const uint8_t*)value; LIBXSMM_ASSERT(NULL != pu8); seed = internal_crc32_u8(seed, pu8 + 0); seed = internal_crc32_u8(seed, pu8 + 1); return seed; } LIBXSMM_API_INTERN unsigned int internal_crc32_u32(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN unsigned int internal_crc32_u32(unsigned int seed, const void* value, ...) 
{ const uint32_t *const pu32 = (const uint32_t*)value; uint32_t c0, c1, c2, c3, s; LIBXSMM_ASSERT(NULL != pu32 && NULL != internal_crc32_table); s = seed ^ (*pu32); c0 = internal_crc32_table[0][(s >> 24) & 0xFF]; c1 = internal_crc32_table[1][(s >> 16) & 0xFF]; c2 = internal_crc32_table[2][(s >> 8) & 0xFF]; c3 = internal_crc32_table[3][(s & 0xFF)]; return (c0 ^ c1) ^ (c2 ^ c3); } LIBXSMM_API_INTERN unsigned int internal_crc32_u32_sse4(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE4) unsigned int internal_crc32_u32_sse4(unsigned int seed, const void* value, ...) { #if defined(LIBXSMM_INTRINSICS_SSE4) return LIBXSMM_HASH_CRC32_U32(seed, value); #else return internal_crc32_u32(seed, value); #endif } LIBXSMM_API_INTERN unsigned int internal_crc32_u64(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN unsigned int internal_crc32_u64(unsigned int seed, const void* value, ...) { const uint32_t *const pu32 = (const uint32_t*)value; LIBXSMM_ASSERT(NULL != pu32); seed = internal_crc32_u32(seed, pu32 + 0); seed = internal_crc32_u32(seed, pu32 + 1); return seed; } LIBXSMM_API_INTERN unsigned int internal_crc32_u64_sse4(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE4) unsigned int internal_crc32_u64_sse4(unsigned int seed, const void* value, ...) { #if defined(LIBXSMM_INTRINSICS_SSE4) return (unsigned int)LIBXSMM_HASH_CRC32_U64(seed, value); #else return internal_crc32_u64(seed, value); #endif } LIBXSMM_API_INTERN unsigned int internal_crc32_u128(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN unsigned int internal_crc32_u128(unsigned int seed, const void* value, ...) 
{ const uint64_t *const pu64 = (const uint64_t*)value; LIBXSMM_ASSERT(NULL != pu64); seed = internal_crc32_u64(seed, pu64 + 0); seed = internal_crc32_u64(seed, pu64 + 1); return seed; } LIBXSMM_API_INTERN unsigned int internal_crc32_u128_sse4(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE4) unsigned int internal_crc32_u128_sse4(unsigned int seed, const void* value, ...) { #if defined(LIBXSMM_INTRINSICS_SSE4) const uint64_t *const pu64 = (const uint64_t*)value; LIBXSMM_ASSERT(NULL != pu64); seed = (unsigned int)LIBXSMM_HASH_CRC32_U64(seed, pu64 + 0); seed = (unsigned int)LIBXSMM_HASH_CRC32_U64(seed, pu64 + 1); #else seed = internal_crc32_u128(seed, value); #endif return seed; } LIBXSMM_API_INTERN unsigned int internal_crc32_u256(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN unsigned int internal_crc32_u256(unsigned int seed, const void* value, ...) { const uint8_t *const pu8 = (const uint8_t*)value; LIBXSMM_ASSERT(NULL != pu8); seed = internal_crc32_u128(seed, pu8 + 0x00); seed = internal_crc32_u128(seed, pu8 + 0x10); return seed; } LIBXSMM_API_INTERN unsigned int internal_crc32_u256_sse4(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE4) unsigned int internal_crc32_u256_sse4(unsigned int seed, const void* value, ...) { #if defined(LIBXSMM_INTRINSICS_SSE4) const uint8_t *const pu8 = (const uint8_t*)value; LIBXSMM_ASSERT(NULL != pu8); seed = internal_crc32_u128_sse4(seed, pu8 + 0x00); seed = internal_crc32_u128_sse4(seed, pu8 + 0x10); return seed; #else return internal_crc32_u256(seed, value); #endif } LIBXSMM_API_INTERN unsigned int internal_crc32_u384(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN unsigned int internal_crc32_u384(unsigned int seed, const void* value, ...) 
{ const uint8_t *const pu8 = (const uint8_t*)value; LIBXSMM_ASSERT(NULL != pu8); seed = internal_crc32_u256(seed, pu8 + 0x00); seed = internal_crc32_u128(seed, pu8 + 0x20); return seed; } LIBXSMM_API_INTERN unsigned int internal_crc32_u384_sse4(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE4) unsigned int internal_crc32_u384_sse4(unsigned int seed, const void* value, ...) { #if defined(LIBXSMM_INTRINSICS_SSE4) const uint8_t *const pu8 = (const uint8_t*)value; LIBXSMM_ASSERT(NULL != pu8); seed = internal_crc32_u256_sse4(seed, pu8 + 0x00); seed = internal_crc32_u128_sse4(seed, pu8 + 0x20); return seed; #else return internal_crc32_u384(seed, value); #endif } LIBXSMM_API_INTERN unsigned int internal_crc32_u512(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN unsigned int internal_crc32_u512(unsigned int seed, const void* value, ...) { const uint8_t *const pu8 = (const uint8_t*)value; LIBXSMM_ASSERT(NULL != pu8); seed = internal_crc32_u256(seed, pu8 + 0x00); seed = internal_crc32_u256(seed, pu8 + 0x20); return seed; } LIBXSMM_API_INTERN unsigned int internal_crc32_u512_sse4(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE4) unsigned int internal_crc32_u512_sse4(unsigned int seed, const void* value, ...) 
{ #if defined(LIBXSMM_INTRINSICS_SSE4) const uint8_t *const pu8 = (const uint8_t*)value; LIBXSMM_ASSERT(NULL != pu8); seed = internal_crc32_u256_sse4(seed, pu8 + 0x00); seed = internal_crc32_u256_sse4(seed, pu8 + 0x20); return seed; #else return internal_crc32_u512(seed, value); #endif } LIBXSMM_API_INTERN unsigned int internal_crc32(unsigned int seed, const void* data, size_t size); LIBXSMM_API_INTERN unsigned int internal_crc32(unsigned int seed, const void* data, size_t size) { LIBXSMM_ASSERT(NULL != data || 0 == size); LIBXSMM_HASH(internal_crc32_u64, internal_crc32_u32, internal_crc32_u16, internal_crc32_u8, seed, data, size); } LIBXSMM_API_INTERN unsigned int internal_crc32_sse4(unsigned int seed, const void* data, size_t size); LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE4) unsigned int internal_crc32_sse4(unsigned int seed, const void* data, size_t size) { LIBXSMM_ASSERT(NULL != data || 0 == size); #if defined(LIBXSMM_INTRINSICS_SSE4) LIBXSMM_HASH(LIBXSMM_HASH_CRC32_U64, LIBXSMM_HASH_CRC32_U32, LIBXSMM_HASH_CRC32_U16, LIBXSMM_HASH_CRC32_U8, seed, data, size); #else return internal_crc32(seed, data, size); #endif } LIBXSMM_API_INTERN void libxsmm_hash_init(int target_arch) { /* table-based implementation taken from http://dpdk.org/. 
*/ static const LIBXSMM_RETARGETABLE internal_crc32_entry_type crc32_table[] = { { /*table0*/ 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 
0x3ED04330, 0xCCBBC033, 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351 }, { /*table1*/ 0x00000000, 0x13A29877, 0x274530EE, 0x34E7A899, 0x4E8A61DC, 0x5D28F9AB, 0x69CF5132, 0x7A6DC945, 0x9D14C3B8, 0x8EB65BCF, 0xBA51F356, 0xA9F36B21, 0xD39EA264, 0xC03C3A13, 0xF4DB928A, 0xE7790AFD, 0x3FC5F181, 0x2C6769F6, 0x1880C16F, 0x0B225918, 0x714F905D, 0x62ED082A, 0x560AA0B3, 0x45A838C4, 0xA2D13239, 0xB173AA4E, 0x859402D7, 0x96369AA0, 0xEC5B53E5, 0xFFF9CB92, 0xCB1E630B, 0xD8BCFB7C, 0x7F8BE302, 0x6C297B75, 0x58CED3EC, 0x4B6C4B9B, 0x310182DE, 0x22A31AA9, 0x1644B230, 0x05E62A47, 0xE29F20BA, 0xF13DB8CD, 0xC5DA1054, 0xD6788823, 0xAC154166, 0xBFB7D911, 0x8B507188, 0x98F2E9FF, 0x404E1283, 0x53EC8AF4, 0x670B226D, 0x74A9BA1A, 0x0EC4735F, 0x1D66EB28, 0x298143B1, 0x3A23DBC6, 0xDD5AD13B, 0xCEF8494C, 0xFA1FE1D5, 0xE9BD79A2, 0x93D0B0E7, 0x80722890, 0xB4958009, 0xA737187E, 0xFF17C604, 0xECB55E73, 0xD852F6EA, 
0xCBF06E9D, 0xB19DA7D8, 0xA23F3FAF, 0x96D89736, 0x857A0F41, 0x620305BC, 0x71A19DCB, 0x45463552, 0x56E4AD25, 0x2C896460, 0x3F2BFC17, 0x0BCC548E, 0x186ECCF9, 0xC0D23785, 0xD370AFF2, 0xE797076B, 0xF4359F1C, 0x8E585659, 0x9DFACE2E, 0xA91D66B7, 0xBABFFEC0, 0x5DC6F43D, 0x4E646C4A, 0x7A83C4D3, 0x69215CA4, 0x134C95E1, 0x00EE0D96, 0x3409A50F, 0x27AB3D78, 0x809C2506, 0x933EBD71, 0xA7D915E8, 0xB47B8D9F, 0xCE1644DA, 0xDDB4DCAD, 0xE9537434, 0xFAF1EC43, 0x1D88E6BE, 0x0E2A7EC9, 0x3ACDD650, 0x296F4E27, 0x53028762, 0x40A01F15, 0x7447B78C, 0x67E52FFB, 0xBF59D487, 0xACFB4CF0, 0x981CE469, 0x8BBE7C1E, 0xF1D3B55B, 0xE2712D2C, 0xD69685B5, 0xC5341DC2, 0x224D173F, 0x31EF8F48, 0x050827D1, 0x16AABFA6, 0x6CC776E3, 0x7F65EE94, 0x4B82460D, 0x5820DE7A, 0xFBC3FAF9, 0xE861628E, 0xDC86CA17, 0xCF245260, 0xB5499B25, 0xA6EB0352, 0x920CABCB, 0x81AE33BC, 0x66D73941, 0x7575A136, 0x419209AF, 0x523091D8, 0x285D589D, 0x3BFFC0EA, 0x0F186873, 0x1CBAF004, 0xC4060B78, 0xD7A4930F, 0xE3433B96, 0xF0E1A3E1, 0x8A8C6AA4, 0x992EF2D3, 0xADC95A4A, 0xBE6BC23D, 0x5912C8C0, 0x4AB050B7, 0x7E57F82E, 0x6DF56059, 0x1798A91C, 0x043A316B, 0x30DD99F2, 0x237F0185, 0x844819FB, 0x97EA818C, 0xA30D2915, 0xB0AFB162, 0xCAC27827, 0xD960E050, 0xED8748C9, 0xFE25D0BE, 0x195CDA43, 0x0AFE4234, 0x3E19EAAD, 0x2DBB72DA, 0x57D6BB9F, 0x447423E8, 0x70938B71, 0x63311306, 0xBB8DE87A, 0xA82F700D, 0x9CC8D894, 0x8F6A40E3, 0xF50789A6, 0xE6A511D1, 0xD242B948, 0xC1E0213F, 0x26992BC2, 0x353BB3B5, 0x01DC1B2C, 0x127E835B, 0x68134A1E, 0x7BB1D269, 0x4F567AF0, 0x5CF4E287, 0x04D43CFD, 0x1776A48A, 0x23910C13, 0x30339464, 0x4A5E5D21, 0x59FCC556, 0x6D1B6DCF, 0x7EB9F5B8, 0x99C0FF45, 0x8A626732, 0xBE85CFAB, 0xAD2757DC, 0xD74A9E99, 0xC4E806EE, 0xF00FAE77, 0xE3AD3600, 0x3B11CD7C, 0x28B3550B, 0x1C54FD92, 0x0FF665E5, 0x759BACA0, 0x663934D7, 0x52DE9C4E, 0x417C0439, 0xA6050EC4, 0xB5A796B3, 0x81403E2A, 0x92E2A65D, 0xE88F6F18, 0xFB2DF76F, 0xCFCA5FF6, 0xDC68C781, 0x7B5FDFFF, 0x68FD4788, 0x5C1AEF11, 0x4FB87766, 0x35D5BE23, 0x26772654, 0x12908ECD, 0x013216BA, 0xE64B1C47, 
0xF5E98430, 0xC10E2CA9, 0xD2ACB4DE, 0xA8C17D9B, 0xBB63E5EC, 0x8F844D75, 0x9C26D502, 0x449A2E7E, 0x5738B609, 0x63DF1E90, 0x707D86E7, 0x0A104FA2, 0x19B2D7D5, 0x2D557F4C, 0x3EF7E73B, 0xD98EEDC6, 0xCA2C75B1, 0xFECBDD28, 0xED69455F, 0x97048C1A, 0x84A6146D, 0xB041BCF4, 0xA3E32483 }, { /*table2*/ 0x00000000, 0xA541927E, 0x4F6F520D, 0xEA2EC073, 0x9EDEA41A, 0x3B9F3664, 0xD1B1F617, 0x74F06469, 0x38513EC5, 0x9D10ACBB, 0x773E6CC8, 0xD27FFEB6, 0xA68F9ADF, 0x03CE08A1, 0xE9E0C8D2, 0x4CA15AAC, 0x70A27D8A, 0xD5E3EFF4, 0x3FCD2F87, 0x9A8CBDF9, 0xEE7CD990, 0x4B3D4BEE, 0xA1138B9D, 0x045219E3, 0x48F3434F, 0xEDB2D131, 0x079C1142, 0xA2DD833C, 0xD62DE755, 0x736C752B, 0x9942B558, 0x3C032726, 0xE144FB14, 0x4405696A, 0xAE2BA919, 0x0B6A3B67, 0x7F9A5F0E, 0xDADBCD70, 0x30F50D03, 0x95B49F7D, 0xD915C5D1, 0x7C5457AF, 0x967A97DC, 0x333B05A2, 0x47CB61CB, 0xE28AF3B5, 0x08A433C6, 0xADE5A1B8, 0x91E6869E, 0x34A714E0, 0xDE89D493, 0x7BC846ED, 0x0F382284, 0xAA79B0FA, 0x40577089, 0xE516E2F7, 0xA9B7B85B, 0x0CF62A25, 0xE6D8EA56, 0x43997828, 0x37691C41, 0x92288E3F, 0x78064E4C, 0xDD47DC32, 0xC76580D9, 0x622412A7, 0x880AD2D4, 0x2D4B40AA, 0x59BB24C3, 0xFCFAB6BD, 0x16D476CE, 0xB395E4B0, 0xFF34BE1C, 0x5A752C62, 0xB05BEC11, 0x151A7E6F, 0x61EA1A06, 0xC4AB8878, 0x2E85480B, 0x8BC4DA75, 0xB7C7FD53, 0x12866F2D, 0xF8A8AF5E, 0x5DE93D20, 0x29195949, 0x8C58CB37, 0x66760B44, 0xC337993A, 0x8F96C396, 0x2AD751E8, 0xC0F9919B, 0x65B803E5, 0x1148678C, 0xB409F5F2, 0x5E273581, 0xFB66A7FF, 0x26217BCD, 0x8360E9B3, 0x694E29C0, 0xCC0FBBBE, 0xB8FFDFD7, 0x1DBE4DA9, 0xF7908DDA, 0x52D11FA4, 0x1E704508, 0xBB31D776, 0x511F1705, 0xF45E857B, 0x80AEE112, 0x25EF736C, 0xCFC1B31F, 0x6A802161, 0x56830647, 0xF3C29439, 0x19EC544A, 0xBCADC634, 0xC85DA25D, 0x6D1C3023, 0x8732F050, 0x2273622E, 0x6ED23882, 0xCB93AAFC, 0x21BD6A8F, 0x84FCF8F1, 0xF00C9C98, 0x554D0EE6, 0xBF63CE95, 0x1A225CEB, 0x8B277743, 0x2E66E53D, 0xC448254E, 0x6109B730, 0x15F9D359, 0xB0B84127, 0x5A968154, 0xFFD7132A, 0xB3764986, 0x1637DBF8, 0xFC191B8B, 0x595889F5, 0x2DA8ED9C, 0x88E97FE2, 
0x62C7BF91, 0xC7862DEF, 0xFB850AC9, 0x5EC498B7, 0xB4EA58C4, 0x11ABCABA, 0x655BAED3, 0xC01A3CAD, 0x2A34FCDE, 0x8F756EA0, 0xC3D4340C, 0x6695A672, 0x8CBB6601, 0x29FAF47F, 0x5D0A9016, 0xF84B0268, 0x1265C21B, 0xB7245065, 0x6A638C57, 0xCF221E29, 0x250CDE5A, 0x804D4C24, 0xF4BD284D, 0x51FCBA33, 0xBBD27A40, 0x1E93E83E, 0x5232B292, 0xF77320EC, 0x1D5DE09F, 0xB81C72E1, 0xCCEC1688, 0x69AD84F6, 0x83834485, 0x26C2D6FB, 0x1AC1F1DD, 0xBF8063A3, 0x55AEA3D0, 0xF0EF31AE, 0x841F55C7, 0x215EC7B9, 0xCB7007CA, 0x6E3195B4, 0x2290CF18, 0x87D15D66, 0x6DFF9D15, 0xC8BE0F6B, 0xBC4E6B02, 0x190FF97C, 0xF321390F, 0x5660AB71, 0x4C42F79A, 0xE90365E4, 0x032DA597, 0xA66C37E9, 0xD29C5380, 0x77DDC1FE, 0x9DF3018D, 0x38B293F3, 0x7413C95F, 0xD1525B21, 0x3B7C9B52, 0x9E3D092C, 0xEACD6D45, 0x4F8CFF3B, 0xA5A23F48, 0x00E3AD36, 0x3CE08A10, 0x99A1186E, 0x738FD81D, 0xD6CE4A63, 0xA23E2E0A, 0x077FBC74, 0xED517C07, 0x4810EE79, 0x04B1B4D5, 0xA1F026AB, 0x4BDEE6D8, 0xEE9F74A6, 0x9A6F10CF, 0x3F2E82B1, 0xD50042C2, 0x7041D0BC, 0xAD060C8E, 0x08479EF0, 0xE2695E83, 0x4728CCFD, 0x33D8A894, 0x96993AEA, 0x7CB7FA99, 0xD9F668E7, 0x9557324B, 0x3016A035, 0xDA386046, 0x7F79F238, 0x0B899651, 0xAEC8042F, 0x44E6C45C, 0xE1A75622, 0xDDA47104, 0x78E5E37A, 0x92CB2309, 0x378AB177, 0x437AD51E, 0xE63B4760, 0x0C158713, 0xA954156D, 0xE5F54FC1, 0x40B4DDBF, 0xAA9A1DCC, 0x0FDB8FB2, 0x7B2BEBDB, 0xDE6A79A5, 0x3444B9D6, 0x91052BA8 }, { /*table3*/ 0x00000000, 0xDD45AAB8, 0xBF672381, 0x62228939, 0x7B2231F3, 0xA6679B4B, 0xC4451272, 0x1900B8CA, 0xF64463E6, 0x2B01C95E, 0x49234067, 0x9466EADF, 0x8D665215, 0x5023F8AD, 0x32017194, 0xEF44DB2C, 0xE964B13D, 0x34211B85, 0x560392BC, 0x8B463804, 0x924680CE, 0x4F032A76, 0x2D21A34F, 0xF06409F7, 0x1F20D2DB, 0xC2657863, 0xA047F15A, 0x7D025BE2, 0x6402E328, 0xB9474990, 0xDB65C0A9, 0x06206A11, 0xD725148B, 0x0A60BE33, 0x6842370A, 0xB5079DB2, 0xAC072578, 0x71428FC0, 0x136006F9, 0xCE25AC41, 0x2161776D, 0xFC24DDD5, 0x9E0654EC, 0x4343FE54, 0x5A43469E, 0x8706EC26, 0xE524651F, 0x3861CFA7, 0x3E41A5B6, 0xE3040F0E, 0x81268637, 
0x5C632C8F, 0x45639445, 0x98263EFD, 0xFA04B7C4, 0x27411D7C, 0xC805C650, 0x15406CE8, 0x7762E5D1, 0xAA274F69, 0xB327F7A3, 0x6E625D1B, 0x0C40D422, 0xD1057E9A, 0xABA65FE7, 0x76E3F55F, 0x14C17C66, 0xC984D6DE, 0xD0846E14, 0x0DC1C4AC, 0x6FE34D95, 0xB2A6E72D, 0x5DE23C01, 0x80A796B9, 0xE2851F80, 0x3FC0B538, 0x26C00DF2, 0xFB85A74A, 0x99A72E73, 0x44E284CB, 0x42C2EEDA, 0x9F874462, 0xFDA5CD5B, 0x20E067E3, 0x39E0DF29, 0xE4A57591, 0x8687FCA8, 0x5BC25610, 0xB4868D3C, 0x69C32784, 0x0BE1AEBD, 0xD6A40405, 0xCFA4BCCF, 0x12E11677, 0x70C39F4E, 0xAD8635F6, 0x7C834B6C, 0xA1C6E1D4, 0xC3E468ED, 0x1EA1C255, 0x07A17A9F, 0xDAE4D027, 0xB8C6591E, 0x6583F3A6, 0x8AC7288A, 0x57828232, 0x35A00B0B, 0xE8E5A1B3, 0xF1E51979, 0x2CA0B3C1, 0x4E823AF8, 0x93C79040, 0x95E7FA51, 0x48A250E9, 0x2A80D9D0, 0xF7C57368, 0xEEC5CBA2, 0x3380611A, 0x51A2E823, 0x8CE7429B, 0x63A399B7, 0xBEE6330F, 0xDCC4BA36, 0x0181108E, 0x1881A844, 0xC5C402FC, 0xA7E68BC5, 0x7AA3217D, 0x52A0C93F, 0x8FE56387, 0xEDC7EABE, 0x30824006, 0x2982F8CC, 0xF4C75274, 0x96E5DB4D, 0x4BA071F5, 0xA4E4AAD9, 0x79A10061, 0x1B838958, 0xC6C623E0, 0xDFC69B2A, 0x02833192, 0x60A1B8AB, 0xBDE41213, 0xBBC47802, 0x6681D2BA, 0x04A35B83, 0xD9E6F13B, 0xC0E649F1, 0x1DA3E349, 0x7F816A70, 0xA2C4C0C8, 0x4D801BE4, 0x90C5B15C, 0xF2E73865, 0x2FA292DD, 0x36A22A17, 0xEBE780AF, 0x89C50996, 0x5480A32E, 0x8585DDB4, 0x58C0770C, 0x3AE2FE35, 0xE7A7548D, 0xFEA7EC47, 0x23E246FF, 0x41C0CFC6, 0x9C85657E, 0x73C1BE52, 0xAE8414EA, 0xCCA69DD3, 0x11E3376B, 0x08E38FA1, 0xD5A62519, 0xB784AC20, 0x6AC10698, 0x6CE16C89, 0xB1A4C631, 0xD3864F08, 0x0EC3E5B0, 0x17C35D7A, 0xCA86F7C2, 0xA8A47EFB, 0x75E1D443, 0x9AA50F6F, 0x47E0A5D7, 0x25C22CEE, 0xF8878656, 0xE1873E9C, 0x3CC29424, 0x5EE01D1D, 0x83A5B7A5, 0xF90696D8, 0x24433C60, 0x4661B559, 0x9B241FE1, 0x8224A72B, 0x5F610D93, 0x3D4384AA, 0xE0062E12, 0x0F42F53E, 0xD2075F86, 0xB025D6BF, 0x6D607C07, 0x7460C4CD, 0xA9256E75, 0xCB07E74C, 0x16424DF4, 0x106227E5, 0xCD278D5D, 0xAF050464, 0x7240AEDC, 0x6B401616, 0xB605BCAE, 0xD4273597, 0x09629F2F, 0xE6264403, 
0x3B63EEBB, 0x59416782, 0x8404CD3A, 0x9D0475F0, 0x4041DF48, 0x22635671, 0xFF26FCC9, 0x2E238253, 0xF36628EB, 0x9144A1D2, 0x4C010B6A, 0x5501B3A0, 0x88441918, 0xEA669021, 0x37233A99, 0xD867E1B5, 0x05224B0D, 0x6700C234, 0xBA45688C, 0xA345D046, 0x7E007AFE, 0x1C22F3C7, 0xC167597F, 0xC747336E, 0x1A0299D6, 0x782010EF, 0xA565BA57, 0xBC65029D, 0x6120A825, 0x0302211C, 0xDE478BA4, 0x31035088, 0xEC46FA30, 0x8E647309, 0x5321D9B1, 0x4A21617B, 0x9764CBC3, 0xF54642FA, 0x2803E842 } }; internal_crc32_table = crc32_table; #if (LIBXSMM_X86_SSE4 <= LIBXSMM_STATIC_TARGET_ARCH) LIBXSMM_UNUSED(target_arch); #else if (LIBXSMM_X86_SSE4 <= target_arch) #endif { internal_hash_u32_function = internal_crc32_u32_sse4; internal_hash_u64_function = internal_crc32_u64_sse4; internal_hash_u128_function = internal_crc32_u128_sse4; internal_hash_u256_function = internal_crc32_u256_sse4; internal_hash_u384_function = internal_crc32_u384_sse4; internal_hash_u512_function = internal_crc32_u512_sse4; internal_hash_function = (libxsmm_hash_function)internal_crc32_sse4; } #if (LIBXSMM_X86_SSE4 > LIBXSMM_STATIC_TARGET_ARCH) else { # if !defined(LIBXSMM_INTRINSICS_SSE4) static int error_once = 0; if (0 == error_once && 0 != libxsmm_verbosity) { /* library code is expected to be mute */ fprintf(stderr, "LIBXSMM WARNING: unable to access CRC32 instructions due to the compiler used!\n"); error_once = 1; /* no need for atomics */ } # endif internal_hash_u32_function = internal_crc32_u32; internal_hash_u64_function = internal_crc32_u64; internal_hash_u128_function = internal_crc32_u128; internal_hash_u256_function = internal_crc32_u256; internal_hash_u384_function = internal_crc32_u384; internal_hash_u512_function = internal_crc32_u512; internal_hash_function = (libxsmm_hash_function)internal_crc32; } #endif LIBXSMM_ASSERT(NULL != internal_hash_u32_function); LIBXSMM_ASSERT(NULL != internal_hash_u64_function); LIBXSMM_ASSERT(NULL != internal_hash_u128_function); LIBXSMM_ASSERT(NULL != internal_hash_u256_function); 
LIBXSMM_ASSERT(NULL != internal_hash_u384_function); LIBXSMM_ASSERT(NULL != internal_hash_u512_function); LIBXSMM_ASSERT(NULL != internal_hash_function); } LIBXSMM_API_INTERN void libxsmm_hash_finalize(void) { #if !defined(NDEBUG) internal_crc32_table = NULL; internal_hash_u32_function = NULL; internal_hash_u64_function = NULL; internal_hash_u128_function = NULL; internal_hash_u256_function = NULL; internal_hash_u384_function = NULL; internal_hash_u512_function = NULL; internal_hash_function = NULL; #endif } LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u32(unsigned int seed, const void* value, ...) { #if (LIBXSMM_X86_SSE4 <= LIBXSMM_STATIC_TARGET_ARCH) return LIBXSMM_HASH_CRC32_U32(seed, value); #elif (LIBXSMM_X86_SSE4 > LIBXSMM_MAX_STATIC_TARGET_ARCH) return internal_crc32_u32(seed, value); #else /* pointer based function call */ LIBXSMM_ASSERT(NULL != internal_hash_u32_function); return internal_hash_u32_function(seed, value); #endif } LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u64(unsigned int seed, const void* value, ...) { #if (LIBXSMM_X86_SSE4 <= LIBXSMM_STATIC_TARGET_ARCH) return (unsigned int)LIBXSMM_HASH_CRC32_U64(seed, value); #elif (LIBXSMM_X86_SSE4 > LIBXSMM_MAX_STATIC_TARGET_ARCH) return internal_crc32_u64(seed, value); #else /* pointer based function call */ LIBXSMM_ASSERT(NULL != internal_hash_u64_function); return internal_hash_u64_function(seed, value); #endif } LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u128(unsigned int seed, const void* value, ...) { #if (LIBXSMM_X86_SSE4 <= LIBXSMM_STATIC_TARGET_ARCH) return internal_crc32_u128_sse4(seed, value); #elif (LIBXSMM_X86_SSE4 > LIBXSMM_MAX_STATIC_TARGET_ARCH) return internal_crc32_u128(seed, value); #else /* pointer based function call */ LIBXSMM_ASSERT(NULL != internal_hash_u128_function); return internal_hash_u128_function(seed, value); #endif } LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u256(unsigned int seed, const void* value, ...) 
{ #if (LIBXSMM_X86_SSE4 <= LIBXSMM_STATIC_TARGET_ARCH) return internal_crc32_u256_sse4(seed, value); #elif (LIBXSMM_X86_SSE4 > LIBXSMM_MAX_STATIC_TARGET_ARCH) return internal_crc32_u256(seed, value); #else /* pointer based function call */ LIBXSMM_ASSERT(NULL != internal_hash_u256_function); return internal_hash_u256_function(seed, value); #endif } LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u384(unsigned int seed, const void* value, ...) { #if (LIBXSMM_X86_SSE4 <= LIBXSMM_STATIC_TARGET_ARCH) return internal_crc32_u384_sse4(seed, value); #elif (LIBXSMM_X86_SSE4 > LIBXSMM_MAX_STATIC_TARGET_ARCH) return internal_crc32_u384(seed, value); #else /* pointer based function call */ LIBXSMM_ASSERT(NULL != internal_hash_u384_function); return internal_hash_u384_function(seed, value); #endif } LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u512(unsigned int seed, const void* value, ...) { #if (LIBXSMM_X86_SSE4 <= LIBXSMM_STATIC_TARGET_ARCH) return internal_crc32_u512_sse4(seed, value); #elif (LIBXSMM_X86_SSE4 > LIBXSMM_MAX_STATIC_TARGET_ARCH) return internal_crc32_u512(seed, value); #else /* pointer based function call */ LIBXSMM_ASSERT(NULL != internal_hash_u512_function); return internal_hash_u512_function(seed, value); #endif } LIBXSMM_API_INTERN unsigned int libxsmm_crc32(unsigned int seed, const void* data, size_t size) { #if (LIBXSMM_X86_SSE4 <= LIBXSMM_STATIC_TARGET_ARCH) return internal_crc32_sse4(seed, data, size); #elif (LIBXSMM_X86_SSE4 > LIBXSMM_MAX_STATIC_TARGET_ARCH) return internal_crc32(seed, data, size); #else /* pointer based function call */ LIBXSMM_ASSERT(NULL != internal_hash_function); return internal_hash_function(seed, data, size); #endif } libxsmm-1.17/src/libxsmm_hash.h000066400000000000000000000046611415223013700165400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. 
* * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_HASH_H #define LIBXSMM_HASH_H #include /* Map number of Bits to corresponding routine. */ #define LIBXSMM_CRC32U(N) LIBXSMM_CONCATENATE(libxsmm_crc32_u, N) /* Map number of Bytes to number of bits. */ #define LIBXSMM_CRC32(N) LIBXSMM_CONCATENATE(libxsmm_crc32_b, N) #define libxsmm_crc32_b4 libxsmm_crc32_u32 #define libxsmm_crc32_b8 libxsmm_crc32_u64 #define libxsmm_crc32_b16 libxsmm_crc32_u128 #define libxsmm_crc32_b32 libxsmm_crc32_u256 #define libxsmm_crc32_b48 libxsmm_crc32_u384 #define libxsmm_crc32_b64 libxsmm_crc32_u512 /** Function type representing the CRC32 functionality. */ LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE unsigned int (*libxsmm_hash_function)( unsigned int /*seed*/, const void* /*data*/, ... /*size*/); /** Initialize hash function module; not thread-safe. */ LIBXSMM_API_INTERN void libxsmm_hash_init(int target_arch); LIBXSMM_API_INTERN void libxsmm_hash_finalize(void); LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u32(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u64(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u128(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u256(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u384(unsigned int seed, const void* value, ...); LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u512(unsigned int seed, const void* value, ...); /** Calculate the CRC32 for a given quantity (size) of raw data according to the seed. 
*/ LIBXSMM_API_INTERN unsigned int libxsmm_crc32(unsigned int seed, const void* data, size_t size); #endif /*LIBXSMM_HASH_H*/ libxsmm-1.17/src/libxsmm_main.c000066400000000000000000006716751415223013700165530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst, Alexander Heinecke (Intel Corp.) ******************************************************************************/ #include "libxsmm_trace.h" #include "libxsmm_xcopy.h" #include "libxsmm_gemm.h" #include "libxsmm_hash.h" #include "libxsmm_diff.h" #include "libxsmm_main.h" #if defined(LIBXSMM_PERF) # include "libxsmm_perf.h" #endif #include "generator_common.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if !defined(NDEBUG) # include #endif #if defined(_WIN32) # include #else # include # include # include # include # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if !defined(LIBXSMM_CODE_MAXSIZE) # define LIBXSMM_CODE_MAXSIZE 131072 #endif #if !defined(LIBXSMM_DIFF_SIZE) # define LIBXSMM_DIFF_SIZE LIBXSMM_DESCRIPTOR_SIGSIZE #endif #if !defined(LIBXSMM_HASH_SIZE) # define LIBXSMM_HASH_SIZE 32 #endif #if !defined(LIBXSMM_HASH_SEED) # define LIBXSMM_HASH_SEED 25071975 #endif #if !defined(LIBXSMM_MALLOC_HOOK_ALIGN) && 1 # define LIBXSMM_MALLOC_HOOK_ALIGN #endif #if !defined(LIBXSMM_MALLOC_HOOK_INIT) && 0 # define LIBXSMM_MALLOC_HOOK_INIT #endif #if !defined(LIBXSMM_ENABLE_DEREG) && 0 # define LIBXSMM_ENABLE_DEREG #endif #if !defined(LIBXSMM_REGLOCK_TRY) && 0 # define LIBXSMM_REGLOCK_TRY #endif #if 
!defined(LIBXSMM_UNIFY_LOCKS) && 1 # define LIBXSMM_UNIFY_LOCKS #endif #if !defined(LIBXSMM_DIFF_INLINE) && 1 # define LIBXSMM_DIFF_INLINE #endif #if !defined(LIBXSMM_DESC_INLINE) && 0 # define LIBXSMM_DESC_INLINE #endif #if !defined(LIBXSMM_DESC_PAD) && 1 # define LIBXSMM_DESC_PAD #endif #if !defined(LIBXSMM_CACHE_PAD) && 1 # define LIBXSMM_CACHE_PAD #endif #if !defined(LIBXSMM_AUTOPIN) && 1 # define LIBXSMM_AUTOPIN #endif #if !defined(INTERNAL_DELIMS) # define INTERNAL_DELIMS ";,:" #endif #if defined(LIBXSMM_AUTOPIN) && !defined(_WIN32) LIBXSMM_EXTERN int putenv(char*) LIBXSMM_THROW; #endif /* flag fused into the memory address of a code version in case of non-JIT */ #define LIBXSMM_CODE_STATIC (1ULL << (8 * sizeof(void*) - 1)) /* flag fused into the memory address of a code version in case of collision */ #if 1 /* beneficial when registry approaches capacity (collisions) */ # define LIBXSMM_HASH_COLLISION (1ULL << (8 * sizeof(void*) - 2)) #endif /** Helper macro determining the default prefetch strategy which is used for statically generated kernels. 
*/ #if (0 > LIBXSMM_PREFETCH) /* auto-prefetch (frontend) */ || (defined(_WIN32) || defined(__CYGWIN__)) # define INTERNAL_PREFETCH LIBXSMM_GEMM_PREFETCH_NONE #else # define INTERNAL_PREFETCH ((libxsmm_gemm_prefetch_type)LIBXSMM_PREFETCH) #endif #if (0 != LIBXSMM_SYNC) # if !defined(INTERNAL_REGLOCK_MAXN) # if defined(_MSC_VER) # define INTERNAL_REGLOCK_MAXN 0 # else # define INTERNAL_REGLOCK_MAXN 0 # endif # endif # if (1 < INTERNAL_REGLOCK_MAXN) # if !defined(LIBXSMM_CACHE_MAXSIZE) && (8 > INTERNAL_REGLOCK_MAXN) # define LIBXSMM_CACHE_MAXSIZE LIBXSMM_CAPACITY_CACHE # endif # if !defined(LIBXSMM_REGLOCK) # define LIBXSMM_REGLOCK LIBXSMM_LOCK_DEFAULT # endif # if !defined(LIBXSMM_CLEANUP_NTRY) # define LIBXSMM_CLEANUP_NTRY 7 # endif # if LIBXSMM_LOCK_TYPE_ISPOD(LIBXSMM_REGLOCK) LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_reglocktype { char pad[LIBXSMM_CACHELINE]; LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK) state; } internal_reglocktype; # else LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_reglocktype { LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK) state; } internal_reglocktype; # endif LIBXSMM_APIVAR_DEFINE(internal_reglocktype internal_reglock[INTERNAL_REGLOCK_MAXN]); # else /* RW-lock */ # if !defined(LIBXSMM_CACHE_MAXSIZE) # define LIBXSMM_CACHE_MAXSIZE LIBXSMM_CAPACITY_CACHE # endif # if !defined(LIBXSMM_REGLOCK) # if defined(LIBXSMM_UNIFY_LOCKS) # define LIBXSMM_REGLOCK LIBXSMM_LOCK # elif defined(_MSC_VER) # define LIBXSMM_REGLOCK LIBXSMM_LOCK_MUTEX # elif 0 # define LIBXSMM_REGLOCK LIBXSMM_LOCK_RWLOCK # else # define LIBXSMM_REGLOCK LIBXSMM_LOCK_DEFAULT # endif # endif LIBXSMM_APIVAR_DEFINE(LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK)* internal_reglock_ptr); # endif #elif !defined(LIBXSMM_CACHE_MAXSIZE) # define LIBXSMM_CACHE_MAXSIZE LIBXSMM_CAPACITY_CACHE #endif #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */ # define LIBXSMM_CACHE_STRIDE LIBXSMM_MAX(sizeof(libxsmm_descriptor), LIBXSMM_DESCRIPTOR_MAXSIZE) #else # define LIBXSMM_CACHE_STRIDE 
LIBXSMM_DESCRIPTOR_MAXSIZE #endif #if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) # define INTERNAL_FIND_CODE_CACHE_GROW(RESULT_INDEX, CACHE_SIZE) \ RESULT_INDEX = CACHE_SIZE; CACHE_SIZE = (unsigned char)(0 != (CACHE_SIZE) ? ((CACHE_SIZE) << 1) : 1) # define INTERNAL_FIND_CODE_CACHE_EVICT(RESULT_INDEX, CACHE_SIZE, CACHE_HIT) \ RESULT_INDEX = (unsigned char)LIBXSMM_MOD2((CACHE_HIT) + ((CACHE_SIZE) - 1), CACHE_SIZE) #endif #if (0 == LIBXSMM_SYNC) # define INTERNAL_FIND_CODE_LOCK(LOCKINDEX, INDEX, DIFF, CODE) { # define INTERNAL_FIND_CODE_UNLOCK(LOCKINDEX) } #else # if defined(LIBXSMM_REGLOCK_TRY) # define INTERNAL_REGLOCK_TRY(DIFF, CODE) \ if (1 != internal_reglock_count) { /* (re-)try and get (meanwhile) generated code */ \ LIBXSMM_ASSERT(NULL != internal_registry); /* engine is not shut down */ \ continue; \ } \ else { /* exit dispatch and let client fall back */ \ DIFF = 0; CODE = 0; break; \ } # else # define INTERNAL_REGLOCK_TRY(DIFF, CODE) \ LIBXSMM_ASSERT(NULL != internal_registry); /* engine is not shut down */ \ continue # endif # if (1 < INTERNAL_REGLOCK_MAXN) # define INTERNAL_FIND_CODE_LOCK(LOCKINDEX, INDEX, DIFF, CODE) { \ const unsigned int LOCKINDEX = (0 != internal_reglock_count ? 
LIBXSMM_MOD2(INDEX, internal_reglock_count) : 0); \ if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_REGLOCK) != LIBXSMM_LOCK_TRYLOCK(LIBXSMM_REGLOCK, &internal_reglock[LOCKINDEX].state)) { \ INTERNAL_REGLOCK_TRY(DIFF, CODE); \ } # define INTERNAL_FIND_CODE_UNLOCK(LOCKINDEX) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, &internal_reglock[LOCKINDEX].state); } # else /* RW-lock */ # define INTERNAL_FIND_CODE_LOCK(LOCKINDEX, INDEX, DIFF, CODE) { \ if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_REGLOCK) != LIBXSMM_LOCK_TRYLOCK(LIBXSMM_REGLOCK, internal_reglock_ptr)) { \ INTERNAL_REGLOCK_TRY(DIFF, CODE); \ } # define INTERNAL_FIND_CODE_UNLOCK(LOCKINDEX) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, internal_reglock_ptr); } # endif #endif LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_statistic_type { unsigned int ntry, ncol, njit, nsta; } internal_statistic_type; #if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_cache_entry_type { libxsmm_descriptor keys[LIBXSMM_CACHE_MAXSIZE]; libxsmm_code_pointer code[LIBXSMM_CACHE_MAXSIZE]; unsigned int id; /* to invalidate */ unsigned char size, hit; } internal_cache_entry_type; LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_cache_type { # if defined(LIBXSMM_CACHE_PAD) char pad[LIBXSMM_UP2(sizeof(internal_cache_entry_type),LIBXSMM_CACHELINE)]; # endif internal_cache_entry_type entry; } internal_cache_type; # if defined(LIBXSMM_NTHREADS_USE) LIBXSMM_APIVAR_DEFINE(internal_cache_type* internal_cache_buffer); # endif LIBXSMM_APIVAR_DEFINE(int internal_cache_size); #endif /*defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))*/ /** Determines the try-lock property (1m && 1 < desc->n) { /* only record matrix-matrix multiplication */ const unsigned long long kernel_size = LIBXSMM_MNK_SIZE(desc->m, desc->n, desc->k); const int idx = (LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_OUT(desc->datatype) ? 
0 : 1); int bucket; if (LIBXSMM_MNK_SIZE(internal_statistic_sml, internal_statistic_sml, internal_statistic_sml) >= kernel_size) { bucket = 0; } else if (LIBXSMM_MNK_SIZE(internal_statistic_med, internal_statistic_med, internal_statistic_med) >= kernel_size) { bucket = 1; } else if (LIBXSMM_MNK_SIZE(internal_statistic_mnk, internal_statistic_mnk, internal_statistic_mnk) >= kernel_size) { bucket = 2; } else { /*huge*/ bucket = 3; } if (0 != ncol) ncol/*dummy assignment*/ = LIBXSMM_ATOMIC_ADD_FETCH(&internal_statistic[idx][bucket].ncol, ncol, LIBXSMM_ATOMIC_RELAXED); if (0 != ntry) ntry/*dummy assignment*/ = LIBXSMM_ATOMIC_ADD_FETCH(&internal_statistic[idx][bucket].ntry, ntry, LIBXSMM_ATOMIC_RELAXED); /* the following counters are not manipulated concurrently (no need for atomic increment) */ if (0 != njit) internal_statistic[idx][bucket].njit += njit; if (0 != nsta) internal_statistic[idx][bucket].nsta += nsta; } } LIBXSMM_API_INLINE unsigned int internal_print_number(unsigned int n, char default_unit, char* unit) { unsigned int number = n; LIBXSMM_ASSERT(NULL != unit); *unit = default_unit; if ((1000000) <= n) { number = (n + 500000) / 1000000; *unit = 'm'; } else if (9999 < n) { number = (n + 500) / 1000; *unit = 'k'; } return number; } LIBXSMM_API_INLINE unsigned int internal_print_statistic(FILE* ostream, const char* target_arch, int precision, unsigned int linebreaks, unsigned int indent) { const internal_statistic_type statistic_sml = internal_statistic[precision][0/*SML*/]; const internal_statistic_type statistic_med = internal_statistic[precision][1/*MED*/]; const internal_statistic_type statistic_big = internal_statistic[precision][2/*BIG*/]; const internal_statistic_type statistic_xxx = internal_statistic[precision][3/*XXX*/]; int printed = 0; LIBXSMM_ASSERT(NULL != ostream && (0 <= precision && precision < 2)); if (/* omit to print anything if it is superfluous */ 0 != statistic_sml.ntry || 0 != statistic_sml.njit || 0 != statistic_sml.nsta || 0 != 
statistic_sml.ncol || 0 != statistic_med.ntry || 0 != statistic_med.njit || 0 != statistic_med.nsta || 0 != statistic_med.ncol || 0 != statistic_big.ntry || 0 != statistic_big.njit || 0 != statistic_big.nsta || 0 != statistic_big.ncol || 0 != statistic_xxx.ntry || 0 != statistic_xxx.njit || 0 != statistic_xxx.nsta || 0 != statistic_xxx.ncol) { char title[256], range[256], unit[4]; unsigned int counter[4]; { unsigned int n; if (NULL != target_arch && 0 != *target_arch) { assert(strlen(target_arch) < sizeof(title)); /* !LIBXSMM_ASSERT */ for (n = 0; 0 != target_arch[n] /*avoid code-gen. issue with some clang versions: && n < sizeof(title)*/; ++n) { const char c = target_arch[n]; title[n] = (char)(('a' <= c && c <= 'z') ? (c - 32) : c); /* toupper */ } LIBXSMM_SNPRINTF(title + n, sizeof(title) - n, "/%s", 0 == precision ? "DP" : "SP"); } else { LIBXSMM_SNPRINTF(title, sizeof(title), "%s", 0 == precision ? "DP" : "SP"); } for (n = 0; n < linebreaks; ++n) fprintf(ostream, "\n"); } fprintf(ostream, "%*s%-8s %6s %6s %6s %6s\n", (int)indent, "", title, "TRY", "JIT", "STA", "COL"); LIBXSMM_SNPRINTF(range, sizeof(range), "%u..%u", 0u, internal_statistic_sml); counter[0] = internal_print_number(statistic_sml.ntry, ' ', unit + 0); counter[1] = internal_print_number(statistic_sml.njit, ' ', unit + 1); counter[2] = internal_print_number(statistic_sml.nsta, ' ', unit + 2); counter[3] = internal_print_number(statistic_sml.ncol, ' ', unit + 3); fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range, counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]); LIBXSMM_SNPRINTF(range, sizeof(range), "%u..%u", internal_statistic_sml + 1u, internal_statistic_med); counter[0] = internal_print_number(statistic_med.ntry, ' ', unit + 0); counter[1] = internal_print_number(statistic_med.njit, ' ', unit + 1); counter[2] = internal_print_number(statistic_med.nsta, ' ', unit + 2); counter[3] = internal_print_number(statistic_med.ncol, ' ', unit + 
3); fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range, counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]); LIBXSMM_SNPRINTF(range, sizeof(range), "%u..%u", internal_statistic_med + 1u, internal_statistic_mnk); counter[0] = internal_print_number(statistic_big.ntry, ' ', unit + 0); counter[1] = internal_print_number(statistic_big.njit, ' ', unit + 1); counter[2] = internal_print_number(statistic_big.nsta, ' ', unit + 2); counter[3] = internal_print_number(statistic_big.ncol, ' ', unit + 3); fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range, counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]); if (0 != statistic_xxx.ntry || 0 != statistic_xxx.njit || 0 != statistic_xxx.nsta || 0 != statistic_xxx.ncol) { LIBXSMM_SNPRINTF(range, sizeof(range), "> %u", internal_statistic_mnk); counter[0] = internal_print_number(statistic_xxx.ntry, ' ', unit + 0); counter[1] = internal_print_number(statistic_xxx.njit, ' ', unit + 1); counter[2] = internal_print_number(statistic_xxx.nsta, ' ', unit + 2); counter[3] = internal_print_number(statistic_xxx.ncol, ' ', unit + 3); fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range, counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]); } printed = 1; } return printed; } #if !(defined(_WIN32) || defined(__CYGWIN__)) LIBXSMM_API_INLINE unsigned int internal_statistic_ntry(int precision) { return internal_statistic[precision][0/*SML*/].ntry + internal_statistic[precision][1/*MED*/].ntry + internal_statistic[precision][2/*BIG*/].ntry + internal_statistic[precision][3/*XXX*/].ntry; } #endif #if !defined(_WIN32) LIBXSMM_API_INLINE void internal_register_static_code( libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_xmmfunction xgemm, libxsmm_code_pointer* registry) { const libxsmm_blasint lda = m, ldb = k, ldc = m; /*const*/ int 
precondition = LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k) && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc); if (precondition) { const size_t size = (LIBXSMM_HASH_SIZE) - sizeof(libxsmm_descriptor_kind); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_dinit(&blob, precision, m, n, k, lda, ldb, ldc, LIBXSMM_ALPHA, LIBXSMM_BETA, LIBXSMM_FLAGS, INTERNAL_PREFETCH); unsigned int i = LIBXSMM_MOD2( libxsmm_crc32(LIBXSMM_HASH_SEED, desc, LIBXSMM_MIN(sizeof(libxsmm_gemm_descriptor), size)), LIBXSMM_CAPACITY_REGISTRY); libxsmm_code_pointer* dst_entry = registry + i; #if !defined(NDEBUG) libxsmm_code_pointer code; code.xgemm = xgemm; LIBXSMM_ASSERT(NULL != code.ptr_const && NULL != registry); LIBXSMM_ASSERT(0 == (LIBXSMM_CODE_STATIC & code.uval)); #endif if (NULL != dst_entry->ptr_const) { /* collision */ const unsigned int i0 = i; do { /* continue to linearly search for an available slot */ i = LIBXSMM_MOD2(i + 1, LIBXSMM_CAPACITY_REGISTRY); if (NULL == registry[i].ptr_const) break; } while (i != i0); #if defined(LIBXSMM_HASH_COLLISION) /* mark entry as a collision */ dst_entry->uval |= LIBXSMM_HASH_COLLISION; #endif dst_entry = registry + i; /* update destination */ internal_update_mmstatistic(desc, 0, 1/*collision*/, 0, 0); /* out of capacity (no registry slot available) */ LIBXSMM_ASSERT(NULL == dst_entry->ptr_const || i == i0); } if (NULL == dst_entry->ptr_const) { /* registry not exhausted */ internal_registry_keys[i].kind = LIBXSMM_KERNEL_KIND_MATMUL; LIBXSMM_ASSIGN127(&internal_registry_keys[i].gemm.desc, desc); dst_entry->xgemm = xgemm; /* mark current entry as static code (non-JIT) */ dst_entry->uval |= LIBXSMM_CODE_STATIC; } internal_update_mmstatistic(desc, 1/*try*/, 0, 0, 0); } } #endif LIBXSMM_API_INTERN void internal_release_scratch(void); LIBXSMM_API_INTERN void internal_release_scratch(void) { libxsmm_xrelease_scratch(NULL/*lock*/); /* release global services */ libxsmm_memory_finalize(); libxsmm_hash_finalize(); 
libxsmm_malloc_finalize(); } /* Caution: cannot be used multiple time in a single expression! */ LIBXSMM_API_INTERN size_t libxsmm_format_size(char buffer[32], int buffer_size, size_t nbytes, const char scale[], const char* unit, int base) { const int len = (NULL != scale ? ((int)strlen(scale)) : 0); const int m = LIBXSMM_INTRINSICS_BITSCANBWD64(nbytes) / base, n = LIBXSMM_MIN(m, len); int i; buffer[0] = 0; /* clear */ LIBXSMM_ASSERT(NULL != unit && 0 <= base); for (i = 0; i < n; ++i) nbytes >>= base; LIBXSMM_SNPRINTF(buffer, buffer_size, "%i %c%s", (int)nbytes, 0 < n ? scale[n-1] : *unit, 0 < n ? unit : ""); return nbytes; } LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_NO_TRACE void internal_dump(FILE* ostream, int urgent); LIBXSMM_API_INTERN void internal_dump(FILE* ostream, int urgent) { char *const env_dump_build = getenv("LIBXSMM_DUMP_BUILD"); char *const env_dump_files = (NULL != getenv("LIBXSMM_DUMP_FILES") ? getenv("LIBXSMM_DUMP_FILES") : getenv("LIBXSMM_DUMP_FILE")); LIBXSMM_ASSERT_MSG(INTERNAL_SINGLETON(internal_singleton_handle), "Invalid handle"); /* determine whether this instance is unique or not */ if (NULL != env_dump_files && 0 != *env_dump_files && 0 == urgent) { /* dump per-node info */ const char* filename = strtok(env_dump_files, INTERNAL_DELIMS); for (; NULL != filename; filename = strtok(NULL, INTERNAL_DELIMS)) { FILE* const file = fopen(filename, "r"); if (NULL != file) { int c = fgetc(file); fprintf(ostream, "\n\nLIBXSMM_DUMP_FILE: %s\n", filename); /* coverity[tainted_data] */ while (EOF != c) { fputc(c, stdout); c = fgetc(file); } fputc('\n', stdout); fclose(file); } } } if (NULL != internal_build_state /* dump build state */ && NULL != env_dump_build && 0 != *env_dump_build) { const int dump_build = atoi(env_dump_build); if (0 == urgent ? 
(0 < dump_build) : (0 > dump_build)) { fprintf(ostream, "\n\nBUILD_DATE=%i\n", LIBXSMM_CONFIG_BUILD_DATE); fprintf(ostream, "%s\n", internal_build_state); } } } LIBXSMM_API_INTERN void internal_finalize(void); LIBXSMM_API_INTERN void internal_finalize(void) { libxsmm_finalize(); LIBXSMM_STDIO_ACQUIRE(); /* synchronize I/O */ if (0 != libxsmm_verbosity) { /* print statistic on termination */ const char *const env_target_hidden = getenv("LIBXSMM_TARGET_HIDDEN"); const char *const target_arch = (NULL == env_target_hidden || 0 == atoi(env_target_hidden)) ? libxsmm_cpuid_name(libxsmm_target_archid) : NULL/*hidden*/; fprintf(stderr, "\nLIBXSMM_VERSION: %s%s%s (%i)", LIBXSMM_BRANCH, 0 != *(LIBXSMM_BRANCH) ? "-" : "", 0 != *(LIBXSMM_VERSION) ? (LIBXSMM_VERSION) : "unconfigured", LIBXSMM_VERSION4(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE, LIBXSMM_VERSION_PATCH)); if (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) { unsigned int linebreak = (0 == internal_print_statistic(stderr, target_arch, 1/*SP*/, 1, 0)) ? 
1 : 0; const int high_verbosity = (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity); size_t size_scratch = 0, size_private = 0; libxsmm_scratch_info scratch_info; libxsmm_cpuid_x86_info info; libxsmm_cpuid_x86(&info); if ((LIBXSMM_VERBOSITY_HIGH < libxsmm_verbosity || 0 > libxsmm_verbosity) && 0 == internal_cpuid_info.has_context && 0 != info.has_context) { fprintf(stderr, "\nLIBXSMM: CPU features have been promoted."); } if (0 == internal_print_statistic(stderr, target_arch, 0/*DP*/, linebreak, 0) && 0 != linebreak && NULL != target_arch) { fprintf(stderr, "\nLIBXSMM_TARGET: %s\n", target_arch); } if (EXIT_SUCCESS == libxsmm_get_scratch_info(&scratch_info)) { size_private = scratch_info.internal; size_scratch = scratch_info.size; } if (0 != size_private) { /* should be always true */ char size_private_buffer[32], size_code_buffer[32]; /* coverity[check_return] */ libxsmm_format_size(size_private_buffer, sizeof(size_private_buffer), size_private, "KM", "B", 10); fprintf(stderr, "Registry and code: %s", size_private_buffer); if (0 != libxsmm_format_size(size_code_buffer, sizeof(size_code_buffer), internal_registry_nbytes, "KM", "B", 10)) { fprintf(stderr, " + %s", size_code_buffer); } } if (0 != high_verbosity) { unsigned int ngemms = 0; int i; for (i = 0; i < 4; ++i) { ngemms += internal_statistic[0/*DP*/][i].nsta + internal_statistic[1/*SP*/][i].nsta; ngemms += internal_statistic[0/*DP*/][i].njit + internal_statistic[1/*SP*/][i].njit; } if (0 != ngemms || 0 != internal_statistic_num_gemv || 0 != internal_statistic_num_mcopy || 0 != internal_statistic_num_tcopy || 0 != libxsmm_statistic_num_spmdm || 0 != internal_statistic_num_user || 0 != internal_registry_nleaks) { const char sep[] = " ", *s = ""; fprintf(stderr, " ("); if (0 != ngemms) { fprintf(stderr, "gemm=%u", ngemms); s = sep; } if (0 != internal_statistic_num_gemv) { fprintf(stderr, "%sgemv=%u", s, internal_statistic_num_gemv); s = sep; } if (0 != internal_statistic_num_mcopy) { 
fprintf(stderr, "%smcopy=%u", s, internal_statistic_num_mcopy); s = sep; } if (0 != internal_statistic_num_meltw) { fprintf(stderr, "%smeltw=%u", s, internal_statistic_num_meltw); s = sep; } if (0 != internal_statistic_num_tcopy) { fprintf(stderr, "%stcopy=%u", s, internal_statistic_num_tcopy); s = sep; } if (0 != libxsmm_statistic_num_spmdm) { fprintf(stderr, "%sspmdm=%u", s, libxsmm_statistic_num_spmdm); s = sep; } if (0 != internal_statistic_num_user) { fprintf(stderr, "%suser=%u", s, internal_statistic_num_user); s = sep; } if (0 != internal_registry_nleaks) { fprintf(stderr, "%snleaks=%u", s, internal_registry_nleaks); s = sep; } fprintf(stderr, ")"); } } fprintf(stderr, "\n"); if (0 != size_scratch) { char size_scratch_buffer[32]; /* coverity[check_return] */ libxsmm_format_size(size_scratch_buffer, sizeof(size_scratch_buffer), size_scratch, "KM", "B", 10); fprintf(stderr, "Scratch: %s", size_scratch_buffer); if (0 != high_verbosity) { fprintf(stderr, " (mallocs=%lu, pools=%u)\n", (unsigned long int)scratch_info.nmallocs, scratch_info.npools); } else { fprintf(stderr, "\n"); } } if (LIBXSMM_VERBOSITY_HIGH < libxsmm_verbosity || 0 > libxsmm_verbosity) { fprintf(stderr, "Uptime: %f s", libxsmm_timer_duration(internal_timer_start, libxsmm_timer_tick())); if (1 < libxsmm_thread_count && INT_MAX == libxsmm_verbosity) { fprintf(stderr, " (nthreads=%u)", libxsmm_thread_count); } fprintf(stderr, "\n"); } } else { fprintf(stderr, "\nLIBXSMM_TARGET: %s\n", target_arch); } } /* release scratch memory pool */ if (EXIT_SUCCESS != atexit(internal_release_scratch) && 0 != libxsmm_verbosity) { fprintf(stderr, "LIBXSMM ERROR: failed to perform final cleanup!\n"); } /* determine whether this instance is unique or not */ if (INTERNAL_SINGLETON(internal_singleton_handle)) { internal_dump(stdout, 0/*urgent*/); /* cleanup singleton */ #if defined(_WIN32) ReleaseMutex(internal_singleton_handle); CloseHandle(internal_singleton_handle); #else unlink(internal_singleton_fname); 
close(internal_singleton_handle); #endif } LIBXSMM_STDIO_RELEASE(); /* synchronize I/O */ #if (0 != LIBXSMM_SYNC) { /* release locks */ # if (1 < INTERNAL_REGLOCK_MAXN) int i; for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_DESTROY(LIBXSMM_REGLOCK, &internal_reglock[i].state); # elif !defined(LIBXSMM_UNIFY_LOCKS) LIBXSMM_LOCK_DESTROY(LIBXSMM_REGLOCK, internal_reglock_ptr); # endif LIBXSMM_LOCK_DESTROY(LIBXSMM_LOCK, &libxsmm_lock_global); } #endif } #if defined(LIBXSMM_INTERCEPT_DYNAMIC) LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void _gfortran_stop_string(const char* /*message*/, int /*len*/, int /*quiet*/); LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void _gfortran_stop_string(const char* message, int len, int quiet) { /* STOP termination handler for GNU Fortran runtime */ static int once = 0; if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&once, 1, LIBXSMM_ATOMIC_RELAXED)) { union { const void* dlsym; void (*ptr)(const char*, int, int); } stop; dlerror(); /* clear an eventual error status */ stop.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "_gfortran_stop_string"); if (NULL != stop.dlsym) { stop.ptr(message, len, quiet); } else exit(EXIT_SUCCESS); /* statically linked runtime */ } } LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core(const char* /*message*/, int /*len*/); LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core(const char* message, int len) { /* STOP termination handler for Intel Fortran runtime */ static int once = 0; if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&once, 1, LIBXSMM_ATOMIC_RELAXED)) { union { const void* dlsym; void (*ptr)(const char*, int); } stop; dlerror(); /* clear an eventual error status */ stop.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "for_stop_core"); if (NULL != stop.dlsym) { stop.ptr(message, len); } else exit(EXIT_SUCCESS); /* statically linked runtime */ } } LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core_quiet(void); LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core_quiet(void) { /* STOP termination handler for Intel Fortran runtime */ static int once = 0; if 
(1 == LIBXSMM_ATOMIC_ADD_FETCH(&once, 1, LIBXSMM_ATOMIC_RELAXED)) { union { const void* dlsym; void (*ptr)(void); } stop; dlerror(); /* clear an eventual error status */ stop.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "for_stop_core_quiet"); if (NULL != stop.dlsym) { stop.ptr(); } else exit(EXIT_SUCCESS); /* statically linked runtime */ } } #endif LIBXSMM_API_INTERN size_t internal_strlen(const char* /*cstr*/, size_t /*maxlen*/); LIBXSMM_API_INTERN size_t internal_strlen(const char* cstr, size_t maxlen) { size_t result = 0; if (NULL != cstr) { while (0 != cstr[result] && result < maxlen) ++result; } return result; } LIBXSMM_API_INTERN size_t internal_parse_nbytes(const char* /*nbytes*/, size_t /*ndefault*/); LIBXSMM_API_INTERN size_t internal_parse_nbytes(const char* nbytes, size_t ndefault) { size_t result = ndefault; if (NULL != nbytes && 0 != *nbytes) { size_t u = internal_strlen(nbytes, 32) - 1; const char unit[] = "kmgKMG", *const hit = strchr(unit, nbytes[u]); const long long int ibytes = atol(nbytes); /* take with increased type-width */ result = (size_t)ibytes; if ((size_t)LIBXSMM_UNLIMITED != result) { u = (0 != hit ? 
/* Tail of the preceding byte-size parsing helper (its definition begins before
 * this chunk); kept verbatim: shifts the parsed value by 10 bits per K/M/G unit. */
((hit - unit) % 3) : 3); if (u < 3) { result <<= (u + 1) * 10; } } } return result; }

/* One-time initialization of LIBXSMM's internal state: acquires the global and
 * registry locks, reads configuration from environment variables, allocates the
 * code registry, the descriptor key storage and (optionally) the per-thread
 * kernel cache, and finally publishes the registry pointer atomically.
 * Safe to call repeatedly: the work is guarded by a NULL-check of
 * internal_registry performed after the locks are taken (double-checked). */
LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_NO_TRACE void internal_init(void);
LIBXSMM_API_INTERN void internal_init(void)
{
  int i;
#if (0 != LIBXSMM_SYNC) /* setup the locks in a thread-safe fashion */
  LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, &libxsmm_lock_global);
# if (1 < INTERNAL_REGLOCK_MAXN)
  for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_ACQUIRE(LIBXSMM_REGLOCK, &internal_reglock[i].state);
# elif !defined(LIBXSMM_UNIFY_LOCKS)
  LIBXSMM_LOCK_ACQUIRE(LIBXSMM_REGLOCK, internal_reglock_ptr);
# endif
#endif
  if (NULL == internal_registry) { /* double-check after acquiring the lock(s) */
#if defined(LIBXSMM_INTERCEPT_DYNAMIC) && defined(LIBXSMM_AUTOPIN)
    /* clear error status (dummy condition: it does not matter if MPI_Init or MPI_Abort) */
    const char* const dlsymname = (NULL == dlerror() ? "MPI_Init" : "MPI_Abort");
    /* dlmpi is non-NULL iff an MPI symbol is resolvable, i.e., running under MPI */
    const void* const dlsymbol = dlsym(LIBXSMM_RTLD_NEXT, dlsymname);
    const void* const dlmpi = (NULL == dlerror() ? dlsymbol : NULL);
#endif
    const char* const env_verbose = getenv("LIBXSMM_VERBOSE");
    void* new_registry = NULL, * new_keys = NULL;
#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
# if defined(LIBXSMM_NTHREADS_USE)
    void* new_cache = NULL;
# endif
    /* LIBXSMM_CACHE selects the per-thread cache size (rounded up to a power of two,
     * clamped to LIBXSMM_CACHE_MAXSIZE); absent/empty uses the compile-time maximum */
    const char* const env_cache = getenv("LIBXSMM_CACHE");
    if (NULL != env_cache && 0 != *env_cache) {
      const int cache_size = atoi(env_cache), cache_size2 = LIBXSMM_UP2POT(cache_size);
      internal_cache_size = LIBXSMM_MIN(cache_size2, LIBXSMM_CACHE_MAXSIZE);
    }
    else {
      internal_cache_size = LIBXSMM_CACHE_MAXSIZE;
    }
#endif
    /* setup verbosity as early as possible since below code may rely on verbose output */
    if (NULL != env_verbose && 0 != *env_verbose) {
      libxsmm_verbosity = atoi(env_verbose);
    }
#if !defined(NDEBUG)
    else {
      libxsmm_verbosity = INT_MAX; /* quiet -> verbose */
    }
#endif
#if (0 == LIBXSMM_JIT)
    if (2 > libxsmm_ninit && (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity)) {
      fprintf(stderr, "LIBXSMM: JIT-code generation was disabled at compile-time.\n");
    }
#endif
#if defined(LIBXSMM_AUTOPIN)
# if defined(LIBXSMM_INTERCEPT_DYNAMIC)
    /* MPI: unwanted affinity can slow-down unrelated jobs (over-subscription), e.g., CP2K regtests */
    if (NULL == dlmpi)
# endif
    { /* setup some viable affinity if nothing else is present */
      const char *const gomp_cpu_affinity = getenv("GOMP_CPU_AFFINITY");
      const char *const kmp_affinity = getenv("KMP_AFFINITY");
      const char *const omp_proc_bind = getenv("OMP_PROC_BIND");
      if ((NULL == gomp_cpu_affinity || 0 == *gomp_cpu_affinity)
        && (NULL == kmp_affinity || 0 == *kmp_affinity)
        && (NULL == omp_proc_bind || 0 == *omp_proc_bind))
      {
        /* static storage: putenv keeps a pointer to the string */
        static char affinity[] = "OMP_PROC_BIND=TRUE";
        LIBXSMM_EXPECT(EXIT_SUCCESS, LIBXSMM_PUTENV(affinity));
        if (LIBXSMM_VERBOSITY_HIGH < libxsmm_verbosity || 0 > libxsmm_verbosity) { /* library code is expected to be mute */
          fprintf(stderr, "LIBXSMM: prepared to pin threads.\n");
        }
      }
    }
# if defined(LIBXSMM_INTERCEPT_DYNAMIC) && defined(LIBXSMM_MALLOC)
    else if (NULL == getenv("I_MPI_SHM_HEAP")) { /* running under MPI: enable Intel MPI shared-memory heap */
      static char shmheap[] = "I_MPI_SHM_HEAP=1";
      LIBXSMM_EXPECT(EXIT_SUCCESS, LIBXSMM_PUTENV(shmheap));
    }
# endif
#endif
#if !defined(_WIN32) && 0 /* disabled block (compile-time off) */
    umask(S_IRUSR | S_IWUSR); /* setup default/secure file mask */
#endif
#if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))
    { /* number of scratch memory pools (LIBXSMM_SCRATCH_POOLS), clamped to the compile-time maximum */
      const char *const env = getenv("LIBXSMM_SCRATCH_POOLS");
      if (NULL == env || 0 == *env) {
        libxsmm_scratch_pools = LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS;
      }
      else {
        libxsmm_scratch_pools = LIBXSMM_CLMP(atoi(env), 0, LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS);
        /*libxsmm_scratch_pools_locked = 1;*/
      }
      LIBXSMM_ASSERT(libxsmm_scratch_pools <= LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS);
    }
    { /* scratch over-allocation factor (LIBXSMM_SCRATCH_SCALE), clamped to [1.0, 10.0] */
      const char *const env = getenv("LIBXSMM_SCRATCH_SCALE");
      if (NULL == env || 0 == *env) {
        libxsmm_scratch_scale = LIBXSMM_MALLOC_SCRATCH_SCALE;
      }
      else {
        libxsmm_scratch_scale = LIBXSMM_CLMP(atof(env), 1.0, 10.0);
        /*libxsmm_scratch_scale_locked = 1;*/
      }
      assert(1 <= libxsmm_scratch_scale); /* !LIBXSMM_ASSERT */
    }
    libxsmm_set_scratch_limit(internal_parse_nbytes(getenv("LIBXSMM_SCRATCH_LIMIT"), LIBXSMM_SCRATCH_DEFAULT));
#endif /*defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))*/
    { /* setup malloc-interception after internal allocations */
      const libxsmm_malloc_function null_malloc_fn = { 0 };
      const libxsmm_free_function null_free_fn = { 0 };
      const char *const env_k = getenv("LIBXSMM_MALLOC");
      char *const env_t = getenv("LIBXSMM_MALLOC_LIMIT");
      /* LIBXSMM_MALLOC_LIMIT may carry one or two sizes (lower[, upper]) separated by INTERNAL_DELIMS;
       * NOTE(review): strtok mutates the string returned by getenv - presumably acceptable here, verify */
      const char* env_i = (NULL != env_t ? strtok(env_t, INTERNAL_DELIMS) : NULL);
      const size_t malloc_lo = internal_parse_nbytes(env_i, LIBXSMM_MALLOC_LIMIT);
      const size_t malloc_hi = (NULL != env_i ? internal_parse_nbytes(
        strtok(NULL, INTERNAL_DELIMS), LIBXSMM_SCRATCH_UNLIMITED) : LIBXSMM_SCRATCH_UNLIMITED);
      const int malloc_kind = ((NULL == env_k || 0 == *env_k) ? 0/*disabled*/ : atoi(env_k));
      libxsmm_xset_default_allocator(NULL/*lock*/, NULL/*context*/, null_malloc_fn, null_free_fn);
      libxsmm_xset_scratch_allocator(NULL/*lock*/, NULL/*context*/, null_malloc_fn, null_free_fn);
      libxsmm_set_malloc(malloc_kind, &malloc_lo, &malloc_hi); /* implies libxsmm_malloc_init */
    }
#if defined(LIBXSMM_MAXTARGET)
    libxsmm_set_target_arch(LIBXSMM_STRINGIFY(LIBXSMM_MAXTARGET));
#else /* attempt to set libxsmm_target_archid per environment variable */
    libxsmm_set_target_arch(getenv("LIBXSMM_TARGET"));
#endif
    { /* LIBXSMM_SYNC: non-zero disables internal synchronization */
      const char *const env = getenv("LIBXSMM_SYNC");
      libxsmm_nosync = (NULL == env || 0 == *env) ? 0/*default*/ : atoi(env);
    }
    /* clear internal counters/statistic */
    for (i = 0; i < 4/*sml/med/big/xxx*/; ++i) {
      LIBXSMM_MEMZERO127(&internal_statistic[0/*DP*/][i]);
      LIBXSMM_MEMZERO127(&internal_statistic[1/*SP*/][i]);
    }
    internal_statistic_mnk = LIBXSMM_MAX_DIM;
    internal_statistic_sml = 13;
    internal_statistic_med = 23;
    LIBXSMM_ASSERT(LIBXSMM_CAPACITY_REGISTRY == LIBXSMM_UP2POT(LIBXSMM_CAPACITY_REGISTRY));
    libxsmm_hash_init(libxsmm_target_archid); /* used by debug memory allocation (checksum) */
    libxsmm_memory_init(libxsmm_target_archid);
    /* allocate cache (optional), key storage, and registry; all three must succeed */
    if (
#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
      (EXIT_SUCCESS == libxsmm_xmalloc(&new_cache,
        /* if internal_cache_size is zero, allocation must still happen (later control-flow too expensive) */
        sizeof(internal_cache_type) * (LIBXSMM_NTHREADS_MAX), LIBXSMM_CACHELINE/*alignment*/,
        LIBXSMM_MALLOC_FLAG_PRIVATE, NULL/*extra*/, 0/*extra-size*/) && NULL != new_cache) &&
#endif
      (EXIT_SUCCESS == libxsmm_xmalloc(&new_keys, (LIBXSMM_CAPACITY_REGISTRY) * sizeof(libxsmm_descriptor),
        0/*auto-align*/, LIBXSMM_MALLOC_FLAG_PRIVATE, NULL/*extra*/, 0/*extra-size*/) && NULL != new_keys) &&
      (EXIT_SUCCESS == libxsmm_xmalloc(&new_registry, (LIBXSMM_CAPACITY_REGISTRY) * sizeof(libxsmm_code_pointer),
        0/*auto-align*/, LIBXSMM_MALLOC_FLAG_PRIVATE, NULL/*extra*/, 0/*extra-size*/) && NULL != new_registry))
    {
#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
      LIBXSMM_ASSERT(NULL != new_cache); /* SA: suppress false positive */
      memset(new_cache, 0, (LIBXSMM_NTHREADS_MAX) * sizeof(internal_cache_type));
#endif
      libxsmm_xcopy_init(libxsmm_target_archid);
      libxsmm_dnn_init(libxsmm_target_archid);
#if defined(LIBXSMM_PERF)
      libxsmm_perf_init();
#endif
      { /* determine default GEMM prefetch strategy (LIBXSMM_GEMM_PREFETCH overrides) */
        const char *const env = getenv("LIBXSMM_GEMM_PREFETCH");
#if defined(_WIN32) || defined(__CYGWIN__)
        libxsmm_gemm_auto_prefetch_default = INTERNAL_PREFETCH;
#else
        libxsmm_gemm_auto_prefetch_default = (0 == internal_statistic_ntry(0/*DP*/) && 0 == internal_statistic_ntry(1/*SP*/))
          /* avoid special prefetch if static code is present, since such code uses INTERNAL_PREFETCH */
          ? (((LIBXSMM_X86_AVX512 >= libxsmm_target_archid || LIBXSMM_X86_AVX512_CORE <= libxsmm_target_archid))
            ? LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C : LIBXSMM_GEMM_PREFETCH_BL2_VIA_C)
          : INTERNAL_PREFETCH;
#endif
        libxsmm_gemm_auto_prefetch = INTERNAL_PREFETCH;
        if (NULL != env && 0 != *env) { /* user input beyond auto-prefetch is always considered */
          const int uid = atoi(env);
          if (0 <= uid) {
            libxsmm_gemm_auto_prefetch_default = libxsmm_gemm_uid2prefetch(uid);
            libxsmm_gemm_auto_prefetch = libxsmm_gemm_auto_prefetch_default;
            internal_gemm_auto_prefetch_locked = 1;
          }
        }
      }
      for (i = 0; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) ((libxsmm_code_pointer*)new_registry)[i].ptr = NULL;
      LIBXSMM_ASSERT(NULL == internal_registry && NULL == internal_registry_keys);
#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
      LIBXSMM_ASSERT(NULL == internal_cache_buffer);
      internal_cache_buffer = (internal_cache_type*)new_cache;
#endif
      internal_registry_keys = (libxsmm_descriptor*)new_keys; /* prior to registering static kernels */
#if defined(LIBXSMM_BUILD) && !defined(LIBXSMM_DEFAULT_CONFIG)
      /* NOTE(review): the include target appears to be missing here (likely lost in
       * text extraction; upstream uses a generated dispatch header) - verify against upstream */
# include
#endif
      libxsmm_gemm_init(libxsmm_target_archid);
#if defined(LIBXSMM_TRACE)
      { /* initialize call-tracing facility (best-effort; failure is only reported) */
        int filter_threadid = 0/*only main-thread*/, filter_mindepth = 0, filter_maxnsyms = 0;
        const int init_code = libxsmm_trace_init(filter_threadid, filter_mindepth, filter_maxnsyms);
        if (EXIT_SUCCESS != init_code && 0 != libxsmm_verbosity) { /* library code is expected to be mute */
          fprintf(stderr, "LIBXSMM ERROR: failed to initialize TRACE (error #%i)!\n", init_code);
        }
      }
#endif
      { /* commit the registry buffer and enable global visibility */
        void *const pv_registry = &internal_registry;
        LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)((void**)pv_registry, (void*)new_registry, LIBXSMM_ATOMIC_SEQ_CST);
      }
    }
    else { /* allocation failed: report and release whatever was obtained */
      if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
        fprintf(stderr, "LIBXSMM ERROR: failed to allocate internal buffers!\n");
      }
      libxsmm_xfree(new_registry, 0/*no check*/);
      libxsmm_xfree(new_keys, 0/*no check*/);
#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
      libxsmm_xfree(new_cache, 0/*no check*/);
#endif
    }
  }
#if (0 != LIBXSMM_SYNC) /* release locks */
# if (1 < INTERNAL_REGLOCK_MAXN)
  for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, &internal_reglock[i].state);
# elif !defined(LIBXSMM_UNIFY_LOCKS)
  LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, internal_reglock_ptr);
# endif
  LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, &libxsmm_lock_global);
#endif
}

/* Public entry point for library initialization; registered as constructor.
 * The first caller (tid == 1) performs the full setup; concurrent callers
 * spin until libxsmm_ninit signals completion (see continuation below). */
LIBXSMM_API LIBXSMM_ATTRIBUTE_CTOR void libxsmm_init(void)
{
  if (0 == LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_RELAXED)) {
    static unsigned int ninit = 0, gid = 0;
    const unsigned int tid = LIBXSMM_ATOMIC_ADD_FETCH(&ninit, 1, LIBXSMM_ATOMIC_SEQ_CST);
    LIBXSMM_ASSERT(0 < tid);
    /* libxsmm_ninit (1: initialization started, 2: library initialized, higher: to invalidate code-TLS) */
    if (1 == tid) {
      libxsmm_timer_tickint s0 = libxsmm_timer_tick_rtc(); /* warm-up */
      libxsmm_timer_tickint t0 = libxsmm_timer_tick_tsc(); /* warm-up */
      s0 = libxsmm_timer_tick_rtc(); t0 = libxsmm_timer_tick_tsc(); /* start timing */
      assert(0
      /* continuation of the assert begun on the previous chunk-line */
      == LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_SEQ_CST)); /* !LIBXSMM_ASSERT */
      /* coverity[check_return] */
      LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_SEQ_CST);
      gid = tid; /* protect initialization */
#if defined(NDEBUG)
      LIBXSMM_UNUSED(gid);
#endif
#if (0 != LIBXSMM_SYNC)
      /* coverity[check_return] */
      LIBXSMM_TLS_CREATE(&libxsmm_tlskey);
      { /* construct and initialize locks */
# if defined(LIBXSMM_REGLOCK_TRY)
        const char *const env_trylock = getenv("LIBXSMM_TRYLOCK");
# endif
        LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_LOCK) attr_global;
# if (1 < INTERNAL_REGLOCK_MAXN)
        int i;
        LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_REGLOCK) attr;
        LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_REGLOCK, &attr);
# elif defined(LIBXSMM_UNIFY_LOCKS)
        /* registry lock aliases the single global lock */
        internal_reglock_ptr = &libxsmm_lock_global;
# else
        static LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK) internal_reglock;
        internal_reglock_ptr = &internal_reglock;
        LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_REGLOCK) attr;
        LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_REGLOCK, &attr);
        LIBXSMM_LOCK_INIT(LIBXSMM_REGLOCK, internal_reglock_ptr, &attr);
        LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_REGLOCK, &attr);
# endif
        LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_LOCK, &attr_global);
        LIBXSMM_LOCK_INIT(LIBXSMM_LOCK, &libxsmm_lock_global, &attr_global);
        LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_LOCK, &attr_global);
        /* control number of locks needed; LIBXSMM_TRYLOCK implies only 1 lock */
# if defined(LIBXSMM_REGLOCK_TRY)
        if (NULL == env_trylock || 0 == *env_trylock)
# endif
        { /* no LIBXSMM_TRYLOCK */
# if defined(LIBXSMM_VTUNE)
          internal_reglock_count = 1; /* avoid duplicated kernels */
# elif (1 < INTERNAL_REGLOCK_MAXN)
          /* LIBXSMM_NLOCKS selects the number of registry locks (rounded down to a power of two) */
          const char *const env_nlocks = getenv("LIBXSMM_NLOCKS");
          const int reglock_count = (NULL == env_nlocks || 0 == *env_nlocks || 1 > atoi(env_nlocks)) ?
            (INTERNAL_REGLOCK_MAXN) : LIBXSMM_MIN(atoi(env_nlocks), INTERNAL_REGLOCK_MAXN);
          internal_reglock_count = LIBXSMM_LO2POT(reglock_count);
# else
          internal_reglock_count = 0;
# endif
        }
# if defined(LIBXSMM_REGLOCK_TRY)
        else { /* LIBXSMM_TRYLOCK environment variable specified */
          internal_reglock_count = (0 != atoi(env_trylock) ? 1
# if (1 < INTERNAL_REGLOCK_MAXN)
            : INTERNAL_REGLOCK_MAXN);
# else
            : 0);
# endif
        }
# endif
# if (1 < INTERNAL_REGLOCK_MAXN)
        LIBXSMM_ASSERT(1 <= internal_reglock_count);
        for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_INIT(LIBXSMM_REGLOCK, &internal_reglock[i].state, &attr);
        LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_REGLOCK, &attr);
# endif
      }
#endif
      { /* determine whether this instance is unique or not */
#if defined(_WIN32)
        internal_singleton_handle = CreateMutex(NULL, TRUE, "GlobalLIBXSMM");
#else
        /* POSIX: use an exclusive write-lock (fcntl/F_SETLK) on a per-user file in /tmp */
        const int result = LIBXSMM_SNPRINTF(internal_singleton_fname, sizeof(internal_singleton_fname), "/tmp/.libxsmm.%u",
          /*rely on user id to avoid permission issues in case of left-over files*/(unsigned int)getuid());
        struct flock singleton_flock;
        int singleton_handle;
        singleton_flock.l_start = 0;
        singleton_flock.l_len = 0; /* entire file */
        singleton_flock.l_type = F_WRLCK; /* exclusive across PIDs */
        singleton_flock.l_whence = SEEK_SET;
        singleton_handle = ((0 < result && (int)sizeof(internal_singleton_fname) > result) ? open(
          internal_singleton_fname, O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR) : -1);
        internal_singleton_handle = fcntl(singleton_handle, F_SETLK, &singleton_flock);
        /* close only on failure: a successful lock requires the descriptor to stay open */
        if (0 > internal_singleton_handle && 0 <= singleton_handle) close(singleton_handle);
#endif  /* coverity[leaked_handle] */
      }
      { /* calibrate timer: derive a TSC-to-seconds scale from two RTC/TSC sample pairs */
        int register_termination_proc;
        libxsmm_timer_tickint s1, t1;
        internal_init(); /* must be first to initialize verbosity, etc. */
        if (INTERNAL_SINGLETON(internal_singleton_handle)) { /* after internal_init */
          internal_dump(stdout, 1/*urgent*/);
        }
        s1 = libxsmm_timer_tick_rtc(); t1 = libxsmm_timer_tick_tsc(); /* mid-timing */
        libxsmm_cpuid_x86(&internal_cpuid_info);
        if (0 != internal_cpuid_info.constant_tsc && t0 < t1) {
          libxsmm_timer_scale = libxsmm_timer_duration_rtc(s0, s1) / (t1 - t0);
        }
        register_termination_proc = atexit(internal_finalize);
        s1 = libxsmm_timer_tick_rtc(); t1 = libxsmm_timer_tick_tsc(); /* final timing */
        /* set timer-scale and determine start of the "uptime" (shown at termination) */
        if (t0 < t1 && 0.0 < libxsmm_timer_scale) {
          const double scale = libxsmm_timer_duration_rtc(s0, s1) / (t1 - t0);
          const double diff = LIBXSMM_DELTA(libxsmm_timer_scale, scale) / scale;
          if (5E-5 > diff) { /* the two estimates agree: TSC is usable */
            libxsmm_timer_scale = scale;
            internal_timer_start = t0;
          }
          else { /* estimates disagree: fall back to RTC */
            libxsmm_timer_scale = 0;
            internal_timer_start = s0;
#if !defined(NDEBUG)
            libxsmm_se = 1;
#endif
          }
        }
        else {
          internal_timer_start = s0;
          libxsmm_timer_scale = 0;
        }
        if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
          if (EXIT_SUCCESS != register_termination_proc) {
            fprintf(stderr, "LIBXSMM ERROR: failed to register termination procedure!\n");
          }
          if (0 == libxsmm_timer_scale) {
            fprintf(stderr, "LIBXSMM WARNING: timer is maybe not cycle-accurate!\n");
          }
        }
      }
      assert(1 == LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_SEQ_CST)); /* !LIBXSMM_ASSERT */
      /* coverity[check_return] */
      LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_SEQ_CST);
    }
    else /*if (gid != tid)*/ { /* avoid recursion */
      LIBXSMM_ASSERT(gid != tid);
      /* wait until the initializing thread signals completion (libxsmm_ninit >= 2) */
      while (2 > LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_RELAXED)) LIBXSMM_SYNC_YIELD;
      internal_init();
    }
  }
  LIBXSMM_ASSERT(1 < libxsmm_ninit);
}

/* Library termination; registered as destructor and via atexit (internal_finalize).
 * Frees JIT-code buffers, key storage, cache, and the registry itself. */
LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE void libxsmm_finalize(void);
LIBXSMM_API LIBXSMM_ATTRIBUTE_DTOR void libxsmm_finalize(void)
{
  void *const regaddr = &internal_registry;
  uintptr_t regptr = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD,
    /* continuation of the atomic load begun on the previous chunk-line */
    LIBXSMM_BITS)((uintptr_t*)regaddr, LIBXSMM_ATOMIC_RELAXED);
  libxsmm_code_pointer* registry = (libxsmm_code_pointer*)regptr;
  if (NULL != registry) {
    int i;
#if (0 != LIBXSMM_SYNC)
    LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, &libxsmm_lock_global);
# if (1 < INTERNAL_REGLOCK_MAXN)
    { /* acquire locks and thereby shortcut lazy initialization later on */
      int ntry = 0, n;
      do {
        for (i = 0, n = 0; i < internal_reglock_count; ++i) {
          if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_REGLOCK) == LIBXSMM_LOCK_TRYLOCK(LIBXSMM_REGLOCK, &internal_reglock[i].state)) ++n;
        }
        ntry += (0 == n ? 1 : 0); /* only count rounds where no lock could be taken */
      } while (n < internal_reglock_count && ntry < LIBXSMM_CLEANUP_NTRY);
    }
# elif !defined(LIBXSMM_UNIFY_LOCKS)
    LIBXSMM_LOCK_ACQUIRE(LIBXSMM_REGLOCK, internal_reglock_ptr);
# endif
#endif
    /* re-load under the lock(s): another thread may have finalized already */
    regptr = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)((uintptr_t*)regaddr, LIBXSMM_ATOMIC_RELAXED);
    registry = (libxsmm_code_pointer*)regptr;
    if (NULL != registry) {
      libxsmm_descriptor *const registry_keys = internal_registry_keys;
#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
      internal_cache_type *const cache_buffer = internal_cache_buffer;
#endif
      unsigned int rest = 0, errors = 0;
#if defined(LIBXSMM_TRACE)
      i = libxsmm_trace_finalize();
      if (EXIT_SUCCESS != i && 0 != libxsmm_verbosity) { /* library code is expected to be mute */
        fprintf(stderr, "LIBXSMM ERROR: failed to finalize trace (error #%i)!\n", i);
      }
#endif
#if defined(LIBXSMM_PERF)
      libxsmm_perf_finalize();
#endif
      libxsmm_xcopy_finalize();
      libxsmm_gemm_finalize();
      libxsmm_dnn_finalize();
      /* coverity[check_return] */
      LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_RELAXED); /* invalidate code cache (TLS) */
#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
      internal_cache_buffer = NULL;
#endif
      internal_registry_keys = NULL; /* make registry keys unavailable */
      /* publish the empty registry before tearing down its contents */
      LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE_ZERO, LIBXSMM_BITS)((uintptr_t*)regaddr, LIBXSMM_ATOMIC_SEQ_CST);
      internal_registry_nbytes = 0; internal_registry_nleaks = 0;
      /* walk every registry slot: update statistics and free JIT-generated code */
      for (i = 0; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) {
        /*const*/ libxsmm_code_pointer code = registry[i];
        if (NULL != code.ptr_const) { /* check if the registered entity is a GEMM kernel */
          switch (registry_keys[i].kind) {
            case LIBXSMM_KERNEL_KIND_MATMUL: {
              const libxsmm_gemm_descriptor *const desc = &registry_keys[i].gemm.desc;
              if (1 < desc->m && 1 < desc->n) {
                const unsigned int njit = (0 == (LIBXSMM_CODE_STATIC & code.uval) ? 1 : 0);
                const unsigned int nsta = (0 != (LIBXSMM_CODE_STATIC & code.uval) ? 1 : 0);
                /* count whether kernel is static or JIT-code */
                internal_update_mmstatistic(desc, 0, 0, njit, nsta);
              }
              else { /* m==1 or n==1: counted as GEMV rather than GEMM */
                ++internal_statistic_num_gemv;
              }
              ++rest;
            } break;
            case LIBXSMM_KERNEL_KIND_MCOPY: {
              ++internal_statistic_num_mcopy;
            } break;
            case LIBXSMM_KERNEL_KIND_MELTW: {
              ++internal_statistic_num_meltw;
            } break;
            case LIBXSMM_KERNEL_KIND_TRANS: {
              ++internal_statistic_num_tcopy;
            } break;
            case LIBXSMM_KERNEL_KIND_TRSM: {
              ++internal_statistic_num_trsm;
            } break;
            case LIBXSMM_KERNEL_KIND_TRMM: {
              ++internal_statistic_num_trmm;
            } break;
            case LIBXSMM_KERNEL_KIND_USER: {
              ++internal_statistic_num_user;
            } break;
            default: if (LIBXSMM_KERNEL_UNREGISTERED <= registry_keys[i].kind) {
              ++errors; /* kind outside the valid range: registry corruption */
            }
            else {
              ++rest;
            }
          }
          if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
            if (0 != errors) {
              fprintf(stderr, "LIBXSMM ERROR: code registry is corrupted!\n");
            }
            if (LIBXSMM_CAPACITY_REGISTRY == (rest + errors + internal_statistic_num_gemv
              + internal_statistic_num_mcopy + internal_statistic_num_meltw
              + internal_statistic_num_tcopy + internal_statistic_num_trsm
              + internal_statistic_num_trmm + internal_statistic_num_user))
            {
              fprintf(stderr, "LIBXSMM WARNING: code registry was exhausted!\n");
            }
          }
          if (0 == (LIBXSMM_CODE_STATIC & code.uval)) { /* check for allocated/generated JIT-code */
            void* buffer = NULL;
            size_t size = 0;
#if defined(LIBXSMM_HASH_COLLISION)
            code.uval &= ~LIBXSMM_HASH_COLLISION; /* clear collision flag */
#endif
            if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(code.ptr_const, &size, NULL/*flags*/, &buffer)) {
#if !defined(NDEBUG)
              registry[i].ptr = NULL;
#endif
              libxsmm_xfree(code.ptr_const, 0/*no check*/);
              /* round-up size (it is fine to assume 4 KB pages since it is likely more accurate than not rounding up) */
              internal_registry_nbytes += LIBXSMM_UP2(size + (((char*)code.ptr_const) - (char*)buffer), LIBXSMM_PAGE_MINSIZE);
            }
            else ++internal_registry_nleaks; /* allocation info unavailable: leak accounted */
          }
        }
      }
      /* release buffers (registry, keys, cache) */
#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
      libxsmm_xfree(cache_buffer, 0/*no check*/);
#endif
      libxsmm_xfree(registry_keys, 0/*no check*/);
      libxsmm_xfree(registry, 0/*no check*/);
    }
#if (0 != LIBXSMM_SYNC) /* LIBXSMM_LOCK_RELEASE, but no LIBXSMM_LOCK_DESTROY */
# if (1 < INTERNAL_REGLOCK_MAXN)
    for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, &internal_reglock[i].state);
# elif !defined(LIBXSMM_UNIFY_LOCKS)
    LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, internal_reglock_ptr);
# endif
    LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, &libxsmm_lock_global);
    /* coverity[check_return] */
    LIBXSMM_TLS_DESTROY(libxsmm_tlskey);
#endif
  }
}

/* Variadic no-op: lets callers "consume" values so compilers do not
 * warn about (or optimize away) otherwise unused results. */
LIBXSMM_API void libxsmm_sink(LIBXSMM_VARIADIC)
{
  /* does nothing else but sinking given arguments */
}

/* Returns the current target architecture ID (triggers lazy initialization). */
LIBXSMM_API int libxsmm_get_target_archid(void)
{
  LIBXSMM_INIT
#if !defined(__MIC__)
  return libxsmm_target_archid;
#else /* no JIT support */
  return LIBXSMM_MIN(libxsmm_target_archid, LIBXSMM_X86_SSE3);
#endif
}

/* Sets the target architecture by numeric ID; unknown IDs fall back to
 * LIBXSMM_X86_GENERIC (if in the x86 range) or to the CPUID-detected arch
 * (continues on the next chunk-line). */
LIBXSMM_API void libxsmm_set_target_archid(int id)
{
  int target_archid = LIBXSMM_TARGET_ARCH_UNKNOWN;
  switch (id) {
    case LIBXSMM_X86_AVX512_CPX:
    case LIBXSMM_X86_AVX512_CLX:
    case LIBXSMM_X86_AVX512_CORE:
    case LIBXSMM_X86_AVX512_KNM:
    case LIBXSMM_X86_AVX512_MIC:
    case LIBXSMM_X86_AVX512:
    case LIBXSMM_X86_AVX2:
    case LIBXSMM_X86_AVX:
    case LIBXSMM_X86_SSE4:
    case LIBXSMM_X86_SSE3:
    case LIBXSMM_TARGET_ARCH_GENERIC: {
      target_archid = id; /* known ID: accept as-is */
    } break;
    /* continuation of libxsmm_set_target_archid's switch from the previous chunk-line */
    default: if (LIBXSMM_X86_GENERIC <= id) {
      target_archid = LIBXSMM_X86_GENERIC; /* in x86 range but unknown: degrade to generic x86 */
    }
    else {
      target_archid = libxsmm_cpuid(); /* out of range: use the detected CPU */
    }
  }
  LIBXSMM_ATOMIC_STORE(&libxsmm_target_archid, target_archid, LIBXSMM_ATOMIC_RELAXED);
  if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
    const int cpuid = libxsmm_cpuid();
    if (cpuid < target_archid) { /* requested code path exceeds the actual CPU */
      const char *const target_arch = libxsmm_cpuid_name(target_archid);
      fprintf(stderr, "LIBXSMM WARNING: \"%s\" code may fail to run on \"%s\"!\n",
        target_arch, libxsmm_cpuid_name(cpuid));
    }
  }
}

/* Returns the name of the current target architecture (triggers lazy init). */
LIBXSMM_API const char* libxsmm_get_target_arch(void)
{
  LIBXSMM_INIT
  return libxsmm_cpuid_name(libxsmm_target_archid);
}

/* function serves as a helper for implementing the Fortran interface */
LIBXSMM_API const char* libxsmmf_get_target_arch(int* length);
LIBXSMM_API const char* libxsmmf_get_target_arch(int* length)
{
  const char *const arch = libxsmm_get_target_arch();
  /* valid here since function is not in the public interface */
  LIBXSMM_ASSERT(NULL != arch && 0 != length);
  *length = (int)strlen(arch);
  return arch;
}

/* Sets the target architecture from a string: either a numeric JIT level or a
 * mnemonic ("skx", "hsw", "avx2", ...). NULL/empty or unrecognized input falls
 * back to the CPUID-detected architecture (continues on the next chunk-line). */
LIBXSMM_API void libxsmm_set_target_arch(const char* arch)
{
  const int cpuid = libxsmm_cpuid();
  int target_archid;
  if (NULL != arch && 0 != *arch) {
    const int jit = atoi(arch);
    if (0 == strcmp("0", arch)) {
      target_archid = LIBXSMM_X86_SSE3; /* "0" explicitly disables JIT beyond SSE3 */
    }
    else if (0 < jit) {
      target_archid = LIBXSMM_X86_GENERIC + jit; /* positive number: offset from generic x86 */
    }
    else if (0 == strcmp("cpx", arch)) {
      target_archid = LIBXSMM_X86_AVX512_CPX;
    }
    else if (0 == strcmp("clx", arch)) {
      target_archid = LIBXSMM_X86_AVX512_CLX;
    }
    else if (0 == strcmp("skx", arch) || 0 == strcmp("skl", arch)
      /* "avx3"/"avx512" previously enabled LIBXSMM_X86_AVX512 */
      || 0 == strcmp("avx3", arch) || 0 == strcmp("avx512", arch))
    {
      target_archid = LIBXSMM_X86_AVX512_CORE;
    }
    else if (0 == strcmp("knm", arch)) {
      target_archid = LIBXSMM_X86_AVX512_KNM;
    }
    else if (0 == strcmp("knl", arch) || 0 == strcmp("mic", arch)) {
      target_archid = LIBXSMM_X86_AVX512_MIC;
    }
    else if (0 == strcmp("hsw", arch) || 0 == strcmp("avx2",
      /* continuation of libxsmm_set_target_arch's mnemonic matching from the previous chunk-line */
      arch)) {
      target_archid = LIBXSMM_X86_AVX2;
    }
    else if (0 == strcmp("snb", arch) || 0 == strcmp("avx", arch)) {
      target_archid = LIBXSMM_X86_AVX;
    }
    else if (0 == strcmp("wsm", arch) || 0 == strcmp("nhm", arch) || 0 == strcmp("sse4", arch)
      || 0 == strcmp("sse4_1", arch) || 0 == strcmp("sse4.1", arch)
      || 0 == strcmp("sse4_2", arch) || 0 == strcmp("sse4.2", arch))
    {
      target_archid = LIBXSMM_X86_SSE4;
    }
    else if (0 == strcmp("sse", arch) || 0 == strcmp("sse3", arch)
      || 0 == strcmp("ssse3", arch) || 0 == strcmp("ssse", arch))
    {
      target_archid = LIBXSMM_X86_SSE3;
    }
    else if (0 == strcmp("x86", arch) || 0 == strcmp("x64", arch) || 0 == strcmp("sse2", arch)) {
      target_archid = LIBXSMM_X86_GENERIC;
    }
    else if (0 == strcmp("generic", arch) || 0 == strcmp("none", arch)) {
      target_archid = LIBXSMM_TARGET_ARCH_GENERIC;
    }
    else {
      target_archid = cpuid; /* unrecognized mnemonic: fall back to detected CPU */
    }
  }
  else {
    target_archid = cpuid; /* no input: use detected CPU */
  }
  if (cpuid < target_archid) { /* warn about code path if beyond CPUID */
    static int error_once = 0;
    if ( 0 != libxsmm_verbosity /* library code is expected to be mute */
      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
    {
      const char *const target_arch = libxsmm_cpuid_name(target_archid);
      fprintf(stderr, "LIBXSMM WARNING: \"%s\" code will fail to run on \"%s\"!\n",
        target_arch, libxsmm_cpuid_name(cpuid));
    }
#if 0 /* limit code path to confirmed features */
    target_archid = cpuid;
#endif
  }
  LIBXSMM_ATOMIC_STORE(&libxsmm_target_archid, target_archid, LIBXSMM_ATOMIC_RELAXED);
}

/* Returns the current verbosity level (triggers lazy initialization). */
LIBXSMM_API int libxsmm_get_verbosity(void)
{
  LIBXSMM_INIT
  return libxsmm_verbosity;
}

/* Sets the verbosity level (atomic store; relaxed ordering suffices). */
LIBXSMM_API void libxsmm_set_verbosity(int level)
{
  LIBXSMM_INIT
  LIBXSMM_ATOMIC_STORE(&libxsmm_verbosity, level, LIBXSMM_ATOMIC_RELAXED);
}

/* Returns the currently effective automatic GEMM prefetch strategy. */
LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_auto_prefetch(void)
{
  return (libxsmm_gemm_prefetch_type)libxsmm_gemm_auto_prefetch;
}

/* Sets the automatic GEMM prefetch strategy, unless the user pinned it via
 * the environment (internal_gemm_auto_prefetch_locked). The comment opened
 * here continues on the next chunk-line. */
LIBXSMM_API void libxsmm_set_gemm_auto_prefetch(libxsmm_gemm_prefetch_type strategy)
{
  if (0 == internal_gemm_auto_prefetch_locked) { /*
LIBXSMM_GEMM_PREFETCH environment takes precedence */ LIBXSMM_ATOMIC_STORE(&libxsmm_gemm_auto_prefetch_default, strategy, LIBXSMM_ATOMIC_RELAXED); LIBXSMM_ATOMIC_STORE(&libxsmm_gemm_auto_prefetch, strategy, LIBXSMM_ATOMIC_RELAXED); } } LIBXSMM_API unsigned char libxsmm_typesize(libxsmm_datatype datatype) { switch (datatype) { case LIBXSMM_DATATYPE_F64: return 8; case LIBXSMM_DATATYPE_F32: return 4; case LIBXSMM_DATATYPE_BF16: return 2; case LIBXSMM_DATATYPE_I64: return 8; case LIBXSMM_DATATYPE_I32: return 4; case LIBXSMM_DATATYPE_I16: return 2; case LIBXSMM_DATATYPE_I8: return 1; case LIBXSMM_DATATYPE_UNSUPPORTED: { static int error_once = 0; if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: unsupported data type!\n"); } } break; } LIBXSMM_ASSERT_MSG(0, "unsupported data type"); return 1; /* avoid to return 0 to avoid div-by-zero in static analysis of depending code */ } LIBXSMM_API_INTERN int libxsmm_dvalue(libxsmm_datatype datatype, const void* value, double* dvalue) { int result = EXIT_SUCCESS; if (NULL != value && NULL != dvalue) { switch (datatype) { case LIBXSMM_DATATYPE_F64: *dvalue = (*(const double*)value); break; case LIBXSMM_DATATYPE_F32: *dvalue = (double)(*(const float *)value); break; case LIBXSMM_DATATYPE_I32: *dvalue = (double)(*(const int *)value); break; case LIBXSMM_DATATYPE_I16: *dvalue = (double)(*(const short *)value); break; case LIBXSMM_DATATYPE_I8: *dvalue = (double)(*(const char *)value); break; default: result = EXIT_FAILURE; } } else { result = EXIT_FAILURE; } return result; } LIBXSMM_API_INTERN const char* libxsmm_typename(libxsmm_datatype datatype) { switch (datatype) { case LIBXSMM_DATATYPE_F64: return "f64"; case LIBXSMM_DATATYPE_F32: return "f32"; case LIBXSMM_DATATYPE_BF16: return "bf16"; case LIBXSMM_DATATYPE_I64: return "i64"; case LIBXSMM_DATATYPE_I32: return "i32"; case LIBXSMM_DATATYPE_I16: return "i16"; case LIBXSMM_DATATYPE_I8: return "i8"; default: { if 
      /* continuation of libxsmm_typename's default branch: combined input/output precisions */
      (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP(datatype) && LIBXSMM_GEMM_PRECISION_I32 == LIBXSMM_GETENUM_OUT(datatype)) {
        return "i16i32";
      }
      else if (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP(datatype) && LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT(datatype)) {
        return "i16f32";
      }
      else if (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP(datatype) && LIBXSMM_GEMM_PRECISION_I32 == LIBXSMM_GETENUM_OUT(datatype)) {
        return "i8i32";
      }
      else if (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP(datatype) && LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT(datatype)) {
        return "bf16f32";
      }
      else { /* no textual name for this combination */
        return "void";
      }
    }
  }
}

/* Formats "typesize" (< 256) as a decimal string into "buffer".
 * Single digits take a fast path; otherwise snprintf is used. */
LIBXSMM_API_INLINE void internal_get_typesize_string(char buffer[4], int buffer_size, size_t typesize)
{
  LIBXSMM_ASSERT(256 > typesize && 4 <= buffer_size);
  if (10 > typesize) { /* single digit: avoid snprintf */
    buffer[0] = (char)('0' + typesize);
    buffer[1] = 0;
  }
  else {
    LIBXSMM_SNPRINTF(buffer, buffer_size, "%i", (int)typesize);
  }
}

/* JIT-builds the kernel described by "request" into executable memory and
 * stores the entry point in *code; regindex is recorded in the kernel's
 * extra info. Returns EXIT_SUCCESS/EXIT_FAILURE.
 * NOTE: this definition continues beyond the visible chunk. */
LIBXSMM_API_INTERN int libxsmm_build(const libxsmm_build_request* request, unsigned int regindex, libxsmm_code_pointer* code)
{
  int result = EXIT_SUCCESS;
#if !defined(__MIC__)
  const char * /*const*/ target_arch = libxsmm_cpuid_name(libxsmm_target_archid);
  /* large enough temporary buffer for generated code */
  char jit_buffer[LIBXSMM_CODE_MAXSIZE], jit_name[256] = { 0 };
  libxsmm_generated_code generated_code;
  libxsmm_kernel_xinfo extra;
  LIBXSMM_MEMZERO127(&generated_code);
  generated_code.generated_code = jit_buffer;
  generated_code.buffer_size = sizeof(jit_buffer);
  /* setup code generation */
  generated_code.arch = libxsmm_target_archid;
  generated_code.code_type = 2;
# if !defined(NDEBUG) /* should not be needed (all members will be initialized below) */
  LIBXSMM_MEMZERO127(&extra);
# endif
  extra.registered = regindex;
  extra.nflops = 0;
  LIBXSMM_ASSERT(NULL != generated_code.generated_code || 0 == generated_code.buffer_size);
  LIBXSMM_ASSERT(NULL != request && 0 != libxsmm_target_archid);
  LIBXSMM_ASSERT(NULL != code && NULL == code->ptr_const);
switch (request->kind) { /* generate kernel */ case LIBXSMM_BUILD_KIND_GEMM: { /* small MxM kernel */ LIBXSMM_ASSERT(NULL != request->descriptor.gemm); # if 0 /* dummy kernel for an empty shape is desired */ if (0 < request->descriptor.gemm->m && 0 < request->descriptor.gemm->n && 0 < request->descriptor.gemm->k && 0 < request->descriptor.gemm->lda && 0 < request->descriptor.gemm->ldb && 0 < request->descriptor.gemm->ldc) # endif { const unsigned int m = request->descriptor.gemm->m, n = request->descriptor.gemm->n, k = request->descriptor.gemm->k; extra.nflops = 2 * m * n * k; # if !defined(LIBXSMM_DENY_RETARGET) /* disable: ECFLAGS=-DLIBXSMM_DENY_RETARGET */ if (LIBXSMM_X86_AVX2 < libxsmm_target_archid && (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.gemm->datatype) || LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.gemm->datatype)) && (16 >= (m * k) || 16 >= (k * n) || 16 >= (m * n))) { /* TODO: shall we update variable "target_arch" (name)? 
*/ generated_code.arch = LIBXSMM_X86_AVX2; } # endif LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_gemm_kernel, &generated_code, request->descriptor.gemm); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.gemm->prefetch); const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.gemm->datatype); int typesigns = 0, br = 0; /* query batch reduce variant */ if ( (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS & request->descriptor.gemm->flags) > 1 ) { br = 1; } else if ( (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET & request->descriptor.gemm->flags) > 1 ) { br = 2; } else if ( (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE & request->descriptor.gemm->flags) > 1 ) { br = 3; } else { br = 0; } /* query A/B sign combinations */ if ( (LIBXSMM_GEMM_FLAG_A_UNSIGNED & request->descriptor.gemm->flags) > 1 ) { typesigns = 1; } else if ( (LIBXSMM_GEMM_FLAG_B_UNSIGNED & request->descriptor.gemm->flags) > 1 ) { typesigns = 2; } else if ( (LIBXSMM_GEMM_FLAG_AB_UNSIGNED & request->descriptor.gemm->flags) > 1 ) { typesigns = 3; } else { typesigns = 0; } /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_a%i_b%i_p%i_br%i_uh%u_si%i.mxm", target_arch, tname, 0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.gemm->flags) ? 'n' : 't', 0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.gemm->flags) ? 'n' : 't', m, n, k, request->descriptor.gemm->lda, request->descriptor.gemm->ldb, request->descriptor.gemm->ldc, /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.gemm->flags) ? 0 : */1, 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.gemm->flags) ? 
0 : 1, uid, br, (unsigned int)request->descriptor.gemm->c3, typesigns); } } } break; case LIBXSMM_BUILD_KIND_SRSOA: { /* sparse SOA kernel, CSR format */ LIBXSMM_ASSERT(NULL != request->descriptor.srsoa && 0 != request->descriptor.srsoa->gemm); LIBXSMM_ASSERT(NULL != request->descriptor.srsoa->row_ptr && 0 != request->descriptor.srsoa->column_idx && 0 != request->descriptor.srsoa->values); /* only floating point */ if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.srsoa->gemm->datatype) || LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.srsoa->gemm->datatype)) { const unsigned int nnz = (request->descriptor.srsoa->gemm->lda == 0) ? request->descriptor.srsoa->row_ptr[request->descriptor.srsoa->gemm->m] : request->descriptor.srsoa->row_ptr[request->descriptor.srsoa->gemm->k]; const unsigned int simdw = (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.srsoa->gemm->datatype)) ? libxsmm_cpuid_vlen32(libxsmm_target_archid)/2 : libxsmm_cpuid_vlen32(libxsmm_target_archid); const unsigned int gemm_factor = (request->descriptor.srsoa->gemm->lda == 0) ? request->descriptor.srsoa->gemm->n : request->descriptor.srsoa->gemm->m; extra.nflops = 2 * nnz * gemm_factor * simdw; LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_spgemm_csr_soa_kernel, &generated_code, request->descriptor.srsoa->gemm, target_arch, request->descriptor.srsoa->row_ptr, request->descriptor.srsoa->column_idx, request->descriptor.srsoa->values, request->descriptor.srsoa->packed_width); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.srsoa->gemm->prefetch); const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.srsoa->gemm->datatype); /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) 
*/ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i_nnz%u.srsoa", target_arch, tname, 0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.srsoa->gemm->flags) ? 'n' : 't', 0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.srsoa->gemm->flags) ? 'n' : 't', request->descriptor.srsoa->gemm->m, request->descriptor.srsoa->gemm->n, request->descriptor.srsoa->gemm->k, request->descriptor.srsoa->gemm->lda, request->descriptor.srsoa->gemm->ldb, request->descriptor.srsoa->gemm->ldc, request->descriptor.srsoa->packed_width, /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.srsoa->gemm->flags) ? 0 : */1, 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.srsoa->gemm->flags) ? 0 : 1, uid, nnz); } } } break; case LIBXSMM_BUILD_KIND_SCSOA: { /* sparse SOA kernel, CSC format */ LIBXSMM_ASSERT(NULL != request->descriptor.scsoa && 0 != request->descriptor.scsoa->gemm); LIBXSMM_ASSERT(NULL != request->descriptor.scsoa->row_idx && 0 != request->descriptor.scsoa->column_ptr && 0 != request->descriptor.scsoa->values); /* only floating point */ if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.scsoa->gemm->datatype) || LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.scsoa->gemm->datatype)) { const unsigned int nnz = (request->descriptor.scsoa->gemm->lda == 0) ? request->descriptor.scsoa->column_ptr[request->descriptor.scsoa->gemm->k] : request->descriptor.scsoa->column_ptr[request->descriptor.scsoa->gemm->n]; const unsigned int simdw = (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.scsoa->gemm->datatype)) ? libxsmm_cpuid_vlen32(libxsmm_target_archid)/2 : libxsmm_cpuid_vlen32(libxsmm_target_archid); const unsigned int gemm_factor = (request->descriptor.scsoa->gemm->lda == 0) ? 
request->descriptor.scsoa->gemm->n : request->descriptor.scsoa->gemm->m; extra.nflops = 2 * nnz * gemm_factor * simdw; LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_spgemm_csc_soa_kernel, &generated_code, request->descriptor.scsoa->gemm, target_arch, request->descriptor.scsoa->row_idx, request->descriptor.scsoa->column_ptr, request->descriptor.scsoa->values, request->descriptor.scsoa->packed_width); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.scsoa->gemm->prefetch); const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.scsoa->gemm->datatype); /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i_nnz%u.scsoa", target_arch, tname, 0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.scsoa->gemm->flags) ? 'n' : 't', 0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.scsoa->gemm->flags) ? 'n' : 't', request->descriptor.scsoa->gemm->m, request->descriptor.scsoa->gemm->n, request->descriptor.scsoa->gemm->k, request->descriptor.scsoa->gemm->lda, request->descriptor.scsoa->gemm->ldb, request->descriptor.scsoa->gemm->ldc, request->descriptor.scsoa->packed_width, /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.scsoa->gemm->flags) ? 0 : */1, 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.scsoa->gemm->flags) ? 
0 : 1, uid, nnz); } } } break; case LIBXSMM_BUILD_KIND_PGEMMRMAC: { /* packed GEMM, B regular matrix, row-major */ LIBXSMM_ASSERT(NULL != request->descriptor.pgemmacrm && 0 != request->descriptor.pgemmacrm->gemm); /* only floating point */ if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmacrm->gemm->datatype) || LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmacrm->gemm->datatype)) { extra.nflops = 2 * request->descriptor.pgemmacrm->packed_width * request->descriptor.pgemmacrm->gemm->m * request->descriptor.pgemmacrm->gemm->n * request->descriptor.pgemmacrm->gemm->k; LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_packed_gemm_ac_rm, &generated_code, request->descriptor.pgemmacrm->gemm, request->descriptor.pgemmacrm->packed_width, target_arch); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.pgemmacrm->gemm->prefetch); const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.pgemmacrm->gemm->datatype); /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i.pgemmacrm", target_arch, tname, 0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.pgemmacrm->gemm->flags) ? 'n' : 't', 0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.pgemmacrm->gemm->flags) ? 'n' : 't', request->descriptor.pgemmacrm->gemm->m, request->descriptor.pgemmacrm->gemm->n, request->descriptor.pgemmacrm->gemm->k, request->descriptor.pgemmacrm->gemm->lda, request->descriptor.pgemmacrm->gemm->ldb, request->descriptor.pgemmacrm->gemm->ldc, request->descriptor.pgemmacrm->packed_width, /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.pgemmacrm->gemm->flags) ? 0 : */1, 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.pgemmacrm->gemm->flags) ? 
0 : 1, uid); } } } break; case LIBXSMM_BUILD_KIND_PGEMMRMBC: { /* packed GEMM, A regular matrix, row-major */ LIBXSMM_ASSERT(NULL != request->descriptor.pgemmbcrm && 0 != request->descriptor.pgemmbcrm->gemm); /* only floating point */ if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmbcrm->gemm->datatype) || LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmbcrm->gemm->datatype)) { extra.nflops = 2 * request->descriptor.pgemmbcrm->packed_width * request->descriptor.pgemmbcrm->gemm->m * request->descriptor.pgemmbcrm->gemm->n * request->descriptor.pgemmbcrm->gemm->k; LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_packed_gemm_bc_rm, &generated_code, request->descriptor.pgemmbcrm->gemm, request->descriptor.pgemmbcrm->packed_width, target_arch); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.pgemmbcrm->gemm->prefetch); const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.pgemmbcrm->gemm->datatype); /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i.pgemmbcrm", target_arch, tname, 0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.pgemmbcrm->gemm->flags) ? 'n' : 't', 0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.pgemmbcrm->gemm->flags) ? 'n' : 't', request->descriptor.pgemmbcrm->gemm->m, request->descriptor.pgemmbcrm->gemm->n, request->descriptor.pgemmbcrm->gemm->k, request->descriptor.pgemmbcrm->gemm->lda, request->descriptor.pgemmbcrm->gemm->ldb, request->descriptor.pgemmbcrm->gemm->ldc, request->descriptor.pgemmbcrm->packed_width, /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.pgemmbcrm->gemm->flags) ? 0 : */1, 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.pgemmbcrm->gemm->flags) ? 
0 : 1, uid); } } } break; case LIBXSMM_BUILD_KIND_SREG: { /* sparse register kernel */ LIBXSMM_ASSERT(NULL != request->descriptor.sreg && 0 != request->descriptor.sreg->gemm); LIBXSMM_ASSERT(NULL != request->descriptor.sreg->row_ptr && 0 != request->descriptor.sreg->column_idx && 0 != request->descriptor.sreg->values); /* only floating point */ if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.sreg->gemm->datatype) || LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.sreg->gemm->datatype)) { const unsigned int nnz = request->descriptor.sreg->row_ptr[request->descriptor.sreg->gemm->m]; extra.nflops = 2 * libxsmm_cpuid_vlen32(libxsmm_target_archid)/2 * request->descriptor.sreg->gemm->n * nnz; LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_spgemm_csr_reg_kernel, &generated_code, request->descriptor.sreg->gemm, target_arch, request->descriptor.sreg->row_ptr, request->descriptor.sreg->column_idx, (const double*)request->descriptor.sreg->values); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.sreg->gemm->prefetch); const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.sreg->gemm->datatype); /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_a%i_b%i_p%i.sreg", target_arch, tname, 0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.sreg->gemm->flags) ? 'n' : 't', 0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.sreg->gemm->flags) ? 'n' : 't', request->descriptor.sreg->gemm->m, request->descriptor.sreg->gemm->n, request->descriptor.sreg->gemm->k, request->descriptor.sreg->gemm->lda, request->descriptor.sreg->gemm->ldb, request->descriptor.sreg->gemm->ldc, /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.sreg->gemm->flags) ? 
0 : */1, 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.sreg->gemm->flags) ? 0 : 1, uid); } } } break; case LIBXSMM_BUILD_KIND_MCOPY: { /* matcopy kernel */ LIBXSMM_ASSERT(NULL != request->descriptor.mcopy); # if 0 /* TODO: backend supports typesize <= 4, but kernels for typesize < 4 are incorrect */ if (4 == request->descriptor.mcopy->typesize) # endif { LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_matcopy_kernel, &generated_code, request->descriptor.mcopy, target_arch); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { char tsizename[4]; internal_get_typesize_string(tsizename, sizeof(tsizename), request->descriptor.mcopy->typesize); /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%ux%u_%ux%u_p%u.mcopy", target_arch, tsizename, request->descriptor.mcopy->m, request->descriptor.mcopy->n, request->descriptor.mcopy->ldi, request->descriptor.mcopy->ldo, (unsigned int)request->descriptor.mcopy->prefetch); } } } break; case LIBXSMM_BUILD_KIND_MELTW: { /* matcopy kernel */ LIBXSMM_ASSERT(NULL != request->descriptor.meltw); # if 0 /* TODO: backend supports typesize <= 4, but kernels for typesize < 4 are incorrect */ if (4 == request->descriptor.meltw->typesize) # endif { LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_mateltwise_kernel, &generated_code, request->descriptor.meltw); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { char tsizename[4]; internal_get_typesize_string(tsizename, sizeof(tsizename), request->descriptor.meltw->datatype); /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) 
*/ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%ux%u_%ux%u_opcode%u_flags%u.meltw", target_arch, tsizename, request->descriptor.meltw->m, request->descriptor.meltw->n, request->descriptor.meltw->ldi, request->descriptor.meltw->ldo, (unsigned int)request->descriptor.meltw->operation, (unsigned int)request->descriptor.meltw->flags); } } } break; case LIBXSMM_BUILD_KIND_TRANS: { /* transpose kernel */ LIBXSMM_ASSERT(NULL != request->descriptor.trans); if (4 == request->descriptor.trans->typesize || 8 == request->descriptor.trans->typesize) { LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_transpose_kernel, &generated_code, request->descriptor.trans, libxsmm_target_archid); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { char tsizename[4]; internal_get_typesize_string(tsizename, sizeof(tsizename), request->descriptor.trans->typesize); /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%ux%u_%u.trans", target_arch, tsizename, request->descriptor.trans->m, request->descriptor.trans->n, request->descriptor.trans->ldo); } } } break; case LIBXSMM_BUILD_KIND_PGEMM: { /* compact P/GEMM-kernel (packed) */ unsigned int tsize; LIBXSMM_ASSERT(NULL != request->descriptor.pgemm); tsize = (unsigned int)request->descriptor.pgemm->typesize; if (4 == tsize || 8 == tsize) { extra.nflops = 0; /* TODO */ LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_pgemm_kernel, &generated_code, request->descriptor.pgemm, libxsmm_target_archid); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { char tsizename[4]; internal_get_typesize_string(tsizename, sizeof(tsizename), tsize); /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) 
*/ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%c%c%c_%ux%ux%u_%u_%u_%u_%i.pgemm", target_arch, tsizename, request->descriptor.pgemm->transa, request->descriptor.pgemm->transb, request->descriptor.pgemm->layout, request->descriptor.pgemm->m, request->descriptor.pgemm->n, request->descriptor.pgemm->k, request->descriptor.pgemm->lda, request->descriptor.pgemm->ldb, request->descriptor.pgemm->ldc, (int)request->descriptor.pgemm->alpha_val); } } } break; case LIBXSMM_BUILD_KIND_GETRF: { /* compact GETRF kernel (packed) */ unsigned int tsize; LIBXSMM_ASSERT(NULL != request->descriptor.getrf); tsize = (unsigned int)request->descriptor.getrf->typesize; if (4 == tsize || 8 == tsize) { extra.nflops = 0; /* TODO */ LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_getrf_kernel, &generated_code, request->descriptor.getrf, libxsmm_target_archid); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { char tsizename[4]; internal_get_typesize_string(tsizename, sizeof(tsizename), tsize); /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%c_%ux%u_%u.getrf", target_arch, tsizename, request->descriptor.getrf->layout, request->descriptor.getrf->m, request->descriptor.getrf->n, request->descriptor.getrf->lda); } } } break; case LIBXSMM_BUILD_KIND_TRMM: { /* compact TRMM kernel (packed) */ unsigned int tsize; LIBXSMM_ASSERT(NULL != request->descriptor.trmm); tsize = (unsigned int)request->descriptor.trmm->typesize; if (4 == tsize || 8 == tsize) { extra.nflops = 0; /* TODO */ LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_trmm_kernel, &generated_code, request->descriptor.trmm, target_arch); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { char tsizename[4]; internal_get_typesize_string(tsizename, sizeof(tsizename), tsize); /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) 
*/ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%c%c%c%c_%ux%u_%u_%u.trmm", target_arch, tsizename, request->descriptor.trmm->transa, request->descriptor.trmm->layout, request->descriptor.trmm->side, request->descriptor.trmm->uplo, request->descriptor.trmm->m, request->descriptor.trmm->n, request->descriptor.trmm->lda, request->descriptor.trmm->ldb); /* TODO: alpha */ } } } break; case LIBXSMM_BUILD_KIND_TRSM: if (NULL != request->descriptor.trsm) { /* compact TRSM kernel (packed) */ const unsigned int tsize = (unsigned int)request->descriptor.trsm->typesize; if (4 == tsize || 8 == tsize) { extra.nflops = 0; /* TODO */ LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_trsm_kernel, &generated_code, request->descriptor.trsm, target_arch); # if !defined(LIBXSMM_VTUNE) if (0 > libxsmm_verbosity) # endif { char tsizename[4]; internal_get_typesize_string(tsizename, sizeof(tsizename), tsize); /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) 
*/ LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%c%c%c%c_%ux%u_%u_%u.trsm", target_arch, tsizename, request->descriptor.trsm->transa, request->descriptor.trsm->layout, request->descriptor.trsm->side, request->descriptor.trsm->uplo, request->descriptor.trsm->m, request->descriptor.trsm->n, request->descriptor.trsm->lda, request->descriptor.trsm->ldb); /* TODO: alpha */ } } } break; case LIBXSMM_BUILD_KIND_USER: break; # if !defined(NDEBUG) /* library code is expected to be mute */ default: { /* unknown kind */ static int error_once = 0; if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid build request discovered!\n"); } /*result = EXIT_FAILURE;*/ } # endif } if (0 == generated_code.last_error /* no error raised */ && 0 != generated_code.code_size /*check (tcopy issue?)*/) { char* code_buffer = NULL; void* code_buffer_result = &code_buffer; LIBXSMM_ASSERT(generated_code.code_size <= LIBXSMM_CODE_MAXSIZE); LIBXSMM_ASSERT(NULL != generated_code.generated_code); /* attempt to create executable buffer */ result = libxsmm_xmalloc((void**)code_buffer_result, generated_code.code_size, 0/*auto*/, /* flag must be a superset of what's populated by libxsmm_malloc_attrib */ LIBXSMM_MALLOC_FLAG_RWX, &extra, sizeof(extra)); if (EXIT_SUCCESS == result) { /* check for success */ LIBXSMM_ASSERT(NULL != code_buffer); /* copy temporary buffer into the prepared executable buffer */ # if defined(NDEBUG) { int i; /* precondition: jit_buffer == generated_code.generated_code */ for (i = 0; i < (int)generated_code.code_size; ++i) code_buffer[i] = jit_buffer[i]; } # else memcpy(code_buffer, generated_code.generated_code, generated_code.code_size); # endif /* attribute/protect buffer and revoke unnecessary flags */ result = libxsmm_malloc_attrib((void**)code_buffer_result, LIBXSMM_MALLOC_FLAG_X, jit_name); if (EXIT_SUCCESS == result) { /* check for success */ code->ptr = code_buffer; /* commit buffer */ 
LIBXSMM_ASSERT(NULL != code->ptr && 0 == (LIBXSMM_CODE_STATIC & code->uval));
        }
        else { /* attributing the buffer failed: release it again */
          libxsmm_xfree(code_buffer, 0/*no check*/);
        }
      }
    }
    else if (request->kind == LIBXSMM_BUILD_KIND_USER && NULL != request->descriptor.ptr) { /* user-data */
      if (0 != request->user_size) { /* allocate a private (non-executable) value buffer for the registered key */
        void* user_data = &code->ptr;
        result = libxsmm_xmalloc((void**)user_data, request->user_size, 0/*auto*/,
          LIBXSMM_MALLOC_FLAG_PRIVATE, &extra, sizeof(extra));
      }
      else { /* zero-sized value: registration succeeds with a NULL-pointer as value */
        result = EXIT_SUCCESS;
        code->ptr = NULL;
      }
    }
    else { /* propagate the generator's error code, or fall back to a generic failure */
      result = (0 != generated_code.last_error ? generated_code.last_error : EXIT_FAILURE);
    }
#else /* unsupported platform */
  LIBXSMM_UNUSED(request); LIBXSMM_UNUSED(regindex); LIBXSMM_UNUSED(code);
  /* libxsmm_get_target_arch also serves as a runtime check whether JIT is available or not */
  if (LIBXSMM_X86_SSE3 <= libxsmm_target_archid) result = EXIT_FAILURE;
#endif
  return result;
}


#if defined(LIBXSMM_DESC_PAD)
/** Zero-fills the descriptor payload from byte-position "size" up to the larger of
 *  LIBXSMM_DIFF_SIZE and LIBXSMM_HASH_SIZE so that the fixed-size hash/compare in
 *  internal_find_code never reads uninitialized bytes. */
LIBXSMM_API_INLINE void internal_pad_descriptor(libxsmm_descriptor* desc, size_t size)
{
  const signed char s = (signed char)LIBXSMM_MAX(LIBXSMM_DIFF_SIZE, LIBXSMM_HASH_SIZE);
  signed char i;
  LIBXSMM_ASSERT(NULL != desc && s <= LIBXSMM_DESCRIPTOR_MAXSIZE);
  for (i = (signed char)size; i < s; ++i) desc->data[i] = 0;
}
#endif


/** Central kernel dispatch: returns the code version registered for "desc".
 *  Path: (1) optional small per-thread cache lookup; (2) CRC32-hash of the
 *  descriptor into the registry followed by linear probing; (3) on a miss,
 *  JIT-build via libxsmm_build while holding the registry lock and publish
 *  the new entry (marking hash collisions when LIBXSMM_HASH_COLLISION is
 *  enabled). "user_size" is forwarded for LIBXSMM_KERNEL_KIND_USER entries.
 *  The returned pointer has the non-JIT/collision flag bits cleared. */
LIBXSMM_API_INLINE libxsmm_code_pointer internal_find_code(libxsmm_descriptor* desc, size_t desc_size, size_t user_size)
{
  libxsmm_code_pointer flux_entry = { 0 };
  const size_t size = LIBXSMM_MIN(sizeof(libxsmm_descriptor_kind) + desc_size, LIBXSMM_DIFF_SIZE);
#if !defined(NDEBUG) && (0 != LIBXSMM_JIT)
  int build = EXIT_SUCCESS;
#endif
#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
# if defined(LIBXSMM_NTHREADS_USE)
  const unsigned int tid = libxsmm_get_tid();
  internal_cache_type *const cache = internal_cache_buffer + tid;
# else
  static LIBXSMM_TLS internal_cache_type internal_cache_buffer;
  internal_cache_type *const cache = &internal_cache_buffer;
# endif
  unsigned char cache_index;
# if defined(LIBXSMM_DESC_PAD)
# if defined(LIBXSMM_DESC_INLINE)
  LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
  internal_pad_descriptor(desc, size);
  LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
  LIBXSMM_DIFF_N(unsigned char, cache_index, LIBXSMM_DIFF(LIBXSMM_DIFF_SIZE), xdesc,
    cache->entry.keys, LIBXSMM_DIFF_SIZE, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size);
# else
  internal_pad_descriptor(desc, size);
  cache_index = (unsigned char)libxsmm_diff_n(desc, cache->entry.keys,
    LIBXSMM_DIFF_SIZE, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size);
# endif
# elif defined(LIBXSMM_DESC_INLINE)
  LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
  LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
  LIBXSMM_DIFF_N(unsigned char, cache_index, LIBXSMM_DIFF(LIBXSMM_DIFF_SIZE), xdesc,
    cache->entry.keys, size, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size);
# else
  LIBXSMM_ASSERT(NULL != desc);
  cache_index = (unsigned char)libxsmm_diff_n(desc, cache->entry.keys,
    size, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size);
# endif
  if (cache->entry.id == libxsmm_ninit && cache_index < cache->entry.size) { /* valid hit */
    flux_entry = cache->entry.code[cache_index];
    cache->entry.hit = cache_index;
  }
  else
#else
  LIBXSMM_ASSERT(NULL != desc);
# if defined(LIBXSMM_DESC_PAD)
# if defined(LIBXSMM_DESC_INLINE)
  LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
  internal_pad_descriptor(desc, size);
  LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
# else
  internal_pad_descriptor(desc, size);
# endif
# endif
#endif
  { /* cache miss (or cache disabled): consult the registry */
#if defined(LIBXSMM_DESC_PAD)
    unsigned int i = LIBXSMM_CRC32(LIBXSMM_HASH_SIZE)(LIBXSMM_HASH_SEED, desc);
#else
    unsigned int i = libxsmm_crc32(LIBXSMM_HASH_SEED, desc, LIBXSMM_MIN(size, LIBXSMM_HASH_SIZE));
#endif
    /* i: probe position; i0: start-of-probe marker; mode: 0=initial, 1=searching,
       2=generate, 3=generate with collision fix-up; diff: 0 terminates the loop */
    unsigned int i0 = i = LIBXSMM_MOD2(i, LIBXSMM_CAPACITY_REGISTRY), mode = 0, diff = 1;
    LIBXSMM_ASSERT(NULL != internal_registry);
    LIBXSMM_ASSERT(&desc->kind == &desc->gemm.pad && desc->kind == desc->gemm.pad);
    do { /* use calculated location and check if the requested code is already JITted */
#if (1 < INTERNAL_REGLOCK_MAXN) || !LIBXSMM_LOCK_TYPE_ISRW(LIBXSMM_REGLOCK) /* read registered code */
# if 1 /* omitting an atomic load is safe but avoids race-detectors to highlight this location */
      uintptr_t *const fluxaddr = &internal_registry[i].uval;
      flux_entry.uval = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)(fluxaddr, LIBXSMM_ATOMIC_RELAXED);
# else
      flux_entry = internal_registry[i];
# endif
#else
      LIBXSMM_LOCK_ACQREAD(LIBXSMM_REGLOCK, internal_reglock_ptr);
      flux_entry = internal_registry[i]; /* read registered code */
      LIBXSMM_LOCK_RELREAD(LIBXSMM_REGLOCK, internal_reglock_ptr);
#endif
      if ((NULL != flux_entry.ptr_const || 1 == mode) && 2 > mode) { /* check existing entry further */
        if (NULL != flux_entry.ptr_const) { /* compare the stored key against the requested descriptor */
#if defined(LIBXSMM_DESC_PAD)
# if defined(LIBXSMM_DIFF_INLINE)
# if !defined(LIBXSMM_DESC_INLINE)
          LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
          LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
# endif
          diff = LIBXSMM_DIFF(LIBXSMM_DIFF_SIZE)(xdesc, internal_registry_keys + i, 0/*dummy*/);
# else
          diff = libxsmm_diff(desc, internal_registry_keys + i, LIBXSMM_DIFF_SIZE);
# endif
#else
          diff = libxsmm_diff(desc, internal_registry_keys + i, size);
#endif
        }
#if !defined(NDEBUG)
        else LIBXSMM_ASSERT(0 != diff);
#endif
        if (0 != diff) { /* key mismatch: search for code version */
          if (0 == mode) { /* transition to higher mode */
            i0 = i; /* keep current position on record */
#if defined(LIBXSMM_HASH_COLLISION)
            /* enter code generation, and collision fix-up */
            if (0 == (LIBXSMM_HASH_COLLISION & flux_entry.uval)) {
              LIBXSMM_ASSERT(NULL != flux_entry.ptr_const); /* collision */
              mode = 3;
            }
            else
#endif
            /* search for an existing code version */
            mode = 1; /* else */
          }
          i = LIBXSMM_MOD2(i + 1, LIBXSMM_CAPACITY_REGISTRY);
          if (i == i0) { /* search finished, no code version exists */
#if defined(LIBXSMM_HASH_COLLISION)
            mode = 3; /* enter code generation, and collision fix-up */
#else
            mode = 2; /* enter code generation */
#endif
            if (LIBXSMM_KERNEL_KIND_MATMUL == desc->kind) {
              internal_update_mmstatistic(&desc->gemm.desc, 0, 1/*collision*/, 0, 0);
            }
          }
          LIBXSMM_ASSERT(0 != diff); /* continue */
        }
      }
      else { /* enter code generation (there is no code version yet) */
        LIBXSMM_ASSERT(0 == mode || 1 < mode);
#if (0 == LIBXSMM_JIT)
        LIBXSMM_UNUSED(user_size);
#else
        if (LIBXSMM_X86_AVX <= libxsmm_target_archid || /* check if JIT is supported (CPUID) */
           (LIBXSMM_X86_SSE3 <= libxsmm_target_archid && LIBXSMM_KERNEL_KIND_MATMUL == desc->kind) ||
           (LIBXSMM_KERNEL_KIND_USER == desc->kind))
        {
          LIBXSMM_ASSERT(0 != mode || NULL == flux_entry.ptr_const/*code version does not exist*/);
          INTERNAL_FIND_CODE_LOCK(lock, i, diff, flux_entry.ptr); /* lock the registry entry */
          if (NULL == internal_registry[i].ptr_const) { /* double-check registry after acquiring the lock */
            libxsmm_build_request request; /* setup the code build request */
            LIBXSMM_ASSERT(desc->kind < LIBXSMM_KERNEL_UNREGISTERED);
            request.kind = (libxsmm_build_kind)desc->kind;
            request.descriptor.ptr = &desc->gemm.desc;
            request.user_size = user_size;
# if defined(NDEBUG)
            if (EXIT_SUCCESS == libxsmm_build(&request, i, &flux_entry) && NULL != flux_entry.ptr_const)
# else
            build = libxsmm_build(&request, i, &flux_entry);
            if (EXIT_SUCCESS == build && NULL != flux_entry.ptr_const)
# endif
            { /* publish the freshly built code together with its key */
              LIBXSMM_ASSIGN127(internal_registry_keys + i, desc);
# if (1 < INTERNAL_REGLOCK_MAXN)
              LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)(&internal_registry[i].ptr, flux_entry.ptr, LIBXSMM_ATOMIC_SEQ_CST);
# else
              internal_registry[i] = flux_entry;
# endif
# if defined(LIBXSMM_HASH_COLLISION)
              if (2 < mode) { /* arrived from collision state; now mark as collision */
                libxsmm_code_pointer fix_entry;
# if (1 < INTERNAL_REGLOCK_MAXN)
                fix_entry.ptr = LIBXSMM_ATOMIC_LOAD(&internal_registry[i0].ptr, LIBXSMM_ATOMIC_RELAXED);
# else
                fix_entry = internal_registry[i0];
# endif
                LIBXSMM_ASSERT(NULL != fix_entry.ptr_const);
                if (0 == (LIBXSMM_HASH_COLLISION & fix_entry.uval)) {
                  fix_entry.uval |= LIBXSMM_HASH_COLLISION; /* mark current entry as collision */
# if (1 < INTERNAL_REGLOCK_MAXN)
                  LIBXSMM_ATOMIC_STORE(&internal_registry[i0].ptr, fix_entry.ptr, LIBXSMM_ATOMIC_RELAXED);
# else
                  internal_registry[i0] = fix_entry;
# endif
                }
              }
# endif
            }
            if (((int)LIBXSMM_KERNEL_KIND_MATMUL) == desc->kind) {
              internal_update_mmstatistic(&desc->gemm.desc, 1/*try*/, 0, 0, 0);
            }
            /* leave here even in case of a build-error; do not use break (inside of locked region) */
            diff = 0;
          }
          INTERNAL_FIND_CODE_UNLOCK(lock);
          if (0 != diff) { /* acquire registry slot */
            if (0 == mode) { /* initial condition */
              mode = 2; /* continue to linearly search for an empty slot */
              i0 = i; /* keep current position on record */
            }
            do { /* continue to linearly search for an available slot */
              i = LIBXSMM_MOD2(i + 1, LIBXSMM_CAPACITY_REGISTRY);
              if (NULL == internal_registry[i].ptr_const) break;
            } while (i != i0);
            if (i == i0) { /* out of capacity (no registry slot available) */
              diff = 0; /* do not use break if inside of locked region */
            }
            flux_entry.ptr = NULL; /* no result */
          }
        }
        else /* JIT-code generation not available */
#endif
        { /* leave the dispatch loop */
          if (((int)LIBXSMM_KERNEL_KIND_MATMUL) == desc->kind) {
            internal_update_mmstatistic(&desc->gemm.desc, 1/*try*/, 0, 0, 0);
          }
#if !defined(NDEBUG) && (0 != LIBXSMM_JIT)
          build = EXIT_FAILURE;
#endif
          flux_entry.ptr = NULL;
          diff = 0;
        }
      }
    } while (0 != diff);
#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
    if (NULL != flux_entry.ptr_const) { /* keep code version on record (cache) */
      LIBXSMM_ASSERT(0 == diff);
      if (cache->entry.id == libxsmm_ninit) { /* maintain cache */
        if (cache->entry.size < internal_cache_size) { /* grow */
          INTERNAL_FIND_CODE_CACHE_GROW(cache_index, cache->entry.size);
          LIBXSMM_ASSERT(cache->entry.size <= internal_cache_size);
        }
        else { /* evict */
          LIBXSMM_ASSERT(cache->entry.hit < cache->entry.size);
          INTERNAL_FIND_CODE_CACHE_EVICT(cache_index, cache->entry.size, cache->entry.hit);
        }
      }
      else if (0 != internal_cache_size) { /* reset cache */
# if !defined(NDEBUG)
        LIBXSMM_MEMZERO127(cache->entry.keys);
# endif
        cache->entry.id = libxsmm_ninit;
        cache->entry.size = 1;
        cache_index = 0;
      }
      LIBXSMM_ASSIGN127(cache->entry.keys + cache_index, desc);
      cache->entry.code[cache_index] = flux_entry;
      cache->entry.hit = cache_index;
    }
#endif
  }
#if defined(LIBXSMM_HASH_COLLISION)
  flux_entry.uval &= ~(LIBXSMM_CODE_STATIC | LIBXSMM_HASH_COLLISION); /* clear non-JIT and collision flag */
#else
  flux_entry.uval &= ~LIBXSMM_CODE_STATIC; /* clear non-JIT flag */
#endif
#if (0 != LIBXSMM_JIT)
  assert(LIBXSMM_KERNEL_KIND_MATMUL != desc->kind || NULL != flux_entry.ptr_const ||
    EXIT_SUCCESS != build || 1 == internal_reglock_count); /*!LIBXSMM_ASSERT*/
#endif
  return flux_entry;
}


/** Resolves the extra-info record attached to a (JIT-)allocated code pointer via
 *  libxsmm_get_malloc_xinfo. Optionally yields the registered descriptor ("desc",
 *  NULL if the registry entry no longer matches) and the code size ("code_size",
 *  zeroed on failure). Returns NULL if "code" is NULL or carries no such record. */
LIBXSMM_API_INTERN const libxsmm_kernel_xinfo* libxsmm_get_kernel_xinfo(libxsmm_code_pointer code, const libxsmm_descriptor** desc, size_t* code_size)
{
  libxsmm_kernel_xinfo* result = NULL;
  void *const result_address = &result;
  int flags = LIBXSMM_MALLOC_FLAG_X;
  if (NULL != code.ptr_const && EXIT_SUCCESS == libxsmm_get_malloc_xinfo(code.ptr_const, code_size, &flags, (void**)result_address) && NULL != result) {
    if (NULL != desc) {
      /* only hand out the descriptor if the recorded registry slot still refers to this code */
      if (NULL != internal_registry && NULL != internal_registry_keys && result->registered < (LIBXSMM_CAPACITY_REGISTRY)
#if defined(LIBXSMM_HASH_COLLISION)
        && code.uval == (~LIBXSMM_HASH_COLLISION & internal_registry[result->registered].uval)
#else
        && code.ptr_const == internal_registry[result->registered].ptr_const
#endif
        && internal_registry_keys[result->registered].kind < LIBXSMM_KERNEL_UNREGISTERED)
      {
        *desc = internal_registry_keys + result->registered;
      }
      else *desc = NULL;
    }
  }
  else { /* no extra-info available: zero the outputs */
    LIBXSMM_ASSERT(NULL == result);
    if (NULL != code_size) *code_size = 0;
    if (NULL != desc) *desc = NULL;
  }
  return result;
}


/** Queries kind, code size, and flop count of a registered kernel ("kernel" is the
 *  code address). Continues on the next source segment. */
LIBXSMM_API int libxsmm_get_kernel_info(const void* kernel, libxsmm_kernel_info* info)
{
  int result;
  const libxsmm_kernel_xinfo* xinfo;
  libxsmm_kernel_info result_info;
  const libxsmm_descriptor* desc;
  libxsmm_code_pointer code;
code.ptr_const = kernel;
  LIBXSMM_MEMZERO127(&result_info);
  xinfo = libxsmm_get_kernel_xinfo(code, &desc, &result_info.code_size);
  if (NULL != xinfo) {
    if (NULL != desc) { /* kernel is still present in the registry */
      const libxsmm_kernel_kind kind = (libxsmm_kernel_kind)desc->kind;
      result_info.kind = kind;
      if (LIBXSMM_KERNEL_KIND_USER == kind) {
        result_info.code_size = 0; /* invalid */
      }
    }
    else { /* extra-info exists, but no matching registry entry */
      result_info.kind = LIBXSMM_KERNEL_UNREGISTERED;
    }
    result_info.nflops = xinfo->nflops;
    LIBXSMM_ASSIGN127(info, &result_info);
    result = EXIT_SUCCESS;
  }
  else {
    LIBXSMM_ASSERT(NULL == desc);
    if (NULL != info) { /* report zeroed info, but signal failure */
      LIBXSMM_ASSIGN127(info, &result_info);
      result = EXIT_FAILURE;
    }
    else { /* nothing was requested */
      result = EXIT_SUCCESS;
    }
  }
  return result;
}


/** Populates "info" with the properties (precisions, prefetch, flags, leading
 *  dimensions, and M/N/K) recorded for a JIT'ted GEMM kernel; fails (EXIT_FAILURE)
 *  for a NULL/unknown kernel or NULL "info", printing a one-time error message
 *  when libxsmm_verbosity is non-zero. */
LIBXSMM_API int libxsmm_get_mmkernel_info(libxsmm_xmmfunction kernel, libxsmm_mmkernel_info* info)
{
  libxsmm_code_pointer code;
  static int error_once = 0;
  int result;
  code.xgemm = kernel;
  if (NULL != info) {
    const libxsmm_descriptor* desc;
    if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) &&
        NULL != desc && LIBXSMM_KERNEL_KIND_MATMUL == desc->kind)
    { /* copy the recorded GEMM descriptor into the public info structure */
      info->iprecision = (libxsmm_gemm_precision)LIBXSMM_GETENUM_INP(desc->gemm.desc.datatype);
      info->oprecision = (libxsmm_gemm_precision)LIBXSMM_GETENUM_OUT(desc->gemm.desc.datatype);
      info->prefetch = (libxsmm_gemm_prefetch_type)desc->gemm.desc.prefetch;
      info->flags = desc->gemm.desc.flags;
      info->lda = desc->gemm.desc.lda;
      info->ldb = desc->gemm.desc.ldb;
      info->ldc = desc->gemm.desc.ldc;
      info->m = desc->gemm.desc.m;
      info->n = desc->gemm.desc.n;
      info->k = desc->gemm.desc.k;
      result = EXIT_SUCCESS;
    }
    else {
      if (0 != libxsmm_verbosity /* library code is expected to be mute */
        && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
      {
        if (NULL == code.ptr_const) {
          fprintf(stderr, "LIBXSMM ERROR: NULL-kernel cannot be inspected!\n");
        }
        else {
          fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n");
        }
      }
      result = EXIT_FAILURE;
    }
  }
  else {
    if (0 != libxsmm_verbosity /* library code is expected to be mute */
      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
    {
      fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n");
    }
    result = EXIT_FAILURE;
  }
  return result;
}


/** Populates "info" with the properties (typesize, LDO, M/N) recorded for a
 *  JIT'ted transpose kernel; error handling mirrors libxsmm_get_mmkernel_info. */
LIBXSMM_API int libxsmm_get_transkernel_info(libxsmm_xtransfunction kernel, libxsmm_transkernel_info* info)
{
  libxsmm_code_pointer code;
  static int error_once = 0;
  int result;
  code.xtrans = kernel;
  if (NULL != info) {
    const libxsmm_descriptor* desc;
    if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) &&
        NULL != desc && LIBXSMM_KERNEL_KIND_TRANS == desc->kind)
    { /* copy the recorded transpose descriptor into the public info structure */
      info->typesize = desc->trans.desc.typesize;
      info->ldo = desc->trans.desc.ldo;
      info->m = desc->trans.desc.m;
      info->n = desc->trans.desc.n;
      result = EXIT_SUCCESS;
    }
    else {
      if (0 != libxsmm_verbosity /* library code is expected to be mute */
        && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
      {
        fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n");
      }
      result = EXIT_FAILURE;
    }
  }
  else {
    if (0 != libxsmm_verbosity /* library code is expected to be mute */
      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
    {
      fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n");
    }
    result = EXIT_FAILURE;
  }
  return result;
}


/** Populates "info" with the properties (typesize, prefetch, flags, LDI/LDO, M/N)
 *  recorded for a JIT'ted matrix-copy kernel. Continues on the next source segment. */
LIBXSMM_API int libxsmm_get_mcopykernel_info(libxsmm_xmcopyfunction kernel, libxsmm_mcopykernel_info* info)
{
  libxsmm_code_pointer code;
  static int error_once = 0;
  int result;
  code.xmatcopy = kernel;
  if (NULL != info) {
    const libxsmm_descriptor* desc;
    if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) &&
        NULL != desc && LIBXSMM_KERNEL_KIND_MCOPY == desc->kind)
    { /* copy the recorded matcopy descriptor into the public info structure */
      info->typesize = desc->mcopy.desc.typesize;
      info->prefetch = desc->mcopy.desc.prefetch;
      info->flags = desc->mcopy.desc.flags;
      info->ldi = desc->mcopy.desc.ldi;
      info->ldo = desc->mcopy.desc.ldo;
      info->m = desc->mcopy.desc.m;
      info->n = desc->mcopy.desc.n;
      result = EXIT_SUCCESS;
    }
    else {
      if (0 != libxsmm_verbosity /* library code is expected to be mute */
        && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once,
1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n"); } result = EXIT_FAILURE; } } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n"); } result = EXIT_FAILURE; } return result; } LIBXSMM_API int libxsmm_get_meltwkernel_info(libxsmm_xmeltwfunction kernel, libxsmm_meltwkernel_info* info) { libxsmm_code_pointer code; static int error_once = 0; int result; code.xmateltw = kernel; if (NULL != info) { const libxsmm_descriptor* desc; if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) && NULL != desc && LIBXSMM_KERNEL_KIND_MELTW == desc->kind) { info->datatype = desc->meltw.desc.datatype; info->operation = desc->meltw.desc.operation; info->flags = desc->meltw.desc.flags; info->ldi = desc->meltw.desc.ldi; info->ldo = desc->meltw.desc.ldo; info->m = desc->meltw.desc.m; info->n = desc->meltw.desc.n; result = EXIT_SUCCESS; } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n"); } result = EXIT_FAILURE; } } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n"); } result = EXIT_FAILURE; } return result; } LIBXSMM_API int libxsmm_get_registry_info(libxsmm_registry_info* info) { int result = EXIT_SUCCESS; LIBXSMM_INIT /* verbosity */ if (0 != info && 0 != internal_registry) { size_t i; LIBXSMM_MEMZERO127(info); /* info->nstatic = 0; info->size = 0; */ info->nbytes = (LIBXSMM_CAPACITY_REGISTRY) * (sizeof(libxsmm_code_pointer) + sizeof(libxsmm_descriptor)); info->capacity = LIBXSMM_CAPACITY_REGISTRY; #if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < 
(LIBXSMM_CACHE_MAXSIZE)) info->ncache = internal_cache_size; #else info->ncache = 0; #endif for (i = 0; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) { libxsmm_code_pointer code = internal_registry[i]; if (0 != code.ptr_const && EXIT_SUCCESS == result) { if (0 == (LIBXSMM_CODE_STATIC & code.uval)) { /* check for allocated/generated JIT-code */ size_t buffer_size = 0; void* buffer = 0; #if defined(LIBXSMM_HASH_COLLISION) code.uval &= ~LIBXSMM_HASH_COLLISION; /* clear collision flag */ #endif result = libxsmm_get_malloc_xinfo(code.ptr_const, &buffer_size, NULL/*flags*/, &buffer); if (EXIT_SUCCESS == result) { info->nbytes += LIBXSMM_UP2(buffer_size + (((char*)code.ptr_const) - (char*)buffer), LIBXSMM_PAGE_MINSIZE); } } else { ++info->nstatic; } ++info->size; } } } else { result = EXIT_FAILURE; } return result; } LIBXSMM_API void* libxsmm_xregister(const void* key, size_t key_size, size_t value_size, const void* value_init) { static int error_once = 0; void* result; LIBXSMM_INIT /* verbosity */ if (NULL != key && 0 < key_size && LIBXSMM_DESCRIPTOR_MAXSIZE >= key_size) { libxsmm_descriptor wrap; void* dst; #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */ LIBXSMM_MEMSET127(&wrap, 0, key_size); #endif LIBXSMM_MEMCPY127(wrap.user.desc, key, key_size); wrap.kind = LIBXSMM_KERNEL_KIND_USER; dst = internal_find_code(&wrap, key_size, value_size).ptr; if (NULL != dst) { size_t size; if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(dst, &size, NULL/*flags*/, NULL/*extra*/) && value_size <= size) { if (NULL != value_init) memcpy(dst, value_init, value_size); result = dst; } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: value too large for previously registered key!\n"); } result = NULL; } } else result = NULL; } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, 
/* NOTE(review): continuation of libxsmm_xregister's error path from the
** previous chunk; distinguishes invalid arguments from an oversized key. */
LIBXSMM_ATOMIC_RELAXED)) {
      if (LIBXSMM_DESCRIPTOR_MAXSIZE >= key_size) {
        fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xregister specified!\n");
      }
      else {
        fprintf(stderr, "LIBXSMM ERROR: libxsmm_xregister has maximum key-size of %i Byte!\n", LIBXSMM_DESCRIPTOR_MAXSIZE);
      }
    }
    result = NULL;
  }
  return result;
}

/** Looks up a previously registered user-value by its binary key and returns
 *  the registry-owned storage, or NULL if not found. Argument validation is
 *  compiled in only for debug builds (NDEBUG not defined). */
LIBXSMM_API void* libxsmm_xdispatch(const void* key, size_t key_size)
{
  void* result;
  LIBXSMM_INIT /* verbosity */
#if !defined(NDEBUG)
  if (NULL != key && 0 < key_size && LIBXSMM_DESCRIPTOR_MAXSIZE >= key_size)
#endif
  {
    libxsmm_descriptor wrap;
#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
    LIBXSMM_MEMSET127(&wrap, 0, key_size);
#endif
    LIBXSMM_MEMCPY127(wrap.user.desc, key, key_size);
    wrap.kind = LIBXSMM_KERNEL_KIND_USER;
    result = internal_find_code(&wrap, key_size, 0/*user_size*/).ptr;
  }
#if !defined(NDEBUG)
  else {
    static int error_once = 0;
    if (0 != libxsmm_verbosity /* library code is expected to be mute */
      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
    {
      fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xdispatch specified!\n");
    }
    result = NULL;
  }
#endif
  return result;
}

/** Releases the user-registered value stored under the given key
 *  (lookup via libxsmm_xdispatch, then kernel release). */
LIBXSMM_API void libxsmm_xrelease(const void* key, size_t key_size)
{
  libxsmm_release_kernel(libxsmm_xdispatch(key, key_size));
}

/** Central GEMM dispatch: wraps the descriptor, resolves the prefetch
 *  strategy when the descriptor requests AUTO (sign bit of the byte set),
 *  and returns the (possibly JIT-generated) kernel from the registry.
 *  A NULL descriptor is quietly accepted and yields a NULL function. */
LIBXSMM_API libxsmm_xmmfunction libxsmm_xmmdispatch(const libxsmm_gemm_descriptor* descriptor)
{
  libxsmm_xmmfunction result;
  LIBXSMM_INIT /* verbosity */
#if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
  LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
#endif
  if (NULL != descriptor) {
    libxsmm_descriptor wrap;
#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
    LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
#endif
    LIBXSMM_ASSIGN127(&wrap.gemm.desc, descriptor);
    wrap.kind = LIBXSMM_KERNEL_KIND_MATMUL;
    if (0 != (0x80 & descriptor->prefetch)) { /* "sign"-bit of byte-value is set */
      wrap.gemm.desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
    }
    result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xgemm;
#if defined(_DEBUG)
    if (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity && INT_MAX != libxsmm_verbosity && NULL != result.xmm) {
      LIBXSMM_STDIO_ACQUIRE();
      fprintf(stderr, "\nLIBXSMM: ");
      libxsmm_gemm_xprint(stderr, result, NULL/*a*/, NULL/*b*/, NULL/*c*/);
      LIBXSMM_STDIO_RELEASE();
    }
#endif
  }
  else { /* quietly accept NULL-descriptor */
    result.xmm = NULL;
  }
  return result;
}

/** Typed dispatch (double alpha/beta): builds a dgemm descriptor - NULL
 *  lda/ldb/ldc/alpha/beta/flags/prefetch select defaults, leading dimensions
 *  honor TRANS_A/TRANS_B - and returns the '.dmm' kernel variant. */
LIBXSMM_API libxsmm_dmmfunction libxsmm_dmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const double* alpha, const double* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
  libxsmm_descriptor_blob blob;
  const libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
  return result.dmm;
}

/** Typed dispatch (float alpha/beta): sgemm descriptor, '.smm' variant.
 *  NOTE(review): function continues past this chunk boundary. */
LIBXSMM_API libxsmm_smmfunction libxsmm_smmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const float* alpha, const float* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
  libxsmm_descriptor_blob blob;
  const libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ?
/* NOTE(review): continuation of libxsmm_smmdispatch from the previous chunk. */
  *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
  return result.smm;
}

/** Typed dispatch: bsgemm descriptor (default flags add VNNI_A), '.bsmm' variant. */
LIBXSMM_API libxsmm_bsmmfunction libxsmm_bsmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const float* alpha, const float* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  const libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
  return result.bsmm;
}

/** Typed dispatch: bgemm descriptor (default flags add VNNI_A), '.bmm' variant. */
LIBXSMM_API libxsmm_bmmfunction libxsmm_bmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const float* alpha, const float* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  const libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
  return result.wimm;
}
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.sububmm; } LIBXSMM_API libxsmm_dmmfunction_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.dmra; } LIBXSMM_API libxsmm_smmfunction_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.smra; } LIBXSMM_API libxsmm_bsmmfunction_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.bsmra; } LIBXSMM_API libxsmm_bmmfunction_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.bmra; } LIBXSMM_API libxsmm_wimmfunction_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.wimra; } LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.ssbimra; } LIBXSMM_API libxsmm_usbimmfunction_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.usbimra; } LIBXSMM_API libxsmm_subimmfunction_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.subimra; } LIBXSMM_API libxsmm_uubimmfunction_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.uubimra; } LIBXSMM_API libxsmm_sububmmfunction_reducebatch_addr libxsmm_sububmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.sububmra; } LIBXSMM_API libxsmm_dmmfunction_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.dmra; } LIBXSMM_API libxsmm_smmfunction_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? 
k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.smra; } LIBXSMM_API libxsmm_bsmmfunction_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.bsmra; } LIBXSMM_API libxsmm_bmmfunction_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? 
LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.bmra; } LIBXSMM_API libxsmm_wimmfunction_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? 
unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.wimra; } LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.ssbimra; } LIBXSMM_API libxsmm_usbimmfunction_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.usbimra; } LIBXSMM_API libxsmm_subimmfunction_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.subimra; } LIBXSMM_API libxsmm_uubimmfunction_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? 
*lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.uubimra; } LIBXSMM_API libxsmm_sububmmfunction_reducebatch_addr libxsmm_sububmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? 
unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.sububmra; } LIBXSMM_API libxsmm_dmmfunction_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.dmro; } LIBXSMM_API libxsmm_smmfunction_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.smro; } LIBXSMM_API libxsmm_bsmmfunction_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.bsmro; } LIBXSMM_API libxsmm_bmmfunction_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.bmro; } LIBXSMM_API libxsmm_wimmfunction_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.wimro; } LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.ssbimro; } LIBXSMM_API libxsmm_usbimmfunction_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.usbimro; } LIBXSMM_API libxsmm_subimmfunction_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.subimro; } LIBXSMM_API libxsmm_uubimmfunction_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.uubimro; } LIBXSMM_API libxsmm_sububmmfunction_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); return result.sububmro; } LIBXSMM_API libxsmm_dmmfunction_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.dmro; } LIBXSMM_API libxsmm_smmfunction_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? 
k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.smro; } LIBXSMM_API libxsmm_bsmmfunction_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.bsmro; } LIBXSMM_API libxsmm_bmmfunction_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? 
LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.bmro; } LIBXSMM_API libxsmm_wimmfunction_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? 
unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.wimro; } LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.ssbimro; } LIBXSMM_API libxsmm_usbimmfunction_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.usbimro; } LIBXSMM_API libxsmm_subimmfunction_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.subimro; } LIBXSMM_API libxsmm_uubimmfunction_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, NULL != lda ? 
*lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.uubimro; } LIBXSMM_API libxsmm_sububmmfunction_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c3 = (unsigned char)(unroll_hint < 127 ? 
unroll_hint : 0); result = libxsmm_xmmdispatch(desc); return result.sububmro; } LIBXSMM_API libxsmm_dmmfunction_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b; if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; } result = libxsmm_xmmdispatch(desc); return result.dmrs; } LIBXSMM_API libxsmm_smmfunction_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b; if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; } result = libxsmm_xmmdispatch(desc); return result.smrs; } LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b; if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; } result = libxsmm_xmmdispatch(desc); return result.bsmrs; } LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? 
LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); /*const*/ libxsmm_xmmfunction result; desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b; if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; } result = libxsmm_xmmdispatch(desc); return result.bmrs; } LIBXSMM_API libxsmm_wimmfunction_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch) { const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags); libxsmm_descriptor_blob blob; /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
/* Tail of libxsmm_wimmdispatch_reducebatch_strd (head precedes this chunk):
** completes the descriptor, stores byte-strides in c1/c2, rejects negative
** strides, and JIT-dispatches the strided batch-reduce kernel. */
*beta : LIBXSMM_BETA, gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
/*const*/ libxsmm_xmmfunction result;
desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
result = libxsmm_xmmdispatch(desc);
return result.wimrs;
}


/* Dispatch a strided batch-reduce int8 GEMM kernel (signed A, signed B).
** Defaults: VNNI-formatted A, lda/ldb/ldc derived from m/n/k and transpose flags.
** NOTE(review): desc is dereferenced (c1/c2) without a NULL check although the
** descriptor initializer can presumably fail -- confirm upstream intent. */
LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const int* alpha, const int* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.ssbimrs;
}


/* Same as above with unsigned A operand (LIBXSMM_GEMM_FLAG_A_UNSIGNED). */
LIBXSMM_API libxsmm_usbimmfunction_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const int* alpha, const int* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.usbimrs;
}


/* Same as above with unsigned B operand (LIBXSMM_GEMM_FLAG_B_UNSIGNED). */
LIBXSMM_API libxsmm_subimmfunction_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const int* alpha, const int* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.subimrs;
}


/* Same as above with both operands unsigned (LIBXSMM_GEMM_FLAG_AB_UNSIGNED). */
LIBXSMM_API libxsmm_uubimmfunction_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const int* alpha, const int* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.sububmrs;
}
/* NOTE(review): the function above returns result.sububmrs per the original
** bytes; whether uubimrs was intended cannot be told from this chunk alone. */


/* int8 variant with unsigned B and unsigned C (uses the bbgemm descriptor). */
LIBXSMM_API libxsmm_sububmmfunction_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const int* alpha, const int* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.sububmrs;
}


/* The *_unroll variants below additionally accept unroll_hint, stored in
** desc->c3 (clamped to 0 when >= 127, i.e. "let the JIT decide"). */

/* double-precision strided batch-reduce dispatch with unroll hint */
LIBXSMM_API libxsmm_dmmfunction_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const double* alpha, const double* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.dmrs;
}


/* single-precision strided batch-reduce dispatch with unroll hint */
LIBXSMM_API libxsmm_smmfunction_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const float* alpha, const float* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.smrs;
}


/* bf16-in/f32-out strided batch-reduce dispatch with unroll hint */
LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const float* alpha, const float* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.bsmrs;
}


/* bf16-in/bf16-out strided batch-reduce dispatch with unroll hint */
LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const float* alpha, const float* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.bmrs;
}


/* int16-in/int32-out strided batch-reduce dispatch with unroll hint */
LIBXSMM_API libxsmm_wimmfunction_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const int* alpha, const int* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.wimrs;
}


/* int8(signed)xint8(signed) strided batch-reduce dispatch with unroll hint */
LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const int* alpha, const int* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.ssbimrs;
}


/* unsigned-A int8 strided batch-reduce dispatch with unroll hint */
LIBXSMM_API libxsmm_usbimmfunction_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const int* alpha, const int* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.usbimrs;
}


/* unsigned-B int8 strided batch-reduce dispatch with unroll hint */
LIBXSMM_API libxsmm_subimmfunction_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const int* alpha, const int* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.subimrs;
}


/* unsigned-A+B int8 strided batch-reduce dispatch with unroll hint */
LIBXSMM_API libxsmm_uubimmfunction_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const int* alpha, const int* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.uubimrs;
}


/* unsigned-B/unsigned-C int8 strided batch-reduce dispatch with unroll hint */
LIBXSMM_API libxsmm_sububmmfunction_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
  const int* alpha, const int* beta, const int* flags, const int* prefetch)
{
  const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
  libxsmm_descriptor_blob blob;
  /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
    NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
    NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
    NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
    gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
  /*const*/ libxsmm_xmmfunction result;
  desc->c1 = (unsigned long long)stride_a; desc->c2 = (unsigned long long)stride_b;
  desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
  if ( (stride_a < 0) || (stride_b < 0) ) { return NULL; }
  result = libxsmm_xmmdispatch(desc);
  return result.sububmrs;
}


/* Dispatch (lookup-or-JIT via internal_find_code) a matrix-copy kernel. */
LIBXSMM_API libxsmm_xmcopyfunction libxsmm_dispatch_mcopy(const libxsmm_mcopy_descriptor* descriptor)
{
  libxsmm_xmcopyfunction result;
  LIBXSMM_INIT /* verbosity */
#if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
  LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
#endif
  if (NULL != descriptor) {
    libxsmm_descriptor wrap;
#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
    LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
#endif
    LIBXSMM_ASSIGN127(&wrap.mcopy.desc, descriptor);
    wrap.kind = LIBXSMM_KERNEL_KIND_MCOPY;
#if defined(_WIN32) || defined(__CYGWIN__)
    /* prefetch variant not supported on these platforms */
    wrap.mcopy.desc.prefetch = 0;
#endif
    result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xmatcopy;
  }
  else {
    result = NULL;
  }
  return result;
}


/* Dispatch a matrix element-wise (meltw) kernel; result is a union, hence
** the member-wise NULL assignment in the failure branch. */
LIBXSMM_API libxsmm_xmeltwfunction libxsmm_dispatch_meltw(const libxsmm_meltw_descriptor* descriptor)
{
  libxsmm_xmeltwfunction result;
  LIBXSMM_INIT /* verbosity */
#if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
  LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
#endif
  if (NULL != descriptor) {
    libxsmm_descriptor wrap;
#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
    LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
#endif
    LIBXSMM_ASSIGN127(&wrap.meltw.desc, descriptor);
    wrap.kind = LIBXSMM_KERNEL_KIND_MELTW;
    result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xmateltw;
  }
  else {
    result.xmeltw = NULL;
  }
  return result;
}


/* return type of libxsmm_dispatch_meltw_copy; its body continues below */
LIBXSMM_API libxsmm_meltwfunction_copy
/* Body of libxsmm_dispatch_meltw_copy (return type precedes this chunk):
** element-wise COPY kernel; ldi/ldo default to m when NULL. */
libxsmm_dispatch_meltw_copy(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type)
{
  libxsmm_descriptor_blob blob;
  const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob, in_type, out_type,
    m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo, 0, LIBXSMM_MELTW_OPERATION_COPY);
  libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
  return result.meltw_copy;
}


/* element-wise ZERO kernel */
LIBXSMM_API libxsmm_meltwfunction_zero libxsmm_dispatch_meltw_zero(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type)
{
  libxsmm_descriptor_blob blob;
  const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob, in_type, out_type,
    m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo, 0, LIBXSMM_MELTW_OPERATION_ZERO);
  libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
  return result.meltw_zero;
}


/* element-wise ADD kernel */
LIBXSMM_API libxsmm_meltwfunction_add libxsmm_dispatch_meltw_add(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type)
{
  libxsmm_descriptor_blob blob;
  const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob, in_type, out_type,
    m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo, 0, LIBXSMM_MELTW_OPERATION_ADD);
  libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
  return result.meltw_add;
}


/* element-wise MUL kernel */
LIBXSMM_API libxsmm_meltwfunction_mul libxsmm_dispatch_meltw_mul(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type)
{
  libxsmm_descriptor_blob blob;
  const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob, in_type, out_type,
    m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo, 0, LIBXSMM_MELTW_OPERATION_MUL);
  libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
  return result.meltw_mul;
}


/* element-wise ReLU kernel */
LIBXSMM_API libxsmm_meltwfunction_relu libxsmm_dispatch_meltw_relu(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type)
{
  libxsmm_descriptor_blob blob;
  const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob, in_type, out_type,
    m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo, 0, LIBXSMM_MELTW_OPERATION_RELU);
  libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
  return result.meltw_relu;
}


/* fp32 -> bf16 down-convert kernel */
LIBXSMM_API libxsmm_meltwfunction_cvtfp32bf16 libxsmm_dispatch_meltw_cvtfp32bf16(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type)
{
  libxsmm_descriptor_blob blob;
  const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob, in_type, out_type,
    m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo, 0, LIBXSMM_MELTW_OPERATION_CVTFP32BF16);
  libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
  return result.meltw_cvtfp32bf16;
}


/* fused fp32->bf16 convert followed by activation; flags select the activation */
LIBXSMM_API libxsmm_meltwfunction_cvtfp32bf16_act libxsmm_dispatch_meltw_cvtfp32bf16_act(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_cvta_flags flags)
{
  libxsmm_descriptor_blob blob;
  const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob, in_type, out_type,
    m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
    libxsmm_get_meltw_comp_cvta_flags( flags ), LIBXSMM_MELTW_OPERATION_CVTFP32BF16_ACT);
  libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
  return result.meltw_cvtfp32bf16_act;
}


/* fused activation followed by fp32->bf16 convert */
LIBXSMM_API libxsmm_meltwfunction_act_cvtfp32bf16 libxsmm_dispatch_meltw_act_cvtfp32bf16(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_acvt_flags flags)
{
  libxsmm_descriptor_blob blob;
  const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob, in_type, out_type,
    m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
    libxsmm_get_meltw_comp_acvt_flags( flags ), LIBXSMM_MELTW_OPERATION_ACT_CVTFP32BF16);
  libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
  return result.meltw_act_cvtfp32bf16;
}


/* element-wise reduction kernel; flags select rows/cols, op and accumulator */
LIBXSMM_API libxsmm_meltwfunction_reduce libxsmm_dispatch_meltw_reduce(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_redu_flags flags)
{
  libxsmm_descriptor_blob blob;
  const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob, in_type, out_type,
    m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
    libxsmm_get_meltw_comp_redu_flags( flags ), LIBXSMM_MELTW_OPERATION_REDUCE);
  libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
  return result.meltw_reduce;
}


/* element-wise scale kernel; flags select scale/shift/bias combination */
LIBXSMM_API libxsmm_meltwfunction_scale libxsmm_dispatch_meltw_scale(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_scal_flags flags)
{
  libxsmm_descriptor_blob blob;
  const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob, in_type, out_type,
    m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
    libxsmm_get_meltw_comp_scal_flags( flags ), LIBXSMM_MELTW_OPERATION_SCALE);
  libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
  return result.meltw_scale;
}


/* Dispatch an out-of-place transpose kernel for the given descriptor. */
LIBXSMM_API libxsmm_xtransfunction libxsmm_dispatch_trans(const libxsmm_trans_descriptor* descriptor)
{
  libxsmm_xtransfunction result;
  LIBXSMM_INIT /* verbosity */
#if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
  LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
#endif
  if (NULL != descriptor) {
    libxsmm_descriptor wrap;
#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
    LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
#endif
    LIBXSMM_ASSIGN127(&wrap.trans.desc, descriptor);
    wrap.kind = LIBXSMM_KERNEL_KIND_TRANS;
    result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xtrans;
  }
  else {
    result = NULL;
  }
  return result;
}


/* Dispatch a packed-GEMM kernel.
** NOTE(review): local is declared libxsmm_trmm_xfunction although the return
** type is libxsmm_pgemm_xfunction -- presumably compatible code-pointer
** typedefs; verify against libxsmm_typedefs.h. */
LIBXSMM_API libxsmm_pgemm_xfunction libxsmm_dispatch_pgemm(const libxsmm_pgemm_descriptor* descriptor)
{
  libxsmm_trmm_xfunction result;
  LIBXSMM_INIT /* verbosity */
#if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
  LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
#endif
  if (NULL != descriptor) {
    libxsmm_descriptor wrap;
#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
    LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
#endif
    LIBXSMM_ASSIGN127(&wrap.pgemm.desc, descriptor);
    wrap.kind = LIBXSMM_KERNEL_KIND_PGEMM;
    result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xpgemm;
  }
  else {
    result = NULL;
  }
  return result;
}


/* Dispatch an LU-factorization (GETRF) kernel; body continues past this chunk. */
LIBXSMM_API libxsmm_getrf_xfunction libxsmm_dispatch_getrf(const libxsmm_getrf_descriptor* descriptor)
{
  libxsmm_trmm_xfunction result;
  LIBXSMM_INIT /* verbosity */
#if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
  LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
#endif
  if (NULL != descriptor) {
    libxsmm_descriptor wrap;
#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
    /* tail of libxsmm_dispatch_getrf (head precedes this chunk) */
    LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
#endif
    LIBXSMM_ASSIGN127(&wrap.getrf.desc, descriptor);
    wrap.kind = LIBXSMM_KERNEL_KIND_GETRF;
    result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xgetrf;
  }
  else {
    result = NULL;
  }
  return result;
}


/* Dispatch a triangular matrix-multiply (TRMM) kernel. */
LIBXSMM_API libxsmm_trmm_xfunction libxsmm_dispatch_trmm(const libxsmm_trmm_descriptor* descriptor)
{
  libxsmm_trmm_xfunction result;
  LIBXSMM_INIT /* verbosity */
#if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
  LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
#endif
  if (NULL != descriptor) {
    libxsmm_descriptor wrap;
#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
    LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
#endif
    LIBXSMM_ASSIGN127(&wrap.trmm.desc, descriptor);
    wrap.kind = LIBXSMM_KERNEL_KIND_TRMM;
    result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xtrmm;
  }
  else {
    result = NULL;
  }
  return result;
}


/* Dispatch a triangular solve (TRSM) kernel. */
LIBXSMM_API libxsmm_trsm_xfunction libxsmm_dispatch_trsm(const libxsmm_trsm_descriptor* descriptor)
{
  libxsmm_trsm_xfunction result;
  LIBXSMM_INIT /* verbosity */
#if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
  LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
#endif
  if (NULL != descriptor) {
    libxsmm_descriptor wrap;
#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
    LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
#endif
    LIBXSMM_ASSIGN127(&wrap.trsm.desc, descriptor);
    wrap.kind = LIBXSMM_KERNEL_KIND_TRSM;
    result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xtrsm;
  }
  else {
    result = NULL;
  }
  return result;
}


/* Build a sparse (CSR) SOA kernel; kernels from libxsmm_build are not
** registry-managed. A set prefetch "sign"-bit requests the AUTO strategy. */
LIBXSMM_API libxsmm_xmmfunction libxsmm_create_xcsr_soa(const libxsmm_gemm_descriptor* descriptor, const unsigned int* row_ptr, const unsigned int* column_idx, const void* values, unsigned int packed_width)
{
  libxsmm_code_pointer result = { 0 };
  LIBXSMM_INIT
  if (NULL != descriptor && NULL != row_ptr && NULL != column_idx && NULL != values) {
    libxsmm_csr_soa_descriptor srsoa;
    libxsmm_build_request request;
    libxsmm_gemm_descriptor desc;
    if (0 == (0x80 & descriptor->prefetch)) {
      srsoa.gemm = descriptor;
    }
    else { /* "sign"-bit of byte-value is set */
      LIBXSMM_ASSIGN127(&desc, descriptor);
      desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
      srsoa.gemm = &desc;
    }
    srsoa.row_ptr = row_ptr;
    srsoa.column_idx = column_idx;
    srsoa.values = values;
    srsoa.packed_width = packed_width;
    request.descriptor.srsoa = &srsoa;
    request.kind = LIBXSMM_BUILD_KIND_SRSOA;
    libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
  }
  return result.xgemm;
}


/* Build a sparse (CSC) SOA kernel; same prefetch handling as the CSR case. */
LIBXSMM_API libxsmm_xmmfunction libxsmm_create_xcsc_soa(const libxsmm_gemm_descriptor* descriptor, const unsigned int* column_ptr, const unsigned int* row_idx, const void* values, unsigned int packed_width)
{
  libxsmm_code_pointer result = { 0 };
  LIBXSMM_INIT
  if (NULL != descriptor && NULL != column_ptr && NULL != row_idx && NULL != values) {
    libxsmm_csc_soa_descriptor scsoa;
    libxsmm_build_request request;
    libxsmm_gemm_descriptor desc;
    if (0 == (0x80 & descriptor->prefetch)) {
      scsoa.gemm = descriptor;
    }
    else { /* "sign"-bit of byte-value is set */
      LIBXSMM_ASSIGN127(&desc, descriptor);
      desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
      scsoa.gemm = &desc;
    }
    scsoa.column_ptr = column_ptr;
    scsoa.row_idx = row_idx;
    scsoa.values = values;
    scsoa.packed_width = packed_width;
    request.descriptor.scsoa = &scsoa;
    request.kind = LIBXSMM_BUILD_KIND_SCSOA;
    libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
  }
  return result.xgemm;
}


/* Build a packed row-major GEMM kernel with packed A and C. */
LIBXSMM_API libxsmm_xmmfunction libxsmm_create_pgemm_ac_rm(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width)
{
  libxsmm_code_pointer result = { 0 };
  LIBXSMM_INIT
  if (NULL != descriptor) {
    libxsmm_pgemm_ac_rm_descriptor pgemmacrm;
    libxsmm_build_request request;
    libxsmm_gemm_descriptor desc;
    if (0 == (0x80 & descriptor->prefetch)) {
      pgemmacrm.gemm = descriptor;
    }
    else { /* "sign"-bit of byte-value is set */
      LIBXSMM_ASSIGN127(&desc, descriptor);
      desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
      pgemmacrm.gemm = &desc;
    }
    pgemmacrm.packed_width = packed_width;
    request.descriptor.pgemmacrm = &pgemmacrm;
    request.kind = LIBXSMM_BUILD_KIND_PGEMMRMAC;
    libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
  }
  return result.xgemm;
}


/* Build a packed row-major GEMM kernel with packed B and C. */
LIBXSMM_API libxsmm_xmmfunction libxsmm_create_pgemm_bc_rm(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width)
{
  libxsmm_code_pointer result = { 0 };
  LIBXSMM_INIT
  if (NULL != descriptor) {
    libxsmm_pgemm_bc_rm_descriptor pgemmbcrm;
    libxsmm_build_request request;
    libxsmm_gemm_descriptor desc;
    if (0 == (0x80 & descriptor->prefetch)) {
      pgemmbcrm.gemm = descriptor;
    }
    else { /* "sign"-bit of byte-value is set */
      LIBXSMM_ASSIGN127(&desc, descriptor);
      desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
      pgemmbcrm.gemm = &desc;
    }
    pgemmbcrm.packed_width = packed_width;
    request.descriptor.pgemmbcrm = &pgemmbcrm;
    request.kind = LIBXSMM_BUILD_KIND_PGEMMRMBC;
    libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
  }
  return result.xgemm;
}


/* Build a double-precision CSR register-blocked sparse kernel (values baked in). */
LIBXSMM_API libxsmm_dmmfunction libxsmm_create_dcsr_reg(const libxsmm_gemm_descriptor* descriptor, const unsigned int* row_ptr, const unsigned int* column_idx, const double* values)
{
  libxsmm_code_pointer result = { 0 };
  LIBXSMM_INIT
  if (NULL != descriptor && NULL != row_ptr && NULL != column_idx && NULL != values) {
    libxsmm_csr_reg_descriptor sreg;
    libxsmm_build_request request;
    libxsmm_gemm_descriptor desc;
    if (0 == (0x80 & descriptor->prefetch)) {
      sreg.gemm = descriptor;
    }
    else { /* "sign"-bit of byte-value is set */
      LIBXSMM_ASSIGN127(&desc, descriptor);
      desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
      sreg.gemm = &desc;
    }
    sreg.row_ptr = row_ptr;
    sreg.column_idx = column_idx;
    sreg.values = values;
    request.descriptor.sreg = &sreg;
    request.kind = LIBXSMM_BUILD_KIND_SREG;
    libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
  }
  return result.xgemm.dmm;
}


/* Single-precision CSR register-blocked kernel: values are widened to double
** into a temporary buffer (freed after the build) before code generation. */
LIBXSMM_API libxsmm_smmfunction libxsmm_create_scsr_reg(const libxsmm_gemm_descriptor* descriptor, const unsigned int* row_ptr, const unsigned int* column_idx, const float* values)
{
  libxsmm_code_pointer result = { 0 };
  LIBXSMM_INIT
  if (NULL != descriptor && NULL != row_ptr && NULL != column_idx && NULL != values) {
    libxsmm_csr_reg_descriptor sreg;
    libxsmm_build_request request;
    const unsigned int n = row_ptr[descriptor->m];
    double *const d_values = (double*)(0 != n ? malloc(n * sizeof(double)) : NULL);
    if (NULL != d_values) {
      libxsmm_gemm_descriptor desc;
      unsigned int i;
      /* we need to copy the values into a double precision buffer */
      for (i = 0; i < n; ++i) d_values[i] = (double)values[i];
      if (0 == (0x80 & descriptor->prefetch)) {
        sreg.gemm = descriptor;
      }
      else { /* "sign"-bit of byte-value is set */
        LIBXSMM_ASSIGN127(&desc, descriptor);
        desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
        sreg.gemm = &desc;
      }
      sreg.row_ptr = row_ptr;
      sreg.column_idx = column_idx;
      sreg.values = d_values;
      request.descriptor.sreg = &sreg;
      request.kind = LIBXSMM_BUILD_KIND_SREG;
      libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
      free(d_values);
    }
  }
  return result.xgemm.smm;
}


/* Release a previously created kernel: registry-managed code is unregistered,
** otherwise the code buffer is freed directly; body continues past this chunk. */
LIBXSMM_API void libxsmm_release_kernel(const void* kernel)
{
  if (NULL != kernel) {
    static int error_once = 0;
    libxsmm_kernel_xinfo* extra = NULL;
    void *const extra_address = &extra;
    LIBXSMM_INIT
    if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(kernel, NULL/*size*/, NULL/*flags*/, (void**)extra_address) && NULL != extra) {
      const unsigned int regindex = extra->registered;
      if ((LIBXSMM_CAPACITY_REGISTRY) <= regindex) {
        libxsmm_xfree(kernel, 0/*no check*/);
      }
      else { /* attempt to unregister kernel */
        libxsmm_kernel_info info;
#if !defined(LIBXSMM_ENABLE_DEREG)
        if (EXIT_SUCCESS == libxsmm_get_kernel_info(kernel, &info)
&& LIBXSMM_KERNEL_KIND_USER == info.kind) #endif { LIBXSMM_ASSERT(LIBXSMM_KERNEL_UNREGISTERED > info.kind); /* coverity[check_return] */ LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_RELAXED); /* invalidate code cache (TLS) */ internal_registry[regindex].ptr = NULL; #if !defined(NDEBUG) LIBXSMM_MEMZERO127(internal_registry_keys + regindex); #endif libxsmm_xfree(kernel, 0/*no check*/); } #if !defined(LIBXSMM_ENABLE_DEREG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM WARNING: attempt to unregister JIT-kernel!\n"); } #endif } } else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: failed to release kernel!\n"); } } } #if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_init)(void); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_init)(void) { libxsmm_init(); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_finalize)(void); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_finalize)(void) { libxsmm_finalize(); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_release_kernel)(const void** /*kernel*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_release_kernel)(const void** kernel) { #if !defined(NDEBUG) if (NULL != kernel) #endif { libxsmm_release_kernel(*kernel); } #if !defined(NDEBUG) else { static int error_once = 0; if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid argument passed into libxsmm_release_kernel!\n"); } } #endif } /* implementation 
provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)(intptr_t* /*fn*/, const int* /*iprec*/, const int* /*oprec*/, const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*k*/, const libxsmm_blasint* /*lda*/, const libxsmm_blasint* /*ldb*/, const libxsmm_blasint* /*ldc*/, const void* /*alpha*/, const void* /*beta*/, const int* /*flags*/, const int* /*prefetch*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)(intptr_t* fn, const int* iprec, const int* oprec, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const void* alpha, const void* beta, const int* flags, const int* prefetch) { #if !defined(NDEBUG) if (NULL != fn && NULL != m && (NULL == iprec || (0 <= *iprec && *iprec < LIBXSMM_DATATYPE_UNSUPPORTED)) && (NULL == oprec || (0 <= *oprec && *oprec < LIBXSMM_DATATYPE_UNSUPPORTED))) #endif { const int gemm_flags = (NULL != flags ? *flags : LIBXSMM_FLAGS); const libxsmm_gemm_descriptor* descriptor; libxsmm_gemm_prefetch_type gemm_prefetch; libxsmm_descriptor_blob blob; libxsmm_code_pointer result; #if !defined(NDEBUG) const libxsmm_gemm_precision itype = (NULL != iprec ? ((libxsmm_gemm_precision)*iprec) : LIBXSMM_GEMM_PRECISION_F64); const libxsmm_gemm_precision otype = (NULL != oprec ? ((libxsmm_gemm_precision)*oprec) : itype); const libxsmm_blasint kk = *(NULL != k ? k : m), nn = (NULL != n ? *n : kk); #else const libxsmm_gemm_precision itype = (libxsmm_gemm_precision)*iprec, otype = (libxsmm_gemm_precision)*oprec; const libxsmm_blasint kk = *k, nn = *n; #endif LIBXSMM_PRAGMA_FORCEINLINE gemm_prefetch = libxsmm_get_gemm_xprefetch(prefetch); LIBXSMM_PRAGMA_FORCEINLINE descriptor = libxsmm_gemm_descriptor_init2(&blob, itype, otype, *m, nn, kk, NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? *m : kk), NULL != ldb ? 
*ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? kk : nn), *(NULL != ldc ? ldc : m), alpha, beta, gemm_flags, gemm_prefetch); #if !defined(NDEBUG) if (NULL != descriptor) #endif { LIBXSMM_PRAGMA_FORCEINLINE result.xgemm = libxsmm_xmmdispatch(descriptor); *fn = result.ival; } #if !defined(NDEBUG) else { /* quiet */ *fn = 0; } #endif } #if !defined(NDEBUG) else { static int error_once = 0; if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid argument passed into libxsmm_xmmdispatch!\n"); } if (NULL != fn) *fn = 0; } #endif } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch)(intptr_t* /*fn*/, const int* /*precision*/, const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*k*/, const libxsmm_blasint* /*lda*/, const libxsmm_blasint* /*ldb*/, const libxsmm_blasint* /*ldc*/, const void* /*alpha*/, const void* /*beta*/, const int* /*flags*/, const int* /*prefetch*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch)(intptr_t* fn, const int* precision, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const void* alpha, const void* beta, const int* flags, const int* prefetch) { LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)(fn, precision, precision, m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_abc)( const libxsmm_xmmfunction* /*fn*/, const void* /*a*/, const void* /*b*/, void* /*c*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_abc)( const libxsmm_xmmfunction* fn, const void* a, const void* b, void* c) { #if !defined(NDEBUG) static int error_once = 0; if (NULL != fn && NULL != a && NULL != b && NULL != c) #endif { #if 
!defined(NDEBUG) if (NULL != fn->xmm) #endif { fn->xmm(a, b, c); } #if !defined(NDEBUG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: NULL-function passed into libxsmm_xmmcall_abc!\n"); } #endif } #if !defined(NDEBUG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xmmcall_abc specified!\n"); } #endif } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)( const libxsmm_xmmfunction* /*fn*/, const void* /*a*/, const void* /*b*/, void* /*c*/, const void* /*pa*/, const void* /*pb*/, const void* /*pc*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)( const libxsmm_xmmfunction* fn, const void* a, const void* b, void* c, const void* pa, const void* pb, const void* pc) { #if !defined(NDEBUG) static int error_once = 0; if (NULL != fn && NULL != a && NULL != b && NULL != c) #endif { #if !defined(NDEBUG) if (NULL != fn->xmm) #endif { fn->xmm(a, b, c, pa, pb, pc); } #if !defined(NDEBUG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: NULL-function passed into libxsmm_xmmcall_prf!\n"); } #endif } #if !defined(NDEBUG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xmmcall_prf specified!\n"); } #endif } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall)( const libxsmm_xmmfunction* /*fn*/, const void* /*a*/, const void* /*b*/, void* /*c*/, const void* /*pa*/, const 
void* /*pb*/, const void* /*pc*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall)( const libxsmm_xmmfunction* fn, const void* a, const void* b, void* c, const void* pa, const void* pb, const void* pc) { LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)(fn, a, b, c, pa, pb, pc); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xregister)(void** /*regval*/, const void* /*key*/, const int* /*keysize*/, const int* /*valsize*/, const void* /*valinit*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xregister)(void** regval, const void* key, const int* keysize, const int* valsize, const void* valinit) { #if !defined(NDEBUG) static int error_once = 0; if (NULL != regval && NULL != key && NULL != keysize && NULL != valsize) #endif { *regval = libxsmm_xregister(key, *keysize, *valsize, valinit); } #if !defined(NDEBUG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xregister specified!\n"); } #endif } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xdispatch)(void** /*regval*/, const void* /*key*/, const int* /*keysize*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xdispatch)(void** regval, const void* key, const int* keysize) { #if !defined(NDEBUG) static int error_once = 0; if (NULL != regval && NULL != key && NULL != keysize) #endif { *regval = libxsmm_xdispatch(key, *keysize); } #if !defined(NDEBUG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xdispatch specified!\n"); } #endif } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xrelease)(const void* /*key*/, const int* /*keysize*/); LIBXSMM_API void 
LIBXSMM_FSYMBOL(libxsmm_xrelease)(const void* key, const int* keysize) { #if !defined(NDEBUG) static int error_once = 0; if (NULL != key && NULL != keysize) #endif { libxsmm_xrelease(key, *keysize); } #if !defined(NDEBUG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xrelease specified!\n"); } #endif } #endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ libxsmm-1.17/src/libxsmm_main.h000066400000000000000000001123741415223013700165420ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_MAIN_H #define LIBXSMM_MAIN_H #include /** * TF includes src/libxsmm_main.h and uses LIBXSMM's sync primitives * without including libxsmm_sync. However, libxsmm_sync.h shall be * an explicit include separate from including libxsmm.h. */ #include "libxsmm_sync.h" /** Allow external definition to enable testing corner cases (exhausted registry space). 
*/ #if !defined(LIBXSMM_CAPACITY_REGISTRY) /* must be POT */ # define LIBXSMM_CAPACITY_REGISTRY 131072 #endif #if !defined(LIBXSMM_CAPACITY_CACHE) /* must be POT */ # define LIBXSMM_CAPACITY_CACHE 16 #endif #if !defined(LIBXSMM_PAGE_MINSIZE) # define LIBXSMM_PAGE_MINSIZE 4096 /* 4 KB */ #endif #if !defined(LIBXSMM_NTHREADS_MAX) # if (0 != LIBXSMM_SYNC) # define LIBXSMM_NTHREADS_MAX 1024 # else # define LIBXSMM_NTHREADS_MAX 1 # endif #endif /* code relies on LIBXSMM_NTHREADS_MAX or v/forks */ #if !defined(LIBXSMM_NTHREADS_USE) && 1 # define LIBXSMM_NTHREADS_USE #endif #if !defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) # define LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS LIBXSMM_NTHREADS_MAX #endif #if !defined(LIBXSMM_MALLOC_SCRATCH_SCALE) # define LIBXSMM_MALLOC_SCRATCH_SCALE 1.0 #endif #if !defined(LIBXSMM_MALLOC_LIMIT) # define LIBXSMM_MALLOC_LIMIT (2U << 20) /* 2 MB */ #endif #if !defined(LIBXSMM_MALLOC_HOOK_REALLOC) && 1 # define LIBXSMM_MALLOC_HOOK_REALLOC #endif #if !defined(LIBXSMM_MALLOC_HOOK_CALLOC) && 1 # define LIBXSMM_MALLOC_HOOK_CALLOC #endif /* align even if interceptor is disabled at runtime */ #if !defined(LIBXSMM_MALLOC_ALIGN_ALL) && 1 # define LIBXSMM_MALLOC_ALIGN_ALL #endif #if !defined(LIBXSMM_MALLOC_INTERNAL_CALLER_ID) # define LIBXSMM_MALLOC_INTERNAL_CALLER_ID ((uintptr_t)LIBXSMM_UNLIMITED) #endif #if !defined(LIBXSMM_MALLOC_INTERNAL_CALLER) # define LIBXSMM_MALLOC_INTERNAL_CALLER ((const void*)(LIBXSMM_MALLOC_INTERNAL_CALLER_ID)) #endif #if !defined(LIBXSMM_INTERCEPT_DYNAMIC) && defined(LIBXSMM_BUILD) && \ (defined(__GNUC__) || defined(_CRAYC)) && !defined(_WIN32) && !defined(__CYGWIN__) && \ !(defined(__APPLE__) && defined(__MACH__) && LIBXSMM_VERSION2(6, 1) >= \ LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) # define LIBXSMM_INTERCEPT_DYNAMIC #endif #if !defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) && defined(LIBXSMM_INTERCEPT_DYNAMIC) && \ defined(LIBXSMM_MALLOC) && (0 != LIBXSMM_MALLOC) && \ (!defined(_CRAYC) && !defined(__TRACE)) /* TODO */ && \ 
(defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */ # define LIBXSMM_MALLOC_HOOK_DYNAMIC #endif #if !defined(LIBXSMM_MALLOC_HOOK_STATIC) && \ defined(LIBXSMM_MALLOC) && (0 != LIBXSMM_MALLOC) && \ (!defined(_WIN32)) /* TODO */ && \ (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */ # define LIBXSMM_MALLOC_HOOK_STATIC #endif #if !defined(LIBXSMM_DNN_CONVOLUTION_SETUP_USE_NTS) && \ defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) && \ defined(LIBXSMM_MALLOC_ALIGN_ALL) # define LIBXSMM_DNN_CONVOLUTION_SETUP_USE_NTS #endif #if defined(LIBXSMM_INTERCEPT_DYNAMIC) # if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) # endif # include # if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) # endif # if !defined(RTLD_NEXT) # define LIBXSMM_RTLD_NEXT ((void*)-1l) # else # define LIBXSMM_RTLD_NEXT RTLD_NEXT # endif #endif #if !defined(LIBXSMM_VERBOSITY_HIGH) # define LIBXSMM_VERBOSITY_HIGH 3 /* secondary warning or info-verbosity */ #endif #if !defined(LIBXSMM_VERBOSITY_WARN) # define LIBXSMM_VERBOSITY_WARN ((LIBXSMM_VERBOSITY_HIGH) - LIBXSMM_MIN(1, LIBXSMM_VERBOSITY_HIGH)) #endif #if !defined(LIBXSMM_LOCK) # define LIBXSMM_LOCK LIBXSMM_LOCK_DEFAULT #endif #if !defined(LIBXSMM_EXT_MIN_NTASKS) # define LIBXSMM_MIN_NTASKS(NT) 1 #endif #if !defined(LIBXSMM_OVERHEAD) # define LIBXSMM_OVERHEAD(NT) 0 #endif #if !defined(LIBXSMM_NOOP_ARGS) # define LIBXSMM_NOOP_ARGS(...) #endif #if !defined(LIBXSMM_NOOP) # define LIBXSMM_NOOP #endif /** Check if M, N, K, or LDx fits into the descriptor. 
*/ #if (0 != LIBXSMM_ILP64) # define LIBXSMM_GEMM_NO_BYPASS_DIMS(M, N, K) (0xFFFFFFFF >= (M) && 0xFFFFFFFF >= (N) && 0xFFFFFFFF >= (K)) #else /* always fits */ # define LIBXSMM_GEMM_NO_BYPASS_DIMS(M, N, K) 1 #endif #if defined(LIBXSMM_ASSERT) /* assert available */ # define LIBXSMM_GEMM_DESCRIPTOR_DIM_CHECK(M, N, K) LIBXSMM_ASSERT(LIBXSMM_GEMM_NO_BYPASS_DIMS(M, N, K)) #else # define LIBXSMM_GEMM_DESCRIPTOR_DIM_CHECK(M, N, K) #endif #if defined(LIBXSMM_UNPACKED) # define LIBXSMM_DESCRIPTOR_CLEAR_AUX(DST, SIZE) LIBXSMM_MEMSET127(DST, 0, SIZE) #else # define LIBXSMM_DESCRIPTOR_CLEAR_AUX(DST, SIZE) #endif #define LIBXSMM_DESCRIPTOR_CLEAR(BLOB) \ LIBXSMM_ASSERT((LIBXSMM_DESCRIPTOR_MAXSIZE) == sizeof(*(BLOB))); \ LIBXSMM_DESCRIPTOR_CLEAR_AUX(BLOB, LIBXSMM_DESCRIPTOR_MAXSIZE) /** Low-level/internal GEMM descriptor initialization. */ #define LIBXSMM_GEMM_DESCRIPTOR(DESCRIPTOR, DATA_TYPE, FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) \ LIBXSMM_GEMM_DESCRIPTOR_DIM_CHECK(LDA, LDB, LDC); \ LIBXSMM_GEMM_DESCRIPTOR_DIM_CHECK(M, N, K); \ LIBXSMM_DESCRIPTOR_CLEAR_AUX(&(DESCRIPTOR), sizeof(DESCRIPTOR)); \ (DESCRIPTOR).datatype = (unsigned char)(DATA_TYPE); (DESCRIPTOR).prefetch = (unsigned char)(PREFETCH); \ (DESCRIPTOR).flags = (unsigned int)((FLAGS) \ /*| (LIBXSMM_NEQ(0, ALPHA) ? 0 : LIBXSMM_GEMM_FLAG_ALPHA_0)*/ \ | (LIBXSMM_NEQ(0, BETA) ? 0 : LIBXSMM_GEMM_FLAG_BETA_0)); \ (DESCRIPTOR).m = (unsigned int)(M); (DESCRIPTOR).n = (unsigned int)(N); (DESCRIPTOR).k = (unsigned int)(K); \ (DESCRIPTOR).lda = (unsigned int)(LDA); (DESCRIPTOR).ldb = (unsigned int)(LDB); (DESCRIPTOR).ldc = (unsigned int)(LDC); \ LIBXSMM_PAD((DESCRIPTOR).pad = 0) (DESCRIPTOR).c1 = 0; (DESCRIPTOR).c2 = 0; (DESCRIPTOR).c3 = 0; \ (DESCRIPTOR).meltw_ldx = 0; (DESCRIPTOR).meltw_ldy = 0; (DESCRIPTOR).meltw_ldz = 0; \ (DESCRIPTOR).meltw_datatype_aux = 0; (DESCRIPTOR).meltw_flags = 0; \ (DESCRIPTOR).meltw_operation = 0 /** Similar to LIBXSMM_GEMM_DESCRIPTOR, but separately taking the input-/output-precision. 
*/ #define LIBXSMM_GEMM_DESCRIPTOR2(DESCRIPTOR, IPREC, OPREC, FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) \ LIBXSMM_GEMM_DESCRIPTOR(DESCRIPTOR, LIBXSMM_GETENUM(IPREC, OPREC), FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) /** Declare and construct a GEMM descriptor. */ #define LIBXSMM_GEMM_DESCRIPTOR_TYPE(DESCRIPTOR, DATA_TYPE, FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) \ libxsmm_gemm_descriptor DESCRIPTOR; LIBXSMM_GEMM_DESCRIPTOR(DESCRIPTOR, DATA_TYPE, \ FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) /** Similar to LIBXSMM_GEMM_DESCRIPTOR_TYPE, but separately taking the input-/output-precision. */ #define LIBXSMM_GEMM_DESCRIPTOR2_TYPE(DESCRIPTOR, IPREC, OPREC, FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) \ LIBXSMM_GEMM_DESCRIPTOR_TYPE(DESCRIPTOR, LIBXSMM_GETENUM(IPREC, OPREC), FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) #define LIBXSMM_REGDESC_DEFAULT #define LIBXSMM_REGDESC(START, MODIFIER) \ START libxsmm_gemm_descriptor MODIFIER gemm; \ START libxsmm_mcopy_descriptor MODIFIER mcopy; \ START libxsmm_meltw_descriptor MODIFIER meltw; \ START libxsmm_trans_descriptor MODIFIER trans; \ START libxsmm_pgemm_descriptor MODIFIER pgemm; \ START libxsmm_getrf_descriptor MODIFIER getrf; \ START libxsmm_trmm_descriptor MODIFIER trmm; \ START libxsmm_trsm_descriptor MODIFIER trsm /** * Packed structure, which stores the argument description of GEMM routines. * The size of the structure is padded to LIBXSMM_DESCRIPTOR_MAXSIZE. */ LIBXSMM_EXTERN_C LIBXSMM_PACKED(struct LIBXSMM_RETARGETABLE) libxsmm_gemm_descriptor { /** Extents of the matrix. */ unsigned int m, n, k; /** Leading dimensions. */ unsigned int lda, ldb, ldc; /** Set of flags. */ unsigned int flags; /** Prefetch strategy. */ unsigned char prefetch; /** Denotes the data-type. */ unsigned char datatype; /** Ignored entry. 
*/ LIBXSMM_PAD(unsigned char pad) /** multipurpose 64bit field, currently used for: a) stride_a in brgemm */ unsigned long long c1; /** multipurpose 64bit field, currently used for: a) stride_b in brgemm */ unsigned long long c2; /** multipurpose 8bit field, currently used for: a) unroll hint in brgemm */ unsigned char c3; /** LDx, LDy, LDz, additional meltw LDs */ unsigned int meltw_ldx, meltw_ldy, meltw_ldz; /** Size of data element. */ unsigned char meltw_datatype_aux; /** Set of flags */ unsigned char meltw_flags; /** operation specifier */ unsigned char meltw_operation; }; /** Packed structure storing the matcopy argument description. */ LIBXSMM_EXTERN_C LIBXSMM_PACKED(struct LIBXSMM_RETARGETABLE) libxsmm_mcopy_descriptor { /** LDx, M, and N. */ unsigned int m, n, ldi, ldo; /** Size of data element. */ unsigned char typesize; /** Level of unrolling. */ unsigned char unroll_level; /** Boolean value (@TODO fix this). */ unsigned char prefetch; /** Set of flags. */ unsigned char flags; }; /** Packed structure storing the mateltw argument description. */ LIBXSMM_EXTERN_C LIBXSMM_PACKED(struct LIBXSMM_RETARGETABLE) libxsmm_meltw_descriptor { /** LDx, M, and N. */ unsigned int m, n, ldi, ldo, ldx, ldy; /** Size of data element. */ unsigned char datatype; unsigned char datatype2; /** Set of flags */ unsigned char flags; /** operation specifier */ unsigned char operation; }; /** Packed structure storing the transpose argument description. */ LIBXSMM_EXTERN_C LIBXSMM_PACKED(struct LIBXSMM_RETARGETABLE) libxsmm_trans_descriptor { /** LD, M, and N. */ unsigned int m, n, ldo; /** Size of data element. */ unsigned char typesize; }; /** Packed structure storing arguments of packed GEMM. */ LIBXSMM_EXTERN_C LIBXSMM_PACKED(struct LIBXSMM_RETARGETABLE) libxsmm_pgemm_descriptor { unsigned int m, n, k, lda, ldb, ldc; unsigned char typesize; unsigned char layout; char transa, transb; char alpha_val; }; /** Packed structure storing arguments of packed GETRF. 
*/ LIBXSMM_EXTERN_C LIBXSMM_PACKED(struct LIBXSMM_RETARGETABLE) libxsmm_getrf_descriptor { unsigned int m, n, lda; unsigned char typesize; unsigned char layout; }; /** Packed structure storing arguments of packed TRSM. */ LIBXSMM_EXTERN_C LIBXSMM_PACKED(struct LIBXSMM_RETARGETABLE) libxsmm_trmm_descriptor { union { double d; float s; } alpha; unsigned int m, n, lda, ldb; unsigned char typesize; unsigned char layout; char diag, side, uplo; char transa; }; /** Packed structure storing arguments of packed TRSM. */ LIBXSMM_EXTERN_C LIBXSMM_PACKED(struct LIBXSMM_RETARGETABLE) libxsmm_trsm_descriptor { union { double d; float s; } alpha; unsigned int m, n, lda, ldb; unsigned char typesize; unsigned char layout; char diag, side, uplo; char transa; }; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_csr_soa_descriptor { const libxsmm_gemm_descriptor* gemm; const unsigned int* row_ptr; const unsigned int* column_idx; const void* values; unsigned int packed_width; } libxsmm_csr_soa_descriptor; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_csc_soa_descriptor { const libxsmm_gemm_descriptor* gemm; const unsigned int* column_ptr; const unsigned int* row_idx; const void* values; unsigned int packed_width; } libxsmm_csc_soa_descriptor; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_pgemm_ac_rm_descriptor { const libxsmm_gemm_descriptor* gemm; unsigned int packed_width; } libxsmm_pgemm_ac_rm_descriptor; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_pgemm_bc_rm_descriptor { const libxsmm_gemm_descriptor* gemm; unsigned int packed_width; } libxsmm_pgemm_bc_rm_descriptor; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_csr_reg_descriptor { const libxsmm_gemm_descriptor* gemm; const unsigned int* row_ptr; const unsigned int* column_idx; const void* values; } libxsmm_csr_reg_descriptor; LIBXSMM_EXTERN_C typedef union 
LIBXSMM_RETARGETABLE libxsmm_code_pointer { void (*ptr_fn)(LIBXSMM_VARIADIC); const void* ptr_const; void* ptr; uintptr_t uval; intptr_t ival; libxsmm_xmmfunction xgemm; /* GEMM: smm, dmm, wimm, or void-function */ libxsmm_xmcopyfunction xmatcopy; libxsmm_xmeltwfunction xmateltw; libxsmm_xtransfunction xtrans; libxsmm_pgemm_xfunction xpgemm; libxsmm_getrf_xfunction xgetrf; libxsmm_trmm_xfunction xtrmm; libxsmm_trsm_xfunction xtrsm; } libxsmm_code_pointer; /** Structure which describes all tensors in LIBXSMM's DNN module */ LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_tensor { libxsmm_dnn_tensor_datalayout* layout; /* data-layout descriptor */ void* data; /* pointer to data */ unsigned char scf; /* fix point scaling factor for this tensor */ }; /* Structure to record segment in stream of code */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE segment_t { int segment_type; int n_convs; int aux_index; } segment_t; LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_layer { libxsmm_dnn_datatype datatype_in; libxsmm_dnn_datatype datatype_out; libxsmm_dnn_conv_desc desc; libxsmm_dnn_conv_algo algo; libxsmm_dnn_tensor_format buffer_format; libxsmm_dnn_tensor_format filter_format; libxsmm_dnn_conv_fuse_op fuse_ops; libxsmm_dnn_conv_option options; /* additional size for internal data types */ int ifhp; int ifwp; int ofh; int ofw; int ofhp; int ofwp; int ifmblock; int ofmblock; int blocksifm; int blocksofm; int fwd_ofw_rb; int fwd_ofh_rb; int bwd_ofw_rb; int bwd_ofh_rb; int upd_ofw_rb; int upd_ofh_rb; int fm_lp_block; /* additional blocking for low precision datatypes of feature maps */ int blocksifm_blocking; int blocksofm_blocking; int avoid_acc_load; int avoid_acc_load_bwd; int pack_input; int pack_input_bwd; int spread_input_bwd; int weight_copies; int loop_order; int use_ofm_parallelization; int use_ifm_parallelization; int avoid_fmas_in_rim; int upd_use_batchreduce; int upd_pack_input; int upd_loop_order; int upd_linearized_tasklist; int 
upd_avoid_rim_fmas; int fwd_flags; int shuffle_filter_accesses; int use_fallback_fwd_loops; int use_fallback_bwd_loops; int input_pixels; int output_pixels; int n_used_pixels; int pixel_blocking; int use_intermediate_f32_wt_tensor; int upd_linearized_pixels; int ifwp_extended; int ofwp_extended; int batchreduce_h_pixels; int on_the_fly_input_packing; int upd_pack_input_upfront; int use_hybrid_imgofm_parallelization; int compute_pixels; int upd_trans_w_only; int fwd_padding_copy; int upd_padding_copy; int block_fwd_oj; int block_fwd_ifm; int block_fwd_ofm; int block_bwd_oj; int block_bwd_ifm; int block_bwd_ofm; int block_upd_ifm; int block_upd_ofm; libxsmm_xtransfunction tr_kernel; libxsmm_meltwfunction_cvtfp32bf16 fwd_cvtfp32bf16_kernel; /* internal data representation */ libxsmm_dnn_tensor* reg_input; libxsmm_dnn_tensor* reg_output; libxsmm_dnn_tensor* reg_filter; libxsmm_dnn_tensor* grad_input; libxsmm_dnn_tensor* grad_output; libxsmm_dnn_tensor* grad_filter; libxsmm_dnn_tensor* reg_bias; libxsmm_dnn_tensor* grad_bias; /* internal data representations for copies of tensors */ libxsmm_dnn_tensor* reg_input_tr; libxsmm_dnn_tensor* reg_filter_tr; /* batchnorm stats */ libxsmm_dnn_tensor* batch_stats; /* maxstats used in low-precision kernels */ libxsmm_dnn_tensor* maxstats_fwd; libxsmm_dnn_tensor* maxstats_bwd; libxsmm_dnn_tensor* maxstats_upd; /* barrier */ libxsmm_barrier* barrier; /* scratch */ size_t fwd_packing_padding_scratch_size; size_t fwd_lp_output_full_scratch_size; size_t fwd_lp_output_block_scratch_size; size_t fwd_packing_padding_scratch_offset; size_t fwd_lp_output_full_scratch_offset; size_t fwd_lp_output_block_scratch_offset; size_t fwd_scratch_size; size_t bwd_filter_trans_scratch_size; size_t bwd_packing_padding_scratch_size; size_t bwd_lp_input_full_scratch_size; size_t bwd_filter_trans_scratch_offset; size_t bwd_packing_padding_scratch_offset; size_t bwd_lp_input_full_scratch_offset; size_t bwd_scratch_size; size_t 
upd_packing_padding_scratch_size; size_t upd_lp_output_full_scratch_size; size_t upd_lp_input_full_scratch_size; size_t upd_filter_scratch_size; size_t upd_lp_filter_full_scratch_size; size_t upd_packing_padding_scratch_offset; size_t upd_lp_output_full_scratch_offset; size_t upd_lp_input_full_scratch_offset; size_t upd_lp_filter_full_scratch_offset; size_t upd_filter_scratch_offset; size_t upd_scratch_size; void* scratch; size_t scratch_size; libxsmm_code_pointer gemm_fwd; /* ability to hoist forward GEMMs */ libxsmm_code_pointer gemm_fwd2; /* ability to hoist forward GEMMs */ unsigned long long *A_offsets; unsigned long long *B_offsets; /* JIT-generated convolution code */ libxsmm_code_pointer code_fwd[3]; libxsmm_code_pointer code_bwd[3]; libxsmm_code_pointer code_upd[2]; libxsmm_code_pointer matcopy_fwd[4]; libxsmm_code_pointer matcopy_bwd[4]; libxsmm_code_pointer matcopy_upd[3]; }; LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedbatchnorm { libxsmm_dnn_fusedbatchnorm_desc desc; libxsmm_dnn_tensor* reg_input; /* input tensor */ libxsmm_dnn_tensor* reg_output; /* output tensor */ libxsmm_dnn_tensor* grad_input; /* grad input tensor */ libxsmm_dnn_tensor* grad_output; /* grad output tensor */ libxsmm_dnn_tensor* reg_add; /* elementwise tensor */ libxsmm_dnn_tensor* grad_add; /* grad elementwise tensor */ libxsmm_dnn_tensor* reg_beta; /* beta tensor */ libxsmm_dnn_tensor* reg_gamma; /* gamma tensor */ libxsmm_dnn_tensor* grad_beta; /* grad beta tensor */ libxsmm_dnn_tensor* grad_gamma; /* grad gamma tensor */ libxsmm_dnn_tensor* expvalue; /* expected value */ libxsmm_dnn_tensor* rcpstddev; /* reciprocal of standard derivation */ libxsmm_dnn_tensor* variance; /* variance */ libxsmm_dnn_tensor* relumask; /* relumask */ libxsmm_barrier* barrier; /* barrier */ int ifmblock; int ofmblock; int blocksifm; int blocksofm; size_t scratch_size; void* scratch; }; LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_softmaxloss { libxsmm_dnn_softmaxloss_desc 
desc; libxsmm_dnn_tensor* reg_input; /* input tensor */ libxsmm_dnn_tensor* reg_output; /* output tensor */ libxsmm_dnn_tensor* grad_input; /* grad input tensor */ libxsmm_dnn_tensor* label; /* labels tensor */ libxsmm_barrier* barrier; /* barrier */ int bc; int Bc; int bn; int Bn; float loss; size_t scratch_size; void* scratch; }; LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_optimizer { libxsmm_dnn_optimizer_desc desc; libxsmm_dnn_tensor* reg_filter; /* filter tensor */ libxsmm_dnn_tensor* grad_filter; /* grad filter tensor */ libxsmm_dnn_tensor* master_filter; /* master filter tensor */ libxsmm_barrier* barrier; /* barrier */ int bc; int Bc; int bk; int Bk; int fm_lp_block; size_t scratch_size; void* scratch; }; LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedgroupnorm { libxsmm_dnn_fusedgroupnorm_desc desc; libxsmm_dnn_tensor* reg_input; /* input tensor */ libxsmm_dnn_tensor* reg_output; /* output tensor */ libxsmm_dnn_tensor* grad_input; /* grad input tensor */ libxsmm_dnn_tensor* grad_output; /* grad output tensor */ libxsmm_dnn_tensor* reg_add; /* elementwise tensor */ libxsmm_dnn_tensor* grad_add; /* grad elementwise tensor */ libxsmm_dnn_tensor* reg_beta; /* beta tensor */ libxsmm_dnn_tensor* reg_gamma; /* gamma tensor */ libxsmm_dnn_tensor* grad_beta; /* grad beta tensor */ libxsmm_dnn_tensor* grad_gamma; /* grad gamma tensor */ libxsmm_dnn_tensor* expvalue; /* expected value */ libxsmm_dnn_tensor* rcpstddev; /* reciprocal of standard derivation */ libxsmm_dnn_tensor* variance; /* variance */ libxsmm_dnn_tensor* relumask; /* relumask */ libxsmm_barrier* barrier; /* barrier */ int ifmblock; int ofmblock; int blocksifm; int blocksofm; size_t scratch_size; void* scratch; }; LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_fullyconnected { libxsmm_dnn_fullyconnected_desc desc; libxsmm_dnn_tensor* reg_input; /* input tensor */ libxsmm_dnn_tensor* reg_output; /* output tensor */ libxsmm_dnn_tensor* grad_input; /* grad input 
tensor */ libxsmm_dnn_tensor* grad_output; /* grad output tensor */ libxsmm_dnn_tensor* reg_filter; /* filter tensor */ libxsmm_dnn_tensor* grad_filter; /* grad filter tensor */ libxsmm_dnn_tensor* reg_bias; /* bias tensor */ libxsmm_dnn_tensor* grad_bias; /* grad bais tensor */ libxsmm_dnn_tensor* relumask; /* relumask */ libxsmm_barrier* barrier; /* barrier */ int ifmblock; int ofmblock; int blocksifm; int blocksofm; /* Parameters to tune/specialize FC algorithms */ int fwd_2d_blocking; int bwd_2d_blocking; int upd_2d_blocking; int fwd_bf; int bwd_bf; int upd_bf; int fwd_row_teams; int fwd_column_teams; int bwd_row_teams; int bwd_column_teams; int upd_row_teams; int upd_column_teams; int ifm_subtasks; int ofm_subtasks; int fm_lp_block; int bn; int bk; int bc; size_t scratch_size; size_t doutput_scratch_mark; void* scratch; libxsmm_xtransfunction tr_kernel; libxsmm_code_pointer gemm_fwd; /* ability to hoist forward GEMMs */ libxsmm_code_pointer gemm_fwd2; /* ability to hoist forward GEMMs */ libxsmm_code_pointer gemm_fwd3; /* ability to hoist forward GEMMs */ libxsmm_code_pointer gemm_bwd; /* ability to hoist backward GEMMs */ libxsmm_code_pointer gemm_bwd2; /* ability to hoist backward GEMMs */ libxsmm_code_pointer gemm_upd; /* ability to hoist update GEMMs */ libxsmm_code_pointer gemm_upd2; /* ability to hoist update GEMMs */ }; LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_pooling { libxsmm_dnn_pooling_desc desc; libxsmm_dnn_tensor* reg_input; /* input tensor */ libxsmm_dnn_tensor* reg_output; /* output tensor */ libxsmm_dnn_tensor* grad_input; /* grad input tensor */ libxsmm_dnn_tensor* grad_output; /* grad output tensor */ libxsmm_dnn_tensor* mask; /* elementwise tensor */ libxsmm_barrier* barrier; /* barrier */ int ifmblock; int ofmblock; int blocksifm; int blocksofm; int ofh; int ofw; size_t scratch_size; void* scratch; }; LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_rnncell { libxsmm_dnn_rnncell_desc desc; libxsmm_blasint T; /* 
sequence length, must be smaller than max sequence length in desc */ libxsmm_blasint bk; libxsmm_blasint bn; libxsmm_blasint bc; libxsmm_blasint lpb; /* external tensors */ libxsmm_dnn_tensor* xt; libxsmm_dnn_tensor* csp; libxsmm_dnn_tensor* hp; libxsmm_dnn_tensor* w; libxsmm_dnn_tensor* wt; libxsmm_dnn_tensor* r; libxsmm_dnn_tensor* rt; libxsmm_dnn_tensor* b; libxsmm_dnn_tensor* cst; libxsmm_dnn_tensor* ht; libxsmm_dnn_tensor* dxt; libxsmm_dnn_tensor* dcsp; libxsmm_dnn_tensor* dhp; libxsmm_dnn_tensor* dw; libxsmm_dnn_tensor* dr; libxsmm_dnn_tensor* db; libxsmm_dnn_tensor* dcs; libxsmm_dnn_tensor* dht; libxsmm_dnn_tensor* it; libxsmm_dnn_tensor* ft; libxsmm_dnn_tensor* ot; libxsmm_dnn_tensor* cit; libxsmm_dnn_tensor* cot; float forget_bias; /* internal state */ void* internal_z; /* scratch pointers */ void* scratch_base; void* scratch_wT; void* scratch_rT; void* scratch_w; void* scratch_r; void* scratch_xT; void* scratch_hT; void* scratch_deltat; void* scratch_di; void* scratch_df; void* scratch_do; void* scratch_dci; void* scratch_diB; void* scratch_dfB; void* scratch_dpB; void* scratch_dciB; void* scratch_dx; void* scratch_dhp; void* scratch_db; void* scratch_t1; void* scratch_t2; void* csp_scratch; void* cst_scratch; void* ht_scratch; void* it_scratch; void* ft_scratch; void* ot_scratch; void* cit_scratch; void* cot_scratch; /* Ability to hoist GEMMs */ libxsmm_bsmmfunction_reducebatch_strd fwd_kernela; libxsmm_bsmmfunction_reducebatch_strd fwd_kernelb; libxsmm_bsmmfunction_reducebatch_strd bwdupd_kernela; libxsmm_bsmmfunction_reducebatch_strd bwdupd_kernelb; libxsmm_bsmmfunction_reducebatch_strd bwdupd_kernelc; libxsmm_bsmmfunction_reducebatch_strd bwdupd_kerneld; libxsmm_barrier* barrier; /* barrier */ }; struct LIBXSMM_RETARGETABLE libxsmm_dfsspmdm { int M; int N; int K; int ldb; int ldc; int N_chunksize; unsigned int* permute_operands; double* a_dense; libxsmm_dmmfunction kernel; }; struct LIBXSMM_RETARGETABLE libxsmm_sfsspmdm { int M; int N; int K; int ldb; 
int ldc; int N_chunksize; unsigned int* permute_operands; float* a_dense; libxsmm_smmfunction kernel; }; typedef enum libxsmm_build_kind { LIBXSMM_BUILD_KIND_GEMM = LIBXSMM_KERNEL_KIND_MATMUL, LIBXSMM_BUILD_KIND_MCOPY = LIBXSMM_KERNEL_KIND_MCOPY, LIBXSMM_BUILD_KIND_MELTW = LIBXSMM_KERNEL_KIND_MELTW, LIBXSMM_BUILD_KIND_TRANS = LIBXSMM_KERNEL_KIND_TRANS, LIBXSMM_BUILD_KIND_PGEMM = LIBXSMM_KERNEL_KIND_PGEMM, LIBXSMM_BUILD_KIND_GETRF = LIBXSMM_KERNEL_KIND_GETRF, LIBXSMM_BUILD_KIND_TRMM = LIBXSMM_KERNEL_KIND_TRMM, LIBXSMM_BUILD_KIND_TRSM = LIBXSMM_KERNEL_KIND_TRSM, LIBXSMM_BUILD_KIND_USER = LIBXSMM_KERNEL_KIND_USER, LIBXSMM_BUILD_KIND_PGEMMRMAC = LIBXSMM_KERNEL_UNREGISTERED, LIBXSMM_BUILD_KIND_PGEMMRMBC, LIBXSMM_BUILD_KIND_SRSOA, LIBXSMM_BUILD_KIND_SCSOA, LIBXSMM_BUILD_KIND_SREG } libxsmm_build_kind; /** Integral type (libxsmm_kernel_kind, libxsmm_build_kind). */ #if defined(LIBXSMM_UNPACKED) typedef size_t libxsmm_descriptor_kind; #else typedef unsigned char libxsmm_descriptor_kind; #endif /** All descriptor types, which are valid for code-registration. 
*/ LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_descriptor { char data[LIBXSMM_DESCRIPTOR_MAXSIZE]; libxsmm_descriptor_kind kind; /* kind: must be the first member */ LIBXSMM_REGDESC(LIBXSMM_PACKED(struct) { libxsmm_descriptor_kind /*repeated kind*/ pad; , desc; }); LIBXSMM_PACKED(struct) { libxsmm_descriptor_kind /*repeated kind*/ pad; char desc[1]; } user; } libxsmm_descriptor; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_build_request { union { const void* ptr; /* raw content */ LIBXSMM_REGDESC(LIBXSMM_REGDESC_DEFAULT, const*); const libxsmm_csr_soa_descriptor* srsoa; const libxsmm_csc_soa_descriptor* scsoa; const libxsmm_pgemm_ac_rm_descriptor* pgemmacrm; const libxsmm_pgemm_bc_rm_descriptor* pgemmbcrm; const libxsmm_csr_reg_descriptor* sreg; } descriptor; libxsmm_build_kind kind; /* used by user-kind */ size_t user_size; } libxsmm_build_request; typedef enum libxsmm_malloc_flags { LIBXSMM_MALLOC_FLAG_DEFAULT = 0, LIBXSMM_MALLOC_FLAG_SCRATCH = 1, LIBXSMM_MALLOC_FLAG_PRIVATE = 2, LIBXSMM_MALLOC_FLAG_REALLOC = 4, LIBXSMM_MALLOC_FLAG_PHUGE = 8, LIBXSMM_MALLOC_FLAG_PLOCK = 16, LIBXSMM_MALLOC_FLAG_MMAP = 32, LIBXSMM_MALLOC_FLAG_R = 64, LIBXSMM_MALLOC_FLAG_W = 128, LIBXSMM_MALLOC_FLAG_X = 256, LIBXSMM_MALLOC_FLAG_RW = LIBXSMM_MALLOC_FLAG_R | LIBXSMM_MALLOC_FLAG_W, LIBXSMM_MALLOC_FLAG_WX = LIBXSMM_MALLOC_FLAG_X | LIBXSMM_MALLOC_FLAG_W, LIBXSMM_MALLOC_FLAG_RWX = LIBXSMM_MALLOC_FLAG_X | LIBXSMM_MALLOC_FLAG_RW, LIBXSMM_MALLOC_FLAG_VALID = LIBXSMM_MALLOC_FLAG_SCRATCH | LIBXSMM_MALLOC_FLAG_PRIVATE | LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_PHUGE | LIBXSMM_MALLOC_FLAG_PLOCK | LIBXSMM_MALLOC_FLAG_MMAP | LIBXSMM_MALLOC_FLAG_RWX } libxsmm_malloc_flags; LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void* (*libxsmm_realloc_fun)(void* /*ptr*/, size_t /*size*/); #if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_malloc_fntype { union { const void* dlsym; void* (*ptr)(size_t, size_t); } 
alignmem; union { const void* dlsym; void* (*ptr)(size_t, size_t); } memalign; union { const void* dlsym; libxsmm_malloc_fun ptr; } malloc; # if defined(LIBXSMM_MALLOC_HOOK_CALLOC) union { const void* dlsym; void* (*ptr)(size_t, size_t); } calloc; # endif # if defined(LIBXSMM_MALLOC_HOOK_REALLOC) union { const void* dlsym; libxsmm_realloc_fun ptr; } realloc; # endif union { const void* dlsym; libxsmm_free_fun ptr; } free; } libxsmm_malloc_fntype; LIBXSMM_APIVAR_PRIVATE(libxsmm_malloc_fntype libxsmm_malloc_fn); #endif #if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* prototypes for GLIBC internal implementation */ LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void* __libc_memalign(size_t alignment, size_t size); LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void* __libc_malloc(size_t size); #if defined(LIBXSMM_MALLOC_HOOK_CALLOC) LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void* __libc_calloc(size_t num, size_t size); #endif #if defined(LIBXSMM_MALLOC_HOOK_REALLOC) LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void* __libc_realloc(void* ptr, size_t size); #endif LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void __libc_free(void* ptr); #endif /*(defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD)))*/ LIBXSMM_API_INTERN void* libxsmm_memalign_internal(size_t alignment, size_t size); /* See https://sourceware.org/binutils/docs-2.34/ld/Options.html#index-_002d_002dwrap_003dsymbol */ LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_memalign(size_t alignment, size_t size); LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_malloc(size_t size); #if defined(LIBXSMM_MALLOC_HOOK_CALLOC) LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_calloc(size_t num, size_t size); #endif #if defined(LIBXSMM_MALLOC_HOOK_REALLOC) LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_realloc(void* ptr, size_t size); #endif LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void __real_free(void* ptr); /** Retrieve internal information about a buffer (default memory domain). 
*/ LIBXSMM_API int libxsmm_get_malloc_xinfo(const void* memory, size_t* size, int* flags, void** extra); /** Initializes malloc hooks and other internals. */ LIBXSMM_API_INTERN void libxsmm_malloc_init(void); LIBXSMM_API_INTERN void libxsmm_malloc_finalize(void); /** Calculates an alignment depending on supposedly allocated size; alignment can be zero ("auto"). */ LIBXSMM_API_INTERN size_t libxsmm_alignment(size_t size, size_t alignment); /** Same as libxsmm_set_default_allocator, but takes a lock (can be NULL). */ LIBXSMM_API_INTERN int libxsmm_xset_default_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn); /** Same as libxsmm_get_default_allocator, but takes a lock (can be NULL). */ LIBXSMM_API_INTERN int libxsmm_xget_default_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, const void** context, libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn); /** Same as libxsmm_set_scratch_allocator, but takes a lock (can be NULL). */ LIBXSMM_API_INTERN int libxsmm_xset_scratch_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn); /** Same as libxsmm_get_scratch_allocator, but takes a lock (can be NULL). */ LIBXSMM_API_INTERN int libxsmm_xget_scratch_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, const void** context, libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn); /** * Attribute memory allocation and protect with only the necessary flags. * This procedure is expected to run only one time per buffer, and may * relocate the given memory. */ LIBXSMM_API_INTERN int libxsmm_malloc_attrib(void** memory, int flags, /** If a name is given, an executable buffer will be dumped into a file. */ const char* name); /** Allocate memory of the requested size, which is aligned according to the given alignment. 
*/ LIBXSMM_API_INTERN int libxsmm_xmalloc(void** memory, size_t size, size_t alignment, int flags, /* The extra information is stored along with the allocated chunk; can be NULL/zero. */ const void* extra, size_t extra_size); /** Release memory, which was allocated using libxsmm_[*]malloc. */ LIBXSMM_API_INTERN void libxsmm_xfree(const void* memory, int check); /** Like libxsmm_release_scratch, but takes a lock (can be NULL). */ LIBXSMM_API_INTERN void libxsmm_xrelease_scratch(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock); /** * Format for instance an amount of Bytes like libxsmm_format_size(result, sizeof(result), nbytes, "KMGT", "B", 10). * The value returned is in requested/determined unit so that the user can decide about printing the buffer. */ LIBXSMM_API_INTERN size_t libxsmm_format_size(char buffer[32], int buffer_size, size_t nbytes, const char scale[], const char* unit, int base); /** Returns the type-name of data-type (can be also libxsmm_gemm_precision). */ LIBXSMM_API_INTERN const char* libxsmm_typename(libxsmm_datatype datatype); /** Determines the given value in double-precision based on the given type. */ LIBXSMM_API_INTERN int libxsmm_dvalue(libxsmm_datatype datatype, const void* value, double* dvalue); /** Services a build request, and (optionally) registers the code (use regindex=LIBXSMM_CAPACITY_REGISTRY for unmanaged code). */ LIBXSMM_API_INTERN int libxsmm_build(const libxsmm_build_request* request, unsigned int regindex, libxsmm_code_pointer* code); /** Returns the type-size of data-type (can be also libxsmm_gemm_precision). */ LIBXSMM_API unsigned char libxsmm_typesize(libxsmm_datatype datatype); LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_kernel_xinfo { /** Non-zero of kernel is registered. */ unsigned int registered; /** Number of FLoating Point OPerationS (FLOPS). */ unsigned int nflops; } libxsmm_kernel_xinfo; /** Receive information about JIT-generated code. 
*/ LIBXSMM_API_INTERN const libxsmm_kernel_xinfo* libxsmm_get_kernel_xinfo(libxsmm_code_pointer code, const libxsmm_descriptor** desc, size_t* code_size); /** Calculates duration in seconds from given RTC ticks. */ LIBXSMM_API_INTERN double libxsmm_timer_duration_rtc(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1); /** Returns the current tick of platform-specific real-time clock. */ LIBXSMM_API_INTERN libxsmm_timer_tickint libxsmm_timer_tick_rtc(void); /** Returns the current tick of a (monotonic) platform-specific counter. */ LIBXSMM_API_INTERN libxsmm_timer_tickint libxsmm_timer_tick_tsc(void); LIBXSMM_API_INTERN void libxsmm_memory_init(int target_arch); LIBXSMM_API_INTERN void libxsmm_memory_finalize(void); LIBXSMM_API_INTERN void libxsmm_dnn_init(int target_arch); LIBXSMM_API_INTERN void libxsmm_dnn_finalize(void); /** intern function to calculate blockings, that's private API hence it's in this function */ LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_get_feature_map_blocks( int C, int K, int* C_block, int* K_block, int* fm_lp_block, libxsmm_dnn_datatype datatype_in, libxsmm_dnn_datatype datatype_out); /** Global lock; create an own lock for an independent domain. */ LIBXSMM_APIVAR_PUBLIC(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK) libxsmm_lock_global); /** Determines whether a threaded implementation is synchronized or not. */ LIBXSMM_APIVAR_PUBLIC(int libxsmm_nosync); /** Function used to allocate default memory. */ LIBXSMM_APIVAR_PRIVATE(libxsmm_malloc_function libxsmm_default_malloc_fn); /** Function used to allocate scratch memory. */ LIBXSMM_APIVAR_PRIVATE(libxsmm_malloc_function libxsmm_scratch_malloc_fn); /** Function used to release default memory. */ LIBXSMM_APIVAR_PRIVATE(libxsmm_free_function libxsmm_default_free_fn); /** Function used to release scratch memory. */ LIBXSMM_APIVAR_PRIVATE(libxsmm_free_function libxsmm_scratch_free_fn); /** If non-NULL, this context is used by the context-form of memory allocation. 
*/ LIBXSMM_APIVAR_PRIVATE(const void* libxsmm_default_allocator_context); /** If non-NULL, this context is used by the context-form of memory allocation. */ LIBXSMM_APIVAR_PRIVATE(const void* libxsmm_scratch_allocator_context); /** Number of scratch memory pools used; clamped against internal maximum. */ LIBXSMM_APIVAR_PRIVATE(unsigned int libxsmm_scratch_pools); /** Growth factor used to scale the scratch memory in case of reallocation. */ LIBXSMM_APIVAR_PRIVATE(double libxsmm_scratch_scale); /** Number of seconds per RDTSC-cycle (zero or negative if RDTSC invalid). */ LIBXSMM_APIVAR_PRIVATE(double libxsmm_timer_scale); /** Counts the number of attempts to create an SPMDM-handle. */ LIBXSMM_APIVAR_PRIVATE(unsigned int libxsmm_statistic_num_spmdm); /** Counts the maximum number of thread that have been active. */ LIBXSMM_APIVAR_PRIVATE(unsigned int libxsmm_thread_count); #if (0 != LIBXSMM_SYNC) LIBXSMM_APIVAR_PRIVATE(LIBXSMM_TLS_TYPE libxsmm_tlskey); #endif #endif /*LIBXSMM_MAIN_H*/ libxsmm-1.17/src/libxsmm_malloc.c000066400000000000000000003153271415223013700170630ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "libxsmm_trace.h" #include "libxsmm_main.h" #include "libxsmm_hash.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) # include # include #endif #if !defined(LIBXSMM_MALLOC_GLIBC) # if defined(__GLIBC__) # define LIBXSMM_MALLOC_GLIBC __GLIBC__ # else # define LIBXSMM_MALLOC_GLIBC 6 # endif #endif #if defined(_WIN32) # include # include # include #else # include # if defined(__linux__) # include # include # endif # if defined(MAP_POPULATE) # include # endif # include # include # include # if defined(__MAP_ANONYMOUS) # define LIBXSMM_MAP_ANONYMOUS __MAP_ANONYMOUS # elif defined(MAP_ANONYMOUS) # define LIBXSMM_MAP_ANONYMOUS MAP_ANONYMOUS # elif defined(MAP_ANON) # define LIBXSMM_MAP_ANONYMOUS MAP_ANON # else # define LIBXSMM_MAP_ANONYMOUS 0x20 # endif # if defined(MAP_SHARED) && 0 # define LIBXSMM_MAP_SHARED MAP_SHARED # else # define LIBXSMM_MAP_SHARED 0 # endif LIBXSMM_EXTERN int ftruncate(int, off_t) LIBXSMM_THROW; LIBXSMM_EXTERN int mkstemp(char*) LIBXSMM_NOTHROW; #endif #if !defined(LIBXSMM_MALLOC_FALLBACK) # define LIBXSMM_MALLOC_FINAL 3 #endif #if defined(LIBXSMM_VTUNE) # if (2 <= LIBXSMM_VTUNE) /* no header file required */ # if !defined(LIBXSMM_VTUNE_JITVERSION) # define LIBXSMM_VTUNE_JITVERSION LIBXSMM_VTUNE # endif # define LIBXSMM_VTUNE_JIT_DESC_TYPE iJIT_Method_Load_V2 # define LIBXSMM_VTUNE_JIT_LOAD 21 # define LIBXSMM_VTUNE_JIT_UNLOAD 14 # define iJIT_SAMPLING_ON 0x0001 LIBXSMM_EXTERN unsigned int iJIT_GetNewMethodID(void); LIBXSMM_EXTERN /*iJIT_IsProfilingActiveFlags*/int iJIT_IsProfilingActive(void); LIBXSMM_EXTERN int iJIT_NotifyEvent(/*iJIT_JVM_EVENT*/int event_type, void *EventSpecificData); LIBXSMM_EXTERN_C typedef struct LineNumberInfo { unsigned int Offset; unsigned int LineNumber; } LineNumberInfo; LIBXSMM_EXTERN_C typedef struct 
iJIT_Method_Load_V2 { unsigned int method_id; char* method_name; void* method_load_address; unsigned int method_size; unsigned int line_number_size; LineNumberInfo* line_number_table; char* class_file_name; char* source_file_name; char* module_name; } iJIT_Method_Load_V2; # else /* more safe due to header dependency */ # include # if !defined(LIBXSMM_VTUNE_JITVERSION) # define LIBXSMM_VTUNE_JITVERSION 2 # endif # if (2 <= LIBXSMM_VTUNE_JITVERSION) # define LIBXSMM_VTUNE_JIT_DESC_TYPE iJIT_Method_Load_V2 # define LIBXSMM_VTUNE_JIT_LOAD iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED_V2 # else # define LIBXSMM_VTUNE_JIT_DESC_TYPE iJIT_Method_Load # define LIBXSMM_VTUNE_JIT_LOAD iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED # endif # define LIBXSMM_VTUNE_JIT_UNLOAD iJVM_EVENT_TYPE_METHOD_UNLOAD_START # endif # if !defined(LIBXSMM_MALLOC_FALLBACK) # define LIBXSMM_MALLOC_FALLBACK LIBXSMM_MALLOC_FINAL # endif #else # if !defined(LIBXSMM_MALLOC_FALLBACK) # define LIBXSMM_MALLOC_FALLBACK 0 # endif #endif /*defined(LIBXSMM_VTUNE)*/ #if !defined(LIBXSMM_MALLOC_XMAP_TEMPLATE) # define LIBXSMM_MALLOC_XMAP_TEMPLATE ".libxsmm_jit." 
LIBXSMM_MKTEMP_PATTERN #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if defined(LIBXSMM_PERF) # include "libxsmm_perf.h" #endif #if !defined(LIBXSMM_MALLOC_ALIGNMAX) # define LIBXSMM_MALLOC_ALIGNMAX (2 << 20) /* 2 MB */ #endif #if !defined(LIBXSMM_MALLOC_ALIGNFCT) # define LIBXSMM_MALLOC_ALIGNFCT 16 #endif #if !defined(LIBXSMM_MALLOC_SEED) # define LIBXSMM_MALLOC_SEED 1051981 #endif #if !defined(LIBXSMM_MALLOC_HOOK_KMP) && 0 # define LIBXSMM_MALLOC_HOOK_KMP #endif #if !defined(LIBXSMM_MALLOC_HOOK_QKMALLOC) && 0 # define LIBXSMM_MALLOC_HOOK_QKMALLOC #endif #if !defined(LIBXSMM_MALLOC_HOOK_IMALLOC) && 1 # define LIBXSMM_MALLOC_HOOK_IMALLOC #endif #if !defined(LIBXSMM_MALLOC_HOOK_CHECK) && 0 # define LIBXSMM_MALLOC_HOOK_CHECK 1 #endif #if !defined(LIBXSMM_MALLOC_CRC_LIGHT) && !defined(_DEBUG) && 1 # define LIBXSMM_MALLOC_CRC_LIGHT #endif #if !defined(LIBXSMM_MALLOC_CRC_OFF) # if defined(NDEBUG) && !defined(LIBXSMM_MALLOC_HOOK_STATIC) && !defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) # define LIBXSMM_MALLOC_CRC_OFF # elif !defined(LIBXSMM_BUILD) # define LIBXSMM_MALLOC_CRC_OFF # endif #endif #if !defined(LIBXSMM_MALLOC_SCRATCH_LIMIT) # define LIBXSMM_MALLOC_SCRATCH_LIMIT 0xFFFFFFFF /* ~4 GB */ #endif #if !defined(LIBXSMM_MALLOC_SCRATCH_PADDING) # define LIBXSMM_MALLOC_SCRATCH_PADDING LIBXSMM_CACHELINE #endif /* pointers are checked first if they belong to scratch */ #if !defined(LIBXSMM_MALLOC_SCRATCH_DELETE_FIRST) && 1 # define LIBXSMM_MALLOC_SCRATCH_DELETE_FIRST #endif /* can clobber memory if allocations are not exactly scoped */ #if !defined(LIBXSMM_MALLOC_SCRATCH_TRIM_HEAD) && 0 # define LIBXSMM_MALLOC_SCRATCH_TRIM_HEAD #endif #if !defined(LIBXSMM_MALLOC_SCRATCH_JOIN) && 1 # define LIBXSMM_MALLOC_SCRATCH_JOIN #endif #if !defined(LIBXSMM_MALLOC_LOCK_ONFAULT) && 0 # if defined(MLOCK_ONFAULT) && defined(SYS_mlock2) # define LIBXSMM_MALLOC_LOCK_ONFAULT # endif #endif /* protected against double-delete (if possible) */ #if 
!defined(LIBXSMM_MALLOC_DELETE_SAFE) && 0 # define LIBXSMM_MALLOC_DELETE_SAFE #endif /* map memory for scratch buffers */ #if !defined(LIBXSMM_MALLOC_MMAP_SCRATCH) && 1 # define LIBXSMM_MALLOC_MMAP_SCRATCH #endif /* map memory for hooked allocation */ #if !defined(LIBXSMM_MALLOC_MMAP_HOOK) && 1 # define LIBXSMM_MALLOC_MMAP_HOOK #endif /* map memory also for non-executable buffers */ #if !defined(LIBXSMM_MALLOC_MMAP) && 1 # define LIBXSMM_MALLOC_MMAP #endif #if defined(LIBXSMM_MALLOC_ALIGN_ALL) # define INTERNAL_MALLOC_AUTOALIGN(SIZE, ALIGNMENT) libxsmm_alignment(SIZE, ALIGNMENT) #else # define INTERNAL_MALLOC_AUTOALIGN(SIZE, ALIGNMENT) (ALIGNMENT) #endif #define INTERNAL_MEMALIGN_HOOK(RESULT, FLAGS, ALIGNMENT, SIZE, CALLER) { \ const int internal_memalign_hook_recursive_ = LIBXSMM_ATOMIC_ADD_FETCH( \ &internal_malloc_recursive, 1, LIBXSMM_ATOMIC_RELAXED); \ if ( 1 < internal_memalign_hook_recursive_ /* protect against recursion */ \ || 0 == (internal_malloc_kind & 1) || 0 >= internal_malloc_kind \ || (internal_malloc_limit[0] > (SIZE)) \ || (internal_malloc_limit[1] < (SIZE) && 0 != internal_malloc_limit[1])) \ { \ const size_t internal_memalign_hook_alignment_ = INTERNAL_MALLOC_AUTOALIGN(SIZE, ALIGNMENT); \ (RESULT) = (0 != internal_memalign_hook_alignment_ \ ? 
__real_memalign(internal_memalign_hook_alignment_, SIZE) \ : __real_malloc(SIZE)); \ } \ else { /* redirect */ \ LIBXSMM_INIT \ if (NULL == (CALLER)) { /* libxsmm_trace_caller_id may allocate memory */ \ internal_scratch_malloc(&(RESULT), SIZE, ALIGNMENT, FLAGS, \ libxsmm_trace_caller_id(0/*level*/)); \ } \ else { \ internal_scratch_malloc(&(RESULT), SIZE, ALIGNMENT, FLAGS, CALLER); \ } \ } \ LIBXSMM_ATOMIC_SUB_FETCH(&internal_malloc_recursive, 1, LIBXSMM_ATOMIC_RELAXED); \ } #define INTERNAL_REALLOC_HOOK(RESULT, FLAGS, PTR, SIZE, CALLER) { \ if (0 == (internal_malloc_kind & 1) || 0 >= internal_malloc_kind \ /*|| (0 != LIBXSMM_ATOMIC_LOAD(&internal_malloc_recursive, LIBXSMM_ATOMIC_RELAXED))*/ \ || (internal_malloc_limit[0] > (SIZE)) \ || (internal_malloc_limit[1] < (SIZE) && 0 != internal_malloc_limit[1])) \ { \ (RESULT) = __real_realloc(PTR, SIZE); \ } \ else { \ const int nzeros = LIBXSMM_INTRINSICS_BITSCANFWD64((uintptr_t)(PTR)), alignment = 1 << nzeros; \ LIBXSMM_ASSERT(0 == ((uintptr_t)(PTR) & ~(0xFFFFFFFFFFFFFFFF << nzeros))); \ if (NULL == (CALLER)) { /* libxsmm_trace_caller_id may allocate memory */ \ internal_scratch_malloc(&(PTR), SIZE, (size_t)alignment, FLAGS, \ libxsmm_trace_caller_id(0/*level*/)); \ } \ else { \ internal_scratch_malloc(&(PTR), SIZE, (size_t)alignment, FLAGS, CALLER); \ } \ (RESULT) = (PTR); \ } \ } #define INTERNAL_FREE_HOOK(PTR, CALLER) { \ LIBXSMM_UNUSED(CALLER); \ if (0 == (internal_malloc_kind & 1) || 0 >= internal_malloc_kind \ /*|| (0 != LIBXSMM_ATOMIC_LOAD(&internal_malloc_recursive, LIBXSMM_ATOMIC_RELAXED))*/ \ ){ \ __real_free(PTR); \ } \ else { /* recognize pointers not issued by LIBXSMM */ \ libxsmm_free(PTR); \ } \ } #if !defined(WIN32) # if defined(MAP_32BIT) # define IF_INTERNAL_XMALLOC_MAP32(ENV, MAPSTATE, MFLAGS, SIZE, BUFFER, REPTR) \ if (0 != (MAP_32BIT & (MFLAGS))) { \ (BUFFER) = internal_xmalloc_xmap(ENV, SIZE, (MFLAGS) & ~MAP_32BIT, REPTR); \ } \ if (MAP_FAILED != (BUFFER)) (MAPSTATE) = 0; else # else # define 
IF_INTERNAL_XMALLOC_MAP32(ENV, MAPSTATE, MFLAGS, SIZE, BUFFER, REPTR) # endif # define INTERNAL_XMALLOC(I, FALLBACK, ENVVAR, ENVDEF, MAPSTATE, MFLAGS, SIZE, BUFFER, REPTR) \ if ((I) == (FALLBACK)) { \ static const char* internal_xmalloc_env_ = NULL; \ if (NULL == internal_xmalloc_env_) { \ internal_xmalloc_env_ = getenv(ENVVAR); \ if (NULL == internal_xmalloc_env_) internal_xmalloc_env_ = ENVDEF; \ } \ (BUFFER) = internal_xmalloc_xmap(internal_xmalloc_env_, SIZE, MFLAGS, REPTR); \ if (MAP_FAILED == (BUFFER)) { \ IF_INTERNAL_XMALLOC_MAP32(internal_xmalloc_env_, MAPSTATE, MFLAGS, SIZE, BUFFER, REPTR) \ (FALLBACK) = (I) + 1; \ } \ } # define INTERNAL_XMALLOC_WATERMARK(NAME, WATERMARK, LIMIT, SIZE) { \ const size_t internal_xmalloc_watermark_ = (WATERMARK) + (SIZE) / 2; /* accept data-race */ \ if (internal_xmalloc_watermark_ < (LIMIT)) { \ static size_t internal_xmalloc_watermark_verbose_ = 0; \ (LIMIT) = internal_xmalloc_watermark_; /* accept data-race */ \ if (internal_xmalloc_watermark_verbose_ < internal_xmalloc_watermark_ && \ (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity)) \ { /* muted */ \ char internal_xmalloc_watermark_buffer_[32]; \ /* coverity[check_return] */ \ libxsmm_format_size(internal_xmalloc_watermark_buffer_, sizeof(internal_xmalloc_watermark_buffer_), \ internal_xmalloc_watermark_, "KM", "B", 10); \ fprintf(stderr, "LIBXSMM WARNING: " NAME " watermark reached at %s!\n", internal_xmalloc_watermark_buffer_); \ internal_xmalloc_watermark_verbose_ = internal_xmalloc_watermark_; \ } \ } \ } # define INTERNAL_XMALLOC_KIND(KIND, NAME, FLAG, FLAGS, MFLAGS, WATERMARK, LIMIT, INFO, SIZE, BUFFER) \ if (0 != ((KIND) & (MFLAGS))) { \ if (MAP_FAILED != (BUFFER)) { \ LIBXSMM_ASSERT(NULL != (BUFFER)); \ LIBXSMM_ATOMIC_ADD_FETCH(&(WATERMARK), SIZE, LIBXSMM_ATOMIC_RELAXED); \ (FLAGS) |= (FLAG); \ } \ else { /* retry */ \ (BUFFER) = mmap(NULL == (INFO) ? 
NULL : (INFO)->pointer, SIZE, PROT_READ | PROT_WRITE, \ MAP_PRIVATE | LIBXSMM_MAP_ANONYMOUS | ((MFLAGS) & ~(KIND)), -1, 0/*offset*/); \ if (MAP_FAILED != (BUFFER)) { /* successful retry */ \ LIBXSMM_ASSERT(NULL != (BUFFER)); \ INTERNAL_XMALLOC_WATERMARK(NAME, WATERMARK, LIMIT, SIZE); \ } \ } \ } #endif LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_malloc_info_type { libxsmm_free_function free; void *pointer, *reloc; const void* context; size_t size; int flags; #if defined(LIBXSMM_VTUNE) unsigned int code_id; #endif #if !defined(LIBXSMM_MALLOC_CRC_OFF) /* hash *must* be the last entry */ unsigned int hash; #endif } internal_malloc_info_type; LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_malloc_pool_type { char pad[LIBXSMM_MALLOC_SCRATCH_PADDING]; struct { size_t minsize, counter, incsize; char *buffer, *head; #if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) const void* site; # if (0 != LIBXSMM_SYNC) unsigned int tid; # endif #endif } instance; } internal_malloc_pool_type; /* Scratch pool, which supports up to MAX_NSCRATCH allocation sites. */ #if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) /* LIBXSMM_ALIGNED appears to contradict LIBXSMM_APIVAR, and causes multiple defined symbols (if below is seen in multiple translation units) */ LIBXSMM_APIVAR_DEFINE(char internal_malloc_pool_buffer[(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)*sizeof(internal_malloc_pool_type)+(LIBXSMM_MALLOC_SCRATCH_PADDING)-1]); #endif /* Interval of bytes that permit interception (internal_malloc_kind) */ LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_limit[2]); /* Maximum total size of the scratch memory domain. 
*/ LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_scratch_limit); LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_scratch_nmallocs); LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_private_max); LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_private_cur); LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_public_max); LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_public_cur); LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_local_max); LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_local_cur); LIBXSMM_APIVAR_DEFINE(int internal_malloc_recursive); /** 0: regular, 1/odd: intercept/scratch, otherwise: all/scratch */ LIBXSMM_APIVAR_DEFINE(int internal_malloc_kind); #if (0 != LIBXSMM_SYNC) && defined(LIBXSMM_MALLOC_SCRATCH_JOIN) LIBXSMM_APIVAR_DEFINE(int internal_malloc_join); #endif #if !defined(_WIN32) # if defined(MAP_HUGETLB) LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_hugetlb); # endif # if defined(MAP_LOCKED) LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_plocked); # endif #endif LIBXSMM_API_INTERN size_t libxsmm_alignment(size_t size, size_t alignment) { size_t result; if ((LIBXSMM_MALLOC_ALIGNFCT * LIBXSMM_MALLOC_ALIGNMAX) <= size) { result = libxsmm_lcm(0 == alignment ? (LIBXSMM_ALIGNMENT) : libxsmm_lcm(alignment, LIBXSMM_ALIGNMENT), LIBXSMM_MALLOC_ALIGNMAX); } else { /* small-size request */ if ((LIBXSMM_MALLOC_ALIGNFCT * LIBXSMM_ALIGNMENT) <= size) { result = (0 == alignment ? (LIBXSMM_ALIGNMENT) : libxsmm_lcm(alignment, LIBXSMM_ALIGNMENT)); } else if (0 != alignment) { /* custom alignment */ result = libxsmm_lcm(alignment, sizeof(void*)); } else { /* tiny-size request */ result = sizeof(void*); } } return result; } LIBXSMM_API size_t libxsmm_offset(const size_t offset[], const size_t shape[], size_t ndims, size_t* size) { size_t result = 0, size1 = 0; if (0 != ndims && NULL != shape) { size_t i; result = (NULL != offset ? offset[0] : 0); size1 = shape[0]; for (i = 1; i < ndims; ++i) { result += (NULL != offset ? 
offset[i] : 0) * size1; size1 *= shape[i]; } } if (NULL != size) *size = size1; return result; } LIBXSMM_API_INLINE internal_malloc_info_type* internal_malloc_info(const void* memory, int check) { const char *const buffer = (const char*)memory; internal_malloc_info_type* result = (internal_malloc_info_type*)(NULL != memory ? (buffer - sizeof(internal_malloc_info_type)) : NULL); #if defined(LIBXSMM_MALLOC_HOOK_CHECK) if ((LIBXSMM_MALLOC_HOOK_CHECK) < check) check = (LIBXSMM_MALLOC_HOOK_CHECK); #endif if (0 != check && NULL != result) { /* check ownership */ #if !defined(_WIN32) /* mprotect: pass address rounded down to page/4k alignment */ if (1 == check || 0 == mprotect((void*)(((uintptr_t)result) & 0xFFFFFFFFFFFFF000), sizeof(internal_malloc_info_type), PROT_READ | PROT_WRITE) || ENOMEM != errno) #endif { const size_t maxsize = LIBXSMM_MAX(LIBXSMM_MAX(internal_malloc_public_max, internal_malloc_local_max), internal_malloc_private_max); const int flags_rs = LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_SCRATCH; const int flags_mx = LIBXSMM_MALLOC_FLAG_MMAP | LIBXSMM_MALLOC_FLAG_X; const char* const pointer = (const char*)result->pointer; union { libxsmm_free_fun fun; const void* ptr; } convert; convert.fun = result->free.function; if (((flags_mx != (flags_mx & result->flags)) && NULL != result->reloc) || (0 == (LIBXSMM_MALLOC_FLAG_X & result->flags) ? 
0 : (0 != (flags_rs & result->flags))) || (0 != (LIBXSMM_MALLOC_FLAG_X & result->flags) && NULL != result->context) #if defined(LIBXSMM_VTUNE) || (0 == (LIBXSMM_MALLOC_FLAG_X & result->flags) && 0 != result->code_id) #endif || (0 != (~LIBXSMM_MALLOC_FLAG_VALID & result->flags)) || (0 == (LIBXSMM_MALLOC_FLAG_R & result->flags)) || pointer == convert.ptr || pointer == result->context || pointer >= buffer || NULL == pointer || maxsize < result->size || 0 == result->size || 2 > libxsmm_ninit /* before checksum calculation */ #if !defined(LIBXSMM_MALLOC_CRC_OFF) /* last check: checksum over info */ # if defined(LIBXSMM_MALLOC_CRC_LIGHT) || result->hash != LIBXSMM_CRC32U(LIBXSMM_BITS)(LIBXSMM_MALLOC_SEED, &result) # else || result->hash != libxsmm_crc32(LIBXSMM_MALLOC_SEED, result, (const char*)&result->hash - (const char*)result) # endif #endif ) { /* mismatch */ result = NULL; } } #if !defined(_WIN32) else { /* mismatch */ result = NULL; } #endif } return result; } LIBXSMM_API_INTERN int internal_xfree(const void* /*memory*/, internal_malloc_info_type* /*info*/); LIBXSMM_API_INTERN int internal_xfree(const void* memory, internal_malloc_info_type* info) { #if !defined(LIBXSMM_BUILD) || !defined(_WIN32) static int error_once = 0; #endif int result = EXIT_SUCCESS, flags; void* buffer; size_t size; LIBXSMM_ASSERT(NULL != memory && NULL != info); buffer = info->pointer; flags = info->flags; size = info->size; #if !defined(LIBXSMM_BUILD) /* sanity check */ if (NULL != buffer || 0 == size) #endif { const size_t alloc_size = size + (((const char*)memory) - ((const char*)buffer)); LIBXSMM_ASSERT(NULL != buffer || 0 == size); if (0 == (LIBXSMM_MALLOC_FLAG_MMAP & flags)) { if (NULL != info->free.function) { #if defined(LIBXSMM_MALLOC_DELETE_SAFE) info->pointer = NULL; info->size = 0; #endif if (NULL == info->context) { #if (defined(LIBXSMM_MALLOC_HOOK_STATIC) || defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)) && 0 if (free == info->free.function) { __real_free(buffer); } else #endif if 
(NULL != info->free.function) { info->free.function(buffer); } } else { LIBXSMM_ASSERT(NULL != info->free.ctx_form); info->free.ctx_form(buffer, info->context); } } } else { #if defined(LIBXSMM_VTUNE) if (0 != (LIBXSMM_MALLOC_FLAG_X & flags) && 0 != info->code_id && iJIT_SAMPLING_ON == iJIT_IsProfilingActive()) { iJIT_NotifyEvent(LIBXSMM_VTUNE_JIT_UNLOAD, &info->code_id); } #endif #if defined(_WIN32) result = (NULL == buffer || FALSE != VirtualFree(buffer, 0, MEM_RELEASE)) ? EXIT_SUCCESS : EXIT_FAILURE; #else /* !_WIN32 */ { const size_t unmap_size = LIBXSMM_UP2(alloc_size, LIBXSMM_PAGE_MINSIZE); void* const reloc = info->reloc; if (0 != munmap(buffer, unmap_size)) { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: %s (attempted to unmap buffer %p+%" PRIuPTR ")!\n", strerror(errno), buffer, (uintptr_t)unmap_size); } result = EXIT_FAILURE; } if (0 != (LIBXSMM_MALLOC_FLAG_X & flags) && EXIT_SUCCESS == result && NULL != reloc && MAP_FAILED != reloc && buffer != reloc && 0 != munmap(reloc, unmap_size)) { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: %s (attempted to unmap code %p+%" PRIuPTR ")!\n", strerror(errno), reloc, (uintptr_t)unmap_size); } result = EXIT_FAILURE; } } #endif } if (0 == (LIBXSMM_MALLOC_FLAG_X & flags)) { /* update statistics */ #if !defined(_WIN32) # if defined(MAP_HUGETLB) if (0 != (LIBXSMM_MALLOC_FLAG_PHUGE & flags)) { /* huge pages */ LIBXSMM_ASSERT(0 != (LIBXSMM_MALLOC_FLAG_MMAP & flags)); LIBXSMM_ATOMIC_SUB_FETCH(&internal_malloc_hugetlb, alloc_size, LIBXSMM_ATOMIC_RELAXED); } # endif # if defined(MAP_LOCKED) if (0 != (LIBXSMM_MALLOC_FLAG_PLOCK & flags)) { /* page-locked */ LIBXSMM_ASSERT(0 != (LIBXSMM_MALLOC_FLAG_MMAP & flags)); LIBXSMM_ATOMIC_SUB_FETCH(&internal_malloc_plocked, 
alloc_size, LIBXSMM_ATOMIC_RELAXED); } # endif #endif if (0 == (LIBXSMM_MALLOC_FLAG_PRIVATE & flags)) { /* public */ if (0 != (LIBXSMM_MALLOC_FLAG_SCRATCH & flags)) { /* scratch */ #if 1 const size_t current = (size_t)LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)( &internal_malloc_public_cur, LIBXSMM_ATOMIC_RELAXED); LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)(&internal_malloc_public_cur, alloc_size <= current ? (current - alloc_size) : 0, LIBXSMM_ATOMIC_RELAXED); #else LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_SUB_FETCH, LIBXSMM_BITS)( &internal_malloc_public_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED); #endif } else { /* local */ #if 1 const size_t current = (size_t)LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)( &internal_malloc_local_cur, LIBXSMM_ATOMIC_RELAXED); LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)(&internal_malloc_local_cur, alloc_size <= current ? (current - alloc_size) : 0, LIBXSMM_ATOMIC_RELAXED); #else LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_SUB_FETCH, LIBXSMM_BITS)( &internal_malloc_local_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED); #endif } } else { /* private */ #if 1 const size_t current = (size_t)LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)( &internal_malloc_private_cur, LIBXSMM_ATOMIC_RELAXED); LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)(&internal_malloc_private_cur, alloc_size <= current ? 
(current - alloc_size) : 0, LIBXSMM_ATOMIC_RELAXED); #else LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_SUB_FETCH, LIBXSMM_BITS)( &internal_malloc_private_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED); #endif } } } #if !defined(LIBXSMM_BUILD) else if ((LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM WARNING: attempt to release memory from non-matching implementation!\n"); } #endif return result; } LIBXSMM_API_INLINE size_t internal_get_scratch_size(const internal_malloc_pool_type* exclude) { size_t result = 0; #if !defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) || (1 >= (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) LIBXSMM_UNUSED(exclude); #else const internal_malloc_pool_type* pool = (const internal_malloc_pool_type*)LIBXSMM_UP2( (uintptr_t)internal_malloc_pool_buffer, LIBXSMM_MALLOC_SCRATCH_PADDING); # if (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) const internal_malloc_pool_type *const end = pool + libxsmm_scratch_pools; LIBXSMM_ASSERT(libxsmm_scratch_pools <= LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS); for (; pool != end; ++pool) # endif /*(1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))*/ { if (0 != pool->instance.minsize) { # if 1 /* memory info is not used */ if (pool != exclude && (LIBXSMM_MALLOC_INTERNAL_CALLER) != pool->instance.site) { result += pool->instance.minsize; } # else const internal_malloc_info_type* const info = internal_malloc_info(pool->instance.buffer, 0/*no check*/); if (NULL != info && pool != exclude && (LIBXSMM_MALLOC_INTERNAL_CALLER) != pool->instance.site) { result += info->size; } # endif } else break; /* early exit */ } #endif /*defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))*/ return result; } LIBXSMM_API_INLINE internal_malloc_pool_type* internal_scratch_malloc_pool(const void* memory) { internal_malloc_pool_type* result = NULL; internal_malloc_pool_type* pool = 
(internal_malloc_pool_type*)LIBXSMM_UP2( (uintptr_t)internal_malloc_pool_buffer, LIBXSMM_MALLOC_SCRATCH_PADDING); const char* const buffer = (const char*)memory; #if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) const unsigned int npools = libxsmm_scratch_pools; #else const unsigned int npools = 1; #endif internal_malloc_pool_type *const end = pool + npools; LIBXSMM_ASSERT(npools <= LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS); LIBXSMM_ASSERT(NULL != memory); for (; pool != end; ++pool) { if (0 != pool->instance.minsize) { if (0 != pool->instance.counter #if 1 /* should be implied by non-zero counter */ && NULL != pool->instance.buffer #endif ){/* check if memory belongs to scratch domain or local domain */ #if 1 const size_t size = pool->instance.minsize; #else const internal_malloc_info_type* const info = internal_malloc_info(pool->instance.buffer, 0/*no check*/); const size_t size = info->size; #endif if (pool->instance.buffer == buffer /* fast path */ || (pool->instance.buffer < buffer && buffer < (pool->instance.buffer + size))) { result = pool; break; } } } else break; /* early exit */ } return result; } LIBXSMM_API_INTERN void internal_scratch_free(const void* /*memory*/, internal_malloc_pool_type* /*pool*/); LIBXSMM_API_INTERN void internal_scratch_free(const void* memory, internal_malloc_pool_type* pool) { #if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) const size_t counter = LIBXSMM_ATOMIC_SUB_FETCH(&pool->instance.counter, 1, LIBXSMM_ATOMIC_SEQ_CST); char* const pool_buffer = pool->instance.buffer; # if !defined(NDEBUG) || defined(LIBXSMM_MALLOC_SCRATCH_TRIM_HEAD) char *const buffer = (char*)memory; /* non-const */ LIBXSMM_ASSERT(pool_buffer <= buffer && buffer < pool_buffer + pool->instance.minsize); # endif LIBXSMM_ASSERT(pool_buffer <= pool->instance.head); if (0 == counter) { /* reuse or reallocate scratch domain */ internal_malloc_info_type *const info = 
internal_malloc_info(pool_buffer, 0/*no check*/); const size_t scale_size = (size_t)(1 != libxsmm_scratch_scale ? (libxsmm_scratch_scale * info->size) : info->size); /* hysteresis */ const size_t size = pool->instance.minsize + pool->instance.incsize; LIBXSMM_ASSERT(0 == (LIBXSMM_MALLOC_FLAG_X & info->flags)); /* scratch memory is not executable */ if (size <= scale_size) { /* reuse scratch domain */ pool->instance.head = pool_buffer; /* reuse scratch domain */ } else { /* release buffer */ # if !defined(NDEBUG) static int error_once = 0; # endif pool->instance.buffer = pool->instance.head = NULL; # if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) pool->instance.site = NULL; /* clear affinity */ # endif # if !defined(NDEBUG) if (EXIT_SUCCESS != internal_xfree(pool_buffer, info) && 0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: memory deallocation failed!\n"); } # else internal_xfree(pool_buffer, info); /* !libxsmm_free */ # endif } } # if defined(LIBXSMM_MALLOC_SCRATCH_TRIM_HEAD) /* TODO: document linear/scoped allocator policy */ else if (buffer < pool->instance.head) { /* reuse scratch domain */ pool->instance.head = buffer; } # else LIBXSMM_UNUSED(memory); # endif #else LIBXSMM_UNUSED(memory); LIBXSMM_UNUSED(pool); #endif } LIBXSMM_API_INTERN void internal_scratch_malloc(void** /*memory*/, size_t /*size*/, size_t /*alignment*/, int /*flags*/, const void* /*caller*/); LIBXSMM_API_INTERN void internal_scratch_malloc(void** memory, size_t size, size_t alignment, int flags, const void* caller) { LIBXSMM_ASSERT(NULL != memory && 0 == (LIBXSMM_MALLOC_FLAG_X & flags)); if (0 == (LIBXSMM_MALLOC_FLAG_REALLOC & flags) || NULL == *memory) { static int error_once = 0; size_t local_size = 0; #if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) if (0 < 
libxsmm_scratch_pools) { internal_malloc_pool_type *const pools = (internal_malloc_pool_type*)LIBXSMM_UP2( (uintptr_t)internal_malloc_pool_buffer, LIBXSMM_MALLOC_SCRATCH_PADDING); internal_malloc_pool_type *const end = pools + libxsmm_scratch_pools, *pool = pools; const size_t align_size = libxsmm_alignment(size, alignment), alloc_size = size + align_size - 1; # if (0 != LIBXSMM_SYNC) const unsigned int tid = libxsmm_get_tid(); # endif unsigned int npools = 1; # if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) const void *const site = caller; /* no further attempt in case of NULL */ internal_malloc_pool_type *pool0 = end; for (; pool != end; ++pool) { /* counter: memory info is not employed as pools are still manipulated */ if (NULL != pool->instance.buffer) { if ((LIBXSMM_MALLOC_INTERNAL_CALLER) != pool->instance.site) ++npools; /* count number of occupied pools */ if ( /* find matching pool and enter fast path (draw from pool-buffer) */ # if (0 != LIBXSMM_SYNC) && !defined(LIBXSMM_MALLOC_SCRATCH_JOIN) (site == pool->instance.site && tid == pool->instance.tid)) # elif (0 != LIBXSMM_SYNC) (site == pool->instance.site && (0 != internal_malloc_join || tid == pool->instance.tid))) # else (site == pool->instance.site)) # endif { break; } } else { if (end == pool0) pool0 = pool; /* first available pool*/ if (0 == pool->instance.minsize) { /* early exit */ pool = pool0; break; } } } # endif LIBXSMM_ASSERT(NULL != pool); if (end != pool && 0 <= internal_malloc_kind) { const size_t counter = LIBXSMM_ATOMIC_ADD_FETCH(&pool->instance.counter, (size_t)1, LIBXSMM_ATOMIC_SEQ_CST); if (NULL != pool->instance.buffer || 1 != counter) { /* attempt to (re-)use existing pool */ const internal_malloc_info_type *const info = internal_malloc_info(pool->instance.buffer, 1/*check*/); const size_t pool_size = ((NULL != info && 0 != counter) ? 
info->size : 0); const size_t used_size = pool->instance.head - pool->instance.buffer; const size_t req_size = alloc_size + used_size; if (req_size <= pool_size) { /* fast path: draw from pool-buffer */ # if (0 != LIBXSMM_SYNC) && defined(LIBXSMM_MALLOC_SCRATCH_JOIN) void *const headaddr = &pool->instance.head; char *const head = (0 == internal_malloc_join ? (pool->instance.head += alloc_size) : ((char*)LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_ADD_FETCH, LIBXSMM_BITS)( (uintptr_t*)headaddr, alloc_size, LIBXSMM_ATOMIC_SEQ_CST))); # else char *const head = (char*)(pool->instance.head += alloc_size); # endif *memory = LIBXSMM_ALIGN(head - alloc_size, align_size); } else { /* fall-back to local memory allocation */ const size_t incsize = req_size - LIBXSMM_MIN(pool_size, req_size); pool->instance.incsize = LIBXSMM_MAX(pool->instance.incsize, incsize); # if (0 != LIBXSMM_SYNC) && defined(LIBXSMM_MALLOC_SCRATCH_JOIN) if (0 == internal_malloc_join) { --pool->instance.counter; } else { LIBXSMM_ATOMIC_SUB_FETCH(&pool->instance.counter, 1, LIBXSMM_ATOMIC_SEQ_CST); } # else --pool->instance.counter; # endif if ( # if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) (LIBXSMM_MALLOC_INTERNAL_CALLER) != pool->instance.site && # endif 0 == (LIBXSMM_MALLOC_FLAG_PRIVATE & flags)) { const size_t watermark = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_ADD_FETCH, LIBXSMM_BITS)( &internal_malloc_local_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED); if (internal_malloc_local_max < watermark) internal_malloc_local_max = watermark; /* accept data-race */ } else { const size_t watermark = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_ADD_FETCH, LIBXSMM_BITS)( &internal_malloc_private_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED); if (internal_malloc_private_max < watermark) internal_malloc_private_max = watermark; /* accept data-race */ } local_size = size; } } else { /* fresh pool */ const size_t scratch_limit = libxsmm_get_scratch_limit(); const size_t scratch_size = internal_get_scratch_size(pool); 
/* exclude current pool */ const size_t limit_size = (1 < npools ? (scratch_limit - LIBXSMM_MIN(scratch_size, scratch_limit)) : LIBXSMM_SCRATCH_UNLIMITED); const size_t scale_size = (size_t)(1 != libxsmm_scratch_scale ? (libxsmm_scratch_scale * alloc_size) : alloc_size); /* hysteresis */ const size_t incsize = (size_t)(libxsmm_scratch_scale * pool->instance.incsize); const size_t maxsize = LIBXSMM_MAX(scale_size, pool->instance.minsize) + incsize; const size_t limsize = LIBXSMM_MIN(maxsize, limit_size); const size_t minsize = limsize; assert(1 <= libxsmm_scratch_scale); /* !LIBXSMM_ASSERT */ LIBXSMM_ASSERT(1 == counter); pool->instance.incsize = 0; /* reset */ pool->instance.minsize = minsize; # if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) pool->instance.site = site; # if (0 != LIBXSMM_SYNC) pool->instance.tid = tid; # endif # endif if (alloc_size <= minsize && /* allocate scratch pool */ EXIT_SUCCESS == libxsmm_xmalloc(memory, minsize, 0/*auto-align*/, (flags | LIBXSMM_MALLOC_FLAG_SCRATCH) & ~LIBXSMM_MALLOC_FLAG_REALLOC, NULL/*extra*/, 0/*extra_size*/)) { pool->instance.buffer = (char*)*memory; pool->instance.head = pool->instance.buffer + alloc_size; *memory = LIBXSMM_ALIGN((char*)*memory, align_size); # if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) if ((LIBXSMM_MALLOC_INTERNAL_CALLER) != pool->instance.site) # endif { LIBXSMM_ATOMIC_ADD_FETCH(&internal_malloc_scratch_nmallocs, 1, LIBXSMM_ATOMIC_RELAXED); } } else { /* fall-back to local allocation */ LIBXSMM_ATOMIC_SUB_FETCH(&pool->instance.counter, 1, LIBXSMM_ATOMIC_SEQ_CST); if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { if (alloc_size <= minsize) { fprintf(stderr, "LIBXSMM ERROR: failed to allocate scratch memory!\n"); } else if ((LIBXSMM_MALLOC_INTERNAL_CALLER) != caller && (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 
> libxsmm_verbosity)) { fprintf(stderr, "LIBXSMM WARNING: scratch memory domain exhausted!\n"); } } local_size = size; } } } else { /* fall-back to local memory allocation */ local_size = size; } } else { /* fall-back to local memory allocation */ local_size = size; } if (0 != local_size) #else local_size = size; #endif /*defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))*/ { /* local memory allocation */ if (EXIT_SUCCESS != libxsmm_xmalloc(memory, local_size, alignment, flags & ~(LIBXSMM_MALLOC_FLAG_SCRATCH | LIBXSMM_MALLOC_FLAG_REALLOC), NULL/*extra*/, 0/*extra_size*/) && /* library code is expected to be mute */0 != libxsmm_verbosity && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: scratch memory fall-back failed!\n"); LIBXSMM_ASSERT(NULL == *memory); } if ((LIBXSMM_MALLOC_INTERNAL_CALLER) != caller) { LIBXSMM_ATOMIC_ADD_FETCH(&internal_malloc_scratch_nmallocs, 1, LIBXSMM_ATOMIC_RELAXED); } } } else { /* reallocate memory */ const void *const preserve = *memory; #if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) internal_malloc_pool_type *const pool = internal_scratch_malloc_pool(preserve); if (NULL != pool) { const internal_malloc_info_type *const info = internal_malloc_info(pool->instance.buffer, 0/*no check*/); void* buffer; LIBXSMM_ASSERT(pool->instance.buffer <= pool->instance.head && NULL != info); internal_scratch_malloc(&buffer, size, alignment, ~LIBXSMM_MALLOC_FLAG_REALLOC & (LIBXSMM_MALLOC_FLAG_SCRATCH | flags), caller); if (NULL != buffer) { memcpy(buffer, preserve, LIBXSMM_MIN(size, info->size)); /* TODO: memmove? 
*/ *memory = buffer; } internal_scratch_free(memory, pool); } else #endif { /* non-pooled (potentially foreign pointer) */ #if !defined(NDEBUG) const int status = #endif libxsmm_xmalloc(memory, size, alignment/* no need here to determine alignment of given buffer */, ~LIBXSMM_MALLOC_FLAG_SCRATCH & flags, NULL/*extra*/, 0/*extra_size*/); assert(EXIT_SUCCESS == status || NULL == *memory); /* !LIBXSMM_ASSERT */ } } } #if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_malloc_fntype libxsmm_malloc_fn); #if defined(LIBXSMM_MALLOC_HOOK_QKMALLOC) LIBXSMM_API_INTERN void* internal_memalign_malloc(size_t /*alignment*/, size_t /*size*/); LIBXSMM_API_INTERN void* internal_memalign_malloc(size_t alignment, size_t size) { LIBXSMM_UNUSED(alignment); LIBXSMM_ASSERT(NULL != libxsmm_malloc_fn.malloc.dlsym); return libxsmm_malloc_fn.malloc.ptr(size); } #elif defined(LIBXSMM_MALLOC_HOOK_KMP) LIBXSMM_API_INTERN void* internal_memalign_twiddle(size_t /*alignment*/, size_t /*size*/); LIBXSMM_API_INTERN void* internal_memalign_twiddle(size_t alignment, size_t size) { LIBXSMM_ASSERT(NULL != libxsmm_malloc_fn.alignmem.dlsym); return libxsmm_malloc_fn.alignmem.ptr(size, alignment); } #endif #endif /*defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)*/ #if (defined(LIBXSMM_MALLOC_HOOK_STATIC) || defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)) LIBXSMM_API_INTERN void* internal_memalign_hook(size_t /*alignment*/, size_t /*size*/, const void* /*caller*/); LIBXSMM_API_INTERN void* internal_memalign_hook(size_t alignment, size_t size, const void* caller) { void* result; # if defined(LIBXSMM_MALLOC_MMAP_HOOK) INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, alignment, size, caller); # else INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, alignment, size, caller); # endif return result; } LIBXSMM_API void* __wrap_memalign(size_t /*alignment*/, size_t /*size*/); LIBXSMM_API void* __wrap_memalign(size_t alignment, size_t size) { void* result; # if defined(LIBXSMM_MALLOC_MMAP_HOOK) 
INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, alignment, size, NULL/*caller*/); # else INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, alignment, size, NULL/*caller*/); # endif return result; } LIBXSMM_API_INTERN void* internal_malloc_hook(size_t /*size*/, const void* /*caller*/); LIBXSMM_API_INTERN void* internal_malloc_hook(size_t size, const void* caller) { return internal_memalign_hook(0/*auto-alignment*/, size, caller); } LIBXSMM_API void* __wrap_malloc(size_t /*size*/); LIBXSMM_API void* __wrap_malloc(size_t size) { void* result; # if defined(LIBXSMM_MALLOC_MMAP_HOOK) INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, 0/*auto-alignment*/, size, NULL/*caller*/); # else INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, 0/*auto-alignment*/, size, NULL/*caller*/); # endif return result; } #if defined(LIBXSMM_MALLOC_HOOK_CALLOC) LIBXSMM_API void* __wrap_calloc(size_t /*num*/, size_t /*size*/); LIBXSMM_API void* __wrap_calloc(size_t num, size_t size) { void* result; const size_t nbytes = num * size; # if defined(LIBXSMM_MALLOC_MMAP_HOOK) INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, 0/*auto-alignment*/, nbytes, NULL/*caller*/); # else INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, 0/*auto-alignment*/, nbytes, NULL/*caller*/); # endif /* TODO: signal anonymous/zeroed pages */ if (NULL != result) memset(result, 0, nbytes); return result; } #endif #if defined(LIBXSMM_MALLOC_HOOK_REALLOC) LIBXSMM_API_INTERN void* internal_realloc_hook(void* /*ptr*/, size_t /*size*/, const void* /*caller*/); LIBXSMM_API_INTERN void* internal_realloc_hook(void* ptr, size_t size, const void* caller) { void* result; # if defined(LIBXSMM_MALLOC_MMAP_HOOK) INTERNAL_REALLOC_HOOK(result, LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_MMAP, ptr, size, caller); # else INTERNAL_REALLOC_HOOK(result, LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_DEFAULT, ptr, size, caller); # endif return result; } LIBXSMM_API void* 
__wrap_realloc(void* /*ptr*/, size_t /*size*/); LIBXSMM_API void* __wrap_realloc(void* ptr, size_t size) { void* result; # if defined(LIBXSMM_MALLOC_MMAP_HOOK) INTERNAL_REALLOC_HOOK(result, LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_MMAP, ptr, size, NULL/*caller*/); # else INTERNAL_REALLOC_HOOK(result, LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_DEFAULT, ptr, size, NULL/*caller*/); # endif return result; } #endif LIBXSMM_API_INTERN void internal_free_hook(void* /*ptr*/, const void* /*caller*/); LIBXSMM_API_INTERN void internal_free_hook(void* ptr, const void* caller) { INTERNAL_FREE_HOOK(ptr, caller); } LIBXSMM_API void __wrap_free(void* /*ptr*/); LIBXSMM_API void __wrap_free(void* ptr) { INTERNAL_FREE_HOOK(ptr, NULL/*caller*/); } #endif /*(defined(LIBXSMM_MALLOC_HOOK_STATIC) || defined(LIBXSMM_MALLOC_HOOK_DYNAMIC))*/ #if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE_MALLOC void* memalign(size_t /*alignment*/, size_t /*size*/) LIBXSMM_THROW; LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE_MALLOC void* memalign(size_t alignment, size_t size) LIBXSMM_THROW { void* result; # if defined(LIBXSMM_MALLOC_MMAP_HOOK) INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, alignment, size, NULL/*caller*/); # else INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, alignment, size, NULL/*caller*/); # endif return result; } LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE_MALLOC void* malloc(size_t /*size*/) LIBXSMM_THROW; LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE_MALLOC void* malloc(size_t size) LIBXSMM_THROW { void* result; # if defined(LIBXSMM_MALLOC_MMAP_HOOK) INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, 0/*auto-alignment*/, size, NULL/*caller*/); # else INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, 0/*auto-alignment*/, size, NULL/*caller*/); # endif return result; } #if defined(LIBXSMM_MALLOC_HOOK_CALLOC) LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK 
LIBXSMM_ATTRIBUTE_MALLOC void* calloc(size_t /*num*/, size_t /*size*/) LIBXSMM_THROW; LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE_MALLOC void* calloc(size_t num, size_t size) LIBXSMM_THROW { void* result; const size_t nbytes = num * size; # if defined(LIBXSMM_MALLOC_MMAP_HOOK) INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, 0/*auto-alignment*/, nbytes, NULL/*caller*/); # else INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, 0/*auto-alignment*/, nbytes, NULL/*caller*/); # endif /* TODO: signal anonymous/zeroed pages */ if (NULL != result) memset(result, 0, nbytes); return result; } #endif #if defined(LIBXSMM_MALLOC_HOOK_REALLOC) LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void* realloc(void* /*ptr*/, size_t /*size*/) LIBXSMM_THROW; LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void* realloc(void* ptr, size_t size) LIBXSMM_THROW { void* result; # if defined(LIBXSMM_MALLOC_MMAP_HOOK) INTERNAL_REALLOC_HOOK(result, LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_MMAP, ptr, size, NULL/*caller*/); # else INTERNAL_REALLOC_HOOK(result, LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_DEFAULT, ptr, size, NULL/*caller*/); # endif return result; } #endif LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void free(void* /*ptr*/) LIBXSMM_THROW; LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void free(void* ptr) LIBXSMM_THROW { INTERNAL_FREE_HOOK(ptr, NULL/*caller*/); } #endif /*defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)*/ LIBXSMM_API_INTERN void libxsmm_malloc_init(void) { #if (0 != LIBXSMM_SYNC) && defined(LIBXSMM_MALLOC_SCRATCH_JOIN) const char *const env = getenv("LIBXSMM_MALLOC_JOIN"); if (NULL != env && 0 != *env) internal_malloc_join = atoi(env); #endif #if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) # if defined(LIBXSMM_MALLOC_HOOK_QKMALLOC) void* handle_qkmalloc = NULL; dlerror(); /* clear an eventual error status */ handle_qkmalloc = dlopen("libqkmalloc.so", RTLD_LAZY); if (NULL != handle_qkmalloc) { libxsmm_malloc_fn.memalign.ptr = internal_memalign_malloc; libxsmm_malloc_fn.malloc.dlsym = 
dlsym(handle_qkmalloc, "malloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.malloc.dlsym) { # if defined(LIBXSMM_MALLOC_HOOK_CALLOC) libxsmm_malloc_fn.calloc.dlsym = dlsym(handle_qkmalloc, "calloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.calloc.dlsym) # endif { # if defined(LIBXSMM_MALLOC_HOOK_REALLOC) libxsmm_malloc_fn.realloc.dlsym = dlsym(handle_qkmalloc, "realloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.realloc.dlsym) # endif { libxsmm_malloc_fn.free.dlsym = dlsym(handle_qkmalloc, "free"); } } } dlclose(handle_qkmalloc); } if (NULL == libxsmm_malloc_fn.free.ptr) # elif defined(LIBXSMM_MALLOC_HOOK_KMP) dlerror(); /* clear an eventual error status */ libxsmm_malloc_fn.alignmem.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "kmp_aligned_malloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.alignmem.dlsym) { libxsmm_malloc_fn.memalign.ptr = internal_memalign_twiddle; libxsmm_malloc_fn.malloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "kmp_malloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.malloc.dlsym) { # if defined(LIBXSMM_MALLOC_HOOK_CALLOC) libxsmm_malloc_fn.calloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "kmp_calloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.calloc.dlsym) # endif { # if defined(LIBXSMM_MALLOC_HOOK_REALLOC) libxsmm_malloc_fn.realloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "kmp_realloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.realloc.dlsym) # endif { libxsmm_malloc_fn.free.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "kmp_free"); } } } } if (NULL == libxsmm_malloc_fn.free.ptr) # endif /*defined(LIBXSMM_MALLOC_HOOK_QKMALLOC)*/ { dlerror(); /* clear an eventual error status */ # if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) libxsmm_malloc_fn.memalign.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__libc_memalign"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.memalign.dlsym) { libxsmm_malloc_fn.malloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__libc_malloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.malloc.dlsym) { # if 
defined(LIBXSMM_MALLOC_HOOK_CALLOC) libxsmm_malloc_fn.calloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__libc_calloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.calloc.dlsym) # endif { # if defined(LIBXSMM_MALLOC_HOOK_REALLOC) libxsmm_malloc_fn.realloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__libc_realloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.realloc.dlsym) # endif { libxsmm_malloc_fn.free.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__libc_free"); } } } } if (NULL == libxsmm_malloc_fn.free.ptr) { void* handle_libc = NULL; dlerror(); /* clear an eventual error status */ handle_libc = dlopen("libc.so." LIBXSMM_STRINGIFY(LIBXSMM_MALLOC_GLIBC), RTLD_LAZY); if (NULL != handle_libc) { libxsmm_malloc_fn.memalign.dlsym = dlsym(handle_libc, "__libc_memalign"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.memalign.dlsym) { libxsmm_malloc_fn.malloc.dlsym = dlsym(handle_libc, "__libc_malloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.malloc.dlsym) { # if defined(LIBXSMM_MALLOC_HOOK_CALLOC) libxsmm_malloc_fn.calloc.dlsym = dlsym(handle_libc, "__libc_calloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.calloc.dlsym) # endif { # if defined(LIBXSMM_MALLOC_HOOK_REALLOC) libxsmm_malloc_fn.realloc.dlsym = dlsym(handle_libc, "__libc_realloc"); if (NULL == dlerror() && NULL != libxsmm_malloc_fn.realloc.dlsym) # endif { libxsmm_malloc_fn.free.dlsym = dlsym(handle_libc, "__libc_free"); } } } } dlclose(handle_libc); } } # if 0 { /* attempt to setup deprecated GLIBC hooks */ union { const void* dlsym; void* (**ptr)(size_t, size_t, const void*); } hook_memalign; dlerror(); /* clear an eventual error status */ hook_memalign.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__memalign_hook"); if (NULL == dlerror() && NULL != hook_memalign.dlsym) { union { const void* dlsym; void* (**ptr)(size_t, const void*); } hook_malloc; hook_malloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__malloc_hook"); if (NULL == dlerror() && NULL != hook_malloc.dlsym) { # if defined(LIBXSMM_MALLOC_HOOK_REALLOC) 
union { const void* dlsym; void* (**ptr)(void*, size_t, const void*); } hook_realloc; hook_realloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__realloc_hook"); if (NULL == dlerror() && NULL != hook_realloc.dlsym) # endif { union { const void* dlsym; void (**ptr)(void*, const void*); } hook_free; hook_free.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__free_hook"); if (NULL == dlerror() && NULL != hook_free.dlsym) { *hook_memalign.ptr = internal_memalign_hook; *hook_malloc.ptr = internal_malloc_hook; # if defined(LIBXSMM_MALLOC_HOOK_REALLOC) *hook_realloc.ptr = internal_realloc_hook; # endif *hook_free.ptr = internal_free_hook; } } } } } # endif # else /* TODO */ # endif /*(defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD)))*/ } if (NULL != libxsmm_malloc_fn.free.ptr) { # if defined(LIBXSMM_MALLOC_HOOK_IMALLOC) union { const void* dlsym; libxsmm_malloc_fun* ptr; } i_malloc; i_malloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "i_malloc"); if (NULL == dlerror() && NULL != i_malloc.dlsym) { # if defined(LIBXSMM_MALLOC_HOOK_CALLOC) union { const void* dlsym; void* (**ptr)(size_t, size_t); } i_calloc; i_calloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "i_calloc"); if (NULL == dlerror() && NULL != i_calloc.dlsym) # endif { # if defined(LIBXSMM_MALLOC_HOOK_REALLOC) union { const void* dlsym; libxsmm_realloc_fun* ptr; } i_realloc; i_realloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "i_realloc"); if (NULL == dlerror() && NULL != i_realloc.dlsym) # endif { union { const void* dlsym; libxsmm_free_fun* ptr; } i_free; i_free.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "i_free"); if (NULL == dlerror() && NULL != i_free.dlsym) { *i_malloc.ptr = libxsmm_malloc_fn.malloc.ptr; # if defined(LIBXSMM_MALLOC_HOOK_CALLOC) *i_calloc.ptr = libxsmm_malloc_fn.calloc.ptr; # endif # if defined(LIBXSMM_MALLOC_HOOK_REALLOC) *i_realloc.ptr = libxsmm_malloc_fn.realloc.ptr; # endif *i_free.ptr = libxsmm_malloc_fn.free.ptr; } } } } # endif /*defined(LIBXSMM_MALLOC_HOOK_IMALLOC)*/ } else { /* fall-back: potentially recursive */ # if (defined(LIBXSMM_BUILD) && (1 
< (LIBXSMM_BUILD))) libxsmm_malloc_fn.memalign.ptr = __libc_memalign; libxsmm_malloc_fn.malloc.ptr = __libc_malloc; # if defined(LIBXSMM_MALLOC_HOOK_CALLOC) libxsmm_malloc_fn.calloc.ptr = __libc_calloc; # endif # if defined(LIBXSMM_MALLOC_HOOK_REALLOC) libxsmm_malloc_fn.realloc.ptr = __libc_realloc; # endif libxsmm_malloc_fn.free.ptr = __libc_free; # else libxsmm_malloc_fn.memalign.ptr = libxsmm_memalign_internal; libxsmm_malloc_fn.malloc.ptr = malloc; # if defined(LIBXSMM_MALLOC_HOOK_CALLOC) libxsmm_malloc_fn.calloc.ptr = calloc; # endif # if defined(LIBXSMM_MALLOC_HOOK_REALLOC) libxsmm_malloc_fn.realloc.ptr = realloc; # endif libxsmm_malloc_fn.free.ptr = free; # endif } #endif } LIBXSMM_API_INTERN void libxsmm_malloc_finalize(void) { } LIBXSMM_API_INTERN int libxsmm_xset_default_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn) { int result = EXIT_SUCCESS; if (NULL != lock) { LIBXSMM_INIT LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, lock); } if (NULL != malloc_fn.function && NULL != free_fn.function) { libxsmm_default_allocator_context = context; libxsmm_default_malloc_fn = malloc_fn; libxsmm_default_free_fn = free_fn; } else { libxsmm_malloc_function internal_malloc_fn; libxsmm_free_function internal_free_fn; const void* internal_allocator = NULL; internal_malloc_fn.function = __real_malloc; internal_free_fn.function = __real_free; /*internal_allocator = NULL;*/ if (NULL == malloc_fn.function && NULL == free_fn.function) { libxsmm_default_allocator_context = internal_allocator; libxsmm_default_malloc_fn = internal_malloc_fn; libxsmm_default_free_fn = internal_free_fn; } else { /* invalid allocator */ static int error_once = 0; if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: allocator setup without malloc or free function!\n"); } /* keep any valid (previously 
instantiated) default allocator */ if (NULL == libxsmm_default_malloc_fn.function || NULL == libxsmm_default_free_fn.function) { libxsmm_default_allocator_context = internal_allocator; libxsmm_default_malloc_fn = internal_malloc_fn; libxsmm_default_free_fn = internal_free_fn; } result = EXIT_FAILURE; } } if (NULL != lock) { LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, lock); } LIBXSMM_ASSERT(EXIT_SUCCESS == result); return result; } LIBXSMM_API_INTERN int libxsmm_xget_default_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, const void** context, libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn) { int result = EXIT_SUCCESS; if (NULL != context || NULL != malloc_fn || NULL != free_fn) { if (NULL != lock) { LIBXSMM_INIT LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, lock); } if (context) *context = libxsmm_default_allocator_context; if (NULL != malloc_fn) *malloc_fn = libxsmm_default_malloc_fn; if (NULL != free_fn) *free_fn = libxsmm_default_free_fn; if (NULL != lock) { LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, lock); } } else if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ static int error_once = 0; if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid signature used to get the default memory allocator!\n"); } result = EXIT_FAILURE; } LIBXSMM_ASSERT(EXIT_SUCCESS == result); return result; } LIBXSMM_API_INTERN int libxsmm_xset_scratch_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn) { int result = EXIT_SUCCESS; static int error_once = 0; if (NULL != lock) { LIBXSMM_INIT LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, lock); } /* make sure the default allocator is setup before adopting it eventually */ if (NULL == libxsmm_default_malloc_fn.function || NULL == libxsmm_default_free_fn.function) { const libxsmm_malloc_function null_malloc_fn = { NULL }; const libxsmm_free_function null_free_fn = { NULL }; 
libxsmm_xset_default_allocator(NULL/*already locked*/, NULL/*context*/, null_malloc_fn, null_free_fn); } if (NULL == malloc_fn.function && NULL == free_fn.function) { /* adopt default allocator */ libxsmm_scratch_allocator_context = libxsmm_default_allocator_context; libxsmm_scratch_malloc_fn = libxsmm_default_malloc_fn; libxsmm_scratch_free_fn = libxsmm_default_free_fn; } else if (NULL != malloc_fn.function) { if (NULL == free_fn.function && /*warning*/(LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM WARNING: scratch allocator setup without free function!\n"); } libxsmm_scratch_allocator_context = context; libxsmm_scratch_malloc_fn = malloc_fn; libxsmm_scratch_free_fn = free_fn; /* NULL allowed */ } else { /* invalid scratch allocator */ if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid scratch allocator (default used)!\n"); } /* keep any valid (previously instantiated) scratch allocator */ if (NULL == libxsmm_scratch_malloc_fn.function) { libxsmm_scratch_allocator_context = libxsmm_default_allocator_context; libxsmm_scratch_malloc_fn = libxsmm_default_malloc_fn; libxsmm_scratch_free_fn = libxsmm_default_free_fn; } result = EXIT_FAILURE; } if (NULL != lock) { LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, lock); } LIBXSMM_ASSERT(EXIT_SUCCESS == result); return result; } LIBXSMM_API_INTERN int libxsmm_xget_scratch_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, const void** context, libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn) { int result = EXIT_SUCCESS; if (NULL != context || NULL != malloc_fn || NULL != free_fn) { if (NULL != lock) { LIBXSMM_INIT LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, lock); } if (context) *context = libxsmm_scratch_allocator_context; if (NULL != malloc_fn) *malloc_fn = 
libxsmm_scratch_malloc_fn;
    if (NULL != free_fn) *free_fn = libxsmm_scratch_free_fn;
    if (NULL != lock) {
      LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, lock);
    }
  }
  else if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
    static int error_once = 0;
    if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) {
      fprintf(stderr, "LIBXSMM ERROR: invalid signature used to get the scratch memory allocator!\n");
    }
    result = EXIT_FAILURE;
  }
  LIBXSMM_ASSERT(EXIT_SUCCESS == result);
  return result;
}


/* Public entry points: forward to the internal x-variants using the global lock. */
LIBXSMM_API int libxsmm_set_default_allocator(const void* context,
  libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn)
{
  return libxsmm_xset_default_allocator(&libxsmm_lock_global, context, malloc_fn, free_fn);
}

LIBXSMM_API int libxsmm_get_default_allocator(const void** context,
  libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn)
{
  return libxsmm_xget_default_allocator(&libxsmm_lock_global, context, malloc_fn, free_fn);
}

LIBXSMM_API int libxsmm_set_scratch_allocator(const void* context,
  libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn)
{
  return libxsmm_xset_scratch_allocator(&libxsmm_lock_global, context, malloc_fn, free_fn);
}

LIBXSMM_API int libxsmm_get_scratch_allocator(const void** context,
  libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn)
{
  return libxsmm_xget_scratch_allocator(&libxsmm_lock_global, context, malloc_fn, free_fn);
}


/** Retrieves size/flags/extra recorded for a buffer allocated by libxsmm.
 *  Returns EXIT_FAILURE for a non-NULL foreign (untracked) buffer. */
LIBXSMM_API int libxsmm_get_malloc_xinfo(const void* memory, size_t* size, int* flags, void** extra)
{
  int result;
#if !defined(NDEBUG)
  if (NULL != size || NULL != extra)
#endif
  {
    /* executable buffers get a lighter integrity check (1) than data buffers (2) */
    const int check = ((NULL == flags || 0 == (LIBXSMM_MALLOC_FLAG_X & *flags)) ?
2 : 1);
    const internal_malloc_info_type *const info = internal_malloc_info(memory, check);
    if (NULL != info) {
      if (NULL != size) *size = info->size;
      if (NULL != flags) *flags = info->flags;
      if (NULL != extra) *extra = info->pointer;
      result = EXIT_SUCCESS;
    }
    else { /* potentially foreign buffer */
      result = (NULL != memory ? EXIT_FAILURE : EXIT_SUCCESS);
      if (NULL != size) *size = 0;
      if (NULL != flags) *flags = 0;
      if (NULL != extra) *extra = 0;
    }
  }
#if !defined(NDEBUG)
  else {
    static int error_once = 0;
    if (0 != libxsmm_verbosity /* library code is expected to be mute */
      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
    {
      fprintf(stderr, "LIBXSMM ERROR: attachment error for memory buffer %p!\n", memory);
    }
    LIBXSMM_ASSERT_MSG(0/*false*/, "LIBXSMM ERROR: attachment error");
    result = EXIT_FAILURE;
  }
#endif
  return result;
}


#if !defined(_WIN32)

/** Applies madvise hints to a freshly mapped buffer: small buffers avoid
 *  transparent huge pages (MADV_NOHUGEPAGE), large buffers are excluded from
 *  core dumps (MADV_DONTDUMP). Failures are deliberately ignored. */
LIBXSMM_API_INLINE void internal_xmalloc_mhint(void* buffer, size_t size)
{
  LIBXSMM_ASSERT((MAP_FAILED != buffer && NULL != buffer) || 0 == size);
#if defined(_DEFAULT_SOURCE) || defined(_BSD_SOURCE)
  /* proceed after failed madvise (even in case of an error; take what we got) */
  /* issue no warning as a failure seems to be related to the kernel version */
  madvise(buffer, size, MADV_NORMAL/*MADV_RANDOM*/
# if defined(MADV_NOHUGEPAGE) /* if not available, we then take what we got (THP) */
    | ((LIBXSMM_MALLOC_ALIGNMAX * LIBXSMM_MALLOC_ALIGNFCT) > size ? MADV_NOHUGEPAGE : 0)
# endif
# if defined(MADV_DONTDUMP)
    | ((LIBXSMM_MALLOC_ALIGNMAX * LIBXSMM_MALLOC_ALIGNFCT) > size ? 0 : MADV_DONTDUMP)
# endif
  );
#else
  LIBXSMM_UNUSED(buffer); LIBXSMM_UNUSED(size);
#endif
}


/** Creates a double-mapped temporary file: one writable mapping (returned) and
 *  one read+exec mapping (*rx) backed by the same unlinked file, used for JIT
 *  code on systems that forbid writable+executable pages. Returns MAP_FAILED
 *  on failure. */
LIBXSMM_API_INLINE void* internal_xmalloc_xmap(const char* dir, size_t size, int flags, void** rx)
{
  void* result = MAP_FAILED;
  char filename[4096] = LIBXSMM_MALLOC_XMAP_TEMPLATE;
  int i = 0;
  LIBXSMM_ASSERT(NULL != rx && MAP_FAILED != *rx);
  if (NULL != dir && 0 != *dir) {
    i = LIBXSMM_SNPRINTF(filename, sizeof(filename), "%s/" LIBXSMM_MALLOC_XMAP_TEMPLATE, dir);
  }
  if (0 <= i && i < (int)sizeof(filename)) {
    /* coverity[secure_temp] */
    i = mkstemp(filename);
    if (0 <= i) {
      /* unlink immediately so the backing file vanishes when both mappings are gone */
      if (0 == unlink(filename) && 0 == ftruncate(i, size)) {
        const int mflags = (flags | LIBXSMM_MAP_SHARED);
        void *const xmap = mmap(*rx, size, PROT_READ | PROT_EXEC, mflags, i, 0/*offset*/);
        if (MAP_FAILED != xmap) {
          LIBXSMM_ASSERT(NULL != xmap);
          result = mmap(NULL, size, PROT_READ | PROT_WRITE, mflags, i, 0/*offset*/);
          if (MAP_FAILED != result) {
            LIBXSMM_ASSERT(NULL != result);
            internal_xmalloc_mhint(xmap, size);
            *rx = xmap;
          }
          else {
            munmap(xmap, size);
            *rx = NULL;
          }
        }
      }
      close(i);
    }
  }
  return result;
}

#endif /*!defined(_WIN32)*/


/** realloc-based reallocation helper; on in-place growth or on copy it clears
 *  *info/*ptr so the caller neither frees nor copies again. */
LIBXSMM_API_INLINE void* internal_xrealloc(void** ptr, internal_malloc_info_type** info, size_t size,
  libxsmm_realloc_fun realloc_fn, libxsmm_free_fun free_fn)
{
  char *const base = (char*)(NULL != *info ?
(*info)->pointer : *ptr), *result;
  LIBXSMM_ASSERT(NULL != *ptr);
  /* may implicitly invalidate info */
  result = (char*)realloc_fn(base, size);
  if (result == base) { /* signal no-copy */
    LIBXSMM_ASSERT(NULL != result);
    *info = NULL; /* no delete */
    *ptr = NULL; /* no copy */
  }
  else if (NULL != result) { /* copy */
    /* preserve the offset of the user pointer inside the original allocation */
    const size_t offset_src = (const char*)*ptr - base;
    *ptr = result + offset_src; /* copy */
    *info = NULL; /* no delete */
  }
#if !defined(NDEBUG) && 0 /* NOTE(review): failure-cleanup branch is compiled out (&& 0) */
  else { /* failed */
    if (NULL != *info) {
      /* implicitly invalidates info */
      internal_xfree(*ptr, *info);
    }
    else { /* foreign pointer */
      free_fn(*ptr);
    }
    *info = NULL; /* no delete */
    *ptr = NULL; /* no copy */
  }
#else
  LIBXSMM_UNUSED(free_fn);
#endif
  return result;
}


LIBXSMM_API_INTERN void* internal_xmalloc(void** /*ptr*/, internal_malloc_info_type** /*info*/, size_t /*size*/,
  const void* /*context*/, libxsmm_malloc_function /*malloc_fn*/, libxsmm_free_function /*free_fn*/);

/** Allocates (NULL == *ptr) or reallocates (NULL != *ptr) using the given
 *  allocator; only standard malloc/free-compatible allocators take the
 *  realloc fast-path, otherwise allocate-copy-free is performed by the caller. */
LIBXSMM_API_INTERN void* internal_xmalloc(void** ptr, internal_malloc_info_type** info, size_t size,
  const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn)
{
  void* result;
  LIBXSMM_ASSERT(NULL != ptr && NULL != info && NULL != malloc_fn.function);
  if (NULL == *ptr) {
    result = (NULL == context ? malloc_fn.function(size) : malloc_fn.ctx_form(size, context));
  }
  else { /* reallocate */
    /* prefer free_fn since it is part of pointer-info */
    if (NULL != free_fn.function
      ? (__real_free == free_fn.function || free == free_fn.function)
      : (__real_malloc == malloc_fn.function || malloc == malloc_fn.function))
    {
#if defined(LIBXSMM_MALLOC_HOOK_REALLOC)
      result = internal_xrealloc(ptr, info, size, __real_realloc, __real_free);
#else
      result = internal_xrealloc(ptr, info, size, realloc, __real_free);
#endif
    }
    else { /* fall-back with regular allocation */
      result = (NULL == context ?
malloc_fn.function(size) : malloc_fn.ctx_form(size, context));
      if (NULL == result) { /* failed */
        if (NULL != *info) {
          internal_xfree(*ptr, *info);
        }
        else { /* foreign pointer */
          (NULL != free_fn.function ? free_fn.function : __real_free)(*ptr);
        }
        *ptr = NULL; /* safe delete */
      }
    }
  }
  return result;
}


/** Core allocation routine: selects default vs. scratch allocator, supports
 *  realloc (LIBXSMM_MALLOC_FLAG_REALLOC), executable buffers
 *  (LIBXSMM_MALLOC_FLAG_X), and records bookkeeping info in front of the
 *  returned aligned pointer. extra/extra_size allow caller-data to be copied
 *  to the start of the underlying buffer. */
LIBXSMM_API_INTERN int libxsmm_xmalloc(void** memory, size_t size, size_t alignment,
  int flags, const void* extra, size_t extra_size)
{
  int result = EXIT_SUCCESS;
#if !defined(NDEBUG)
  if (NULL != memory)
#endif
  {
    static int error_once = 0;
    if (0 != size) {
      size_t alloc_alignment = 0, alloc_size = 0, max_preserve = 0;
      internal_malloc_info_type* info = NULL;
      void* buffer = NULL, * reloc = NULL;
      /* ATOMIC BEGIN: this region should be atomic/locked */
      const void* context = libxsmm_default_allocator_context;
      libxsmm_malloc_function malloc_fn = libxsmm_default_malloc_fn;
      libxsmm_free_function free_fn = libxsmm_default_free_fn;
      if (0 != (LIBXSMM_MALLOC_FLAG_SCRATCH & flags)) {
        context = libxsmm_scratch_allocator_context;
        malloc_fn = libxsmm_scratch_malloc_fn;
        free_fn = libxsmm_scratch_free_fn;
#if defined(LIBXSMM_MALLOC_MMAP_SCRATCH)
        flags |= LIBXSMM_MALLOC_FLAG_MMAP;
#endif
      }
      if ((0 != (internal_malloc_kind & 1) && 0 < internal_malloc_kind)
        || NULL == malloc_fn.function || NULL == free_fn.function)
      {
        malloc_fn.function = __real_malloc;
        free_fn.function = __real_free;
        context = NULL;
      }
      /* ATOMIC END: this region should be atomic */
      flags |= LIBXSMM_MALLOC_FLAG_RW; /* normalize given flags since flags=0 is accepted as well */
      if (0 != (LIBXSMM_MALLOC_FLAG_REALLOC & flags) && NULL != *memory) {
        info = internal_malloc_info(*memory, 2/*check*/);
        if (NULL != info) {
          max_preserve = info->size;
        }
        else { /* reallocation of unknown allocation */
          flags &= ~LIBXSMM_MALLOC_FLAG_MMAP;
        }
      }
      else *memory = NULL;
#if !defined(LIBXSMM_MALLOC_MMAP)
      if (0 == (LIBXSMM_MALLOC_FLAG_X & flags) && 0 == (LIBXSMM_MALLOC_FLAG_MMAP & flags)) {
        alloc_alignment = (0 == (LIBXSMM_MALLOC_FLAG_REALLOC & flags)
          ? libxsmm_alignment(size, alignment) : alignment);
        alloc_size = size + extra_size + sizeof(internal_malloc_info_type) + alloc_alignment - 1;
        buffer = internal_xmalloc(memory, &info, alloc_size, context, malloc_fn, free_fn);
      }
      else
#endif
      if (NULL == info || size != info->size) {
#if defined(_WIN32) ||defined(__CYGWIN__)
        const int mflags = (0 != (LIBXSMM_MALLOC_FLAG_X & flags) ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE);
        static SIZE_T alloc_alignmax = 0, alloc_pagesize = 0;
        if (0 == alloc_alignmax) { /* first/one time */
          SYSTEM_INFO system_info;
          GetSystemInfo(&system_info);
          alloc_pagesize = system_info.dwPageSize;
          alloc_alignmax = GetLargePageMinimum();
        }
        if ((LIBXSMM_MALLOC_ALIGNMAX * LIBXSMM_MALLOC_ALIGNFCT) <= size) { /* attempt to use large pages */
          HANDLE process_token;
          alloc_alignment = (NULL == info
            ? (0 == alignment ? alloc_alignmax : libxsmm_lcm(alignment, alloc_alignmax))
            : libxsmm_lcm(alignment, alloc_alignmax));
          alloc_size = LIBXSMM_UP2(size + extra_size + sizeof(internal_malloc_info_type) + alloc_alignment - 1, alloc_alignmax);
          /* large pages require the SeLockMemoryPrivilege to be enabled for the process token */
          if (TRUE == OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &process_token)) {
            TOKEN_PRIVILEGES tp;
            if (TRUE == LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid)) {
              tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
              tp.PrivilegeCount = 1; /* enable privilege */
              if (TRUE == AdjustTokenPrivileges(process_token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0)
                && ERROR_SUCCESS == GetLastError()/*may has failed (regardless of TRUE)*/)
              {
                /* VirtualAlloc cannot be used to reallocate memory */
                buffer = VirtualAlloc(NULL, alloc_size, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, mflags);
              }
              tp.Privileges[0].Attributes = 0; /* disable privilege */
              AdjustTokenPrivileges(process_token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);
            }
            CloseHandle(process_token);
          }
        }
        else { /* small allocation using regular page-size */
          alloc_alignment = (NULL == info ?
libxsmm_alignment(size, alignment) : alignment);
          alloc_size = LIBXSMM_UP2(size + extra_size + sizeof(internal_malloc_info_type) + alloc_alignment - 1, alloc_pagesize);
        }
        if (NULL == buffer) { /* small allocation or retry with regular page size */
          /* VirtualAlloc cannot be used to reallocate memory */
          buffer = VirtualAlloc(NULL, alloc_size, MEM_RESERVE | MEM_COMMIT, mflags);
        }
        if (NULL != buffer) {
          flags |= LIBXSMM_MALLOC_FLAG_MMAP; /* select the corresponding deallocation */
        }
        else if (0 == (LIBXSMM_MALLOC_FLAG_MMAP & flags)) { /* fall-back allocation */
          buffer = internal_xmalloc(memory, &info, alloc_size, context, malloc_fn, free_fn);
        }
#else /* !defined(_WIN32) */
# if defined(MAP_HUGETLB)
        static size_t limit_hugetlb = LIBXSMM_SCRATCH_UNLIMITED;
# endif
# if defined(MAP_LOCKED)
        static size_t limit_plocked = LIBXSMM_SCRATCH_UNLIMITED;
# endif
# if defined(MAP_32BIT)
        static int map32 = 1;
# endif
        /* assemble mmap flags; each feature is best-effort and may be stripped again on failure */
        int mflags = 0
# if defined(MAP_UNINITIALIZED) && 0/*fails with WSL*/
          | MAP_UNINITIALIZED /* unlikely available */
# endif
# if defined(MAP_NORESERVE)
          | (LIBXSMM_MALLOC_ALIGNMAX < size ? 0 : MAP_NORESERVE)
# endif
# if defined(MAP_32BIT)
          | ((0 != (LIBXSMM_MALLOC_FLAG_X & flags) && 0 != map32
            && LIBXSMM_X86_AVX512_CORE > libxsmm_target_archid
            && LIBXSMM_X86_AVX512 < libxsmm_target_archid) ? MAP_32BIT : 0)
# endif
# if defined(MAP_HUGETLB) /* may fail depending on system settings */
          | ((0 == (LIBXSMM_MALLOC_FLAG_X & flags)
            && ((LIBXSMM_MALLOC_ALIGNMAX * LIBXSMM_MALLOC_ALIGNFCT) <= size
              || 0 != (LIBXSMM_MALLOC_FLAG_PHUGE & flags))
            && (internal_malloc_hugetlb + size) < limit_hugetlb) ? MAP_HUGETLB : 0)
# endif
# if defined(MAP_LOCKED) && !defined(LIBXSMM_MALLOC_LOCK_ONFAULT)
          | ((0 == (LIBXSMM_MALLOC_FLAG_X & flags)
            && (internal_malloc_plocked + size) < limit_plocked) ? MAP_LOCKED : 0)
# endif
        ; /* mflags */
# if defined(MAP_POPULATE)
        {
          static int prefault = 0;
          if (0 == prefault) { /* prefault only on Linux 3.10.0-327 (and later) to avoid data race in page-fault handler */
            struct utsname osinfo;
            unsigned int version_major = 3, version_minor = 10, version_update = 0, version_patch = 327;
            if (0 <= uname(&osinfo) && 0 == strcmp("Linux", osinfo.sysname)
              && 4 == sscanf(osinfo.release, "%u.%u.%u-%u",
                &version_major, &version_minor, &version_update, &version_patch)
              && LIBXSMM_VERSION4(3, 10, 0, 327) > LIBXSMM_VERSION4(
                version_major, version_minor, version_update, version_patch))
            {
              mflags |= MAP_POPULATE;
              prefault = 1;
            }
            else prefault = -1;
          }
          else if (1 == prefault) mflags |= MAP_POPULATE;
        }
# endif
        /* make allocated size at least a multiple of the smallest page-size to avoid split-pages (unmap!) */
        alloc_alignment = libxsmm_lcm(0 == alignment ? libxsmm_alignment(size, alignment) : alignment, LIBXSMM_PAGE_MINSIZE);
        alloc_size = LIBXSMM_UP2(size + extra_size + sizeof(internal_malloc_info_type) + alloc_alignment - 1, alloc_alignment);
        if (0 == (LIBXSMM_MALLOC_FLAG_X & flags)) { /* anonymous and non-executable */
# if defined(MAP_32BIT)
          LIBXSMM_ASSERT(0 == (MAP_32BIT & mflags));
# endif
# if 0
          LIBXSMM_ASSERT(NULL != info || NULL == *memory); /* no memory mapping of foreign pointer */
# endif
          buffer = mmap(NULL == info ?
NULL : info->pointer, alloc_size,
            PROT_READ | PROT_WRITE, MAP_PRIVATE | LIBXSMM_MAP_ANONYMOUS | mflags, -1, 0/*offset*/);
# if defined(MAP_HUGETLB)
          /* retry without MAP_HUGETLB (and track the watermark) if the mapping failed */
          INTERNAL_XMALLOC_KIND(MAP_HUGETLB, "huge-page", LIBXSMM_MALLOC_FLAG_PHUGE, flags, mflags,
            internal_malloc_hugetlb, limit_hugetlb, info, alloc_size, buffer);
# endif
# if defined(MAP_LOCKED)
# if !defined(LIBXSMM_MALLOC_LOCK_ONFAULT)
          INTERNAL_XMALLOC_KIND(MAP_LOCKED, "locked-page", LIBXSMM_MALLOC_FLAG_PLOCK, flags, mflags,
            internal_malloc_plocked, limit_plocked, info, alloc_size, buffer);
# else
          if (0 != (MAP_LOCKED & mflags) && MAP_FAILED != buffer) {
            LIBXSMM_ASSERT(NULL != buffer);
# if 0 /* mlock2 is potentially not exposed */
            if (0 == mlock2(buffer, alloc_size, MLOCK_ONFAULT))
# else
            if (0 == syscall(SYS_mlock2, buffer, alloc_size, MLOCK_ONFAULT))
# endif
            {
              LIBXSMM_ATOMIC_ADD_FETCH(&internal_malloc_plocked, alloc_size, LIBXSMM_ATOMIC_RELAXED);
              flags |= LIBXSMM_MALLOC_FLAG_PLOCK;
            }
            else { /* update watermark */
              INTERNAL_XMALLOC_WATERMARK("locked-page", internal_malloc_plocked, limit_plocked, alloc_size);
            }
          }
# endif
# endif
        }
        else { /* executable buffer requested */
          static /*LIBXSMM_TLS*/ int fallback = -1; /* fall-back allocation method */
# if defined(MAP_HUGETLB)
          LIBXSMM_ASSERT(0 == (MAP_HUGETLB & mflags));
# endif
# if defined(MAP_LOCKED)
          LIBXSMM_ASSERT(0 == (MAP_LOCKED & mflags));
# endif
          if (0 > (int)LIBXSMM_ATOMIC_LOAD(&fallback, LIBXSMM_ATOMIC_RELAXED)) {
            /* LIBXSMM_SE environment variable overrides the auto-detected SELinux mode */
            const char *const env = getenv("LIBXSMM_SE");
            LIBXSMM_ATOMIC_STORE(&fallback, NULL == env
              /* libxsmm_se decides */
              ? (0 == libxsmm_se ? LIBXSMM_MALLOC_FINAL : LIBXSMM_MALLOC_FALLBACK)
              /* user's choice takes precedence */
              : ('0' != *env ? LIBXSMM_MALLOC_FALLBACK : LIBXSMM_MALLOC_FINAL),
              LIBXSMM_ATOMIC_SEQ_CST);
            LIBXSMM_ASSERT(0 <= fallback);
          }
          /* chain of fall-back locations for file-backed executable mappings */
          INTERNAL_XMALLOC(0, fallback, "TMPDIR", "/tmp", map32, mflags, alloc_size, buffer, &reloc); /* 1st try */
          if (1 <= fallback) { /* continue with fall-back */
            INTERNAL_XMALLOC(1, fallback, "JITDUMPDIR", "", map32, mflags, alloc_size, buffer, &reloc); /* 2nd try */
            if (2 <= fallback) { /* continue with fall-back */
              INTERNAL_XMALLOC(2, fallback, "HOME", "", map32, mflags, alloc_size, buffer, &reloc); /* 3rd try */
              if (3 <= fallback) { /* continue with fall-back */
                if (3 == fallback) { /* 4th try */
                  buffer = mmap(reloc, alloc_size, PROT_READ | PROT_WRITE | PROT_EXEC,
# if defined(MAP_32BIT)
                    MAP_PRIVATE | LIBXSMM_MAP_ANONYMOUS | (mflags & ~MAP_32BIT),
# else
                    MAP_PRIVATE | LIBXSMM_MAP_ANONYMOUS | mflags,
# endif
                    -1, 0/*offset*/);
                  if (MAP_FAILED == buffer) fallback = 4;
                }
                if (4 == fallback && MAP_FAILED != buffer) { /* final */
                  LIBXSMM_ASSERT(fallback == LIBXSMM_MALLOC_FINAL + 1);
                  buffer = MAP_FAILED; /* trigger final fall-back */
                }
              }
            }
          }
        }
        if (MAP_FAILED != buffer && NULL != buffer) {
          flags |= LIBXSMM_MALLOC_FLAG_MMAP; /* select deallocation */
        }
        else { /* allocation failed */
          if (0 == (LIBXSMM_MALLOC_FLAG_MMAP & flags)) { /* ultimate fall-back */
            buffer = (NULL != malloc_fn.function
              ? (NULL == context ?
malloc_fn.function(alloc_size) : malloc_fn.ctx_form(alloc_size, context))
              : (NULL));
          }
          reloc = NULL;
        }
        if (MAP_FAILED != buffer && NULL != buffer) {
          internal_xmalloc_mhint(buffer, alloc_size);
        }
#endif /* !defined(_WIN32) */
      }
      else { /* reallocation of the same pointer and size */
        alloc_size = size + extra_size + sizeof(internal_malloc_info_type) + alignment - 1;
        if (NULL != info) {
          buffer = info->pointer;
          flags |= info->flags;
        }
        else {
          flags |= LIBXSMM_MALLOC_FLAG_MMAP;
          buffer = *memory;
        }
        alloc_alignment = alignment;
        *memory = NULL; /* signal no-copy */
      }
      if (
#if !defined(_WIN32) && !defined(__clang_analyzer__)
        MAP_FAILED != buffer &&
#endif
        NULL != buffer)
      {
        /* layout: [extra bytes][internal_malloc_info_type][padding to alignment][user data] */
        char *const cbuffer = (char*)buffer, *const aligned = LIBXSMM_ALIGN(
          cbuffer + extra_size + sizeof(internal_malloc_info_type), alloc_alignment);
        internal_malloc_info_type *const buffer_info = (internal_malloc_info_type*)(
          aligned - sizeof(internal_malloc_info_type));
        LIBXSMM_ASSERT((aligned + size) <= (cbuffer + alloc_size));
        LIBXSMM_ASSERT(0 < alloc_alignment);
        /* former content must be preserved prior to setup of buffer_info */
        if (NULL != *memory) { /* preserve/copy previous content */
#if 0
          LIBXSMM_ASSERT(0 != (LIBXSMM_MALLOC_FLAG_REALLOC & flags));
#endif
          /* content behind foreign pointers is not explicitly preserved; buffers may overlap */
          memmove(aligned, *memory, LIBXSMM_MIN(max_preserve, size));
          if (NULL != info /* known allocation (non-foreign pointer) */
            && EXIT_SUCCESS != internal_xfree(*memory, info) /* !libxsmm_free */
            && 0 != libxsmm_verbosity /* library code is expected to be mute */
            && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
          { /* display some extra context of the failure (reallocation) */
            fprintf(stderr, "LIBXSMM ERROR: memory reallocation failed to release memory!\n");
          }
        }
        if (NULL != extra || 0 == extra_size) {
          const char *const src = (const char*)extra;
          int i;
          for (i = 0; i < (int)extra_size; ++i) cbuffer[i] = src[i];
        }
        else if (0 != libxsmm_verbosity /* library code is expected to be mute */
          && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
        {
          fprintf(stderr, "LIBXSMM ERROR: incorrect extraneous data specification!\n");
          /* no EXIT_FAILURE because valid buffer is returned */
        }
        if (0 == (LIBXSMM_MALLOC_FLAG_X & flags)) { /* update statistics */
          if (0 == (LIBXSMM_MALLOC_FLAG_PRIVATE & flags)) { /* public */
            if (0 != (LIBXSMM_MALLOC_FLAG_SCRATCH & flags)) { /* scratch */
              const size_t watermark = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_ADD_FETCH, LIBXSMM_BITS)(
                &internal_malloc_public_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED);
              if (internal_malloc_public_max < watermark) internal_malloc_public_max = watermark; /* accept data-race */
            }
            else { /* local */
              const size_t watermark = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_ADD_FETCH, LIBXSMM_BITS)(
                &internal_malloc_local_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED);
              if (internal_malloc_local_max < watermark) internal_malloc_local_max = watermark; /* accept data-race */
            }
          }
          else { /* private */
            const size_t watermark = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_ADD_FETCH, LIBXSMM_BITS)(
              &internal_malloc_private_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED);
            if (internal_malloc_private_max < watermark) internal_malloc_private_max = watermark; /* accept data-race */
          }
        }
        /* keep allocation function on record */
        if (0 == (LIBXSMM_MALLOC_FLAG_MMAP & flags)) {
          buffer_info->context = context;
          buffer_info->free = free_fn;
        }
        else {
          buffer_info->free.function = NULL;
          buffer_info->context = NULL;
        }
        buffer_info->size = size; /* record user's size rather than allocated size */
        buffer_info->pointer = buffer;
        buffer_info->reloc = reloc;
        buffer_info->flags = flags;
#if defined(LIBXSMM_VTUNE)
        buffer_info->code_id = 0;
#endif
        /* info must be initialized to calculate correct checksum */
#if !defined(LIBXSMM_MALLOC_CRC_OFF)
# if defined(LIBXSMM_MALLOC_CRC_LIGHT)
        buffer_info->hash = LIBXSMM_CRC32U(LIBXSMM_BITS)(LIBXSMM_MALLOC_SEED, &buffer_info);
# else
        buffer_info->hash = libxsmm_crc32(LIBXSMM_MALLOC_SEED,
buffer_info,
          (unsigned int)(((char*)&buffer_info->hash) - ((char*)buffer_info)));
# endif
#endif
        /* finally commit/return allocated buffer */
        *memory = aligned;
      }
      else {
        if (0 != libxsmm_verbosity /* library code is expected to be mute */
          && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
        {
          char alloc_size_buffer[32];
          libxsmm_format_size(alloc_size_buffer, sizeof(alloc_size_buffer), alloc_size, "KM", "B", 10);
          fprintf(stderr, "LIBXSMM ERROR: failed to allocate %s with flag=%i!\n", alloc_size_buffer, flags);
        }
        result = EXIT_FAILURE;
        *memory = NULL;
      }
    }
    else {
      if ((LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */
        && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
      {
        fprintf(stderr, "LIBXSMM WARNING: zero-sized memory allocation detected!\n");
      }
      *memory = NULL; /* no EXIT_FAILURE */
    }
  }
#if !defined(NDEBUG)
  else if (0 != size) {
    result = EXIT_FAILURE;
  }
#endif
  return result;
}


/** Releases memory previously allocated by libxsmm_xmalloc; a buffer without
 *  tracking info is handed to the real (unhooked) free. The check argument is
 *  forwarded to internal_malloc_info for integrity validation. */
LIBXSMM_API_INTERN void libxsmm_xfree(const void* memory, int check)
{
#if (!defined(LIBXSMM_MALLOC_HOOK_STATIC) && !defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)) || defined(_DEBUG)
  static int error_once = 0;
#endif
  /*const*/ internal_malloc_info_type *const info = internal_malloc_info(memory, check);
  if (NULL != info) { /* !libxsmm_free */
#if (!defined(LIBXSMM_MALLOC_HOOK_STATIC) && !defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)) || defined(_DEBUG)
    if (EXIT_SUCCESS != internal_xfree(memory, info)) {
      if ( 0 != libxsmm_verbosity /* library code is expected to be mute */
        && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
      {
        fprintf(stderr, "LIBXSMM ERROR: memory deallocation failed!\n");
      }
    }
#else
    internal_xfree(memory, info);
#endif
  }
  else if (NULL != memory) {
#if 1
    /* strip constness via union; a plain C-cast still warns */
    union { const void* const_ptr; void* ptr; } cast;
    cast.const_ptr = memory; /* C-cast still warns */
    __real_free(cast.ptr);
#endif
#if (!defined(LIBXSMM_MALLOC_HOOK_STATIC) && !defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)) || defined(_DEBUG)
    if ( 0 != libxsmm_verbosity /* library code is expected to be mute */
      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
    {
      fprintf(stderr, "LIBXSMM ERROR: deallocation does not match allocation!\n");
    }
#endif
  }
}


#if defined(LIBXSMM_VTUNE)
/** Fills a VTune JIT descriptor so the profiler can attribute samples to the
 *  generated code region identified by code/code_id/code_size. */
LIBXSMM_API_INLINE void internal_get_vtune_jitdesc(const void* code,
  unsigned int code_id, size_t code_size, const char* code_name,
  LIBXSMM_VTUNE_JIT_DESC_TYPE* desc)
{
  LIBXSMM_ASSERT(NULL != code && 0 != code_id && 0 != code_size && NULL != desc);
  desc->method_id = code_id;
  /* incorrect constness (method_name) */
  desc->method_name = (char*)code_name;
  /* incorrect constness (method_load_address) */
  desc->method_load_address = (void*)code;
  desc->method_size = code_size;
  desc->line_number_size = 0;
  desc->line_number_table = NULL;
  desc->class_file_name = NULL;
  desc->source_file_name = NULL;
# if (2 <= LIBXSMM_VTUNE_JITVERSION)
  desc->module_name = "libxsmm.jit";
# endif
}
#endif


/** Finalizes permissions of a buffer from libxsmm_xmalloc: revokes write
 *  access and/or marks the buffer executable (JIT code), optionally dumping
 *  the code and registering it with profilers when a name is given. */
LIBXSMM_API_INTERN int libxsmm_malloc_attrib(void** memory, int flags, const char* name)
{
  internal_malloc_info_type *const info = (NULL != memory ?
internal_malloc_info(*memory, 0/*no check*/) : NULL);
  int result = EXIT_SUCCESS;
  static int error_once = 0;
  if (NULL != info) {
    void *const buffer = info->pointer;
    const size_t size = info->size;
#if defined(_WIN32)
    LIBXSMM_ASSERT(NULL != buffer || 0 == size);
#else
    LIBXSMM_ASSERT((NULL != buffer && MAP_FAILED != buffer) || 0 == size);
#endif
    flags |= (info->flags & ~LIBXSMM_MALLOC_FLAG_RWX); /* merge with current flags */
    /* quietly keep the read permission, but eventually revoke write permissions */
    if (0 == (LIBXSMM_MALLOC_FLAG_W & flags) || 0 != (LIBXSMM_MALLOC_FLAG_X & flags)) {
      /* alignment equals the offset of the user pointer inside the raw buffer */
      const size_t alignment = (size_t)(((const char*)(*memory)) - ((const char*)buffer));
      const size_t alloc_size = size + alignment;
      if (0 == (LIBXSMM_MALLOC_FLAG_X & flags)) { /* data-buffer; non-executable */
#if defined(_WIN32)
        /* TODO: implement memory protection under Microsoft Windows */
        LIBXSMM_UNUSED(alloc_size);
#else
        if (EXIT_SUCCESS != mprotect(buffer, alloc_size/*entire memory region*/, PROT_READ)
          && (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */
          && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
        {
          fprintf(stderr, "LIBXSMM WARNING: read-only request for buffer failed!\n");
        }
#endif
      }
      else { /* executable buffer requested */
        /* when double-mapped (reloc), the executable view lives at reloc+alignment */
        void *const code_ptr = NULL != info->reloc ? ((void*)(((char*)info->reloc) + alignment)) : *memory;
        LIBXSMM_ASSERT(0 != (LIBXSMM_MALLOC_FLAG_X & flags));
        if (name && *name) { /* profiler support requested */
          if (0 > libxsmm_verbosity) { /* avoid dump when only the profiler is enabled */
            FILE* code_file = fopen(name, "rb");
            int diff = 0;
            if (NULL == code_file) { /* file does not exist */
              code_file = fopen(name, "wb");
              if (NULL != code_file) { /* dump byte-code into a file */
                fwrite(code_ptr, 1, size, code_file);
                fclose(code_file);
              }
            }
            else { /* check existing file */
              const char* check_a = (const char*)code_ptr;
              char check_b[4096];
              size_t rest = size;
              do {
                const size_t n = fread(check_b, 1, LIBXSMM_MIN(sizeof(check_b), rest), code_file);
                diff += memcmp(check_a, check_b, LIBXSMM_MIN(sizeof(check_b), n));
                check_a += n;
                rest -= n;
              } while (0 < rest && 0 == diff);
              fclose(code_file);
            }
            fprintf(stderr, "LIBXSMM-JIT-DUMP(ptr:file) %p : %s\n", code_ptr, name);
            if (0 != diff) { /* override existing dump and warn about erroneous condition */
              fprintf(stderr, "LIBXSMM ERROR: %s is shared by different code!\n", name);
              code_file = fopen(name, "wb");
              if (NULL != code_file) { /* dump byte-code into a file */
                fwrite(code_ptr, 1, size, code_file);
                fclose(code_file);
              }
            }
          }
#if defined(LIBXSMM_VTUNE)
          if (iJIT_SAMPLING_ON == iJIT_IsProfilingActive()) {
            LIBXSMM_VTUNE_JIT_DESC_TYPE vtune_jit_desc;
            const unsigned int code_id = iJIT_GetNewMethodID();
            internal_get_vtune_jitdesc(code_ptr, code_id, size, name, &vtune_jit_desc);
            iJIT_NotifyEvent(LIBXSMM_VTUNE_JIT_LOAD, &vtune_jit_desc);
            info->code_id = code_id;
          }
          else {
            info->code_id = 0;
          }
#endif
#if defined(LIBXSMM_PERF)
          /* If JIT is enabled and a valid name is given, emit information for profiler
           * In jitdump case this needs to be done after mprotect as it gets overwritten
           * otherwise.
*/
          libxsmm_perf_dump_code(code_ptr, size, name);
#endif
        }
        if (NULL != info->reloc && info->pointer != info->reloc) {
#if defined(_WIN32)
          /* TODO: implement memory protection under Microsoft Windows */
#else
          /* memory is already protected at this point; relocate code */
          LIBXSMM_ASSERT(0 != (LIBXSMM_MALLOC_FLAG_MMAP & flags));
          *memory = code_ptr; /* relocate */
          info->pointer = info->reloc;
          info->reloc = NULL;
# if !defined(LIBXSMM_MALLOC_CRC_OFF) /* update checksum */
# if defined(LIBXSMM_MALLOC_CRC_LIGHT)
          {
            const internal_malloc_info_type *const code_info = internal_malloc_info(code_ptr, 0/*no check*/);
            info->hash = LIBXSMM_CRC32U(LIBXSMM_BITS)(LIBXSMM_MALLOC_SEED, &code_info);
          }
# else
          info->hash = libxsmm_crc32(LIBXSMM_MALLOC_SEED, info,
            /* info size minus actual hash value */
            (unsigned int)(((char*)&info->hash) - ((char*)info)));
# endif
# endif
          /* treat memory protection errors as soft error; ignore return value */
          munmap(buffer, alloc_size);
#endif
        }
#if !defined(_WIN32)
        else { /* malloc-based fall-back */
          int mprotect_result;
# if !defined(LIBXSMM_MALLOC_CRC_OFF) && defined(LIBXSMM_VTUNE) /* check checksum */
# if defined(LIBXSMM_MALLOC_CRC_LIGHT)
          assert(info->hash == LIBXSMM_CRC32U(LIBXSMM_BITS)(LIBXSMM_MALLOC_SEED, &info)); /* !LIBXSMM_ASSERT */
# else
          assert(info->hash == libxsmm_crc32(LIBXSMM_MALLOC_SEED, info, /* !LIBXSMM_ASSERT */
            /* info size minus actual hash value */
            (unsigned int)(((char*)&info->hash) - ((char*)info))));
# endif
# endif
          /* treat memory protection errors as soft error; ignore return value */
          mprotect_result = mprotect(buffer, alloc_size/*entire memory region*/, PROT_READ | PROT_EXEC);
          if (EXIT_SUCCESS != mprotect_result) {
            if (0 != libxsmm_se) { /* hard-error in case of SELinux */
              if (0 != libxsmm_verbosity /* library code is expected to be mute */
                && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
              {
                fprintf(stderr, "LIBXSMM ERROR: failed to allocate an executable buffer!\n");
              }
              result = mprotect_result;
            }
            else if ((LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */
              && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
            {
              fprintf(stderr, "LIBXSMM WARNING: read-only request for JIT-buffer failed!\n");
            }
          }
        }
#endif
      }
    }
  }
  else if (NULL == memory || NULL == *memory) {
    if (0 != libxsmm_verbosity /* library code is expected to be mute */
      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
    {
      fprintf(stderr, "LIBXSMM ERROR: libxsmm_malloc_attrib failed because NULL cannot be attributed!\n");
    }
    result = EXIT_FAILURE;
  }
  else if ((LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity)
    && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
  {
    fprintf(stderr, "LIBXSMM WARNING: %s buffer %p does not match!\n",
      0 != (LIBXSMM_MALLOC_FLAG_X & flags) ? "executable" : "memory", *memory);
  }
  return result;
}


/** Public aligned allocation; routes to libxsmm_xmalloc or, when the malloc
 *  kind indicates interception (2 <= internal_malloc_kind), to scratch memory. */
LIBXSMM_API LIBXSMM_ATTRIBUTE_MALLOC void* libxsmm_aligned_malloc(size_t size, size_t alignment)
{
  void* result = NULL;
  LIBXSMM_INIT
  if (2 > internal_malloc_kind) {
#if !defined(NDEBUG)
    int status =
#endif
    libxsmm_xmalloc(&result, size, alignment, LIBXSMM_MALLOC_FLAG_DEFAULT, NULL/*extra*/, 0/*extra_size*/);
    assert(EXIT_SUCCESS == status || NULL == result); /* !LIBXSMM_ASSERT */
  }
  else { /* scratch */
    const void *const caller = libxsmm_trace_caller_id(0/*level*/);
    internal_scratch_malloc(&result, size, alignment, LIBXSMM_MALLOC_FLAG_DEFAULT, caller);
  }
  return result;
}


/** Reallocation; the alignment is derived from the trailing zero bits of the
 *  incoming pointer value (a NULL pointer behaves like a fresh allocation). */
LIBXSMM_API void* libxsmm_realloc(size_t size, void* ptr)
{
  const int nzeros = LIBXSMM_INTRINSICS_BITSCANFWD64((uintptr_t)ptr), alignment = 1 << nzeros;
  LIBXSMM_ASSERT(0 == ((uintptr_t)ptr & ~(0xFFFFFFFFFFFFFFFF << nzeros)));
  LIBXSMM_INIT
  if (2 > internal_malloc_kind) {
#if !defined(NDEBUG)
    int status =
#endif
    libxsmm_xmalloc(&ptr, size, alignment, LIBXSMM_MALLOC_FLAG_REALLOC, NULL/*extra*/, 0/*extra_size*/);
    assert(EXIT_SUCCESS == status || NULL == ptr); /* !LIBXSMM_ASSERT */
  }
  else { /*
scratch */
    const void *const caller = libxsmm_trace_caller_id(0/*level*/);
    internal_scratch_malloc(&ptr, size, alignment, LIBXSMM_MALLOC_FLAG_REALLOC, caller);
  }
  return ptr;
}


/** Scratch allocation; internal callers (LIBXSMM_MALLOC_INTERNAL_CALLER) are
 *  tracked under the private watermark rather than the public one. */
LIBXSMM_API void* libxsmm_scratch_malloc(size_t size, size_t alignment, const void* caller)
{
  void* result;
  LIBXSMM_INIT
  internal_scratch_malloc(&result, size, alignment,
    LIBXSMM_MALLOC_INTERNAL_CALLER != caller ? LIBXSMM_MALLOC_FLAG_DEFAULT : LIBXSMM_MALLOC_FLAG_PRIVATE,
    caller);
  return result;
}


/** malloc-compatible entry point with automatic alignment. */
LIBXSMM_API LIBXSMM_ATTRIBUTE_MALLOC void* libxsmm_malloc(size_t size)
{
  return libxsmm_aligned_malloc(size, 0/*auto*/);
}


/** Releases memory obtained from any libxsmm allocation entry point; scratch
 *  buffers are returned to their pool, everything else goes to libxsmm_xfree
 *  (or the real free for untracked/foreign pointers). */
LIBXSMM_API void libxsmm_free(const void* memory)
{
  if (NULL != memory) {
#if defined(LIBXSMM_MALLOC_SCRATCH_DELETE_FIRST) || /* prefer safe method if possible */ \
  (!defined(LIBXSMM_MALLOC_HOOK_STATIC) && !defined(LIBXSMM_MALLOC_HOOK_DYNAMIC))
# if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))
    internal_malloc_pool_type *const pool = internal_scratch_malloc_pool(memory);
    if (NULL != pool) { /* memory belongs to scratch domain */
      internal_scratch_free(memory, pool);
    }
    else
# endif
    { /* local */
      libxsmm_xfree(memory, 2/*check*/);
    }
#else /* lookup matching pool */
    internal_malloc_info_type *const info = internal_malloc_info(memory, 2/*check*/);
    static int error_once = 0;
    if (NULL != info && 0 == (LIBXSMM_MALLOC_FLAG_SCRATCH & info->flags)) { /* !libxsmm_free */
# if !defined(NDEBUG)
      if (EXIT_SUCCESS != internal_xfree(memory, info)
        && 0 != libxsmm_verbosity /* library code is expected to be mute */
        && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
      {
        fprintf(stderr, "LIBXSMM ERROR: memory deallocation failed!\n");
      }
# else
      internal_xfree(memory, info); /* !libxsmm_free */
# endif
    }
    else {
# if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))
      internal_malloc_pool_type *const pool = internal_scratch_malloc_pool(memory);
      if (NULL != pool) { /* memory belongs to scratch domain */
internal_scratch_free(memory, pool); } else # endif { # if defined(NDEBUG) && (defined(LIBXSMM_MALLOC_HOOK_STATIC) || defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)) __real_free((void*)memory); # else # if (defined(LIBXSMM_MALLOC_HOOK_STATIC) || defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)) __real_free((void*)memory); # endif if (0 != libxsmm_verbosity && /* library code is expected to be mute */ 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: deallocation does not match allocation!\n"); } # endif } } #endif } } LIBXSMM_API_INTERN void libxsmm_xrelease_scratch(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock) { #if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) internal_malloc_pool_type* pools = NULL; libxsmm_scratch_info scratch_info; LIBXSMM_ASSERT(libxsmm_scratch_pools <= LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS); if (NULL != lock) { LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, lock); } # if defined(LIBXSMM_MALLOC_DELETE_SAFE) if (0 == (internal_malloc_kind & 1) || 0 >= internal_malloc_kind) # endif { unsigned int i; pools = (internal_malloc_pool_type*)LIBXSMM_UP2( (uintptr_t)internal_malloc_pool_buffer, LIBXSMM_MALLOC_SCRATCH_PADDING); for (i = 0; i < libxsmm_scratch_pools; ++i) { if (0 != pools[i].instance.minsize) { if ( # if !defined(LIBXSMM_MALLOC_SCRATCH_DELETE_FIRST) 1 < pools[i].instance.counter && # endif NULL != pools[i].instance.buffer) { internal_malloc_info_type* const info = internal_malloc_info(pools[i].instance.buffer, 2/*check*/); if (NULL != info) internal_xfree(info->pointer, info); } } else break; /* early exit */ } } LIBXSMM_EXPECT(EXIT_SUCCESS, libxsmm_get_scratch_info(&scratch_info)); if (0 != scratch_info.npending && /* library code is expected to be mute */ (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity)) { char pending_size_buffer[32]; libxsmm_format_size(pending_size_buffer, sizeof(pending_size_buffer), internal_malloc_public_cur + internal_malloc_local_cur, 
"KM", "B", 10); fprintf(stderr, "LIBXSMM WARNING: %s pending scratch-memory by %" PRIuPTR " allocation%s!\n", pending_size_buffer, (uintptr_t)scratch_info.npending, 1 < scratch_info.npending ? "s" : ""); } if (NULL != pools) { memset(pools, 0, (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) * sizeof(internal_malloc_pool_type)); /* no reset: keep private watermark (internal_malloc_private_max, internal_malloc_private_cur) */ internal_malloc_public_max = internal_malloc_public_cur = 0; internal_malloc_local_max = internal_malloc_local_cur = 0; internal_malloc_scratch_nmallocs = 0; } if (NULL != lock) { LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, lock); } #endif } LIBXSMM_API void libxsmm_release_scratch(void) { libxsmm_xrelease_scratch(&libxsmm_lock_global); } LIBXSMM_API int libxsmm_get_malloc_info(const void* memory, libxsmm_malloc_info* info) { int result = EXIT_SUCCESS; if (NULL != info) { size_t size; result = libxsmm_get_malloc_xinfo(memory, &size, NULL/*flags*/, NULL/*extra*/); LIBXSMM_MEMZERO127(info); if (EXIT_SUCCESS == result) { info->size = size; } #if !defined(NDEBUG) /* library code is expected to be mute */ else if (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) { static int error_once = 0; if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM WARNING: foreign memory buffer %p discovered!\n", memory); } } #endif } else { result = EXIT_FAILURE; } return result; } LIBXSMM_API int libxsmm_get_scratch_info(libxsmm_scratch_info* info) { int result = EXIT_SUCCESS; if (NULL != info) { #if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) LIBXSMM_MEMZERO127(info); info->nmallocs = internal_malloc_scratch_nmallocs; info->internal = internal_malloc_private_max; info->local = internal_malloc_local_max; info->size = internal_malloc_public_max; { const internal_malloc_pool_type* pool = (const internal_malloc_pool_type*)LIBXSMM_UP2( (uintptr_t)internal_malloc_pool_buffer, 
LIBXSMM_MALLOC_SCRATCH_PADDING); # if (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) const internal_malloc_pool_type *const end = pool + libxsmm_scratch_pools; LIBXSMM_ASSERT(libxsmm_scratch_pools <= LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS); for (; pool != end; ++pool) if ((LIBXSMM_MALLOC_INTERNAL_CALLER) != pool->instance.site) { # endif if (0 != pool->instance.minsize) { const size_t npending = pool->instance.counter; # if defined(LIBXSMM_MALLOC_SCRATCH_DELETE_FIRST) info->npending += npending; # else info->npending += 1 < npending ? (npending - 1) : 0; # endif ++info->npools; } # if (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) else break; /* early exit */ } # endif } #else LIBXSMM_MEMZERO127(info); #endif /*defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))*/ } else { result = EXIT_FAILURE; } return result; } LIBXSMM_API void libxsmm_set_scratch_limit(size_t nbytes) { /* !LIBXSMM_INIT */ internal_malloc_scratch_limit = nbytes; } LIBXSMM_API size_t libxsmm_get_scratch_limit(void) { size_t result; /* !LIBXSMM_INIT */ if (LIBXSMM_SCRATCH_DEFAULT != internal_malloc_scratch_limit) { result = internal_malloc_scratch_limit; } else if (0 == internal_malloc_kind) { result = LIBXSMM_MALLOC_SCRATCH_LIMIT; } else { result = LIBXSMM_SCRATCH_UNLIMITED; } return result; } LIBXSMM_API void libxsmm_set_malloc(int enabled, const size_t* lo, const size_t* hi) { /* !LIBXSMM_INIT */ #if !(defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) || defined(LIBXSMM_INTERCEPT_DYNAMIC)) LIBXSMM_UNUSED(enabled); internal_malloc_kind = 0; #elif defined(LIBXSMM_MALLOC) && (0 < LIBXSMM_MALLOC) LIBXSMM_UNUSED(enabled); internal_malloc_kind = LIBXSMM_MALLOC; #else internal_malloc_kind = enabled; #endif /* setup lo/hi after internal_malloc_kind! 
*/ if (NULL != lo) internal_malloc_limit[0] = *lo; if (NULL != hi) { const size_t scratch_limit = libxsmm_get_scratch_limit(); const size_t malloc_upper = LIBXSMM_MIN(*hi, scratch_limit); internal_malloc_limit[1] = LIBXSMM_MAX(malloc_upper, internal_malloc_limit[0]); } libxsmm_malloc_init(); } LIBXSMM_API int libxsmm_get_malloc(size_t* lo, size_t* hi) { int result; LIBXSMM_INIT if (NULL != lo) *lo = internal_malloc_limit[0]; if (NULL != hi) *hi = internal_malloc_limit[1]; #if (defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) || defined(LIBXSMM_INTERCEPT_DYNAMIC)) result = 0 != (internal_malloc_kind & 1) && 0 < internal_malloc_kind; #else result = 0; #endif return result; } libxsmm-1.17/src/libxsmm_math.c000066400000000000000000000444561415223013700165470ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include "libxsmm_main.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if !defined(LIBXSMM_NO_LIBM) # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif LIBXSMM_API int libxsmm_matdiff(libxsmm_matdiff_info* info, libxsmm_datatype datatype, libxsmm_blasint m, libxsmm_blasint n, const void* ref, const void* tst, const libxsmm_blasint* ldref, const libxsmm_blasint* ldtst) { int result = EXIT_SUCCESS, result_swap = 0, result_nan = 0; libxsmm_blasint ldr = (NULL == ldref ? m : *ldref), ldt = (NULL == ldtst ? 
m : *ldtst); if (NULL == ref && NULL != tst) { ref = tst; tst = NULL; result_swap = 1; } if (NULL != ref && NULL != info && m <= ldr && m <= ldt) { libxsmm_blasint mm = m, nn = n; double inf; if (1 == n) { mm = ldr = ldt = 1; nn = m; } /* ensure row-vector shape to standardize results */ libxsmm_matdiff_clear(info); inf = info->min_ref; switch (datatype) { case LIBXSMM_DATATYPE_F64: { # define LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE double # include "template/libxsmm_matdiff.tpl.c" # undef LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE } break; case LIBXSMM_DATATYPE_F32: { # define LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE float # include "template/libxsmm_matdiff.tpl.c" # undef LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE } break; case LIBXSMM_DATATYPE_I32: { # define LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE int # include "template/libxsmm_matdiff.tpl.c" # undef LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE } break; case LIBXSMM_DATATYPE_I16: { # define LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE short # include "template/libxsmm_matdiff.tpl.c" # undef LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE } break; case LIBXSMM_DATATYPE_I8: { # define LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE signed char # include "template/libxsmm_matdiff.tpl.c" # undef LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE } break; default: { static int error_once = 0; if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: unsupported data-type requested!\n"); } result = EXIT_FAILURE; } } LIBXSMM_ASSERT((0 <= info->m && 0 <= info->n) || (0 > info->m && 0 > info->n)); LIBXSMM_ASSERT(info->m < mm && info->n < nn); if (EXIT_SUCCESS == result) { const char *const env = getenv("LIBXSMM_DUMP"); LIBXSMM_INIT if (NULL != env && 0 != *env && '0' != *env) { if ('-' != *env || (0 <= info->m && 0 <= info->n)) { const char *const defaultname = (('0' < *env && '9' >= *env) || '-' == *env) ? 
"libxsmm_dump" : env; const libxsmm_mhd_elemtype type_src = (libxsmm_mhd_elemtype)datatype; const libxsmm_mhd_elemtype type_dst = LIBXSMM_MIN(LIBXSMM_MHD_ELEMTYPE_F32, type_src); const int envi = atoi(env), reshape = (1 < envi || -1 > envi); size_t shape[2], size[2]; char filename[256]; if (0 == reshape) { shape[0] = (size_t)mm; shape[1] = (size_t)nn; size[0] = (size_t)ldr; size[1] = (size_t)nn; } else { /* reshape */ const size_t x = (size_t)mm * (size_t)nn; const size_t y = (size_t)libxsmm_isqrt2_u32((unsigned int)x); shape[0] = x / y; shape[1] = y; size[0] = shape[0]; size[1] = shape[1]; } LIBXSMM_SNPRINTF(filename, sizeof(filename), "%s-%p-ref.mhd", defaultname, ref); libxsmm_mhd_write(filename, NULL/*offset*/, shape, size, 2/*ndims*/, 1/*ncomponents*/, type_src, &type_dst, ref, NULL/*header_size*/, NULL/*extension_header*/, NULL/*extension*/, 0/*extension_size*/); if (NULL != tst) { if (0 == reshape) { size[0] = (size_t)ldt; size[1] = (size_t)nn; } LIBXSMM_SNPRINTF(filename, sizeof(filename), "%s-%p-tst.mhd", defaultname, ref/*adopt ref-ptr*/); libxsmm_mhd_write(filename, NULL/*offset*/, shape, size, 2/*ndims*/, 1/*ncomponents*/, type_src, &type_dst, tst, NULL/*header_size*/, NULL/*extension_header*/, NULL/*extension*/, 0/*extension_size*/); if ('-' == *env && '1' < env[1]) { printf("LIBXSMM MATDIFF (%s): m=%" PRIuPTR " n=%" PRIuPTR " ldi=%" PRIuPTR " ldo=%" PRIuPTR " failed.\n", libxsmm_typename(datatype), (uintptr_t)m, (uintptr_t)n, (uintptr_t)ldr, (uintptr_t)ldt); } } } else if ('-' == *env && '1' < env[1] && NULL != tst) { printf("LIBXSMM MATDIFF (%s): m=%" PRIuPTR " n=%" PRIuPTR " ldi=%" PRIuPTR " ldo=%" PRIuPTR " passed.\n", libxsmm_typename(datatype), (uintptr_t)m, (uintptr_t)n, (uintptr_t)ldr, (uintptr_t)ldt); } } if (0 == result_nan) { info->normf_rel = libxsmm_dsqrt(info->normf_rel); info->l2_abs = libxsmm_dsqrt(info->l2_abs); info->l2_rel = libxsmm_dsqrt(info->l2_rel); } else if (1 == result_nan) { /* in case of NaN in test-set, statistics is not 
set to inf (ref/test) */ info->norm1_abs = info->norm1_rel = info->normi_abs = info->normi_rel = info->normf_rel = info->linf_abs = info->linf_rel = info->l2_abs = info->l2_rel = inf; } if (1 == n) { const libxsmm_blasint tmp = info->m; info->m = info->n; info->n = tmp; } if (0 != result_swap) { info->min_tst = info->min_ref; info->min_ref = 0; info->max_tst = info->max_ref; info->max_ref = 0; info->avg_tst = info->avg_ref; info->avg_ref = 0; info->var_tst = info->var_ref; info->var_ref = 0; info->l1_tst = info->l1_ref; info->l1_ref = 0; } } } else { result = EXIT_FAILURE; } return result; } LIBXSMM_API void libxsmm_matdiff_reduce(libxsmm_matdiff_info* output, const libxsmm_matdiff_info* input) { LIBXSMM_ASSERT(NULL != output && NULL != input); if (output->linf_abs < input->linf_abs) { output->linf_abs = input->linf_abs; LIBXSMM_ASSERT(0 <= input->m); output->m = input->m; LIBXSMM_ASSERT(0 <= input->n); output->n = input->n; } if (output->norm1_abs < input->norm1_abs) { output->norm1_abs = input->norm1_abs; } if (output->norm1_rel < input->norm1_rel) { output->norm1_rel = input->norm1_rel; } if (output->normi_abs < input->normi_abs) { output->normi_abs = input->normi_abs; } if (output->normi_rel < input->normi_rel) { output->normi_rel = input->normi_rel; } if (output->normf_rel < input->normf_rel) { output->normf_rel = input->normf_rel; } if (output->linf_rel < input->linf_rel) { output->linf_rel = input->linf_rel; } if (output->l2_abs < input->l2_abs) { output->l2_abs = input->l2_abs; } if (output->l2_rel < input->l2_rel) { output->l2_rel = input->l2_rel; } if (output->var_ref < input->var_ref) { output->var_ref = input->var_ref; } if (output->var_tst < input->var_tst) { output->var_tst = input->var_tst; } if (output->max_ref < input->max_ref) { output->max_ref = input->max_ref; } if (output->max_tst < input->max_tst) { output->max_tst = input->max_tst; } if (output->min_ref > input->min_ref) { output->min_ref = input->min_ref; } if (output->min_tst > 
input->min_tst) { output->min_tst = input->min_tst; } output->avg_ref = 0.5 * (output->avg_ref + input->avg_ref); output->avg_tst = 0.5 * (output->avg_tst + input->avg_tst); output->l1_ref += input->l1_ref; output->l1_tst += input->l1_tst; } LIBXSMM_API void libxsmm_matdiff_clear(libxsmm_matdiff_info* info) { if (NULL != info) { union { int raw; float value; } inf; #if defined(INFINITY) && /*overflow warning*/!defined(_CRAYC) inf.value = (float)(INFINITY); #else inf.raw = 0x7F800000; #endif memset(info, 0, sizeof(*info)); /* nullify */ /* no location discovered yet with a difference */ info->m = info->n = -1; /* initial minimum/maximum of reference/test */ info->min_ref = info->min_tst = +inf.value; info->max_ref = info->max_tst = -inf.value; } } LIBXSMM_API size_t libxsmm_shuffle(unsigned int n) { const unsigned int s = (0 != (n & 1) ? ((n / 2 - 1) | 1) : ((n / 2) & ~1)); const unsigned int d = (0 != (n & 1) ? 1 : 2); unsigned int result = (1 < n ? 1 : 0), i; for (i = (d < n ? (n - 1) : 0); d < i; i -= d) { unsigned int c = (s <= i ? (i - s) : (s - i)); unsigned int a = n, b = c; do { const unsigned int r = a % b; a = b; b = r; } while (0 != b); if (1 == a) { result = c; if (2 * c <= n) { i = d; /* break */ } } } assert((0 == result && 1 >= n) || (result < n && 1 == libxsmm_gcd(result, n))); return result; } LIBXSMM_API unsigned int libxsmm_isqrt_u64(unsigned long long x) { unsigned long long b; unsigned int y = 0, s; for (s = 0x80000000/*2^31*/; 0 < s; s >>= 1) { b = y | s; y |= (b * b <= x ? 
s : 0); } return y; } LIBXSMM_API unsigned int libxsmm_isqrt_u32(unsigned int x) { unsigned int b; unsigned int y = 0; int s; for (s = 0x40000000/*2^30*/; 0 < s; s >>= 2) { b = y | s; y >>= 1; if (b <= x) { x -= b; y |= s; } } return y; } LIBXSMM_API unsigned int libxsmm_isqrt2_u32(unsigned int x) { return libxsmm_product_limit(x, libxsmm_isqrt_u32(x), 0/*is_lower*/); } LIBXSMM_API LIBXSMM_INTRINSICS(LIBXSMM_X86_GENERIC) double libxsmm_dsqrt(double x) { #if defined(LIBXSMM_INTRINSICS_X86) && !defined(__PGI) const __m128d a = LIBXSMM_INTRINSICS_MM_UNDEFINED_PD(); const double result = _mm_cvtsd_f64(_mm_sqrt_sd(a, _mm_set_sd(x))); #elif !defined(LIBXSMM_NO_LIBM) const double result = sqrt(x); #else /* fall-back */ double result, y = x; if (LIBXSMM_NEQ(0, x)) { do { result = y; y = 0.5 * (y + x / y); } while (LIBXSMM_NEQ(result, y)); } result = y; #endif return result; } LIBXSMM_API LIBXSMM_INTRINSICS(LIBXSMM_X86_GENERIC) float libxsmm_ssqrt(float x) { #if defined(LIBXSMM_INTRINSICS_X86) const float result = _mm_cvtss_f32(_mm_sqrt_ss(_mm_set_ss(x))); #elif !defined(LIBXSMM_NO_LIBM) const float result = LIBXSMM_SQRTF(x); #else /* fall-back */ float result, y = x; if (LIBXSMM_NEQ(0, x)) { do { result = y; y = 0.5f * (y + x / y); } while (LIBXSMM_NEQ(result, y)); } result = y; #endif return result; } LIBXSMM_API unsigned int libxsmm_icbrt_u64(unsigned long long x) { unsigned long long b; unsigned int y = 0; int s; for (s = 63; 0 <= s; s -= 3) { y += y; b = ((unsigned long long)y + 1) * 3 * y + 1ULL; if (b <= (x >> s)) { x -= b << s; ++y; } } return y; } LIBXSMM_API unsigned int libxsmm_icbrt_u32(unsigned int x) { unsigned int b; unsigned int y = 0; int s; for (s = 30; 0 <= s; s -= 3) { y += y; b = 3 * y * (y + 1) + 1; if (b <= (x >> s)) { x -= b << s; ++y; } } return y; } #if defined(LIBXSMM_NO_LIBM) /* Implementation based on Claude Baumann's product (http://www.convict.lu/Jeunes/ultimate_stuff/exp_ln_2.htm). 
* Exponential function, which exposes the number of iterations taken in the main case (1...22). */ LIBXSMM_API_INLINE float internal_math_sexp2(float x, int maxiter) { static const float lut[] = { /* tabulated powf(2.f, powf(2.f, -index)) */ 2.00000000f, 1.41421354f, 1.18920708f, 1.09050775f, 1.04427373f, 1.02189720f, 1.01088929f, 1.00542986f, 1.00271130f, 1.00135469f, 1.00067711f, 1.00033855f, 1.00016928f, 1.00008464f, 1.00004232f, 1.00002110f, 1.00001061f, 1.00000525f, 1.00000262f, 1.00000131f, 1.00000072f, 1.00000036f, 1.00000012f }; const int lut_size = sizeof(lut) / sizeof(*lut), lut_size1 = lut_size - 1; int sign, temp, unbiased, exponent, mantissa; union { int i; float s; } result; result.s = x; sign = (0 == (result.i & 0x80000000) ? 0 : 1); temp = result.i & 0x7FFFFFFF; /* clear sign */ unbiased = (temp >> 23) - 127; /* exponent */ exponent = -unbiased; mantissa = (temp << 8) | 0x80000000; if (lut_size1 >= exponent) { if (lut_size1 != exponent) { /* multiple lookups needed */ if (7 >= unbiased) { /* not a degenerated case */ const int n = (0 >= maxiter || lut_size1 <= maxiter) ? lut_size1 : maxiter; int i = 1; if (0 > unbiased) { /* regular/main case */ LIBXSMM_ASSERT(0 <= exponent && exponent < lut_size); result.s = lut[exponent]; /* initial value */ i = exponent + 1; /* next LUT offset */ } else { result.s = 2.f; /* lut[0] */ i = 1; /* next LUT offset */ } for (; i <= n && 0 != mantissa; ++i) { mantissa <<= 1; if (0 != (mantissa & 0x80000000)) { /* check MSB */ LIBXSMM_ASSERT(0 <= i && i < lut_size); result.s *= lut[i]; /* TODO: normalized multiply */ } } for (i = 0; i < unbiased; ++i) { /* compute squares */ result.s *= result.s; } if (0 != sign) { /* negative value, so reciprocal */ result.s = 1.f / result.s; } } else { /* out of range */ #if defined(INFINITY) && /*overflow warning*/!defined(_CRAYC) result.s = (0 == sign ? ((float)(INFINITY)) : 0.f); #else result.i = (0 == sign ? 
0x7F800000 : 0); #endif } } else if (0 == sign) { result.s = lut[lut_size1]; } else { /* reciprocal */ result.s = 1.f / lut[lut_size1]; } } else { result.s = 1.f; /* case 2^0 */ } return result.s; } #endif LIBXSMM_API float libxsmm_sexp2(float x) { #if !defined(LIBXSMM_NO_LIBM) return LIBXSMM_EXP2F(x); #else /* fall-back */ return internal_math_sexp2(x, 20/*compromise*/); #endif } LIBXSMM_API float libxsmm_sexp2_u8(unsigned char x) { union { int i; float s; } result; if (128 > x) { if (31 < x) { const float r32 = 2.f * ((float)(1U << 31)); /* 2^32 */ const int n = x >> 5; int i; result.s = r32; for (i = 1; i < n; ++i) result.s *= r32; result.s *= (1U << (x - (n << 5))); } else { result.s = (float)(1U << x); } } else { #if defined(INFINITY) && /*overflow warning*/!defined(_CRAYC) result.s = (float)(INFINITY); #else result.i = 0x7F800000; #endif } return result.s; } LIBXSMM_API float libxsmm_sexp2_i8(signed char x) { union { int i; float s; } result; if (-128 != x) { const signed char ux = (signed char)LIBXSMM_ABS(x); if (31 < ux) { const float r32 = 2.f * ((float)(1U << 31)); /* 2^32 */ const int n = ux >> 5; int i; result.s = r32; for (i = 1; i < n; ++i) result.s *= r32; result.s *= (1U << (ux - (n << 5))); } else { result.s = (float)(1U << ux); } if (ux != x) { /* signed */ result.s = 1.f / result.s; } } else { result.i = 0x200000; } return result.s; } LIBXSMM_API float libxsmm_sexp2_i8i(int x) { LIBXSMM_ASSERT(-128 <= x && x <= 127); return libxsmm_sexp2_i8((signed char)x); } #if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matdiff)(libxsmm_matdiff_info* /*info*/, const int* /*datatype*/, const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const void* /*ref*/, const void* /*tst*/, const libxsmm_blasint* /*ldref*/, const libxsmm_blasint* /*ldtst*/); LIBXSMM_API void 
LIBXSMM_FSYMBOL(libxsmm_matdiff)(libxsmm_matdiff_info* info, const int* datatype, const libxsmm_blasint* m, const libxsmm_blasint* n, const void* ref, const void* tst, const libxsmm_blasint* ldref, const libxsmm_blasint* ldtst) { static int error_once = 0; if ((NULL == datatype || LIBXSMM_DATATYPE_UNSUPPORTED <= *datatype || 0 > *datatype || NULL == m || EXIT_SUCCESS != libxsmm_matdiff(info, (libxsmm_datatype)*datatype, *m, *(NULL != n ? n : m), ref, tst, ldref, ldtst)) && 0 != libxsmm_verbosity && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_matdiff specified!\n"); } } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matdiff_reduce)(libxsmm_matdiff_info* /*output*/, const libxsmm_matdiff_info* /*input*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matdiff_reduce)(libxsmm_matdiff_info* output, const libxsmm_matdiff_info* input) { libxsmm_matdiff_reduce(output, input); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matdiff_clear)(libxsmm_matdiff_info* /*info*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matdiff_clear)(libxsmm_matdiff_info* info) { libxsmm_matdiff_clear(info); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_shuffle)(long long* /*coprime*/, const int* /*n*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_shuffle)(long long* coprime, const int* n) { #if !defined(NDEBUG) static int error_once = 0; if (NULL != coprime && NULL != n && 0 <= *n) #endif { *coprime = (long long)(libxsmm_shuffle((unsigned int)(*n)) & 0x7FFFFFFF); } #if !defined(NDEBUG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_shuffle specified!\n"); } #endif } #endif 
/*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ libxsmm-1.17/src/libxsmm_memory.c000066400000000000000000000357571415223013700171320ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include "libxsmm_hash.h" #include "libxsmm_diff.h" #include "libxsmm_main.h" #if !defined(LIBXSMM_MEMORY_STDLIB) && 0 # define LIBXSMM_MEMORY_STDLIB #endif #if !defined(LIBXSMM_MEMORY_SW) && 0 # define LIBXSMM_MEMORY_SW #endif #if !defined(LIBXSMM_MEMORY_SW) LIBXSMM_APIVAR_DEFINE(unsigned char (*internal_diff_function)(const void*, const void*, unsigned char)); LIBXSMM_APIVAR_DEFINE(int (*internal_memcmp_function)(const void*, const void*, size_t)); #endif LIBXSMM_API_INLINE unsigned char internal_diff_sw(const void* a, const void* b, unsigned char size) { #if defined(LIBXSMM_MEMORY_STDLIB) && defined(LIBXSMM_MEMORY_SW) return (unsigned char)memcmp(a, b, size); #else const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; unsigned char i; LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ for (i = 0; i < (size & 0xF0); i += 16) { LIBXSMM_DIFF_16_DECL(aa); LIBXSMM_DIFF_16_LOAD(aa, a8 + i); if (LIBXSMM_DIFF_16(aa, b8 + i, 0/*dummy*/)) return 1; } for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; return 0; #endif } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE3) unsigned char internal_diff_sse3(const void* a, const void* b, unsigned char size) { #if defined(LIBXSMM_INTRINSICS_SSE3) && !defined(LIBXSMM_MEMORY_SW) const uint8_t 
*const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; unsigned char i; LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ for (i = 0; i < (size & 0xF0); i += 16) { LIBXSMM_DIFF_SSE3_DECL(aa); LIBXSMM_DIFF_SSE3_LOAD(aa, a8 + i); if (LIBXSMM_DIFF_SSE3(aa, b8 + i, 0/*dummy*/)) return 1; } for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; return 0; #else return internal_diff_sw(a, b, size); #endif } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) unsigned char internal_diff_avx2(const void* a, const void* b, unsigned char size) { #if defined(LIBXSMM_INTRINSICS_AVX2) && !defined(LIBXSMM_MEMORY_SW) const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; unsigned char i; LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ for (i = 0; i < (size & 0xE0); i += 32) { LIBXSMM_DIFF_AVX2_DECL(aa); LIBXSMM_DIFF_AVX2_LOAD(aa, a8 + i); if (LIBXSMM_DIFF_AVX2(aa, b8 + i, 0/*dummy*/)) return 1; } for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; return 0; #else return internal_diff_sw(a, b, size); #endif } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) unsigned char internal_diff_avx512(const void* a, const void* b, unsigned char size) { #if defined(LIBXSMM_INTRINSICS_AVX512) && !defined(LIBXSMM_MEMORY_SW) const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; unsigned char i; LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ for (i = 0; i < (size & 0xC0); i += 64) { LIBXSMM_DIFF_AVX512_DECL(aa); LIBXSMM_DIFF_AVX512_LOAD(aa, a8 + i); if (LIBXSMM_DIFF_AVX512(aa, b8 + i, 0/*dummy*/)) return 1; } for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; return 0; #else return internal_diff_sw(a, b, size); #endif } LIBXSMM_API_INLINE int internal_memcmp_sw(const void* a, const void* b, size_t size) { #if defined(LIBXSMM_MEMORY_STDLIB) return memcmp(a, b, size); #else const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; size_t i; LIBXSMM_DIFF_16_DECL(aa); LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ for (i = 0; i < (size & 0xFFFFFFFFFFFFFFF0); i += 16) { 
LIBXSMM_DIFF_16_LOAD(aa, a8 + i); if (LIBXSMM_DIFF_16(aa, b8 + i, 0/*dummy*/)) return 1; } for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; return 0; #endif } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE3) int internal_memcmp_sse3(const void* a, const void* b, size_t size) { #if defined(LIBXSMM_INTRINSICS_SSE3) && !defined(LIBXSMM_MEMORY_SW) const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; size_t i; LIBXSMM_DIFF_SSE3_DECL(aa); LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ for (i = 0; i < (size & 0xFFFFFFFFFFFFFFF0); i += 16) { LIBXSMM_DIFF_SSE3_LOAD(aa, a8 + i); if (LIBXSMM_DIFF_SSE3(aa, b8 + i, 0/*dummy*/)) return 1; } for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; return 0; #else return internal_memcmp_sw(a, b, size); #endif } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) int internal_memcmp_avx2(const void* a, const void* b, size_t size) { #if defined(LIBXSMM_INTRINSICS_AVX2) && !defined(LIBXSMM_MEMORY_SW) const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; size_t i; LIBXSMM_DIFF_AVX2_DECL(aa); LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ for (i = 0; i < (size & 0xFFFFFFFFFFFFFFE0); i += 32) { LIBXSMM_DIFF_AVX2_LOAD(aa, a8 + i); if (LIBXSMM_DIFF_AVX2(aa, b8 + i, 0/*dummy*/)) return 1; } for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; return 0; #else return internal_memcmp_sw(a, b, size); #endif } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) int internal_memcmp_avx512(const void* a, const void* b, size_t size) { #if defined(LIBXSMM_INTRINSICS_AVX512) && !defined(LIBXSMM_MEMORY_SW) const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; size_t i; LIBXSMM_DIFF_AVX512_DECL(aa); LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ for (i = 0; i < (size & 0xFFFFFFFFFFFFFFC0); i += 64) { LIBXSMM_DIFF_AVX512_LOAD(aa, a8 + i); if (LIBXSMM_DIFF_AVX512(aa, b8 + i, 0/*dummy*/)) return 1; } for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; return 0; #else return internal_memcmp_sw(a, b, size); #endif } 
LIBXSMM_API_INTERN void libxsmm_memory_init(int target_arch) { #if defined(LIBXSMM_MEMORY_SW) LIBXSMM_UNUSED(target_arch); #else if (LIBXSMM_X86_AVX512 <= target_arch) { # if defined(LIBXSMM_DIFF_AVX512_ENABLED) internal_diff_function = internal_diff_avx512; # else internal_diff_function = internal_diff_avx2; # endif # if defined(LIBXSMM_DIFF_AVX512_ENABLED) internal_memcmp_function = internal_memcmp_avx512; # else internal_memcmp_function = internal_memcmp_avx2; # endif } else if (LIBXSMM_X86_AVX2 <= target_arch) { internal_diff_function = internal_diff_avx2; internal_memcmp_function = internal_memcmp_avx2; } else if (LIBXSMM_X86_SSE3 <= target_arch) { internal_diff_function = internal_diff_sse3; internal_memcmp_function = internal_memcmp_sse3; } else { internal_diff_function = internal_diff_sw; internal_memcmp_function = internal_memcmp_sw; } LIBXSMM_ASSERT(NULL != internal_diff_function); LIBXSMM_ASSERT(NULL != internal_memcmp_function); #endif } LIBXSMM_API_INTERN void libxsmm_memory_finalize(void) { #if !defined(NDEBUG) && !defined(LIBXSMM_MEMORY_SW) internal_diff_function = NULL; internal_memcmp_function = NULL; #endif } LIBXSMM_API unsigned char libxsmm_diff_16(const void* a, const void* b, ...) { #if defined(LIBXSMM_MEMORY_SW) return internal_diff_sw(a, b, 16); #else LIBXSMM_DIFF_16_DECL(a16); LIBXSMM_DIFF_16_LOAD(a16, a); return LIBXSMM_DIFF_16(a16, b, 0/*dummy*/); #endif } LIBXSMM_API unsigned char libxsmm_diff_32(const void* a, const void* b, ...) { #if defined(LIBXSMM_MEMORY_SW) return internal_diff_sw(a, b, 32); #else LIBXSMM_DIFF_32_DECL(a32); LIBXSMM_DIFF_32_LOAD(a32, a); return LIBXSMM_DIFF_32(a32, b, 0/*dummy*/); #endif } LIBXSMM_API unsigned char libxsmm_diff_48(const void* a, const void* b, ...) 
{ #if defined(LIBXSMM_MEMORY_SW) return internal_diff_sw(a, b, 48); #else LIBXSMM_DIFF_48_DECL(a48); LIBXSMM_DIFF_48_LOAD(a48, a); return LIBXSMM_DIFF_48(a48, b, 0/*dummy*/); #endif } LIBXSMM_API unsigned char libxsmm_diff_64(const void* a, const void* b, ...) { #if defined(LIBXSMM_MEMORY_SW) return internal_diff_sw(a, b, 64); #else LIBXSMM_DIFF_64_DECL(a64); LIBXSMM_DIFF_64_LOAD(a64, a); return LIBXSMM_DIFF_64(a64, b, 0/*dummy*/); #endif } LIBXSMM_API unsigned char libxsmm_diff(const void* a, const void* b, unsigned char size) { #if defined(LIBXSMM_MEMORY_SW) && !defined(LIBXSMM_MEMORY_STDLIB) return internal_diff_sw(a, b, size); #else # if defined(LIBXSMM_MEMORY_STDLIB) return 0 != memcmp(a, b, size); # elif (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) && defined(LIBXSMM_DIFF_AVX512_ENABLED) return internal_diff_avx512(a, b, size); # elif (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) return internal_diff_avx2(a, b, size); # elif (LIBXSMM_X86_SSE3 <= LIBXSMM_STATIC_TARGET_ARCH) # if (LIBXSMM_X86_AVX2 > LIBXSMM_MAX_STATIC_TARGET_ARCH) return internal_diff_sse3(a, b, size); # else /* pointer based function call */ # if defined(LIBXSMM_INIT_COMPLETED) LIBXSMM_ASSERT(NULL != internal_diff_function); return internal_diff_function(a, b, size); # else return (unsigned char)(NULL != internal_diff_function ? 
internal_diff_function(a, b, size) : internal_diff_sse3(a, b, size)); # endif # endif # else return internal_diff_sw(a, b, size); # endif #endif } LIBXSMM_API unsigned int libxsmm_diff_n(const void* a, const void* bn, unsigned char size, unsigned char stride, unsigned int hint, unsigned int n) { unsigned int result; LIBXSMM_ASSERT(size <= stride); #if defined(LIBXSMM_MEMORY_STDLIB) && !defined(LIBXSMM_MEMORY_SW) LIBXSMM_DIFF_N(unsigned int, result, memcmp, a, bn, size, stride, hint, n); #else # if !defined(LIBXSMM_MEMORY_SW) switch (size) { case 64: { LIBXSMM_DIFF_64_DECL(a64); LIBXSMM_DIFF_64_LOAD(a64, a); LIBXSMM_DIFF_N(unsigned int, result, LIBXSMM_DIFF_64, a64, bn, size, stride, hint, n); } break; case 48: { LIBXSMM_DIFF_48_DECL(a48); LIBXSMM_DIFF_48_LOAD(a48, a); LIBXSMM_DIFF_N(unsigned int, result, LIBXSMM_DIFF_48, a48, bn, size, stride, hint, n); } break; case 32: { LIBXSMM_DIFF_32_DECL(a32); LIBXSMM_DIFF_32_LOAD(a32, a); LIBXSMM_DIFF_N(unsigned int, result, LIBXSMM_DIFF_32, a32, bn, size, stride, hint, n); } break; case 16: { LIBXSMM_DIFF_16_DECL(a16); LIBXSMM_DIFF_16_LOAD(a16, a); LIBXSMM_DIFF_N(unsigned int, result, LIBXSMM_DIFF_16, a16, bn, size, stride, hint, n); } break; default: # endif { LIBXSMM_DIFF_N(unsigned int, result, libxsmm_diff, a, bn, size, stride, hint, n); } # if !defined(LIBXSMM_MEMORY_SW) } # endif #endif return result; } LIBXSMM_API int libxsmm_memcmp(const void* a, const void* b, size_t size) { #if defined(LIBXSMM_MEMORY_SW) && !defined(LIBXSMM_MEMORY_STDLIB) return internal_memcmp_sw(a, b, size); #else # if defined(LIBXSMM_MEMORY_STDLIB) return memcmp(a, b, size); # elif (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) && defined(LIBXSMM_DIFF_AVX512_ENABLED) return internal_memcmp_avx512(a, b, size); # elif (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) return internal_memcmp_avx2(a, b, size); # elif (LIBXSMM_X86_SSE3 <= LIBXSMM_STATIC_TARGET_ARCH) # if (LIBXSMM_X86_AVX2 > LIBXSMM_MAX_STATIC_TARGET_ARCH) return 
internal_memcmp_sse3(a, b, size); # else /* pointer based function call */ # if defined(LIBXSMM_INIT_COMPLETED) LIBXSMM_ASSERT(NULL != internal_memcmp_function); return internal_memcmp_function(a, b, size); # else return NULL != internal_memcmp_function ? internal_memcmp_function(a, b, size) : internal_memcmp_sse3(a, b, size); # endif # endif # else return internal_memcmp_sw(a, b, size); # endif #endif } LIBXSMM_API unsigned int libxsmm_hash(const void* data, unsigned int size, unsigned int seed) { LIBXSMM_INIT return libxsmm_crc32(seed, data, size); } LIBXSMM_API unsigned long long libxsmm_hash_string(const char string[]) { unsigned long long result; const size_t length = NULL != string ? strlen(string) : 0; if (sizeof(result) < length) { const size_t length2 = length / 2; unsigned int seed32 = 0; /* seed=0: match else-optimization */ LIBXSMM_INIT seed32 = libxsmm_crc32(seed32, string, length2); result = libxsmm_crc32(seed32, string + length2, length - length2); result = (result << 32) | seed32; } else { /* reinterpret directly as hash value */ char *const s = (char*)&result; signed char i; for (i = 0; i < (signed char)length; ++i) s[i] = string[i]; for (; i < (signed char)sizeof(result); ++i) s[i] = 0; } return result; } #if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xhash)(int* /*hash_seed*/, const void* /*data*/, const int* /*size*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xhash)(int* hash_seed, const void* data, const int* size) { #if !defined(NDEBUG) static int error_once = 0; if (NULL != hash_seed && NULL != data && NULL != size && 0 <= *size) #endif { *hash_seed = (int)(libxsmm_hash(data, (unsigned int)*size, (unsigned int)*hash_seed) & 0x7FFFFFFF/*sign-bit*/); } #if !defined(NDEBUG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, 
LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xhash specified!\n"); } #endif } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xdiff)(int* /*result*/, const void* /*a*/, const void* /*b*/, const long long* /*size*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xdiff)(int* result, const void* a, const void* b, const long long* size) { #if !defined(NDEBUG) static int error_once = 0; if (NULL != result && NULL != a && NULL != b && NULL != size && 0 <= *size) #endif { *result = libxsmm_memcmp(a, b, (size_t)*size); } #if !defined(NDEBUG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xdiff specified!\n"); } #endif } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xclear)(void* /*dst*/, const int* /*size*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xclear)(void* dst, const int* size) { #if !defined(NDEBUG) static int error_once = 0; if (NULL != dst && NULL != size && 0 <= *size && 128 > *size) #endif { LIBXSMM_MEMSET127(dst, 0, *size); } #if !defined(NDEBUG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xclear specified!\n"); } #endif } #endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ libxsmm-1.17/src/libxsmm_mhd.c000066400000000000000000001075311415223013700163600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include "libxsmm_main.h" /* libxsmm_typesize */ #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if !defined(LIBXSMM_MHD_MAX_LINELENGTH) # define LIBXSMM_MHD_MAX_LINELENGTH 1024 #endif #if !defined(LIBXSMM_MHD_MAX_ELEMSIZE) # define LIBXSMM_MHD_MAX_ELEMSIZE 8 #endif #define LIBXSMM_MHD_MINMAX(TYPE, DATA, NELEMENTS, PMIN_INOUT, PMAX_INOUT) { \ LIBXSMM_ASSERT(NULL != (PMIN_INOUT) && NULL != (PMAX_INOUT)); \ if (0 < (NELEMENTS)) { \ size_t libxsmm_mhd_minmax_index_ = 0; \ do { \ TYPE libxsmm_mhd_minmax_value_; \ LIBXSMM_ASSERT(NULL != (DATA)); \ libxsmm_mhd_minmax_value_ = ((const TYPE*)DATA)[libxsmm_mhd_minmax_index_]; \ if (libxsmm_mhd_minmax_value_ < *((const TYPE*)PMIN_INOUT)) { \ *((TYPE*)PMIN_INOUT) = libxsmm_mhd_minmax_value_; \ } \ else if (libxsmm_mhd_minmax_value_ > *((const TYPE*)PMAX_INOUT)) { \ *((TYPE*)PMAX_INOUT) = libxsmm_mhd_minmax_value_; \ } \ ++libxsmm_mhd_minmax_index_; \ } while (libxsmm_mhd_minmax_index_ < (NELEMENTS)); \ } \ else *((TYPE*)PMIN_INOUT) = *((TYPE*)PMAX_INOUT) = 0; \ } #define LIBXSMM_MHD_TYPE_PROMOTE(DST_TYPE, SRC_TYPE) \ (LIBXSMM_MHD_ELEMTYPE_I64 > (DST_TYPE) || (LIBXSMM_MHD_ELEMTYPE_U64 > (DST_TYPE) \ ? /*dst is signed*/(LIBXSMM_MHD_ELEMTYPE_U64 > (SRC_TYPE) ? ((SRC_TYPE) > (DST_TYPE)) : 0) \ : /*dst is unsigned*/(LIBXSMM_MHD_ELEMTYPE_U64 > (SRC_TYPE) ? 
0 : ((SRC_TYPE) > (DST_TYPE))))) #define LIBXSMM_MHD_ELEMENT_CONVERSION_F(SRC_TYPE, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT) { \ const double h = (0.5 - (DST_TYPE)0.5); \ SRC_TYPE s = *((const SRC_TYPE*)PSRC); \ double s0 = 0, s1 = 0; \ if (NULL != (PSRC_MIN) && LIBXSMM_NOTNAN(s)) { \ LIBXSMM_ASSERT_MSG(NULL != (PSRC_MAX) && *((const SRC_TYPE*)PSRC_MIN) <= s && s <= *((const SRC_TYPE*)PSRC_MAX), "Invalid value range"); \ s0 = (double)*((const SRC_TYPE*)PSRC_MIN); s1 = (double)*((const SRC_TYPE*)PSRC_MAX); \ } \ if (LIBXSMM_MHD_ELEMTYPE_I64 <= (DST_ENUM) && s0 < s1) { /* scale */ \ if (LIBXSMM_MHD_ELEMTYPE_U64 <= (DST_ENUM)) { \ const double s0pos = LIBXSMM_MAX(0, s0), s1pos = LIBXSMM_MAX(0, s1), scale = (s0pos < s1pos ? ((s1 - s0) / (s1pos - s0pos)) : 1); \ s = (SRC_TYPE)(scale * (double)LIBXSMM_MAX(0, s)); \ s0 = s0pos; s1 = s1pos; \ } \ else if (0 == LIBXSMM_MHD_TYPE_PROMOTE(DST_ENUM, SRC_ENUM) && 0 > s0 && 0 < s1) { \ s1 = LIBXSMM_MAX(-s0, s1); s0 = -s1; \ } \ { const double d0 = (0 <= s0 ? 0 : (DST_MIN)), d1 = (0 <= s1 ? (DST_MAX) : 0), d = ((double)s - s0) * (d1 - d0) / (s1 - s0) + d0; \ *((DST_TYPE*)PDST) = (DST_TYPE)LIBXSMM_CLMP(0 <= d ? (d + h) : (d - h), d0, d1); \ } \ } \ else if (0 == LIBXSMM_MHD_TYPE_PROMOTE(DST_ENUM, SRC_ENUM)) { /* clamp */ \ *((DST_TYPE*)PDST) = (DST_TYPE)(0 <= s ? LIBXSMM_CLMP(s + h, DST_MIN, DST_MAX) : LIBXSMM_CLMP(s - h, DST_MIN, DST_MAX)); \ } \ else { /* promote */ \ *((DST_TYPE*)PDST) = (DST_TYPE)(0 <= s ? 
(s + h) : (s - h)); \ } \ RESULT = EXIT_SUCCESS; \ } #define LIBXSMM_MHD_ELEMENT_CONVERSION_I(SRC_TYPE, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT) { \ const double h = (0.5 - (DST_TYPE)0.5); \ SRC_TYPE s = *((const SRC_TYPE*)PSRC); \ double s0 = 0, s1 = 0; \ if (NULL != (PSRC_MIN)) { \ LIBXSMM_ASSERT_MSG(NULL != (PSRC_MAX) && *((const SRC_TYPE*)PSRC_MIN) <= s && s <= *((const SRC_TYPE*)PSRC_MAX), "Invalid value range"); \ s0 = (double)*((const SRC_TYPE*)PSRC_MIN); s1 = (double)*((const SRC_TYPE*)PSRC_MAX); \ } \ if (LIBXSMM_MHD_ELEMTYPE_I64 <= (DST_ENUM) && s0 < s1) { /* scale */ \ if (LIBXSMM_MHD_ELEMTYPE_U64 <= (DST_ENUM)) { \ const double s0pos = LIBXSMM_MAX(0, s0), s1pos = LIBXSMM_MAX(0, s1), scale = (s0pos < s1pos ? ((s1 - s0) / (s1pos - s0pos)) : 1); \ const double ss = scale * (double)LIBXSMM_MAX(0, s); \ s = (SRC_TYPE)(0 <= ss ? (ss + h) : (ss - h)); \ s0 = s0pos; s1 = s1pos; \ } \ else if (0 == LIBXSMM_MHD_TYPE_PROMOTE(DST_ENUM, SRC_ENUM) && 0 > s0 && 0 < s1) { \ s1 = LIBXSMM_MAX(-s0, s1); s0 = -s1; \ } \ { const double d0 = (0 <= s0 ? 0 : (DST_MIN)), d1 = (0 <= s1 ? (DST_MAX) : 0), d = ((double)s - s0) * (d1 - d0) / (s1 - s0) + d0; \ *((DST_TYPE*)PDST) = (DST_TYPE)LIBXSMM_CLMP(0 <= d ? 
(d + h) : (d - h), d0, d1); \ } \ } \ else if (0 == LIBXSMM_MHD_TYPE_PROMOTE(DST_ENUM, SRC_ENUM)) { /* clamp */ \ *((DST_TYPE*)PDST) = (DST_TYPE)LIBXSMM_CLMP(s, DST_MIN, DST_MAX); \ } \ else { /* promote */ \ *((DST_TYPE*)PDST) = (DST_TYPE)s; \ } \ RESULT = EXIT_SUCCESS; \ } #define LIBXSMM_MHD_ELEMENT_CONVERSION_U LIBXSMM_MHD_ELEMENT_CONVERSION_I #define LIBXSMM_MHD_ELEMENT_CONVERSION(DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT) { \ LIBXSMM_ASSERT_MSG(NULL != (PDST) && NULL != (PSRC), "Invalid input or output"); \ switch(SRC_ENUM) { \ case LIBXSMM_MHD_ELEMTYPE_F64: { \ LIBXSMM_MHD_ELEMENT_CONVERSION_F(double, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ } break; \ case LIBXSMM_MHD_ELEMTYPE_F32: { \ LIBXSMM_MHD_ELEMENT_CONVERSION_F(float, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ } break; \ case LIBXSMM_MHD_ELEMTYPE_BF16: { \ LIBXSMM_ASSERT_MSG(0, "Not implemented yet"); \ } break; \ case LIBXSMM_MHD_ELEMTYPE_I64: { \ LIBXSMM_MHD_ELEMENT_CONVERSION_I(long long, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ } break; \ case LIBXSMM_MHD_ELEMTYPE_I32: { \ LIBXSMM_MHD_ELEMENT_CONVERSION_I(int, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ } break; \ case LIBXSMM_MHD_ELEMTYPE_I16: { \ LIBXSMM_MHD_ELEMENT_CONVERSION_I(short, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ } break; \ case LIBXSMM_MHD_ELEMTYPE_I8: { \ LIBXSMM_MHD_ELEMENT_CONVERSION_I(signed char, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ } break; \ case LIBXSMM_MHD_ELEMTYPE_U64: { \ LIBXSMM_MHD_ELEMENT_CONVERSION_U(unsigned long long, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ } break; \ case LIBXSMM_MHD_ELEMTYPE_U32: { \ 
LIBXSMM_MHD_ELEMENT_CONVERSION_U(unsigned int, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ } break; \ case LIBXSMM_MHD_ELEMTYPE_U16: { \ LIBXSMM_MHD_ELEMENT_CONVERSION_U(unsigned short, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ } break; \ case LIBXSMM_MHD_ELEMTYPE_U8: { \ LIBXSMM_MHD_ELEMENT_CONVERSION_U(unsigned char, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ } break; \ default: RESULT = EXIT_FAILURE; \ } \ } LIBXSMM_API const char* libxsmm_mhd_typename(libxsmm_mhd_elemtype type, size_t* typesize, const char** ctypename) { const char *mhd_typename = NULL, *c_typename = NULL; size_t size = 0; switch (type) { case LIBXSMM_MHD_ELEMTYPE_F64: { size = 8; mhd_typename = "MET_DOUBLE"; c_typename = "double"; } break; case LIBXSMM_MHD_ELEMTYPE_F32: { size = 4; mhd_typename = "MET_FLOAT"; c_typename = "float"; } break; case LIBXSMM_MHD_ELEMTYPE_BF16: { size = 2; mhd_typename = "MET_BFLOAT"; c_typename = "unsigned short"; } break; case LIBXSMM_MHD_ELEMTYPE_I64: { size = 8; mhd_typename = "MET_LONG"; c_typename = "signed long long"; } break; case LIBXSMM_MHD_ELEMTYPE_I32: { size = 4; mhd_typename = "MET_INT"; c_typename = "signed int"; } break; case LIBXSMM_MHD_ELEMTYPE_I16: { size = 2; mhd_typename = "MET_SHORT"; c_typename = "signed short"; } break; case LIBXSMM_MHD_ELEMTYPE_I8: { size = 1; mhd_typename = "MET_CHAR"; c_typename = "signed char"; } break; case LIBXSMM_MHD_ELEMTYPE_U64: { size = 8; mhd_typename = "MET_ULONG"; c_typename = "unsigned long long"; } break; case LIBXSMM_MHD_ELEMTYPE_U32: { size = 4; mhd_typename = "MET_UINT"; c_typename = "unsigned int"; } break; case LIBXSMM_MHD_ELEMTYPE_U16: { size = 2; mhd_typename = "MET_USHORT"; c_typename = "unsigned short"; } break; case LIBXSMM_MHD_ELEMTYPE_U8: { size = 1; mhd_typename = "MET_UCHAR"; c_typename = "unsigned char"; } break; default: size = 
libxsmm_typesize((libxsmm_datatype)type); /* fallback */ } LIBXSMM_ASSERT(size <= LIBXSMM_MHD_MAX_ELEMSIZE); if (NULL != ctypename) *ctypename = c_typename; if (NULL != typesize) *typesize = size; return mhd_typename; } LIBXSMM_API libxsmm_mhd_elemtype libxsmm_mhd_typeinfo(const char elemname[]) { libxsmm_mhd_elemtype result = LIBXSMM_MHD_ELEMTYPE_UNKNOWN; if (0 == strcmp("MET_DOUBLE", elemname)) { result = LIBXSMM_MHD_ELEMTYPE_F64; } else if (0 == strcmp("MET_FLOAT", elemname)) { result = LIBXSMM_MHD_ELEMTYPE_F32; } else if (0 == strcmp("MET_BFLOAT", elemname)) { result = LIBXSMM_MHD_ELEMTYPE_BF16; } else if (0 == strcmp("MET_LONG", elemname)) { result = LIBXSMM_MHD_ELEMTYPE_I64; } else if (0 == strcmp("MET_INT", elemname)) { result = LIBXSMM_MHD_ELEMTYPE_I32; } else if (0 == strcmp("MET_SHORT", elemname)) { result = LIBXSMM_MHD_ELEMTYPE_I16; } else if (0 == strcmp("MET_CHAR", elemname)) { result = LIBXSMM_MHD_ELEMTYPE_I8; } else if (0 == strcmp("MET_ULONG", elemname)) { result = LIBXSMM_MHD_ELEMTYPE_U64; } else if (0 == strcmp("MET_UINT", elemname)) { result = LIBXSMM_MHD_ELEMTYPE_U32; } else if (0 == strcmp("MET_USHORT", elemname)) { result = LIBXSMM_MHD_ELEMTYPE_U16; } else if (0 == strcmp("MET_UCHAR", elemname)) { result = LIBXSMM_MHD_ELEMTYPE_U8; } return result; } LIBXSMM_API_INLINE int internal_mhd_readline(char buffer[], char split, size_t* key_end, size_t* value_begin) { int result = EXIT_SUCCESS; char *const isplit = strchr(buffer, split); if (NULL != isplit) { char* i = isplit; LIBXSMM_ASSERT(NULL != key_end && NULL != value_begin); while (buffer != i) { --i; if (0 == isspace((int)(*i))) break; } *key_end = i - buffer + 1; i = isplit; while ('\n' != *++i) if (0 == isspace((int)(*i))) break; *value_begin = i - buffer; while (0 != *i && 0 != isprint((int)(*i))) ++i; if (0 == isprint((int)(*i))) *i = 0; /* fix-up */ if (i <= (buffer + *value_begin)) { result = EXIT_FAILURE; } } else { result = EXIT_FAILURE; } return result; } LIBXSMM_API int 
libxsmm_mhd_read_header(const char header_filename[], size_t filename_max_length, char filename[], size_t* ndims, size_t size[], size_t* ncomponents, libxsmm_mhd_elemtype* type, size_t* header_size, size_t* extension_size) { int result = EXIT_SUCCESS; char buffer[LIBXSMM_MHD_MAX_LINELENGTH]; FILE *const file = ( 0 < filename_max_length && NULL != filename && NULL != ndims && 0 < *ndims && NULL != size && NULL != type && NULL != ncomponents) ? fopen(header_filename, "rb") : NULL; if (NULL != file) { size_t key_end, value_begin; if (NULL != extension_size) *extension_size = 0; if (NULL != header_size) *header_size = 0; memset(size, 0, *ndims * sizeof(*size)); *type = LIBXSMM_MHD_ELEMTYPE_UNKNOWN; *ncomponents = 1; if (header_filename != filename) { *filename = 0; } while (0 != fgets(buffer, sizeof(buffer), file) && EXIT_SUCCESS == result && EXIT_SUCCESS == internal_mhd_readline(buffer, '=', &key_end, &value_begin)) { if (0 == strncmp("NDims", buffer, key_end) && key_end == strlen("NDims")) { const int value = atoi(buffer + value_begin); if (0 < value && value <= ((int)*ndims)) { *ndims = value; } } else if (0 == strncmp("ElementNumberOfChannels", buffer, key_end) && key_end == strlen("ElementNumberOfChannels")) { const int value = atoi(buffer + value_begin); if (0 < value) { *ncomponents = value; } else { result = EXIT_FAILURE; } } else if (NULL != extension_size && 0 == strncmp("ExtensionDataSize", buffer, key_end) && key_end == strlen("ExtensionDataSize")) { const int value = atoi(buffer + value_begin); if (0 <= value) { *extension_size = value; } else { result = EXIT_FAILURE; } } else if (0 == strncmp("ElementType", buffer, key_end) && key_end == strlen("ElementType")) { const libxsmm_mhd_elemtype value = libxsmm_mhd_typeinfo(buffer + value_begin); if (LIBXSMM_MHD_ELEMTYPE_UNKNOWN != value) { *type = value; } } else if (0 == strncmp("ElementDataFile", buffer, key_end) && key_end == strlen("ElementDataFile")) { const char *const value = buffer + value_begin; if (0 
== strcmp("LOCAL", value) || 0 == strcmp(header_filename, value)) { if (header_size) { const long file_position = ftell(file); /* determine the header size */ const size_t len = strlen(header_filename); if (0 <= file_position && len < filename_max_length) { memcpy(filename, header_filename, len + 1); LIBXSMM_ASSERT(0 == filename[len]); *header_size = ftell(file); } else { result = EXIT_FAILURE; } break; /* ElementDataFile is just before the raw data */ } } else { const size_t len = strlen(value); if (len < filename_max_length) { memcpy(filename, value, len + 1); LIBXSMM_ASSERT(0 == filename[len]); } else { result = EXIT_FAILURE; } } } else if (0 == strncmp("DimSize", buffer, key_end) && key_end == strlen("DimSize")) { char* value = buffer + value_begin; size_t *isize = size, n = 0; while (EXIT_SUCCESS == internal_mhd_readline(value, ' ', &key_end, &value_begin) && n < *ndims) { const int ivalue = atoi(value); if (0 < ivalue) { *isize = ivalue; } else { result = EXIT_FAILURE; } value += key_end + 1; ++isize; ++n; } if (EXIT_SUCCESS == result) { if (0 != *value && n < *ndims) { const int ivalue = atoi(value); if (0 < ivalue) { *isize = ivalue; } else { result = EXIT_FAILURE; } ++n; } #if 0 else { result = EXIT_FAILURE; } #endif } } else if (0 == strncmp("BinaryData", buffer, key_end) && key_end == strlen("BinaryData")) { const char *const value = buffer + value_begin; if (0 == strcmp("False", value) || 0 != strcmp("True", value)) { result = EXIT_FAILURE; } } else if (0 == strncmp("CompressedData", buffer, key_end) && key_end == strlen("CompressedData")) { const char *const value = buffer + value_begin; if (0 == strcmp("True", value) || 0 != strcmp("False", value)) { result = EXIT_FAILURE; } } else if ((0 == strncmp("BinaryDataByteOrderMSB", buffer, key_end) && key_end == strlen("BinaryDataByteOrderMSB")) || (0 == strncmp("ElementByteOrderMSB", buffer, key_end) && key_end == strlen("ElementByteOrderMSB"))) { const char *const value = buffer + value_begin; if (0 == 
strcmp("True", value) || 0 != strcmp("False", value)) { result = EXIT_FAILURE; } } } if (EXIT_SUCCESS == result && (0 == *filename || LIBXSMM_MHD_ELEMTYPE_UNKNOWN == *type)) { result = EXIT_FAILURE; } /* check size, and eventually trim dimensionality */ if (EXIT_SUCCESS == result) { size_t i, d = 1; for (i = *ndims; 0 < i; --i) { if (0 != d && 1 == size[i-1]) { --*ndims; } else if (0 == size[i-1]) { result = EXIT_FAILURE; break; } else { d = 0; } } } /* prefix the path of the header file to make sure that the data file can be found */ if (EXIT_SUCCESS == result && (NULL == header_size || 0 == *header_size)) { const char* split = header_filename + strlen(header_filename) - 1; while (header_filename != split && NULL == strchr("/\\", *split)) --split; if (header_filename < split) { const size_t len = strlen(filename), n = split - header_filename + 1; if ((len+ n) <= filename_max_length) { size_t i; for (i = 1; i <= len; ++i) { filename[len + n - i] = filename[len - i]; } for (i = 0; i < n; ++i) { filename[i] = header_filename[i]; } } } } /* release file handle */ if (0 != fclose(file) && EXIT_SUCCESS == result) result = EXIT_FAILURE; } else { result = EXIT_FAILURE; } return result; } LIBXSMM_API int libxsmm_mhd_element_conversion( void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type, const void* src, const void* src_min, const void* src_max) { int result = EXIT_SUCCESS; switch (dst_type) { case LIBXSMM_MHD_ELEMTYPE_F64: { LIBXSMM_MHD_ELEMENT_CONVERSION(double, dst_type, -1.0, 1.0, dst, src_type, src, src_min, src_max, result); } break; case LIBXSMM_MHD_ELEMTYPE_F32: { LIBXSMM_MHD_ELEMENT_CONVERSION(float, dst_type, -1.0, 1.0, dst, src_type, src, src_min, src_max, result); } break; case LIBXSMM_MHD_ELEMTYPE_BF16: { LIBXSMM_MHD_ELEMENT_CONVERSION(libxsmm_bfloat16, dst_type, -1.0, 1.0, dst, src_type, src, src_min, src_max, result); } break; case LIBXSMM_MHD_ELEMTYPE_I64: { LIBXSMM_MHD_ELEMENT_CONVERSION(long long, dst_type, -9223372036854775808.0, 
9223372036854775807.0, dst, src_type, src, src_min, src_max, result); } break; case LIBXSMM_MHD_ELEMTYPE_I32: { LIBXSMM_MHD_ELEMENT_CONVERSION(int, dst_type, -2147483648.0, 2147483647.0, dst, src_type, src, src_min, src_max, result); } break; case LIBXSMM_MHD_ELEMTYPE_I16: { LIBXSMM_MHD_ELEMENT_CONVERSION(short, dst_type, -32768.0, 32767.0, dst, src_type, src, src_min, src_max, result); } break; case LIBXSMM_MHD_ELEMTYPE_I8: { LIBXSMM_MHD_ELEMENT_CONVERSION(signed char, dst_type, -128.0, 127.0, dst, src_type, src, src_min, src_max, result); } break; case LIBXSMM_MHD_ELEMTYPE_U64: { LIBXSMM_MHD_ELEMENT_CONVERSION(unsigned long long, dst_type, 0.0, 18446744073709551615.0, dst, src_type, src, src_min, src_max, result); } break; case LIBXSMM_MHD_ELEMTYPE_U32: { LIBXSMM_MHD_ELEMENT_CONVERSION(unsigned int, dst_type, 0.0, 4294967295.0, dst, src_type, src, src_min, src_max, result); } break; case LIBXSMM_MHD_ELEMTYPE_U16: { LIBXSMM_MHD_ELEMENT_CONVERSION(unsigned short, dst_type, 0.0, 65535.0, dst, src_type, src, src_min, src_max, result); } break; case LIBXSMM_MHD_ELEMTYPE_U8: { LIBXSMM_MHD_ELEMENT_CONVERSION(unsigned char, dst_type, 0.0, 255.0, dst, src_type, src, src_min, src_max, result); } break; default: result = EXIT_FAILURE; } return result; } LIBXSMM_API int libxsmm_mhd_element_comparison( void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type, const void* src, const void* src_min, const void* src_max) { size_t typesize; int result; if (NULL != libxsmm_mhd_typename(src_type, &typesize, NULL/*ctypename*/)) { if (dst_type == src_type) { /* direct comparison */ result = libxsmm_diff(src, dst, (unsigned char)typesize); } else { /* conversion into source type */ char element[LIBXSMM_MHD_MAX_ELEMSIZE]; result = libxsmm_mhd_element_conversion(element, dst_type, src_type, src, src_min, src_max); if (EXIT_SUCCESS == result) { result = libxsmm_diff(src, element, (unsigned char)typesize); } } } else { result = EXIT_FAILURE; } return result; } /* 
coverity[var_deref_op] */ LIBXSMM_API_INLINE int internal_mhd_minmax(const void* data, size_t nelements, libxsmm_mhd_elemtype type, const void* minval, const void* maxval) { int result; if ((NULL != data || 0 == nelements) && NULL != minval && NULL != maxval) { result = EXIT_SUCCESS; switch (type) { case LIBXSMM_MHD_ELEMTYPE_F64: { LIBXSMM_MHD_MINMAX(double, data, nelements, minval, maxval); } break; case LIBXSMM_MHD_ELEMTYPE_F32: { LIBXSMM_MHD_MINMAX(float, data, nelements, minval, maxval); } break; case LIBXSMM_MHD_ELEMTYPE_BF16: { LIBXSMM_MHD_MINMAX(libxsmm_bfloat16, data, nelements, minval, maxval); } break; case LIBXSMM_MHD_ELEMTYPE_I64: { LIBXSMM_MHD_MINMAX(long long, data, nelements, minval, maxval); } break; case LIBXSMM_MHD_ELEMTYPE_I32: { LIBXSMM_MHD_MINMAX(int, data, nelements, minval, maxval); } break; case LIBXSMM_MHD_ELEMTYPE_I16: { LIBXSMM_MHD_MINMAX(short, data, nelements, minval, maxval); } break; case LIBXSMM_MHD_ELEMTYPE_I8: { LIBXSMM_MHD_MINMAX(signed char, data, nelements, minval, maxval); } break; case LIBXSMM_MHD_ELEMTYPE_U64: { LIBXSMM_MHD_MINMAX(unsigned long long, data, nelements, minval, maxval); } break; case LIBXSMM_MHD_ELEMTYPE_U32: { LIBXSMM_MHD_MINMAX(unsigned int, data, nelements, minval, maxval); } break; case LIBXSMM_MHD_ELEMTYPE_U16: { LIBXSMM_MHD_MINMAX(unsigned short, data, nelements, minval, maxval); } break; case LIBXSMM_MHD_ELEMTYPE_U8: { LIBXSMM_MHD_MINMAX(unsigned char, data, nelements, minval, maxval); } break; default: result = EXIT_FAILURE; } } else { result = EXIT_FAILURE; } return result; } LIBXSMM_API_INLINE int internal_mhd_read(FILE* file, void* data, const size_t size[], const size_t pitch[], size_t ndims, size_t ncomponents, libxsmm_mhd_elemtype type_stored, libxsmm_mhd_elemtype type_data, size_t typesize, libxsmm_mhd_element_handler handle_element, int minmax, void* minval, void* maxval) { int result = EXIT_SUCCESS; size_t typesize_stored; LIBXSMM_ASSERT(NULL != pitch && 0 != typesize); if (NULL != 
libxsmm_mhd_typename(type_stored, &typesize_stored, NULL/*ctypename*/)) { if (1 < ndims) { if (size[0] <= pitch[0]) { const size_t d = ndims - 1; if (EXIT_SUCCESS == result) { if (size[d] <= pitch[d]) { size_t sub_size = ncomponents * typesize * pitch[0], i; for (i = 1; i < d; ++i) { if (size[i] <= pitch[i]) { sub_size *= pitch[i]; } else { result = EXIT_FAILURE; break; } } for (i = 0; i < size[d] && EXIT_SUCCESS == result; ++i) { result = internal_mhd_read(file, data, size, pitch, d, ncomponents, type_stored, type_data, typesize, handle_element, minmax, minval, maxval); data = ((char*)data) + sub_size; } } else { result = EXIT_FAILURE; } } } else { result = EXIT_FAILURE; } } else if (1 == ndims) { if (size[0] <= pitch[0]) { if (type_stored == type_data && NULL == handle_element) { if (size[0] != fread(data, ncomponents * typesize_stored, size[0], file)) { result = EXIT_FAILURE; } } else { /* data-conversion or custom data-handler */ const libxsmm_mhd_element_handler handler = (0 == minmax ? (NULL != handle_element ? handle_element : libxsmm_mhd_element_conversion) : (NULL)); char element[LIBXSMM_MHD_MAX_ELEMSIZE]; size_t i, j; for (i = 0; i < size[0]; ++i) { for (j = 0; j < ncomponents; ++j) { if (EXIT_SUCCESS == result) { if (1 == fread(element, typesize_stored, 1, file)) { if (NULL == handler) { /* determine value-range for scaled data-conversion */ LIBXSMM_ASSERT(0 != minmax); result = internal_mhd_minmax(element, 1/*n*/, type_stored, minval, maxval); } else { /* re-read data incl. 
conversion */ LIBXSMM_ASSERT(0 == minmax); result = handler(data, type_data, type_stored, element, minval, maxval); data = ((char*)data) + typesize; } } else { result = EXIT_FAILURE; } } else { i = size[0]; /* break outer */ break; } } } } } else { result = EXIT_FAILURE; } } } else { result = EXIT_FAILURE; } return result; } LIBXSMM_API int libxsmm_mhd_read(const char filename[], const size_t offset[], const size_t size[], const size_t pitch[], size_t ndims, size_t ncomponents, size_t header_size, libxsmm_mhd_elemtype type_stored, const libxsmm_mhd_elemtype* type_data, void* data, libxsmm_mhd_element_handler handle_element, char extension[], size_t extension_size) { int result = EXIT_SUCCESS; FILE *const file = (NULL != filename && 0 != *filename && NULL != size && 0 != ndims && 0 != ncomponents && LIBXSMM_MHD_ELEMTYPE_UNKNOWN != type_stored && (NULL == type_data || LIBXSMM_MHD_ELEMTYPE_UNKNOWN != *type_data) && (NULL != data)) ? fopen(filename, "rb") : NULL; if (NULL != file) { const libxsmm_mhd_elemtype datatype = (type_data ? *type_data : type_stored); const size_t *const shape = (NULL != pitch ? pitch : size); size_t offset1 = (NULL != offset ? offset[0] : 0), typesize = 0, i; /* check that size is less-equal than pitch */ if (EXIT_SUCCESS == result) { for (i = 0; i < ndims; ++i) { if (size[i] > shape[i]) { result = EXIT_FAILURE; break; } } } /* zeroing buffer if pitch is larger than size */ if (EXIT_SUCCESS == result) { if (NULL != libxsmm_mhd_typename(datatype, &typesize, NULL/*ctypename*/)) { size_t size1 = size[0], pitch1 = shape[0]; for (i = 1; i < ndims; ++i) { offset1 += (NULL != offset ? 
offset[i] : 0) * pitch1; pitch1 *= shape[i]; size1 *= size[i]; } LIBXSMM_ASSERT(size1 <= pitch1); if (size1 != pitch1 && NULL == handle_element) { memset(data, 0, pitch1 * ncomponents * typesize); } } else { result = EXIT_FAILURE; } } if (EXIT_SUCCESS == result) { char *const output = ((char*)data) + offset1 * ncomponents * typesize; char minmax[2*(LIBXSMM_MHD_MAX_ELEMSIZE)]; if (0 != header_size) result = fseek(file, (long)header_size, SEEK_SET); /* set file position to data section */ if (EXIT_SUCCESS == result && datatype != type_stored) { /* conversion needed */ if (1 == fread(minmax, typesize, 1, file)) { LIBXSMM_ASSERT(typesize <= (LIBXSMM_MHD_MAX_ELEMSIZE)); LIBXSMM_MEMCPY127(minmax + (LIBXSMM_MHD_MAX_ELEMSIZE), minmax, typesize); result = fseek(file, (long)header_size, SEEK_SET); /* reset file position */ if (EXIT_SUCCESS == result) { result = internal_mhd_read(file, NULL/*output*/, size, shape, ndims, ncomponents, type_stored, datatype, typesize, handle_element, 1/*search min-max*/, minmax, minmax + (LIBXSMM_MHD_MAX_ELEMSIZE)); } if (EXIT_SUCCESS == result) { result = fseek(file, (long)header_size, SEEK_SET); /* reset file position */ } } else { result = EXIT_FAILURE; } } if (EXIT_SUCCESS == result) { result = internal_mhd_read(file, output, size, shape, ndims, ncomponents, type_stored, datatype, typesize, handle_element, 0/*use min-max*/, minmax, minmax + (LIBXSMM_MHD_MAX_ELEMSIZE)); } } if (NULL != extension && 0 < extension_size) { if (extension_size != fread(extension, 1, extension_size, file)) { result = EXIT_FAILURE; } } /* release file handle */ if (0 != fclose(file) && EXIT_SUCCESS == result) result = EXIT_FAILURE; } else { result = EXIT_FAILURE; } return result; } LIBXSMM_API_INLINE int internal_mhd_write(FILE* file, const void* data, const size_t size[], const size_t pitch[], size_t ndims, size_t ncomponents, libxsmm_mhd_elemtype type_data, libxsmm_mhd_elemtype type, size_t typesize_data, size_t typesize, int minmax, void* minval, void* maxval) { 
int result = EXIT_SUCCESS; LIBXSMM_ASSERT(NULL != pitch); if (1 < ndims) { if (size[0] <= pitch[0]) { const size_t d = ndims - 1; if (EXIT_SUCCESS == result) { if (size[d] <= pitch[d]) { size_t sub_size = ncomponents * typesize_data * pitch[0], i; for (i = 1; i < d; ++i) { if (size[i] <= pitch[i]) { sub_size *= pitch[i]; } else { result = EXIT_FAILURE; break; } } for (i = 0; i < size[d] && EXIT_SUCCESS == result; ++i) { result = internal_mhd_write(file, data, size, pitch, d, ncomponents, type_data, type, typesize_data, typesize, minmax, minval, maxval); data = ((const char*)data) + sub_size; } } else { result = EXIT_FAILURE; } } } else { result = EXIT_FAILURE; } } else if (1 == ndims) { if (size[0] <= pitch[0]) { if (type == type_data) { if (size[0] != fwrite(data, ncomponents * typesize_data, size[0], file)) { result = EXIT_FAILURE; } } else { /* data-conversion */ char element[LIBXSMM_MHD_MAX_ELEMSIZE]; size_t i, j; if (0 != minmax) { /* determine value-range for scaled data-conversion */ result = internal_mhd_minmax(data, size[0] * ncomponents, type_data, minval, maxval); } else { for (i = 0; i < size[0]; ++i) { for (j = 0; j < ncomponents; ++j) { if (EXIT_SUCCESS == result) { result = libxsmm_mhd_element_conversion(element, type, type_data, data, minval, maxval); if (EXIT_SUCCESS == result) { if (1 == fwrite(element, typesize, 1, file)) { data = ((char*)data) + typesize_data; } else { result = EXIT_FAILURE; } } } else { i = size[0]; /* break outer */ break; } } } } } } else { result = EXIT_FAILURE; } } return result; } LIBXSMM_API int libxsmm_mhd_write(const char filename[], const size_t offset[], const size_t size[], const size_t pitch[], size_t ndims, size_t ncomponents, libxsmm_mhd_elemtype type_data, const libxsmm_mhd_elemtype* type, const void* data, size_t* header_size, const char extension_header[], const void* extension, size_t extension_size) { size_t typesize = 0; const libxsmm_mhd_elemtype elemtype = (NULL == type ? 
type_data : *type); const char *const elemname = libxsmm_mhd_typename(elemtype, &typesize, NULL/*ctypename*/); FILE *const file = (NULL != filename && 0 != *filename && NULL != size && 0 != ndims && 0 != ncomponents && NULL != data && NULL != elemname && 0 < typesize) ? fopen(filename, "wb") : NULL; int result = EXIT_SUCCESS; if (NULL != file) { size_t typesize_data = 0, i; if (0 < fprintf(file, "NDims = %u\nElementNumberOfChannels = %u\nElementByteOrderMSB = False\nDimSize =", (unsigned int)ndims, (unsigned int)ncomponents)) { for (i = 0; i != ndims; ++i) { if (0 >= fprintf(file, " %u", (unsigned int)size[i])) { result = EXIT_FAILURE; break; } } } else { result = EXIT_FAILURE; } if (EXIT_SUCCESS == result) { if (0 < fprintf(file, "\nElementSpacing =")) { for (i = 0; i != ndims; ++i) { if (0 >= fprintf(file, " 1.0")) { result = EXIT_FAILURE; break; } } } else { result = EXIT_FAILURE; } } if (EXIT_SUCCESS == result && NULL != extension_header && 0 != *extension_header) { if (0 >= fprintf(file, "\n%s", extension_header)) { result = EXIT_FAILURE; } } /* size of the data, which is silently appended after the regular data section */ if (EXIT_SUCCESS == result && 0 < extension_size) { if (0 >= fprintf(file, "\nExtensionDataSize = %u", (unsigned int)extension_size)) { result = EXIT_FAILURE; } } /* source data type is not required to have MHD element name (type-size is needed) */ if (EXIT_SUCCESS == result) { libxsmm_mhd_typename(type_data, &typesize_data, NULL/*ctypename*/); if (0 == typesize_data) result = EXIT_FAILURE; } /* ElementDataFile must be the last entry before writing the data */ if (EXIT_SUCCESS == result && 0 < fprintf(file, "\nElementType = %s\nElementDataFile = LOCAL\n", elemname)) { const size_t *const shape = (NULL != pitch ? 
pitch : size); const char *const input = ((const char*)data) + libxsmm_offset(offset, shape, ndims, NULL/*size*/) * ncomponents * typesize_data; const long file_position = ftell(file); /* determine the header size */ char minmax[2*(LIBXSMM_MHD_MAX_ELEMSIZE)]; result = (0 <= file_position ? EXIT_SUCCESS : EXIT_FAILURE); if (EXIT_SUCCESS == result && type_data != elemtype) { /* conversion needed */ LIBXSMM_MEMCPY127(minmax, data, typesize_data); LIBXSMM_MEMCPY127(minmax + (LIBXSMM_MHD_MAX_ELEMSIZE), data, typesize_data); /* initial condition */ result = internal_mhd_write(file, input, size, shape, ndims, ncomponents, type_data, elemtype, typesize_data, typesize, 1/*search min-max*/, minmax, minmax + (LIBXSMM_MHD_MAX_ELEMSIZE)); } if (EXIT_SUCCESS == result) { if (NULL != header_size) *header_size = file_position; assert(file_position == ftell(file)); /* !LIBXSMM_ASSERT */ result = internal_mhd_write(file, input, size, shape, ndims, ncomponents, type_data, elemtype, typesize_data, typesize, 0/*use min-max*/, minmax, minmax + (LIBXSMM_MHD_MAX_ELEMSIZE)); } } /* append the extension data after the regular data section */ if (EXIT_SUCCESS == result && 0 < extension_size) { if (extension_size != fwrite(extension, 1, extension_size, file)) { result = EXIT_FAILURE; } } /* release file handle */ if (0 != fclose(file) && EXIT_SUCCESS == result) result = EXIT_FAILURE; } else { result = EXIT_FAILURE; } return result; } libxsmm-1.17/src/libxsmm_perf.c000066400000000000000000000210561415223013700165410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Maciej Debski (Google Inc.) 
******************************************************************************/ #include "libxsmm_perf.h" #include #include #include #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include "perf_jitdump.h" #if defined(LIBXSMM_PERF_JITDUMP) && !defined(_WIN32) # include # include # include # include # include # include # include #endif #if defined(__linux__) # include #endif #if defined(_WIN32) # include # define LIBXSMM_MAX_PATH MAX_PATH #else # if defined(__linux__) # include # define LIBXSMM_MAX_PATH PATH_MAX # elif defined(PATH_MAX) # define LIBXSMM_MAX_PATH PATH_MAX # else /* fallback */ # define LIBXSMM_MAX_PATH 1024 # endif # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if !defined(NDEBUG) # define LIBXSMM_PERF_ERROR(msg) fprintf(stderr, msg) #else # define LIBXSMM_PERF_ERROR(msg) #endif #if !defined(PERF_JITDUMP_NOLIBXSMM) LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_MAGIC); LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_MAGIC_SWAPPED); LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_VERSION); LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint64_t JITDUMP_FLAGS_ARCH_TIMESTAMP); LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_CODE_LOAD); LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_CODE_MOVE); LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_CODE_DEBUG_INFO); LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_CODE_CLOSE); #endif LIBXSMM_APIVAR_DEFINE(FILE* internal_perf_fp); #if defined(LIBXSMM_PERF_JITDUMP) && !defined(_WIN32) LIBXSMM_APIVAR_DEFINE(void* internal_perf_marker); LIBXSMM_APIVAR_DEFINE(int internal_perf_codeidx); #endif LIBXSMM_API_INTERN void libxsmm_perf_init(void) { const uint32_t pid = (uint32_t)libxsmm_get_pid(); char file_name[LIBXSMM_MAX_PATH]; #if defined(LIBXSMM_PERF_JITDUMP) && !defined(_WIN32) char file_path[LIBXSMM_MAX_PATH]; int fd, page_size, res; struct jitdump_file_header header; char * 
path_base; char date[64]; time_t t = time(NULL); struct tm tm = *localtime(&t); /* initialize global variables */ JITDUMP_MAGIC = ('J' << 24 | 'i' << 16 | 'T' << 8 | 'D'); JITDUMP_MAGIC_SWAPPED = ('J' | 'i' << 8 | 'T' << 16 | 'D' << 24); JITDUMP_VERSION = 1; JITDUMP_FLAGS_ARCH_TIMESTAMP = 1ULL /*<< 0*/; JITDUMP_CODE_LOAD = 0; JITDUMP_CODE_MOVE = 1; JITDUMP_CODE_DEBUG_INFO = 2; JITDUMP_CODE_CLOSE = 3; path_base = getenv("JITDUMPDIR"); if (path_base == NULL) { path_base = getenv("HOME"); } if (path_base == NULL) { path_base = "."; } LIBXSMM_SNPRINTF(file_path, sizeof(file_path), "%s/.debug/", path_base); res = mkdir(file_path, S_IRWXU); if (res < 0 && errno != EEXIST) { LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to create .debug dir\n"); goto error; } LIBXSMM_SNPRINTF(file_path, sizeof(file_path), "%s/.debug/jit", path_base); res = mkdir(file_path, S_IRWXU); if (res < 0 && errno != EEXIST) { LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to create .debug/jit dir\n"); goto error; } strftime(date, sizeof(date), "%Y%m%d", &tm); LIBXSMM_SNPRINTF(file_path, sizeof(file_path), "%s/.debug/jit/libxsmm-jit-%s.XXXXXX", path_base, date); path_base = mkdtemp(file_path); if (path_base == NULL) { LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to create temporary dir\n"); goto error; } LIBXSMM_SNPRINTF(file_name, sizeof(file_name), "%s/jit-%u.dump", path_base, pid); fd = open(file_name, O_CREAT|O_TRUNC|O_RDWR, 0600); if (fd < 0) { LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to open file\n"); goto error; } page_size = sysconf(_SC_PAGESIZE); if (page_size < 0) { LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to get page size\n"); goto error; } internal_perf_marker = mmap(NULL, page_size, PROT_READ|PROT_EXEC, MAP_PRIVATE, fd, 0); if (internal_perf_marker == MAP_FAILED) { LIBXSMM_PERF_ERROR("LIBXSMM ERROR: mmap failed.\n"); goto error; } /* initialize code index */ internal_perf_codeidx = 0; internal_perf_fp = fdopen(fd, "wb+"); if (internal_perf_fp == NULL) { LIBXSMM_PERF_ERROR("LIBXSMM ERROR: fdopen 
failed.\n"); goto error; } LIBXSMM_MEMZERO127(&header); header.magic = JITDUMP_MAGIC; header.version = JITDUMP_VERSION; header.elf_mach = 62; /* EM_X86_64 */ header.total_size = sizeof(header); header.pid = pid; header.timestamp = libxsmm_timer_tick(); header.flags = JITDUMP_FLAGS_ARCH_TIMESTAMP; res = fwrite(&header, sizeof(header), 1, internal_perf_fp); if (res != 1) { LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to write header.\n"); goto error; } #else LIBXSMM_SNPRINTF(file_name, sizeof(file_name), "/tmp/perf-%u.map", pid); internal_perf_fp = fopen(file_name, "w+"); if (internal_perf_fp == NULL) { LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to open map file\n"); goto error; } #endif return; error: if (internal_perf_fp != NULL) { fclose(internal_perf_fp); internal_perf_fp = NULL; } assert(0); } LIBXSMM_API_INTERN void libxsmm_perf_finalize(void) { #if defined(LIBXSMM_PERF_JITDUMP) && !defined(_WIN32) int res, page_size; struct jitdump_record_header hdr; if (internal_perf_fp == NULL) { LIBXSMM_PERF_ERROR("LIBXSMM ERROR: jit dump file not opened\n"); goto error; } LIBXSMM_MEMZERO127(&hdr); hdr.id = JITDUMP_CODE_CLOSE; hdr.total_size = sizeof(hdr); hdr.timestamp = libxsmm_timer_tick(); res = fwrite(&hdr, sizeof(hdr), 1, internal_perf_fp); if (res != 1) { LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to write JIT_CODE_CLOSE record\n"); goto error; } page_size = sysconf(_SC_PAGESIZE); if (page_size < 0) { LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to get page_size\n"); goto error; } munmap(internal_perf_marker, page_size); fclose(internal_perf_fp); return; error: assert(0); #else fclose(internal_perf_fp); #endif } #if defined(LIBXSMM_PERF_JITDUMP) && !defined(_WIN32) /** Utility function to receive the OS-specific thread ID. 
*/ LIBXSMM_API_INLINE unsigned int internal_perf_get_tid(void) { #if defined(__linux__) return (unsigned int)syscall(__NR_gettid); #else /* fallback */ return libxsmm_get_tid(); #endif } #endif LIBXSMM_API_INTERN void libxsmm_perf_dump_code(const void* memory, size_t size, const char* name) { assert(internal_perf_fp != NULL); assert(name && *name); assert(memory != NULL && size != 0); if (internal_perf_fp != NULL) { #if defined(LIBXSMM_PERF_JITDUMP) && !defined(_WIN32) int res; struct jitdump_record_header hdr; struct jitdump_record_code_load rec; size_t name_len = strlen(name) + 1; LIBXSMM_MEMZERO127(&hdr); LIBXSMM_MEMZERO127(&rec); hdr.id = JITDUMP_CODE_LOAD; hdr.total_size = sizeof(hdr) + sizeof(rec) + name_len + size; hdr.timestamp = libxsmm_timer_tick(); rec.code_size = size; rec.vma = (uintptr_t) memory; rec.code_addr = (uintptr_t) memory; rec.pid = (uint32_t) libxsmm_get_pid(); rec.tid = (uint32_t) internal_perf_get_tid(); LIBXSMM_FLOCK(internal_perf_fp); /* This will be unique as we hold the file lock. */ rec.code_index = internal_perf_codeidx++; /* Count number of written items to check for errors. */ res = 0; res += fwrite_unlocked(&hdr, sizeof(hdr), 1, internal_perf_fp); res += fwrite_unlocked(&rec, sizeof(rec), 1, internal_perf_fp); res += fwrite_unlocked(name, name_len, 1, internal_perf_fp); res += fwrite_unlocked((const void*) memory, size, 1, internal_perf_fp); LIBXSMM_FUNLOCK(internal_perf_fp); fflush(internal_perf_fp); assert(res == 4); /* Expected 4 items written above */ #else fprintf(internal_perf_fp, "%" PRIxPTR " %lx %s\n", (uintptr_t)memory, (unsigned long)size, name); fflush(internal_perf_fp); #endif } } libxsmm-1.17/src/libxsmm_perf.h000066400000000000000000000020471415223013700165450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. 
* * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Maciej Debski (Google Inc.) ******************************************************************************/ #ifndef LIBXSMM_PERF_H #define LIBXSMM_PERF_H #include LIBXSMM_API_INTERN void libxsmm_perf_init(void); LIBXSMM_API_INTERN void libxsmm_perf_finalize(void); LIBXSMM_API_INTERN void libxsmm_perf_dump_code( const void* memory, size_t size, const char* name); #endif /* LIBXSMM_PERF_H */ libxsmm-1.17/src/libxsmm_python.c000066400000000000000000000137501415223013700171300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #if defined(__PYTHON) && defined(LIBXSMM_BUILD) && !defined(__STATIC) #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include /* must be included first */ #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #endif #include #if defined(__PYTHON) && defined(LIBXSMM_BUILD) && !defined(__STATIC) LIBXSMM_API PyObject* libxsmmpy_get_target_arch(PyObject* self, PyObject* args); LIBXSMM_API PyObject* libxsmmpy_get_target_arch(PyObject* self, PyObject* args) { LIBXSMM_UNUSED(self); LIBXSMM_UNUSED(args); return PyString_InternFromString(libxsmm_get_target_arch()); } LIBXSMM_API PyObject* libxsmmpy_set_target_arch(PyObject* self, PyObject* args); LIBXSMM_API PyObject* libxsmmpy_set_target_arch(PyObject* self, PyObject* args) { int ivalue = LIBXSMM_TARGET_ARCH_UNKNOWN; char* svalue = NULL; LIBXSMM_UNUSED(self); if (0 != PyArg_ParseTuple(args, "s", &svalue)) { libxsmm_set_target_arch(svalue); } else if (0 != PyArg_ParseTuple(args, "i", &ivalue)) { libxsmm_set_target_archid(ivalue); } else { /* error */ return NULL; } Py_RETURN_NONE; } LIBXSMM_API PyObject* libxsmmpy_get_target_archid(PyObject* self, PyObject* args); LIBXSMM_API PyObject* libxsmmpy_get_target_archid(PyObject* self, PyObject* args) { LIBXSMM_UNUSED(self); LIBXSMM_UNUSED(args); return Py_BuildValue("i", libxsmm_get_target_archid()); } LIBXSMM_API PyObject* libxsmmpy_set_target_archid(PyObject* self, PyObject* args); LIBXSMM_API PyObject* libxsmmpy_set_target_archid(PyObject* self, PyObject* args) { int value = LIBXSMM_TARGET_ARCH_UNKNOWN; LIBXSMM_UNUSED(self); if (0 != PyArg_ParseTuple(args, "i", &value)) { libxsmm_set_target_archid(value); } else { /* error */ return NULL; } Py_RETURN_NONE; } LIBXSMM_API PyObject* libxsmmpy_get_verbosity(PyObject* self, PyObject* args); LIBXSMM_API PyObject* libxsmmpy_get_verbosity(PyObject* self, PyObject* args) { 
LIBXSMM_UNUSED(self); LIBXSMM_UNUSED(args); return Py_BuildValue("i", libxsmm_get_verbosity()); } LIBXSMM_API PyObject* libxsmmpy_set_verbosity(PyObject* self, PyObject* args); LIBXSMM_API PyObject* libxsmmpy_set_verbosity(PyObject* self, PyObject* args) { int value = 0; LIBXSMM_UNUSED(self); if (0 != PyArg_ParseTuple(args, "i", &value)) { libxsmm_set_verbosity(value); } else { /* error */ return NULL; } Py_RETURN_NONE; } LIBXSMM_API PyMODINIT_FUNC initlibxsmm(void); LIBXSMM_API PyMODINIT_FUNC initlibxsmm(void) { static PyMethodDef pymethod_def[] = { { "GetTargetArch", libxsmmpy_get_target_arch, METH_NOARGS, PyDoc_STR("Get the name of the code path.") }, { "SetTargetArch", libxsmmpy_set_target_arch, METH_VARARGS, PyDoc_STR("Set the name of the code path.") }, { "GetTargetArchId", libxsmmpy_get_target_archid, METH_NOARGS, PyDoc_STR("Get the id of the code path.") }, { "SetTargetArchId", libxsmmpy_set_target_archid, METH_VARARGS, PyDoc_STR("Set the id of the code path.") }, { "GetVerbosity", libxsmmpy_get_verbosity, METH_NOARGS, PyDoc_STR("Get the verbosity level.") }, { "SetVerbosity", libxsmmpy_set_verbosity, METH_VARARGS, PyDoc_STR("Set the verbosity level.") }, { NULL, NULL, 0, NULL } /* end of table */ }; PyObject *const pymod = Py_InitModule3("libxsmm", pymethod_def, PyDoc_STR( "Library targeting Intel Architecture for small, dense or " "sparse matrix multiplications, and small convolutions.")); PyModule_AddIntConstant(pymod, "VERSION_API", LIBXSMM_VERSION2(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR)); PyModule_AddIntConstant(pymod, "VERSION_ALL", LIBXSMM_VERSION4(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE, LIBXSMM_VERSION_PATCH)); PyModule_AddIntConstant(pymod, "VERSION_MAJOR", LIBXSMM_VERSION_MAJOR); PyModule_AddIntConstant(pymod, "VERSION_MINOR", LIBXSMM_VERSION_MINOR); PyModule_AddIntConstant(pymod, "VERSION_UPDATE", LIBXSMM_VERSION_UPDATE); PyModule_AddIntConstant(pymod, "VERSION_PATCH", LIBXSMM_VERSION_PATCH); 
PyModule_AddStringConstant(pymod, "VERSION", LIBXSMM_VERSION); PyModule_AddStringConstant(pymod, "BRANCH", LIBXSMM_BRANCH); PyModule_AddIntConstant(pymod, "TARGET_ARCH_UNKNOWN", LIBXSMM_TARGET_ARCH_UNKNOWN); PyModule_AddIntConstant(pymod, "TARGET_ARCH_GENERIC", LIBXSMM_TARGET_ARCH_GENERIC); PyModule_AddIntConstant(pymod, "X86_GENERIC", LIBXSMM_X86_GENERIC); PyModule_AddIntConstant(pymod, "X86_SSE3", LIBXSMM_X86_SSE3); PyModule_AddIntConstant(pymod, "X86_SSE4", LIBXSMM_X86_SSE4); PyModule_AddIntConstant(pymod, "X86_AVX", LIBXSMM_X86_AVX); PyModule_AddIntConstant(pymod, "X86_AVX2", LIBXSMM_X86_AVX2); PyModule_AddIntConstant(pymod, "X86_AVX512", LIBXSMM_X86_AVX512); PyModule_AddIntConstant(pymod, "X86_AVX512_MIC", LIBXSMM_X86_AVX512_MIC); PyModule_AddIntConstant(pymod, "X86_AVX512_KNM", LIBXSMM_X86_AVX512_KNM); PyModule_AddIntConstant(pymod, "X86_AVX512_CORE", LIBXSMM_X86_AVX512_CORE); PyModule_AddIntConstant(pymod, "X86_AVX512_CLX", LIBXSMM_X86_AVX512_CLX); PyModule_AddIntConstant(pymod, "X86_AVX512_CPX", LIBXSMM_X86_AVX512_CPX); libxsmm_init(); /* initialize LIBXSMM */ } #endif /*defined(__PYTHON) && defined(LIBXSMM_BUILD) && !defined(__STATIC)*/ libxsmm-1.17/src/libxsmm_rng.c000066400000000000000000000250671415223013700164010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "libxsmm_rng.h" #include "libxsmm_main.h" #if !defined(LIBXSMM_RNG_DRAND48) && (!defined(_WIN32) && !defined(__CYGWIN__) && (defined(_SVID_SOURCE) || defined(_XOPEN_SOURCE))) # define LIBXSMM_RNG_DRAND48 #endif #if !defined(LIBXSMM_RNG_SIMD_MIN) # define LIBXSMM_RNG_SIMD_MIN 8 #endif /* dispatched RNG functions (separate typedef for legacy Cray C++ needed) */ typedef void (*internal_rng_f32_seq_fn)(float*, libxsmm_blasint); LIBXSMM_APIVAR_DEFINE(internal_rng_f32_seq_fn internal_rng_f32_seq); /* 2048-bit state for RNG */ LIBXSMM_APIVAR_DEFINE(unsigned int internal_rng_state0[16]); LIBXSMM_APIVAR_DEFINE(unsigned int internal_rng_state1[16]); LIBXSMM_APIVAR_DEFINE(unsigned int internal_rng_state2[16]); LIBXSMM_APIVAR_DEFINE(unsigned int internal_rng_state3[16]); LIBXSMM_API_INLINE void internal_rng_float_jump(uint32_t* state0, uint32_t* state1, uint32_t* state2, uint32_t* state3) { static const uint32_t jump_table[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b }; uint32_t s0 = 0, s1 = 0, s2 = 0, s3 = 0; int i, b; LIBXSMM_ASSERT(4 == sizeof(jump_table) / sizeof(*jump_table)); for (i = 0; i < 4; ++i) { for (b = 0; b < 32; ++b) { if (jump_table[i] & (1U << b)) { s0 ^= *state0; s1 ^= *state1; s2 ^= *state2; s3 ^= *state3; } { /* draw one more integer */ const uint32_t t = *state1 << 9; *state2 ^= *state0; *state3 ^= *state1; *state1 ^= *state2; *state0 ^= *state3; *state2 ^= t; *state3 = ((*state3 << 11) | (*state3 >> (32 - 11))); } } } *state0 = s0; *state1 = s1; *state2 = s2; *state3 = s3; } LIBXSMM_API_INLINE float internal_rng_scalar_float_next(int i) { const uint32_t rng_mantissa = (internal_rng_state0[i] + internal_rng_state3[i]) >> 9; const uint32_t t = internal_rng_state1[i] << 9; union { uint32_t i; float f; } rng; internal_rng_state2[i] ^= internal_rng_state0[i]; internal_rng_state3[i] ^= internal_rng_state1[i]; internal_rng_state1[i] ^= internal_rng_state2[i]; 
internal_rng_state0[i] ^= internal_rng_state3[i]; internal_rng_state2[i] ^= t; internal_rng_state3[i] = ((internal_rng_state3[i] << 11) | (internal_rng_state3[i] >> (32 - 11))); rng.i = 0x3f800000 | rng_mantissa; return rng.f - 1.0f; } LIBXSMM_API_INTERN void internal_rng_set_seed_sw(uint32_t seed); LIBXSMM_API_INTERN void internal_rng_set_seed_sw(uint32_t seed) { static const uint32_t temp_state[] = { 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 131, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 231, 230, 229, 228, 227, 226, 225, 224, 223, 222, 221, 220, 219, 218, 217, 216, 331, 330, 329, 328, 327, 326, 325, 324, 323, 322, 321, 320, 319, 318, 317, 316 }; libxsmm_blasint i; /* finish initializing the state */ LIBXSMM_ASSERT((16 * 4) == sizeof(temp_state) / sizeof(*temp_state)); for (i = 0; i < 16; ++i) { internal_rng_state0[i] = seed + temp_state[i]; internal_rng_state1[i] = seed + temp_state[i+16]; internal_rng_state2[i] = seed + temp_state[i+32]; internal_rng_state3[i] = seed + temp_state[i+48]; } for (i = 0; i < 16; ++i) { internal_rng_float_jump( /* progress each sequence by 2^64 */ internal_rng_state0 + i, internal_rng_state1 + i, internal_rng_state2 + i, internal_rng_state3 + i); } /* for consistency, other RNGs are seeded as well */ #if !defined(_WIN32) && !defined(__CYGWIN__) && (defined(_SVID_SOURCE) || defined(_XOPEN_SOURCE)) srand48(seed); #endif srand(seed); } LIBXSMM_API_INLINE void internal_rng_f32_seq_sw(float* rngs, libxsmm_blasint count) { libxsmm_blasint i = 0; for (; i < count; ++i) { rngs[i] = internal_rng_scalar_float_next(LIBXSMM_MOD2(i, 16)); } } #if defined(LIBXSMM_INTRINSICS_AVX512) /* __AVX512F__ */ LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) void internal_rng_set_seed_avx512(uint32_t seed) { internal_rng_set_seed_sw(seed); /* bring scalar state to AVX-512 */ LIBXSMM_INTRINSICS_MM512_RNG_STATE(0) = _mm512_loadu_si512(internal_rng_state0); LIBXSMM_INTRINSICS_MM512_RNG_STATE(1) = 
_mm512_loadu_si512(internal_rng_state1); LIBXSMM_INTRINSICS_MM512_RNG_STATE(2) = _mm512_loadu_si512(internal_rng_state2); LIBXSMM_INTRINSICS_MM512_RNG_STATE(3) = _mm512_loadu_si512(internal_rng_state3); } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) void internal_rng_f32_seq_avx512(float* rngs, libxsmm_blasint count) { if ((LIBXSMM_RNG_SIMD_MIN << 4) <= count) { /* SIMD code path */ const libxsmm_blasint n = (count >> 4) << 4; /* multiple of vector-length */ libxsmm_blasint i = 0; for (; i < n; i += 16) { _mm512_storeu_ps(rngs + i, LIBXSMM_INTRINSICS_MM512_RNG_PS()); } if (i < count) { /* remainder */ #if 0 /* assert(0 < n) */ if (0 < n) #endif { /* bring AVX-512 state to scalar */ _mm512_storeu_si512(internal_rng_state0, LIBXSMM_INTRINSICS_MM512_RNG_STATE(0)); _mm512_storeu_si512(internal_rng_state1, LIBXSMM_INTRINSICS_MM512_RNG_STATE(1)); _mm512_storeu_si512(internal_rng_state2, LIBXSMM_INTRINSICS_MM512_RNG_STATE(2)); _mm512_storeu_si512(internal_rng_state3, LIBXSMM_INTRINSICS_MM512_RNG_STATE(3)); } LIBXSMM_ASSERT(count < i + 16); do { /* scalar remainder */ rngs[i] = internal_rng_scalar_float_next(LIBXSMM_MOD2(i, 16)); ++i; } while (i < count); /* bring scalar state to AVX-512 */ LIBXSMM_INTRINSICS_MM512_RNG_STATE(0) = _mm512_loadu_si512(internal_rng_state0); LIBXSMM_INTRINSICS_MM512_RNG_STATE(1) = _mm512_loadu_si512(internal_rng_state1); LIBXSMM_INTRINSICS_MM512_RNG_STATE(2) = _mm512_loadu_si512(internal_rng_state2); LIBXSMM_INTRINSICS_MM512_RNG_STATE(3) = _mm512_loadu_si512(internal_rng_state3); } } else { /* scalar code path */ internal_rng_f32_seq_sw(rngs, count); } } #endif /*defined(LIBXSMM_INTRINSICS_AVX512)*/ LIBXSMM_API unsigned int* libxsmm_rng_create_avx512_extstate(unsigned int/*uint32_t*/ seed) { unsigned int* state = (unsigned int*) libxsmm_aligned_malloc( 64*sizeof(unsigned int), 64 ); static const uint32_t temp_state[] = { 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 131, 130, 129, 128, 127, 126, 125, 124, 123, 
122, 121, 120, 119, 118, 117, 116, 231, 230, 229, 228, 227, 226, 225, 224, 223, 222, 221, 220, 219, 218, 217, 216, 331, 330, 329, 328, 327, 326, 325, 324, 323, 322, 321, 320, 319, 318, 317, 316 }; libxsmm_blasint i; /* finish initializing the state */ LIBXSMM_ASSERT((16 * 4) == sizeof(temp_state) / sizeof(*temp_state)); for (i = 0; i < 16; ++i) { state[i ] = seed + temp_state[i]; state[i+16] = seed + temp_state[i+16]; state[i+32] = seed + temp_state[i+32]; state[i+48] = seed + temp_state[i+48]; } for (i = 0; i < 16; ++i) { internal_rng_float_jump( /* progress each sequence by 2^64 */ state + i, state + 16 + i, state + 32 + i, state + 48 + i); } return state; } LIBXSMM_API void libxsmm_rng_destroy_avx512_extstate(unsigned int* stateptr) { if ( stateptr != NULL ) { libxsmm_free( stateptr ); } } LIBXSMM_API void libxsmm_rng_set_seed(unsigned int/*uint32_t*/ seed) { LIBXSMM_INIT #if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) # if !defined(NDEBUG) /* used to track if seed is initialized */ internal_rng_f32_seq = internal_rng_f32_seq_avx512; # endif internal_rng_set_seed_avx512(seed); #elif defined(LIBXSMM_INTRINSICS_AVX512) /* __AVX512F__ */ if (LIBXSMM_X86_AVX512 <= libxsmm_target_archid) { internal_rng_f32_seq = internal_rng_f32_seq_avx512; internal_rng_set_seed_avx512(seed); } else { internal_rng_f32_seq = internal_rng_f32_seq_sw; internal_rng_set_seed_sw(seed); } #else # if !defined(NDEBUG) /* used to track if seed is initialized */ internal_rng_f32_seq = internal_rng_f32_seq_sw; # endif internal_rng_set_seed_sw(seed); #endif } LIBXSMM_API void libxsmm_rng_f32_seq(float* rngs, libxsmm_blasint count) { LIBXSMM_ASSERT_MSG(NULL != internal_rng_f32_seq, "RNG must be initialized"); #if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) internal_rng_f32_seq_avx512(rngs, count); #else # if defined(LIBXSMM_INTRINSICS_AVX512) /* __AVX512F__ */ if ((LIBXSMM_RNG_SIMD_MIN << 4) <= count) { /* SIMD code path */ internal_rng_f32_seq(rngs, count); /* pointer based function 
call */ } else /* scalar code path */ # endif internal_rng_f32_seq_sw(rngs, count); #endif } LIBXSMM_API unsigned int libxsmm_rng_u32(unsigned int n) { #if defined(LIBXSMM_RNG_DRAND48) const unsigned int q = ((1U << 31) / n) * n; unsigned int r = (unsigned int)lrand48(); if (q != (1U << 31)) #else const unsigned int rand_max1 = (unsigned int)(RAND_MAX)+1U; const unsigned int q = (rand_max1 / n) * n; unsigned int r = (unsigned int)rand(); if (q != rand_max1) #endif { #if defined(LIBXSMM_RNG_DRAND48) /* coverity[dont_call] */ while (q <= r) r = (unsigned int)lrand48(); #else while (q <= r) r = (unsigned int)rand(); #endif } return r % n; } LIBXSMM_API void libxsmm_rng_seq(void* data, libxsmm_blasint nbytes) { unsigned char* dst = (unsigned char*)data; unsigned char* end = dst + (nbytes & 0xFFFFFFFFFFFFFFFC); unsigned int r; for (; dst < end; dst += 4) { #if defined(LIBXSMM_RNG_DRAND48) /* coverity[dont_call] */ r = (unsigned int)lrand48(); #else r = (unsigned int)rand(); #endif LIBXSMM_MEMCPY127(dst, &r, 4); } end = (unsigned char*)data + nbytes; if (dst < end) { #if defined(LIBXSMM_RNG_DRAND48) r = (unsigned int)lrand48(); #else r = (unsigned int)rand(); #endif LIBXSMM_MEMCPY127(dst, &r, end - dst); } } LIBXSMM_API double libxsmm_rng_f64(void) { #if defined(LIBXSMM_RNG_DRAND48) /* coverity[dont_call] */ return drand48(); #else static const double scale = 1.0 / (RAND_MAX); return scale * (double)rand(); #endif } libxsmm-1.17/src/libxsmm_spmdm.c000066400000000000000000000563111415223013700167270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Nadathur Satish, Hans Pabst (Intel Corp.) ******************************************************************************/ #include #include "libxsmm_main.h" /* Enable/disable specific code paths */ #if defined(LIBXSMM_INTRINSICS_AVX) && !defined(LIBXSMM_SPMDM_AVX) # define LIBXSMM_SPMDM_AVX #endif #if defined(LIBXSMM_INTRINSICS_AVX2) && !defined(LIBXSMM_SPMDM_AVX2) && \ !(defined(__PGI) && defined(__cplusplus)) # define LIBXSMM_SPMDM_AVX2 #endif #if defined(LIBXSMM_INTRINSICS_AVX512_CORE) && !defined(LIBXSMM_SPMDM_AVX512_CORE) && \ !(defined(__PGI) && defined(__cplusplus)) # define LIBXSMM_SPMDM_AVX512_CORE #endif /* function pointer for the CPUID-dispatched implementation (separate typedef for legacy Cray C++ needed) */ typedef void (*internal_spmdm_createSparseSlice_fp32_thread_fn)(const libxsmm_spmdm_handle*, char, const float*, libxsmm_CSR_sparseslice*, int, int, int); LIBXSMM_APIVAR_DEFINE(internal_spmdm_createSparseSlice_fp32_thread_fn internal_spmdm_createSparseSlice_fp32_thread); typedef void (*internal_spmdm_createSparseSlice_bfloat16_thread_fn)(const libxsmm_spmdm_handle*, char, const libxsmm_bfloat16*, libxsmm_CSR_sparseslice*, int, int, int); LIBXSMM_APIVAR_DEFINE(internal_spmdm_createSparseSlice_bfloat16_thread_fn internal_spmdm_createSparseSlice_bfloat16_thread); typedef void (*internal_spmdm_compute_fp32_thread_fn)(const libxsmm_spmdm_handle*, char, char, const float*, libxsmm_CSR_sparseslice*, const float*, char, const float*, float*, int, int, int); LIBXSMM_APIVAR_DEFINE(internal_spmdm_compute_fp32_thread_fn internal_spmdm_compute_fp32_thread); typedef void (*internal_spmdm_compute_bfloat16_thread_fn)(const libxsmm_spmdm_handle*, char, char, const libxsmm_bfloat16*, libxsmm_CSR_sparseslice*, const libxsmm_bfloat16*, char, const libxsmm_bfloat16*, float*, int, int, 
int); LIBXSMM_APIVAR_DEFINE(internal_spmdm_compute_bfloat16_thread_fn internal_spmdm_compute_bfloat16_thread); #if defined(LIBXSMM_SPMDM_AVX) LIBXSMM_APIVAR_DEFINE(__m256i* internal_spmdm_shufmasks_32); LIBXSMM_APIVAR_DEFINE(__m256i* internal_spmdm_shufmasks_16); #endif LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX) LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_init_shufmask_avx(void) { #if defined(LIBXSMM_SPMDM_AVX) static __m256i spmdm_shufmasks_32[256], spmdm_shufmasks_16[256]; LIBXSMM_ALIGNED(int temp_shufmasks[8], 64); LIBXSMM_ALIGNED(uint16_t temp_shufmasks2[16], 64); unsigned int i, j, c, last_bit; int cnt; for (i = 0; i < 256; i++) { cnt = 0; j = i; for (c = 0; c < 8; c++) temp_shufmasks[c] = 0; for (c = 0; c < 16; c++) temp_shufmasks2[c] = 0; while (j) { last_bit = LIBXSMM_INTRINSICS_BITSCANFWD32(j); temp_shufmasks[cnt] = last_bit; temp_shufmasks2[cnt] = (uint16_t)last_bit; j &= (~(1<mb; int k_blocks = handle->kb; const size_t sz_block = (((size_t)handle->bm + 1) * sizeof(uint16_t) + (size_t)handle->bm * handle->bk * sizeof(uint16_t) + (size_t)handle->bm * handle->bk * sizeof(float) + sizeof(libxsmm_CSR_sparseslice)); size_t sz_all_blocks = sz_block * handle->mb * handle->kb; char* memory_block = 0; void *const pv = &memory_block; /* use low-level scratch memory allocation since life-time of this buffer is unknown */ if (EXIT_SUCCESS == libxsmm_xmalloc((void**)pv, sz_all_blocks, 2097152, LIBXSMM_MALLOC_FLAG_SCRATCH | LIBXSMM_MALLOC_FLAG_PRIVATE, 0/*extra*/, 0/*extra_size*/)) { char* memory_head = memory_block; libxsmm_CSR_sparseslice* libxsmm_output_csr_a = (libxsmm_CSR_sparseslice*)(memory_head); memory_head += (size_t)handle->mb * handle->kb * sizeof(libxsmm_CSR_sparseslice); LIBXSMM_ASSERT(0 != libxsmm_output_csr_a/*sanity check*/); for (kb = 0; kb < k_blocks; kb++) { for (mb = 0; mb < m_blocks; mb++) { int i = kb*m_blocks + mb; libxsmm_output_csr_a[i].rowidx = (uint16_t*)(memory_head); memory_head += ((size_t)handle->bm + 1) * sizeof(uint16_t); 
libxsmm_output_csr_a[i].colidx = (uint16_t*)(memory_head); memory_head += (size_t)handle->bm * handle->bk * sizeof(uint16_t); libxsmm_output_csr_a[i].values = (float*)(memory_head); memory_head += (size_t)handle->bm * handle->bk * sizeof(float); } } LIBXSMM_ASSERT(memory_head == (memory_block + sz_all_blocks)); *libxsmm_output_csr = libxsmm_output_csr_a; } else if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ fprintf(stderr, "LIBXSMM ERROR: SPMDM CSR scratch memory allocation failed!\n"); } handle->base_ptr_scratch_A = memory_block; } LIBXSMM_API_INLINE void internal_spmdm_allocate_scratch(libxsmm_spmdm_handle* handle, int max_threads) { void *const pv = &handle->base_ptr_scratch_B_scratch_C; size_t sz_total_memory, sz_memory_for_scratch_per_thread = (size_t)handle->bm * handle->bn * sizeof(float) + (size_t)handle->bk * handle->bn * sizeof(float); sz_memory_for_scratch_per_thread = LIBXSMM_UP2(sz_memory_for_scratch_per_thread, 4096); sz_total_memory = sz_memory_for_scratch_per_thread * max_threads; handle->base_ptr_scratch_B_scratch_C = 0; /* use low-level scratch memory allocation since life-time of this buffer is unknown */ if (EXIT_SUCCESS == libxsmm_xmalloc((void**)pv, sz_total_memory, 2097152, LIBXSMM_MALLOC_FLAG_SCRATCH | LIBXSMM_MALLOC_FLAG_PRIVATE, 0/*extra*/, 0/*extra_size*/)) { handle->memory_for_scratch_per_thread = (int)sz_memory_for_scratch_per_thread; } else { if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ fprintf(stderr, "LIBXSMM ERROR: SPMDM scratch memory allocation failed!\n"); } handle->memory_for_scratch_per_thread = 0; } } LIBXSMM_API_INLINE void internal_spmdm_deallocate_csr_a(libxsmm_spmdm_handle* handle) { libxsmm_xfree(handle->base_ptr_scratch_A, 0/*no check*/); handle->base_ptr_scratch_A = NULL; libxsmm_xfree(handle->base_ptr_scratch_B_scratch_C, 0/*no check*/); handle->base_ptr_scratch_B_scratch_C = NULL; } LIBXSMM_API void libxsmm_spmdm_destroy(libxsmm_spmdm_handle* handle) { 
internal_spmdm_deallocate_csr_a(handle); } LIBXSMM_API int libxsmm_spmdm_get_num_createSparseSlice_blocks(const libxsmm_spmdm_handle* handle) { return handle->mb * handle->kb; } LIBXSMM_API int libxsmm_spmdm_get_num_compute_blocks(const libxsmm_spmdm_handle* handle) { return handle->mb * handle->nb; } LIBXSMM_API_INLINE void internal_spmdm_createSparseSlice_fp32_thread_sw( const libxsmm_spmdm_handle* handle, char transa, const float* a, libxsmm_CSR_sparseslice* libxsmm_output_csr_a, int block_id, int tid, int nthreads) { # include "libxsmm_spmdm_begin.h" # include "template/libxsmm_spmdm_createSparseSlice_fp32_thread.tpl.c" # include "libxsmm_spmdm_end.h" } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_createSparseSlice_fp32_thread_avx2( const libxsmm_spmdm_handle* handle, char transa, const float* a, libxsmm_CSR_sparseslice* libxsmm_output_csr_a, int block_id, int tid, int nthreads) { #if defined(LIBXSMM_SPMDM_AVX2) # include "libxsmm_spmdm_begin_avx2.h" # include "template/libxsmm_spmdm_createSparseSlice_fp32_thread.tpl.c" # include "libxsmm_spmdm_end.h" #else internal_spmdm_createSparseSlice_fp32_thread_sw(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); #endif } #if defined(LIBXSMM_SPMDM_AVX512_CORE) LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_createSparseSlice_fp32_thread_avx512_core( const libxsmm_spmdm_handle* handle, char transa, const float* a, libxsmm_CSR_sparseslice* libxsmm_output_csr_a, int block_id, int tid, int nthreads) { #if defined(LIBXSMM_SPMDM_AVX512_CORE) # include "libxsmm_spmdm_begin_avx512.h" # include "template/libxsmm_spmdm_createSparseSlice_fp32_thread.tpl.c" # include "libxsmm_spmdm_end.h" #else internal_spmdm_createSparseSlice_fp32_thread_avx2(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); #endif } #endif LIBXSMM_API void libxsmm_spmdm_createSparseSlice_fp32_thread( const 
libxsmm_spmdm_handle* handle, char transa, const float* a, libxsmm_CSR_sparseslice* libxsmm_output_csr_a, int block_id, int tid, int nthreads) { /* if highest implemented code path is statically present, no need for an indirect call (function pointer) */ #if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH) && defined(LIBXSMM_SPMDM_AVX512_CORE) internal_spmdm_createSparseSlice_fp32_thread_avx512_core(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); #elif (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) && /* no need for an indirect call */ \ (LIBXSMM_X86_AVX512_CORE > LIBXSMM_MAX_STATIC_TARGET_ARCH) internal_spmdm_createSparseSlice_fp32_thread_avx2(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); #else /* pointer based function call */ LIBXSMM_ASSERT(0 != internal_spmdm_createSparseSlice_fp32_thread); internal_spmdm_createSparseSlice_fp32_thread(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); #endif } LIBXSMM_API_INLINE void internal_spmdm_createSparseSlice_bfloat16_thread_sw( const libxsmm_spmdm_handle* handle, char transa, const libxsmm_bfloat16* a, libxsmm_CSR_sparseslice* libxsmm_output_csr_a, int block_id, int tid, int nthreads) { # include "libxsmm_spmdm_begin.h" # include "template/libxsmm_spmdm_createSparseSlice_bfloat16_thread.tpl.c" # include "libxsmm_spmdm_end.h" } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_createSparseSlice_bfloat16_thread_avx2( const libxsmm_spmdm_handle* handle, char transa, const libxsmm_bfloat16* a, libxsmm_CSR_sparseslice* libxsmm_output_csr_a, int block_id, int tid, int nthreads) { #if defined(LIBXSMM_SPMDM_AVX2) # include "libxsmm_spmdm_begin_avx2.h" # include "template/libxsmm_spmdm_createSparseSlice_bfloat16_thread.tpl.c" # include "libxsmm_spmdm_end.h" #else internal_spmdm_createSparseSlice_bfloat16_thread_sw(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); #endif } #if 
defined(LIBXSMM_SPMDM_AVX512_CORE) LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_createSparseSlice_bfloat16_thread_avx512_core( const libxsmm_spmdm_handle* handle, char transa, const libxsmm_bfloat16* a, libxsmm_CSR_sparseslice* libxsmm_output_csr_a, int block_id, int tid, int nthreads) { #if defined(LIBXSMM_SPMDM_AVX512_CORE) # include "libxsmm_spmdm_begin_avx512.h" # include "template/libxsmm_spmdm_createSparseSlice_bfloat16_thread.tpl.c" # include "libxsmm_spmdm_end.h" #else internal_spmdm_createSparseSlice_bfloat16_thread_avx2(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); #endif } #endif LIBXSMM_API void libxsmm_spmdm_createSparseSlice_bfloat16_thread( const libxsmm_spmdm_handle* handle, char transa, const libxsmm_bfloat16* a, libxsmm_CSR_sparseslice* libxsmm_output_csr_a, int block_id, int tid, int nthreads) { /* if highest implemented code path is statically present, no need for an indirect call (function pointer) */ #if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH) && defined(LIBXSMM_SPMDM_AVX512_CORE) internal_spmdm_createSparseSlice_bfloat16_thread_avx512_core(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); #elif (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) && /* no need for an indirect call */ \ (LIBXSMM_X86_AVX512_CORE > LIBXSMM_MAX_STATIC_TARGET_ARCH) internal_spmdm_createSparseSlice_bfloat16_thread_avx2(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); #else /* pointer based function call */ LIBXSMM_ASSERT(0 != internal_spmdm_createSparseSlice_fp32_thread); internal_spmdm_createSparseSlice_bfloat16_thread(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); #endif } LIBXSMM_API_INLINE void internal_spmdm_compute_fp32_thread_sw( const libxsmm_spmdm_handle* handle, char transa, char transb, const float* alpha, libxsmm_CSR_sparseslice* a_sparse, const float* b, char transc, const float* beta, float* c, int 
block_id, int tid, int nthreads) { # include "libxsmm_spmdm_begin.h" # include "template/libxsmm_spmdm_compute_fp32_thread.tpl.c" # include "libxsmm_spmdm_end.h" } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_compute_fp32_thread_avx2( const libxsmm_spmdm_handle* handle, char transa, char transb, const float* alpha, libxsmm_CSR_sparseslice* a_sparse, const float* b, char transc, const float* beta, float* c, int block_id, int tid, int nthreads) { #if defined(LIBXSMM_SPMDM_AVX2) # include "libxsmm_spmdm_begin_avx2.h" # include "template/libxsmm_spmdm_compute_fp32_thread.tpl.c" # include "libxsmm_spmdm_end.h" #else internal_spmdm_compute_fp32_thread_sw(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); #endif } #if defined(LIBXSMM_SPMDM_AVX512_CORE) LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_compute_fp32_thread_avx512_core( const libxsmm_spmdm_handle* handle, char transa, char transb, const float* alpha, libxsmm_CSR_sparseslice* a_sparse, const float* b, char transc, const float* beta, float* c, int block_id, int tid, int nthreads) { #if defined(LIBXSMM_SPMDM_AVX512_CORE) # include "libxsmm_spmdm_begin_avx512.h" # include "template/libxsmm_spmdm_compute_fp32_thread.tpl.c" # include "libxsmm_spmdm_end.h" #else internal_spmdm_compute_fp32_thread_avx2(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); #endif } #endif LIBXSMM_API void libxsmm_spmdm_compute_fp32_thread( const libxsmm_spmdm_handle* handle, char transa, char transb, const float* alpha, libxsmm_CSR_sparseslice* a_sparse, const float* b, char transc, const float* beta, float* c, int block_id, int tid, int nthreads) { /* if highest implemented code path is statically present, no need for an indirect call (function pointer) */ #if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH) && defined(LIBXSMM_SPMDM_AVX512_CORE) 
internal_spmdm_compute_fp32_thread_avx512_core(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); #elif (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) && /* no need for an indirect call */ \ (LIBXSMM_X86_AVX512_CORE > LIBXSMM_MAX_STATIC_TARGET_ARCH) internal_spmdm_compute_fp32_thread_avx2(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); #else /* pointer based function call */ LIBXSMM_ASSERT(0 != internal_spmdm_compute_fp32_thread); internal_spmdm_compute_fp32_thread(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); #endif } LIBXSMM_API_INLINE void internal_spmdm_compute_bfloat16_thread_sw( const libxsmm_spmdm_handle* handle, char transa, char transb, const libxsmm_bfloat16* alpha, libxsmm_CSR_sparseslice* a_sparse, const libxsmm_bfloat16* b, char transc, const libxsmm_bfloat16* beta, float* c, int block_id, int tid, int nthreads) { # include "libxsmm_spmdm_begin.h" # include "template/libxsmm_spmdm_compute_bfloat16_thread.tpl.c" # include "libxsmm_spmdm_end.h" } LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_compute_bfloat16_thread_avx2( const libxsmm_spmdm_handle* handle, char transa, char transb, const libxsmm_bfloat16* alpha, libxsmm_CSR_sparseslice* a_sparse, const libxsmm_bfloat16* b, char transc, const libxsmm_bfloat16* beta, float* c, int block_id, int tid, int nthreads) { #if defined(LIBXSMM_SPMDM_AVX2) # include "libxsmm_spmdm_begin_avx2.h" # include "template/libxsmm_spmdm_compute_bfloat16_thread.tpl.c" # include "libxsmm_spmdm_end.h" #else internal_spmdm_compute_bfloat16_thread_sw(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); #endif } #if defined(LIBXSMM_SPMDM_AVX512_CORE) LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_compute_bfloat16_thread_avx512_core( const libxsmm_spmdm_handle* handle, 
char transa, char transb, const libxsmm_bfloat16* alpha, libxsmm_CSR_sparseslice* a_sparse, const libxsmm_bfloat16* b, char transc, const libxsmm_bfloat16* beta, float* c, int block_id, int tid, int nthreads) { #if defined(LIBXSMM_SPMDM_AVX512_CORE) # include "libxsmm_spmdm_begin_avx512.h" # include "template/libxsmm_spmdm_compute_bfloat16_thread.tpl.c" # include "libxsmm_spmdm_end.h" #else internal_spmdm_compute_bfloat16_thread_avx2(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); #endif } #endif LIBXSMM_API void libxsmm_spmdm_compute_bfloat16_thread( const libxsmm_spmdm_handle* handle, char transa, char transb, const libxsmm_bfloat16* alpha, libxsmm_CSR_sparseslice* a_sparse, const libxsmm_bfloat16* b, char transc, const libxsmm_bfloat16* beta, float* c, int block_id, int tid, int nthreads) { /* if highest implemented code path is statically present, no need for an indirect call (function pointer) */ #if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH) && defined(LIBXSMM_SPMDM_AVX512_CORE) internal_spmdm_compute_bfloat16_thread_avx512_core(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); #elif (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) && /* no need for an indirect call */ \ (LIBXSMM_X86_AVX512_CORE > LIBXSMM_MAX_STATIC_TARGET_ARCH) internal_spmdm_compute_bfloat16_thread_avx2(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); #else /* pointer based function call */ LIBXSMM_ASSERT(0 != internal_spmdm_compute_bfloat16_thread); internal_spmdm_compute_bfloat16_thread(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); #endif } LIBXSMM_API void libxsmm_spmdm_init(int M, int N, int K, int max_threads, libxsmm_spmdm_handle* handle, libxsmm_CSR_sparseslice** libxsmm_output_csr) { double load_imbalance_tolerate = 1.1; int max_work_per_block; double avg_work_per_block; int max_blocks_per_thread; double 
avg_blocks_per_thread; double load_imbalance_1, load_imbalance_2, load_imbalance; libxsmm_init(); /* !LIBXSMM_INIT */ { unsigned int dummy = LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_statistic_num_spmdm, 1, LIBXSMM_ATOMIC_RELAXED); /* count number of invocations */ LIBXSMM_UNUSED(dummy); } handle->m = M; handle->n = N; handle->k = K; handle->bm = (M >= 4096 || M <= 1024) ? 512 : 256; #if defined(LIBXSMM_SPMDM_AVX512_CORE) if (LIBXSMM_X86_AVX512_CORE <= libxsmm_target_archid || LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH) { internal_spmdm_createSparseSlice_fp32_thread = internal_spmdm_createSparseSlice_fp32_thread_avx512_core; internal_spmdm_createSparseSlice_bfloat16_thread = internal_spmdm_createSparseSlice_bfloat16_thread_avx512_core; internal_spmdm_compute_fp32_thread = internal_spmdm_compute_fp32_thread_avx512_core; internal_spmdm_compute_bfloat16_thread = internal_spmdm_compute_bfloat16_thread_avx512_core; handle->bn = 96; } else #endif #if defined(LIBXSMM_SPMDM_AVX2) if (LIBXSMM_X86_AVX2 <= libxsmm_target_archid || LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) { internal_spmdm_createSparseSlice_fp32_thread = internal_spmdm_createSparseSlice_fp32_thread_avx2; internal_spmdm_createSparseSlice_bfloat16_thread = internal_spmdm_createSparseSlice_bfloat16_thread_avx2; internal_spmdm_compute_fp32_thread = internal_spmdm_compute_fp32_thread_avx2; internal_spmdm_compute_bfloat16_thread = internal_spmdm_compute_bfloat16_thread_avx2; handle->bn = 48; } else #endif { internal_spmdm_createSparseSlice_fp32_thread = internal_spmdm_createSparseSlice_fp32_thread_sw; internal_spmdm_createSparseSlice_bfloat16_thread = internal_spmdm_createSparseSlice_bfloat16_thread_sw; internal_spmdm_compute_fp32_thread = internal_spmdm_compute_fp32_thread_sw; internal_spmdm_compute_bfloat16_thread = internal_spmdm_compute_bfloat16_thread_sw; handle->bn = 6; } handle->bk = 128; handle->mb = LIBXSMM_UPDIV(handle->m, handle->bm); handle->nb = LIBXSMM_UPDIV(handle->n, handle->bn); handle->kb 
= LIBXSMM_UPDIV(handle->k, handle->bk); max_work_per_block = handle->bm * handle->bn; avg_work_per_block = (double)((size_t)handle->m * handle->n) / ((size_t)handle->mb * handle->nb); load_imbalance_1 = max_work_per_block / avg_work_per_block; max_blocks_per_thread = LIBXSMM_UPDIV(handle->mb * handle->nb, max_threads); avg_blocks_per_thread = (double)handle->mb * handle->nb / max_threads; load_imbalance_2 = max_blocks_per_thread / avg_blocks_per_thread; load_imbalance = load_imbalance_1 * load_imbalance_2; while (32 < handle->bm && load_imbalance > load_imbalance_tolerate) { handle->bm--; handle->mb = LIBXSMM_UPDIV(handle->m, handle->bm); max_blocks_per_thread = LIBXSMM_UPDIV(handle->mb * handle->nb, max_threads); avg_blocks_per_thread = (double)handle->mb * handle->nb / max_threads; load_imbalance_2 = max_blocks_per_thread / avg_blocks_per_thread; max_work_per_block = handle->bm * handle->bn; avg_work_per_block = (double)((size_t)handle->m * handle->n) / ((size_t)handle->mb * handle->nb); load_imbalance_1 = max_work_per_block / avg_work_per_block; load_imbalance = load_imbalance_1 * load_imbalance_2; } /* This is temporary space needed; allocate for each different size of a */ internal_spmdm_allocate_csr_a(handle, libxsmm_output_csr); internal_spmdm_allocate_scratch(handle, max_threads); /* Initialize shuffle masks for the computation */ #if defined(LIBXSMM_SPMDM_AVX) if (LIBXSMM_X86_AVX <= libxsmm_target_archid || LIBXSMM_X86_AVX <= LIBXSMM_STATIC_TARGET_ARCH) { internal_spmdm_init_shufmask_avx(); LIBXSMM_ASSERT(0 != internal_spmdm_shufmasks_32); LIBXSMM_ASSERT(0 != internal_spmdm_shufmasks_16); } #endif /* post-conditions */ LIBXSMM_ASSERT(0 != internal_spmdm_createSparseSlice_fp32_thread); LIBXSMM_ASSERT(0 != internal_spmdm_createSparseSlice_bfloat16_thread); LIBXSMM_ASSERT(0 != internal_spmdm_compute_fp32_thread); LIBXSMM_ASSERT(0 != internal_spmdm_compute_bfloat16_thread); } 
libxsmm-1.17/src/libxsmm_spmdm_begin.h000066400000000000000000000045601415223013700200770ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Nadathur Satish, Hans Pabst (Intel Corp.) ******************************************************************************/ #define SIMD_WIDTH_FP32 (1) #define SIMDTYPE_FP32 float #define SIMDTYPE_INT32 int #define SIMDMASKTYPE_FP32 int #define _MM_SETZERO_FP32() (0) #define _MM_SETZERO_INT32() (0) #define _MM_SET1_FP32(x) (x) #define _MM_SET1_INT32(x) (x) #define _MM_SET1_INT16 (x) #define _MM_LOAD_FP32(x) (*(x)) #define _MM_LOADU_FP32(x) (*(x)) #define _MM_LOAD_INT32(x) (*(x)) #define _MM_STORE_INT32(x,y) ((*(x)) = (y)) #define _MM_LOADU_INT32(x) (*(x)) #define _MM_GATHER_FP32(Addr, idx, scale) (*(Addr + (idx))) #define _MM_CMPNEQ_FP32(v1,v2) (LIBXSMM_FEQ(v1, v2) ? 
0 : 1) #define _MM_STORE_FP32(x,y) ((*(x)) = (y)) #define _MM_STOREU_FP32(x,y) ((*(x)) = (y)) #define _MM_ADD_FP32(x,y) ((x) + (y)) #define _MM_FMADD_FP32(x,y,z) (((x)*(y))+(z)) #define _MM_MUL_FP32(x,y) ((x)*(y)) #define _MM_PREFETCH(x, y) #define TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_A, ldA, ptr_B, ldB) ((*(ptr_B)) = (*(ptr_A))) #define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \ uint16_t restmp = (*(ptr_A)); \ union { int i; float f; } res; \ res.i = restmp; \ res.i <<= 16; \ (*(ptr_B)) = res.f; \ } #define COMPRESS_FP32(v, k, m, cnt) if (m) { \ values_ptr[cnt] = v; \ colidx_ptr[cnt] = (uint16_t)(k); \ cnt++; \ } #define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \ union { int i; float f; } vlo_tmp, vhi_tmp; \ vlo_tmp.i = (v) & 0xFFFF; vlo_tmp.i <<= 16; \ vlo_final = vlo_tmp.f; \ vhi_tmp.i = (v) & 0x0000FFFF; \ vhi_final = vhi_tmp.f; \ } #define COMPRESS_BFLOAT16(vlo, vhi, v) { \ union { int i; float f; } vlo_tmp, vhi_tmp; \ vlo_tmp.f = vlo; \ v = (vlo_tmp.i >> 16); \ vhi_tmp.f = vhi; \ v = v | (vhi_tmp.i & 0xFFFF0000); \ } libxsmm-1.17/src/libxsmm_spmdm_begin_avx2.h000066400000000000000000000215011415223013700210310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Nadathur Satish, Hans Pabst (Intel Corp.) ******************************************************************************/ #if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) # error "libxsmm_intrinsics_x86.h not included!" 
#endif #if (LIBXSMM_X86_AVX2 <= LIBXSMM_MAX_STATIC_TARGET_ARCH) #define SIMD_WIDTH_FP32 (8) #define SIMDTYPE_FP32 __m256 #define SIMDTYPE_INT32 __m256i #define SIMDMASKTYPE_FP32 __m256 #define _MM_SETZERO_FP32 _mm256_setzero_ps #define _MM_SETZERO_INT32 _mm256_setzero_si256 #define _MM_SET1_FP32 _mm256_set1_ps #define _MM_SET1_INT32 _mm256_set1_epi32 #define _MM_SET1_INT16 _mm256_set1_epi16 #define _MM_SET_INT32 _mm256_set_epi32 #define _MM_LOAD_FP32 _mm256_loadu_ps #define _MM_LOADU_FP32 _mm256_loadu_ps #define _MM_LOAD_INT32 _mm256_loadu_si256 #define _MM_STORE_INT32 _mm256_storeu_si256 #define _MM_LOADU_INT32(x) _mm256_loadu_si256( (__m256i const *)(x)) #define _MM_GATHER_INT32(Addr, idx, scale) _mm256_i32gather_epi32((Addr), (idx), (scale)) #define _MM_GATHER_FP32(Addr, idx, scale) _mm256_i32gather_ps(((float const *)(Addr)), (idx), (scale)) #define _MM_CMPNEQ_FP32(v1,v2) _mm256_cmp_ps(v1,v2,12) #define _MM_STORE_FP32 _mm256_storeu_ps #define _MM_STOREU_FP32 _mm256_storeu_ps #define _MM_ADD_FP32 _mm256_add_ps #define _MM_FMADD_FP32 _mm256_fmadd_ps #define _MM_MUL_FP32 _mm256_mul_ps #define _MM_PREFETCH(x, y) _mm_prefetch(x, y) #define TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_A, ldA, ptr_B, ldB) { \ __m256 ymm9 = _mm256_loadu_ps(ptr_A); \ __m256 ymm10 = _mm256_loadu_ps(ptr_A + (size_t)ldA); \ __m256 ymm11 = _mm256_loadu_ps(ptr_A + (size_t)ldA*2); \ __m256 ymm12 = _mm256_loadu_ps(ptr_A + (size_t)ldA*3); \ __m256 ymm13 = _mm256_loadu_ps(ptr_A + (size_t)ldA*4); \ __m256 ymm14 = _mm256_loadu_ps(ptr_A + (size_t)ldA*5); \ __m256 ymm15 = _mm256_loadu_ps(ptr_A + (size_t)ldA*6); \ __m256 ymm2 = _mm256_loadu_ps(ptr_A + (size_t)ldA*7); \ __m256 ymm6 = _mm256_unpacklo_ps(ymm9, ymm10); \ __m256 ymm1 = _mm256_unpacklo_ps(ymm11, ymm12); \ __m256 ymm8 = _mm256_unpackhi_ps(ymm9, ymm10); \ __m256 ymm0 = _mm256_unpacklo_ps(ymm13, ymm14); \ ymm9 = _mm256_unpacklo_ps(ymm15, ymm2);{ \ __m256 ymm3 = _mm256_shuffle_ps(ymm6, ymm1, 0x4E); \ ymm10 = _mm256_blend_ps(ymm6, ymm3, 0xCC); \ ymm6 = 
_mm256_shuffle_ps(ymm0, ymm9, 0x4E);{ \ __m256 ymm7 = _mm256_unpackhi_ps(ymm11, ymm12); \ ymm11 = _mm256_blend_ps(ymm0, ymm6, 0xCC); \ ymm12 = _mm256_blend_ps(ymm3, ymm1, 0xCC); \ ymm3 = _mm256_permute2f128_ps(ymm10, ymm11, 0x20); \ _mm256_storeu_ps(ptr_B, ymm3);{ \ __m256 ymm5 = _mm256_unpackhi_ps(ymm13, ymm14); \ ymm13 = _mm256_blend_ps(ymm6, ymm9, 0xCC);{ \ __m256 ymm4 = _mm256_unpackhi_ps(ymm15, ymm2); \ ymm2 = _mm256_permute2f128_ps(ymm12, ymm13, 0x20); \ _mm256_storeu_ps(ptr_B + (size_t)ldB, ymm2); \ ymm14 = _mm256_shuffle_ps(ymm8, ymm7, 0x4E); \ ymm15 = _mm256_blend_ps(ymm14, ymm7, 0xCC); \ ymm7 = _mm256_shuffle_ps(ymm5, ymm4, 0x4E); \ ymm8 = _mm256_blend_ps(ymm8, ymm14, 0xCC); \ ymm5 = _mm256_blend_ps(ymm5, ymm7, 0xCC); \ ymm6 = _mm256_permute2f128_ps(ymm8, ymm5, 0x20); \ _mm256_storeu_ps(ptr_B + (size_t)ldB*2, ymm6); \ ymm4 = _mm256_blend_ps(ymm7, ymm4, 0xCC); \ ymm7 = _mm256_permute2f128_ps(ymm15, ymm4, 0x20); \ _mm256_storeu_ps(ptr_B + (size_t)ldB*3, ymm7); \ ymm1 = _mm256_permute2f128_ps(ymm10, ymm11, 0x31); \ ymm0 = _mm256_permute2f128_ps(ymm12, ymm13, 0x31); \ _mm256_storeu_ps(ptr_B + (size_t)ldB*4, ymm1); \ ymm5 = _mm256_permute2f128_ps(ymm8, ymm5, 0x31); \ ymm4 = _mm256_permute2f128_ps(ymm15, ymm4, 0x31); \ _mm256_storeu_ps(ptr_B + (size_t)ldB*5, ymm0); \ _mm256_storeu_ps(ptr_B + (size_t)ldB*6, ymm5); \ _mm256_storeu_ps(ptr_B + (size_t)ldB*7, ymm4);}}}} \ } #define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \ __m256 ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm2; \ __m256i vload_1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A))); \ vload_1 = _mm256_inserti128_si256(vload_1, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA)), 1); \ EXPAND_BFLOAT16(vload_1, ymm9, ymm10);{ \ __m256i vload_2 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*2))); \ vload_2 = _mm256_inserti128_si256(vload_2, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*3)), 1); \ EXPAND_BFLOAT16(vload_2, 
ymm11, ymm12);{ \ __m256i vload_3 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*4))); \ vload_3 = _mm256_inserti128_si256(vload_3, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*5)), 1); \ EXPAND_BFLOAT16(vload_3, ymm13, ymm14);{ \ __m256i vload_4 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*6))); \ vload_4 = _mm256_inserti128_si256(vload_4, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*7)), 1); \ EXPAND_BFLOAT16(vload_4, ymm15, ymm2);{ \ __m256 ymm6 = _mm256_unpacklo_ps(ymm9, ymm10); \ __m256 ymm1 = _mm256_unpacklo_ps(ymm11, ymm12); \ __m256 ymm8 = _mm256_unpackhi_ps(ymm9, ymm10); \ __m256 ymm0 = _mm256_unpacklo_ps(ymm13, ymm14); \ ymm9 = _mm256_unpacklo_ps(ymm15, ymm2);{ \ __m256 ymm3 = _mm256_shuffle_ps(ymm6, ymm1, 0x4E); \ ymm10 = _mm256_blend_ps(ymm6, ymm3, 0xCC); \ ymm6 = _mm256_shuffle_ps(ymm0, ymm9, 0x4E);{ \ __m256 ymm7 = _mm256_unpackhi_ps(ymm11, ymm12); \ ymm11 = _mm256_blend_ps(ymm0, ymm6, 0xCC); \ ymm12 = _mm256_blend_ps(ymm3, ymm1, 0xCC); \ ymm3 = _mm256_permute2f128_ps(ymm10, ymm11, 0x20); \ _mm256_storeu_ps(ptr_B, ymm3);{ \ __m256 ymm5 = _mm256_unpackhi_ps(ymm13, ymm14); \ ymm13 = _mm256_blend_ps(ymm6, ymm9, 0xCC);{ \ __m256 ymm4 = _mm256_unpackhi_ps(ymm15, ymm2); \ ymm2 = _mm256_permute2f128_ps(ymm12, ymm13, 0x20); \ _mm256_storeu_ps(ptr_B + (size_t)ldB, ymm2); \ ymm14 = _mm256_shuffle_ps(ymm8, ymm7, 0x4E); \ ymm15 = _mm256_blend_ps(ymm14, ymm7, 0xCC); \ ymm7 = _mm256_shuffle_ps(ymm5, ymm4, 0x4E); \ ymm8 = _mm256_blend_ps(ymm8, ymm14, 0xCC); \ ymm5 = _mm256_blend_ps(ymm5, ymm7, 0xCC); \ ymm6 = _mm256_permute2f128_ps(ymm8, ymm5, 0x20); \ _mm256_storeu_ps(ptr_B + (size_t)ldB*2, ymm6); \ ymm4 = _mm256_blend_ps(ymm7, ymm4, 0xCC); \ ymm7 = _mm256_permute2f128_ps(ymm15, ymm4, 0x20); \ _mm256_storeu_ps(ptr_B + (size_t)ldB*3, ymm7); \ ymm1 = _mm256_permute2f128_ps(ymm10, ymm11, 0x31); \ ymm0 = _mm256_permute2f128_ps(ymm12, ymm13, 0x31); \ _mm256_storeu_ps(ptr_B + (size_t)ldB*4, 
ymm1); \ ymm5 = _mm256_permute2f128_ps(ymm8, ymm5, 0x31); \ ymm4 = _mm256_permute2f128_ps(ymm15, ymm4, 0x31); \ _mm256_storeu_ps(ptr_B + (size_t)ldB*5, ymm0); \ _mm256_storeu_ps(ptr_B + (size_t)ldB*6, ymm5); \ _mm256_storeu_ps(ptr_B + (size_t)ldB*7, ymm4);}}}}}}}} \ } #define COMPRESS_FP32(v, k, m, cnt) { \ const unsigned int mask = _mm256_movemask_ps(m); \ const SIMDTYPE_INT32 vk = _MM_SET1_INT16((short)(k)); \ const __m256i perm_ctrl = _mm256_loadu_si256(&shufmasks[mask]); \ const __m256 v_packed = _mm256_permutevar8x32_ps(v, perm_ctrl); \ const __m256i v_shuff = _mm256_loadu_si256(&shufmasks2[mask]); \ const __m256i v_idx = _mm256_add_epi32(vk, v_shuff); \ _mm256_storeu_ps(values_ptr + (cnt), v_packed); \ _mm256_storeu_si256((__m256i *)(colidx_ptr + (cnt)), v_idx); \ cnt = (unsigned short)((cnt) + _mm_popcnt_u32(mask)); \ } #define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \ const __m256i vlo = _mm256_unpacklo_epi16(vzero, v); \ const __m256i vhi = _mm256_unpackhi_epi16(vzero, v); \ vlo_final = _mm256_castsi256_ps(_mm256_permute2f128_si256(vlo, vhi, 0x20)); \ vhi_final = _mm256_castsi256_ps(_mm256_permute2f128_si256(vlo, vhi, 0x31)); \ } #define COMPRESS_BFLOAT16(vlo, vhi, v) { \ const __m256i vtmp1 = _mm256_castps_si256(_mm256_permute2f128_ps(vlo, vhi, 0x20)); \ const __m256i vtmp2 = _mm256_castps_si256(_mm256_permute2f128_ps(vlo, vhi, 0x31)); \ const __m256i a = _mm256_srli_epi32(vtmp1, 16), b = _mm256_srli_epi32(vtmp2, 16); \ v = _mm256_packus_epi32(a, b); \ } #endif libxsmm-1.17/src/libxsmm_spmdm_begin_avx512.h000066400000000000000000000356501415223013700212110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Nadathur Satish, Hans Pabst (Intel Corp.) ******************************************************************************/ #if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) # error "libxsmm_intrinsics_x86.h not included!" #endif #if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_MAX_STATIC_TARGET_ARCH) #define SIMD_WIDTH_FP32 (16) #define SIMDTYPE_FP32 __m512 #define SIMDTYPE_INT32 __m512i #define SIMDMASKTYPE_FP32 __mmask16 #define _MM_SETZERO_FP32 _mm512_setzero_ps #define _MM_SETZERO_INT32 _mm512_setzero_epi32 #define _MM_SET1_FP32 _mm512_set1_ps #define _MM_SET1_INT32 _mm512_set1_epi32 #define _MM_SET1_INT16 _mm512_set1_epi16 #define _MM_SET_INT32 _mm512_set_epi32 #define _MM_LOAD_FP32 LIBXSMM_INTRINSICS_MM512_LOAD_PS #define _MM_LOADU_FP32 _mm512_loadu_ps #define _MM_LOAD_INT32 _mm512_loadu_si512 #define _MM_STORE_INT32 _mm512_storeu_si512 #define _MM_LOADU_INT32(x) _mm512_loadu_si512( (void const *)(x)) #define _MM_GATHER_INT32(Addr, idx, scale) _mm512_i32gather_epi32((idx), (Addr), (scale)) #define _MM_GATHER_FP32(Addr, idx, scale) _mm512_i32gather_ps((idx), (Addr), (scale)) #define _MM_CMPNEQ_FP32(v1,v2) _mm512_cmp_ps_mask(v1,v2,12) #define _MM_STORE_FP32 _mm512_storeu_ps #define _MM_STOREU_FP32 _mm512_storeu_ps #define _MM_ADD_FP32 _mm512_add_ps #define _MM_FMADD_FP32 _mm512_fmadd_ps #define _MM_MUL_FP32 _mm512_mul_ps #define _MM_PREFETCH(x, y) _mm_prefetch(x, y) #define TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_A, ldA, ptr_B, ldB) { \ __m512 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; \ __m512 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; \ r0 = _mm512_loadu_ps(ptr_A); \ r1 = _mm512_loadu_ps(ptr_A + ldA); \ r2 = _mm512_loadu_ps(ptr_A + 2*ldA); \ r3 = _mm512_loadu_ps(ptr_A + 3*ldA); \ r4 = _mm512_loadu_ps(ptr_A + 4*ldA); \ r5 = _mm512_loadu_ps(ptr_A + 5*ldA); \ r6 
= _mm512_loadu_ps(ptr_A + 6*ldA); \ r7 = _mm512_loadu_ps(ptr_A + 7*ldA); \ r8 = _mm512_loadu_ps(ptr_A + 8*ldA); \ r9 = _mm512_loadu_ps(ptr_A + 9*ldA); \ ra = _mm512_loadu_ps(ptr_A + 10*ldA); \ rb = _mm512_loadu_ps(ptr_A + 11*ldA); \ rc = _mm512_loadu_ps(ptr_A + 12*ldA); \ rd = _mm512_loadu_ps(ptr_A + 13*ldA); \ re = _mm512_loadu_ps(ptr_A + 14*ldA); \ rf = _mm512_loadu_ps(ptr_A + 15*ldA); \ \ t0 = _mm512_unpacklo_ps(r0,r1); \ t1 = _mm512_unpackhi_ps(r0,r1); \ t2 = _mm512_unpacklo_ps(r2,r3); \ t3 = _mm512_unpackhi_ps(r2,r3); \ t4 = _mm512_unpacklo_ps(r4,r5); \ t5 = _mm512_unpackhi_ps(r4,r5); \ t6 = _mm512_unpacklo_ps(r6,r7); \ t7 = _mm512_unpackhi_ps(r6,r7); \ t8 = _mm512_unpacklo_ps(r8,r9); \ t9 = _mm512_unpackhi_ps(r8,r9); \ ta = _mm512_unpacklo_ps(ra,rb); \ tb = _mm512_unpackhi_ps(ra,rb); \ tc = _mm512_unpacklo_ps(rc,rd); \ td = _mm512_unpackhi_ps(rc,rd); \ te = _mm512_unpacklo_ps(re,rf); \ tf = _mm512_unpackhi_ps(re,rf); \ \ { const __m512d td1 = _mm512_castps_pd(t0), td2 = _mm512_castps_pd(t2); \ r0 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ r1 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ { const __m512d td1 = _mm512_castps_pd(t1), td2 = _mm512_castps_pd(t3); \ r2 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ r3 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ { const __m512d td1 = _mm512_castps_pd(t4), td2 = _mm512_castps_pd(t6); \ r4 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ r5 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ { const __m512d td1 = _mm512_castps_pd(t5), td2 = _mm512_castps_pd(t7); \ r6 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ r7 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ { const __m512d td1 = _mm512_castps_pd(t8), td2 = _mm512_castps_pd(ta); \ r8 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ r9 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ { const __m512d td1 = _mm512_castps_pd(t9), td2 = _mm512_castps_pd(tb); \ ra = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ rb = 
_mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ { const __m512d td1 = _mm512_castps_pd(tc), td2 = _mm512_castps_pd(te); \ rc = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ rd = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ { const __m512d td1 = _mm512_castps_pd(td), td2 = _mm512_castps_pd(tf); \ re = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ rf = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ \ t0 = _mm512_shuffle_f32x4(r0, r4, 0x88); \ t1 = _mm512_shuffle_f32x4(r1, r5, 0x88); \ t2 = _mm512_shuffle_f32x4(r2, r6, 0x88); \ t3 = _mm512_shuffle_f32x4(r3, r7, 0x88); \ t4 = _mm512_shuffle_f32x4(r0, r4, 0xdd); \ t5 = _mm512_shuffle_f32x4(r1, r5, 0xdd); \ t6 = _mm512_shuffle_f32x4(r2, r6, 0xdd); \ t7 = _mm512_shuffle_f32x4(r3, r7, 0xdd); \ t8 = _mm512_shuffle_f32x4(r8, rc, 0x88); \ t9 = _mm512_shuffle_f32x4(r9, rd, 0x88); \ ta = _mm512_shuffle_f32x4(ra, re, 0x88); \ tb = _mm512_shuffle_f32x4(rb, rf, 0x88); \ tc = _mm512_shuffle_f32x4(r8, rc, 0xdd); \ td = _mm512_shuffle_f32x4(r9, rd, 0xdd); \ te = _mm512_shuffle_f32x4(ra, re, 0xdd); \ tf = _mm512_shuffle_f32x4(rb, rf, 0xdd); \ \ r0 = _mm512_shuffle_f32x4(t0, t8, 0x88); \ r1 = _mm512_shuffle_f32x4(t1, t9, 0x88); \ r2 = _mm512_shuffle_f32x4(t2, ta, 0x88); \ r3 = _mm512_shuffle_f32x4(t3, tb, 0x88); \ r4 = _mm512_shuffle_f32x4(t4, tc, 0x88); \ r5 = _mm512_shuffle_f32x4(t5, td, 0x88); \ r6 = _mm512_shuffle_f32x4(t6, te, 0x88); \ r7 = _mm512_shuffle_f32x4(t7, tf, 0x88); \ r8 = _mm512_shuffle_f32x4(t0, t8, 0xdd); \ r9 = _mm512_shuffle_f32x4(t1, t9, 0xdd); \ ra = _mm512_shuffle_f32x4(t2, ta, 0xdd); \ rb = _mm512_shuffle_f32x4(t3, tb, 0xdd); \ rc = _mm512_shuffle_f32x4(t4, tc, 0xdd); \ rd = _mm512_shuffle_f32x4(t5, td, 0xdd); \ re = _mm512_shuffle_f32x4(t6, te, 0xdd); \ rf = _mm512_shuffle_f32x4(t7, tf, 0xdd); \ \ _mm512_storeu_ps(ptr_B + 0*ldB, r0); \ _mm512_storeu_ps(ptr_B + 1*ldB, r1); \ _mm512_storeu_ps(ptr_B + 2*ldB, r2); \ _mm512_storeu_ps(ptr_B + 3*ldB, r3); \ _mm512_storeu_ps(ptr_B + 4*ldB, r4); \ 
_mm512_storeu_ps(ptr_B + 5*ldB, r5); \ _mm512_storeu_ps(ptr_B + 6*ldB, r6); \ _mm512_storeu_ps(ptr_B + 7*ldB, r7); \ _mm512_storeu_ps(ptr_B + 8*ldB, r8); \ _mm512_storeu_ps(ptr_B + 9*ldB, r9); \ _mm512_storeu_ps(ptr_B + 10*ldB, ra); \ _mm512_storeu_ps(ptr_B + 11*ldB, rb); \ _mm512_storeu_ps(ptr_B + 12*ldB, rc); \ _mm512_storeu_ps(ptr_B + 13*ldB, rd); \ _mm512_storeu_ps(ptr_B + 14*ldB, re); \ _mm512_storeu_ps(ptr_B + 15*ldB, rf); \ } #define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \ __m512 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; \ __m512 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; \ __m512i vload_1 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A))); \ vload_1 = _mm512_inserti32x8(vload_1, _mm256_loadu_si256((const __m256i*)(ptr_A + ldA)), 1); \ EXPAND_BFLOAT16(vload_1, r0, r1);{ \ __m512i vload_2 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 2*ldA))); \ vload_2 = _mm512_inserti32x8(vload_2, _mm256_loadu_si256((const __m256i*)(ptr_A + 3*ldA)), 1); \ EXPAND_BFLOAT16(vload_2, r2, r3);{ \ __m512i vload_3 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 4*ldA))); \ vload_3 = _mm512_inserti32x8(vload_3, _mm256_loadu_si256((const __m256i*)(ptr_A + 5*ldA)), 1); \ EXPAND_BFLOAT16(vload_3, r4, r5);{ \ __m512i vload_4 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 6*ldA))); \ vload_4 = _mm512_inserti32x8(vload_4, _mm256_loadu_si256((const __m256i*)(ptr_A + 7*ldA)), 1); \ EXPAND_BFLOAT16(vload_4, r6, r7);{ \ __m512i vload_5 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 8*ldA))); \ vload_5 = _mm512_inserti32x8(vload_5, _mm256_loadu_si256((const __m256i*)(ptr_A + 9*ldA)), 1); \ EXPAND_BFLOAT16(vload_5, r8, r9);{ \ __m512i vload_6 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 10*ldA))); \ vload_6 = _mm512_inserti32x8(vload_6, _mm256_loadu_si256((const __m256i*)(ptr_A + 11*ldA)), 1); \ 
EXPAND_BFLOAT16(vload_6, ra, rb);{ \ __m512i vload_7 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 12*ldA))); \ vload_7 = _mm512_inserti32x8(vload_7, _mm256_loadu_si256((const __m256i*)(ptr_A + 13*ldA)), 1); \ EXPAND_BFLOAT16(vload_7, rc, rd);{ \ __m512i vload_8 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 14*ldA))); \ vload_8 = _mm512_inserti32x8(vload_8, _mm256_loadu_si256((const __m256i*)(ptr_A + 15*ldA)), 1); \ EXPAND_BFLOAT16(vload_8, re, rf); \ \ t0 = _mm512_unpacklo_ps(r0,r1); \ t1 = _mm512_unpackhi_ps(r0,r1); \ t2 = _mm512_unpacklo_ps(r2,r3); \ t3 = _mm512_unpackhi_ps(r2,r3); \ t4 = _mm512_unpacklo_ps(r4,r5); \ t5 = _mm512_unpackhi_ps(r4,r5); \ t6 = _mm512_unpacklo_ps(r6,r7); \ t7 = _mm512_unpackhi_ps(r6,r7); \ t8 = _mm512_unpacklo_ps(r8,r9); \ t9 = _mm512_unpackhi_ps(r8,r9); \ ta = _mm512_unpacklo_ps(ra,rb); \ tb = _mm512_unpackhi_ps(ra,rb); \ tc = _mm512_unpacklo_ps(rc,rd); \ td = _mm512_unpackhi_ps(rc,rd); \ te = _mm512_unpacklo_ps(re,rf); \ tf = _mm512_unpackhi_ps(re,rf); \ \ { const __m512d td1 = _mm512_castps_pd(t0), td2 = _mm512_castps_pd(t2); \ r0 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ r1 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ { const __m512d td1 = _mm512_castps_pd(t1), td2 = _mm512_castps_pd(t3); \ r2 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ r3 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ { const __m512d td1 = _mm512_castps_pd(t4), td2 = _mm512_castps_pd(t6); \ r4 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ r5 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ { const __m512d td1 = _mm512_castps_pd(t5), td2 = _mm512_castps_pd(t7); \ r6 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ r7 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ { const __m512d td1 = _mm512_castps_pd(t8), td2 = _mm512_castps_pd(ta); \ r8 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ r9 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ { const __m512d td1 
= _mm512_castps_pd(t9), td2 = _mm512_castps_pd(tb); \ ra = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ rb = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ { const __m512d td1 = _mm512_castps_pd(tc), td2 = _mm512_castps_pd(te); \ rc = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ rd = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ { const __m512d td1 = _mm512_castps_pd(td), td2 = _mm512_castps_pd(tf); \ re = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ rf = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ \ t0 = _mm512_shuffle_f32x4(r0, r4, 0x88); \ t1 = _mm512_shuffle_f32x4(r1, r5, 0x88); \ t2 = _mm512_shuffle_f32x4(r2, r6, 0x88); \ t3 = _mm512_shuffle_f32x4(r3, r7, 0x88); \ t4 = _mm512_shuffle_f32x4(r0, r4, 0xdd); \ t5 = _mm512_shuffle_f32x4(r1, r5, 0xdd); \ t6 = _mm512_shuffle_f32x4(r2, r6, 0xdd); \ t7 = _mm512_shuffle_f32x4(r3, r7, 0xdd); \ t8 = _mm512_shuffle_f32x4(r8, rc, 0x88); \ t9 = _mm512_shuffle_f32x4(r9, rd, 0x88); \ ta = _mm512_shuffle_f32x4(ra, re, 0x88); \ tb = _mm512_shuffle_f32x4(rb, rf, 0x88); \ tc = _mm512_shuffle_f32x4(r8, rc, 0xdd); \ td = _mm512_shuffle_f32x4(r9, rd, 0xdd); \ te = _mm512_shuffle_f32x4(ra, re, 0xdd); \ tf = _mm512_shuffle_f32x4(rb, rf, 0xdd); \ \ r0 = _mm512_shuffle_f32x4(t0, t8, 0x88); \ r1 = _mm512_shuffle_f32x4(t1, t9, 0x88); \ r2 = _mm512_shuffle_f32x4(t2, ta, 0x88); \ r3 = _mm512_shuffle_f32x4(t3, tb, 0x88); \ r4 = _mm512_shuffle_f32x4(t4, tc, 0x88); \ r5 = _mm512_shuffle_f32x4(t5, td, 0x88); \ r6 = _mm512_shuffle_f32x4(t6, te, 0x88); \ r7 = _mm512_shuffle_f32x4(t7, tf, 0x88); \ r8 = _mm512_shuffle_f32x4(t0, t8, 0xdd); \ r9 = _mm512_shuffle_f32x4(t1, t9, 0xdd); \ ra = _mm512_shuffle_f32x4(t2, ta, 0xdd); \ rb = _mm512_shuffle_f32x4(t3, tb, 0xdd); \ rc = _mm512_shuffle_f32x4(t4, tc, 0xdd); \ rd = _mm512_shuffle_f32x4(t5, td, 0xdd); \ re = _mm512_shuffle_f32x4(t6, te, 0xdd); \ rf = _mm512_shuffle_f32x4(t7, tf, 0xdd); \ \ _mm512_storeu_ps(ptr_B + 0*ldB, r0); \ _mm512_storeu_ps(ptr_B + 1*ldB, r1); \ 
_mm512_storeu_ps(ptr_B + 2*ldB, r2); \ _mm512_storeu_ps(ptr_B + 3*ldB, r3); \ _mm512_storeu_ps(ptr_B + 4*ldB, r4); \ _mm512_storeu_ps(ptr_B + 5*ldB, r5); \ _mm512_storeu_ps(ptr_B + 6*ldB, r6); \ _mm512_storeu_ps(ptr_B + 7*ldB, r7); \ _mm512_storeu_ps(ptr_B + 8*ldB, r8); \ _mm512_storeu_ps(ptr_B + 9*ldB, r9); \ _mm512_storeu_ps(ptr_B + 10*ldB, ra); \ _mm512_storeu_ps(ptr_B + 11*ldB, rb); \ _mm512_storeu_ps(ptr_B + 12*ldB, rc); \ _mm512_storeu_ps(ptr_B + 13*ldB, rd); \ _mm512_storeu_ps(ptr_B + 14*ldB, re); \ _mm512_storeu_ps(ptr_B + 15*ldB, rf);}}}}}}} \ } #define COMPRESS_FP32(v, k, m, cnt) { \ _mm512_mask_compressstoreu_ps(values_ptr + (cnt), m, v); \ { \ __m256i vk1 = _mm256_set1_epi16((short)(k)); \ __m256i vk2 = _mm256_set1_epi16((short)((k) + 8)); \ __m256i v_idx = _mm256_add_epi32(vk1, _mm256_loadu_si256(&shufmasks2[(m)&0xFF])); \ __m256i v_idx_2 = _mm256_add_epi32(vk2, _mm256_loadu_si256(&shufmasks2[((m)>>8)&0xFF])); \ _mm256_storeu_si256((__m256i *)(colidx_ptr + (cnt)), v_idx); \ cnt = (unsigned short)((cnt) + _mm_popcnt_u32((m)&0xFF)); \ _mm256_storeu_si256((__m256i *)(colidx_ptr + (cnt)), v_idx_2); \ cnt = (unsigned short)((cnt) + _mm_popcnt_u32(((m)>>8)&0xFF)); \ } \ } #define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \ const __m512i vlo = _mm512_unpacklo_epi16(vzero, v); \ const __m512i vhi = _mm512_unpackhi_epi16(vzero, v); \ const __m512i permmask1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); \ const __m512i permmask2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); \ vlo_final = _mm512_castsi512_ps(_mm512_permutex2var_epi64(vlo, permmask1, vhi)); \ vhi_final = _mm512_castsi512_ps(_mm512_permutex2var_epi64(vlo, permmask2, vhi)); \ } #define COMPRESS_BFLOAT16(vlo, vhi, v) { \ const __m512i permmask1 = _mm512_set_epi64(13, 12, 9, 8, 5, 4, 1, 0); \ const __m512i permmask2 = _mm512_set_epi64(15, 14, 11, 10, 7, 6, 3, 2); \ const __m512i va = _mm512_castps_si512(vlo), vb = _mm512_castps_si512(vhi); \ const __m512i vtmp1 = _mm512_permutex2var_epi64(va, 
permmask1, vb); \ const __m512i vtmp2 = _mm512_permutex2var_epi64(va, permmask2, vb); \ const __m512i a = _mm512_srli_epi32(vtmp1, 16), b = _mm512_srli_epi32(vtmp2, 16); \ v = _mm512_packus_epi32(a, b); \ } #endif libxsmm-1.17/src/libxsmm_spmdm_end.h000066400000000000000000000026501415223013700175570ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #undef SIMD_WIDTH_FP32 #undef SIMDTYPE_FP32 #undef SIMDTYPE_INT32 #undef SIMDMASKTYPE_FP32 #undef _MM_SETZERO_FP32 #undef _MM_SETZERO_INT32 #undef _MM_SET1_FP32 #undef _MM_SET1_INT32 #undef _MM_SET1_INT16 #undef _MM_SET_INT32 #undef _MM_LOAD_FP32 #undef _MM_LOADU_FP32 #undef _MM_LOAD_INT32 #undef _MM_STORE_INT32 #undef _MM_LOADU_INT32 #undef _MM_GATHER_INT32 #undef _MM_GATHER_FP32 #undef _MM_CMPNEQ_FP32 #undef _MM_STORE_FP32 #undef _MM_STOREU_FP32 #undef _MM_ADD_FP32 #undef _MM_FMADD_FP32 #undef _MM_MUL_FP32 #undef _MM_PREFETCH #undef TRANSPOSE_SIMD_WIDTH_KERNEL #undef TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16 #undef COMPRESS_FP32 #undef EXPAND_BFLOAT16 #undef COMPRESS_BFLOAT16 #undef num_regs libxsmm-1.17/src/libxsmm_sync.c000066400000000000000000000514601415223013700165630ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst, Alexander Heinecke (Intel Corp.) ******************************************************************************/ /* Lock primitives inspired by Karl Malbrain, Concurrency Kit, and TF/sync. ******************************************************************************/ #include "libxsmm_main.h" #if !defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU) # define LIBXSMM_SYNC_FUTEX #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #include #if defined(_WIN32) # include #else # if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU) # include # endif # include # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if !defined(LIBXSMM_SYNC_RWLOCK_BITS) # if defined(__MINGW32__) # define LIBXSMM_SYNC_RWLOCK_BITS 32 # else # define LIBXSMM_SYNC_RWLOCK_BITS 16 # endif #endif #if !defined(LIBXSMM_SYNC_GENERIC_PID) && 1 # define LIBXSMM_SYNC_GENERIC_PID #endif LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_sync_core_tag { /* per-core */ uint8_t id; volatile uint8_t core_sense; volatile uint8_t* thread_senses; volatile uint8_t* my_flags[2]; uint8_t** partner_flags[2]; uint8_t parity; uint8_t sense; } internal_sync_core_tag; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_sync_thread_tag { /* per-thread */ int core_tid; internal_sync_core_tag *core; } internal_sync_thread_tag; struct LIBXSMM_RETARGETABLE libxsmm_barrier { internal_sync_core_tag** cores; internal_sync_thread_tag** threads; int ncores, nthreads_per_core; int nthreads, ncores_nbits; /* nbits(ncores) != log2(ncores) */ /* internal counter type which is guaranteed to be atomic when using certain methods */ volatile int threads_waiting; /* thread-safety 
during initialization */ volatile uint8_t init_done; }; LIBXSMM_API libxsmm_barrier* libxsmm_barrier_create(int ncores, int nthreads_per_core) { libxsmm_barrier *const barrier = (libxsmm_barrier*)malloc(sizeof(libxsmm_barrier)); #if (0 == LIBXSMM_SYNC) LIBXSMM_UNUSED(ncores); LIBXSMM_UNUSED(nthreads_per_core); #else if (NULL != barrier && 1 < ncores && 1 <= nthreads_per_core) { barrier->ncores = ncores; barrier->ncores_nbits = (int)LIBXSMM_NBITS(ncores); barrier->nthreads_per_core = nthreads_per_core; barrier->nthreads = ncores * nthreads_per_core; barrier->threads = (internal_sync_thread_tag**)libxsmm_aligned_malloc( barrier->nthreads * sizeof(internal_sync_thread_tag*), LIBXSMM_CACHELINE); barrier->cores = (internal_sync_core_tag**)libxsmm_aligned_malloc( barrier->ncores * sizeof(internal_sync_core_tag*), LIBXSMM_CACHELINE); barrier->threads_waiting = barrier->nthreads; /* atomic */ barrier->init_done = 0; /* false */ } else #endif if (NULL != barrier) { barrier->nthreads = 1; } return barrier; } LIBXSMM_API void libxsmm_barrier_init(libxsmm_barrier* barrier, int tid) { #if (0 == LIBXSMM_SYNC) LIBXSMM_UNUSED(barrier); LIBXSMM_UNUSED(tid); #else if (NULL != barrier && 1 < barrier->nthreads) { const int cid = tid / barrier->nthreads_per_core; /* this thread's core ID */ internal_sync_core_tag* core = 0; int i; internal_sync_thread_tag* thread; /* we only initialize the barrier once */ if (barrier->init_done == 2) { return; } /* allocate per-thread structure */ thread = (internal_sync_thread_tag*)libxsmm_aligned_malloc( sizeof(internal_sync_thread_tag), LIBXSMM_CACHELINE); barrier->threads[tid] = thread; thread->core_tid = tid - (barrier->nthreads_per_core * cid); /* mod */ /* each core's thread 0 does all the allocations */ if (0 == thread->core_tid) { core = (internal_sync_core_tag*)libxsmm_aligned_malloc( sizeof(internal_sync_core_tag), LIBXSMM_CACHELINE); core->id = (uint8_t)cid; core->core_sense = 1; core->thread_senses = (uint8_t*)libxsmm_aligned_malloc( 
barrier->nthreads_per_core * sizeof(uint8_t), LIBXSMM_CACHELINE); for (i = 0; i < barrier->nthreads_per_core; ++i) core->thread_senses[i] = 1; for (i = 0; i < 2; ++i) { core->my_flags[i] = (uint8_t*)libxsmm_aligned_malloc( barrier->ncores_nbits * sizeof(uint8_t) * LIBXSMM_CACHELINE, LIBXSMM_CACHELINE); core->partner_flags[i] = (uint8_t**)libxsmm_aligned_malloc( barrier->ncores_nbits * sizeof(uint8_t*), LIBXSMM_CACHELINE); } core->parity = 0; core->sense = 1; barrier->cores[cid] = core; } /* barrier to let all the allocations complete */ if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&barrier->threads_waiting, 1, LIBXSMM_ATOMIC_RELAXED)) { barrier->threads_waiting = barrier->nthreads; /* atomic */ barrier->init_done = 1; /* true */ } else { while (0/*false*/ == barrier->init_done); } /* set required per-thread information */ thread->core = barrier->cores[cid]; /* each core's thread 0 completes setup */ if (0 == thread->core_tid) { int di; for (i = di = 0; i < barrier->ncores_nbits; ++i, di += LIBXSMM_CACHELINE) { /* find dissemination partner and link to it */ const int dissem_cid = (cid + (1 << i)) % barrier->ncores; assert(0 != core); /* initialized under the same condition; see above */ core->my_flags[0][di] = core->my_flags[1][di] = 0; core->partner_flags[0][i] = (uint8_t*)&barrier->cores[dissem_cid]->my_flags[0][di]; core->partner_flags[1][i] = (uint8_t*)&barrier->cores[dissem_cid]->my_flags[1][di]; } } /* barrier to let initialization complete */ if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&barrier->threads_waiting, 1, LIBXSMM_ATOMIC_RELAXED)) { barrier->threads_waiting = barrier->nthreads; /* atomic */ barrier->init_done = 2; } else { while (2 != barrier->init_done); } } #endif } LIBXSMM_API LIBXSMM_INTRINSICS(LIBXSMM_X86_GENERIC) void libxsmm_barrier_wait(libxsmm_barrier* barrier, int tid) { #if (0 == LIBXSMM_SYNC) LIBXSMM_UNUSED(barrier); LIBXSMM_UNUSED(tid); #else if (NULL != barrier && 1 < barrier->nthreads) { internal_sync_thread_tag *const thread = barrier->threads[tid]; 
internal_sync_core_tag *const core = thread->core; /* first let's execute a memory fence */ LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST); /* first signal this thread's arrival */ core->thread_senses[thread->core_tid] = (uint8_t)(0 == core->thread_senses[thread->core_tid] ? 1 : 0); /* each core's thread 0 syncs across cores */ if (0 == thread->core_tid) { int i; /* wait for the core's remaining threads */ for (i = 1; i < barrier->nthreads_per_core; ++i) { uint8_t core_sense = core->core_sense, thread_sense = core->thread_senses[i]; while (core_sense == thread_sense) { /* avoid evaluation in unspecified order */ LIBXSMM_SYNC_PAUSE; core_sense = core->core_sense; thread_sense = core->thread_senses[i]; } } if (1 < barrier->ncores) { int di; # if defined(__MIC__) /* cannot use LIBXSMM_ALIGNED since attribute may not apply to local non-static arrays */ uint8_t sendbuffer[LIBXSMM_CACHELINE+LIBXSMM_CACHELINE-1]; uint8_t *const sendbuf = LIBXSMM_ALIGN(sendbuffer, LIBXSMM_CACHELINE); __m512d m512d; _mm_prefetch((const char*)core->partner_flags[core->parity][0], _MM_HINT_ET1); sendbuf[0] = core->sense; m512d = LIBXSMM_INTRINSICS_MM512_LOAD_PD(sendbuf); # endif for (i = di = 0; i < barrier->ncores_nbits - 1; ++i, di += LIBXSMM_CACHELINE) { # if defined(__MIC__) _mm_prefetch((const char*)core->partner_flags[core->parity][i+1], _MM_HINT_ET1); _mm512_storenrngo_pd(core->partner_flags[core->parity][i], m512d); # else *core->partner_flags[core->parity][i] = core->sense; # endif while (core->my_flags[core->parity][di] != core->sense) LIBXSMM_SYNC_PAUSE; } # if defined(__MIC__) _mm512_storenrngo_pd(core->partner_flags[core->parity][i], m512d); # else *core->partner_flags[core->parity][i] = core->sense; # endif while (core->my_flags[core->parity][di] != core->sense) LIBXSMM_SYNC_PAUSE; if (1 == core->parity) { core->sense = (uint8_t)(0 == core->sense ? 
1 : 0); } core->parity = (uint8_t)(1 - core->parity); } /* wake up the core's remaining threads */ core->core_sense = core->thread_senses[0]; } else { /* other threads wait for cross-core sync to complete */ uint8_t core_sense = core->core_sense, thread_sense = core->thread_senses[thread->core_tid]; while (core_sense != thread_sense) { /* avoid evaluation in unspecified order */ LIBXSMM_SYNC_PAUSE; core_sense = core->core_sense; thread_sense = core->thread_senses[thread->core_tid]; } } } #endif } LIBXSMM_API void libxsmm_barrier_destroy(const libxsmm_barrier* barrier) { #if (0 != LIBXSMM_SYNC) if (NULL != barrier && 1 < barrier->nthreads) { if (2 == barrier->init_done) { int i; for (i = 0; i < barrier->ncores; ++i) { int j; libxsmm_free((const void*)barrier->cores[i]->thread_senses); for (j = 0; j < 2; ++j) { libxsmm_free((const void*)barrier->cores[i]->my_flags[j]); libxsmm_free(barrier->cores[i]->partner_flags[j]); } libxsmm_free(barrier->cores[i]); } for (i = 0; i < barrier->nthreads; ++i) { libxsmm_free(barrier->threads[i]); } } libxsmm_free(barrier->threads); libxsmm_free(barrier->cores); } #endif free((libxsmm_barrier*)barrier); } #if (0 != LIBXSMM_SYNC) enum { INTERNAL_SYNC_LOCK_FREE = 0, INTERNAL_SYNC_LOCK_LOCKED = 1, INTERNAL_SYNC_LOCK_CONTESTED = 2, INTERNAL_SYNC_RWLOCK_READINC = 0x10000/*(USHRT_MAX+1)*/, INTERNAL_SYNC_FUTEX = 202 }; #endif typedef unsigned int libxsmm_spinlock_state; struct LIBXSMM_RETARGETABLE libxsmm_spinlock { volatile libxsmm_spinlock_state state; }; LIBXSMM_API libxsmm_spinlock* libxsmm_spinlock_create(void) { libxsmm_spinlock *const result = (libxsmm_spinlock*)malloc(sizeof(libxsmm_spinlock)); #if (0 != LIBXSMM_SYNC) if (0 != result) { result->state = INTERNAL_SYNC_LOCK_FREE; } #endif return result; } LIBXSMM_API void libxsmm_spinlock_destroy(const libxsmm_spinlock* spinlock) { free((libxsmm_spinlock*)spinlock); } LIBXSMM_API int libxsmm_spinlock_trylock(libxsmm_spinlock* spinlock) { #if (0 != LIBXSMM_SYNC) # if 0 /*const*/ 
libxsmm_spinlock_state lock_free = INTERNAL_SYNC_LOCK_FREE; assert(0 != spinlock); return 0/*false*/ == LIBXSMM_ATOMIC_CMPSWP(&spinlock->state, lock_free, INTERNAL_SYNC_LOCK_LOCKED, LIBXSMM_ATOMIC_RELAXED) ? (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK) + 1) /* not acquired */ : (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK)); # else return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK) + !LIBXSMM_ATOMIC_TRYLOCK(&spinlock->state, LIBXSMM_ATOMIC_RELAXED); # endif #else LIBXSMM_UNUSED(spinlock); return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK); #endif } LIBXSMM_API void libxsmm_spinlock_acquire(libxsmm_spinlock* spinlock) { #if (0 != LIBXSMM_SYNC) assert(0 != spinlock); for (;;) { if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&spinlock->state, 1, LIBXSMM_ATOMIC_RELAXED)) break; LIBXSMM_SYNC_CYCLE(&spinlock->state, INTERNAL_SYNC_LOCK_FREE, LIBXSMM_SYNC_NPAUSE); } LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST); #else LIBXSMM_UNUSED(spinlock); #endif } LIBXSMM_API void libxsmm_spinlock_release(libxsmm_spinlock* spinlock) { #if (0 != LIBXSMM_SYNC) assert(0 != spinlock); LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST); spinlock->state = INTERNAL_SYNC_LOCK_FREE; #else LIBXSMM_UNUSED(spinlock); #endif } #if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU) typedef int libxsmm_mutex_state; #else typedef char libxsmm_mutex_state; #endif struct LIBXSMM_RETARGETABLE libxsmm_mutex { volatile libxsmm_mutex_state state; }; LIBXSMM_API libxsmm_mutex* libxsmm_mutex_create(void) { libxsmm_mutex *const result = (libxsmm_mutex*)malloc(sizeof(libxsmm_mutex)); #if (0 != LIBXSMM_SYNC) if (0 != result) { result->state = INTERNAL_SYNC_LOCK_FREE; } #endif return result; } LIBXSMM_API void libxsmm_mutex_destroy(const libxsmm_mutex* mutex) { free((libxsmm_mutex*)mutex); } LIBXSMM_API int libxsmm_mutex_trylock(libxsmm_mutex* mutex) { #if (0 != LIBXSMM_SYNC) assert(0 != mutex); return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_MUTEX) + !LIBXSMM_ATOMIC_TRYLOCK(&mutex->state, LIBXSMM_ATOMIC_RELAXED); 
#else LIBXSMM_UNUSED(mutex); return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_MUTEX); #endif } LIBXSMM_API void libxsmm_mutex_acquire(libxsmm_mutex* mutex) { #if (0 != LIBXSMM_SYNC) # if defined(_WIN32) assert(0 != mutex); while (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_MUTEX) != libxsmm_mutex_trylock(mutex)) { LIBXSMM_SYNC_CYCLE(&mutex->state, 0/*free*/, LIBXSMM_SYNC_NPAUSE); } # else libxsmm_mutex_state lock_free = INTERNAL_SYNC_LOCK_FREE, lock_state = INTERNAL_SYNC_LOCK_LOCKED; assert(0 != mutex); while (0/*false*/ == LIBXSMM_ATOMIC_CMPSWP(&mutex->state, lock_free, lock_state, LIBXSMM_ATOMIC_RELAXED)) { libxsmm_mutex_state state; /* coverity[unreachable] may be reachable more than once due to volatile state */ for (state = mutex->state; INTERNAL_SYNC_LOCK_FREE != state; state = mutex->state) { # if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) LIBXSMM_SYNC_CYCLE_ELSE(&mutex->state, INTERNAL_SYNC_LOCK_FREE, LIBXSMM_SYNC_NPAUSE, { /*const*/ libxsmm_mutex_state state_locked = INTERNAL_SYNC_LOCK_LOCKED; if (INTERNAL_SYNC_LOCK_LOCKED != state || LIBXSMM_ATOMIC_CMPSWP(&mutex->state, state_locked, INTERNAL_SYNC_LOCK_CONTESTED, LIBXSMM_ATOMIC_RELAXED)) { syscall(INTERNAL_SYNC_FUTEX, &mutex->state, FUTEX_WAIT, INTERNAL_SYNC_LOCK_CONTESTED, NULL, NULL, 0); lock_state = INTERNAL_SYNC_LOCK_CONTESTED; }} ); break; # else LIBXSMM_SYNC_CYCLE(&mutex->state, INTERNAL_SYNC_LOCK_FREE, LIBXSMM_SYNC_NPAUSE); # endif } lock_free = INTERNAL_SYNC_LOCK_FREE; } # endif #else LIBXSMM_UNUSED(mutex); #endif } LIBXSMM_API void libxsmm_mutex_release(libxsmm_mutex* mutex) { #if (0 != LIBXSMM_SYNC) assert(0 != mutex); LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST); # if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU) if (INTERNAL_SYNC_LOCK_CONTESTED == LIBXSMM_ATOMIC_FETCH_SUB(&mutex->state, 1, LIBXSMM_ATOMIC_RELAXED)) { mutex->state = INTERNAL_SYNC_LOCK_FREE; syscall(INTERNAL_SYNC_FUTEX, &mutex->state, FUTEX_WAKE, 1, NULL, NULL, 0); } # else mutex->state = INTERNAL_SYNC_LOCK_FREE; 
# endif #else LIBXSMM_UNUSED(mutex); #endif } #if (0 != LIBXSMM_SYNC) typedef LIBXSMM_CONCATENATE3(uint,LIBXSMM_SYNC_RWLOCK_BITS,_t) internal_sync_uint_t; typedef LIBXSMM_CONCATENATE3(int,LIBXSMM_SYNC_RWLOCK_BITS,_t) internal_sync_int_t; LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_sync_counter { struct { internal_sync_uint_t writer, reader; } kind; uint32_t bits; } internal_sync_counter; #endif LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_rwlock { #if (0 != LIBXSMM_SYNC) volatile internal_sync_counter completions; volatile internal_sync_counter requests; #else int dummy; #endif }; LIBXSMM_API libxsmm_rwlock* libxsmm_rwlock_create(void) { libxsmm_rwlock *const result = (libxsmm_rwlock*)malloc(sizeof(libxsmm_rwlock)); if (0 != result) { #if (0 != LIBXSMM_SYNC) LIBXSMM_MEMZERO127(&result->completions); LIBXSMM_MEMZERO127(&result->requests); #else LIBXSMM_MEMZERO127(result); #endif } return result; } LIBXSMM_API void libxsmm_rwlock_destroy(const libxsmm_rwlock* rwlock) { free((libxsmm_rwlock*)rwlock); } #if (0 != LIBXSMM_SYNC) LIBXSMM_API_INLINE int internal_rwlock_trylock(libxsmm_rwlock* rwlock, internal_sync_counter* prev) { internal_sync_counter next; assert(0 != rwlock && 0 != prev); do { prev->bits = rwlock->requests.bits; next.bits = prev->bits; ++next.kind.writer; } while (0/*false*/ == LIBXSMM_ATOMIC_CMPSWP(&rwlock->requests.bits, prev->bits, next.bits, LIBXSMM_ATOMIC_RELAXED)); return rwlock->completions.bits != prev->bits ? 
(LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) + 1) /* not acquired */ : (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK)); } #endif LIBXSMM_API int libxsmm_rwlock_trylock(libxsmm_rwlock* rwlock) { #if (0 != LIBXSMM_SYNC) internal_sync_counter prev; return internal_rwlock_trylock(rwlock, &prev); #else LIBXSMM_UNUSED(rwlock); return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK); #endif } LIBXSMM_API void libxsmm_rwlock_acquire(libxsmm_rwlock* rwlock) { #if (0 != LIBXSMM_SYNC) internal_sync_counter prev; if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) != internal_rwlock_trylock(rwlock, &prev)) { while (rwlock->completions.bits != prev.bits) { LIBXSMM_SYNC_CYCLE(&rwlock->completions.bits, prev.bits, LIBXSMM_SYNC_NPAUSE); } } #else LIBXSMM_UNUSED(rwlock); #endif } LIBXSMM_API void libxsmm_rwlock_release(libxsmm_rwlock* rwlock) { #if (0 != LIBXSMM_SYNC) assert(0 != rwlock); LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_ADD, LIBXSMM_SYNC_RWLOCK_BITS)(&rwlock->completions.kind.writer, 1, LIBXSMM_ATOMIC_SEQ_CST); #else LIBXSMM_UNUSED(rwlock); #endif } #if (0 != LIBXSMM_SYNC) LIBXSMM_API_INLINE int internal_rwlock_tryread(libxsmm_rwlock* rwlock, internal_sync_counter* prev) { #if (0 != LIBXSMM_SYNC) assert(0 != rwlock && 0 != prev); prev->bits = LIBXSMM_ATOMIC_FETCH_ADD(&rwlock->requests.bits, INTERNAL_SYNC_RWLOCK_READINC, LIBXSMM_ATOMIC_SEQ_CST); return rwlock->completions.kind.writer != prev->kind.writer ? 
(LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) + 1) /* not acquired */ : (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK)); #else LIBXSMM_UNUSED(rwlock); LIBXSMM_UNUSED(prev); return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK); #endif } #endif LIBXSMM_API int libxsmm_rwlock_tryread(libxsmm_rwlock* rwlock) { #if (0 != LIBXSMM_SYNC) internal_sync_counter prev; return internal_rwlock_tryread(rwlock, &prev); #else LIBXSMM_UNUSED(rwlock); return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK); #endif } LIBXSMM_API void libxsmm_rwlock_acqread(libxsmm_rwlock* rwlock) { #if (0 != LIBXSMM_SYNC) internal_sync_counter prev; if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) != internal_rwlock_tryread(rwlock, &prev)) { while (rwlock->completions.kind.writer != prev.kind.writer) { LIBXSMM_SYNC_CYCLE(&rwlock->completions.kind.writer, prev.kind.writer, LIBXSMM_SYNC_NPAUSE); } } #else LIBXSMM_UNUSED(rwlock); #endif } LIBXSMM_API void libxsmm_rwlock_relread(libxsmm_rwlock* rwlock) { #if (0 != LIBXSMM_SYNC) assert(0 != rwlock); LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_ADD, LIBXSMM_SYNC_RWLOCK_BITS)(&rwlock->completions.kind.reader, 1, LIBXSMM_ATOMIC_SEQ_CST); #else LIBXSMM_UNUSED(rwlock); #endif } LIBXSMM_API unsigned int libxsmm_get_pid(void) { #if defined(_WIN32) return (unsigned int)_getpid(); #else return (unsigned int)getpid(); #endif } LIBXSMM_API_INTERN unsigned int internal_get_tid(void); LIBXSMM_API_INTERN unsigned int internal_get_tid(void) { const unsigned int nthreads = LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_thread_count, 1, LIBXSMM_ATOMIC_RELAXED); #if !defined(NDEBUG) static int error_once = 0; if (LIBXSMM_NTHREADS_MAX < nthreads && 0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: maximum number of threads is exhausted!\n"); } #endif LIBXSMM_ASSERT(LIBXSMM_NTHREADS_MAX == LIBXSMM_UP2POT(LIBXSMM_NTHREADS_MAX)); return LIBXSMM_MOD2(nthreads - 1, LIBXSMM_NTHREADS_MAX); } 
LIBXSMM_API unsigned int libxsmm_get_tid(void) { #if (0 != LIBXSMM_SYNC) # if defined(LIBXSMM_SYNC_GENERIC_PID) static LIBXSMM_TLS unsigned int tid = 0xFFFFFFFF; if (0xFFFFFFFF == tid) tid = internal_get_tid(); return tid; # else void* tls = LIBXSMM_TLS_GETVALUE(libxsmm_tlskey); if (NULL == tls) { static unsigned int tid[LIBXSMM_NTHREADS_MAX]; const int i = internal_get_tid(); tid[i] = i; tls = tid + i; /* coverity[check_return] */ LIBXSMM_TLS_SETVALUE(libxsmm_tlskey, tls); } return *(unsigned int*)tls; # endif #else return 0; #endif } libxsmm-1.17/src/libxsmm_timer.c000066400000000000000000000137721415223013700167330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #include "libxsmm_main.h" #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if defined(_WIN32) # include #elif defined(__GNUC__) || defined(__PGI) || defined(_CRAYC) # include # include #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if defined(__powerpc64__) # include #endif #if !defined(LIBXSMM_TIMER_TSC) # define LIBXSMM_TIMER_TSC #endif #if !defined(LIBXSMM_TIMER_WPC) # define LIBXSMM_TIMER_WPC #endif #if defined(LIBXSMM_TIMER_TSC) # if defined(__powerpc64__) # define LIBXSMM_TIMER_RDTSC(CYCLE) { \ CYCLE = __ppc_get_timebase(); \ } # elif ((defined(__GNUC__) || defined(LIBXSMM_INTEL_COMPILER) || defined(__PGI)) && (64 <= (LIBXSMM_BITS))) # define LIBXSMM_TIMER_RDTSC(CYCLE) { libxsmm_timer_tickint libxsmm_timer_rdtsc_hi_; \ __asm__ __volatile__ ("rdtsc" : "=a"(CYCLE), "=d"(libxsmm_timer_rdtsc_hi_)); \ CYCLE |= libxsmm_timer_rdtsc_hi_ << 32; \ } # elif (defined(_rdtsc) || defined(_WIN32)) # define LIBXSMM_TIMER_RDTSC(CYCLE) (CYCLE = __rdtsc()) # endif #endif LIBXSMM_API_INTERN double libxsmm_timer_duration_rtc(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1) { double result = (double)LIBXSMM_DELTA(tick0, tick1); #if defined(_WIN32) # if defined(LIBXSMM_TIMER_WPC) LARGE_INTEGER frequency; QueryPerformanceFrequency(&frequency); result /= (double)frequency.QuadPart; # else /* low resolution */ result *= 1E-3; # endif #elif defined(CLOCK_MONOTONIC) result *= 1E-9; #else result *= 1E-6; #endif return result; } LIBXSMM_API_INTERN libxsmm_timer_tickint libxsmm_timer_tick_rtc(void) { libxsmm_timer_tickint result; #if defined(_WIN32) # if defined(LIBXSMM_TIMER_WPC) LARGE_INTEGER t; QueryPerformanceCounter(&t); result = (libxsmm_timer_tickint)t.QuadPart; # else /* low resolution */ result = (libxsmm_timer_tickint)GetTickCount64(); # endif #elif defined(CLOCK_MONOTONIC) struct timespec 
t; clock_gettime(CLOCK_MONOTONIC, &t); result = 1000000000ULL * t.tv_sec + t.tv_nsec; #else struct timeval t; gettimeofday(&t, 0); result = 1000000ULL * t.tv_sec + t.tv_usec; #endif return result; } LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_GENERIC) libxsmm_timer_tickint libxsmm_timer_tick_tsc(void) { libxsmm_timer_tickint result; #if defined(LIBXSMM_TIMER_RDTSC) LIBXSMM_TIMER_RDTSC(result); #else result = libxsmm_timer_tick_rtc(); #endif return result; } LIBXSMM_API int libxsmm_get_timer_info(libxsmm_timer_info* info) { int result; if (NULL != info) { #if defined(LIBXSMM_TIMER_RDTSC) if (0 < libxsmm_timer_scale) { info->tsc = 1; } # if !defined(LIBXSMM_INIT_COMPLETED) else if (2 > libxsmm_ninit) { libxsmm_init(); if (0 < libxsmm_timer_scale) { info->tsc = 1; } else { info->tsc = 0; } } # endif else { info->tsc = 0; } #else info->tsc = 0; #endif result = EXIT_SUCCESS; } else { static int error_once = 0; if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid argument for libxsmm_get_timer_info specified!\n"); } result = EXIT_FAILURE; } return result; } LIBXSMM_API libxsmm_timer_tickint libxsmm_timer_tick(void) { libxsmm_timer_tickint result; #if defined(LIBXSMM_TIMER_RDTSC) if (0 < libxsmm_timer_scale) { LIBXSMM_TIMER_RDTSC(result); } # if !defined(LIBXSMM_INIT_COMPLETED) else if (2 > libxsmm_ninit) { libxsmm_init(); if (0 < libxsmm_timer_scale) { LIBXSMM_TIMER_RDTSC(result); } else { result = libxsmm_timer_tick_rtc(); } } # endif else { result = libxsmm_timer_tick_rtc(); } #else result = libxsmm_timer_tick_rtc(); #endif return result; } LIBXSMM_API double libxsmm_timer_duration(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1) { double result; #if defined(LIBXSMM_TIMER_RDTSC) if (0 < libxsmm_timer_scale) { result = (double)LIBXSMM_DELTA(tick0, tick1) * libxsmm_timer_scale; } else #endif { result = 
libxsmm_timer_duration_rtc(tick0, tick1); } return result; } #if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_timer_ncycles)(libxsmm_timer_tickint* /*ncycles*/, const libxsmm_timer_tickint* /*tick0*/, const libxsmm_timer_tickint* /*tick1*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_timer_ncycles)(libxsmm_timer_tickint* ncycles, const libxsmm_timer_tickint* tick0, const libxsmm_timer_tickint* tick1) { #if !defined(NDEBUG) static int error_once = 0; if (NULL != ncycles && NULL != tick0 && NULL != tick1) #endif { *ncycles = libxsmm_timer_ncycles(*tick0, *tick1); } #if !defined(NDEBUG) else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_timer_ncycles specified!\n"); } #endif } #endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ libxsmm-1.17/src/libxsmm_trace.c000066400000000000000000000514211415223013700167020ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include "libxsmm_trace.h" #include "libxsmm_main.h" #if !defined(LIBXSMM_TRACE_MINDEPTH) || 0 > (LIBXSMM_TRACE_MINDEPTH) # undef LIBXSMM_TRACE_MINDEPTH # define LIBXSMM_TRACE_MINDEPTH 1 #endif #if !defined(LIBXSMM_TRACE_MAXDEPTH) || 0 >= (LIBXSMM_TRACE_MAXDEPTH) # undef LIBXSMM_TRACE_MAXDEPTH # define LIBXSMM_TRACE_MAXDEPTH 1024 #endif #if !defined(LIBXSMM_TRACE_SYMBOLSIZE) || 0 >= (LIBXSMM_TRACE_SYMBOLSIZE) # undef LIBXSMM_TRACE_SYMBOLSIZE # define LIBXSMM_TRACE_SYMBOLSIZE 256 #endif #if !defined(LIBXSMM_TRACE_DLINFO) && defined(__USE_GNU) # define LIBXSMM_TRACE_DLINFO #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) #endif #if !defined(NDEBUG) # include #endif #if defined(_WIN32) || defined(__CYGWIN__) # include # if defined(_MSC_VER) # pragma warning(push) # pragma warning(disable: 4091) # endif # include # if defined(_MSC_VER) # pragma comment(lib, "dbghelp") # endif # if defined(_MSC_VER) # pragma warning(pop) # endif LIBXSMM_APIVAR_DEFINE(volatile LONG internal_trace_initialized); #else LIBXSMM_APIVAR_DEFINE(volatile int internal_trace_initialized); # include # if defined(LIBXSMM_TRACE_DLINFO) # include # else # include # include # include # include # include # if (0 != LIBXSMM_SYNC) LIBXSMM_APIVAR_DEFINE(LIBXSMM_TLS_TYPE internal_trace_key); LIBXSMM_APIVAR_DEFINE(void* internal_trace_symbols[LIBXSMM_NTHREADS_MAX]); # endif LIBXSMM_API_INLINE void internal_delete(void* value) { int fd; # if !(defined(__APPLE__) && defined(__MACH__)) LIBXSMM_ASSERT(NULL != value); # endif fd = *((int*)value); # if defined(NDEBUG) munmap(value, LIBXSMM_TRACE_SYMBOLSIZE); # else /* library code is expected to be mute */ if (0 != munmap(value, LIBXSMM_TRACE_SYMBOLSIZE)) { const int error = errno; fprintf(stderr, "LIBXSMM ERROR: %s (munmap error #%i at %p)\n", strerror(error), error, value); } # endif if (0 <= fd) { close(fd); } # if 
!defined(NDEBUG) /* library code is expected to be mute */ else { fprintf(stderr, "LIBXSMM ERROR: invalid file descriptor (%i)\n", fd); } # endif } # if defined(__APPLE__) && defined(__MACH__) /* taken from "libtransmission" fdlimit.c */ LIBXSMM_API_INLINE int posix_fallocate(int fd, off_t offset, off_t length) { fstore_t fst; fst.fst_flags = F_ALLOCATECONTIG; fst.fst_posmode = F_PEOFPOSMODE; fst.fst_offset = offset; fst.fst_length = length; fst.fst_bytesalloc = 0; return fcntl(fd, F_PREALLOCATE, &fst); } # elif (!defined(_XOPEN_SOURCE) || 600 > _XOPEN_SOURCE) && \ (!defined(_POSIX_C_SOURCE) || 200112L > _POSIX_C_SOURCE) /* C89: avoid warning about posix_fallocate declared implicitly */ LIBXSMM_EXTERN int posix_fallocate(int, off_t, off_t); # endif # endif LIBXSMM_EXTERN int mkstemp(char*) LIBXSMM_NOTHROW; #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif LIBXSMM_APIVAR_DEFINE(int internal_trace_mindepth); LIBXSMM_APIVAR_DEFINE(int internal_trace_threadid); LIBXSMM_APIVAR_DEFINE(int internal_trace_maxnsyms); LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE int libxsmm_trace_init(int /*filter_threadid*/, int /*filter_mindepth*/, int /*filter_maxnsyms*/); LIBXSMM_API int libxsmm_trace_init(int filter_threadid, int filter_mindepth, int filter_maxnsyms) { int result = EXIT_SUCCESS; if (0 == internal_trace_initialized) { if (0 <= filter_threadid) ++filter_threadid; #if defined(__TRACE) { const char *const env = getenv("LIBXSMM_TRACE"); if (NULL != env && 0 != *env) { char buffer[32] = { 0 }; if (1 == sscanf(env, "%32[^,],", buffer)) { result = (0 <= sscanf(buffer, "%i", &filter_threadid) ? EXIT_SUCCESS : EXIT_FAILURE); } if (1 == sscanf(env, "%*[^,],%32[^,],", buffer)) { result = (0 <= sscanf(buffer, "%i", &filter_mindepth) ? EXIT_SUCCESS : EXIT_FAILURE); } if (1 == sscanf(env, "%*[^,],%*[^,],%32s", buffer)) { result = (0 <= sscanf(buffer, "%i", &filter_maxnsyms) ? 
EXIT_SUCCESS : EXIT_FAILURE); } else { filter_maxnsyms = -1; /* all */ } if (EXIT_SUCCESS == result) { internal_trace_initialized = -1; /* auto */ } } } if (EXIT_SUCCESS == result) #endif { #if defined(LIBXSMM_TRACE) # if defined(_WIN32) || defined(__CYGWIN__) SymSetOptions(SYMOPT_DEFERRED_LOADS | SYMOPT_UNDNAME); result = (FALSE != SymInitialize(GetCurrentProcess(), NULL, TRUE) ? EXIT_SUCCESS : GetLastError()); # elif (0 != LIBXSMM_SYNC) && !defined(LIBXSMM_TRACE_DLINFO) result = LIBXSMM_TLS_CREATE(&internal_trace_key); # endif if (EXIT_SUCCESS == result) { internal_trace_threadid = filter_threadid; internal_trace_maxnsyms = filter_maxnsyms; internal_trace_mindepth = filter_mindepth; if (0 == internal_trace_initialized) { internal_trace_initialized = 1; } } #else LIBXSMM_UNUSED(filter_threadid); LIBXSMM_UNUSED(filter_mindepth); LIBXSMM_UNUSED(filter_maxnsyms); #endif } } return result; } LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE int libxsmm_trace_finalize(void); LIBXSMM_API int libxsmm_trace_finalize(void) { int result; #if defined(LIBXSMM_TRACE) result = EXIT_SUCCESS; if (0 != internal_trace_initialized) { internal_trace_initialized = 0; /* disable */ # if defined(_WIN32) || defined(__CYGWIN__) result = (FALSE != SymCleanup(GetCurrentProcess()) ? 
EXIT_SUCCESS : GetLastError()); # elif (0 != LIBXSMM_SYNC) && !defined(LIBXSMM_TRACE_DLINFO) result = LIBXSMM_TLS_DESTROY(internal_trace_key); { int i = 0; for (; i < LIBXSMM_NTHREADS_MAX; ++i) { void *const buffer = internal_trace_symbols[i]; if (NULL != buffer) internal_delete(buffer); } } # endif } #else result = EXIT_FAILURE; #endif return result; } LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE unsigned int libxsmm_backtrace(const void* /*buffer*/[], unsigned int /*size*/, unsigned int /*skip*/); LIBXSMM_API #if defined(_WIN32) /*TODO: no inline*/ #elif defined(__GNUC__) /*LIBXSMM_ATTRIBUTE(noinline)*/ #endif unsigned int libxsmm_backtrace(const void* buffer[], unsigned int size, unsigned int skip) { unsigned int result; if (NULL != buffer && 0 != size && skip < size) { skip += LIBXSMM_TRACE_MINDEPTH; #if defined(_WIN32) || defined(__CYGWIN__) result = CaptureStackBackTrace(skip, LIBXSMM_MIN(size, LIBXSMM_TRACE_MAXDEPTH), (PVOID*)buffer, NULL/*hash*/); #else { const int n = backtrace((void**)buffer, LIBXSMM_MIN((int)(size + skip), LIBXSMM_TRACE_MAXDEPTH)); if ((int)skip < n) { result = n - skip; if (0 != skip) { memmove(buffer, buffer + skip, result * sizeof(void*)); } } else { result = 0; } } #endif } else { result = 0; } return result; } #if !defined(_WIN32) && !defined(__CYGWIN__) LIBXSMM_API_INLINE const char* internal_trace_get_symbolname(const void* address, char* map, int fd, off_t fdoff) { const char* result = NULL; #if defined(LIBXSMM_TRACE_DLINFO) Dl_info info; LIBXSMM_UNUSED(fd); LIBXSMM_UNUSED(fdoff); LIBXSMM_ASSERT(NULL != address && NULL != map); if (0 != dladdr(address, &info) && NULL != info.dli_sname) { strncpy(map, info.dli_sname, LIBXSMM_TRACE_SYMBOLSIZE - 1); result = map; } #else LIBXSMM_ASSERT(NULL != address && NULL != map); backtrace_symbols_fd((void**)&address, 1, fd); if (fdoff == lseek(fd, fdoff, SEEK_SET) /* reset map */ && 1 == sscanf(map, "%*[^(](%s0x", map)) { char* c = map; for (; '+' != *c && 0 != *c; ++c); if ('+' == *c && c != map) { 
result = map; map = c; } } *map = 0; /* terminate */ #endif return result; } #endif LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE const char* libxsmm_trace_info(unsigned int* /*depth*/, unsigned int* /*threadid*/, const int* /*filter_threadid*/, const void* /*filter_symbol*/, const int* /*filter_mindepth*/, const int* /*filter_maxnsyms*/); LIBXSMM_API #if defined(_WIN32) /*TODO: no inline*/ #elif defined(__GNUC__) /*LIBXSMM_ATTRIBUTE(noinline)*/ #endif const char* libxsmm_trace_info(unsigned int* depth, unsigned int* threadid, const int* filter_threadid, const void* filter_symbol, const int* filter_mindepth, const int* filter_maxnsyms) { const char *fname = NULL; #if defined(LIBXSMM_TRACE) static LIBXSMM_TLS int cerberus = 0; /* check against entering a recursion (recursion should not happen due to * attribute "no_instrument_function" but better prevent this in any case) */ if (0 == cerberus) { int init; ++cerberus; # if defined(__GNUC__) && !defined(_CRAYC) __asm__(""); # endif init = LIBXSMM_ATOMIC_LOAD(&internal_trace_initialized, LIBXSMM_ATOMIC_RELAXED); if (0 != init) { /* do nothing if not yet initialized */ const int mindepth = (NULL != filter_mindepth ? *filter_mindepth : internal_trace_mindepth); const int maxnsyms = (NULL != filter_maxnsyms ? *filter_maxnsyms : internal_trace_maxnsyms); const void *stacktrace[LIBXSMM_TRACE_MAXDEPTH]; const int n = libxsmm_backtrace(stacktrace, LIBXSMM_TRACE_MAXDEPTH, 0); int symbol = 0; if (0 < n) { const int filter = (NULL != filter_threadid ? *filter_threadid : internal_trace_threadid); int abs_tid = 0; # if defined(_WIN32) || defined(__CYGWIN__) || defined(LIBXSMM_TRACE_DLINFO) static LIBXSMM_TLS struct { # if defined(_WIN32) || defined(__CYGWIN__) char buffer[sizeof(SYMBOL_INFO)+LIBXSMM_TRACE_SYMBOLSIZE]; # else char buffer[LIBXSMM_TRACE_SYMBOLSIZE]; # endif int tid; } info; if (0 != info.tid) { abs_tid = LIBXSMM_ABS(info.tid); } else { const int tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 0 < init ? 
1 : -1, LIBXSMM_ATOMIC_RELAXED); abs_tid = LIBXSMM_ABS(tid) - 1; /* use sign bit to flag enabled fall-back for symbol resolution */ info.tid = -abs_tid; } LIBXSMM_ASSERT(0 < abs_tid); if (0 > filter || filter == abs_tid) { int next = symbol + 1; # if defined(_WIN32) || defined(__CYGWIN__) const HANDLE process = GetCurrentProcess(); PSYMBOL_INFO value = (PSYMBOL_INFO)info.buffer; value->SizeOfStruct = sizeof(SYMBOL_INFO); value->MaxNameLen = LIBXSMM_TRACE_SYMBOLSIZE - 1; value->NameLen = 0; # endif if (NULL != filter_symbol) { struct { size_t d; int s; } approx = { (size_t)LIBXSMM_UNLIMITED, 0 }; while (next < n && (filter_symbol == stacktrace[symbol] || # if defined(_WIN32) || defined(__CYGWIN__) (FALSE != SymFromAddr(process, (DWORD64)stacktrace[symbol], NULL, value) && 0 < value->NameLen))) { if (filter_symbol == stacktrace[symbol] || NULL != strstr(value->Name, (const char*)filter_symbol)) { # else (NULL != internal_trace_get_symbolname(stacktrace[symbol], info.buffer, 0, 0)))) { if (filter_symbol == stacktrace[symbol] || NULL != strstr(info.buffer, (const char*)filter_symbol)) { # endif symbol = next++; /* determine the symbol after the match which is checked below */ break; } { const size_t d = LIBXSMM_DELTA((const char*)filter_symbol, (const char*)stacktrace[symbol]); if (d < approx.d) { approx.s = symbol + 1; approx.d = d; } } symbol = next++; } symbol = LIBXSMM_MAX((next != n ? 
symbol : approx.s/*not found*/) + mindepth/*shift*/, 0); } /* apply filters based on absolute symbol position */ if ((NULL != filter_symbol || LIBXSMM_MAX(mindepth, 0) <= symbol) && (0 >= maxnsyms || symbol < maxnsyms)) { if (symbol != next && symbol < n && filter_symbol != stacktrace[symbol] && # if defined(_WIN32) || defined(__CYGWIN__) FALSE != SymFromAddr(process, (DWORD64)stacktrace[symbol], NULL, value) && 0 < value->NameLen) # else NULL != internal_trace_get_symbolname(stacktrace[symbol], info.buffer, 0, 0)) # endif { /* disable fall-back allowing unresolved symbol names */ info.tid = abs_tid; /* make unsigned */ # if defined(_WIN32) || defined(__CYGWIN__) fname = value->Name; # else fname = info.buffer; # endif } if (NULL == fname && 0 > info.tid) { /* fall-back allowing unresolved symbol names */ # if defined(__MINGW32__) sprintf(info.buffer, "%p", stacktrace[symbol]); # else sprintf(info.buffer, "0x%" PRIxPTR, (uintptr_t)stacktrace[symbol]); # endif fname = info.buffer; } } } # else # if (0 == LIBXSMM_SYNC) static char raw_c; char */*const*/ raw_value = &raw_c; /* const: avoid warning (below / constant control-flow) */ # else char *const raw_value = (char*)LIBXSMM_TLS_GETVALUE(internal_trace_key); # endif const off_t fdoff = sizeof(int) * 2; int* ivalue = NULL, fd = -1; char* value = NULL; if (NULL != raw_value) { ivalue = (int*)raw_value; abs_tid = (0 <= ivalue[1] ? ivalue[1] : -ivalue[1]); if (0 > filter || filter == abs_tid) { fd = ivalue[0]; if (0 <= fd && fdoff == lseek(fd, fdoff, SEEK_SET)) { value = raw_value + fdoff; } # if !defined(NDEBUG) /* library code is expected to be mute */ else { fprintf(stderr, "LIBXSMM ERROR: failed to get buffer\n"); } # endif } } else { char filename[] = "/tmp/.libxsmm_map." 
LIBXSMM_MKTEMP_PATTERN; /* coverity[secure_temp] */ fd = mkstemp(filename); if (0 <= fd) { if (0 == unlink(filename) && 0 == posix_fallocate(fd, 0, LIBXSMM_TRACE_SYMBOLSIZE)) { char *const buffer = (char*)mmap(NULL, LIBXSMM_TRACE_SYMBOLSIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (MAP_FAILED != buffer) { int check = -1; ivalue = (int*)buffer; ivalue[0] = fd; /* valid file descriptor for internal_delete */ if ( # if (0 != LIBXSMM_SYNC) 0 == LIBXSMM_TLS_SETVALUE(internal_trace_key, buffer) && # endif (sizeof(int) * 1) == read(fd, &check, sizeof(int)) && fdoff == lseek(fd, sizeof(int), SEEK_CUR) && check == fd) { const int tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 0 < init ? 1 : -1, LIBXSMM_ATOMIC_RELAXED); abs_tid = LIBXSMM_ABS(tid) - 1; LIBXSMM_ASSERT(0 < abs_tid); # if (0 != LIBXSMM_SYNC) LIBXSMM_ASSERT(abs_tid < LIBXSMM_NTHREADS_MAX); internal_trace_symbols[abs_tid] = buffer; # endif /* use sign bit to flag enabled fall-back for symbol resolution */ ivalue[1] = -abs_tid; if (0 > filter || (abs_tid - 1) == filter) { value = buffer + fdoff; } } else { # if !defined(NDEBUG) /* library code is expected to be mute */ fprintf(stderr, "LIBXSMM ERROR: failed to setup buffer\n"); # endif internal_delete(buffer); } } # if !defined(NDEBUG) else { const int error = errno; fprintf(stderr, "LIBXSMM ERROR: %s (mmap allocation error #%i)\n", strerror(error), error); } # endif } # if !defined(NDEBUG) /* library code is expected to be mute */ else { fprintf(stderr, "LIBXSMM ERROR: failed to setup file descriptor (%i)\n", fd); } # endif } } if (NULL != value) { int next = symbol + 1; if (NULL != filter_symbol) { struct { size_t d; int s; } approx = { (size_t)LIBXSMM_UNLIMITED, 0 }; while (next < n && (filter_symbol == stacktrace[symbol] || NULL != internal_trace_get_symbolname(stacktrace[symbol], value, fd, fdoff))) { if (filter_symbol == stacktrace[symbol] || NULL != strstr(value, (const char*)filter_symbol)) { symbol = next++; /* determine the symbol 
after the match which is checked below */ break; } { const size_t d = LIBXSMM_DELTA((const char*)filter_symbol, (const char*)stacktrace[symbol]); if (d < approx.d) { approx.s = symbol + 1; approx.d = d; } } symbol = next++; } symbol = LIBXSMM_MAX((next != n ? symbol : approx.s/*not found*/) + mindepth/*shift*/, 0); } /* apply filters based on absolute symbol position */ if ((NULL != filter_symbol || LIBXSMM_MAX(mindepth, 0) <= symbol) && (0 >= maxnsyms || symbol < maxnsyms)) { if (symbol != next && symbol < n && filter_symbol != stacktrace[symbol] && NULL != internal_trace_get_symbolname(stacktrace[symbol], value, fd, fdoff)) { /* disable fall-back allowing unresolved symbol names */ ivalue[1] = abs_tid; /* make unsigned */ fname = value; } if (NULL == fname && 0 > ivalue[1]) { /* fall-back to symbol address */ sprintf(value, "0x%llx", (unsigned long long)stacktrace[symbol]); fname = value; } } } # endif if (threadid) *threadid = abs_tid - 1; if (depth) *depth = symbol; } } --cerberus; } #else LIBXSMM_UNUSED(depth); LIBXSMM_UNUSED(threadid); LIBXSMM_UNUSED(filter_threadid); LIBXSMM_UNUSED(filter_symbol); LIBXSMM_UNUSED(filter_mindepth); LIBXSMM_UNUSED(filter_maxnsyms); #endif return fname; } LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE void libxsmm_trace(FILE* stream, const int* /*filter_threadid*/, const void* /*filter_symbol*/, const int* /*filter_mindepth*/, const int* /*filter_maxnsyms*/); LIBXSMM_API void libxsmm_trace(FILE* stream, const int* filter_threadid, const void* filter_symbol, const int* filter_mindepth, const int* filter_maxnsyms) { #if defined(LIBXSMM_TRACE) unsigned int depth, threadid; const char *const name = libxsmm_trace_info(&depth, &threadid, filter_threadid, filter_symbol, filter_mindepth, filter_maxnsyms); if (NULL != name && 0 != *name) { /* implies actual other results to be valid */ LIBXSMM_ASSERT(NULL != stream/*otherwise fprintf handles the error*/); if ((NULL == filter_threadid && 0 > internal_trace_threadid) || (NULL != filter_threadid && 
0 > *filter_threadid)) { fprintf(stream, "%*s%s@%u\n", (int)depth, "", name, threadid); } else { fprintf(stream, "%*s%s\n", (int)depth, "", name); } } #else /* suppress warning */ LIBXSMM_UNUSED(stream); LIBXSMM_UNUSED(filter_threadid); LIBXSMM_UNUSED(filter_symbol); LIBXSMM_UNUSED(filter_mindepth); LIBXSMM_UNUSED(filter_maxnsyms); #endif } #if defined(__TRACE) && defined(__GNUC__) && defined(LIBXSMM_BUILD) LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE void __cyg_profile_func_enter(void* /*this_fn*/, void* /*call_site*/); LIBXSMM_API void __cyg_profile_func_enter(void* this_fn, void* call_site) { #if defined(LIBXSMM_TRACE) if (0 > internal_trace_initialized) { /* NULL: inherit global settings from libxsmm_trace_init */ libxsmm_trace(stderr, NULL/*filter_threadid*/, "__cyg_profile_func_enter"/*LIBXSMM_FUNCNAME*/, NULL, NULL); } #endif LIBXSMM_UNUSED(this_fn); LIBXSMM_UNUSED(call_site); } LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE void __cyg_profile_func_exit(void* /*this_fn*/, void* /*call_site*/); LIBXSMM_API void __cyg_profile_func_exit(void* this_fn, void* call_site) { LIBXSMM_UNUSED(this_fn); LIBXSMM_UNUSED(call_site); /* suppress warning */ } #endif /*defined(__TRACE) && defined(__GNUC__) && defined(LIBXSMM_BUILD)*/ libxsmm-1.17/src/libxsmm_trace.h000066400000000000000000000123011415223013700167010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_TRACE_H #define LIBXSMM_TRACE_H #include #if (defined(__TRACE) || defined(LIBXSMM_BUILD) || !defined(_WIN32)) # define LIBXSMM_TRACE #endif #if !defined(LIBXSMM_TRACE_CALLERID_MAXDEPTH) # define LIBXSMM_TRACE_CALLERID_MAXDEPTH 8 #endif #if !defined(LIBXSMM_TRACE_CALLERID_GCCBUILTIN) && \ ((!defined(_WIN32) || defined(__MINGW32__) || (defined(_MSC_VER) && defined(__clang__))) && \ (!defined(__PGI) || LIBXSMM_VERSION2(19, 0) <= LIBXSMM_VERSION2(__PGIC__, __PGIC_MINOR__)) && \ (defined(__GNUC__) || defined(__clang__))) # define LIBXSMM_TRACE_CALLERID_GCCBUILTIN #endif /** Initializes the trace facility; NOT thread-safe. */ LIBXSMM_API int libxsmm_trace_init( /* Filter for thread id (-1: all). */ int filter_threadid, /* Specify min. depth of stack trace (0: all). */ int filter_mindepth, /* Specify max. depth of stack trace (-1: all). */ int filter_maxnsyms); /** Finalizes the trace facility; NOT thread-safe. */ LIBXSMM_API int libxsmm_trace_finalize(void); /** Receives the backtrace of up to 'size' addresses. Returns the actual number of addresses (n <= size). 
*/ LIBXSMM_API unsigned int libxsmm_backtrace(const void* buffer[], unsigned int size, unsigned int skip); #if defined(LIBXSMM_TRACE_CALLERID_GCCBUILTIN) && !defined(__INTEL_COMPILER) # if defined(__clang__) # pragma clang diagnostic push # elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 6) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) # pragma GCC diagnostic push # endif # if defined(__clang__) # pragma clang diagnostic ignored "-Wunknown-warning-option" # if LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) # pragma clang diagnostic ignored "-Wframe-address" # endif # elif defined(__GNUC__) /* no version-check */ # pragma GCC diagnostic ignored "-Wpragmas" # pragma GCC diagnostic ignored "-Wframe-address" # endif #endif LIBXSMM_API_INLINE const void* libxsmm_trace_caller_id(unsigned int level) { /* must be inline */ #if defined(LIBXSMM_TRACE_CALLERID_GCCBUILTIN) switch (level) { # if 0 case 0: return __builtin_extract_return_addr(__builtin_return_address(0)); case 1: return __builtin_extract_return_addr(__builtin_return_address(1)); case 2: return __builtin_extract_return_addr(__builtin_return_address(2)); case 3: return __builtin_extract_return_addr(__builtin_return_address(3)); # else case 0: return __builtin_frame_address(1); case 1: return __builtin_frame_address(2); case 2: return __builtin_frame_address(3); case 3: return __builtin_frame_address(4); # endif default: #else { # if defined(_WIN32) if (0 == level) return _AddressOfReturnAddress(); else # endif #endif { const void* stacktrace[LIBXSMM_TRACE_CALLERID_MAXDEPTH]; const unsigned int n = libxsmm_backtrace(stacktrace, LIBXSMM_TRACE_CALLERID_MAXDEPTH, 0/*skip*/); return (level < n ? 
stacktrace[level] : NULL); } } } #if defined(LIBXSMM_TRACE_CALLERID_GCCBUILTIN) && !defined(__INTEL_COMPILER) # if defined(__clang__) # pragma clang diagnostic pop # elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 6) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) # pragma GCC diagnostic pop # endif #endif /** Returns the name of the function where libxsmm_trace is called from; thread-safe. */ LIBXSMM_API const char* libxsmm_trace_info( /* Query and output the abs. location in stacktrace (no input). */ unsigned int* depth, /* Query and output the thread id (no input). */ unsigned int* threadid, /* Filter for thread id (-1: all, NULL: libxsmm_trace_init). */ const int* filter_threadid, /* Lookup symbol (depth argument becomes relative to symbol position). */ const void* filter_symbol, /* Specify min. abs. position in stack trace (-1 or 0: all, NULL: libxsmm_trace_init). */ const int* filter_mindepth, /* Specify max. depth of stack trace (-1 or 0: all, NULL: libxsmm_trace_init). */ const int* filter_maxnsyms); /** Prints an entry of the function where libxsmm_trace is called from (indented/hierarchical). */ LIBXSMM_API void libxsmm_trace(FILE* stream, /* Filter for thread id (-1: all, NULL: libxsmm_trace_init). */ const int* filter_threadid, /* Lookup symbol (depth argument becomes relative to symbol position). */ const void* filter_symbol, /* Specify min. absolute pos. in stack trace (-1 or 0: all, NULL: libxsmm_trace_init). */ const int* filter_mindepth, /* Specify max. depth of stack trace (-1 or 0: all, NULL: libxsmm_trace_init). */ const int* filter_maxnsyms); #endif /*LIBXSMM_TRACE_H*/ libxsmm-1.17/src/libxsmm_xcopy.c000066400000000000000000000553621415223013700167560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include "libxsmm_xcopy.h" #include "libxsmm_main.h" #if !defined(LIBXSMM_MCOPY_JIT_TINY) && 0 # define LIBXSMM_MCOPY_JIT_TINY #endif /* definition of corresponding variables */ #if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_xcopy_jit); #endif LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_xcopy_taskscale); LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_mcopy_prefetch); LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_mcopy_mbytes); LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_mzero_mbytes); LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_tcopy_mbytes); LIBXSMM_APIVAR_PUBLIC_DEF(float libxsmm_mcopy_nscale); LIBXSMM_APIVAR_PUBLIC_DEF(float libxsmm_mzero_nscale); LIBXSMM_APIVAR_PUBLIC_DEF(float libxsmm_tcopy_nscale); LIBXSMM_API_INTERN void libxsmm_xcopy_init(int archid) { { /* setup tile sizes according to CPUID or environment */ if (LIBXSMM_X86_AVX512_CORE <= archid) { /* avx-512/core */ libxsmm_mcopy_prefetch = 0; libxsmm_mcopy_mbytes = 0; libxsmm_mcopy_nscale = 0.f; libxsmm_mzero_mbytes = 0; libxsmm_mzero_nscale = 0.f; libxsmm_tcopy_mbytes = 32768; libxsmm_tcopy_nscale = 0.f; } else if (LIBXSMM_X86_AVX512_MIC <= archid && LIBXSMM_X86_AVX512_CORE > archid) { libxsmm_mcopy_prefetch = 1; libxsmm_mcopy_mbytes = 0; libxsmm_mcopy_nscale = 0.f; libxsmm_mzero_mbytes = 0; libxsmm_mzero_nscale = 0.f; libxsmm_tcopy_mbytes = 32768; libxsmm_tcopy_nscale = 0.f; } else { /* avx2 */ libxsmm_mcopy_prefetch = 0; libxsmm_mcopy_mbytes = 0; libxsmm_mcopy_nscale = 0.f; libxsmm_mzero_mbytes = 8192; libxsmm_mzero_nscale = 0.f; libxsmm_tcopy_mbytes = 4096; libxsmm_tcopy_nscale = 0.f; } } { /* mcopy: load/adjust tile sizes (measured as if DP) */ const char* const 
env_m = getenv("LIBXSMM_MCOPY_M"), * const env_n = getenv("LIBXSMM_MCOPY_N"); const int m = ((NULL == env_m || 0 == *env_m) ? 0 : atoi(env_m)); const int n = ((NULL == env_n || 0 == *env_n) ? 0 : atoi(env_n)); if (0 < m) libxsmm_mcopy_mbytes = LIBXSMM_MAX(m, 1) * 8/*DP*/; if (0 != libxsmm_mcopy_mbytes && 0 != libxsmm_mcopy_nscale) { if (0 < n) libxsmm_mcopy_nscale = ((float)(n * 8/*DP*/)) / libxsmm_mcopy_mbytes; if (1 > (libxsmm_mcopy_nscale * libxsmm_mcopy_mbytes)) { const float stretch = 1.f / libxsmm_mcopy_mbytes; libxsmm_mcopy_nscale = LIBXSMM_MAX(stretch, libxsmm_mcopy_nscale); } } } { /* mzero: load/adjust tile sizes (measured as if DP) */ const char* const env_m = getenv("LIBXSMM_MZERO_M"), * const env_n = getenv("LIBXSMM_MZERO_N"); const int m = ((NULL == env_m || 0 == *env_m) ? 0 : atoi(env_m)); const int n = ((NULL == env_n || 0 == *env_n) ? 0 : atoi(env_n)); if (0 < m) libxsmm_mzero_mbytes = LIBXSMM_MAX(m, 1) * 8/*DP*/; if (0 != libxsmm_mzero_mbytes && 0 != libxsmm_mzero_nscale) { if (0 < n) libxsmm_mzero_nscale = ((float)(n * 8/*DP*/)) / libxsmm_mzero_mbytes; if (1 > (libxsmm_mzero_nscale * libxsmm_mzero_mbytes)) { const float stretch = 1.f / libxsmm_mzero_mbytes; libxsmm_mzero_nscale = LIBXSMM_MAX(stretch, libxsmm_mzero_nscale); } } } { /* tcopy: load/adjust tile sizes (measured as if DP) */ const char* const env_m = getenv("LIBXSMM_TCOPY_M"), * const env_n = getenv("LIBXSMM_TCOPY_N"); const int m = ((NULL == env_m || 0 == *env_m) ? 0 : atoi(env_m)); const int n = ((NULL == env_n || 0 == *env_n) ? 
0 : atoi(env_n)); if (0 < m) libxsmm_tcopy_mbytes = LIBXSMM_MAX(m, 1) * 8/*DP*/; if (0 != libxsmm_tcopy_mbytes && 0 != libxsmm_tcopy_nscale) { if (0 < n) libxsmm_tcopy_nscale = ((float)(n * 8/*DP*/)) / libxsmm_tcopy_mbytes; if (1 > (libxsmm_tcopy_nscale * libxsmm_tcopy_mbytes)) { const float stretch = 1.f / libxsmm_tcopy_mbytes; libxsmm_tcopy_nscale = LIBXSMM_MAX(stretch, libxsmm_tcopy_nscale); } } } #if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) { /* check if JIT-code generation is permitted */ const char *const env_jit = getenv("LIBXSMM_XCOPY_JIT"); libxsmm_xcopy_jit = ((NULL == env_jit || 0 == *env_jit) ? (LIBXSMM_XCOPY_JIT) : atoi(env_jit)); # if defined(LIBXSMM_XCOPY_MELTW) if (LIBXSMM_X86_AVX512_CORE > archid) libxsmm_xcopy_jit &= ~2; # endif } #endif { /* determines if OpenMP tasks are used (when available) */ const char *const env_t = getenv("LIBXSMM_XCOPY_TASKS"); libxsmm_xcopy_taskscale = ((NULL == env_t || 0 == *env_t) ? 0/*disabled*/ : (LIBXSMM_XCOPY_TASKSCALE * atoi(env_t))); } } LIBXSMM_API_INTERN void libxsmm_xcopy_finalize(void) { } LIBXSMM_API void libxsmm_matcopy_thread_internal(void* out, const void* in, unsigned int typesize, unsigned int m, unsigned int n, unsigned int ldi, unsigned int ldo, unsigned int km, unsigned int kn, libxsmm_xcopykernel kernel, int tid, int nthreads) { const unsigned int tm = (0 == km ? m : km); const unsigned int tn = (0 == kn ? 
LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n) : kn); const int mtasks = LIBXSMM_UPDIV(m, tm); unsigned int m0, m1, n0, n1; LIBXSMM_ASSERT_MSG(tid < nthreads && 0 < nthreads, "Invalid task setup"); LIBXSMM_ASSERT_MSG(tm <= m && tn <= n, "Invalid problem size"); LIBXSMM_ASSERT_MSG(0 < tm && 0 < tn, "Invalid tile size"); LIBXSMM_ASSERT_MSG(typesize <= 255, "Invalid type-size"); LIBXSMM_ASSERT(0 < mtasks); if (nthreads <= mtasks) { /* parallelized over M */ const unsigned int mt = LIBXSMM_UPDIV(m, nthreads); m0 = LIBXSMM_MIN(tid * mt, m); m1 = LIBXSMM_MIN(m0 + mt, m); n0 = 0; n1 = n; } else { /* parallelized over M and N */ const int ntasks = nthreads / mtasks; const int mtid = tid / ntasks, ntid = tid - mtid * ntasks; const unsigned int nt = LIBXSMM_UP(LIBXSMM_UPDIV(n, ntasks), tn) ; m0 = LIBXSMM_MIN(mtid * tm, m); m1 = LIBXSMM_MIN(m0 + tm, m); n0 = LIBXSMM_MIN(ntid * nt, n); n1 = LIBXSMM_MIN(n0 + nt, n); } LIBXSMM_ASSERT_MSG(m0 <= m1 && m1 <= m, "Invalid task size"); LIBXSMM_ASSERT_MSG(n0 <= n1 && n1 <= n, "Invalid task size"); if (NULL != in) { /* copy-kernel */ libxsmm_matcopy_internal(out, in, typesize, ldi, ldo, m0, m1, n0, n1, tm, tn, kernel); } else { libxsmm_matzero_internal(out, typesize, ldo, m0, m1, n0, n1, tm, tn, kernel); } } LIBXSMM_API void libxsmm_otrans_thread_internal(void* out, const void* in, unsigned int typesize, unsigned int m, unsigned int n, unsigned int ldi, unsigned int ldo, unsigned int km, unsigned int kn, libxsmm_xcopykernel kernel, int tid, int nthreads) { const unsigned int tm = (0 == km ? m : km); const unsigned int tn = (0 == kn ? 
LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n) : kn); const int mtasks = LIBXSMM_UPDIV(m, tm); unsigned int m0, m1, n0, n1; LIBXSMM_ASSERT_MSG(tid < nthreads && 0 < nthreads, "Invalid task setup"); LIBXSMM_ASSERT_MSG(tm <= m && tn <= n, "Invalid problem size"); LIBXSMM_ASSERT_MSG(0 < tm && 0 < tn, "Invalid tile size"); LIBXSMM_ASSERT_MSG(typesize <= 255, "Invalid type-size"); LIBXSMM_ASSERT(0 < mtasks); if (nthreads <= mtasks) { /* parallelized over M */ const unsigned int mt = LIBXSMM_UPDIV(m, nthreads); m0 = LIBXSMM_MIN(tid * mt, m); m1 = LIBXSMM_MIN(m0 + mt, m); n0 = 0; n1 = n; } else { /* parallelized over M and N */ const int ntasks = nthreads / mtasks; const int mtid = tid / ntasks, ntid = tid - mtid * ntasks; const unsigned int nt = LIBXSMM_UP(LIBXSMM_UPDIV(n, ntasks), tn); m0 = LIBXSMM_MIN(mtid * tm, m); m1 = LIBXSMM_MIN(m0 + tm, m); n0 = LIBXSMM_MIN(ntid * nt, n); n1 = LIBXSMM_MIN(n0 + nt, n); } LIBXSMM_ASSERT_MSG(m0 <= m1 && m1 <= m, "Invalid task size"); LIBXSMM_ASSERT_MSG(n0 <= n1 && n1 <= n, "Invalid task size"); libxsmm_otrans_internal(out, in, typesize, ldi, ldo, m0, m1, n0, n1, tm, tn, kernel); } LIBXSMM_API_INTERN void libxsmm_matcopy_internal(void* out, const void* in, unsigned int typesize, unsigned int ldi, unsigned int ldo, unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1, unsigned int tm, unsigned int tn, libxsmm_xcopykernel kernel) { LIBXSMM_ASSERT(NULL != in); if (NULL != kernel.ptr) { const libxsmm_descriptor* desc; libxsmm_code_pointer code; code.ptr_const = kernel.ptr; LIBXSMM_EXPECT_NOT(NULL, libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/)); LIBXSMM_ASSERT(NULL != desc); #if defined(LIBXSMM_XCOPY_MELTW) LIBXSMM_ASSERT(LIBXSMM_KERNEL_KIND_MELTW == desc->kind); #else LIBXSMM_ASSERT(LIBXSMM_KERNEL_KIND_MCOPY == desc->kind); if (0 != desc->mcopy.desc.prefetch) { LIBXSMM_XCOPY(LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL_PF, kernel, out, in, typesize, ldi, ldo, tm, tn, m0, m1, n0, n1); return; } #endif } 
LIBXSMM_XCOPY(LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL, kernel, out, in, typesize, ldi, ldo, tm, tn, m0, m1, n0, n1); } LIBXSMM_API_INTERN void libxsmm_matzero_internal(void* out, unsigned int typesize, unsigned int ldo, unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1, unsigned int tm, unsigned int tn, libxsmm_xcopykernel kernel) { /* coverity[ptr_arith] */ LIBXSMM_XCOPY(LIBXSMM_MZERO_KERNEL, LIBXSMM_MZERO_CALL, kernel, out, NULL, typesize, 0, ldo, tm, tn, m0, m1, n0, n1); } LIBXSMM_API_INTERN void libxsmm_otrans_internal(void* out, const void* in, unsigned int typesize, unsigned int ldi, unsigned int ldo, unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1, unsigned int tm, unsigned int tn, libxsmm_xcopykernel kernel) { LIBXSMM_ASSERT(NULL != in); LIBXSMM_XCOPY(LIBXSMM_TCOPY_KERNEL, LIBXSMM_TCOPY_CALL, kernel, out, in, typesize, ldi, ldo, tm, tn, m0, m1, n0, n1); } LIBXSMM_API void libxsmm_matcopy_thread(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo, int tid, int nthreads) { LIBXSMM_INIT if (0 < typesize && 256 > typesize && m <= ldi && m <= ldo && out != in && ((NULL != out && 0 < m && 0 < n) || (0 == m && 0 == n)) && /* use (signed) integer types, but check sanity of input */ 0 <= tid && tid < nthreads) { if (0 < m && 0 < n) { unsigned int tm, tn, ts; #if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) && !defined(LIBXSMM_XCOPY_MELTW) int prefetch = 0; #endif libxsmm_xcopykernel kernel; kernel.ptr = NULL; if (NULL != in) { /* mcopy */ #if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) && !defined(LIBXSMM_XCOPY_MELTW) prefetch = libxsmm_mcopy_prefetch; #endif tm = LIBXSMM_UPDIV(libxsmm_mcopy_mbytes, typesize); tn = (unsigned int)(libxsmm_mcopy_nscale * tm); ts = libxsmm_mcopy_mbytes; } else { /* mzero */ tm = LIBXSMM_UPDIV(libxsmm_mzero_mbytes, typesize); tn = (unsigned int)(libxsmm_mzero_nscale * tm); ts = libxsmm_mzero_mbytes; 
} if (0 == tm) tm = m; if (0 == tn) tn = LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n); if (0 != ts && ts < (tm * tn * typesize)) { tm = LIBXSMM_MAX(ts / (tn * typesize), LIBXSMM_XCOPY_TILE_MIN); } if ((unsigned int)m < tm || (unsigned int)n < tn) { if (1 == nthreads) { tm = (unsigned int)m; tn = (unsigned int)n; } else { const unsigned int tasksize = (((unsigned int)m) * (unsigned int)n) / ((unsigned int)(nthreads * libxsmm_mcopy_nscale)); const unsigned int nn = libxsmm_isqrt_u32(tasksize); const unsigned int mm = (unsigned int)(libxsmm_mcopy_nscale * nn); tn = LIBXSMM_CLMP((unsigned int)n, 1, nn); tm = LIBXSMM_CLMP((unsigned int)m, 1, mm); } } #if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) # if !defined(LIBXSMM_MCOPY_JIT_TINY) else # endif if (0 != (2 & libxsmm_xcopy_jit)) { /* JIT'ted matrix-copy permitted? */ # if defined(LIBXSMM_XCOPY_MELTW) const libxsmm_blasint sldi = ldi * typesize, sldo = ldo * typesize; if (NULL != in) { /* mcopy */ kernel.meltw_copy = libxsmm_dispatch_meltw_copy( (libxsmm_blasint)tm * typesize, (libxsmm_blasint)tn * typesize, &sldi, &sldo, LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8); } else { /* mzero */ kernel.meltw_zero = libxsmm_dispatch_meltw_zero( (libxsmm_blasint)tm * typesize, (libxsmm_blasint)tn * typesize, &sldi, &sldo, LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8); } # else libxsmm_descriptor_blob blob; kernel.xmcopy = libxsmm_dispatch_mcopy(libxsmm_mcopy_descriptor_init(&blob, typesize, tm, tn, (unsigned int)ldo, (unsigned int)ldi, NULL != in ? 
LIBXSMM_MATCOPY_FLAG_DEFAULT : LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE, prefetch, NULL/*default unroll*/)); # endif } #endif libxsmm_matcopy_thread_internal(out, in, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, tm, tn, kernel, tid, nthreads); } } else { static int error_once = 0; if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { if (0 > tid || tid >= nthreads) { fprintf(stderr, "LIBXSMM ERROR: the matrix-copy thread-id or number of threads is incorrect!\n"); } else if (NULL == out) { fprintf(stderr, "LIBXSMM ERROR: the matrix-copy input and/or output is NULL!\n"); } else if (out == in) { fprintf(stderr, "LIBXSMM ERROR: output and input of the matrix-copy must be different!\n"); } else if (0 == typesize || 256 <= typesize) { fprintf(stderr, "LIBXSMM ERROR: invalid type-size for matrix-copy specified!\n"); } else if (ldi < m || ldo < m) { fprintf(stderr, "LIBXSMM ERROR: the leading dimension(s) of the matrix-copy is/are too small!\n"); } else if (0 > m || 0 > n) { fprintf(stderr, "LIBXSMM ERROR: the matrix extent(s) of the matrix-copy is/are negative!\n"); } } } } LIBXSMM_API void libxsmm_matcopy(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) { libxsmm_matcopy_thread(out, in, typesize, m, n, ldi, ldo, 0/*tid*/, 1/*nthreads*/); } LIBXSMM_API void libxsmm_otrans_thread(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo, int tid, int nthreads) { static int error_once = 0; LIBXSMM_INIT if (0 < typesize && 256 > typesize && m <= ldi && n <= ldo && ((NULL != out && NULL != in && 0 < m && 0 < n) || (0 == m && 0 == n)) && /* use (signed) integer types, but check sanity of input */ 0 <= tid && tid < nthreads) { if (0 < m && 0 < n) { if (out != in) { unsigned int tm = 
LIBXSMM_UPDIV(libxsmm_tcopy_mbytes, typesize); unsigned int tn = (unsigned int)(libxsmm_tcopy_nscale * tm); libxsmm_xcopykernel kernel; kernel.ptr = NULL; if (0 == tm) tm = m; if (0 == tn) tn = LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n); if (0 != libxsmm_tcopy_mbytes && libxsmm_tcopy_mbytes < (tm * tn * typesize)) { tm = LIBXSMM_MAX(libxsmm_tcopy_mbytes / (tn * typesize), LIBXSMM_XCOPY_TILE_MIN); } if ((unsigned int)m < tm || (unsigned int)n < tn) { if (1 == nthreads) { #if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) libxsmm_descriptor_blob blob; if (0 != (1 & libxsmm_xcopy_jit) /* JIT'ted transpose permitted? */ && NULL != (kernel.xtrans = libxsmm_dispatch_trans( /* JIT-kernel available? */ libxsmm_trans_descriptor_init(&blob, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldo)))) { LIBXSMM_TCOPY_CALL(kernel, typesize, in, ldi, out, ldo); return; /* fast path */ } LIBXSMM_ASSERT(NULL == kernel.ptr); #endif tm = (unsigned int)m; tn = (unsigned int)n; } else { const unsigned int tasksize = (((unsigned int)m) * (unsigned int)n) / ((unsigned int)(nthreads * libxsmm_tcopy_nscale)); const unsigned int nn = libxsmm_isqrt_u32(tasksize); const unsigned int mm = (unsigned int)(libxsmm_tcopy_nscale * nn); tn = LIBXSMM_CLMP((unsigned int)n, 1, nn); tm = LIBXSMM_CLMP((unsigned int)m, 1, mm); #if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) { const libxsmm_trans_descriptor* desc; libxsmm_descriptor_blob blob; if (0 != (1 & libxsmm_xcopy_jit) /* JIT'ted transpose permitted? 
*/ && NULL != (desc = libxsmm_trans_descriptor_init(&blob, typesize, tm, tn, (unsigned int)ldo))) { kernel.xtrans = libxsmm_dispatch_trans(desc); } } #endif } } libxsmm_otrans_thread_internal(out, in, typesize, (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, tm, tn, kernel, tid, nthreads); } else if (ldi == ldo) { libxsmm_itrans(out, typesize, m, n, ldi); } else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: output and input of the transpose must be different!\n"); } } } else { if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { if (0 > tid || tid >= nthreads) { fprintf(stderr, "LIBXSMM ERROR: the transpose thread-id or number of threads is incorrect!\n"); } else if (NULL == out || NULL == in) { fprintf(stderr, "LIBXSMM ERROR: the transpose input and/or output is NULL!\n"); } else if (out == in) { fprintf(stderr, "LIBXSMM ERROR: output and input of the transpose must be different!\n"); } else if (0 == typesize || 256 <= typesize) { fprintf(stderr, "LIBXSMM ERROR: invalid type-size for matrix-transpose specified!\n"); } else if (ldi < m || ldo < n) { fprintf(stderr, "LIBXSMM ERROR: the leading dimension(s) of the transpose is/are too small!\n"); } else if (0 > m || 0 > n) { fprintf(stderr, "LIBXSMM ERROR: the matrix extent(s) of the transpose is/are negative!\n"); } } } } LIBXSMM_API void libxsmm_otrans(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) { libxsmm_otrans_thread(out, in, typesize, m, n, ldi, ldo, 0/*tid*/, 1/*nthreads*/); } LIBXSMM_API void libxsmm_itrans(void* inout, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld) { static int error_once = 0; LIBXSMM_INIT if (NULL != inout && 0 < typesize && 
typesize <= 127 && m <= ld && LIBXSMM_MAX(n, 1) <= ld) { const signed char c = (signed char)typesize; libxsmm_blasint i, j; if (m == n) { for (i = 0; i < m; ++i) { for (j = 0; j < i; ++j) { char *const a = &((char*)inout)[(i*ld+j)*typesize]; char *const b = &((char*)inout)[(j*ld+i)*typesize]; signed char k = 0; for (; k < c; ++k) LIBXSMM_ISWAP(a[k], b[k]); } } } else { if ( 0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: in-place transpose is not implemented!\n"); } LIBXSMM_ASSERT_MSG(0, "in-place transpose is not implemented!"); } } else if (0 != libxsmm_verbosity /* library code is expected to be mute */ && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { fprintf(stderr, "LIBXSMM ERROR: unsupported or invalid arguments for in-place transpose!\n"); } } #if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matcopy)(void* /*out*/, const void* /*in*/, const int* /*typesize*/, const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*ldi*/, const libxsmm_blasint* /*ldo*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matcopy)(void* out, const void* in, const int* typesize, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo) { libxsmm_blasint ldx; LIBXSMM_ASSERT(NULL != typesize && 0 < *typesize && NULL != m); ldx = *(NULL != ldi ? ldi : m); libxsmm_matcopy(out, in, (unsigned int)*typesize, *m, *(NULL != n ? n : m), ldx, NULL != ldo ? 
*ldo : ldx); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_otrans)(void* /*out*/, const void* /*in*/, const int* /*typesize*/, const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*ldi*/, const libxsmm_blasint* /*ldo*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_otrans)(void* out, const void* in, const int* typesize, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo) { libxsmm_blasint ldx; LIBXSMM_ASSERT(NULL != typesize && 0 < *typesize && NULL != m); ldx = *(NULL != ldi ? ldi : m); libxsmm_otrans(out, in, (unsigned int)*typesize, *m, *(NULL != n ? n : m), ldx, NULL != ldo ? *ldo : ldx); } /* implementation provided for Fortran 77 compatibility */ LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_itrans)(void* /*inout*/, const int* /*typesize*/, const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*ld*/); LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_itrans)(void* inout, const int* typesize, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* ld) { LIBXSMM_ASSERT(NULL != typesize && 0 < *typesize && NULL != m); libxsmm_itrans(inout, (unsigned int)*typesize, *m, *(NULL != n ? n : m), *(NULL != ld ? ld : m)); } #endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ libxsmm-1.17/src/libxsmm_xcopy.h000066400000000000000000000326701415223013700167600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #ifndef LIBXSMM_XCOPY_H #define LIBXSMM_XCOPY_H #include #if !defined(LIBXSMM_XCOPY_CHECK) && !defined(NDEBUG) # define LIBXSMM_XCOPY_CHECK #endif #if !defined(LIBXSMM_XCOPY_TASKSCALE) # define LIBXSMM_XCOPY_TASKSCALE 2 #endif #if !defined(LIBXSMM_XCOPY_TILE_MIN) # define LIBXSMM_XCOPY_TILE_MIN 2 #endif #if !defined(LIBXSMM_XCOPY_MELTW) && 0 # define LIBXSMM_XCOPY_MELTW #endif /* 0: none, 1: transpose, 2: matcopy, 3: transpose+matcopy */ #if !defined(LIBXSMM_XCOPY_JIT) # if (defined(_WIN32) || defined(__CYGWIN__)) /* only enable matcopy code generation (workaround issue with taking GP registers correctly) */ # define LIBXSMM_XCOPY_JIT 0 # elif defined(LIBXSMM_XCOPY_MELTW) # define LIBXSMM_XCOPY_JIT 3 # else # define LIBXSMM_XCOPY_JIT 1 # endif #endif /* kernel uses consecutive stores */ #define LIBXSMM_MZERO_KERNEL(TYPE, TYPESIZE, OUT, IN, LDI, LDO, INDEX_I, INDEX_J, SRC, DST) \ static /*const*/ TYPE libxsmm_mzero_kernel_src_value_ /* zero */; \ const TYPE *const SRC = &libxsmm_mzero_kernel_src_value_; \ TYPE *const DST = (TYPE*)(((char*)(OUT)) + (TYPESIZE) * ((size_t)(INDEX_I) * (LDO) + (INDEX_J))) /* kernel uses consecutive stores and consecutive loads (copy) */ #define LIBXSMM_MCOPY_KERNEL(TYPE, TYPESIZE, OUT, IN, LDI, LDO, INDEX_I, INDEX_J, SRC, DST) \ const TYPE *const SRC = (const TYPE*)(((const char*) (IN)) + (TYPESIZE) * ((size_t)(INDEX_I) * (LDI) + (INDEX_J))); \ TYPE *const DST = ( TYPE*)((( char*)(OUT)) + (TYPESIZE) * ((size_t)(INDEX_I) * (LDO) + (INDEX_J))) #if defined(LIBXSMM_XCOPY_MELTW) # define LIBXSMM_MZERO_CALL(KERNEL, TYPESIZE, SRC, LDI, DST, LDO) { \ libxsmm_meltw_zero_param libxsmm_mzero_call_args_; \ libxsmm_mzero_call_args_.in_ptr = (SRC); \ libxsmm_mzero_call_args_.out_ptr = (DST); \ (KERNEL).meltw_zero(&libxsmm_mzero_call_args_); \ } # define LIBXSMM_MCOPY_CALL(KERNEL, TYPESIZE, SRC, LDI, DST, LDO) { \ libxsmm_meltw_copy_param libxsmm_mcopy_call_args_; \ 
libxsmm_mcopy_call_args_.in_ptr = (SRC); \ libxsmm_mcopy_call_args_.out_ptr = (DST); \ (KERNEL).meltw_copy(&libxsmm_mcopy_call_args_); \ } # define LIBXSMM_MCOPY_CALL_PF(KERNEL, TYPESIZE, SRC, LDI, DST, LDO) \ LIBXSMM_MCOPY_CALL(KERNEL, TYPESIZE, SRC, LDI, DST, LDO) #else /* call JIT-kernel (matrix-copy with prefetch) */ # define LIBXSMM_MZERO_CALL(KERNEL, TYPESIZE, SRC, LDI, DST, LDO) { \ const unsigned int libxsmm_mzero_call_uldo_ = (unsigned int)(LDO); \ (KERNEL).xmcopy(SRC, &libxsmm_mzero_call_uldo_, DST, &libxsmm_mzero_call_uldo_); \ } /* call JIT-kernel (matrix-copy) */ # define LIBXSMM_MCOPY_CALL(KERNEL, TYPESIZE, SRC, LDI, DST, LDO) { \ const unsigned int libxsmm_mcopy_call_nopf_uldi_ = (unsigned int)(LDI); \ const unsigned int libxsmm_mcopy_call_nopf_uldo_ = (unsigned int)(LDO); \ (KERNEL).xmcopy(SRC, &libxsmm_mcopy_call_nopf_uldi_, DST, &libxsmm_mcopy_call_nopf_uldo_); \ } /* call JIT-kernel (matrix-copy with prefetch) */ # define LIBXSMM_MCOPY_CALL_PF(KERNEL, TYPESIZE, SRC, LDI, DST, LDO) { \ const unsigned int libxsmm_mcopy_call_uldi_ = (unsigned int)(LDI); \ const unsigned int libxsmm_mcopy_call_uldo_ = (unsigned int)(LDO); \ (KERNEL).xmcopy(SRC, &libxsmm_mcopy_call_uldi_, DST, &libxsmm_mcopy_call_uldo_, \ /*prefetch next line*/((const char*)(SRC)) + (TYPESIZE) * (size_t)(LDI)); \ } #endif /* kernel uses consecutive stores and strided loads (transpose) */ #define LIBXSMM_TCOPY_KERNEL(TYPE, TYPESIZE, OUT, IN, LDI, LDO, INDEX_I, INDEX_J, SRC, DST) \ const TYPE *const SRC = (const TYPE*)(((const char*) (IN)) + (TYPESIZE) * ((size_t)(INDEX_J) * (LDI) + (INDEX_I))); \ TYPE *const DST = ( TYPE*)((( char*)(OUT)) + (TYPESIZE) * ((size_t)(INDEX_I) * (LDO) + (INDEX_J))) /* call JIT-kernel (transpose) */ #define LIBXSMM_TCOPY_CALL(KERNEL, TYPESIZE, SRC, LDI, DST, LDO) { \ const unsigned int libxsmm_tcopy_call_uldi_ = (unsigned int)(LDI); \ const unsigned int libxsmm_tcopy_call_uldo_ = (unsigned int)(LDO); \ (KERNEL).xtrans(SRC, &libxsmm_tcopy_call_uldi_, DST, 
&libxsmm_tcopy_call_uldo_); \ } #define LIBXSMM_XCOPY_LOOP(TYPE, TYPESIZE, XKERNEL, OUT, IN, LDI, LDO, M0, M1, N0, N1) { \ libxsmm_blasint libxsmm_xcopy_loop_i_, libxsmm_xcopy_loop_j_; \ for (libxsmm_xcopy_loop_i_ = M0; libxsmm_xcopy_loop_i_ < (libxsmm_blasint)(M1); ++libxsmm_xcopy_loop_i_) { \ LIBXSMM_PRAGMA_NONTEMPORAL(OUT) \ for (libxsmm_xcopy_loop_j_ = N0; libxsmm_xcopy_loop_j_ < (libxsmm_blasint)(N1); ++libxsmm_xcopy_loop_j_) { \ XKERNEL(TYPE, TYPESIZE, OUT, IN, LDI, LDO, libxsmm_xcopy_loop_i_, libxsmm_xcopy_loop_j_, \ libxsmm_xcopy_loop_src_, libxsmm_xcopy_loop_dst_); *libxsmm_xcopy_loop_dst_ = *libxsmm_xcopy_loop_src_; \ } \ } \ } #define LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) { \ switch(TYPESIZE) { \ case 2: { \ LIBXSMM_XCOPY_LOOP(short, 2, XKERNEL, OUT, IN, LDI, LDO, M0, M1, N0, N1); \ } break; \ case 4: { \ LIBXSMM_XCOPY_LOOP(float, 4, XKERNEL, OUT, IN, LDI, LDO, M0, M1, N0, N1); \ } break; \ case 8: { \ LIBXSMM_XCOPY_LOOP(double, 8, XKERNEL, OUT, IN, LDI, LDO, M0, M1, N0, N1); \ } break; \ case 16: { \ typedef struct /*libxsmm_xcopy_tile_elem_t*/ { double value[2]; } libxsmm_xcopy_tile_elem_t; \ LIBXSMM_XCOPY_LOOP(libxsmm_xcopy_tile_elem_t, 16, XKERNEL, OUT, IN, LDI, LDO, M0, M1, N0, N1); \ } break; \ default: { /* generic type-size */ \ libxsmm_blasint libxsmm_xcopy_tile_i_, libxsmm_xcopy_tile_j_; \ for (libxsmm_xcopy_tile_i_ = M0; libxsmm_xcopy_tile_i_ < (libxsmm_blasint)(M1); ++libxsmm_xcopy_tile_i_) { \ for (libxsmm_xcopy_tile_j_ = N0; libxsmm_xcopy_tile_j_ < (libxsmm_blasint)(N1); ++libxsmm_xcopy_tile_j_) { \ XKERNEL(char, TYPESIZE, OUT, IN, LDI, LDO, libxsmm_xcopy_tile_i_, libxsmm_xcopy_tile_j_, \ libxsmm_xcopy_tile_src_, libxsmm_xcopy_tile_dst_); \ LIBXSMM_MEMCPY127_LOOP(libxsmm_xcopy_tile_dst_, libxsmm_xcopy_tile_src_, TYPESIZE, LIBXSMM_PRAGMA_NONTEMPORAL); \ } \ } \ } \ } \ } #define LIBXSMM_MZERO_KERNEL_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) \ LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, 
LDI, LDO, N0, N1, M0, M1) #define LIBXSMM_MCOPY_KERNEL_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) \ LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, N0, N1, M0, M1) #define LIBXSMM_TCOPY_KERNEL_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) \ LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) #define LIBXSMM_XCOPY_NONJIT(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) \ LIBXSMM_CONCATENATE(XKERNEL,_TILE)(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) #if 1 # define LIBXSMM_XCOPY_PRECOND(COND) #else # define LIBXSMM_XCOPY_PRECOND(COND) COND #endif #define LIBXSMM_XCOPY_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) { \ libxsmm_blasint libxsmm_xcopy_i_ = M0, libxsmm_xcopy_j_ = N0; \ LIBXSMM_ASSERT_MSG(0 < (TILE_M) && 0 < (TILE_N), "XCOPY cannot make progress"); \ if (NULL != (KERNEL).ptr) { /* inner tiles with JIT */ \ for (; libxsmm_xcopy_i_ < (((libxsmm_blasint)M1) - ((libxsmm_blasint)TILE_M) + 1); libxsmm_xcopy_i_ += TILE_M) { \ for (libxsmm_xcopy_j_ = N0; libxsmm_xcopy_j_ < (((libxsmm_blasint)N1) - ((libxsmm_blasint)TILE_N) + 1); libxsmm_xcopy_j_ += TILE_N) { \ XKERNEL(char, TYPESIZE, OUT, IN, LDI, LDO, libxsmm_xcopy_i_, libxsmm_xcopy_j_, libxsmm_xcopy_src_, libxsmm_xcopy_dst_); \ KERNEL_CALL(KERNEL, TYPESIZE, libxsmm_xcopy_src_, LDI, libxsmm_xcopy_dst_, LDO); \ } \ } \ } \ else { /* inner tiles without JIT */ \ for (; libxsmm_xcopy_i_ < (((libxsmm_blasint)M1) - ((libxsmm_blasint)TILE_M) + 1); libxsmm_xcopy_i_ += TILE_M) { \ for (libxsmm_xcopy_j_ = N0; libxsmm_xcopy_j_ < (((libxsmm_blasint)N1) - ((libxsmm_blasint)TILE_N) + 1); libxsmm_xcopy_j_ += TILE_N) { \ LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, \ libxsmm_xcopy_i_, libxsmm_xcopy_i_ + (TILE_M), \ libxsmm_xcopy_j_, libxsmm_xcopy_j_ + (TILE_N)); \ } \ } \ } \ { /* remainder/border tiles */ \ LIBXSMM_XCOPY_PRECOND(if (libxsmm_xcopy_j_ < ((libxsmm_blasint)N1))) { \ for 
(libxsmm_xcopy_i_ = M0; libxsmm_xcopy_i_ < (((libxsmm_blasint)M1) - ((libxsmm_blasint)TILE_M) + 1); libxsmm_xcopy_i_ += TILE_M) { \ LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, \ libxsmm_xcopy_i_, libxsmm_xcopy_i_ + (TILE_M), \ libxsmm_xcopy_j_, N1); \ } \ } \ LIBXSMM_XCOPY_PRECOND(if (libxsmm_xcopy_i_ < ((libxsmm_blasint)M1))) { \ for (libxsmm_xcopy_j_ = N0; libxsmm_xcopy_j_ < (((libxsmm_blasint)N1) - ((libxsmm_blasint)TILE_N)); libxsmm_xcopy_j_ += TILE_N) { \ LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, \ libxsmm_xcopy_i_, M1, \ libxsmm_xcopy_j_, libxsmm_xcopy_j_ + (TILE_N)); \ } \ } \ LIBXSMM_XCOPY_PRECOND(if (libxsmm_xcopy_i_ < ((libxsmm_blasint)M1) && libxsmm_xcopy_j_ < ((libxsmm_blasint)N1))) { \ LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, \ libxsmm_xcopy_i_, M1, \ libxsmm_xcopy_j_, N1); \ } \ } \ } #define LIBXSMM_MZERO_KERNEL_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) \ LIBXSMM_XCOPY_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_N, TILE_M, N0, N1, M0, M1) #define LIBXSMM_MCOPY_KERNEL_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) \ LIBXSMM_XCOPY_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_N, TILE_M, N0, N1, M0, M1) #define LIBXSMM_TCOPY_KERNEL_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) \ LIBXSMM_XCOPY_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) #define LIBXSMM_XCOPY(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) \ LIBXSMM_CONCATENATE(XKERNEL,_TILES)(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_xcopykernel { libxsmm_meltwfunction_copy meltw_copy; libxsmm_meltwfunction_zero meltw_zero; libxsmm_xmcopyfunction xmcopy; 
libxsmm_xtransfunction xtrans; const void* ptr; } libxsmm_xcopykernel; /** Initializes the transpose functionality; NOT thread-safe. */ LIBXSMM_API_INTERN void libxsmm_xcopy_init(int archid); /** Finalizes the transpose functionality; NOT thread-safe. */ LIBXSMM_API_INTERN void libxsmm_xcopy_finalize(void); LIBXSMM_API void libxsmm_matcopy_thread_internal(void* out, const void* in, unsigned int typesize, unsigned int m, unsigned int n, unsigned int ldi, unsigned int ldo, unsigned int km, unsigned int kn, libxsmm_xcopykernel kernel, int tid, int nthreads); LIBXSMM_API void libxsmm_otrans_thread_internal(void* out, const void* in, unsigned int typesize, unsigned int m, unsigned int n, unsigned int ldi, unsigned int ldo, unsigned int km, unsigned int kn, libxsmm_xcopykernel kernel, int tid, int nthreads); LIBXSMM_API_INTERN void libxsmm_matcopy_internal(void* out, const void* in, unsigned int typesize, unsigned int ldi, unsigned int ldo, unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1, unsigned int tm, unsigned int tn, libxsmm_xcopykernel kernel); LIBXSMM_API_INTERN void libxsmm_matzero_internal(void* out, unsigned int typesize, unsigned int ldo, unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1, unsigned int tm, unsigned int tn, libxsmm_xcopykernel kernel); LIBXSMM_API_INTERN void libxsmm_otrans_internal(void* out, const void* in, unsigned int typesize, unsigned int ldi, unsigned int ldo, unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1, unsigned int tm, unsigned int tn, libxsmm_xcopykernel kernel); #if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) /** Determines whether JIT-kernels are used or not; values see LIBXSMM_XCOPY_JIT. */ LIBXSMM_APIVAR_PUBLIC(int libxsmm_xcopy_jit); # if !defined(LIBXSMM_XCOPY_MELTW) /** Targeted default prefetch */ LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_mcopy_prefetch); # endif #endif /** Determines if OpenMP tasks are used, and scales beyond the number of threads. 
*/ LIBXSMM_APIVAR_PUBLIC(int libxsmm_xcopy_taskscale); /** M-extent of type-size in Byte. */ LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_mcopy_mbytes); LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_mzero_mbytes); LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_tcopy_mbytes); /** M-factor shaping the N-extent. */ LIBXSMM_APIVAR_PUBLIC(float libxsmm_mcopy_nscale); LIBXSMM_APIVAR_PUBLIC(float libxsmm_mzero_nscale); LIBXSMM_APIVAR_PUBLIC(float libxsmm_tcopy_nscale); #endif /*LIBXSMM_XCOPY_H*/ libxsmm-1.17/src/perf_jitdump.h000066400000000000000000000066231415223013700165520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Maciej Debski (Google Inc.) 
******************************************************************************/ #ifndef PERF_JITDUMP_H #define PERF_JITDUMP_H #if defined(PERF_JITDUMP_NOLIBXSMM) # define LIBXSMM_RETARGETABLE # define LIBXSMM_EXTERN_C # define PERF_JITDUMP_GLOBAL_VARIABLE(VARIABLE, INIT) VARIABLE = (INIT) #else # include # define PERF_JITDUMP_GLOBAL_VARIABLE(VARIABLE, INIT) LIBXSMM_APIVAR_PRIVATE(VARIABLE) #endif LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE jitdump_file_header { uint32_t magic; uint32_t version; uint32_t total_size; uint32_t elf_mach; uint32_t pad1; uint32_t pid; uint64_t timestamp; uint64_t flags; } jitdump_file_header; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE jitdump_record_header { uint32_t id; uint32_t total_size; uint64_t timestamp; } jitdump_record_header; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE jitdump_record_code_load { uint32_t pid; uint32_t tid; uint64_t vma; uint64_t code_addr; uint64_t code_size; uint64_t code_index; /* Needs to be followed with 0-terminated function name and raw native code */ } jitdump_record_code_load; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE jitdump_record_code_move { uint32_t pid; uint32_t tid; uint64_t vma; uint64_t old_code_addr; uint64_t new_code_addr; uint64_t code_size; uint64_t code_index; } jitdump_record_code_move; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE jitdump_debug_entry { uint64_t code_addr; uint32_t line; uint32_t discrim; /* Followed by 0-terminated source file name. */ } jitdump_debug_entry; LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE jitdump_record_code_debug_info { uint64_t code_addr; uint64_t nr_entry; /* Followed by nr_entry jitdump_debug_entry structures. */ } jitdump_record_code_debug_info; /* Currently empty */ LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE jitdump_record_code_close { int dummy; /* avoid warning about struct without member */ } jitdump_record_code_close; /* magic is "JiTD", serialized differently dependent on endianness. 
*/ PERF_JITDUMP_GLOBAL_VARIABLE(/*const*/ uint32_t JITDUMP_MAGIC, 'J' << 24 | 'i' << 16 | 'T' << 8 | 'D'); PERF_JITDUMP_GLOBAL_VARIABLE(/*const*/ uint32_t JITDUMP_MAGIC_SWAPPED, 'J' | 'i' << 8 | 'T' << 16 | 'D' << 24); PERF_JITDUMP_GLOBAL_VARIABLE(/*const*/ uint32_t JITDUMP_VERSION, 1); PERF_JITDUMP_GLOBAL_VARIABLE(/*const*/ uint64_t JITDUMP_FLAGS_ARCH_TIMESTAMP, 1ULL /*<< 0*/); PERF_JITDUMP_GLOBAL_VARIABLE(/*const*/ uint32_t JITDUMP_CODE_LOAD, 0); PERF_JITDUMP_GLOBAL_VARIABLE(/*const*/ uint32_t JITDUMP_CODE_MOVE, 1); PERF_JITDUMP_GLOBAL_VARIABLE(/*const*/ uint32_t JITDUMP_CODE_DEBUG_INFO, 2); PERF_JITDUMP_GLOBAL_VARIABLE(/*const*/ uint32_t JITDUMP_CODE_CLOSE, 3); #endif /* PERF_JITDUMP_H */ libxsmm-1.17/src/template/000077500000000000000000000000001415223013700155155ustar00rootroot00000000000000libxsmm-1.17/src/template/libxsmm.f000066400000000000000000003264001415223013700173440ustar00rootroot00000000000000!=======================================================================! ! Copyright (c) Intel Corporation - All rights reserved. ! ! This file is part of the LIBXSMM library. ! ! ! ! For information on the license, see the LICENSE file. ! ! Further information: https://github.com/hfp/libxsmm/ ! ! SPDX-License-Identifier: BSD-3-Clause ! !=======================================================================! ! Hans Pabst (Intel Corp.) !=======================================================================! MODULE LIBXSMM USE, INTRINSIC :: ISO_C_BINDING, ONLY: & & C_DOUBLE, C_FLOAT, C_DOUBLE_COMPLEX, C_FLOAT_COMPLEX, & & C_LONG_LONG, C_INT, C_SHORT, C_CHAR, C_INT8_T, C_BOOL, & & C_F_POINTER, C_ASSOCIATED, C_LOC, C_PTR, & & C_FUNPTR, C_NULL_FUNPTR, C_NULL_PTR IMPLICIT NONE !> Name of the version (stringized set of version numbers). CHARACTER(*), PARAMETER :: LIBXSMM_VERSION = "$VERSION" !> Name of the branch of which the version is derived from. CHARACTER(*), PARAMETER :: LIBXSMM_BRANCH = "$BRANCH" !> Major version based on the last reachable tag under RCS. 
INTEGER(C_INT), PARAMETER :: LIBXSMM_VERSION_MAJOR = $MAJOR !> Minor version based on the last reachable tag of the RCS. INTEGER(C_INT), PARAMETER :: LIBXSMM_VERSION_MINOR = $MINOR !> Update number based on the last reachable tag under RCS. INTEGER(C_INT), PARAMETER :: LIBXSMM_VERSION_UPDATE = $UPDATE !> Patch number counting commits since the last version stamp. INTEGER(C_INT), PARAMETER :: LIBXSMM_VERSION_PATCH = $PATCH !> Parameters the library and static kernels were built for. INTEGER(C_INT), PARAMETER :: LIBXSMM_CACHELINE = $CACHELINE INTEGER(C_INT), PARAMETER :: LIBXSMM_ALIGNMENT = $CACHELINE INTEGER(C_INT), PARAMETER :: LIBXSMM_PREFETCH = $PREFETCH INTEGER(C_INT), PARAMETER :: LIBXSMM_MAX_MNK = $MAX_MNK INTEGER(C_INT), PARAMETER :: LIBXSMM_MAX_DIM = $MAX_DIM INTEGER(C_INT), PARAMETER :: LIBXSMM_FLAGS = $FLAGS INTEGER(C_INT), PARAMETER :: LIBXSMM_ILP64 = $ILP64 !> Parameters supplied for backward compatibility (deprecated). INTEGER(C_INT), PARAMETER :: LIBXSMM_COL_MAJOR = 1 INTEGER(C_INT), PARAMETER :: LIBXSMM_ROW_MAJOR = 0 !> LIBXSMM_BLASINT_KIND impacts BLAS interface (LP64: 32-bit, ILP64: 64-bit). INTEGER(C_INT), PARAMETER :: LIBXSMM_BLASINT_KIND = $BLASINT_KIND !> Integer kind used by timer interface. INTEGER(C_INT), PARAMETER :: LIBXSMM_TICKINT_KIND = C_LONG_LONG !> Parameters representing the GEMM performed by the simplified interface. REAL(C_DOUBLE), PARAMETER :: LIBXSMM_ALPHA = REAL($ALPHA, C_DOUBLE) REAL(C_DOUBLE), PARAMETER :: LIBXSMM_BETA = REAL($BETA, C_DOUBLE) !> Flag enumeration which can be IORed. INTEGER(C_INT), PARAMETER :: & & LIBXSMM_GEMM_FLAG_NONE = 0, & & LIBXSMM_GEMM_FLAG_TRANS_A = 1, & & LIBXSMM_GEMM_FLAG_TRANS_B = 2, & & LIBXSMM_GEMM_FLAG_TRANS_AB = IOR( & & LIBXSMM_GEMM_FLAG_TRANS_A, LIBXSMM_GEMM_FLAG_TRANS_B), & & LIBXSMM_GEMM_FLAG_BETA_0 = 16 !> Flag enumeration which can be IORed. INTEGER(C_INT), PARAMETER :: & ! Handle recorded batch unsynchronized-parallel. & LIBXSMM_MMBATCH_FLAG_DEFAULT = 0, & ! Synchronize among C matrices. 
& LIBXSMM_MMBATCH_FLAG_SYNCHRONIZED = 512, & ! Handle recorded batch sequentially. & LIBXSMM_MMBATCH_FLAG_SEQUENTIAL = 1024, & ! Only record a statistic of potential SMMs. & LIBXSMM_MMBATCH_FLAG_STATISTIC = 2048 !> Enumerates element/data types. INTEGER(C_INT), PARAMETER :: & & LIBXSMM_DATATYPE_F64 = 0, & & LIBXSMM_DATATYPE_F32 = 1, & & LIBXSMM_DATATYPE_BF16 = 2, & & LIBXSMM_DATATYPE_I64 = 3, & & LIBXSMM_DATATYPE_I32 = 4, & & LIBXSMM_DATATYPE_I16 = 5, & & LIBXSMM_DATATYPE_I8 = 6, & & LIBXSMM_DATATYPE_UNSUPPORTED = 7 !> Denotes the precision/data type of GEMM (for weak-typed !> interface functions such as libxsmm_xmmdispatch). INTEGER(C_INT), PARAMETER :: & & LIBXSMM_GEMM_PRECISION_F64 = LIBXSMM_DATATYPE_F64, & & LIBXSMM_GEMM_PRECISION_F32 = LIBXSMM_DATATYPE_F32, & & LIBXSMM_GEMM_PRECISION_BF16 = LIBXSMM_DATATYPE_BF16, & & LIBXSMM_GEMM_PRECISION_I32 = LIBXSMM_DATATYPE_I32, & & LIBXSMM_GEMM_PRECISION_I16 = LIBXSMM_DATATYPE_I16, & & LIBXSMM_GEMM_PRECISION_I8 = LIBXSMM_DATATYPE_I8 !> Enumeration of the available prefetch strategies which can be IORed. INTEGER(C_INT), PARAMETER :: & ! Automatically select strategy (frontend). & LIBXSMM_PREFETCH_AUTO = -1, & ! No prefetching and no prefetch function signature. & LIBXSMM_PREFETCH_NONE = 0, & ! Only function prefetch signature. & LIBXSMM_PREFETCH_SIGONLY = 1, & ! Prefetch PA using accesses to A. & LIBXSMM_GEMM_PREFETCH_AL2 = 2, & ! Prefetch PB using accesses to C. & LIBXSMM_GEMM_PREFETCH_BL2_VIA_C = 4, & ! Prefetch A ahead. & LIBXSMM_GEMM_PREFETCH_AL2_AHEAD = 8, & ! Composed prefetch strategies. & LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C = IOR( & & LIBXSMM_GEMM_PREFETCH_BL2_VIA_C, & & LIBXSMM_GEMM_PREFETCH_AL2), & & LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD = IOR( & & LIBXSMM_GEMM_PREFETCH_BL2_VIA_C, & & LIBXSMM_GEMM_PREFETCH_AL2_AHEAD), & ! Current B into L1. & LIBXSMM_GEMM_PREFETCH_BL1 = 16 !> Enumerates the available target architectures and instruction !> set extensions as returned by libxsmm_get_target_archid(). 
INTEGER(C_INT), PARAMETER :: & & LIBXSMM_TARGET_ARCH_UNKNOWN = 0, & & LIBXSMM_TARGET_ARCH_GENERIC = 1, & & LIBXSMM_X86_GENERIC = 1002, & & LIBXSMM_X86_SSE3 = 1003, & & LIBXSMM_X86_SSE4 = 1004, & & LIBXSMM_X86_AVX = 1005, & & LIBXSMM_X86_AVX2 = 1006, & & LIBXSMM_X86_AVX512 = 1007, & & LIBXSMM_X86_AVX512_MIC = 1010, & & LIBXSMM_X86_AVX512_KNM = 1011, & & LIBXSMM_X86_AVX512_CORE = 1020, & & LIBXSMM_X86_AVX512_CLX = 1021, & & LIBXSMM_X86_AVX512_CPX = 1022 !> Generic function type (double-precision). TYPE, BIND(C) :: LIBXSMM_DMMFUNCTION TYPE(C_FUNPTR) :: handle = C_NULL_FUNPTR END TYPE !> Generic function type (single-precision). TYPE, BIND(C) :: LIBXSMM_SMMFUNCTION TYPE(C_FUNPTR) :: handle = C_NULL_FUNPTR END TYPE !> Generic function type (low-precision) TYPE, BIND(C) :: LIBXSMM_WIMMFUNCTION TYPE(C_FUNPTR) :: handle = C_NULL_FUNPTR END TYPE !> Generic function types with certain arity. ABSTRACT INTERFACE PURE SUBROUTINE LIBXSMM_FUNCTION3(a, b, c) BIND(C) IMPORT :: C_PTR TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c END SUBROUTINE PURE SUBROUTINE LIBXSMM_FUNCTION6(a, b, c, pa, pb, pc) BIND(C) IMPORT :: C_PTR TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c TYPE(C_PTR), INTENT(IN), VALUE :: pa, pb, pc END SUBROUTINE END INTERFACE !> Structure of differences with matrix norms according !> to http://www.netlib.org/lapack/lug/node75.html). TYPE, BIND(C) :: LIBXSMM_MATDIFF_INFO REAL(C_DOUBLE) norm1_abs, norm1_rel !! One-norm REAL(C_DOUBLE) normi_abs, normi_rel !! Infinity-norm REAL(C_DOUBLE) normf_rel !! Froebenius-norm !> Maximum difference, and L2-norm (both absolute and relative). REAL(C_DOUBLE) linf_abs, linf_rel, l2_abs, l2_rel !> Statistics: sum/l1, min., max., arith. avg., and variance. REAL(C_DOUBLE) l1_ref, min_ref, max_ref, avg_ref, var_ref !> Statistics: sum/l1, min., max., arith. avg., and variance. REAL(C_DOUBLE) l1_tst, min_tst, max_tst, avg_tst, var_tst !> Location (m, n) of largest difference (linf_abs). 
INTEGER(LIBXSMM_BLASINT_KIND) m INTEGER(LIBXSMM_BLASINT_KIND) n END TYPE !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_init, libxsmm_finalize !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_get_gemm_auto_prefetch !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_set_gemm_auto_prefetch !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_get_target_archid !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_set_target_archid !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_set_target_arch !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_get_verbosity !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_set_verbosity !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_release_kernel !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_matdiff_reduce !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_matdiff_clear !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xmmdispatch2 !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xmmdispatch !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xmmcall_abc !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xmmcall_prf !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xclear !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xrelease !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xmatcopy !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xitrans !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xotrans !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_matcopy_omp !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_otrans_omp !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_dgemm_omp !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_sgemm_omp !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_mmbatch !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_mmbatch_begin !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_mmbatch_end !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_gemm_batch !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_gemm_batch_omp !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_timer_duration !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_timer_tick !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xhash !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xdiff INTERFACE !> Initialize the library; pay for setup cost at a specific point. 
SUBROUTINE libxsmm_init() BIND(C) END SUBROUTINE !> De-initialize the library and free internal memory (optional). SUBROUTINE libxsmm_finalize() BIND(C) END SUBROUTINE !> Get the default prefetch strategy. PURE FUNCTION libxsmm_get_gemm_auto_prefetch() BIND(C) IMPORT :: C_INT INTEGER(C_INT) :: libxsmm_get_gemm_auto_prefetch END FUNCTION !> Set the default prefetch strategy. SUBROUTINE libxsmm_set_gemm_auto_prefetch(strategy) BIND(C) IMPORT :: C_INT INTEGER(C_INT), INTENT(IN), VALUE :: strategy END SUBROUTINE !> Returns the architecture and instruction set extension as determined !> by the CPUID flags, as set by the libxsmm_get_target_arch* functions, !> or as set by the LIBXSMM_TARGET environment variable. PURE FUNCTION libxsmm_get_target_archid() BIND(C) IMPORT :: C_INT INTEGER(C_INT) :: libxsmm_get_target_archid END FUNCTION !> Set target architecture (archid: see PARAMETER enumeration) !> for subsequent code generation (JIT). SUBROUTINE libxsmm_set_target_archid(archid) BIND(C) IMPORT :: C_INT INTEGER(C_INT), INTENT(IN), VALUE :: archid END SUBROUTINE !> Set target architecture for subsequent code generation (JIT). !> arch="0"|"sse"|"snb"|"hsw"|"knl"|"knm"|"skx"|"clx"|"cpx", !> or "0" to rely on the CPUID (default). !> There are some alternative target names as well: !> "sse", "avx", "avx2", "avx3" (incomplete list). SUBROUTINE libxsmm_set_target_arch(arch) BIND(C) IMPORT :: C_CHAR CHARACTER(C_CHAR), INTENT(IN) :: arch(*) END SUBROUTINE !> Get the level of verbosity. PURE FUNCTION libxsmm_get_verbosity() BIND(C) IMPORT :: C_INT INTEGER(C_INT) :: libxsmm_get_verbosity END FUNCTION !> Set the level of verbosity (0: off, positive value: verbosity level, !> negative value: maximum verbosity, which also dumps JIT-code). SUBROUTINE libxsmm_set_verbosity(level) BIND(C) IMPORT :: C_INT INTEGER(C_INT), INTENT(IN), VALUE :: level END SUBROUTINE !> Impure function which returns the current clock tick of a !> monotonic timer source; uses a platform-specific resolution. 
!> Implicit FORTRAN 77 interface: not available. INTEGER(LIBXSMM_TICKINT_KIND) & & FUNCTION libxsmm_timer_tick() BIND(C) IMPORT :: LIBXSMM_TICKINT_KIND END FUNCTION !> Impure function (timer freq. may vary) which returns the duration !> (in seconds) between two values received by libxsmm_timer_tick. !> Implicit FORTRAN 77 interface: not available. FUNCTION libxsmm_timer_duration(tick0, tick1) BIND(C) IMPORT :: LIBXSMM_TICKINT_KIND, C_DOUBLE INTEGER(LIBXSMM_TICKINT_KIND), INTENT(IN), VALUE :: tick0 INTEGER(LIBXSMM_TICKINT_KIND), INTENT(IN), VALUE :: tick1 REAL(C_DOUBLE) :: libxsmm_timer_duration END FUNCTION !> Deallocates the JIT'ted code, or unregisters !> and releases code from the registry. !> Implicit FORTRAN 77 interface: !> INTEGER(8) :: kernel SUBROUTINE libxsmm_release_kernel(kernel) & & BIND(C, NAME="libxsmm_release_kernel_") IMPORT :: C_FUNPTR TYPE(C_FUNPTR), INTENT(IN) :: kernel END SUBROUTINE !> Type-generic (unsafe) code dispatch (trylock: impure routine). !> Implicit FORTRAN 77 interface: !> INTEGER(4) :: gemm_precision, flags, prefetch !> INTEGER(4|8) :: m, n, k, lda, ldb, ldc !> REAL(4|8) :: alpha, beta !> INTEGER(8) :: kernel SUBROUTINE libxsmm_xmmdispatch(kernel, gemm_precision, & & m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch) & & BIND(C, NAME="libxsmm_xmmdispatch_") IMPORT :: C_FUNPTR, C_PTR, C_INT, LIBXSMM_BLASINT_KIND TYPE(C_FUNPTR), INTENT(OUT) :: kernel INTEGER(C_INT), INTENT(IN) :: gemm_precision INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k TYPE(C_PTR), INTENT(IN), VALUE :: lda, ldb, ldc TYPE(C_PTR), INTENT(IN), VALUE :: alpha, beta TYPE(C_PTR), INTENT(IN), VALUE :: flags, prefetch END SUBROUTINE !> Type-generic (unsafe) code dispatch (trylock: impure routine). 
!> Implicit FORTRAN 77 interface: !> INTEGER(4) :: iprec, oprec, flags, prefetch !> INTEGER(4|8) :: m, n, k, lda, ldb, ldc !> REAL(4|8) :: alpha, beta !> INTEGER(8) :: kernel SUBROUTINE libxsmm_xmmdispatch2(kernel, iprec, oprec, & & m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch) & & BIND(C, NAME="libxsmm_xmmdispatch2_") IMPORT :: C_FUNPTR, C_PTR, C_INT, LIBXSMM_BLASINT_KIND TYPE(C_FUNPTR), INTENT(OUT) :: kernel INTEGER(C_INT), INTENT(IN) :: iprec, oprec INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k TYPE(C_PTR), INTENT(IN), VALUE :: lda, ldb, ldc TYPE(C_PTR), INTENT(IN), VALUE :: alpha, beta TYPE(C_PTR), INTENT(IN), VALUE :: flags, prefetch END SUBROUTINE !> Generic call routine (3-argument form). !> Implicit FORTRAN 77 interface: !> REAL(4|8) :: a, b, c !> INTEGER(8) :: kernel PURE SUBROUTINE libxsmm_xmmcall_abc(kernel, a, b, c) & & BIND(C, NAME="libxsmm_xmmcall_abc_") IMPORT C_FUNPTR, C_PTR TYPE(C_FUNPTR), INTENT(IN) :: kernel TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c END SUBROUTINE !> Generic call routine (6-argument form). !> Implicit FORTRAN 77 interface: !> REAL(4|8) :: a, b, c, pa, pb, pc !> INTEGER(8) :: kernel PURE SUBROUTINE libxsmm_xmmcall_prf(kernel, & & a, b, c, pa, pb, pc) & & BIND(C, NAME="libxsmm_xmmcall_prf_") IMPORT C_FUNPTR, C_PTR TYPE(C_FUNPTR), INTENT(IN) :: kernel TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c, pa, pb, pc END SUBROUTINE !> Fill destination with zeros; treats dst in raw/binary fashion. SUBROUTINE libxsmm_xclear(dst, nbytes) & & BIND(C, NAME="libxsmm_xclear_") IMPORT C_PTR, C_INT TYPE(C_PTR), INTENT(IN), VALUE :: dst INTEGER(C_INT), INTENT(IN) :: nbytes END SUBROUTINE !> Remove key-value pair from code registry and release memory. SUBROUTINE libxsmm_xrelease(key, keysize) & & BIND(C, NAME="libxsmm_xrelease_") IMPORT C_PTR, C_INT TYPE(C_PTR), INTENT(IN), VALUE :: key INTEGER(C_INT), INTENT(IN) :: keysize END SUBROUTINE !> Matrix-copy (2-dimensional copy) routine. 
!> Implicit FORTRAN 77 interface: !> ARRAY :: input, output !> INTEGER(4|8) :: m, n, ldi, ldo !> INTEGER(4) :: typesize PURE SUBROUTINE libxsmm_xmatcopy(output, input, typesize, & & m, n, ldi, ldo) BIND(C, NAME="libxsmm_matcopy_") IMPORT LIBXSMM_BLASINT_KIND, C_PTR, C_INT INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo TYPE(C_PTR), INTENT(IN), VALUE :: output, input INTEGER(C_INT), INTENT(IN) :: typesize END SUBROUTINE !> Transpose a matrix (in-place form). !> Implicit FORTRAN 77 interface: !> ARRAY :: matrix !> INTEGER(4|8) :: m, n, ld !> INTEGER(4) :: typesize PURE SUBROUTINE libxsmm_xitrans(matrix, typesize, m, n, ld) & & BIND(C, NAME="libxsmm_itrans_") IMPORT C_PTR, C_INT, LIBXSMM_BLASINT_KIND INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ld TYPE(C_PTR), INTENT(IN), VALUE :: matrix INTEGER(C_INT), INTENT(IN) :: typesize END SUBROUTINE !> Transpose a matrix (out-of-place form). !> Implicit FORTRAN 77 interface: !> ARRAY :: input, output !> INTEGER(4|8) :: m, n, ldi, ldo !> INTEGER(4) :: typesize PURE SUBROUTINE libxsmm_xotrans(output, input, & & typesize, m, n, ldi, ldo) & & BIND(C, NAME="libxsmm_otrans_") IMPORT C_PTR, C_INT, LIBXSMM_BLASINT_KIND INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo TYPE(C_PTR), INTENT(IN), VALUE :: output, input INTEGER(C_INT), INTENT(IN) :: typesize END SUBROUTINE !> Matrix copy; MT via libxsmmext (out-of-place form). !> Implicit FORTRAN 77 interface: !> ARRAY :: output, input !> INTEGER(4|8) :: m, n, ldi, ldo !> INTEGER(4) :: typesize PURE SUBROUTINE libxsmm_matcopy_omp(output, input, & & typesize, m, n, ldi, ldo) & & BIND(C, NAME="libxsmm_matcopy_omp_") IMPORT C_PTR, C_INT, LIBXSMM_BLASINT_KIND INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo TYPE(C_PTR), INTENT(IN), VALUE :: output, input INTEGER(C_INT), INTENT(IN) :: typesize END SUBROUTINE !> Matrix transposition; MT via libxsmmext (out-of-place form). 
!> Implicit FORTRAN 77 interface: !> ARRAY :: output, input !> INTEGER(4|8) :: m, n, ldi, ldo !> INTEGER(4) :: typesize PURE SUBROUTINE libxsmm_otrans_omp(output, input, & & typesize, m, n, ldi, ldo) & & BIND(C, NAME="libxsmm_otrans_omp_") IMPORT C_PTR, C_INT, LIBXSMM_BLASINT_KIND INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo TYPE(C_PTR), INTENT(IN), VALUE :: output, input INTEGER(C_INT), INTENT(IN) :: typesize END SUBROUTINE !> General dense MM; MT via libxsmmext (double-precision). !> Implicit FORTRAN 77 interface: similar to DGEMM. PURE SUBROUTINE libxsmm_dgemm_omp(transa, transb, m, n, k, & & alpha, a, lda, b, ldb, beta, c, ldc) & & BIND(C, NAME="libxsmm_dgemm_omp_") IMPORT C_DOUBLE, C_CHAR, LIBXSMM_BLASINT_KIND CHARACTER(C_CHAR), INTENT(IN) :: transa, transb INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc REAL(C_DOUBLE), INTENT(IN) :: alpha, beta REAL(C_DOUBLE), INTENT(IN) :: a(lda,*), b(ldb,*) REAL(C_DOUBLE), INTENT(INOUT) :: c(ldc,*) END SUBROUTINE !> General dense MM; MT via libxsmmext (single-precision). !> Implicit FORTRAN 77 interface: similar to SGEMM. PURE SUBROUTINE libxsmm_sgemm_omp(transa, transb, m, n, k, & & alpha, a, lda, b, ldb, beta, c, ldc) & & BIND(C, NAME="libxsmm_sgemm_omp_") IMPORT C_FLOAT, C_CHAR, LIBXSMM_BLASINT_KIND CHARACTER(C_CHAR), INTENT(IN) :: transa, transb INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc REAL(C_FLOAT), INTENT(IN) :: alpha, beta REAL(C_FLOAT), INTENT(IN) :: a(lda,*), b(ldb,*) REAL(C_FLOAT), INTENT(INOUT) :: c(ldc,*) END SUBROUTINE !> Process a series of MMs (batch). See also libxsmm_gemm_batch_omp. !> The kind of matrix operands (a, b, c) depend on index_stride: !> index_stride==0: pointers to pointers of elements, e.g., !> double** for the C matrices. !> index_stride!=0: pointer to elements, e.g., !> const double* for the A and B matrices. 
!> Implicit FORTRAN 77 interface: !> INTEGER(4) :: iprec, oprec !> REAL(4|8) :: alpha, beta !> ARRAY :: a, b, c !> ARRAY/VALUE :: stride_a, stride_b, stride_c !> INTEGER(4|8) :: index_base, index_stride, batchsize !> INTEGER(4) :: tid, nthreads !> Otherwise arguments are similar to GEMM. PURE SUBROUTINE libxsmm_mmbatch(iprec, oprec, transa, transb, & & m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, index_base, & & index_stride, stride_a, stride_b, stride_c, batchsize, & & tid, nthreads) & & BIND(C, NAME="libxsmm_mmbatch_") IMPORT C_PTR, C_CHAR, C_INT, LIBXSMM_BLASINT_KIND !> Determines index-base (usually 0, 1 for one-based indexes). INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: index_base !> Stride (measured in Bytes) used to walk stride_*. !> In Fortran: index_stride!=0. INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: index_stride !> Number of SMMs. If the size is given as a negative value, !> then internal synchronization is omitted. INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: batchsize INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc CHARACTER(C_CHAR), INTENT(IN) :: transa, transb TYPE(C_PTR), INTENT(IN), VALUE :: alpha, beta TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c !> Arrays of indexes determining the position of !> a, b, and c operands. TYPE(C_PTR), INTENT(IN), VALUE :: stride_a TYPE(C_PTR), INTENT(IN), VALUE :: stride_b TYPE(C_PTR), INTENT(IN), VALUE :: stride_c INTEGER(C_INT), INTENT(IN) :: iprec, oprec !> Thread-ID (TID), and number of threads. INTEGER(C_INT), INTENT(IN) :: tid, nthreads END SUBROUTINE !> Process a series of SMMs (batch). See also libxsmm_mmbatch. !> Implicit FORTRAN 77 interface: !> INTEGER(4) :: iprec, oprec !> REAL(4|8) :: alpha, beta !> ARRAY :: a, b, c !> ARRAY/VALUE :: stride_a, stride_b, stride_c !> INTEGER(4|8) :: index_base, index_stride, batchsize !> Otherwise arguments are similar to GEMM. 
PURE SUBROUTINE libxsmm_gemm_batch(iprec, oprec, & & transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, & & index_base, index_stride, stride_a, stride_b, stride_c, & & batchsize) & & BIND(C, NAME="libxsmm_gemm_batch_") IMPORT C_PTR, C_CHAR, C_INT, LIBXSMM_BLASINT_KIND INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: index_base INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: index_stride INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: batchsize INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc CHARACTER(C_CHAR), INTENT(IN) :: transa, transb TYPE(C_PTR), INTENT(IN), VALUE :: alpha, beta TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c TYPE(C_PTR), INTENT(IN), VALUE :: stride_a TYPE(C_PTR), INTENT(IN), VALUE :: stride_b TYPE(C_PTR), INTENT(IN), VALUE :: stride_c INTEGER(C_INT), INTENT(IN) :: iprec, oprec END SUBROUTINE !> Process a series of SMMs (batch) with OpenMP (libxsmmext). !> Implicit FORTRAN 77 interface: !> INTEGER(4) :: iprec, oprec !> REAL(4|8) :: alpha, beta !> ARRAY :: a, b, c !> ARRAY/VALUE :: stride_a, stride_b, stride_c !> INTEGER(4|8) :: index_base, index_stride, batchsize !> Otherwise arguments are similar to GEMM. 
PURE SUBROUTINE libxsmm_gemm_batch_omp(iprec, oprec, & & transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, & & index_base, index_stride, stride_a, stride_b, stride_c, & & batchsize) & & BIND(C, NAME="libxsmm_gemm_batch_omp_") IMPORT C_PTR, C_CHAR, C_INT, LIBXSMM_BLASINT_KIND INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: index_base INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: index_stride INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: batchsize INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc CHARACTER(C_CHAR), INTENT(IN) :: transa, transb TYPE(C_PTR), INTENT(IN), VALUE :: alpha, beta TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c TYPE(C_PTR), INTENT(IN), VALUE :: stride_a TYPE(C_PTR), INTENT(IN), VALUE :: stride_b TYPE(C_PTR), INTENT(IN), VALUE :: stride_c INTEGER(C_INT), INTENT(IN) :: iprec, oprec END SUBROUTINE !> This function is a no-op unless LIBXSMM is built to intercept GEMM. !> Pointer arguments are used to filter intercepted GEMM calls such that !> non-NULL values match. Otherwise (NULL) the respective argument is !> considered a "free value", i.e., every value can match; !> libxsmmext required. !> Implicit FORTRAN 77 interface: !> INTEGER(4) :: gemm_precision, flags !> INTEGER(4|8) :: m, n, k, lda, ldb, ldc !> REAL(4|8) :: alpha, beta SUBROUTINE libxsmm_mmbatch_begin(gemm_precision, flags, & & m, n, k, lda, ldb, ldc, alpha, beta) BIND(C) IMPORT C_PTR, C_INT, LIBXSMM_BLASINT_KIND INTEGER(C_INT), INTENT(IN), VALUE :: gemm_precision INTEGER(C_INT), INTENT(IN) :: flags INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc TYPE(C_PTR), INTENT(IN), VALUE :: alpha, beta END SUBROUTINE !> Processes the batch of previously recorded SMMs !> (libxsmm_mmbatch_begin); libxsmmext required. !> Implicit FORTRAN 77 interface: available. 
SUBROUTINE libxsmm_mmbatch_end() BIND(C) END SUBROUTINE !> Reduces input into output such that the difference is maintained !> or increased (max function). The very first (initial) output !> should be zeroed (libxsmm_matdiff_clear). !> Implicit FORTRAN 77 interface: available. PURE SUBROUTINE libxsmm_matdiff_reduce(output, input) BIND(C) IMPORT LIBXSMM_MATDIFF_INFO TYPE(LIBXSMM_MATDIFF_INFO), INTENT(INOUT) :: output TYPE(LIBXSMM_MATDIFF_INFO), INTENT(IN) :: input END SUBROUTINE !> Clears the given info-structure, e.g., for the initial !> reduction-value (libxsmm_matdiff_reduce). !> Implicit FORTRAN 77 interface: available. PURE SUBROUTINE libxsmm_matdiff_clear(info) BIND(C) IMPORT LIBXSMM_MATDIFF_INFO TYPE(LIBXSMM_MATDIFF_INFO), INTENT(OUT) :: info END SUBROUTINE !> Calculates a hash value for the given array and seed. !> Routine suitable for FORTRAN 77; keysize in Bytes. PURE SUBROUTINE libxsmm_xhash(hash_seed, key, keysize) & & BIND(C, NAME="libxsmm_xhash_") IMPORT C_INT, C_PTR INTEGER(C_INT), INTENT(INOUT) :: hash_seed INTEGER(C_INT), INTENT(IN) :: keysize TYPE(C_PTR), INTENT(IN), VALUE :: key END SUBROUTINE !> Calculates if there is a difference between two arrays. !> Routine suitable for FORTRAN 77; size in Bytes. PURE SUBROUTINE libxsmm_xdiff(diff, a, b, nbytes) & & BIND(C, NAME="libxsmm_xdiff_") IMPORT C_PTR, C_LONG_LONG, C_BOOL TYPE(C_PTR), INTENT(IN), VALUE :: a, b INTEGER(C_LONG_LONG), INTENT(IN) :: nbytes LOGICAL(C_BOOL), INTENT(OUT) :: diff END SUBROUTINE END INTERFACE$MNK_INTERFACE_LIST INTERFACE libxsmm_ptr0 MODULE PROCEDURE libxsmm_ptr_z0, libxsmm_ptr_c0 MODULE PROCEDURE libxsmm_ptr_d0, libxsmm_ptr_s0 MODULE PROCEDURE libxsmm_ptr_i0, libxsmm_ptr_w0 MODULE PROCEDURE libxsmm_ptr_j0 !! Byte/char MODULE PROCEDURE libxsmm_ptr_b0 !! Byte/char MODULE PROCEDURE libxsmm_ptr_l0 !! 
long long END INTERFACE INTERFACE libxsmm_ptr1 MODULE PROCEDURE libxsmm_ptr_z1, libxsmm_ptr_c1 MODULE PROCEDURE libxsmm_ptr_d1, libxsmm_ptr_s1 MODULE PROCEDURE libxsmm_ptr_i1, libxsmm_ptr_w1 MODULE PROCEDURE libxsmm_ptr_j1 !! Byte/char MODULE PROCEDURE libxsmm_ptr_b1 !! Byte/char MODULE PROCEDURE libxsmm_ptr_l1 !! long long MODULE PROCEDURE libxsmm_ptr_dmm MODULE PROCEDURE libxsmm_ptr_smm MODULE PROCEDURE libxsmm_ptr_wimm END INTERFACE INTERFACE libxsmm_ptr2 MODULE PROCEDURE libxsmm_ptr_z2, libxsmm_ptr_c2 MODULE PROCEDURE libxsmm_ptr_d2, libxsmm_ptr_s2 MODULE PROCEDURE libxsmm_ptr_i2, libxsmm_ptr_w2 MODULE PROCEDURE libxsmm_ptr_j2 !! Byte/char MODULE PROCEDURE libxsmm_ptr_b2 !! Byte/char MODULE PROCEDURE libxsmm_ptr_l2 !! long long END INTERFACE INTERFACE libxsmm_ptr MODULE PROCEDURE libxsmm_ptr_z0, libxsmm_ptr_c0 MODULE PROCEDURE libxsmm_ptr_d0, libxsmm_ptr_s0 MODULE PROCEDURE libxsmm_ptr_i0, libxsmm_ptr_w0 MODULE PROCEDURE libxsmm_ptr_j0 !! Byte/char MODULE PROCEDURE libxsmm_ptr_b0 !! Byte/char MODULE PROCEDURE libxsmm_ptr_l0 !! long long MODULE PROCEDURE libxsmm_ptr_z1, libxsmm_ptr_c1 MODULE PROCEDURE libxsmm_ptr_d1, libxsmm_ptr_s1 MODULE PROCEDURE libxsmm_ptr_i1, libxsmm_ptr_w1 MODULE PROCEDURE libxsmm_ptr_j1 !! Byte/char MODULE PROCEDURE libxsmm_ptr_b1 !! Byte/char MODULE PROCEDURE libxsmm_ptr_l1 !! long long MODULE PROCEDURE libxsmm_ptr_z2, libxsmm_ptr_c2 MODULE PROCEDURE libxsmm_ptr_d2, libxsmm_ptr_s2 MODULE PROCEDURE libxsmm_ptr_i2, libxsmm_ptr_w2 MODULE PROCEDURE libxsmm_ptr_j2 !! Byte/char MODULE PROCEDURE libxsmm_ptr_b2 !! Byte/char MODULE PROCEDURE libxsmm_ptr_l2 !! long long MODULE PROCEDURE libxsmm_ptr_dmm MODULE PROCEDURE libxsmm_ptr_smm MODULE PROCEDURE libxsmm_ptr_wimm END INTERFACE !> Deallocates JIT'ted code, or unregisters/releases code from registry. 
INTERFACE libxsmm_release_mmkernel MODULE PROCEDURE libxsmm_release_dmmkernel MODULE PROCEDURE libxsmm_release_smmkernel MODULE PROCEDURE libxsmm_release_wimmkernel END INTERFACE !> Construct JIT-code depending on given argument set. INTERFACE libxsmm_mmdispatch MODULE PROCEDURE libxsmm_dmmdispatch, libxsmm_smmdispatch MODULE PROCEDURE libxsmm_wimmdispatch END INTERFACE !> Construct JIT-code depending on given argument set. INTERFACE libxsmm_dispatch MODULE PROCEDURE libxsmm_dmmdispatch, libxsmm_smmdispatch MODULE PROCEDURE libxsmm_wimmdispatch END INTERFACE !> Check if a function is available (LIBXSMM_?MMFUNCTION). INTERFACE libxsmm_mmavailable MODULE PROCEDURE libxsmm_dmmavailable, libxsmm_smmavailable MODULE PROCEDURE libxsmm_wimmavailable END INTERFACE !> Check if a function is available (LIBXSMM_?MMFUNCTION). INTERFACE libxsmm_available MODULE PROCEDURE libxsmm_smmavailable, libxsmm_dmmavailable MODULE PROCEDURE libxsmm_wimmavailable END INTERFACE !> Overloaded GEMM routines (double-precision). INTERFACE libxsmm_dgemm MODULE PROCEDURE libxsmm_dgemm0 MODULE PROCEDURE libxsmm_dgemm1 MODULE PROCEDURE libxsmm_dgemm2 MODULE PROCEDURE libxsmm_dgemm3 END INTERFACE !> Overloaded GEMM routines (single-precision). INTERFACE libxsmm_sgemm MODULE PROCEDURE libxsmm_sgemm0 MODULE PROCEDURE libxsmm_sgemm1 MODULE PROCEDURE libxsmm_sgemm2 END INTERFACE !> Overloaded GEMM routines (low-precision). INTERFACE libxsmm_wigemm MODULE PROCEDURE libxsmm_wigemm0 MODULE PROCEDURE libxsmm_wigemm1 MODULE PROCEDURE libxsmm_wigemm2 END INTERFACE !> Overloaded GEMM routines. 
INTERFACE libxsmm_gemm MODULE PROCEDURE libxsmm_dgemm0 MODULE PROCEDURE libxsmm_dgemm1 MODULE PROCEDURE libxsmm_dgemm2 MODULE PROCEDURE libxsmm_dgemm3 MODULE PROCEDURE libxsmm_sgemm0 MODULE PROCEDURE libxsmm_sgemm1 MODULE PROCEDURE libxsmm_sgemm2 MODULE PROCEDURE libxsmm_sgemm3 MODULE PROCEDURE libxsmm_wigemm0 MODULE PROCEDURE libxsmm_wigemm1 MODULE PROCEDURE libxsmm_wigemm2 MODULE PROCEDURE libxsmm_wigemm3 END INTERFACE !> Overloaded BLAS GEMM routines (double-precision). INTERFACE libxsmm_blas_dgemm MODULE PROCEDURE libxsmm_blas_dgemm0 MODULE PROCEDURE libxsmm_blas_dgemm1 MODULE PROCEDURE libxsmm_blas_dgemm2 MODULE PROCEDURE libxsmm_blas_dgemm3 END INTERFACE !> Overloaded BLAS GEMM routines (single-precision). INTERFACE libxsmm_blas_sgemm MODULE PROCEDURE libxsmm_blas_sgemm0 MODULE PROCEDURE libxsmm_blas_sgemm1 MODULE PROCEDURE libxsmm_blas_sgemm2 MODULE PROCEDURE libxsmm_blas_sgemm3 END INTERFACE !> Overloaded BLAS GEMM routines (single/double-precision). INTERFACE libxsmm_blas_gemm MODULE PROCEDURE libxsmm_blas_dgemm0 MODULE PROCEDURE libxsmm_blas_dgemm1 MODULE PROCEDURE libxsmm_blas_dgemm2 MODULE PROCEDURE libxsmm_blas_dgemm3 MODULE PROCEDURE libxsmm_blas_sgemm0 MODULE PROCEDURE libxsmm_blas_sgemm1 MODULE PROCEDURE libxsmm_blas_sgemm2 MODULE PROCEDURE libxsmm_blas_sgemm3 END INTERFACE !> Overloaded MATCOPY routines (2d-copy). INTERFACE libxsmm_matcopy MODULE PROCEDURE libxsmm_matcopy_p0 MODULE PROCEDURE libxsmm_matcopy_d1 MODULE PROCEDURE libxsmm_matcopy_d2 MODULE PROCEDURE libxsmm_matcopy_s1 MODULE PROCEDURE libxsmm_matcopy_s2 END INTERFACE !> Overloaded TRANSPOSE routines (in-place form). INTERFACE libxsmm_itrans MODULE PROCEDURE libxsmm_itrans_p0 MODULE PROCEDURE libxsmm_itrans_d1 MODULE PROCEDURE libxsmm_itrans_d2 MODULE PROCEDURE libxsmm_itrans_s1 MODULE PROCEDURE libxsmm_itrans_s2 END INTERFACE !> Overloaded TRANSPOSE routines (out-of-place form). 
INTERFACE libxsmm_otrans MODULE PROCEDURE libxsmm_otrans_p0 MODULE PROCEDURE libxsmm_otrans_d1 MODULE PROCEDURE libxsmm_otrans_d2 MODULE PROCEDURE libxsmm_otrans_s1 MODULE PROCEDURE libxsmm_otrans_s2 END INTERFACE !> Calculate a hash value for a given key value (binary blob). !> Conceptually pure, but C_LOC may be (incorrectly) impure. INTERFACE libxsmm_hash MODULE PROCEDURE libxsmm_hash_char MODULE PROCEDURE libxsmm_hash_i8 MODULE PROCEDURE libxsmm_hash_i32 MODULE PROCEDURE libxsmm_hash_i64 END INTERFACE !> Calculate whether there is a difference between two series of items. !> Conceptually pure, but C_LOC may be (incorrectly) impure. INTERFACE libxsmm_diff MODULE PROCEDURE libxsmm_diff_char MODULE PROCEDURE libxsmm_diff_i8 MODULE PROCEDURE libxsmm_diff_i32 MODULE PROCEDURE libxsmm_diff_i64 END INTERFACE CONTAINS !> Returns the name of the target architecture as determined by !> the CPUID flags, as set by the libxsmm_get_target_arch* functions, !> or as set by the LIBXSMM_TARGET environment variable. !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_get_target_arch FUNCTION libxsmm_get_target_arch() !CHARACTER(LEN=:), POINTER :: libxsmm_get_target_arch CHARACTER, POINTER :: libxsmm_get_target_arch(:) INTEGER(C_INT) :: length(1) TYPE(C_PTR) :: arch !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmmf_get_target_arch INTERFACE FUNCTION libxsmmf_get_target_arch(length) BIND(C) IMPORT :: C_INT, C_PTR INTEGER(C_INT), INTENT(OUT) :: length TYPE(C_PTR) :: libxsmmf_get_target_arch END FUNCTION END INTERFACE arch = libxsmmf_get_target_arch(length(1)) CALL C_F_POINTER(arch, libxsmm_get_target_arch, length) END FUNCTION !> Returns C_NULL_PTR. !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_null PURE FUNCTION libxsmm_ptr_null() TYPE(C_PTR) :: libxsmm_ptr_null libxsmm_ptr_null = C_NULL_PTR END FUNCTION !> Determines the C-address of the given array. 
!> C-address of a double-complex scalar/array (rank 0/1/2).
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_z0
FUNCTION libxsmm_ptr_z0(a)
  COMPLEX(C_DOUBLE_COMPLEX), INTENT(IN), TARGET :: a
  TYPE(C_PTR) :: libxsmm_ptr_z0
  libxsmm_ptr_z0 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_z1
FUNCTION libxsmm_ptr_z1(a)
  COMPLEX(C_DOUBLE_COMPLEX), INTENT(IN), TARGET :: a(*)
  TYPE(C_PTR) :: libxsmm_ptr_z1
  libxsmm_ptr_z1 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_z2
FUNCTION libxsmm_ptr_z2(a)
  COMPLEX(C_DOUBLE_COMPLEX), INTENT(IN) :: a(:,:)
  TYPE(C_PTR) :: libxsmm_ptr_z2
  ! NOTE(review): forwards an assumed-shape array to the assumed-size
  ! variant; a non-contiguous actual argument may be passed via a
  ! temporary copy whose address is only valid during the call.
  libxsmm_ptr_z2 = libxsmm_ptr_z1(a)
END FUNCTION

!> Determines the C-address of the given array.
!> C-address of a single-complex scalar/array (rank 0/1/2).
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_c0
FUNCTION libxsmm_ptr_c0(a)
  COMPLEX(C_FLOAT_COMPLEX), INTENT(IN), TARGET :: a
  TYPE(C_PTR) :: libxsmm_ptr_c0
  libxsmm_ptr_c0 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_c1
FUNCTION libxsmm_ptr_c1(a)
  COMPLEX(C_FLOAT_COMPLEX), INTENT(IN), TARGET :: a(*)
  TYPE(C_PTR) :: libxsmm_ptr_c1
  libxsmm_ptr_c1 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_c2
FUNCTION libxsmm_ptr_c2(a)
  COMPLEX(C_FLOAT_COMPLEX), INTENT(IN) :: a(:,:)
  TYPE(C_PTR) :: libxsmm_ptr_c2
  libxsmm_ptr_c2 = libxsmm_ptr_c1(a)
END FUNCTION

!> Determines the C-address of the given array.
!> C-address of a double-precision scalar/array (rank 0/1/2).
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_d0
FUNCTION libxsmm_ptr_d0(a)
  REAL(C_DOUBLE), INTENT(IN), TARGET :: a
  TYPE(C_PTR) :: libxsmm_ptr_d0
  libxsmm_ptr_d0 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_d1
FUNCTION libxsmm_ptr_d1(a)
  REAL(C_DOUBLE), INTENT(IN), TARGET :: a(*)
  TYPE(C_PTR) :: libxsmm_ptr_d1
  libxsmm_ptr_d1 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_d2
FUNCTION libxsmm_ptr_d2(a)
  REAL(C_DOUBLE), INTENT(IN) :: a(:,:)
  TYPE(C_PTR) :: libxsmm_ptr_d2
  libxsmm_ptr_d2 = libxsmm_ptr_d1(a)
END FUNCTION

!> Determines the C-address of the given array.
!> C-address of a single-precision scalar/array (rank 0/1/2).
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_s0
FUNCTION libxsmm_ptr_s0(a)
  REAL(C_FLOAT), INTENT(IN), TARGET :: a
  TYPE(C_PTR) :: libxsmm_ptr_s0
  libxsmm_ptr_s0 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_s1
FUNCTION libxsmm_ptr_s1(a)
  REAL(C_FLOAT), INTENT(IN), TARGET :: a(*)
  TYPE(C_PTR) :: libxsmm_ptr_s1
  libxsmm_ptr_s1 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_s2
FUNCTION libxsmm_ptr_s2(a)
  REAL(C_FLOAT), INTENT(IN) :: a(:,:)
  TYPE(C_PTR) :: libxsmm_ptr_s2
  libxsmm_ptr_s2 = libxsmm_ptr_s1(a)
END FUNCTION

!> Determines the C-address of the given array.
!> C-address of a default-integer scalar/array (rank 0/1/2).
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_i0
FUNCTION libxsmm_ptr_i0(a)
  INTEGER(C_INT), INTENT(IN), TARGET :: a
  TYPE(C_PTR) :: libxsmm_ptr_i0
  libxsmm_ptr_i0 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_i1
FUNCTION libxsmm_ptr_i1(a)
  INTEGER(C_INT), INTENT(IN), TARGET :: a(*)
  TYPE(C_PTR) :: libxsmm_ptr_i1
  libxsmm_ptr_i1 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_i2
FUNCTION libxsmm_ptr_i2(a)
  INTEGER(C_INT), INTENT(IN) :: a(:,:)
  TYPE(C_PTR) :: libxsmm_ptr_i2
  libxsmm_ptr_i2 = libxsmm_ptr_i1(a)
END FUNCTION

!> Determines the C-address of the given array.
!> C-address of a short-integer scalar/array (rank 0/1/2).
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_w0
FUNCTION libxsmm_ptr_w0(a)
  INTEGER(C_SHORT), INTENT(IN), TARGET :: a
  TYPE(C_PTR) :: libxsmm_ptr_w0
  libxsmm_ptr_w0 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_w1
FUNCTION libxsmm_ptr_w1(a)
  INTEGER(C_SHORT), INTENT(IN), TARGET :: a(*)
  TYPE(C_PTR) :: libxsmm_ptr_w1
  libxsmm_ptr_w1 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_w2
FUNCTION libxsmm_ptr_w2(a)
  INTEGER(C_SHORT), INTENT(IN) :: a(:,:)
  TYPE(C_PTR) :: libxsmm_ptr_w2
  libxsmm_ptr_w2 = libxsmm_ptr_w1(a)
END FUNCTION

!> Determines the C-address of the given array.
!> C-address of an 8-bit integer scalar/array (rank 0/1/2).
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_j0
FUNCTION libxsmm_ptr_j0(a)
  INTEGER(C_INT8_T), INTENT(IN), TARGET :: a
  TYPE(C_PTR) :: libxsmm_ptr_j0
  libxsmm_ptr_j0 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_j1
FUNCTION libxsmm_ptr_j1(a)
  INTEGER(C_INT8_T), INTENT(IN), TARGET :: a(*)
  TYPE(C_PTR) :: libxsmm_ptr_j1
  libxsmm_ptr_j1 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_j2
FUNCTION libxsmm_ptr_j2(a)
  INTEGER(C_INT8_T), INTENT(IN) :: a(:,:)
  TYPE(C_PTR) :: libxsmm_ptr_j2
  libxsmm_ptr_j2 = libxsmm_ptr_j1(a)
END FUNCTION

!> Determines the C-address of the given array.
!> C-address of a character scalar/array (rank 0/1/2).
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_b0
FUNCTION libxsmm_ptr_b0(a)
  CHARACTER(C_CHAR), INTENT(IN), TARGET :: a
  TYPE(C_PTR) :: libxsmm_ptr_b0
  libxsmm_ptr_b0 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_b1
FUNCTION libxsmm_ptr_b1(a)
  CHARACTER(C_CHAR), INTENT(IN), TARGET :: a(*)
  TYPE(C_PTR) :: libxsmm_ptr_b1
  libxsmm_ptr_b1 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_b2
FUNCTION libxsmm_ptr_b2(a)
  CHARACTER(C_CHAR), INTENT(IN) :: a(:,:)
  TYPE(C_PTR) :: libxsmm_ptr_b2
  libxsmm_ptr_b2 = libxsmm_ptr_b1(a)
END FUNCTION

!> Determines the C-address of the given array.
!> C-address of a long-long integer scalar/array (rank 0/1/2).
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_l0
FUNCTION libxsmm_ptr_l0(a)
  INTEGER(C_LONG_LONG), INTENT(IN), TARGET :: a
  TYPE(C_PTR) :: libxsmm_ptr_l0
  libxsmm_ptr_l0 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_l1
FUNCTION libxsmm_ptr_l1(a)
  INTEGER(C_LONG_LONG), INTENT(IN), TARGET :: a(*)
  TYPE(C_PTR) :: libxsmm_ptr_l1
  libxsmm_ptr_l1 = C_LOC(a)
END FUNCTION
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_l2
FUNCTION libxsmm_ptr_l2(a)
  INTEGER(C_LONG_LONG), INTENT(IN) :: a(:,:)
  TYPE(C_PTR) :: libxsmm_ptr_l2
  libxsmm_ptr_l2 = libxsmm_ptr_l1(a)
END FUNCTION

!> C-address of the handle of the first element of an array
!> of double-precision kernel functions.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_dmm
FUNCTION libxsmm_ptr_dmm(a)
  TYPE(LIBXSMM_DMMFUNCTION), INTENT(IN), TARGET :: a(:)
  TYPE(LIBXSMM_DMMFUNCTION), POINTER :: p
  TYPE(C_PTR) :: libxsmm_ptr_dmm
  p => a(LBOUND(a,1)); libxsmm_ptr_dmm = C_LOC(p%handle)
END FUNCTION
!> C-address of the handle of the first element of an array
!> of single-precision kernel functions.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_smm
FUNCTION libxsmm_ptr_smm(a)
  TYPE(LIBXSMM_SMMFUNCTION), INTENT(IN), TARGET :: a(:)
  TYPE(LIBXSMM_SMMFUNCTION), POINTER :: p
  TYPE(C_PTR) :: libxsmm_ptr_smm
  p => a(LBOUND(a,1)); libxsmm_ptr_smm = C_LOC(p%handle)
END FUNCTION
!> C-address of the handle of the first element of an array
!> of low-precision (int-accumulate) kernel functions.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_ptr_wimm
FUNCTION libxsmm_ptr_wimm(a)
  TYPE(LIBXSMM_WIMMFUNCTION), INTENT(IN), TARGET :: a(:)
  TYPE(LIBXSMM_WIMMFUNCTION), POINTER :: p
  TYPE(C_PTR) :: libxsmm_ptr_wimm
  p => a(LBOUND(a,1)); libxsmm_ptr_wimm = C_LOC(p%handle)
END FUNCTION

!> Deallocate JIT'ted code created by libxsmm_create routines. To
!> unregister code generated with libxsmm_dispatch is unnecessary.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_release_dmmkernel
SUBROUTINE libxsmm_release_dmmkernel(kernel)
  TYPE(LIBXSMM_DMMFUNCTION), INTENT(IN) :: kernel
  CALL libxsmm_release_kernel(kernel%handle)
END SUBROUTINE

!> Deallocate JIT'ted code created by libxsmm_create routines. To
!> unregister code generated with libxsmm_dispatch is unnecessary.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_release_smmkernel
SUBROUTINE libxsmm_release_smmkernel(kernel)
  TYPE(LIBXSMM_SMMFUNCTION), INTENT(IN) :: kernel
  CALL libxsmm_release_kernel(kernel%handle)
END SUBROUTINE

!> Deallocate JIT'ted code created by libxsmm_create routines. To
!> unregister code generated with libxsmm_dispatch is unnecessary.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_release_wimmkernel
SUBROUTINE libxsmm_release_wimmkernel(kernel)
  TYPE(LIBXSMM_WIMMFUNCTION), INTENT(IN) :: kernel
  CALL libxsmm_release_kernel(kernel%handle)
END SUBROUTINE

!> Query or JIT-generate an SMM-kernel (double-precision).
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_dmmdispatch
SUBROUTINE libxsmm_dmmdispatch(kernel,                            &
&   m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch)
  TYPE(LIBXSMM_DMMFUNCTION), INTENT(OUT) :: kernel
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN),                      &
&   OPTIONAL, TARGET :: lda, ldb, ldc
  REAL(C_DOUBLE), INTENT(IN), OPTIONAL, TARGET :: alpha, beta
  INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: flags
  INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: prefetch
  ! NOTE(review): C_LOC is applied to OPTIONAL dummies; this assumes
  ! the compiler yields a null address for absent arguments - confirm
  ! against the compilers supported by the build.
  CALL libxsmm_xmmdispatch(                                       &
&   kernel%handle, LIBXSMM_GEMM_PRECISION_F64,                    &
&   m, n, k, C_LOC(lda), C_LOC(ldb), C_LOC(ldc),                  &
&   C_LOC(alpha), C_LOC(beta), C_LOC(flags), C_LOC(prefetch))
END SUBROUTINE

!> Query or JIT-generate an SMM-kernel (single-precision).
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_smmdispatch
SUBROUTINE libxsmm_smmdispatch(kernel,                            &
&   m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch)
  TYPE(LIBXSMM_SMMFUNCTION), INTENT(OUT) :: kernel
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN),                      &
&   OPTIONAL, TARGET :: lda, ldb, ldc
  REAL(C_FLOAT), INTENT(IN), OPTIONAL, TARGET :: alpha, beta
  INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: flags
  INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: prefetch
  ! C_LOC of absent OPTIONAL dummies is forwarded as-is (see note
  ! at libxsmm_dmmdispatch).
  CALL libxsmm_xmmdispatch(                                       &
&   kernel%handle, LIBXSMM_GEMM_PRECISION_F32,                    &
&   m, n, k, C_LOC(lda), C_LOC(ldb), C_LOC(ldc),                  &
&   C_LOC(alpha), C_LOC(beta), C_LOC(flags), C_LOC(prefetch))
END SUBROUTINE

!> Query or JIT-generate an SMM-kernel (low-precision, int-accumulate).
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_wimmdispatch
SUBROUTINE libxsmm_wimmdispatch(kernel,                           &
&   m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch)
  TYPE(LIBXSMM_WIMMFUNCTION), INTENT(OUT) :: kernel
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN),                      &
&   OPTIONAL, TARGET :: lda, ldb, ldc
  INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: alpha, beta
  INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: flags
  INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: prefetch
  ! Two-precision dispatch: I16 input operands, I32 accumulation.
  CALL libxsmm_xmmdispatch2(kernel%handle,                        &
&   LIBXSMM_GEMM_PRECISION_I16, LIBXSMM_GEMM_PRECISION_I32,       &
&   m, n, k, C_LOC(lda), C_LOC(ldb), C_LOC(ldc),                  &
&   C_LOC(alpha), C_LOC(beta), C_LOC(flags), C_LOC(prefetch))
END SUBROUTINE

!> Checks if the given kernel was generated. JIT code is guaranteed
!> to be generated if JIT support was enabled at build-time of the
!> library (default). This overload belongs to libxsmm_(mm)available.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_dmmavailable
LOGICAL FUNCTION libxsmm_dmmavailable(kernel)
  TYPE(LIBXSMM_DMMFUNCTION), INTENT(IN) :: kernel
  libxsmm_dmmavailable = C_ASSOCIATED(kernel%handle)
END FUNCTION

!> Checks if the given kernel was generated. JIT code is guaranteed
!> to be generated if JIT support was enabled at build-time of the
!> library (default). This overload belongs to libxsmm_(mm)available.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_smmavailable
LOGICAL FUNCTION libxsmm_smmavailable(kernel)
  TYPE(LIBXSMM_SMMFUNCTION), INTENT(IN) :: kernel
  libxsmm_smmavailable = C_ASSOCIATED(kernel%handle)
END FUNCTION

!> Checks if the given kernel was generated. JIT code is guaranteed
!> to be generated if JIT support was enabled at build-time of the
!> library (default). This overload belongs to libxsmm_(mm)available.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_wimmavailable
LOGICAL FUNCTION libxsmm_wimmavailable(kernel)
  TYPE(LIBXSMM_WIMMFUNCTION), INTENT(IN) :: kernel
  libxsmm_wimmavailable = C_ASSOCIATED(kernel%handle)
END FUNCTION

!> Calls the kernel with the given arguments. Alternatively,
!> PROCPOINTER can be used as shown by the inner comments
!> of this routine (LIBXSMM_FUNCTION3). The libxsmm_xmmcall
!> routines can be used in FORTRAN77.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_dmmcall_abc
SUBROUTINE libxsmm_dmmcall_abc(kernel, a, b, c)
  TYPE(LIBXSMM_DMMFUNCTION), INTENT(IN) :: kernel
  REAL(C_DOUBLE), INTENT(IN), TARGET :: a(*), b(*)
  REAL(C_DOUBLE), INTENT(INOUT), TARGET :: c(*)
  ! PROCEDURE(LIBXSMM_FUNCTION3), POINTER :: xmm
  ! CALL C_F_PROCPOINTER(kernel%handle, xmm)
  ! CALL xmm(...)
  CALL libxsmm_xmmcall_abc(kernel%handle,                         &
&   C_LOC(a), C_LOC(b), C_LOC(c))
END SUBROUTINE

!> Calls the kernel with the given arguments. Alternatively,
!> PROCPOINTER can be used as shown by the inner comments
!> of this routine (LIBXSMM_FUNCTION6). The libxsmm_xmmcall
!> routines can be used in FORTRAN77.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_dmmcall_prf
SUBROUTINE libxsmm_dmmcall_prf(kernel, a, b, c, pa, pb, pc)
  TYPE(LIBXSMM_DMMFUNCTION), INTENT(IN) :: kernel
  REAL(C_DOUBLE), INTENT(IN), TARGET :: a(*), b(*)
  REAL(C_DOUBLE), INTENT(INOUT), TARGET :: c(*)
  ! pa/pb/pc: prefetch locations of the next operands.
  REAL(C_DOUBLE), INTENT(IN), TARGET :: pa(*)
  REAL(C_DOUBLE), INTENT(IN), TARGET :: pb(*)
  REAL(C_DOUBLE), INTENT(IN), TARGET :: pc(*)
  ! PROCEDURE(LIBXSMM_FUNCTION6), POINTER :: xmm
  ! CALL C_F_PROCPOINTER(kernel%handle, xmm)
  ! CALL xmm(...)
  CALL libxsmm_xmmcall_prf(kernel%handle,                         &
&   C_LOC(a), C_LOC(b), C_LOC(c),                                 &
&   C_LOC(pa), C_LOC(pb), C_LOC(pc))
END SUBROUTINE

!> See also libxsmm_dmmcall_abc and libxsmm_dmmcall_prf.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_dmmcall
SUBROUTINE libxsmm_dmmcall(kernel, a, b, c, pa, pb, pc)
  TYPE(LIBXSMM_DMMFUNCTION), INTENT(IN) :: kernel
  REAL(C_DOUBLE), INTENT(IN), TARGET :: a(*), b(*)
  REAL(C_DOUBLE), INTENT(INOUT), TARGET :: c(*)
  REAL(C_DOUBLE), INTENT(IN), OPTIONAL, TARGET :: pa(*)
  REAL(C_DOUBLE), INTENT(IN), OPTIONAL, TARGET :: pb(*)
  REAL(C_DOUBLE), INTENT(IN), OPTIONAL, TARGET :: pc(*)
  ! use .OR. instead of .AND. to avoid full check
  IF (PRESENT(pa).OR.PRESENT(pb).OR.PRESENT(pc)) THEN
    CALL libxsmm_xmmcall_prf(kernel%handle,                       &
&     C_LOC(a), C_LOC(b), C_LOC(c),                               &
&     C_LOC(pa), C_LOC(pb), C_LOC(pc))
  ELSE
    CALL libxsmm_xmmcall_abc(kernel%handle,                       &
&     C_LOC(a), C_LOC(b), C_LOC(c))
  END IF
END SUBROUTINE

!> Calls the kernel with the given arguments. Alternatively,
!> PROCPOINTER can be used as shown by the inner comments
!> of this routine (LIBXSMM_FUNCTION3). The libxsmm_xmmcall
!> routines can be used in FORTRAN77.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_smmcall_abc
SUBROUTINE libxsmm_smmcall_abc(kernel, a, b, c)
  TYPE(LIBXSMM_SMMFUNCTION), INTENT(IN) :: kernel
  REAL(C_FLOAT), INTENT(IN), TARGET :: a(*), b(*)
  REAL(C_FLOAT), INTENT(INOUT), TARGET :: c(*)
  ! PROCEDURE(LIBXSMM_FUNCTION3), POINTER :: xmm
  ! CALL C_F_PROCPOINTER(kernel%handle, xmm)
  ! CALL xmm(...)
  CALL libxsmm_xmmcall_abc(kernel%handle,                         &
&   C_LOC(a), C_LOC(b), C_LOC(c))
END SUBROUTINE

!> Calls the kernel with the given arguments. Alternatively,
!> PROCPOINTER can be used as shown by the inner comments
!> of this routine (LIBXSMM_FUNCTION6). The libxsmm_xmmcall
!> routines can be used in FORTRAN77.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_smmcall_prf
SUBROUTINE libxsmm_smmcall_prf(kernel, a, b, c, pa, pb, pc)
  TYPE(LIBXSMM_SMMFUNCTION), INTENT(IN) :: kernel
  REAL(C_FLOAT), INTENT(IN), TARGET :: a(*), b(*)
  REAL(C_FLOAT), INTENT(INOUT), TARGET :: c(*)
  ! pa/pb/pc: prefetch locations of the next operands.
  REAL(C_FLOAT), INTENT(IN), TARGET :: pa(*)
  REAL(C_FLOAT), INTENT(IN), TARGET :: pb(*)
  REAL(C_FLOAT), INTENT(IN), TARGET :: pc(*)
  ! PROCEDURE(LIBXSMM_FUNCTION6), POINTER :: xmm
  ! CALL C_F_PROCPOINTER(kernel%handle, xmm)
  ! CALL xmm(...)
  CALL libxsmm_xmmcall_prf(kernel%handle,                         &
&   C_LOC(a), C_LOC(b), C_LOC(c),                                 &
&   C_LOC(pa), C_LOC(pb), C_LOC(pc))
END SUBROUTINE

!> See also libxsmm_smmcall_abc and libxsmm_smmcall_prf.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_smmcall
SUBROUTINE libxsmm_smmcall(kernel, a, b, c, pa, pb, pc)
  TYPE(LIBXSMM_SMMFUNCTION), INTENT(IN) :: kernel
  REAL(C_FLOAT), INTENT(IN), TARGET :: a(*), b(*)
  REAL(C_FLOAT), INTENT(INOUT), TARGET :: c(*)
  REAL(C_FLOAT), INTENT(IN), OPTIONAL, TARGET :: pa(*)
  REAL(C_FLOAT), INTENT(IN), OPTIONAL, TARGET :: pb(*)
  REAL(C_FLOAT), INTENT(IN), OPTIONAL, TARGET :: pc(*)
  ! use .OR. instead of .AND. to avoid full check
  IF (PRESENT(pa).OR.PRESENT(pb).OR.PRESENT(pc)) THEN
    CALL libxsmm_xmmcall_prf(kernel%handle,                       &
&     C_LOC(a), C_LOC(b), C_LOC(c),                               &
&     C_LOC(pa), C_LOC(pb), C_LOC(pc))
  ELSE
    CALL libxsmm_xmmcall_abc(kernel%handle,                       &
&     C_LOC(a), C_LOC(b), C_LOC(c))
  END IF
END SUBROUTINE

!> Calls the kernel with the given arguments. Alternatively,
!> PROCPOINTER can be used as shown by the inner comments
!> of this routine (LIBXSMM_FUNCTION3). The libxsmm_xmmcall
!> routines can be used in FORTRAN77.
SUBROUTINE libxsmm_wimmcall_abc(kernel, a, b, c)
  TYPE(LIBXSMM_WIMMFUNCTION), INTENT(IN) :: kernel
  INTEGER(C_SHORT), INTENT(IN), TARGET :: a(*), b(*)
  INTEGER(C_INT), INTENT(INOUT), TARGET :: c(*)
  ! PROCEDURE(LIBXSMM_FUNCTION3), POINTER :: xmm
  ! CALL C_F_PROCPOINTER(kernel%handle, xmm)
  ! CALL xmm(...)
  CALL libxsmm_xmmcall_abc(kernel%handle,                         &
&   C_LOC(a), C_LOC(b), C_LOC(c))
END SUBROUTINE

!> Calls the kernel with the given arguments. Alternatively,
!> PROCPOINTER can be used as shown by the inner comments
!> of this routine (LIBXSMM_FUNCTION6). The libxsmm_xmmcall
!> routines can be used in FORTRAN77.
SUBROUTINE libxsmm_wimmcall_prf(kernel, a, b, c, pa, pb, pc)
  TYPE(LIBXSMM_WIMMFUNCTION), INTENT(IN) :: kernel
  INTEGER(C_SHORT), INTENT(IN), TARGET :: a(*), b(*)
  INTEGER(C_INT), INTENT(INOUT), TARGET :: c(*)
  ! pa/pb/pc: prefetch locations of the next operands.
  INTEGER(C_SHORT), INTENT(IN), TARGET :: pa(*)
  INTEGER(C_SHORT), INTENT(IN), TARGET :: pb(*)
  INTEGER(C_SHORT), INTENT(IN), TARGET :: pc(*)
  ! PROCEDURE(LIBXSMM_FUNCTION6), POINTER :: xmm
  ! CALL C_F_PROCPOINTER(kernel%handle, xmm)
  ! CALL xmm(...)
  CALL libxsmm_xmmcall_prf(kernel%handle,                         &
&   C_LOC(a), C_LOC(b), C_LOC(c),                                 &
&   C_LOC(pa), C_LOC(pb), C_LOC(pc))
END SUBROUTINE

!> See also libxsmm_wimmcall_abc and libxsmm_wimmcall_prf.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_wimmcall
SUBROUTINE libxsmm_wimmcall(kernel, a, b, c, pa, pb, pc)
  TYPE(LIBXSMM_WIMMFUNCTION), INTENT(IN) :: kernel
  INTEGER(C_SHORT), INTENT(IN), TARGET :: a(*), b(*)
  INTEGER(C_INT), INTENT(INOUT), TARGET :: c(*)
  INTEGER(C_SHORT), INTENT(IN), OPTIONAL, TARGET :: pa(*)
  INTEGER(C_SHORT), INTENT(IN), OPTIONAL, TARGET :: pb(*)
  INTEGER(C_SHORT), INTENT(IN), OPTIONAL, TARGET :: pc(*)
  ! use .OR. instead of .AND. to avoid full check
  IF (PRESENT(pa).OR.PRESENT(pb).OR.PRESENT(pc)) THEN
    CALL libxsmm_xmmcall_prf(kernel%handle,                       &
&     C_LOC(a), C_LOC(b), C_LOC(c),                               &
&     C_LOC(pa), C_LOC(pb), C_LOC(pc))
  ELSE
    CALL libxsmm_xmmcall_abc(kernel%handle,                       &
&     C_LOC(a), C_LOC(b), C_LOC(c))
  END IF
END SUBROUTINE

!> Register user-defined key-value; value can be queried (libxsmm_xdispatch).
!> Since the key-type is unknown to LIBXSMM, the key must be binary reproducible,
!> i.e., if it is a structured type (padded data may be uninitialized), it must
!> be initially zero-filled (libxsmm_xclear) followed by an element-wise setup.
!> The size of the key is limited (see documentation). The given value is copied
!> by LIBXSMM and may be initialized at registration-time or whenever queried.
!> Registered data is released at program termination but can be also released
!> if needed (libxsmm_xrelease), .e.g., for larger value for the same key.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xregister
FUNCTION libxsmm_xregister(key, keysize, valsize, valinit)
  TYPE(C_PTR), INTENT(IN), VALUE :: key
  TYPE(C_PTR), INTENT(IN), VALUE, OPTIONAL :: valinit
  INTEGER(C_INT), INTENT(IN) :: keysize, valsize
  TYPE(C_PTR) :: libxsmm_xregister
  !DIR$ ATTRIBUTES OFFLOAD:MIC :: internal_xregister
  INTERFACE
    SUBROUTINE internal_xregister(regval,                         &
&     key, keysize, valsize, valinit)                             &
&     BIND(C, NAME="libxsmm_xregister_")
      IMPORT C_PTR, C_INT
      TYPE(C_PTR), INTENT(OUT) :: regval
      TYPE(C_PTR), INTENT(IN), VALUE :: key, valinit
      INTEGER(C_INT), INTENT(IN) :: keysize, valsize
    END SUBROUTINE
  END INTERFACE
  ! NOTE(review): valinit is OPTIONAL here but non-optional in the
  ! BIND(C) interface; an absent actual is forwarded as-is - confirm
  ! the C-side tolerates the resulting argument value.
  CALL internal_xregister(libxsmm_xregister,                      &
&   key, keysize, valsize, valinit)
END FUNCTION

!> Query user-defined value from LIBXSMM's code registry.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_xdispatch
FUNCTION libxsmm_xdispatch(key, keysize)
  TYPE(C_PTR), INTENT(IN), VALUE :: key
  INTEGER(C_INT), INTENT(IN) :: keysize
  TYPE(C_PTR) :: libxsmm_xdispatch
  !DIR$ ATTRIBUTES OFFLOAD:MIC :: internal_xdispatch
  INTERFACE
    SUBROUTINE internal_xdispatch(regval, key, keysize)           &
&     BIND(C, NAME="libxsmm_xdispatch_")
      IMPORT C_PTR, C_INT
      TYPE(C_PTR), INTENT(OUT) :: regval
      TYPE(C_PTR), INTENT(IN), VALUE :: key
      INTEGER(C_INT), INTENT(IN) :: keysize
    END SUBROUTINE
  END INTERFACE
  CALL internal_xdispatch(libxsmm_xdispatch, key, keysize)
END FUNCTION

!> Auto-dispatched general dense MM (double-precision).
!> This overload belongs to libxsmm_(d)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_dgemm0
PURE SUBROUTINE libxsmm_dgemm0(transa, transb, m, n, k,           &
&   alpha, a, lda, b, ldb, beta, c, ldc)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
  REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
  ! a/b/c are scalar dummies: the caller passes the first element
  ! of each matrix (Fortran sequence association).
  REAL(C_DOUBLE), INTENT(IN) :: a, b
  REAL(C_DOUBLE), INTENT(INOUT) :: c
  !DIR$ ATTRIBUTES OFFLOAD:MIC :: internal_gemm
  INTERFACE
    PURE SUBROUTINE internal_gemm(transa, transb, m, n, k,        &
&     alpha, a, lda, b, ldb, beta, c, ldc)                        &
&     BIND(C, NAME="libxsmm_dgemm_")
      IMPORT C_CHAR, C_DOUBLE, LIBXSMM_BLASINT_KIND
      CHARACTER(C_CHAR), INTENT(IN) :: transa, transb
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldb
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldc
      REAL(C_DOUBLE), INTENT(IN) :: alpha, beta
      REAL(C_DOUBLE), INTENT(IN) :: a, b
      REAL(C_DOUBLE), INTENT(INOUT) :: c
    END SUBROUTINE
  END INTERFACE
  CALL internal_gemm(transa, transb, m, n, k,                     &
&   alpha, a, lda, b, ldb, beta, c, ldc)
END SUBROUTINE

!> Auto-dispatched general dense MM (double-precision).
!> This overload belongs to libxsmm_(d)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_dgemm1
PURE SUBROUTINE libxsmm_dgemm1(transa, transb, m, n, k,           &
&   alpha, a, lda, b, ldb, beta, c, ldc)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
  REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
  REAL(C_DOUBLE), INTENT(IN) :: a(*), b(*)
  REAL(C_DOUBLE), INTENT(INOUT) :: c(*)
  ! Degenerate problem sizes (m, n, or k <= 0) are a no-op.
  IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
    CALL libxsmm_dgemm0(transa, transb, m, n, k,                  &
&     alpha, a(LBOUND(a,1)), lda,                                 &
&     b(LBOUND(b,1)), ldb,                                        &
&     beta, c(LBOUND(c,1)), ldc)
  END IF
END SUBROUTINE

!> Auto-dispatched general dense MM (double-precision).
!> This overload belongs to libxsmm_(d)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_dgemm2
PURE SUBROUTINE libxsmm_dgemm2(transa, transb, m, n, k,           &
&   a, b, c, alpha, beta)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
  ! Leading dimensions are tied to the problem size (lda=m, ldb=k,
  ! ldc=m) in this rank-2 overload.
  REAL(C_DOUBLE), INTENT(IN) :: a(m,*), b(k,*)
  REAL(C_DOUBLE), INTENT(INOUT) :: c(m,*)
  IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
    CALL libxsmm_dgemm0(transa, transb, m, n, k,                  &
&     alpha, a(LBOUND(a,1),LBOUND(a,2)), m,                       &
&     b(LBOUND(b,1),LBOUND(b,2)), k,                              &
&     beta, c(LBOUND(c,1),LBOUND(c,2)), m)
  END IF
END SUBROUTINE

!> Auto-dispatched general dense MM (double-precision).
!> This overload belongs to libxsmm_(d)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_dgemm3
PURE SUBROUTINE libxsmm_dgemm3(transa, transb, m, n, k,           &
&   alpha, a, lda, b, ldb, beta, c, ldc)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc
  REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
  REAL(C_DOUBLE), INTENT(IN) :: a(lda,*), b(ldb,*)
  REAL(C_DOUBLE), INTENT(INOUT) :: c(ldc,*)
  ! Degenerate problem sizes (m, n, or k <= 0) are a no-op.
  IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
    CALL libxsmm_dgemm0(transa, transb, m, n, k,                  &
&     alpha, a(LBOUND(a,1),LBOUND(a,2)), lda,                     &
&     b(LBOUND(b,1),LBOUND(b,2)), ldb,                            &
&     beta, c(LBOUND(c,1),LBOUND(c,2)), ldc)
  END IF
END SUBROUTINE

!> Auto-dispatched general dense MM (single-precision).
!> This overload belongs to libxsmm_(s)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_sgemm0
PURE SUBROUTINE libxsmm_sgemm0(transa, transb, m, n, k,           &
&   alpha, a, lda, b, ldb, beta, c, ldc)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
  REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta
  ! a/b/c are scalar dummies: the caller passes the first element
  ! of each matrix (Fortran sequence association).
  REAL(C_FLOAT), INTENT(IN) :: a, b
  REAL(C_FLOAT), INTENT(INOUT) :: c
  !DIR$ ATTRIBUTES OFFLOAD:MIC :: internal_gemm
  INTERFACE
    PURE SUBROUTINE internal_gemm(transa, transb, m, n, k,        &
&     alpha, a, lda, b, ldb, beta, c, ldc)                        &
&     BIND(C, NAME="libxsmm_sgemm_")
      IMPORT C_CHAR, C_FLOAT, LIBXSMM_BLASINT_KIND
      CHARACTER(C_CHAR), INTENT(IN) :: transa, transb
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldb
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldc
      REAL(C_FLOAT), INTENT(IN) :: alpha, beta
      REAL(C_FLOAT), INTENT(IN) :: a, b
      REAL(C_FLOAT), INTENT(INOUT) :: c
    END SUBROUTINE
  END INTERFACE
  CALL internal_gemm(transa, transb, m, n, k,                     &
&   alpha, a, lda, b, ldb, beta, c, ldc)
END SUBROUTINE

!> Auto-dispatched general dense MM (single-precision).
!> This overload belongs to libxsmm_(s)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_sgemm1
PURE SUBROUTINE libxsmm_sgemm1(transa, transb, m, n, k,           &
&   alpha, a, lda, b, ldb, beta, c, ldc)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
  REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta
  REAL(C_FLOAT), INTENT(IN) :: a(*), b(*)
  REAL(C_FLOAT), INTENT(INOUT) :: c(*)
  IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
    CALL libxsmm_sgemm0(transa, transb, m, n, k,                  &
&     alpha, a(LBOUND(a,1)), lda,                                 &
&     b(LBOUND(b,1)), ldb,                                        &
&     beta, c(LBOUND(c,1)), ldc)
  END IF
END SUBROUTINE

!> Auto-dispatched general dense MM (single-precision).
!> This overload belongs to libxsmm_(s)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_sgemm2
PURE SUBROUTINE libxsmm_sgemm2(transa, transb, m, n, k,           &
&   a, b, c, alpha, beta)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta
  ! Leading dimensions are tied to the problem size (lda=m, ldb=k,
  ! ldc=m) in this rank-2 overload.
  REAL(C_FLOAT), INTENT(IN) :: a(m,*), b(k,*)
  REAL(C_FLOAT), INTENT(INOUT) :: c(m,*)
  IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
    CALL libxsmm_sgemm0(transa, transb, m, n, k,                  &
&     alpha, a(LBOUND(a,1),LBOUND(a,2)), m,                       &
&     b(LBOUND(b,1),LBOUND(b,2)), k,                              &
&     beta, c(LBOUND(c,1),LBOUND(c,2)), m)
  END IF
END SUBROUTINE

!> Auto-dispatched general dense MM (single-precision).
!> This overload belongs to libxsmm_(s)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_sgemm3
PURE SUBROUTINE libxsmm_sgemm3(transa, transb, m, n, k,           &
&   alpha, a, lda, b, ldb, beta, c, ldc)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc
  REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta
  REAL(C_FLOAT), INTENT(IN) :: a(lda,*), b(ldb,*)
  REAL(C_FLOAT), INTENT(INOUT) :: c(ldc,*)
  ! Degenerate problem sizes (m, n, or k <= 0) are a no-op.
  IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
    CALL libxsmm_sgemm0(transa, transb, m, n, k,                  &
&     alpha, a(LBOUND(a,1),LBOUND(a,2)), lda,                     &
&     b(LBOUND(b,1),LBOUND(b,2)), ldb,                            &
&     beta, c(LBOUND(c,1),LBOUND(c,2)), ldc)
  END IF
END SUBROUTINE

!> Auto-dispatched general dense MM (low-precision, int-accumulate).
!> This overload belongs to libxsmm_(wi)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_wigemm0
PURE SUBROUTINE libxsmm_wigemm0(transa, transb, m, n, k,          &
&   alpha, a, lda, b, ldb, beta, c, ldc)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
  INTEGER(C_INT), INTENT(IN), OPTIONAL :: alpha, beta
  ! a/b are I16 operands; c accumulates in I32 (scalar dummies,
  ! the caller passes the first element of each matrix).
  INTEGER(C_SHORT), INTENT(IN) :: a, b
  INTEGER(C_INT), INTENT(INOUT) :: c
  !DIR$ ATTRIBUTES OFFLOAD:MIC :: internal_gemm
  INTERFACE
    PURE SUBROUTINE internal_gemm(transa, transb, m, n, k,        &
&     alpha, a, lda, b, ldb, beta, c, ldc)                        &
&     BIND(C, NAME="libxsmm_wigemm_")
      IMPORT C_CHAR, C_SHORT, C_INT, LIBXSMM_BLASINT_KIND
      CHARACTER(C_CHAR), INTENT(IN) :: transa, transb
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldb
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldc
      INTEGER(C_INT), INTENT(IN) :: alpha, beta
      INTEGER(C_SHORT), INTENT(IN) :: a, b
      INTEGER(C_INT), INTENT(INOUT) :: c
    END SUBROUTINE
  END INTERFACE
  CALL internal_gemm(transa, transb, m, n, k,                     &
&   alpha, a, lda, b, ldb, beta, c, ldc)
END SUBROUTINE

!> Auto-dispatched general dense MM (low-precision, int-accumulate).
!> This overload belongs to libxsmm_(wi)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_wigemm1
PURE SUBROUTINE libxsmm_wigemm1(transa, transb, m, n, k,          &
&   alpha, a, lda, b, ldb, beta, c, ldc)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
  INTEGER(C_INT), INTENT(IN), OPTIONAL :: alpha, beta
  INTEGER(C_SHORT), INTENT(IN) :: a(*), b(*)
  INTEGER(C_INT), INTENT(INOUT) :: c(*)
  IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
    CALL libxsmm_wigemm0(transa, transb, m, n, k,                 &
&     alpha, a(LBOUND(a,1)), lda,                                 &
&     b(LBOUND(b,1)), ldb,                                        &
&     beta, c(LBOUND(c,1)), ldc)
  END IF
END SUBROUTINE

!> Auto-dispatched general dense MM (low-precision, int-accumulate).
!> This overload belongs to libxsmm_(wi)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_wigemm2
PURE SUBROUTINE libxsmm_wigemm2(transa, transb, m, n, k,          &
&   a, b, c, alpha, beta)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(C_INT), INTENT(IN), OPTIONAL :: alpha, beta
  ! Leading dimensions are tied to the problem size (lda=m, ldb=k,
  ! ldc=m) in this rank-2 overload.
  INTEGER(C_SHORT), INTENT(IN) :: a(m,*), b(k,*)
  INTEGER(C_INT), INTENT(INOUT) :: c(m,*)
  IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
    CALL libxsmm_wigemm0(transa, transb, m, n, k,                 &
&     alpha, a(LBOUND(a,1),LBOUND(a,2)), m,                       &
&     b(LBOUND(b,1),LBOUND(b,2)), k,                              &
&     beta, c(LBOUND(c,1),LBOUND(c,2)), m)
  END IF
END SUBROUTINE

!> Auto-dispatched general dense MM (low-precision, int-accumulate).
!> This overload belongs to libxsmm_(wi)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_wigemm3
PURE SUBROUTINE libxsmm_wigemm3(transa, transb, m, n, k,          &
&   alpha, a, lda, b, ldb, beta, c, ldc)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc
  INTEGER(C_INT), INTENT(IN), OPTIONAL :: alpha, beta
  INTEGER(C_SHORT), INTENT(IN) :: a(lda,*), b(ldb,*)
  INTEGER(C_INT), INTENT(INOUT) :: c(ldc,*)
  ! Degenerate problem sizes (m, n, or k <= 0) are a no-op.
  IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
    CALL libxsmm_wigemm0(transa, transb, m, n, k,                 &
&     alpha, a(LBOUND(a,1),LBOUND(a,2)), lda,                     &
&     b(LBOUND(b,1),LBOUND(b,2)), ldb,                            &
&     beta, c(LBOUND(c,1),LBOUND(c,2)), ldc)
  END IF
END SUBROUTINE

!> Re-exposes the BLAS based GEMM routine with an interface similar to
!> libxsmm_(d)gemm. This overload belongs to libxsmm_blas_(d)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_blas_dgemm0
PURE SUBROUTINE libxsmm_blas_dgemm0(transa, transb, m, n, k,      &
&   alpha, a, lda, b, ldb, beta, c, ldc)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
  REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
  ! a/b/c are scalar dummies: the caller passes the first element
  ! of each matrix (Fortran sequence association).
  REAL(C_DOUBLE), INTENT(IN) :: a, b
  REAL(C_DOUBLE), INTENT(INOUT) :: c
  !DIR$ ATTRIBUTES OFFLOAD:MIC :: internal_gemm
  INTERFACE
    PURE SUBROUTINE internal_gemm(transa, transb, m, n, k,        &
&     alpha, a, lda, b, ldb, beta, c, ldc)                        &
&     BIND(C, NAME="libxsmm_blas_dgemm_")
      IMPORT C_CHAR, C_DOUBLE, LIBXSMM_BLASINT_KIND
      CHARACTER(C_CHAR), INTENT(IN) :: transa, transb
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldb
      INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldc
      REAL(C_DOUBLE), INTENT(IN) :: alpha, beta
      REAL(C_DOUBLE), INTENT(IN) :: a, b
      REAL(C_DOUBLE), INTENT(INOUT) :: c
    END SUBROUTINE
  END INTERFACE
  CALL internal_gemm(transa, transb, m, n, k,                     &
&   alpha, a, lda, b, ldb, beta, c, ldc)
END SUBROUTINE

!> Re-exposes the BLAS based GEMM routine with an interface similar to
!> libxsmm_(d)gemm. This overload belongs to libxsmm_blas_(d)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_blas_dgemm1
PURE SUBROUTINE libxsmm_blas_dgemm1(transa, transb, m, n, k,      &
&   alpha, a, lda, b, ldb, beta, c, ldc)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
  REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
  REAL(C_DOUBLE), INTENT(IN) :: a(*), b(*)
  REAL(C_DOUBLE), INTENT(INOUT) :: c(*)
  IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
    CALL libxsmm_blas_dgemm0(transa, transb, m, n, k,             &
&     alpha, a(LBOUND(a,1)), lda,                                 &
&     b(LBOUND(b,1)), ldb,                                        &
&     beta, c(LBOUND(c,1)), ldc)
  END IF
END SUBROUTINE

!> Re-exposes the BLAS based GEMM routine with an interface similar to
!> libxsmm_(d)gemm. This overload belongs to libxsmm_blas_(d)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_blas_dgemm2
PURE SUBROUTINE libxsmm_blas_dgemm2(transa, transb, m, n, k,      &
&   a, b, c, alpha, beta)
  CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
  INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
  REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
  ! Leading dimensions are tied to the problem size (lda=m, ldb=k,
  ! ldc=m) in this rank-2 overload.
  REAL(C_DOUBLE), INTENT(IN) :: a(m,*), b(k,*)
  REAL(C_DOUBLE), INTENT(INOUT) :: c(m,*)
  IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
    CALL libxsmm_blas_dgemm0(transa, transb, m, n, k,             &
&     alpha, a(LBOUND(a,1),LBOUND(a,2)), m,                       &
&     b(LBOUND(b,1),LBOUND(b,2)), k,                              &
&     beta, c(LBOUND(c,1),LBOUND(c,2)), m)
  END IF
END SUBROUTINE

!> Re-exposes the BLAS based GEMM routine with an interface similar to
!> libxsmm_(d)gemm. This overload belongs to libxsmm_blas_(d)gemm.
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_blas_dgemm3 PURE SUBROUTINE libxsmm_blas_dgemm3(transa, transb, m, n, k, & & alpha, a, lda, b, ldb, beta, c, ldc) CHARACTER, INTENT(IN), OPTIONAL :: transa, transb INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta REAL(C_DOUBLE), INTENT(IN) :: a(lda,*), b(ldb,*) REAL(C_DOUBLE), INTENT(INOUT) :: c(ldc,*) IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN CALL libxsmm_blas_dgemm0(transa, transb, m, n, k, & & alpha, a(LBOUND(a,1),LBOUND(a,2)), lda, & & b(LBOUND(b,1),LBOUND(b,2)), ldb, & & beta, c(LBOUND(c,1),LBOUND(c,2)), ldc) END IF END SUBROUTINE !> Re-exposes BLAS based GEMM routine with an interfaces similar to !> libxsmm_(s)gemm. This overload belongs to libxsmm_blas_(s)gemm. !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_blas_sgemm0 PURE SUBROUTINE libxsmm_blas_sgemm0(transa, transb, m, n, k, & & alpha, a, lda, b, ldb, beta, c, ldc) CHARACTER, INTENT(IN), OPTIONAL :: transa, transb INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta REAL(C_FLOAT), INTENT(IN) :: a, b REAL(C_FLOAT), INTENT(INOUT) :: c !DIR$ ATTRIBUTES OFFLOAD:MIC :: internal_gemm INTERFACE PURE SUBROUTINE internal_gemm(transa, transb, m, n, k, & & alpha, a, lda, b, ldb, beta, c, ldc) & & BIND(C, NAME="libxsmm_blas_sgemm_") IMPORT C_CHAR, C_FLOAT, LIBXSMM_BLASINT_KIND CHARACTER(C_CHAR), INTENT(IN) :: transa, transb INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldb INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldc REAL(C_FLOAT), INTENT(IN) :: alpha, beta REAL(C_FLOAT), INTENT(IN) :: a, b REAL(C_FLOAT), INTENT(INOUT) :: c END SUBROUTINE 
END INTERFACE CALL internal_gemm(transa, transb, m, n, k, & & alpha, a, lda, b, ldb, beta, c, ldc) END SUBROUTINE !> Re-exposes BLAS based GEMM routine with an interfaces similar to !> libxsmm_(s)gemm. This overload belongs to libxsmm_blas_(s)gemm. !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_blas_sgemm1 PURE SUBROUTINE libxsmm_blas_sgemm1(transa, transb, m, n, k, & & alpha, a, lda, b, ldb, beta, c, ldc) CHARACTER, INTENT(IN), OPTIONAL :: transa, transb INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta REAL(C_FLOAT), INTENT(IN) :: a(*), b(*) REAL(C_FLOAT), INTENT(INOUT) :: c(*) IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN CALL libxsmm_blas_sgemm0(transa, transb, m, n, k, & & alpha, a(LBOUND(a,1)), lda, & & b(LBOUND(b,1)), ldb, & & beta, c(LBOUND(c,1)), ldc) END IF END SUBROUTINE !> Re-exposes BLAS based GEMM routine with an interfaces similar to !> libxsmm_(s)gemm. This overload belongs to libxsmm_blas_(s)gemm. !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_blas_sgemm2 PURE SUBROUTINE libxsmm_blas_sgemm2(transa, transb, m, n, k, & & a, b, c, alpha, beta) CHARACTER, INTENT(IN), OPTIONAL :: transa, transb INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta REAL(C_FLOAT), INTENT(IN) :: a(m,*), b(k,*) REAL(C_FLOAT), INTENT(INOUT) :: c(m,*) IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN CALL libxsmm_blas_sgemm0(transa, transb, m, n, k, & & alpha, a(LBOUND(a,1),LBOUND(a,2)), m, & & b(LBOUND(b,1),LBOUND(b,2)), k, & & beta, c(LBOUND(c,1),LBOUND(c,2)), m) END IF END SUBROUTINE !> Re-exposes BLAS based GEMM routine with an interfaces similar to !> libxsmm_(s)gemm. This overload belongs to libxsmm_blas_(s)gemm. 
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_blas_sgemm3 PURE SUBROUTINE libxsmm_blas_sgemm3(transa, transb, m, n, k, & & alpha, a, lda, b, ldb, beta, c, ldc) CHARACTER, INTENT(IN), OPTIONAL :: transa, transb INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta REAL(C_FLOAT), INTENT(IN) :: a(lda,*), b(ldb,*) REAL(C_FLOAT), INTENT(INOUT) :: c(ldc,*) IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN CALL libxsmm_blas_sgemm0(transa, transb, m, n, k, & & alpha, a(LBOUND(a,1),LBOUND(a,2)), lda, & & b(LBOUND(b,1),LBOUND(b,2)), ldb, & & beta, c(LBOUND(c,1),LBOUND(c,2)), ldc) END IF END SUBROUTINE !> Matrix-copy (2-dimensional copy) routine. If the input (optional) !> is not present, the routine is used to zero-fill the out-matrix. !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_matcopy_p0 PURE SUBROUTINE libxsmm_matcopy_p0(output, input, typesize, & & m, n, ldi, ldo) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), & & OPTIONAL :: n, ldi, ldo INTEGER(C_INT), INTENT(IN) :: typesize TYPE(C_PTR), INTENT(IN), OPTIONAL :: input TYPE(C_PTR), INTENT(IN) :: output CALL libxsmm_xmatcopy(output, input, typesize, & & m, n, ldi, ldo) END SUBROUTINE !> Matrix-copy (2-dimensional copy) routine (DP/rank-1). !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_matcopy_d1 SUBROUTINE libxsmm_matcopy_d1(output, input, m, n, ldi, ldo) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo REAL(C_DOUBLE), INTENT(OUT), TARGET :: output(*) REAL(C_DOUBLE), INTENT(IN), OPTIONAL, TARGET :: input(*) CALL libxsmm_xmatcopy(C_LOC(output), C_LOC(input), 8, & & m, n, ldi, ldo) END SUBROUTINE !> Matrix-copy (2-dimensional copy) routine (DP/rank-2). 
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_matcopy_d2 SUBROUTINE libxsmm_matcopy_d2(output, input, m, n, ldi, ldo) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo REAL(C_DOUBLE), INTENT(OUT), TARGET :: output(ldo,*) REAL(C_DOUBLE), INTENT(IN), OPTIONAL, TARGET :: input(ldi,*) CALL libxsmm_xmatcopy(C_LOC(output), C_LOC(input), 8, & & m, n, ldi, ldo) END SUBROUTINE !> Matrix-copy (2-dimensional copy) routine (SP/rank-1). !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_matcopy_s1 SUBROUTINE libxsmm_matcopy_s1(output, input, m, n, ldi, ldo) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo REAL(C_FLOAT), INTENT(OUT), TARGET :: output(*) REAL(C_FLOAT), INTENT(IN), OPTIONAL, TARGET :: input(*) CALL libxsmm_xmatcopy(C_LOC(output), C_LOC(input), 4, & & m, n, ldi, ldo) END SUBROUTINE !> Matrix-copy (2-dimensional copy) routine (SP/rank-2). !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_matcopy_s2 SUBROUTINE libxsmm_matcopy_s2(output, input, m, n, ldi, ldo) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo REAL(C_FLOAT), INTENT(OUT), TARGET :: output(ldo,*) REAL(C_FLOAT), INTENT(IN), OPTIONAL, TARGET :: input(ldi,*) CALL libxsmm_xmatcopy(C_LOC(output), C_LOC(input), 4, & & m, n, ldi, ldo) END SUBROUTINE !> Transpose a matrix (in-place form). !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_itrans_p0 PURE SUBROUTINE libxsmm_itrans_p0(matrix, typesize, m, n, ld) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n, ld TYPE(C_PTR), INTENT(IN) :: matrix INTEGER(C_INT), INTENT(IN) :: typesize CALL libxsmm_xitrans(matrix, typesize, m, n, ld) END SUBROUTINE !> Transpose a matrix (in-place form, DP/rank-1). 
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_itrans_d1 SUBROUTINE libxsmm_itrans_d1(matrix, m, n, ld) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n, ld REAL(C_DOUBLE), INTENT(INOUT), TARGET :: matrix(*) CALL libxsmm_xitrans(C_LOC(matrix), 8, m, n, ld) END SUBROUTINE !> Transpose a matrix (in-place form, DP/rank-2). !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_itrans_d2 SUBROUTINE libxsmm_itrans_d2(matrix, m, n, ld) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ld REAL(C_DOUBLE), INTENT(INOUT), TARGET :: matrix(ld,*) CALL libxsmm_xitrans(C_LOC(matrix), 8, m, n, ld) END SUBROUTINE !> Transpose a matrix (in-place form, SP/rank-1). !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_itrans_s1 SUBROUTINE libxsmm_itrans_s1(matrix, m, n, ld) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n, ld REAL(C_FLOAT), INTENT(INOUT), TARGET :: matrix(*) CALL libxsmm_xitrans(C_LOC(matrix), 4, m, n, ld) END SUBROUTINE !> Transpose a matrix (in-place form, SP/rank-2). !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_itrans_s2 SUBROUTINE libxsmm_itrans_s2(matrix, m, n, ld) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ld REAL(C_FLOAT), INTENT(INOUT), TARGET :: matrix(ld,*) CALL libxsmm_xitrans(C_LOC(matrix), 4, m, n, ld) END SUBROUTINE !> Transpose a matrix (out-of-place form). !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_otrans_p0 PURE SUBROUTINE libxsmm_otrans_p0(output, input, typesize, & & m, n, ldi, ldo) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo TYPE(C_PTR), INTENT(IN) :: output, input INTEGER(C_INT), INTENT(IN) :: typesize CALL libxsmm_xotrans(output, input, typesize, m, n, ldi, ldo) END SUBROUTINE !> Transpose a matrix (out-of-place form, DP/rank-1). 
!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_otrans_d1 SUBROUTINE libxsmm_otrans_d1(output, input, m, n, ldi, ldo) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo REAL(C_DOUBLE), INTENT(OUT), TARGET :: output(*) REAL(C_DOUBLE), INTENT(IN), TARGET :: input(*) CALL libxsmm_xotrans(C_LOC(output), C_LOC(input), & & 8, m, n, ldi, ldo) END SUBROUTINE !> Transpose a matrix (out-of-place form, DP/rank-2). !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_otrans_d2 SUBROUTINE libxsmm_otrans_d2(output, input, m, n, ldi, ldo) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo REAL(C_DOUBLE), INTENT(OUT), TARGET :: output(ldo,*) REAL(C_DOUBLE), INTENT(IN), TARGET :: input(ldi,*) CALL libxsmm_xotrans(C_LOC(output), C_LOC(input), & & 8, m, n, ldi, ldo) END SUBROUTINE !> Transpose a matrix (out-of-place form, SP/rank-1). !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_otrans_s1 SUBROUTINE libxsmm_otrans_s1(output, input, m, n, ldi, ldo) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo REAL(C_FLOAT), INTENT(OUT), TARGET :: output(*) REAL(C_FLOAT), INTENT(IN), TARGET :: input(*) CALL libxsmm_xotrans(C_LOC(output), C_LOC(input), & & 4, m, n, ldi, ldo) END SUBROUTINE !> Transpose a matrix (out-of-place form, SP/rank-2). !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_otrans_s2 SUBROUTINE libxsmm_otrans_s2(output, input, m, n, ldi, ldo) INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo REAL(C_FLOAT), INTENT(OUT), TARGET :: output(ldo,*) REAL(C_FLOAT), INTENT(IN), TARGET :: input(ldi,*) CALL libxsmm_xotrans(C_LOC(output), C_LOC(input), & & 4, m, n, ldi, ldo) END SUBROUTINE !> Returns the difference between two timer ticks (cycles). 
!> Implicit FORTRAN 77 interface: subroutine available. !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_timer_ncycles PURE FUNCTION libxsmm_timer_ncycles(tick0, tick1) INTEGER(LIBXSMM_TICKINT_KIND), INTENT(IN) :: tick0, tick1 INTEGER(LIBXSMM_TICKINT_KIND) :: libxsmm_timer_ncycles !DIR$ ATTRIBUTES OFFLOAD:MIC :: internal_timer_ncycles INTERFACE PURE SUBROUTINE internal_timer_ncycles(ncycles, & & tick0, tick1) BIND(C, NAME="libxsmm_timer_ncycles_") IMPORT LIBXSMM_TICKINT_KIND INTEGER(LIBXSMM_TICKINT_KIND), INTENT(IN) :: tick0, tick1 INTEGER(LIBXSMM_TICKINT_KIND), INTENT(OUT) :: ncycles END SUBROUTINE END INTERFACE CALL internal_timer_ncycles( & & libxsmm_timer_ncycles, tick0, tick1) END FUNCTION !> Utility function to calculate a collection of scalar differences !> between two matrices (libxsmm_matdiff_info). The location (m, n) !> of the largest difference (linf_abs) is recorded (also if NaN). !> In case of NaN, differences are set to infinity. If no difference !> is discovered, the location (m, n) is negative (OOB). 
!> Implicit FORTRAN 77 interface: !> TYPE :: info !> INTEGER(4) :: datatype !> INTEGER(4|8) :: m, n, ldref, ldtst !> ARRAY :: ref, tst !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_matdiff PURE SUBROUTINE libxsmm_matdiff(info, datatype, m, n, & & ref, tst, ldref, ldtst) INTEGER(C_INT), INTENT(IN) :: datatype INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), & & OPTIONAL :: n, ldref, ldtst TYPE(C_PTR), INTENT(IN), OPTIONAL :: ref, tst TYPE(LIBXSMM_MATDIFF_INFO), INTENT(OUT) :: info !DIR$ ATTRIBUTES OFFLOAD:MIC :: internal_matdiff INTERFACE PURE SUBROUTINE internal_matdiff(info, datatype, m, n, & & ref, tst, ldref, ldtst) BIND(C, NAME="libxsmm_matdiff_") IMPORT LIBXSMM_MATDIFF_INFO, LIBXSMM_BLASINT_KIND IMPORT C_PTR, C_INT INTEGER(C_INT), INTENT(IN) :: datatype INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldref, ldtst TYPE(C_PTR), INTENT(IN), VALUE :: ref, tst TYPE(LIBXSMM_MATDIFF_INFO), INTENT(OUT) :: info END SUBROUTINE END INTERFACE CALL internal_matdiff(info, datatype, m, n, & & ref, tst, ldref, ldtst) END SUBROUTINE !> Calculate co-prime number <= n/2 (except: libxsmm_shuffle(0|1) == 0). !> Implicit FORTRAN 77 interface: !> INTEGER(4) :: coprime (OUT) !> INTEGER(4) :: n !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_shuffle ELEMENTAL FUNCTION libxsmm_shuffle(n) INTEGER(C_LONG_LONG) :: libxsmm_shuffle INTEGER(C_INT), INTENT(IN) :: n !DIR$ ATTRIBUTES OFFLOAD:MIC :: internal_shuffle INTERFACE PURE SUBROUTINE internal_shuffle(coprime, n) & & BIND(C, NAME="libxsmm_shuffle_") IMPORT C_LONG_LONG, C_INT INTEGER(C_LONG_LONG), INTENT(OUT) :: coprime INTEGER(C_INT), INTENT(IN) :: n END SUBROUTINE END INTERFACE libxsmm_shuffle = INT(0, KIND=C_LONG_LONG) ! avoid warning (older CRAY) CALL internal_shuffle(libxsmm_shuffle, n) END FUNCTION !> Calculates a hash value for the given array and seed. 
!> FORTRAN 77: see libxsmm_xhash !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_hash_char FUNCTION libxsmm_hash_char(key, seed) CHARACTER(C_CHAR), INTENT(IN) :: key(:) INTEGER(C_INT), INTENT(IN) :: seed INTEGER(C_INT) :: libxsmm_hash_char libxsmm_hash_char = seed CALL libxsmm_xhash(libxsmm_hash_char, & & libxsmm_ptr(key), SIZE(key)) END FUNCTION !> Calculates a hash value for the given array and seed. !> FORTRAN 77: see libxsmm_xhash !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_hash_i8 FUNCTION libxsmm_hash_i8(key, seed) INTEGER(C_INT8_T), INTENT(IN) :: key(:) INTEGER(C_INT), INTENT(IN) :: seed INTEGER(C_INT) :: libxsmm_hash_i8 libxsmm_hash_i8 = seed CALL libxsmm_xhash(libxsmm_hash_i8, & & libxsmm_ptr(key), SIZE(key)) END FUNCTION !> Calculates a hash value for the given array and seed. !> FORTRAN 77: see libxsmm_xhash !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_hash_i32 FUNCTION libxsmm_hash_i32(key, seed) INTEGER(C_INT), INTENT(IN) :: key(:) INTEGER(C_INT), INTENT(IN) :: seed INTEGER(C_INT) :: libxsmm_hash_i32 libxsmm_hash_i32 = seed CALL libxsmm_xhash(libxsmm_hash_i32, & & libxsmm_ptr(key), SIZE(key) * 4) END FUNCTION !> Calculates a hash value for the given array and seed. !> FORTRAN 77: see libxsmm_xhash !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_hash_i64 FUNCTION libxsmm_hash_i64(key, seed) INTEGER(C_LONG_LONG), INTENT(IN) :: key(:) INTEGER(C_INT), INTENT(IN) :: seed INTEGER(C_INT) :: libxsmm_hash_i64 libxsmm_hash_i64 = seed CALL libxsmm_xhash(libxsmm_hash_i64, & & libxsmm_ptr(key), SIZE(key) * 8) END FUNCTION !> Calculates if there is a difference between two arrays. !> FORTRAN 77: see libxsmm_xdiff !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_diff_char FUNCTION libxsmm_diff_char(a, b) CHARACTER(C_CHAR), INTENT(IN) :: a(:), b(:) LOGICAL(C_BOOL) :: libxsmm_diff_char IF (SIZE(a, KIND=C_LONG_LONG) .EQ. 
SIZE(b, KIND=C_LONG_LONG)) & & THEN CALL libxsmm_xdiff(libxsmm_diff_char, & & libxsmm_ptr(a), libxsmm_ptr(b), & & SIZE(a, KIND=C_LONG_LONG)) ELSE libxsmm_diff_char = LOGICAL(.TRUE., KIND=C_BOOL) END IF END FUNCTION !> Calculates if there is a difference between two arrays. !> FORTRAN 77: see libxsmm_xdiff !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_diff_i8 FUNCTION libxsmm_diff_i8(a, b) INTEGER(C_INT8_T), INTENT(IN) :: a(:), b(:) LOGICAL(C_BOOL) :: libxsmm_diff_i8 IF (SIZE(a, KIND=C_LONG_LONG) .EQ. SIZE(b, KIND=C_LONG_LONG)) & & THEN CALL libxsmm_xdiff(libxsmm_diff_i8, & & libxsmm_ptr(a), libxsmm_ptr(b), & & SIZE(a, KIND=C_LONG_LONG)) ELSE libxsmm_diff_i8 = LOGICAL(.TRUE., KIND=C_BOOL) END IF END FUNCTION !> Calculates if there is a difference between two arrays. !> FORTRAN 77: see libxsmm_xdiff !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_diff_i32 FUNCTION libxsmm_diff_i32(a, b) INTEGER(C_INT), INTENT(IN) :: a(:), b(:) LOGICAL(C_BOOL) :: libxsmm_diff_i32 IF (SIZE(a, KIND=C_LONG_LONG) .EQ. SIZE(b, KIND=C_LONG_LONG)) & & THEN CALL libxsmm_xdiff(libxsmm_diff_i32, & & libxsmm_ptr(a), libxsmm_ptr(b), & & SIZE(a, KIND=C_LONG_LONG) * INT(4, KIND=C_LONG_LONG)) ELSE libxsmm_diff_i32 = LOGICAL(.TRUE., KIND=C_BOOL) END IF END FUNCTION !> Calculates if there is a difference between two arrays. !> FORTRAN 77: see libxsmm_xdiff !DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_diff_i64 FUNCTION libxsmm_diff_i64(a, b) INTEGER(C_LONG_LONG), INTENT(IN) :: a(:), b(:) LOGICAL(C_BOOL) :: libxsmm_diff_i64 IF (SIZE(a, KIND=C_LONG_LONG) .EQ. 
SIZE(b, KIND=C_LONG_LONG)) & & THEN CALL libxsmm_xdiff(libxsmm_diff_i64, & & libxsmm_ptr(a), libxsmm_ptr(b), & & SIZE(a, KIND=C_LONG_LONG) * INT(8, KIND=C_LONG_LONG)) ELSE libxsmm_diff_i64 = LOGICAL(.TRUE., KIND=C_BOOL) END IF END FUNCTION END MODULE libxsmm-1.17/src/template/libxsmm.h000066400000000000000000002167751415223013700173630ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #ifndef LIBXSMM_H #define LIBXSMM_H #include "libxsmm_config.h" /** * Strings to denote the version of LIBXSMM (libxsmm_config.h). * LIBXSMM_VERSION: Name of the version (stringized version numbers). * LIBXSMM_BRANCH: Name of the branch this version is derived from. */ #define LIBXSMM_VERSION LIBXSMM_CONFIG_VERSION #define LIBXSMM_BRANCH LIBXSMM_CONFIG_BRANCH /** * Semantic version according to https://semver.org/ (see also libxsmm_config.h). * LIBXSMM_VERSION_MAJOR: Major version derived from the most recent RCS-tag. * LIBXSMM_VERSION_MINOR: Minor version derived from the most recent RCS-tag. * LIBXSMM_VERSION_UPDATE: Update number derived from the most recent RCS-tag. * LIBXSMM_VERSION_PATCH: Patch number based on distance to most recent RCS-tag. 
*/ #define LIBXSMM_VERSION_MAJOR LIBXSMM_CONFIG_VERSION_MAJOR #define LIBXSMM_VERSION_MINOR LIBXSMM_CONFIG_VERSION_MINOR #define LIBXSMM_VERSION_UPDATE LIBXSMM_CONFIG_VERSION_UPDATE #define LIBXSMM_VERSION_PATCH LIBXSMM_CONFIG_VERSION_PATCH /** * The following interfaces shall be explicitly included, * i.e., separate from libxsmm.h: * - libxsmm_intrinsics_x86.h * - libxsmm_cpuid.h * - libxsmm_sync.h * - libxsmm_mhd.h */ #include "libxsmm_dnn_convolution.h" #include "libxsmm_dnn_fullyconnected.h" #include "libxsmm_dnn_fusedbatchnorm.h" #include "libxsmm_dnn_fusedgroupnorm.h" #include "libxsmm_dnn_pooling.h" #include "libxsmm_dnn_rnncell.h" #include "libxsmm_dnn_softmaxloss.h" #include "libxsmm_dnn_optimizer.h" #include "libxsmm_blocked_gemm.h" #include "libxsmm_generator.h" #include "libxsmm_frontend.h" #include "libxsmm_fsspmdm.h" #include "libxsmm_malloc.h" #include "libxsmm_spmdm.h" #include "libxsmm_cpuid.h" #include "libxsmm_timer.h" #include "libxsmm_math.h" #include "libxsmm_rng.h" /** Initialize the library; pay for setup cost at a specific point. */ LIBXSMM_API void libxsmm_init(void); /** De-initialize the library and free internal memory (optional). */ LIBXSMM_API void libxsmm_finalize(void); /** * Returns the architecture and instruction set extension as determined by the CPUID flags, as set * by the libxsmm_get_target_arch* functions, or as set by the LIBXSMM_TARGET environment variable. */ LIBXSMM_API int libxsmm_get_target_archid(void); /** Set target architecture (id: see libxsmm_typedefs.h) for subsequent code generation (JIT). */ LIBXSMM_API void libxsmm_set_target_archid(int id); /** * Returns the name of the target architecture as determined by the CPUID flags, as set by the * libxsmm_get_target_arch* functions, or as set by the LIBXSMM_TARGET environment variable. */ LIBXSMM_API const char* libxsmm_get_target_arch(void); /** Set target architecture (arch="0|sse|snb|hsw|knl|knm|skx|clx|cpx", NULL/"0": CPUID). 
*/ LIBXSMM_API void libxsmm_set_target_arch(const char* arch); /** Get the level of verbosity. */ LIBXSMM_API int libxsmm_get_verbosity(void); /** * Set the level of verbosity (0: off, positive value: verbosity level, * negative value: maximum verbosity, which also dumps JIT-code) */ LIBXSMM_API void libxsmm_set_verbosity(int level); /** Get the default prefetch strategy. */ LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_auto_prefetch(void); /** Set the default prefetch strategy. */ LIBXSMM_API void libxsmm_set_gemm_auto_prefetch(libxsmm_gemm_prefetch_type strategy); /** Receive information about JIT-generated code. */ LIBXSMM_API int libxsmm_get_kernel_info(const void* kernel, libxsmm_kernel_info* info); /** Get information about the matrix multiplication kernel. */ LIBXSMM_API int libxsmm_get_mmkernel_info(libxsmm_xmmfunction kernel, libxsmm_mmkernel_info* info); /** Get information about the matrix transpose kernel. */ LIBXSMM_API int libxsmm_get_transkernel_info(libxsmm_xtransfunction kernel, libxsmm_transkernel_info* info); /** Get information about the matrix copy kernel. */ LIBXSMM_API int libxsmm_get_mcopykernel_info(libxsmm_xmcopyfunction kernel, libxsmm_mcopykernel_info* info); /** Get information about the matrix eltwise kernel. */ LIBXSMM_API int libxsmm_get_meltwkernel_info(libxsmm_xmeltwfunction kernel, libxsmm_meltwkernel_info* info); /** Get information about the code registry. */ LIBXSMM_API int libxsmm_get_registry_info(libxsmm_registry_info* info); /** * Register user-defined key-value. * Since the key-type is unknown to LIBXSMM, the key must be binary reproducible, * i.e., if it is a structured type (padded data may be uninitialized), it must * be initially zero-filled (memset) followed by an element-wise initialization. * The size of the key is limited (see documentation). The given value is copied * by LIBXSMM and may be initialized at registration-time or whenever queried. 
* Registered data is released at program termination but can be also released * if needed (libxsmm_xrelease), .e.g., for larger value for the same key. */ LIBXSMM_API void* libxsmm_xregister(const void* key, size_t key_size, size_t value_size, const void* value_init); /** Query user-defined value from LIBXSMM's code registry. */ LIBXSMM_API void* libxsmm_xdispatch(const void* key, size_t key_size); /** Remove key-value pair from code registry and release memory. */ LIBXSMM_API void libxsmm_xrelease(const void* key, size_t key_size); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (descriptor form). */ LIBXSMM_API libxsmm_xmmfunction libxsmm_xmmdispatch(const libxsmm_gemm_descriptor* descriptor); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction libxsmm_dmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (single-precision). 
*/ LIBXSMM_API libxsmm_smmfunction libxsmm_smmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (bf16 inputs, fp32-accumulate) */ LIBXSMM_API libxsmm_bsmmfunction libxsmm_bsmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs) */ LIBXSMM_API libxsmm_bmmfunction libxsmm_bmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (low/short-precision, int-accumulate) */ LIBXSMM_API libxsmm_wimmfunction libxsmm_wimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (low/char-precision, int-accumulate) */ LIBXSMM_API libxsmm_ssbimmfunction libxsmm_ssbimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); LIBXSMM_API libxsmm_usbimmfunction 
libxsmm_usbimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); LIBXSMM_API libxsmm_subimmfunction libxsmm_subimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); LIBXSMM_API libxsmm_uubimmfunction libxsmm_uubimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate SMM-kernel; returns NULL if it does not exist or if JIT is not supported (low/char-precision, int-accumulate, int8 outputs) */ LIBXSMM_API libxsmm_sububmmfunction libxsmm_sububmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (single-precision). 
*/ LIBXSMM_API libxsmm_smmfunction_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate). */ LIBXSMM_API libxsmm_bsmmfunction_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs). */ LIBXSMM_API libxsmm_bmmfunction_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int16 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_wimmfunction_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_usbimmfunction_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_subimmfunction_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_uubimmfunction_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate, int8 outputs). 
*/ LIBXSMM_API libxsmm_sububmmfunction_reducebatch_addr libxsmm_sububmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (single-precision). */ LIBXSMM_API libxsmm_smmfunction_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate). */ LIBXSMM_API libxsmm_bsmmfunction_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs). 
*/ LIBXSMM_API libxsmm_bmmfunction_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int16 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_wimmfunction_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_usbimmfunction_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_subimmfunction_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_uubimmfunction_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate, int8 outputs). */ LIBXSMM_API libxsmm_sububmmfunction_reducebatch_addr libxsmm_sububmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (single-precision). 
*/ LIBXSMM_API libxsmm_smmfunction_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate). */ LIBXSMM_API libxsmm_bsmmfunction_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs). */ LIBXSMM_API libxsmm_bmmfunction_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int16 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_wimmfunction_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_usbimmfunction_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_subimmfunction_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_uubimmfunction_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate, int8 outputs). 
*/ LIBXSMM_API libxsmm_sububmmfunction_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (single-precision). */ LIBXSMM_API libxsmm_smmfunction_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate). */ LIBXSMM_API libxsmm_bsmmfunction_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs). 
*/ LIBXSMM_API libxsmm_bmmfunction_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int16 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_wimmfunction_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_usbimmfunction_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_subimmfunction_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_uubimmfunction_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate, int8 outputs). */ LIBXSMM_API libxsmm_sububmmfunction_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (single-precision). 
*/ LIBXSMM_API libxsmm_smmfunction_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate). */ LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs). */ LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int16 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_wimmfunction_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_usbimmfunction_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_subimmfunction_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_uubimmfunction_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate, int8 outputs). 
*/ LIBXSMM_API libxsmm_sububmmfunction_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (double-precision). */ LIBXSMM_API libxsmm_dmmfunction_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const double* alpha, const double* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (single-precision). */ LIBXSMM_API libxsmm_smmfunction_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate). 
*/ LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (bf16 inputs, fp32-accumulate internally, bf16 outputs). */ LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int16 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_wimmfunction_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_usbimmfunction_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). */ LIBXSMM_API libxsmm_subimmfunction_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate). 
*/ LIBXSMM_API libxsmm_uubimmfunction_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** Query or JIT-generate reduction kernel; returns NULL if JIT is not supported (int8 inputs, int32-accumulate, int8 outputs). */ LIBXSMM_API libxsmm_sububmmfunction_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const int* alpha, const int* beta, const int* flags, const int* prefetch); /** * Process a series of matrix multiplications (batch). See also libxsmm_gemm_batch/omp. * The kind of matrix operands (a, b, c) depend on index_stride: * index_stride==0: pointers to pointers of elements, e.g., double** for the C matrices. * index_stride!=0: pointer to elements, e.g., const double* for the A and B matrices. */ LIBXSMM_API void libxsmm_mmbatch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, /** Determines index-base (usually 0, 1 for one-based indexes); uses the same unit as the strides. */ libxsmm_blasint index_base, /** * Stride used to walk stride_a, stride_b, and stride_c; zero turns stride_* into scalar values. * The index_stride is measured in Bytes (sizeof(libxsmm_blasint) determines packed indexes). 
*/ libxsmm_blasint index_stride, /** * Depending on index_stride, the meaning of stride_a, stride_b, and stride_c is different. * index_stride==0: stride_a, stride_b, and stride_c are pointers to scalar values. * index_stride!=0: stride_* are indexes determining the position of a, b, and c operands. */ const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], /** * Number of matrix multiplications. If the size is given as a negative value, * then internal synchronization is omitted. */ libxsmm_blasint batchsize, /** Thread-ID (TID), and number of threads. */ /*unsigned*/int tid, /*unsigned*/int nthreads); /** Process a series of matrix multiplications (batch). See also libxsmm_mmbatch. */ LIBXSMM_API void libxsmm_gemm_batch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize); /** Process a series of matrix multiplications (batch) with OpenMP (libxsmmext). See also libxsmm_mmbatch. 
*/ LIBXSMM_APIEXT void libxsmm_gemm_batch_omp(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], libxsmm_blasint batchsize); /** Unlike libxsmm_gemm_batch, groups of homogeneous batches are possible (double-precision). */ LIBXSMM_API void libxsmm_dgemm_batch(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]); /** Unlike libxsmm_gemm_batch, groups of homogeneous batches are possible (single-precision). */ LIBXSMM_API void libxsmm_sgemm_batch(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]); /** Unlike libxsmm_gemm_batch, groups of homogeneous batches are possible (double-precision). 
*/ LIBXSMM_APIEXT void libxsmm_dgemm_batch_omp(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]); /** Unlike libxsmm_gemm_batch, groups of homogeneous batches are possible (single-precision). */ LIBXSMM_APIEXT void libxsmm_sgemm_batch_omp(const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]); /** * This function is a no-op unless LIBXSMM is built to intercept GEMM calls. * Pointer arguments are used to filter intercepted GEMM calls such that * non-NULL values match. Otherwise (NULL) the respective argument is * considered a "free value", i.e., every value can match; libxsmmext required. */ LIBXSMM_APIEXT void libxsmm_mmbatch_begin(libxsmm_gemm_precision precision, const int* flags, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const void* alpha, const void* beta); /** Processes the batch of previously recorded matrix multiplications (libxsmm_mmbatch_begin); libxsmmext required. */ LIBXSMM_APIEXT void libxsmm_mmbatch_end(void); /** Code generation routine for matrix-copy using a descriptor. 
*/ LIBXSMM_API libxsmm_xmcopyfunction libxsmm_dispatch_mcopy(const libxsmm_mcopy_descriptor* descriptor); /** Code generation routine for matrix-eltwise using a descriptor. */ LIBXSMM_API libxsmm_xmeltwfunction libxsmm_dispatch_meltw(const libxsmm_meltw_descriptor* descriptor); LIBXSMM_API libxsmm_meltwfunction_copy libxsmm_dispatch_meltw_copy(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type); LIBXSMM_API libxsmm_meltwfunction_zero libxsmm_dispatch_meltw_zero(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type); LIBXSMM_API libxsmm_meltwfunction_add libxsmm_dispatch_meltw_add(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type); LIBXSMM_API libxsmm_meltwfunction_mul libxsmm_dispatch_meltw_mul(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type); LIBXSMM_API libxsmm_meltwfunction_relu libxsmm_dispatch_meltw_relu(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type); LIBXSMM_API libxsmm_meltwfunction_cvtfp32bf16 libxsmm_dispatch_meltw_cvtfp32bf16(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type); LIBXSMM_API libxsmm_meltwfunction_cvtfp32bf16_act libxsmm_dispatch_meltw_cvtfp32bf16_act(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_cvta_flags flags); LIBXSMM_API libxsmm_meltwfunction_act_cvtfp32bf16 libxsmm_dispatch_meltw_act_cvtfp32bf16(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* 
ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_acvt_flags flags); LIBXSMM_API libxsmm_meltwfunction_reduce libxsmm_dispatch_meltw_reduce(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_redu_flags flags); LIBXSMM_API libxsmm_meltwfunction_scale libxsmm_dispatch_meltw_scale(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_scal_flags flags); /** Code generation routine for transposes using a descriptor */ LIBXSMM_API libxsmm_xtransfunction libxsmm_dispatch_trans(const libxsmm_trans_descriptor* descriptor); /** Code generation routine for GEMM/packed using a descriptor */ LIBXSMM_API libxsmm_pgemm_xfunction libxsmm_dispatch_pgemm(const libxsmm_pgemm_descriptor* descriptor); /** Code generation routine for GETRF/packed using a descriptor */ LIBXSMM_API libxsmm_getrf_xfunction libxsmm_dispatch_getrf(const libxsmm_getrf_descriptor* descriptor); /** Code generation routine for TRMM/packed using a descriptor */ LIBXSMM_API libxsmm_trmm_xfunction libxsmm_dispatch_trmm(const libxsmm_trmm_descriptor* descriptor); /** Code generation routine for TRSM/packed using a descriptor */ LIBXSMM_API libxsmm_trsm_xfunction libxsmm_dispatch_trsm(const libxsmm_trsm_descriptor* descriptor); /** * Code generation routine for the CSR format which multiplies a dense SOA matrix (each element holds a SIMD-width * wide vector) and a sparse matrix or a sparse matrix with a dense SOA matrix. * The result is always a SOA matrix. There is no code cache, and user code has to manage the code pointers. * Call libxsmm_release_kernel in order to deallocate the JIT'ted code. 
*/ LIBXSMM_API libxsmm_xmmfunction libxsmm_create_xcsr_soa(const libxsmm_gemm_descriptor* descriptor, const unsigned int* row_ptr, const unsigned int* column_idx, const void* values, unsigned int packed_width); /** * Code generation routine for the CSC format which multiplies a dense SOA matrix (each element holds a SIMD-width * wide vector) and a sparse matrix or a sparse matrix with a dense SOA matrix. * The result is always a SOA matrix. There is no code cache, and user code has to manage the code pointers. * Call libxsmm_release_kernel in order to deallocate the JIT'ted code. */ LIBXSMM_API libxsmm_xmmfunction libxsmm_create_xcsc_soa(const libxsmm_gemm_descriptor* descriptor, const unsigned int* column_ptr, const unsigned int* row_idx, const void* values, unsigned int packed_width); /** * Code generation routine for row-major format B matrix which is multiplied by a dense packed matrix (each element holds a SIMD-width * wide vector) and the result is another packed matrix. The memory layout of the SOA matrix is [row][col][packed]. * here is no code cache, and user code has to manage the code pointers. * Call libxsmm_release_kernel in order to deallocate the JIT'ted code. */ LIBXSMM_API libxsmm_xmmfunction libxsmm_create_pgemm_ac_rm(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width); /** * Code generation routine for row-major format A matrix which is multiplied by a dense packed matrix (each element holds a SIMD-width * wide vector) and the result is another packed matrix. The memory layout of the packed matrix is [row][col][packed]. * here is no code cache, and user code has to manage the code pointers. * Call libxsmm_release_kernel in order to deallocate the JIT'ted code. */ LIBXSMM_API libxsmm_xmmfunction libxsmm_create_pgemm_bc_rm(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width); /** * Code generation routine for the CSR format which multiplies a dense matrix B into a dense matrix C. 
* The sparse matrix a is kept in registers. * Call libxsmm_release_kernel in order to deallocate the JIT'ted code. */ LIBXSMM_API libxsmm_dmmfunction libxsmm_create_dcsr_reg(const libxsmm_gemm_descriptor* descriptor, const unsigned int* row_ptr, const unsigned int* column_idx, const double* values); /** * Code generation routine for the CSR format which multiplies a dense matrix B into a dense matrix C. * The sparse matrix a is kept in registers. * Call libxsmm_release_kernel in order to deallocate the JIT'ted code. */ LIBXSMM_API libxsmm_smmfunction libxsmm_create_scsr_reg(const libxsmm_gemm_descriptor* descriptor, const unsigned int* row_ptr, const unsigned int* column_idx, const float* values); /** * Deallocates the JIT'ted code as returned by libxsmm_create_* functions, * unregisters and releases code from the code registry. */ LIBXSMM_API void libxsmm_release_kernel(const void* kernel); /** Matrix copy function ("in" can be NULL to zero the destination). */ LIBXSMM_API void libxsmm_matcopy(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo); /** Matrix copy function ("in" can be NULL to zero the destination, per-thread form). */ LIBXSMM_API void libxsmm_matcopy_thread(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo, /*unsigned*/int tid, /*unsigned*/int nthreads); /** Matrix copy function ("in" can be NULL to zero the destination); MT via libxsmmext. */ LIBXSMM_APIEXT void libxsmm_matcopy_omp(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo); /** Matrix transposition (out-of-place form). */ LIBXSMM_API void libxsmm_otrans(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo); /** Matrix transposition (out-of-place form, per-thread form). 
*/ LIBXSMM_API void libxsmm_otrans_thread(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo, /*unsigned*/int tid, /*unsigned*/int nthreads); /** Matrix transposition; MT via libxsmmext (out-of-place form). */ LIBXSMM_APIEXT void libxsmm_otrans_omp(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo); /** Matrix transposition (in-place form). */ LIBXSMM_API void libxsmm_itrans(void* inout, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld); /** Initialize GEMM-handle; allows to better amortize setup overhead. */ LIBXSMM_API libxsmm_gemm_handle* libxsmm_gemm_handle_init(libxsmm_gemm_blob* blob, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const void* alpha, const void* beta, int flags, /*unsigned*/int ntasks); /** Calculate required scratch buffer size needed to perform libxsmm_gemm_thread. */ LIBXSMM_API size_t libxsmm_gemm_handle_get_scratch_size(const libxsmm_gemm_handle* handle); /** Low-level type-agnostic GEMM suitable for external threads or tasks. */ LIBXSMM_API void libxsmm_gemm_thread(const libxsmm_gemm_handle* handle, void* scratch, const void* a, const void* b, void* c, /*unsigned*/int tid, /*unsigned*/int nthreads); /** General dense matrix multiplication (sequential). 
*/ LIBXSMM_API void libxsmm_xgemm(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc); /** General dense matrix multiplication (libxsmmext); available as xgemm (generic), dgemm (DP), and sgemm (SP). */ LIBXSMM_APIEXT void libxsmm_xgemm_omp(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc); /** Dispatched general dense matrix multiplication (double-precision). */ LIBXSMM_API void libxsmm_dgemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc); /** Dispatched general dense matrix multiplication (single-precision). */ LIBXSMM_API void libxsmm_sgemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc); /** Dispatched general dense matrix multiplication (I16 input, I32 result). 
*/ LIBXSMM_API void libxsmm_wigemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const int* alpha, const short* a, const libxsmm_blasint* lda, const short* b, const libxsmm_blasint* ldb, const int* beta, int* c, const libxsmm_blasint* ldc); /** Dispatched general dense matrix multiplication (BF16 input, F32 result). */ LIBXSMM_API void libxsmm_bsgemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const libxsmm_bfloat16* a, const libxsmm_blasint* lda, const libxsmm_bfloat16* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc); #if !defined(LIBXSMM_DEFAULT_CONFIG) && !defined(LIBXSMM_SOURCE_H) $MNK_INTERFACE_LIST #endif /*!defined(LIBXSMM_DEFAULT_CONFIG)*/ #if defined(__cplusplus) /** Map a built-in type to libxsmm_gemm_precision (libxsmm_gemm_precision_enum). */ template struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = static_cast(LIBXSMM_DATATYPE_UNSUPPORTED); }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_F64; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_F32; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_I32; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_I16; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_BF16; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_BF16; }; 
template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_I8; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_I8; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_precision_enum { static const libxsmm_gemm_precision value = LIBXSMM_GEMM_PRECISION_I8; }; template struct LIBXSMM_RETARGETABLE libxsmm_gemm_default_output { typedef INP_TYPE type; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_default_output { typedef int type; }; template<> struct LIBXSMM_RETARGETABLE libxsmm_gemm_default_output { typedef int type; }; /** Construct and execute a specialized function. */ template::type> class LIBXSMM_RETARGETABLE libxsmm_mmfunction { mutable/*retargetable*/ libxsmm_xmmfunction m_function; public: typedef INP_TYPE itype; typedef OUT_TYPE otype; public: libxsmm_mmfunction() { m_function.xmm = 0; } libxsmm_mmfunction(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, int flags = LIBXSMM_FLAGS) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, m, k, m, NULL/*alpha*/, NULL/*beta*/, flags, libxsmm_get_gemm_xprefetch(NULL)); m_function.xmm = (0 != desc ? libxsmm_xmmdispatch(desc).xmm : 0); } libxsmm_mmfunction(int flags, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, int prefetch) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, m, k, m, NULL/*alpha*/, NULL/*beta*/, flags, libxsmm_get_gemm_prefetch(prefetch)); m_function.xmm = (0 != desc ? 
libxsmm_xmmdispatch(desc).xmm : 0); } libxsmm_mmfunction(int flags, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, otype alpha, otype beta) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, m, k, m, &alpha, &beta, flags, libxsmm_get_gemm_xprefetch(NULL)); m_function.xmm = (0 != desc ? libxsmm_xmmdispatch(desc).xmm : 0); } libxsmm_mmfunction(int flags, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, otype alpha, otype beta, int prefetch) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, m, k, m, &alpha, &beta, flags, libxsmm_get_gemm_prefetch(prefetch)); m_function.xmm = (0 != desc ? libxsmm_xmmdispatch(desc).xmm : 0); } libxsmm_mmfunction(int flags, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, int prefetch) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, lda, ldb, ldc, NULL/*alpha*/, NULL/*beta*/, flags, libxsmm_get_gemm_prefetch(prefetch)); m_function.xmm = (0 != desc ? libxsmm_xmmdispatch(desc).xmm : 0); } libxsmm_mmfunction(int flags, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, otype alpha, otype beta) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, lda, ldb, ldc, &alpha, &beta, flags, libxsmm_get_gemm_xprefetch(NULL)); m_function.xmm = (0 != desc ? 
libxsmm_xmmdispatch(desc).xmm : 0); } libxsmm_mmfunction(int flags, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, otype alpha, otype beta, int prefetch) { libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, libxsmm_gemm_precision_enum::value, libxsmm_gemm_precision_enum::value, m, n, k, lda, ldb, ldc, &alpha, &beta, flags, libxsmm_get_gemm_prefetch(prefetch)); m_function.xmm = (0 != desc ? libxsmm_xmmdispatch(desc).xmm : 0); } public: const libxsmm_xmmfunction& kernel() const { return m_function; } operator const void*() const { return 0 != m_function.xmm ? this : 0; } void operator()(const itype* a, const itype* b, otype* c) const { LIBXSMM_MMCALL_ABC(m_function.xmm, a, b, c); } void operator()(const itype* a, const itype* b, otype* c, const itype* pa, const itype* pb, const otype* pc) const { LIBXSMM_MMCALL_PRF(m_function.xmm, a, b, c, pa, pb, pc); } }; /** Matrix copy function ("in" can be NULL to zero the destination). */ template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) { return libxsmm_matcopy(out, in, sizeof(T), m, n, ldi, ldo); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi) { return libxsmm_matcopy(out, in, m, n, ldi, ldi); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n) { return libxsmm_matcopy(out, in, m, n, m); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy(T* out, const T* in, libxsmm_blasint n) { return libxsmm_matcopy(out, in, n, n); } /** Matrix copy function ("in" can be NULL to zero the destination); MT via libxsmmext. 
*/ template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy_omp(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) { return libxsmm_matcopy_omp(out, in, sizeof(T), m, n, ldi, ldo); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy_omp(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi) { return libxsmm_matcopy_omp(out, in, m, n, ldi, ldi); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy_omp(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n) { return libxsmm_matcopy_omp(out, in, m, n, m); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_matcopy_omp(T* out, const T* in, libxsmm_blasint n) { return libxsmm_matcopy_omp(out, in, n, n); } /** Matrix transposition (out-of-place form). */ template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) { return libxsmm_otrans(out, in, sizeof(T), m, n, ldi, ldo); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi) { return libxsmm_trans(out, in, m, n, ldi, ldi); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n) { return libxsmm_trans(out, in, m, n, m); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* out, const T* in, libxsmm_blasint n) { return libxsmm_trans(out, in, n, n); } /** Matrix transposition; MT via libxsmmext (out-of-place form). 
*/ template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans_omp(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) { return libxsmm_otrans_omp(out, in, sizeof(T), m, n, ldi, ldo); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans_omp(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi) { return libxsmm_trans_omp(out, in, m, n, ldi, ldi); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans_omp(T* out, const T* in, libxsmm_blasint m, libxsmm_blasint n) { return libxsmm_trans_omp(out, in, m, n, m); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans_omp(T* out, const T* in, libxsmm_blasint n) { return libxsmm_trans_omp(out, in, n, n); } /** Matrix transposition (in-place form). */ template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* inout, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi) { return libxsmm_itrans(inout, sizeof(T), m, n, ldi); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* inout, libxsmm_blasint m, libxsmm_blasint n) { return libxsmm_trans(inout, m, n, m); } template inline/*superfluous*/ LIBXSMM_RETARGETABLE int libxsmm_trans(T* inout, libxsmm_blasint n) { return libxsmm_trans(inout, n, n); } /** Dispatched general dense matrix multiplication (double-precision). 
*/ inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { libxsmm_dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, /* by-value */ libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { libxsmm_dgemm(transa, transb, &m, &n, &k, alpha, a, lda, b, ldb, beta, c, ldc); } /** Dispatched general dense matrix multiplication (single-precision). */ inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, /* by-value */ libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_sgemm(transa, transb, &m, &n, &k, alpha, a, lda, b, ldb, beta, c, ldc); } /** Dispatched general dense matrix multiplication (low-precision). 
*/ inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const int* alpha, const short* a, const libxsmm_blasint* lda, const short* b, const libxsmm_blasint* ldb, const int* beta, int* c, const libxsmm_blasint* ldc) { libxsmm_wigemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, /* by-value */ libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const int* alpha, const short* a, const libxsmm_blasint* lda, const short* b, const libxsmm_blasint* ldb, const int* beta, int* c, const libxsmm_blasint* ldc) { libxsmm_wigemm(transa, transb, &m, &n, &k, alpha, a, lda, b, ldb, beta, c, ldc); } /** Dispatched general dense matrix multiplication (low-precision). */ inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const libxsmm_bfloat16* a, const libxsmm_blasint* lda, const libxsmm_bfloat16* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_bsgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline LIBXSMM_RETARGETABLE void libxsmm_gemm(const char* transa, const char* transb, /* by-value */ libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const float* alpha, const libxsmm_bfloat16* a, const libxsmm_blasint* lda, const libxsmm_bfloat16* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_bsgemm(transa, transb, &m, &n, &k, alpha, a, lda, b, ldb, beta, c, ldc); } /** General dense matrix multiplication based on LAPACK/BLAS (double-precision). 
*/ inline LIBXSMM_RETARGETABLE void libxsmm_blas_gemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { libxsmm_blas_dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline LIBXSMM_RETARGETABLE void libxsmm_blas_gemm(const char* transa, const char* transb, /* by-value */ libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, const double* beta, double* c, const libxsmm_blasint* ldc) { libxsmm_blas_dgemm(transa, transb, &m, &n, &k, alpha, a, lda, b, ldb, beta, c, ldc); } /** General dense matrix multiplication based on LAPACK/BLAS (single-precision). */ inline LIBXSMM_RETARGETABLE void libxsmm_blas_gemm(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_blas_sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline LIBXSMM_RETARGETABLE void libxsmm_blas_gemm(const char* transa, const char* transb, /* by-value */ libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { libxsmm_blas_sgemm(transa, transb, &m, &n, &k, alpha, a, lda, b, ldb, beta, c, ldc); } #endif /*__cplusplus*/ #endif /*LIBXSMM_H*/ 
libxsmm-1.17/src/template/libxsmm_blocked_gemm.tpl.c000066400000000000000000000157631415223013700226360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.), Dheevatsa Mudigere (Intel Corp.) Alexander Heinecke (Intel Corp.), Hans Pabst (Intel Corp.) ******************************************************************************/ LIBXSMM_VLA_DECL(2, libxsmm_blocked_gemm_lock, locks, handle->locks, handle->nb); /* TODO: pad thread-local buffer members by the size of a cache-line in order to avoid "Ping-Pong" */ LIBXSMM_VLA_DECL(2, LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C, l_out, (LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C*)(((char*)handle->buffer) + ltid * LIBXSMM_UP2(handle->bm * handle->bn * sizeof(LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C), LIBXSMM_CACHELINE)), handle->bm); LIBXSMM_VLA_DECL(4, const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB, real_a, (const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB*)a, handle->kb, handle->bk, handle->bm); LIBXSMM_VLA_DECL(4, const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB, real_b, (const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB*)b, handle->kb, handle->bn, handle->bk); LIBXSMM_VLA_DECL(4, LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C, real_c, (LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C*)c, handle->mb, handle->bn, handle->bm); const LIBXSMM_MMFUNCTION_TYPE2(LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB, LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C) kernel = handle->kernel.LIBXSMM_TPREFIX2(LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB, LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C, mm); const LIBXSMM_MMFUNCTION_TYPE2(LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB, 
LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C) kernel_pf = handle->kernel_pf.LIBXSMM_TPREFIX2(LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_AB, LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE_C, mm); const libxsmm_blasint b_m1 = handle->b_m1; const libxsmm_blasint b_n1 = handle->b_n1; const libxsmm_blasint b_k1 = handle->b_k1; const libxsmm_blasint b_k2 = handle->b_k2; const libxsmm_blasint mm = handle->m / b_m1; const libxsmm_blasint nn = handle->n / b_n1; const libxsmm_blasint kk = handle->k / b_k1; const libxsmm_blasint nw_i = mm / handle->bm; const libxsmm_blasint nw_j = nn / handle->bn; const libxsmm_blasint nw_k = kk / handle->bk; const libxsmm_blasint nw = nw_i * nw_j; libxsmm_blasint m, n, k, mb, nb, kb; libxsmm_blasint ki, kj = 0, w_i, ki2; libxsmm_blasint nw_k2 = nw_k; /* TODO: take transa and transb into account (flags) */ for (ki = 0; ki < handle->bn; ++ki) { LIBXSMM_PRAGMA_SIMD for (kj = 0; kj < handle->bm; ++kj) { LIBXSMM_VLA_ACCESS(2, l_out, ki, kj, handle->bm) = 0; } } for (mb = 0, m = 0; mb < b_m1; ++mb, m += nw_i) { for (nb = 0, n = 0; nb < b_n1; ++nb, n += nw_j) { for (kb = 0, k = 0; kb < b_k1; ++kb, k += nw_k2) { const libxsmm_blasint nw_k3 = nw_k / b_k2; const libxsmm_blasint nw2 = nw * nw_k3; const libxsmm_blasint s = (ltid * nw2) / handle->nthreads; const libxsmm_blasint e = (((libxsmm_blasint)ltid + 1) * nw2) / handle->nthreads; libxsmm_blasint o_i2 = 0, o_j2 = 0; nw_k2 = nw_k3; for (w_i = s; w_i < e; ++w_i) { libxsmm_blasint i2 = 0, j2 = 0, k2 = 0; internal_bgemm_order(handle->order, w_i, nw_i, nw_j, nw_k2, &i2, &j2, &k2); i2 += m; j2 += n; k2 += k; if (w_i == s) { o_i2 = i2; o_j2 = j2; } else { if (o_i2 != i2 || o_j2 != j2) { libxsmm_blocked_gemm_lock *const lock = &LIBXSMM_VLA_ACCESS(2, locks, o_i2, o_j2, handle->nb); LIBXSMM_ATOMIC_ACQUIRE(&lock->state, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED); for (ki = 0; ki < handle->bn; ++ki) { LIBXSMM_PRAGMA_SIMD for (kj = 0; kj < handle->bm; ++kj) { LIBXSMM_VLA_ACCESS(4, real_c, o_j2, o_i2, ki, kj, handle->mb, handle->bn, 
handle->bm) += LIBXSMM_VLA_ACCESS(2, l_out, ki, kj, handle->bm); } } LIBXSMM_ATOMIC_RELEASE(&lock->state, LIBXSMM_ATOMIC_RELAXED); for (ki = 0; ki < handle->bn; ++ki) { LIBXSMM_PRAGMA_SIMD for (kj = 0; kj < handle->bm; ++kj) { LIBXSMM_VLA_ACCESS(2, l_out, ki, kj, handle->bm) = 0; } } o_i2 = i2; o_j2 = j2; } } if (0 != kernel_pf) { /* prefetch */ for (ki2 = 0, ki = (b_k2 * k2); ki2 < b_k2; ++ki2, ++ki) { if (k2 < (nw_k - 2)) { /* prefetch */ kernel_pf(&LIBXSMM_VLA_ACCESS(4, real_a, i2, ki, 0, 0, handle->kb, handle->bk, handle->bm), &LIBXSMM_VLA_ACCESS(4, real_b, j2, ki, 0, 0, handle->kb, handle->bn, handle->bk), &LIBXSMM_VLA_ACCESS(2, l_out, 0, 0, handle->bm), &LIBXSMM_VLA_ACCESS(4, real_a, i2, ki+1, 0, 0, handle->kb, handle->bk, handle->bm), &LIBXSMM_VLA_ACCESS(4, real_b, j2, ki+1, 0, 0, handle->kb, handle->bn, handle->bk), NULL); } else { /* avoid prefetching OOB */ kernel(&LIBXSMM_VLA_ACCESS(4, real_a, i2, ki, 0, 0, handle->kb, handle->bk, handle->bm), &LIBXSMM_VLA_ACCESS(4, real_b, j2, ki, 0, 0, handle->kb, handle->bn, handle->bk), &LIBXSMM_VLA_ACCESS(2, l_out, 0, 0, handle->bm)); } } } else { /* no prefetch */ for (ki2 = 0, ki = (b_k2 * k2); ki2 < b_k2; ++ki2, ++ki) { kernel(&LIBXSMM_VLA_ACCESS(4, real_a, i2, ki, 0, 0, handle->kb, handle->bk, handle->bm), &LIBXSMM_VLA_ACCESS(4, real_b, j2, ki, 0, 0, handle->kb, handle->bn, handle->bk), &LIBXSMM_VLA_ACCESS(2, l_out, 0, 0, handle->bm)); } } if (w_i == (e - 1)) { libxsmm_blocked_gemm_lock* lock; o_i2 = i2; o_j2 = j2; lock = &LIBXSMM_VLA_ACCESS(2, locks, o_i2, o_j2, handle->nb); LIBXSMM_ATOMIC_ACQUIRE(&lock->state, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED); for (ki = 0; ki < handle->bn; ++ki) { LIBXSMM_PRAGMA_SIMD for (kj = 0; kj < handle->bm; ++kj) { LIBXSMM_VLA_ACCESS(4, real_c, o_j2, o_i2, ki, kj, handle->mb, handle->bn, handle->bm) += LIBXSMM_VLA_ACCESS(2, l_out, ki, kj, handle->bm); } } LIBXSMM_ATOMIC_RELEASE(&lock->state, LIBXSMM_ATOMIC_RELAXED); for (ki = 0; ki < handle->bn; ++ki) { LIBXSMM_PRAGMA_SIMD 
for (kj = 0; kj < handle->bm; ++kj) { LIBXSMM_VLA_ACCESS(2, l_out, ki, kj, handle->bm) = 0; } } } } } } } libxsmm-1.17/src/template/libxsmm_blocked_gemm_convert_b_to_a.tpl.c000066400000000000000000000026641415223013700256750ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.) ******************************************************************************/ LIBXSMM_VLA_DECL(4, LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE, real_dst, (LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE*)dst, handle->nb, handle->bn, handle->bm); LIBXSMM_VLA_DECL(4, const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE, real_src, (const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE*)src, handle->mb, handle->bn, handle->bm); libxsmm_blasint mb, nb, bm, bn; for (mb = 0; mb < handle->mb; ++mb) { for (nb = 0; nb < handle->nb; ++nb) { for (bn = 0; bn < handle->bn; ++bn) { for (bm = 0; bm < handle->bm; ++bm) { LIBXSMM_VLA_ACCESS(4, real_dst, mb, nb, bn, bm, handle->nb, handle->bn, handle->bm) = LIBXSMM_VLA_ACCESS(4, real_src, nb, mb, bn, bm, handle->mb, handle->bn, handle->bm); } } } } libxsmm-1.17/src/template/libxsmm_blocked_gemm_copyin_a.tpl.c000066400000000000000000000026561415223013700245140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Hans Pabst (Intel Corp.) ******************************************************************************/ LIBXSMM_VLA_DECL(4, LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE, real_dst, (LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE*)dst, handle->kb, handle->bk, handle->bm); LIBXSMM_VLA_DECL(2, const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE, real_src, (const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE*)src, handle->m); libxsmm_blasint mb, kb, bm, bk; for (mb = 0; mb < handle->mb; ++mb) { for (kb = 0; kb < handle->kb; ++kb) { for (bk = 0; bk < handle->bk; ++bk) { for (bm = 0; bm < handle->bm; ++bm) { LIBXSMM_VLA_ACCESS(4, real_dst, mb, kb, bk, bm, handle->kb, handle->bk, handle->bm) = LIBXSMM_VLA_ACCESS(2, real_src, kb * handle->bk + bk, mb * handle->bm + bm, handle->m); } } } } libxsmm-1.17/src/template/libxsmm_blocked_gemm_copyin_b.tpl.c000066400000000000000000000026561415223013700245150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Hans Pabst (Intel Corp.) 
******************************************************************************/ LIBXSMM_VLA_DECL(4, LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE, real_dst, (LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE*)dst, handle->kb, handle->bn, handle->bk); LIBXSMM_VLA_DECL(2, const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE, real_src, (const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE*)src, handle->k); libxsmm_blasint kb, nb, bk, bn; for (nb = 0; nb < handle->nb; ++nb) { for (kb = 0; kb < handle->kb; ++kb) { for (bn = 0; bn < handle->bn; ++bn) { for (bk = 0; bk < handle->bk; ++bk) { LIBXSMM_VLA_ACCESS(4, real_dst, nb, kb, bn, bk, handle->kb, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, real_src, nb * handle->bn + bn, kb * handle->bk + bk, handle->k); } } } } libxsmm-1.17/src/template/libxsmm_blocked_gemm_copyin_c.tpl.c000066400000000000000000000026561415223013700245160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Hans Pabst (Intel Corp.) 
******************************************************************************/ LIBXSMM_VLA_DECL(4, LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE, real_dst, (LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE*)dst, handle->mb, handle->bn, handle->bm); LIBXSMM_VLA_DECL(2, const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE, real_src, (const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE*)src, handle->m); libxsmm_blasint mb, nb, bm, bn; for (nb = 0; nb < handle->nb; ++nb) { for (mb = 0; mb < handle->mb; ++mb) { for (bn = 0; bn < handle->bn; ++bn) { for (bm = 0; bm < handle->bm; ++bm) { LIBXSMM_VLA_ACCESS(4, real_dst, nb, mb, bn, bm, handle->mb, handle->bn, handle->bm) = LIBXSMM_VLA_ACCESS(2, real_src, nb * handle->bn + bn, mb * handle->bm + bm, handle->m); } } } } libxsmm-1.17/src/template/libxsmm_blocked_gemm_copyout_c.tpl.c000066400000000000000000000026321415223013700247110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ LIBXSMM_VLA_DECL(4, const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE, real_src, (const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE*)src, handle->mb, handle->bn, handle->bm); LIBXSMM_VLA_DECL(2, LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE, real_dst, (LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE*)dst, handle->m); libxsmm_blasint mb, nb, bm, bn; for (nb = 0; nb < handle->nb; ++nb) { for (mb = 0; mb < handle->mb; ++mb) { for (bn = 0; bn < handle->bn; ++bn) { for (bm = 0; bm < handle->bm; ++bm) { LIBXSMM_VLA_ACCESS(2, real_dst, nb * handle->bn + bn, mb * handle->bm + bm, handle->m) = LIBXSMM_VLA_ACCESS(4, real_src, nb, mb, bn, bm, handle->mb, handle->bn, handle->bm); } } } } libxsmm-1.17/src/template/libxsmm_blocked_gemm_transpose_b.tpl.c000066400000000000000000000043111415223013700252200ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.) 
******************************************************************************/ LIBXSMM_VLA_DECL(4, LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE, real_dst, (LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE*)dst, handle->kb, handle->bn, handle->bk); LIBXSMM_VLA_DECL(4, const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE, real_src, (const LIBXSMM_BLOCKED_GEMM_TEMPLATE_TYPE*)src, handle->nb, handle->bk, handle->bn); libxsmm_blasint kb, nb, bk, bn; libxsmm_blasint ii, jj, job, jobT; if (handle->n == handle->k && handle->bn == handle->bk) { for (kb = 0; kb < handle->kb; ++kb) { for (nb = 0; nb < handle->nb; ++nb) { for (bk = 0; bk < handle->bk; ++bk) { for (bn = 0; bn < handle->bn; ++bn) { LIBXSMM_VLA_ACCESS(4, real_dst, nb, kb, bn, bk, handle->kb, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(4, real_src, kb, nb, bk, bn, handle->nb, handle->bk, handle->bn); } } } } } else { for (kb = 0; kb < handle->kb; ++kb) { for (nb = 0; nb < handle->nb; ++nb) { for (bk = 0; bk < handle->bk; ++bk) { for (bn = 0; bn < handle->bn; ++bn) { job = (kb*handle->bk + bk)*handle->n + (nb*handle->bn + bn); ii = job / handle->k; jj = job % handle->k; jobT = jj*handle->n + ii; LIBXSMM_VLA_ACCESS(4, real_dst, (jobT/handle->k)/handle->bn, (jobT%handle->k)/handle->bk, (jobT/handle->k)%handle->bn, (jobT%handle->k)%handle->bk, handle->kb, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(4, real_src, kb, nb, bk, bn, handle->nb, handle->bk, handle->bn); } } } } } libxsmm-1.17/src/template/libxsmm_config.h000066400000000000000000000025131415223013700206670ustar00rootroot00000000000000#ifndef LIBXSMM_CONFIG_H #define LIBXSMM_CONFIG_H #if !defined(LIBXSMM_DEFAULT_CONFIG) && (defined(_WIN32) || (defined(LIBXSMM_SOURCE_H) && !defined(LIBXSMM_CONFIGURED))) # define LIBXSMM_DEFAULT_CONFIG #endif #if !defined(LIBXSMM_DEFAULT_CONFIG) && (!defined(LIBXSMM_SOURCE_H) || defined(LIBXSMM_CONFIGURED)) # include "libxsmm_version.h" $LIBXSMM_OFFLOAD_BUILD $MNK_PREPROCESSOR_LIST #else # define LIBXSMM_CONFIG_VERSION "" # define LIBXSMM_CONFIG_BRANCH "" # 
define LIBXSMM_CONFIG_VERSION_MAJOR INT_MAX # define LIBXSMM_CONFIG_VERSION_MINOR INT_MAX # define LIBXSMM_CONFIG_VERSION_UPDATE INT_MAX # define LIBXSMM_CONFIG_VERSION_PATCH INT_MAX # define LIBXSMM_CONFIG_BUILD_DATE INT_MAX #endif #define LIBXSMM_CONFIG_CACHELINE $CACHELINE #define LIBXSMM_CONFIG_ALIGNMENT $CACHELINE #define LIBXSMM_CONFIG_MALLOC $MALLOC #define LIBXSMM_CONFIG_ILP64 $ILP64 #define LIBXSMM_CONFIG_SYNC $SYNC #define LIBXSMM_CONFIG_JIT $JIT #define LIBXSMM_CONFIG_PREFETCH $PREFETCH #define LIBXSMM_CONFIG_MAX_MNK $MAX_MNK #define LIBXSMM_CONFIG_MAX_DIM $MAX_DIM #define LIBXSMM_CONFIG_AVG_DIM $AVG_DIM #define LIBXSMM_CONFIG_MAX_M $MAX_M #define LIBXSMM_CONFIG_MAX_N $MAX_N #define LIBXSMM_CONFIG_MAX_K $MAX_K #define LIBXSMM_CONFIG_FLAGS $FLAGS #define LIBXSMM_CONFIG_ALPHA $ALPHA #define LIBXSMM_CONFIG_BETA $BETA #define LIBXSMM_CONFIG_WRAP $WRAP #endif libxsmm-1.17/src/template/libxsmm_dnn_bf16_macros_define.tpl.c000066400000000000000000000056301415223013700244710ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI) # define LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH( A ) (__m256i)_mm512_cvtneps_pbh( A ) # define LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( A, B ) (__m512i)_mm512_cvtne2ps_pbh( A, B ) #else # define LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH( A ) LIBXSMM_INTRINSICS_MM512_CVT_FP32_BF16( A ) # define LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( A, B ) LIBXSMM_INTRINSICS_MM512_CVT2_FP32_BF16( A, B ) #endif #define LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16(in, out, length) do { \ unsigned int full_chunks = length / 32; \ unsigned int remainder = length % 32; \ int __i = 0; \ if (remainder == 0) { \ for ( __i = 0; __i < length; __i+= 32) { \ _mm512_storeu_si512((libxsmm_bfloat16*)out+__i, LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i+16), LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i))); \ } \ } else { \ unsigned int chunk; \ for ( chunk = 0; chunk < full_chunks; chunk++) { \ __i = chunk * 32; \ _mm512_storeu_si512((libxsmm_bfloat16*)out+__i, LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i+16), LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i))); \ } \ libxsmm_rne_convert_fp32_bf16((const float*)in+32*full_chunks, (element_output_type*)out+32*full_chunks, remainder); \ } \ } while(0) #define LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32(in, out, length) do { \ unsigned int full_chunks = length / 16; \ unsigned int remainder = length % 16; \ int __i = 0; \ if (remainder == 0) { \ for ( __i = 0; __i < length; __i+= 16) { \ _mm512_storeu_ps( (float*)out+__i, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS( _mm256_loadu_si256((__m256i*)((const libxsmm_bfloat16*)in+__i)))); \ } \ } else { \ unsigned int chunk; \ for ( chunk = 0; chunk < full_chunks; chunk++) { \ __i = chunk * 16; \ _mm512_storeu_ps( (float*)out+__i, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS( _mm256_loadu_si256((__m256i*)((const 
libxsmm_bfloat16*)in+__i)))); \ } \ libxsmm_convert_bf16_f32((const libxsmm_bfloat16*)in+16*full_chunks, (float*)out+16*full_chunks, remainder); \ } \ } while(0) libxsmm-1.17/src/template/libxsmm_dnn_bf16_macros_undefine.tpl.c000066400000000000000000000016741415223013700250400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke (Intel Corp.) ******************************************************************************/ #undef LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16 #undef LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32 #undef LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH #undef LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic.tpl.c000066400000000000000000000235451415223013700325560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Rajkishore Barik, Ankush Mandal, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ int imgifm1, img, ofm1, ifm1, oj, ij, oi, ii, kj, ki, ifm2, ofm2, ifm1ofm1; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = handle->desc.N * handle->blocksifm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks for transpose that could be run in parallel */ int transpose_work = handle->blocksifm * handle->blocksofm; /* compute chunk size */ const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? 
((ltid + 1) * transpose_chunksize) : transpose_work; /* offset pointer in case of physical padding */ element_output_type *const out = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; /* Weight and transpose_weight tensor declaration */ LIBXSMM_VLA_DECL(6, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); LIBXSMM_VLA_DECL(6, element_filter_type, tr_wt, (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset), handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); /* define weight pointer which has the correct format */ element_filter_type* weight_base = 0; /* padding via stack allocated buffers */ const int padded_w = handle->desc.W + (2 * handle->desc.pad_w); const int padded_h = handle->desc.H + (2 * handle->desc.pad_h); const int size_tls1 = padded_h * padded_w * handle->ifmblock; element_input_type *const del_input_scratch_padding = (element_input_type*)((char*)handle->scratch + handle->bwd_packing_padding_scratch_offset) + ltid * size_tls1; for ( ii = 0; ii < size_tls1; ++ii ) { del_input_scratch_padding[ii] = (element_input_type)0; } /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); /* transpose filters, if requested */ if ( (handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) > 0 ) { weight_base = (element_filter_type*)handle->reg_filter_tr->data; } else { for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { ofm1 = ifm1ofm1 / handle->blocksifm; ifm1 = ifm1ofm1 % handle->blocksifm; for (kj=0; kj < handle->desc.R; kj++) { for (ki=0; ki < handle->desc.S; ki++) { for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , 
handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); } } } } } weight_base = (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset); /* wait for transpose to finish */ libxsmm_barrier_wait(handle->barrier, ltid); } {/* open new scope for additional variable declarations (C89) */ LIBXSMM_VLA_DECL(5, element_input_type, del_input, (element_output_type*)handle->grad_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); LIBXSMM_VLA_DECL(3, element_input_type, del_input_padded, del_input_scratch_padding, padded_w, handle->ifmblock); LIBXSMM_VLA_DECL(5, const element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); LIBXSMM_VLA_DECL(6, const element_filter_type, weight, weight_base, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); for (imgifm1 = thr_begin; imgifm1 < thr_end; ++imgifm1) { img = imgifm1 / handle->blocksifm; ifm1 = imgifm1 % handle->blocksifm; /* check if we need padding, for now we do physical padding on the fly, however we can play with N parameter of the GEMM */ /* @TODO: add variant which deals with multiple GEMMS by varying N to deal with padding */ if ( (handle->desc.pad_h == handle->desc.pad_h_in) && (handle->desc.pad_w == handle->desc.pad_w_in) ) { /* reset result buffer to zero when intent is to overwrite when first block of input channels should be convoluted */ if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) ) { element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, ifm1, 0, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock)); LIBXSMM_PRAGMA_SIMD for (ij = 0; ij < handle->ifhp*handle->ifwp*handle->ifmblock; ij++) { temp_ptr[ij] = 
(element_input_type)0; } } /* run convolution */ for (ofm1 = 0; ofm1 < handle->blocksofm; ++ofm1) { for ( oj = 0; oj < handle->ofh; ++oj) { ij = oj * handle->desc.u; oi = 0; ii = 0; for (kj = 0; kj < handle->desc.R; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { gemm_kernel( &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij + kj, ii + ki, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) ); } } } } /* zero rim in case of physical padding.... this code is extremely stupid and crappy as it requires a complicated if... */ if (handle->desc.pad_h_in > 0 || handle->desc.pad_w_in > 0) { for ( ij = 0; ij < handle->ifhp; ij++ ) { for ( ii = 0; ii < handle->ifwp; ii++ ) { if ( (ij < handle->desc.pad_h_in) || (ij >= (handle->desc.H+handle->desc.pad_h_in)) || (ii < handle->desc.pad_w_in) || (ii >= (handle->desc.W+handle->desc.pad_w_in)) ) { for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; } } } } } } else { /* reset result buffer to zero when intent is to overwrite when first block of input channels should be convoluted */ if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) ) { LIBXSMM_PRAGMA_SIMD for (ij = 0; ij < size_tls1; ++ij) { del_input_scratch_padding[ij] = (element_output_type)0; } } else { for (ij = 0; ij < handle->desc.H; ij++) { for (ii = 0; ii < handle->desc.W; ii++) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + handle->desc.pad_h, ii + handle->desc.pad_w, ifm2, padded_w, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, del_input, 
img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); } } } } /* run convolution */ for (ofm1 = 0; ofm1 < handle->blocksofm; ++ofm1) { for ( oj = 0; oj < handle->ofh; ++oj) { ij = oj * handle->desc.u; oi = 0; ii = 0; for (kj = 0; kj < handle->desc.R; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { gemm_kernel( &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + kj, ii + ki, 0, padded_w, handle->ifmblock) ); } } } } /* input padding copy back */ for (ij = 0; ij < handle->desc.H; ij++) { for (ii = 0; ii < handle->desc.W; ii++) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + handle->desc.pad_h, ii + handle->desc.pad_w, ifm2, padded_w, handle->ifmblock); } } } } } /* end of imgifm1 loop */ } /* end of new scope for additional variable declarations (C89) */ libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic.tpl.c000066400000000000000000000530711415223013700307340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) 
******************************************************************************/ int img, ofm1, ofm2, ifm1, ifm2, oj, oi, kj, ki, oi_use, oj_use, ii_use, ij_use, ofmb, ifmb, ojb, myIfmId, nIfmBlocks, ind, task, ifm1ofm1; /* computing first logical thread */ const int ltid = tid - start_thread; int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); int threads_per_image = handle->desc.threads / handle->desc.N; int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); int my_ifm_start = 0; int my_ifm_end = handle->blocksifm; /* Batch reduce related variables */ const element_filter_type *A_ptrs[1024]; const element_input_type *B_ptrs[1024]; unsigned long long n_blocks; /* number of tasks for transpose that could be run in parallel */ int transpose_work = handle->blocksifm * handle->blocksofm * handle->desc.R * handle->desc.S; /* compute chunk size */ int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; /* offset output pointer in case of physical padding */ const int IFW = (handle->pack_input_bwd == 1) ? handle->ofw : handle->ifwp; const int IFH = (handle->pack_input_bwd == 1) ? handle->ofh : handle->ifhp; element_input_type *input_ptr = (handle->pack_input_bwd == 1) ? 
(element_input_type*)((char*)handle->scratch + handle->bwd_packing_padding_scratch_offset) : (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock; LIBXSMM_VLA_DECL(5, element_input_type, del_input, input_ptr, handle->blocksifm, IFH, IFW, handle->ifmblock); element_output_type *const out = (element_output_type*)handle->grad_output->data; LIBXSMM_VLA_DECL(5, const element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); /* Weight and transpose_weight tensor declaration */ LIBXSMM_VLA_DECL(6, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); LIBXSMM_VLA_DECL(6, element_filter_type, tr_wt, (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset), handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); /* define weight pointer which has the correct format */ element_filter_type* weight_base = ((handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) > 0 ) ? 
(element_filter_type*)handle->reg_filter_tr->data : (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset); LIBXSMM_VLA_DECL(6, const element_filter_type, weight, weight_base, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); /* transpose filters, if requested */ if ( (handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) == 0 ) { /* Special case of 64x64 transpose with JITed transpose */ if (handle->ifmblock == 64 && handle->ofmblock == 64) { libxsmm_xtransfunction tr_kernel = handle->tr_kernel; const unsigned int ld_in = 64; const unsigned int ld_out = 64; for (task = transpose_thr_begin; task < transpose_thr_end; ++task) { ifm1 = task/(handle->blocksofm * handle->desc.R * handle->desc.S); ofm1 = (task%(handle->blocksofm * handle->desc.R * handle->desc.S))/(handle->desc.R * handle->desc.S); kj = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))/handle->desc.S; ki = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))%handle->desc.S; tr_kernel(&LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &ld_in, &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &ld_out); tr_kernel(&LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 16, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &ld_in, &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 16, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &ld_out); tr_kernel(&LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 32, 0, handle->blocksifm, handle->desc.R, 
handle->desc.S, handle->ifmblock, handle->ofmblock), &ld_in, &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 32, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &ld_out); tr_kernel(&LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 48, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &ld_in, &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 48, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &ld_out); } } else { /* number of tasks for transpose that could be run in parallel */ transpose_work = handle->blocksifm * handle->blocksofm; /* compute chunk size */ transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? 
((ltid + 1) * transpose_chunksize) : transpose_work; for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { ofm1 = ifm1ofm1 / handle->blocksifm; ifm1 = ifm1ofm1 % handle->blocksifm; for (kj=0; kj < handle->desc.R; kj++) { for (ki=0; ki < handle->desc.S; ki++) { for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); } } } } } } /* wait for transpose to finish */ libxsmm_barrier_wait(handle->barrier, ltid); } if ( imgpt <= 1 ) { my_img_start = LIBXSMM_MIN(ltid / threads_per_image, handle->desc.N); my_img_end = LIBXSMM_MIN(my_img_start + 1, handle->desc.N); myIfmId = ltid % threads_per_image; nIfmBlocks = LIBXSMM_UPDIV(handle->blocksifm, threads_per_image); my_ifm_start = LIBXSMM_MIN(myIfmId * nIfmBlocks, handle->blocksifm); my_ifm_end = LIBXSMM_MIN((myIfmId+1) * nIfmBlocks, handle->blocksifm); } if ( handle->use_ifm_parallelization == 1 ) { int spread_out = 0; if ( handle->desc.N % 8 == 0) { spread_out = 8; } else if ( handle->desc.N % 4 == 0) { spread_out = 4; } else if (handle->desc.N % 3 == 0) { spread_out = 3; } else if (handle->desc.N % 2 == 0) { spread_out = 2; } else { spread_out = 1; } if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { int tile_id = ltid / spread_out; int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); int ifm_id = ltid % spread_out; imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads) * spread_out; my_img_start = LIBXSMM_MIN(tile_id * imgpt, handle->desc.N); my_img_end = LIBXSMM_MIN((tile_id+1) * imgpt, handle->desc.N); my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); my_ifm_end = LIBXSMM_MIN((ifm_id+1) 
* ifmpt, handle->blocksifm); } } if (handle->loop_order == 0) { /* (loop_order == N_Kb_Cb_Hb_k_c_h_w) {*/ if ( handle->avoid_fmas_in_rim == 1) { for (img = my_img_start; img < my_img_end; img++) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { temp_ptr[ifm2] = (element_input_type)0; } temp_ptr += handle->ifmblock; } } } for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { /* Prepare batch-reduce kernel arguments */ ij_use = oj; ii_use = oi; oj_use = oj - (1-handle->desc.pad_h_out); oi_use = oi - (1-handle->desc.pad_w_out); if (kj == 0 && oj == 0) { /* Do no FLOPS */ } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { /* Do no FLOPS */ } else if ( oi == 0 && ki == 0 ) { ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, 
ofm2, oj_use + kj, oi_use + ki + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); ind++; } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); } else if (oi == handle->ofw-handle->bwd_ofw_rb && ki == handle->desc.S-1) { ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); ind++; } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); } else { ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); ind++; } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); } } } } } } } } } } } } else { for (img = my_img_start; img < my_img_end; img++) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && 
handle->avoid_acc_load_bwd == 0 && ojb == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { temp_ptr[ifm2] = (element_input_type)0; } temp_ptr += handle->ifmblock; } } } for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { /* Prepare batch-reduce kernel arguments */ ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; ii_use = (handle->spread_input_bwd == 1) ? oi * handle->desc.v : oi; oi_use = oi; oj_use = oj; ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); ind++; } } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); } } } } } } } } } } if (handle->loop_order == 1) { /* (loop_order == N_Kb_Cb_Hb_k_c_h_w) { */ for (img = my_img_start; img < my_img_end; img++) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { for (oi = 0; oi < 
handle->ofw; oi += handle->bwd_ofw_rb) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0 && oj == 0 && oi == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { temp_ptr[ifm2] = (element_input_type)0; } temp_ptr += handle->ifmblock; } } } for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { /* Prepare batch-reduce kernel arguments */ ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; ii_use = (handle->spread_input_bwd == 1) ? 
oi * handle->desc.v : oi; oi_use = oi; oj_use = oj; ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); ind++; } } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); } } } } } } } } } if (handle->pack_input_bwd == 1) { LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (oj = 0; oj < handle->ifhp; oj++) { for (oi = 0; oi < handle->ifwp; oi++) { if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; } } else { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, oj/handle->desc.u, oi/handle->desc.v, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock); } } } } } } } else if (handle->spread_input_bwd == 1) { LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + 
((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (oj = 0; oj < handle->ifhp; oj++) { for (oi = 0; oi < handle->ifwp; oi++) { if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; } } } } } } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16.tpl.c000066400000000000000000000626301415223013700315530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) 
******************************************************************************/ int img, ofm1, ofm2, ifm1, ifm2, oj, ojj, oi, kj, ki, oi_use, oj_use, ii_use, ij_use, ofmb, ifmb, ojb, myIfmId, nIfmBlocks, ind, task; int last_ki, last_kj, next_kj; /* computing first logical thread */ const int ltid = tid - start_thread; int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); int threads_per_image = handle->desc.threads / handle->desc.N; int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); int my_ifm_start = 0; int my_ifm_end = handle->blocksifm; int ofmblock_lp = handle->ofmblock/handle->fm_lp_block; int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; int lpb = handle->fm_lp_block; /* Batch reduce related variables */ const element_filter_type *A_ptrs[1024]; const element_input_type *B_ptrs[1024]; unsigned long long n_blocks; /* number of tasks for transpose that could be run in parallel */ int transpose_work = handle->blocksifm * handle->blocksofm * handle->desc.R * handle->desc.S; /* compute chunk size */ int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; /* offset output pointer in case of physical padding */ const int IFW = (handle->pack_input_bwd == 1) ? handle->ofw : handle->ifwp; const int IFH = (handle->pack_input_bwd == 1) ? handle->ofh : handle->ifhp; const int ifwp_scratch = (handle->spread_input_bwd == 1) ? 
handle->desc.v * handle->bwd_ofw_rb : handle->bwd_ofw_rb; /* Auxiliary fp32 accumulators */ float *del_inp_ptr; float *del_inp_fp32 = (float*)((char*)handle->scratch + handle->bwd_lp_input_full_scratch_offset) + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock; LIBXSMM_VLA_DECL(5, float, del_input_fp32, del_inp_fp32, handle->blocksifm, IFH, IFW, handle->ifmblock); element_input_type *input_ptr = (handle->pack_input_bwd == 1) ? (element_input_type*)((char*)handle->scratch + handle->bwd_packing_padding_scratch_offset) : (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock; LIBXSMM_VLA_DECL(5, element_input_type, del_input, input_ptr, handle->blocksifm, IFH, IFW, handle->ifmblock); element_output_type *const out = (element_output_type*)handle->grad_output->data; LIBXSMM_VLA_DECL(5, const element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); /* Weight and transpose_weight tensor declaration */ LIBXSMM_VLA_DECL(7, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb); LIBXSMM_VLA_DECL(7, element_filter_type, tr_wt, (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset), handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); /* define weight pointer which has the correct format */ element_filter_type* weight_base = ((handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) > 0 ) ? 
(element_filter_type*)handle->reg_filter_tr->data : (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset); LIBXSMM_VLA_DECL(7, const element_filter_type, weight, weight_base, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); /* transpose filters, if requested */ if ( (handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) == 0 ) { for (task = transpose_thr_begin; task < transpose_thr_end; ++task) { ifm1 = task/(handle->blocksofm * handle->desc.R * handle->desc.S); ofm1 = (task%(handle->blocksofm * handle->desc.R * handle->desc.S))/(handle->desc.R * handle->desc.S); kj = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))/handle->desc.S; ki = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))%handle->desc.S; for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { LIBXSMM_VLA_ACCESS(7, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2/lpb, ifm2, ofm2%lpb, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb) = LIBXSMM_VLA_ACCESS(7, wt, ofm1, ifm1, kj, ki, ifm2/lpb, ofm2, ifm2%lpb, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb); } } } /* wait for transpose to finish */ libxsmm_barrier_wait(handle->barrier, ltid); } if ( imgpt <= 1 ) { my_img_start = LIBXSMM_MIN(ltid / threads_per_image, handle->desc.N); my_img_end = LIBXSMM_MIN(my_img_start + 1, handle->desc.N); myIfmId = ltid % threads_per_image; nIfmBlocks = LIBXSMM_UPDIV(handle->blocksifm, threads_per_image); my_ifm_start = LIBXSMM_MIN(myIfmId * nIfmBlocks, handle->blocksifm); my_ifm_end = LIBXSMM_MIN((myIfmId+1) * nIfmBlocks, handle->blocksifm); } if ( handle->use_ifm_parallelization == 1 ) { int spread_out = 0; if ( handle->desc.N % 8 == 0) 
{ spread_out = 8; } else if ( handle->desc.N % 4 == 0) { spread_out = 4; } else if (handle->desc.N % 3 == 0) { spread_out = 3; } else if (handle->desc.N % 2 == 0) { spread_out = 2; } else { spread_out = 1; } if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { int tile_id = ltid / spread_out; int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); int ifm_id = ltid % spread_out; imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads) * spread_out; my_img_start = LIBXSMM_MIN(tile_id * imgpt, handle->desc.N); my_img_end = LIBXSMM_MIN((tile_id+1) * imgpt, handle->desc.N); my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); } } if (handle->loop_order == 0) { /* (loop_order == N_Kb_Cb_Hb_k_c_h_w) {*/ if ( handle->avoid_fmas_in_rim == 1) { for (img = my_img_start; img < my_img_end; img++) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { float *temp_ptr = (float*)&LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { temp_ptr[ifm2] = (float)0; } temp_ptr += handle->ifmblock; } } } for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { for 
(kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { /* Prepare batch-reduce kernel arguments */ ij_use = oj; ii_use = oi; oj_use = oj - (1-handle->desc.pad_h_out); oi_use = oi - (1-handle->desc.pad_w_out); last_kj = handle->desc.R-1; last_ki = handle->desc.S-1; next_kj = kj+1; if (kj == 0 && oj == 0) { /* Do no FLOPS */ } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { /* Do no FLOPS */ } else if ( oi == 0 && ki == 0 ) { ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); ind++; } n_blocks = ind; if (handle->avoid_acc_load_bwd == 1) { br_gemm_kernel2_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); } else { del_inp_ptr = &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); br_gemm_kernel2(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); if (ofm2 == handle->blocksofm && ((kj == last_kj && ki == last_ki) || (next_kj == 0 && next_kj == last_kj && oj == 0) || (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), handle->bwd_ofw_rb * handle->ifmblock); } } } } else if (oi == handle->ofw-handle->bwd_ofw_rb && ki == handle->desc.S-1) { ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + 
handle->blocksofm_blocking; ofm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); ind++; } n_blocks = ind; if (handle->avoid_acc_load_bwd == 1) { br_gemm_kernel2_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); } else { del_inp_ptr = &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); br_gemm_kernel2(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); if (ofm2 == handle->blocksofm && ((kj == last_kj && ki == last_ki) || (next_kj == 0 && next_kj == last_kj && oj == 0) || (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), handle->bwd_ofw_rb * handle->ifmblock); } } } } else { ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); ind++; } n_blocks = ind; if (handle->avoid_acc_load_bwd == 1) { br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); } else { del_inp_ptr = 
&LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); br_gemm_kernel(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); if (ofm2 == handle->blocksofm && ((kj == last_kj && ki == last_ki) || (next_kj == 0 && next_kj == last_kj && oj == 0) || (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), handle->bwd_ofw_rb * handle->ifmblock); } } } } } } } } } } } } } } } else { for (img = my_img_start; img < my_img_end; img++) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { float *temp_ptr = (float*)&LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { temp_ptr[ifm2] = (float)0; } temp_ptr += handle->ifmblock; } } } for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { /* Prepare batch-reduce kernel arguments */ ij_use = 
(handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; ii_use = (handle->spread_input_bwd == 1) ? oi * handle->desc.v : oi; oi_use = oi; oj_use = oj; ind = 0; kj = 0; ki = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); ind++; } } } n_blocks = ind; if (handle->avoid_acc_load_bwd == 1) { br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); } else { del_inp_ptr = &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); br_gemm_kernel(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); if (ofm2 == handle->blocksofm && kj == handle->desc.R && ki == handle->desc.S) { for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), ifwp_scratch * handle->ifmblock); } } } } } } } } } } } } } if (handle->loop_order == 1) { /* (loop_order == N_Kb_Cb_Hb_k_c_h_w) { */ for (img = my_img_start; img < my_img_end; img++) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { for (ifm1 = ifmb; ifm1 < 
LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0 && oj == 0 && oi == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { float *temp_ptr = (float*)&LIBXSMM_VLA_ACCESS( 5, del_input_fp32, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { temp_ptr[ifm2] = (float)0; } temp_ptr += handle->ifmblock; } } } for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { /* Prepare batch-reduce kernel arguments */ ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; ii_use = (handle->spread_input_bwd == 1) ? oi * handle->desc.v : oi; oi_use = oi; oj_use = oj; ind = 0; kj = 0; ki = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); ind++; } } } n_blocks = ind; if (handle->avoid_acc_load_bwd == 1) { br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); } else { del_inp_ptr = &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); br_gemm_kernel(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); if (ofm2 == handle->blocksofm && kj == handle->desc.R && ki == 
handle->desc.S) { for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), ifwp_scratch * handle->ifmblock); } } } } } } } } } } } } if (handle->pack_input_bwd == 1) { LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (oj = 0; oj < handle->ifhp; oj++) { for (oi = 0; oi < handle->ifwp; oi++) { if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; } } else { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, oj/handle->desc.u, oi/handle->desc.v, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock); } } } } } } } else if (handle->spread_input_bwd == 1) { LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (oj = 0; oj < handle->ifhp; oj++) { for (oi = 0; oi < 
handle->ifwp; oi++) { if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; } } } } } } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c000066400000000000000000000253501415223013700331370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Rajkishore Barik, Ankush Mandal, Alexander Heinecke (Intel Corp.) ******************************************************************************/ int imgifm1, img, ofm1, ifm1, oj, ij, oi, ii, kj, ki, ifm2, ofm2, ifm1ofm1; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = handle->desc.N * handle->blocksifm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks for transpose that could be run in parallel */ int transpose_work = handle->blocksifm * handle->blocksofm; /* compute chunk size */ const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? 
(transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; /* offset pointer in case of physical padding */ element_output_type *const out = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->blocksofm * handle->ofmblock; /* Weight and transpose_weight tensor declaration */ #if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM) LIBXSMM_VLA_DECL(6, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK) LIBXSMM_VLA_DECL(6, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif LIBXSMM_VLA_DECL(6, element_filter_type, tr_wt, (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset), handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); /* define weight pointer which has the correct format */ element_filter_type* weight_base = 0; /* padding via stack allocated buffers */ const int padded_w = handle->desc.W + (2 * handle->desc.pad_w); const int padded_h = handle->desc.H + (2 * handle->desc.pad_h); const int size_tls1 = padded_h * padded_w * handle->ifmblock; element_input_type *const del_input_scratch_padding = (element_input_type*)((char*)handle->scratch + handle->bwd_packing_padding_scratch_offset) + ltid * size_tls1; for ( ii = 0; ii < size_tls1; ++ii ) { del_input_scratch_padding[ii] = (element_input_type)0; } 
/* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); /* transpose filters, if requested */ if ( (handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) > 0 ) { weight_base = (element_filter_type*)handle->reg_filter_tr->data; } else { for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { ofm1 = ifm1ofm1 / handle->blocksifm; ifm1 = ifm1ofm1 % handle->blocksifm; for (kj=0; kj < handle->desc.R; kj++) { for (ki=0; ki < handle->desc.S; ki++) { for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { #if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM) LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK) LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = LIBXSMM_VLA_ACCESS(6, wt, kj, ki, ifm1, ifm2, ofm1, ofm2, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif } } } } } weight_base = (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset); /* wait for transpose to finish */ libxsmm_barrier_wait(handle->barrier, ltid); } {/* open new scope for additional variable declarations (C89) */ LIBXSMM_VLA_DECL(5, element_input_type, del_input, (element_output_type*)handle->grad_input->data, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); LIBXSMM_VLA_DECL(3, element_input_type, del_input_padded, del_input_scratch_padding, padded_w, handle->ifmblock); LIBXSMM_VLA_DECL(5, const element_output_type, output, out, 
handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); LIBXSMM_VLA_DECL(6, const element_filter_type, weight, weight_base, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); for (imgifm1 = thr_begin; imgifm1 < thr_end; ++imgifm1) { img = imgifm1 / handle->blocksifm; ifm1 = imgifm1 % handle->blocksifm; /* check if we need padding, for now we do physical padding on the fly, however we can play with N parameter of the GEMM */ /* @TODO: add variant which deals with multiple GEMMS by varying N to deal with padding */ if ( (handle->desc.pad_h == handle->desc.pad_h_in) && (handle->desc.pad_w == handle->desc.pad_w_in) ) { /* reset result buffer to zero when intent is to overwrite when first block of input channels should be convoluted */ if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) ) { element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, 0, 0, ifm1, 0, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock)); LIBXSMM_PRAGMA_SIMD for (ij = 0; ij < handle->ifhp*handle->ifwp; ij++) { for (ii = 0; ii < handle->ifmblock; ii++) { temp_ptr[ii] = (element_input_type)0; } temp_ptr += handle->blocksifm * handle->ifmblock; } } /* run convolution */ for (ofm1 = 0; ofm1 < handle->blocksofm; ++ofm1) { for ( oj = 0; oj < handle->ofh; ++oj) { ij = oj * handle->desc.u; oi = 0; ii = 0; for (kj = 0; kj < handle->desc.R; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { gemm_kernel( &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &LIBXSMM_VLA_ACCESS(5, del_input, img, ij + kj, ii + ki, ifm1, 0, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) ); } } } } /* zero rim in case of physical padding.... 
this code is extremely stupid and crappy as it requires a complicated if... */ if (handle->desc.pad_h_in > 0 || handle->desc.pad_w_in > 0) { for ( ij = 0; ij < handle->ifhp; ij++ ) { for ( ii = 0; ii < handle->ifwp; ii++ ) { if ( (ij < handle->desc.pad_h_in) || (ij >= (handle->desc.H+handle->desc.pad_h_in)) || (ii < handle->desc.pad_w_in) || (ii >= (handle->desc.W+handle->desc.pad_w_in)) ) { for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { LIBXSMM_VLA_ACCESS(5, del_input, img, ij, ii, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) = (element_input_type)0; } } } } } } else { /* reset result buffer to zero when intent is to overwrite when first block of input channels should be convoluted */ if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) ) { LIBXSMM_PRAGMA_SIMD for (ij = 0; ij < size_tls1; ++ij) { del_input_scratch_padding[ij] = (element_output_type)0; } } else { for (ij = 0; ij < handle->desc.H; ij++) { for (ii = 0; ii < handle->desc.W; ii++) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + handle->desc.pad_h, ii + handle->desc.pad_w, ifm2, padded_w, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, del_input, img, ij, ii, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); } } } } /* run convolution */ for (ofm1 = 0; ofm1 < handle->blocksofm; ++ofm1) { for ( oj = 0; oj < handle->ofh; ++oj) { ij = oj * handle->desc.u; oi = 0; ii = 0; for (kj = 0; kj < handle->desc.R; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { gemm_kernel( &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + kj, ii + ki, 0, padded_w, handle->ifmblock) ); } } } } /* input 
padding copy back */ for (ij = 0; ij < handle->desc.H; ij++) { for (ii = 0; ii < handle->desc.W; ii++) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, del_input, img, ij, ii, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) = LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + handle->desc.pad_h, ii + handle->desc.pad_w, ifm2, padded_w, handle->ifmblock); } } } } } /* end of imgifm1 loop */ } /* end of new scope for additional variable declarations (C89) */ libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c000066400000000000000000000547711415223013700313310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) 
******************************************************************************/ int img, ofm1, ofm2, ifm1, ifm2, oj, oi, kj, ki, oi_use, oj_use, ii_use, ij_use, ofmb, ifmb, ojb, myIfmId, nIfmBlocks, ind, /*task,*/ ifm1ofm1; /* computing first logical thread */ const int ltid = tid - start_thread; int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); int threads_per_image = handle->desc.threads / handle->desc.N; int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); int my_ifm_start = 0; int my_ifm_end = handle->blocksifm; /* Batch reduce related variables */ const element_filter_type *A_ptrs[1024]; const element_input_type *B_ptrs[1024]; unsigned long long n_blocks; /* number of tasks for transpose that could be run in parallel */ int transpose_work = handle->blocksifm * handle->blocksofm * handle->desc.R * handle->desc.S; /* compute chunk size */ int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; /* offset output pointer in case of physical padding */ const int IFW = (handle->pack_input_bwd == 1) ? handle->ofw : handle->ifwp; const int IFH = (handle->pack_input_bwd == 1) ? handle->ofh : handle->ifhp; element_input_type *input_ptr = (handle->pack_input_bwd == 1) ? 
(element_input_type*)((char*)handle->scratch + handle->bwd_packing_padding_scratch_offset) : (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->blocksifm * handle->ifmblock; LIBXSMM_VLA_DECL(5, element_input_type, del_input, input_ptr, IFH, IFW, handle->blocksifm, handle->ifmblock); element_output_type *const out = (element_output_type*)handle->grad_output->data; LIBXSMM_VLA_DECL(5, const element_output_type, output, out, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); /* Weight and transpose_weight tensor declaration */ #if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM) LIBXSMM_VLA_DECL(6, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK) LIBXSMM_VLA_DECL(6, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif LIBXSMM_VLA_DECL(6, element_filter_type, tr_wt, (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset), handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); /* define weight pointer which has the correct format */ element_filter_type* weight_base = ((handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) > 0 ) ? 
(element_filter_type*)handle->reg_filter_tr->data : (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset); LIBXSMM_VLA_DECL(6, const element_filter_type, weight, weight_base, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); /* transpose filters, if requested */ if ( (handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) == 0 ) { /* Special case of 64x64 transpose with JITed transpose */ #if 0 if (handle->ifmblock == 64 && handle->ofmblock == 64) { libxsmm_xtransfunction tr_kernel = handle->tr_kernel; const unsigned int ld_in = 64; const unsigned int ld_out = 64; for (task = transpose_thr_begin; task < transpose_thr_end; ++task) { ifm1 = task/(handle->blocksofm * handle->desc.R * handle->desc.S); ofm1 = (task%(handle->blocksofm * handle->desc.R * handle->desc.S))/(handle->desc.R * handle->desc.S); kj = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))/handle->desc.S; ki = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))%handle->desc.S; tr_kernel(&LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &ld_in, &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &ld_out); tr_kernel(&LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 16, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &ld_in, &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 16, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &ld_out); tr_kernel(&LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 32, 0, handle->blocksifm, handle->desc.R, 
handle->desc.S, handle->ifmblock, handle->ofmblock), &ld_in, &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 32, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &ld_out); tr_kernel(&LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 48, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &ld_in, &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 48, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &ld_out); } } else { #endif /* number of tasks for transpose that could be run in parallel */ transpose_work = handle->blocksifm * handle->blocksofm; /* compute chunk size */ transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? 
((ltid + 1) * transpose_chunksize) : transpose_work; for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { ofm1 = ifm1ofm1 / handle->blocksifm; ifm1 = ifm1ofm1 % handle->blocksifm; for (kj=0; kj < handle->desc.R; kj++) { for (ki=0; ki < handle->desc.S; ki++) { for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { #if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM) LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK) LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = LIBXSMM_VLA_ACCESS(6, wt, kj, ki, ifm1, ifm2, ofm1, ofm2, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif } } } } } #if 0 } #endif /* wait for transpose to finish */ libxsmm_barrier_wait(handle->barrier, ltid); } if ( imgpt <= 1 ) { my_img_start = LIBXSMM_MIN(ltid / threads_per_image, handle->desc.N); my_img_end = LIBXSMM_MIN(my_img_start + 1, handle->desc.N); myIfmId = ltid % threads_per_image; nIfmBlocks = LIBXSMM_UPDIV(handle->blocksifm, threads_per_image); my_ifm_start = LIBXSMM_MIN(myIfmId * nIfmBlocks, handle->blocksifm); my_ifm_end = LIBXSMM_MIN((myIfmId+1) * nIfmBlocks, handle->blocksifm); } if ( handle->use_ifm_parallelization == 1 ) { int spread_out = 0; if ( handle->desc.N % 8 == 0) { spread_out = 8; } else if ( handle->desc.N % 4 == 0) { spread_out = 4; } else if (handle->desc.N % 3 == 0) { spread_out = 3; } else if (handle->desc.N % 2 == 0) { spread_out = 2; } else { spread_out = 1; } if 
((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { int tile_id = ltid / spread_out; int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); int ifm_id = ltid % spread_out; imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads) * spread_out; my_img_start = LIBXSMM_MIN(tile_id * imgpt, handle->desc.N); my_img_end = LIBXSMM_MIN((tile_id+1) * imgpt, handle->desc.N); my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); } } if (handle->loop_order == 0) { /* (loop_order == N_Kb_Cb_Hb_k_c_h_w) {*/ if ( handle->avoid_fmas_in_rim == 1) { for (img = my_img_start; img < my_img_end; img++) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, oj, 0, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { temp_ptr[ifm2] = (element_input_type)0; } temp_ptr += handle->blocksifm * handle->ifmblock; } } } for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { /* Prepare batch-reduce kernel arguments */ ij_use = oj; ii_use = oi; oj_use = oj - 
(1-handle->desc.pad_h_out); oi_use = oi - (1-handle->desc.pad_w_out); if (kj == 0 && oj == 0) { /* Do no FLOPS */ } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { /* Do no FLOPS */ } else if ( oi == 0 && ki == 0 ) { ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki + 1, ofm2, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); ind++; } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use + 1, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), &n_blocks); } else if (oi == handle->ofw-handle->bwd_ofw_rb && ki == handle->desc.S-1) { ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki, ofm2, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); ind++; } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), &n_blocks); } else { ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki, ofm2, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); ind++; } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use, ifm1, 0, IFH, IFW, handle->blocksifm, 
handle->ifmblock), &n_blocks); } } } } } } } } } } } } else { for (img = my_img_start; img < my_img_end; img++) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, oj, 0, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { temp_ptr[ifm2] = (element_input_type)0; } temp_ptr += handle->blocksifm * handle->ifmblock; } } } for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { /* Prepare batch-reduce kernel arguments */ ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; ii_use = (handle->spread_input_bwd == 1) ? 
oi * handle->desc.v : oi; oi_use = oi; oj_use = oj; ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki, ofm2, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); ind++; } } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), &n_blocks); } } } } } } } } } } if (handle->loop_order == 1) { /* (loop_order == N_Kb_Cb_Hb_k_c_h_w) { */ for (img = my_img_start; img < my_img_end; img++) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0 && oj == 0 && oi == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, oj, 0, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { temp_ptr[ifm2] = (element_input_type)0; } temp_ptr += handle->blocksifm * handle->ifmblock; } } } for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += 
handle->blocksofm_blocking) { /* Prepare batch-reduce kernel arguments */ ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; ii_use = (handle->spread_input_bwd == 1) ? oi * handle->desc.v : oi; oi_use = oi; oj_use = oj; ind = 0; for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki, ofm2, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); ind++; } } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), &n_blocks); } } } } } } } } } if (handle->pack_input_bwd == 1) { LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->blocksifm * handle->ifmblock, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (oj = 0; oj < handle->ifhp; oj++) { for (oi = 0; oi < handle->ifwp; oi++) { if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, del_input_full, img, oj, oi, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) = (element_input_type)0; } } else { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, del_input_full, img, oj, oi, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, del_input, img, oj/handle->desc.u, oi/handle->desc.v, ifm1, ifm2, 
IFH, IFW, handle->blocksifm,handle->ifmblock); } } } } } } } else if (handle->spread_input_bwd == 1) { LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->blocksifm * handle->ifmblock, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (oj = 0; oj < handle->ifhp; oj++) { for (oi = 0; oi < handle->ifwp; oi++) { if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, del_input_full, img, oj, oi, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) = (element_input_type)0; } } } } } } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic.tpl.c000066400000000000000000000643241415223013700307430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) 
******************************************************************************/ int img, ofm1, ofm2 = 0, ifm1, ifm2 = 0, oj, oi, kj, ki, oi_use, oj_use, ii_use, ij_use, ofmb, ifmb, ojb, myOfmId, nOfmBlocks, ind, ofm11, ki1, kj1, ojj, oii, ii, ij, spread_out = 1; /* computing first logical thread */ const int ltid = tid - start_thread; int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); int threads_per_image = handle->desc.threads / handle->desc.N; int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); int my_ofm_start = 0; int my_ofm_end = handle->blocksofm; /* Batch reduce related variables */ const element_filter_type *A_ptrs[1024]; const element_input_type *B_ptrs[1024]; unsigned long long n_blocks; /* offset output pointer in case of physical output padding */ element_output_type* out = (element_output_type*)handle->reg_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; LIBXSMM_VLA_DECL(5, element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); element_input_type *input_ptr = ( (handle->pack_input == 1) || (handle->fwd_padding_copy == 1) ) ? (element_input_type*)((char*)handle->scratch + handle->fwd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; const int IFW = (handle->fwd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : ( (handle->pack_input == 1) ? handle->ofwp : handle->ifwp ); const int IFH = (handle->fwd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : ( (handle->pack_input == 1) ? 
handle->ofhp : handle->ifhp ); LIBXSMM_VLA_DECL(5, element_input_type, input, input_ptr, handle->blocksifm, IFH, IFW, handle->ifmblock); LIBXSMM_VLA_DECL(6, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); if ( imgpt <= 1 ) { my_img_start = LIBXSMM_MIN(ltid / threads_per_image, handle->desc.N); my_img_end = LIBXSMM_MIN(my_img_start + 1, handle->desc.N); myOfmId = ltid % threads_per_image; nOfmBlocks = LIBXSMM_UPDIV(handle->blocksofm, threads_per_image); my_ofm_start = LIBXSMM_MIN(myOfmId * nOfmBlocks, handle->blocksofm); my_ofm_end = LIBXSMM_MIN((myOfmId+1) * nOfmBlocks, handle->blocksofm); } if ( handle->use_ofm_parallelization == 1 ) { if ( handle->desc.N % 8 == 0) { spread_out = 8; } else if ( handle->desc.N % 4 == 0) { spread_out = 4; } else if (handle->desc.N % 2 == 0) { spread_out = 2; } else if (handle->desc.N % 3 == 0) { spread_out = 3; } else { spread_out = 1; } if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { int tile_id = ltid / spread_out; int ofmpt = LIBXSMM_UPDIV(handle->blocksofm, spread_out); int ofm_id = ltid % spread_out; imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads) * spread_out; my_img_start = LIBXSMM_MIN(tile_id * imgpt, handle->desc.N); my_img_end = LIBXSMM_MIN((tile_id+1) * imgpt, handle->desc.N); my_ofm_start = LIBXSMM_MIN(ofm_id * ofmpt, handle->blocksofm); my_ofm_end = LIBXSMM_MIN((ofm_id+1) * ofmpt, handle->blocksofm); } } /* remove stride from input */ if (handle->pack_input == 1) { int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); int ifm_id = ltid % spread_out; int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, 
handle->ifhp, handle->ifwp, handle->ifmblock); for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (oj = 0; oj < handle->ofh; oj++) { for (oi = 0; oi < handle->ofw; oi++) { ij_use = oj * handle->desc.u; ii_use = oi * handle->desc.v; LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj, oi, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij_use, ii_use, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); } } } } } if ( handle->use_ofm_parallelization == 1 || handle->desc.N % handle->desc.threads != 0) { libxsmm_barrier_wait(handle->barrier, ltid); } } /* physical pad input */ if (handle->fwd_padding_copy == 1) { int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); int ifm_id = ltid % spread_out; int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { /* copy the inner part */ for (ij = 0; ij < handle->ifhp+(2*handle->desc.pad_h); ij++) { for (ii = 0; ii < handle->ifwp+(2*handle->desc.pad_w); ii++) { if ( (ij >= handle->desc.pad_h) && (ii >= handle->desc.pad_w) && (ij < handle->ifhp+handle->desc.pad_h) && (ii < handle->ifwp+handle->desc.pad_w) ) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij-handle->desc.pad_h, ii-handle->desc.pad_w, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); } } else { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < 
handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = (element_input_type)0; } } } } } } if ( handle->use_ofm_parallelization == 1 || handle->desc.N % handle->desc.threads != 0 ) { libxsmm_barrier_wait(handle->barrier, ltid); } } if (handle->use_fallback_fwd_loops == 1) { /* number of tasks that could be run in parallel */ const int work = handle->desc.N * handle->blocksofm * handle->ofh; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; int imgofm1ofh; if ( handle->avoid_fmas_in_rim == 1) { for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { img = imgofm1ofh / (handle->blocksofm*handle->ofh); #if 1 ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->ofh; oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; #else oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->blocksofm; ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->blocksofm; #endif for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { /* set output feature map to zero */ element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->ofmblock; } } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { for (oi = 0; oi < 
handle->ofw; oi += handle->fwd_ofw_rb) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u - (1-handle->desc.pad_h_in); ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); } oi_use = oi; oj_use = oj; if (kj == 0 && oj == 0) { /* Do no FLOPS */ } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { /* Do no FLOPS */ } else if ( oi == 0 && ki == 0 ) { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, 
handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } } } } } } } } else { for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { img = imgofm1ofh / (handle->blocksofm*handle->ofh); #if 1 ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->ofh; oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; #else oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->blocksofm; ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->blocksofm; #endif for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { /* set output feature map to zero */ element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->ofmblock; } } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u; ii_use = oi * handle->desc.v; } oi_use = oi; oj_use = oj; ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, 
handle->desc.S, handle->ifmblock, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } } } } } } else { if (handle->loop_order == 0) { if ( handle->avoid_fmas_in_rim == 1) { for (img = my_img_start; img < my_img_end; img++) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { ofm1 = (handle->shuffle_filter_accesses == 1) ? (ofm11+ltid)%handle->blocksofm : ofm11; if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->ofmblock; } } } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { for (kj1 = 0; kj1 < handle->desc.R; kj1++) { for (ki1 = 0; ki1 < handle->desc.S; ki1++) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u - 
(1-handle->desc.pad_h_in); ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); } oi_use = oi; oj_use = oj; ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; kj = (handle->shuffle_filter_accesses == 1) ? (kj1+ltid)%handle->desc.R : kj1; if (kj == 0 && oj == 0) { /* Do no FLOPS */ } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { /* Do no FLOPS */ } else if ( oi == 0 && ki == 0 ) { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, 
IFH, IFW, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } } } } } } } } } } } } else { for (img = my_img_start; img < my_img_end; img++) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { ofm1 = (handle->shuffle_filter_accesses == 1) ? (ofm11+ltid)%handle->blocksofm : ofm11; if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->ofmblock; } } } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u; ii_use = oi * handle->desc.v; } oi_use = oi; oj_use = oj; ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { for (kj1 = 0; kj1 < handle->desc.R; kj1++) { for (ki1 = 0; ki1 < handle->desc.S; ki1++) { ki = (handle->shuffle_filter_accesses == 1) ? 
(ki1+ltid)%handle->desc.S : ki1; kj = (handle->shuffle_filter_accesses == 1) ? (kj1+ltid)%handle->desc.R : kj1; A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } } } } } } } } } } if (handle->loop_order == 1) { for (img = my_img_start; img < my_img_end; img++) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm1++ ) { if (((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && oj == 0 && oi == 0) { /* set output feature map to zero */ for (ojj = 0; ojj < handle->ofh; ++ojj) { element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, ojj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oii = 0; oii < handle->ofw; ++oii) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->ofmblock; } } } for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u; ii_use = oi * 
handle->desc.v; } oi_use = oi; oj_use = oj; ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } } } } } } } } } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16.tpl.c000066400000000000000000001064211415223013700315540ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) 
******************************************************************************/ int img, ofm1, ofm2 = 0, ifm1, ifm2 = 0, oj, oi, kj, ki, oi_use, oj_use, ii_use, ij_use, ofmb, ifmb, ojb, myOfmId, nOfmBlocks, ind, ofm11, ki1, kj1, ojj, oii, spread_out = 1; int last_ki, last_kj, next_kj; /* computing first logical thread */ const int ltid = tid - start_thread; int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); int threads_per_image = handle->desc.threads / handle->desc.N; int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); int my_ofm_start = 0; int my_ofm_end = handle->blocksofm; int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; /* Batch reduce related variables */ const element_filter_type *A_ptrs[1024]; const element_input_type *B_ptrs[1024]; unsigned long long n_blocks; /* JITed eltwise function */ libxsmm_meltwfunction_cvtfp32bf16 cvt_kernel = handle->fwd_cvtfp32bf16_kernel; libxsmm_meltw_cvtfp32bf16_param cvt_params; /* offset output pointer in case of physical output padding */ element_output_type* out = (element_output_type*)handle->reg_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; float* out_fp32 = (float*)((char*)handle->scratch + handle->fwd_lp_output_full_scratch_offset) + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; float* out_scratch = (float*)((char*)handle->scratch + handle->fwd_lp_output_block_scratch_offset) + ((size_t) ltid * handle->fwd_ofw_rb * handle->fwd_ofh_rb * handle->ofmblock); float* out_ptr; LIBXSMM_VLA_DECL(5, element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); LIBXSMM_VLA_DECL(5, float, output_fp32, out_fp32, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); LIBXSMM_VLA_DECL(3, float, scratch_fp32, out_scratch, handle->fwd_ofw_rb, handle->ofmblock); element_input_type 
*input_ptr = (handle->pack_input == 1) ?(element_input_type*)((char*)handle->scratch + handle->fwd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; const int IFW = (handle->pack_input == 1) ? handle->ofwp : handle->ifwp; const int IFH = (handle->pack_input == 1) ? handle->ofhp : handle->ifhp; LIBXSMM_VLA_DECL(5, element_input_type, input, input_ptr, handle->blocksifm, IFH, IFW, handle->ifmblock); LIBXSMM_VLA_DECL(7, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); libxsmm_barrier_init(handle->barrier, ltid); if ( imgpt <= 1 ) { my_img_start = LIBXSMM_MIN(ltid / threads_per_image, handle->desc.N); my_img_end = LIBXSMM_MIN(my_img_start + 1, handle->desc.N); myOfmId = ltid % threads_per_image; nOfmBlocks = LIBXSMM_UPDIV(handle->blocksofm, threads_per_image); my_ofm_start = LIBXSMM_MIN(myOfmId * nOfmBlocks, handle->blocksofm); my_ofm_end = LIBXSMM_MIN((myOfmId+1) * nOfmBlocks, handle->blocksofm); } if ( handle->use_ofm_parallelization == 1 ) { if ( handle->desc.N % 8 == 0) { spread_out = 8; } else if ( handle->desc.N % 4 == 0) { spread_out = 4; } else if (handle->desc.N % 2 == 0) { spread_out = 2; } else if (handle->desc.N % 3 == 0) { spread_out = 3; } else { spread_out = 1; } if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { int tile_id = ltid / spread_out; int ofmpt = LIBXSMM_UPDIV(handle->blocksofm, spread_out); int ofm_id = ltid % spread_out; imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads) * spread_out; my_img_start = LIBXSMM_MIN(tile_id * imgpt, handle->desc.N); my_img_end = LIBXSMM_MIN((tile_id+1) * imgpt, handle->desc.N); my_ofm_start = LIBXSMM_MIN(ofm_id * ofmpt, handle->blocksofm); my_ofm_end = LIBXSMM_MIN((ofm_id+1) * ofmpt, handle->blocksofm); } } if (handle->pack_input == 1) { int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); int ifm_id = ltid % 
spread_out; int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (oj = 0; oj < handle->ofh; oj++) { for (oi = 0; oi < handle->ofw; oi++) { ij_use = oj * handle->desc.u; ii_use = oi * handle->desc.v; LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj, oi, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij_use, ii_use, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); } } } } } if ( handle->use_ofm_parallelization == 1 ) { libxsmm_barrier_wait(handle->barrier, ltid); } } if (handle->use_fallback_fwd_loops == 1) { /* number of tasks that could be run in parallel */ const int work = handle->desc.N * handle->blocksofm * handle->ofh; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; int imgofm1ofh; if ( handle->avoid_fmas_in_rim == 1) { for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { img = imgofm1ofh / (handle->blocksofm*handle->ofh); ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->ofh; oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { /* set output feature map to zero */ float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (float)0; } temp_ptr += handle->ofmblock; } } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u - (1-handle->desc.pad_h_in); ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); } oi_use = oi; oj_use = oj; last_kj = handle->desc.R-1; last_ki = handle->desc.S-1; next_kj = kj+1; if (kj == 0 && oj == 0) { /* Do no FLOPS */ } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { /* Do no FLOPS */ } else if ( oi == 0 && ki == 0 ) { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki + 1, 0, handle->blocksifm, IFH, 
IFW, handle->ifmblock); ind++; } n_blocks = ind; if (handle->avoid_acc_load == 1) { br_gemm_kernel2_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else { out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); if (ifm2 == handle->blocksifm && ((kj == last_kj && ki == last_ki) || (next_kj == 0 && next_kj == last_kj && oj == 0) || (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { cvt_params.in_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_params.out_ptr = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_kernel(&cvt_params); } } } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } n_blocks = ind; if (handle->avoid_acc_load == 1) { br_gemm_kernel2_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else { out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); if (ifm2 == handle->blocksifm && ((kj == last_kj && ki == last_ki) || (next_kj == 0 
&& next_kj == last_kj && oj == 0) || (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { cvt_params.in_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_params.out_ptr = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_kernel(&cvt_params); } } } else { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } n_blocks = ind; if (handle->avoid_acc_load == 1) { br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else { out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); if (ifm2 == handle->blocksifm && ((kj == last_kj && ki == last_ki) || (next_kj == 0 && next_kj == last_kj && oj == 0) || (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { cvt_params.in_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_params.out_ptr = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_kernel(&cvt_params); } } } } } } } } } } else { for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { img = imgofm1ofh / (handle->blocksofm*handle->ofh); ofm1 = (imgofm1ofh % 
(handle->blocksofm*handle->ofh))/handle->ofh; oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { /* set output feature map to zero */ float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (float)0; } temp_ptr += handle->ofmblock; } } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u; ii_use = oi * handle->desc.v; } oi_use = oi; oj_use = oj; ind = 0; kj = 0; ki = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } } } n_blocks = ind; if (handle->avoid_acc_load == 1) { br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else { out_ptr = (handle->avoid_acc_load == 1) ? 
&LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, handle->fwd_ofw_rb, handle->ofmblock) : &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); if (ifm2 == handle->blocksifm && kj == handle->desc.R && ki == handle->desc.S) { cvt_params.in_ptr = &LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_params.out_ptr = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_kernel(&cvt_params); } } } } } } } } else { if (handle->loop_order == 0) { if ( handle->avoid_fmas_in_rim == 1) { for (img = my_img_start; img < my_img_end; img++) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { ofm1 = (handle->shuffle_filter_accesses == 1) ? 
(ofm11+ltid)%handle->blocksofm : ofm11; if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (float)0; } temp_ptr += handle->ofmblock; } } } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { for (kj1 = 0; kj1 < handle->desc.R; kj1++) { for (ki1 = 0; ki1 < handle->desc.S; ki1++) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u - (1-handle->desc.pad_h_in); ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); } oi_use = oi; oj_use = oj; ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; kj = (handle->shuffle_filter_accesses == 1) ? (kj1+ltid)%handle->desc.R : kj1; last_ki = (handle->shuffle_filter_accesses == 1) ? (handle->desc.S-1+ltid)%handle->desc.S : handle->desc.S-1; last_kj = (handle->shuffle_filter_accesses == 1) ? (handle->desc.R-1+ltid)%handle->desc.R : handle->desc.R-1; next_kj = (handle->shuffle_filter_accesses == 1) ? 
(kj1+1+ltid)%handle->desc.R : kj1+1; if (kj == 0 && oj == 0) { /* Do no FLOPS */ } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { /* Do no FLOPS */ } else if ( oi == 0 && ki == 0 ) { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } n_blocks = ind; if (handle->avoid_acc_load == 1) { br_gemm_kernel2_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else { out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); if (ifm2 == handle->blocksifm && ((kj == last_kj && ki == last_ki) || (next_kj == 0 && next_kj == last_kj && oj == 0) || (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { cvt_params.in_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_params.out_ptr = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_kernel(&cvt_params); } } } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, 
handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } n_blocks = ind; if (handle->avoid_acc_load == 1) { br_gemm_kernel2_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else { out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); if (ifm2 == handle->blocksifm && ((kj == last_kj && ki == last_ki) || (next_kj == 0 && next_kj == last_kj && oj == 0) || (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { cvt_params.in_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_params.out_ptr = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_kernel(&cvt_params); } } } else { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } n_blocks = ind; if (handle->avoid_acc_load == 1) { br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else { out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); if (ifm2 == handle->blocksifm && ((kj == last_kj && ki == last_ki) || (next_kj == 0 && next_kj == last_kj && oj == 0) || (next_kj == 
handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { cvt_params.in_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_params.out_ptr = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_kernel(&cvt_params); } } } } } } } } } } } } } } else { for (img = my_img_start; img < my_img_end; img++) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { ofm1 = (handle->shuffle_filter_accesses == 1) ? (ofm11+ltid)%handle->blocksofm : ofm11; if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (float)0; } temp_ptr += handle->ofmblock; } } } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u; ii_use = oi * handle->desc.v; } oi_use = oi; oj_use = oj; ind = 0; kj1 = 0; ki1 = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { for (kj1 = 0; kj1 < 
handle->desc.R; kj1++) { for (ki1 = 0; ki1 < handle->desc.S; ki1++) { ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; kj = (handle->shuffle_filter_accesses == 1) ? (kj1+ltid)%handle->desc.R : kj1; A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } } } n_blocks = ind; if (handle->avoid_acc_load == 1) { br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else { out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); if (kj1 == handle->desc.R && ki1 == handle->desc.S && ifm2 == handle->blocksifm) { cvt_params.in_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_params.out_ptr = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_kernel(&cvt_params); } } } } } } } } } } } } if (handle->loop_order == 1) { for (img = my_img_start; img < my_img_end; img++) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm1++ ) { if (((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && oj == 0 && 
oi == 0) { /* set output feature map to zero */ for (ojj = 0; ojj < handle->ofh; ++ojj) { float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, ojj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oii = 0; oii < handle->ofw; ++oii) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (float)0; } temp_ptr += handle->ofmblock; } } } for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u; ii_use = oi * handle->desc.v; } oi_use = oi; oj_use = oj; ind = 0; kj = 0; ki = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } } } n_blocks = ind; if (handle->avoid_acc_load == 1) { br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else { out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); if (kj == handle->desc.R && ki == handle->desc.S && ifm2 == handle->blocksifm) { cvt_params.in_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_params.out_ptr = 
&LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); cvt_kernel(&cvt_params); } } } } } } } } } } } #if 0 /* In case we used intermediate fp32 buffer, now downconvert the result to the actual bf16 output */ if (handle->avoid_acc_load == 0) { for (img = my_img_start; img < my_img_end; img++) { for (ofm1 = my_ofm_start; ofm1 < my_ofm_end; ofm1++) { for (oj = 0; oj < handle->ofh; oj++) { LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), handle->ofw * handle->ofmblock); } } } } #endif } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i32.tpl.c000066400000000000000000000250511415223013700316530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) 
******************************************************************************/ int img, ofm1, ofm2, ifm1, ifm2, oj, oi, kj, ki, ii_use, ij_use, oii, spread_out = 1; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int w_tasks = handle->ofw/handle->fwd_ofw_rb; const int work = handle->desc.N * handle->blocksofm * handle->ofh * w_tasks; const int work_KHW = handle->blocksofm * handle->ofh * w_tasks; const int work_HW = handle->ofh * w_tasks; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; int imgofm1ofhofw; int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; /* Batch reduce related variables */ unsigned long long n_blocks; /* offset output pointer in case of physical output padding */ element_output_type* out = (element_output_type*)handle->reg_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; LIBXSMM_VLA_DECL(5, element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); element_input_type *input_ptr = (handle->pack_input == 1) ?(element_input_type*)((char*)handle->scratch + handle->fwd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; const int IFW = (handle->pack_input == 1) ? handle->ofwp : handle->ifwp; const int IFH = (handle->pack_input == 1) ? 
handle->ofhp : handle->ifhp; LIBXSMM_VLA_DECL(5, element_input_type, input, input_ptr, handle->blocksifm, IFH, IFW, handle->ifmblock); LIBXSMM_VLA_DECL(7, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); libxsmm_barrier_init(handle->barrier, ltid); if (handle->pack_input == 1) { int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); int ifm_id = ltid % spread_out; int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (oj = 0; oj < handle->ofh; oj++) { for (oi = 0; oi < handle->ofw; oi++) { ij_use = oj * handle->desc.u; ii_use = oi * handle->desc.v; LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj, oi, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij_use, ii_use, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); } } } } } if ( handle->use_ofm_parallelization == 1 ) { libxsmm_barrier_wait(handle->barrier, ltid); } } if (handle->avoid_fmas_in_rim == 1) { n_blocks = handle->blocksifm_blocking; for (imgofm1ofhofw = thr_begin; imgofm1ofhofw < thr_end; ++imgofm1ofhofw) { img = imgofm1ofhofw / work_KHW; ofm1 = (imgofm1ofhofw % work_KHW)/work_HW; oj = ((imgofm1ofhofw % work_KHW)%work_HW)/w_tasks; oi = (((imgofm1ofhofw % work_KHW)%work_HW)%w_tasks)*handle->fwd_ofw_rb; ij_use = (handle->pack_input == 1) ? oj : oj * handle->desc.u - (1-handle->desc.pad_h_in); ii_use = (handle->pack_input == 1) ? 
oi : oi * handle->desc.v - (1-handle->desc.pad_w_in); if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { /* set output feature map to zero */ element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oii = 0; oii < handle->fwd_ofw_rb; ++oii) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->ofmblock; } } for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1 += handle->blocksifm_blocking) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { if (kj == 0 && oj == 0) { /* Do no FLOPS */ } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { /* Do no FLOPS */ } else if ( oi == 0 && ki == 0 ) { br_gemm_kernel_strided2( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm1, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use+kj, ii_use+ki+1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi+1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { br_gemm_kernel_strided2( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm1, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use+kj, ii_use+ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } else { br_gemm_kernel_strided( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm1, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, 
handle->fm_lp_block), &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use+kj, ii_use+ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } } } } } } else { /* Strided based BRGEMM */ n_blocks = (unsigned long long)handle->blocksifm_blocking * handle->desc.R * handle->desc.S; if (handle->desc.R == 1 && handle->desc.S == 1) { for (imgofm1ofhofw = thr_begin; imgofm1ofhofw < thr_end; ++imgofm1ofhofw) { img = imgofm1ofhofw / work_KHW; ofm1 = (imgofm1ofhofw % work_KHW)/work_HW; oj = ((imgofm1ofhofw % work_KHW)%work_HW)/w_tasks; oi = (((imgofm1ofhofw % work_KHW)%work_HW)%w_tasks)*handle->fwd_ofw_rb; ij_use = (handle->pack_input == 1) ? oj : oj * handle->desc.u; ii_use = (handle->pack_input == 1) ? oi : oi * handle->desc.v; if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { /* set output feature map to zero */ element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oii = 0; oii < handle->fwd_ofw_rb; ++oii) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->ofmblock; } } for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1 += handle->blocksifm_blocking) { br_gemm_kernel_strided( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm1, 0, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); } } } else { /* Offset based BRGEMM */ for (imgofm1ofhofw = thr_begin; imgofm1ofhofw < thr_end; ++imgofm1ofhofw) { img = imgofm1ofhofw / work_KHW; ofm1 = 
(imgofm1ofhofw % work_KHW)/work_HW; oj = ((imgofm1ofhofw % work_KHW)%work_HW)/w_tasks; oi = (((imgofm1ofhofw % work_KHW)%work_HW)%w_tasks)*handle->fwd_ofw_rb; ij_use = (handle->pack_input == 1) ? oj : oj * handle->desc.u; ii_use = (handle->pack_input == 1) ? oi : oi * handle->desc.v; if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { /* set output feature map to zero */ element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); for (oii = 0; oii < handle->fwd_ofw_rb; ++oii) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->ofmblock; } } for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1 += handle->blocksifm_blocking) { br_gemm_kernel_offset( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm1, 0, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks, handle->A_offsets, handle->B_offsets); } } } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i8.tpl.c000066400000000000000000000107221415223013700315750ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) ******************************************************************************/ const int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; int imgofm1ofhofw, img, ofm1, oj, oi, ii, ij; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int w_tasks = handle->ofw/handle->fwd_ofw_rb; const int work = handle->desc.N * handle->blocksofm * handle->ofh * w_tasks; const int work_KHW = handle->blocksofm * handle->ofh * w_tasks; const int work_HW = handle->ofh * w_tasks; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* Batch reduce related variables */ unsigned long long n_blocks = (unsigned long long)handle->blocksifm_blocking * handle->desc.R * handle->desc.S; /* Calculate scaling factor here for output... 
*/ float _scf = libxsmm_sexp2_i8i(-(handle->reg_filter->scf + handle->reg_input->scf - handle->reg_output->scf)); /* offset output pointer in case of physical output padding */ LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); LIBXSMM_VLA_DECL(7, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); libxsmm_barrier_init(handle->barrier, ltid); if (handle->desc.R == 1 && handle->desc.S == 1) { /* Strided based BRGEMM */ for (imgofm1ofhofw = thr_begin; imgofm1ofhofw < thr_end; ++imgofm1ofhofw) { img = imgofm1ofhofw / work_KHW; ofm1 = (imgofm1ofhofw % work_KHW)/work_HW; oj = ((imgofm1ofhofw % work_KHW)%work_HW)/w_tasks; oi = (((imgofm1ofhofw % work_KHW)%work_HW)%w_tasks)*handle->fwd_ofw_rb; ij = oj * handle->desc.u; ii = oi * handle->desc.v; br_gemm_kernel_strided( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, 0, 0, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), &LIBXSMM_VLA_ACCESS(5, input, img, 0, ij, ii, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks, &_scf); } } else { /* Offset based BRGEMM */ for (imgofm1ofhofw = thr_begin; imgofm1ofhofw < thr_end; ++imgofm1ofhofw) { img = imgofm1ofhofw / work_KHW; ofm1 = (imgofm1ofhofw % work_KHW)/work_HW; oj = ((imgofm1ofhofw % work_KHW)%work_HW)/w_tasks; oi = (((imgofm1ofhofw % work_KHW)%work_HW)%w_tasks)*handle->fwd_ofw_rb; ij = 
oj * handle->desc.u; ii = oi * handle->desc.v; br_gemm_kernel_offset( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, 0, 0, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), &LIBXSMM_VLA_ACCESS(5, input, img, 0, ij, ii, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), &LIBXSMM_VLA_ACCESS(5 , output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks, handle->A_offsets, handle->B_offsets, &_scf); } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c000066400000000000000000000724251415223013700313310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) 
******************************************************************************/ int img, ofm1, ofm2 = 0, ifm1, ifm2 = 0, oj, oi, kj, ki, oi_use, oj_use, ii_use, ij_use, ofmb, ifmb, ojb, myOfmId, nOfmBlocks, ind, ofm11, ki1, kj1, ojj, oii, ii, ij, spread_out = 1; /* computing first logical thread */ const int ltid = tid - start_thread; int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); int threads_per_image = handle->desc.threads / handle->desc.N; int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); int my_ofm_start = 0; int my_ofm_end = handle->blocksofm; /* Batch reduce related variables */ const element_filter_type *A_ptrs[1024]; const element_input_type *B_ptrs[1024]; unsigned long long n_blocks; /* offset output pointer in case of physical output padding */ element_output_type* out = (element_output_type*)handle->reg_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->blocksofm * handle->ofmblock; LIBXSMM_VLA_DECL(5, element_output_type, output, out, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); element_input_type *input_ptr = ( (handle->pack_input == 1) || (handle->fwd_padding_copy == 1) ) ?(element_input_type*)((char*)handle->scratch + handle->fwd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; const int IFW = (handle->fwd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : ( (handle->pack_input == 1) ? handle->ofwp : handle->ifwp ); const int IFH = (handle->fwd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : ( (handle->pack_input == 1) ? 
handle->ofhp : handle->ifhp ); LIBXSMM_VLA_DECL(5, element_input_type, input, input_ptr, IFH, IFW, handle->blocksifm, handle->ifmblock); #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM LIBXSMM_VLA_DECL(6, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK LIBXSMM_VLA_DECL(6, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); if ( imgpt <= 1 ) { my_img_start = LIBXSMM_MIN(ltid / threads_per_image, handle->desc.N); my_img_end = LIBXSMM_MIN(my_img_start + 1, handle->desc.N); myOfmId = ltid % threads_per_image; nOfmBlocks = LIBXSMM_UPDIV(handle->blocksofm, threads_per_image); my_ofm_start = LIBXSMM_MIN(myOfmId * nOfmBlocks, handle->blocksofm); my_ofm_end = LIBXSMM_MIN((myOfmId+1) * nOfmBlocks, handle->blocksofm); } if ( handle->use_ofm_parallelization == 1 ) { if ( handle->desc.N % 8 == 0) { spread_out = 8; } else if ( handle->desc.N % 4 == 0) { spread_out = 4; } else if (handle->desc.N % 2 == 0) { spread_out = 2; } else if (handle->desc.N % 3 == 0) { spread_out = 3; } else { spread_out = 1; } if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { int tile_id = ltid / spread_out; int ofmpt = LIBXSMM_UPDIV(handle->blocksofm, spread_out); int ofm_id = ltid % spread_out; imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads) * spread_out; my_img_start = LIBXSMM_MIN(tile_id * imgpt, handle->desc.N); my_img_end = LIBXSMM_MIN((tile_id+1) * imgpt, handle->desc.N); my_ofm_start = LIBXSMM_MIN(ofm_id * ofmpt, handle->blocksofm); my_ofm_end = LIBXSMM_MIN((ofm_id+1) * ofmpt, handle->blocksofm); } } /* remove stride from input */ if (handle->pack_input == 1) { int ifmpt = 
LIBXSMM_UPDIV(handle->blocksifm, spread_out); int ifm_id = ltid % spread_out; int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); /* @TODO think about packed format */ LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (oj = 0; oj < handle->ofh; oj++) { for (oi = 0; oi < handle->ofw; oi++) { ij_use = oj * handle->desc.u; ii_use = oi * handle->desc.v; LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input, img, oj, oi, ifm1, ifm2, IFH, IFW, handle->blocksifm, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ij_use, ii_use, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); } } } } } if ( handle->use_ofm_parallelization == 1 ) { libxsmm_barrier_wait(handle->barrier, ltid); } } /* physical pad input */ if (handle->fwd_padding_copy == 1) { int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); int ifm_id = ltid % spread_out; int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { /* copy the inner part */ for (ij = 0; ij < handle->ifhp+(2*handle->desc.pad_h); ij++) { for (ii = 0; ii < handle->ifwp+(2*handle->desc.pad_w); ii++) { if ( (ij >= handle->desc.pad_h) && (ii >= handle->desc.pad_w) && (ij < handle->ifhp+handle->desc.pad_h) && (ii < handle->ifwp+handle->desc.pad_w) ) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { 
LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, ifm2, IFH, IFW, handle->blocksifm, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ij-handle->desc.pad_h, ii-handle->desc.pad_w, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); } } else { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, ifm2, IFH, IFW, handle->blocksifm, handle->ifmblock) = (element_input_type)0; } } } } } } if ( handle->use_ofm_parallelization == 1 ) { libxsmm_barrier_wait(handle->barrier, ltid); } } if (handle->use_fallback_fwd_loops == 1) { /* number of tasks that could be run in parallel */ const int work = handle->desc.N * handle->blocksofm * handle->ofh; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; int imgofm1ofh; if ( handle->avoid_fmas_in_rim == 1) { for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { img = imgofm1ofh / (handle->blocksofm*handle->ofh); #if 1 ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->ofh; oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; #else oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->blocksofm; ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->blocksofm; #endif for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { /* set output feature map to zero */ element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, oj, 0, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->blocksofm*handle->ofmblock; } } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u - (1-handle->desc.pad_h_in); ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); } oi_use = oi; oj_use = oj; if (kj == 0 && oj == 0) { /* Do no FLOPS */ } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { /* Do no FLOPS */ } else if ( oi == 0 && ki == 0 ) { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, 
handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki + 1, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use + 1, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); } else { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, 
handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); } } } } } } } } else { for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { img = imgofm1ofh / (handle->blocksofm*handle->ofh); #if 1 ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->ofh; oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; #else oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->blocksofm; ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->blocksofm; #endif for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { /* set output feature map to zero */ element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, oj, 0, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->blocksofm*handle->ofmblock; } } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u; ii_use = oi * handle->desc.v; } oi_use = oi; oj_use = oj; ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { #ifdef 
LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); ind++; } } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); } } } } } } else { if (handle->loop_order == 0) { if ( handle->avoid_fmas_in_rim == 1) { for (img = my_img_start; img < my_img_end; img++) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { ofm1 = (handle->shuffle_filter_accesses == 1) ? 
(ofm11+ltid)%handle->blocksofm : ofm11; if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, oj, 0, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->blocksofm*handle->ofmblock; } } } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { for (kj1 = 0; kj1 < handle->desc.R; kj1++) { for (ki1 = 0; ki1 < handle->desc.S; ki1++) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u - (1-handle->desc.pad_h_in); ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); } oi_use = oi; oj_use = oj; ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; kj = (handle->shuffle_filter_accesses == 1) ? 
(kj1+ltid)%handle->desc.R : kj1; if (kj == 0 && oj == 0) { /* Do no FLOPS */ } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { /* Do no FLOPS */ } else if ( oi == 0 && ki == 0 ) { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki + 1, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use + 1, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); } else { ind = 0; for (ifm2 
= ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); } } } } } } } } } } } } else { for (img = my_img_start; img < my_img_end; img++) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { ofm1 = (handle->shuffle_filter_accesses == 1) ? 
(ofm11+ltid)%handle->blocksofm : ofm11; if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { /* set output feature map to zero */ for (oj = 0; oj < handle->ofh; ++oj) { element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, oj, 0, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock)); for (oi = 0; oi < handle->ofw; ++oi) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->blocksofm * handle->ofmblock; } } } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u; ii_use = oi * handle->desc.v; } oi_use = oi; oj_use = oj; ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { for (kj1 = 0; kj1 < handle->desc.R; kj1++) { for (ki1 = 0; ki1 < handle->desc.S; ki1++) { ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; kj = (handle->shuffle_filter_accesses == 1) ? 
(kj1+ltid)%handle->desc.R : kj1; #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); ind++; } } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); } } } } } } } } } } if (handle->loop_order == 1) { for (img = my_img_start; img < my_img_end; img++) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm1++ ) { if (((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && oj == 0 && oi == 0) { /* set output feature map to zero */ for (ojj = 0; ojj < handle->ofh; ++ojj) { element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ojj, 0, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock)); for (oii = 0; oii < handle->ofw; ++oii) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { temp_ptr[ofm2] = (element_output_type)0; } temp_ptr += handle->blocksofm * handle->ofmblock; } } } for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, 
handle->blocksifm); ifm1 += handle->blocksifm_blocking) { /* Prepare batch-reduce kernel arguments */ if (handle->pack_input == 1) { ij_use = oj; ii_use = oi; } else { ij_use = oj * handle->desc.u; ii_use = oi * handle->desc.v; } oi_use = oi; oj_use = oj; ind = 0; for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { for (kj = 0; kj < handle->desc.R; kj++) { for (ki = 0; ki < handle->desc.S; ki++) { #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); ind++; } } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); } } } } } } } } } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic.tpl.c000066400000000000000000001047221415223013700307500ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke (Intel Corp.) 
******************************************************************************/ int img, my_img_start, my_img_end, ofmb, ifmb, ojb, ofm1, ifm1, ifm2 = 0, ofm2 = 0, oj, oi, ii, ij, kj, ki, ind, j_br, img_br, img_block_size = 1, my_ofm_start, my_ofm_end, my_ifm_start, my_ifm_end, block_ofm, block_ifm; /* computing first logical thread */ const int ltid = tid - start_thread; libxsmm_blasint LDA = handle->ofmblock; libxsmm_blasint LDB = (handle->upd_pack_input == 1) ? handle->ifmblock : handle->desc.v * handle->ifmblock; libxsmm_blasint LDC = handle->ofmblock; int l_flags = LIBXSMM_GEMM_FLAGS('N', 'T'); element_output_type *const out = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; LIBXSMM_VLA_DECL(5, const element_output_type, output, (const element_output_type*)out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); const int IFWP = (handle->upd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : handle->ifwp; const int IFHP = (handle->upd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : handle->ifhp; element_input_type *input_ptr_to_use = (handle->upd_padding_copy == 1) ? (element_input_type*) ((char*)handle->scratch + handle->upd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type*) input_ptr_to_use, handle->blocksifm, IFHP, IFWP, handle->ifmblock); LIBXSMM_VLA_DECL(6, element_filter_type, weight_global, (element_filter_type*)handle->grad_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); element_filter_type *weight_ptr = (handle->weight_copies == 1) ? 
(element_filter_type*)handle->grad_filter->data : (element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset) + ltid * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; LIBXSMM_VLA_DECL(6, element_filter_type, weight_private, (element_filter_type*)weight_ptr, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); int prefetch_mode = (handle->desc.u == 2 || (handle->desc.R == 3 && handle->ofw == 7) ) ? libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE) : libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BL1); /* Batch reduce related variables */ const element_output_type *A_ptrs[1024]; const element_input_type *B_ptrs[1024]; unsigned long long n_blocks; libxsmm_barrier_init(handle->barrier, ltid); /* physical pad input */ if (handle->upd_padding_copy == 1) { LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); my_ifm_start = 0; my_ifm_end = handle->blocksifm; for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { /* copy the inner part */ for (ij = 0; ij < handle->ifhp+(2*handle->desc.pad_h); ij++) { for (ii = 0; ii < handle->ifwp+(2*handle->desc.pad_w); ii++) { if ( (ij >= handle->desc.pad_h) && (ii >= handle->desc.pad_w) && (ij < handle->ifhp+handle->desc.pad_h) && (ii < handle->ifwp+handle->desc.pad_w) ) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFHP, IFWP, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij-handle->desc.pad_h, ii-handle->desc.pad_w, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); } } else { 
LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFHP, IFWP, handle->ifmblock) = (element_input_type)0; } } } } } } libxsmm_barrier_wait(handle->barrier, ltid); } if (handle->upd_use_batchreduce == 0 && handle->upd_linearized_tasklist == 0) { /* Parallelize over minibatch */ const int img_work = handle->desc.N; const int img_chunksize = (img_work % handle->desc.threads == 0) ? (img_work / handle->desc.threads) : (img_work / handle->desc.threads) + 1; const float beta = ((img_chunksize == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 0.f : 1.f; gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb * handle->upd_ofh_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); my_img_start = (ltid * img_chunksize < img_work) ? (ltid * img_chunksize) : img_work; my_img_end = ((ltid + 1) * img_chunksize < img_work) ? 
((ltid + 1) * img_chunksize) : img_work; if (!((img_chunksize == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw))) { memset(weight_ptr, 0, handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S * sizeof(element_filter_type)); } if (handle->upd_loop_order == 0) { for (img = my_img_start; img < my_img_end; img++) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { for (kj = 0; kj < handle->desc.R; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { ii = oi * handle->desc.u + ki; ij = oj * handle->desc.v + kj; gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock), &LIBXSMM_VLA_ACCESS(6, weight_private, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) ); } } } } } } } } } } } if (handle->upd_loop_order == 1) { for (img = my_img_start; img < my_img_end; img++) { for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { for (oj = ojb; oj < 
LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { for (kj = 0; kj < handle->desc.R; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { ii = oi * handle->desc.u + ki; ij = oj * handle->desc.v + kj; gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock), &LIBXSMM_VLA_ACCESS(6, weight_private, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) ); } } } } } } } } } } } } else { if (handle->upd_linearized_tasklist == 1) { /* Amount of work when using linearized view of tasks */ const int work = handle->desc.R * handle->desc.S * handle->blocksofm * handle->blocksifm; const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : (work / handle->desc.threads) + 1; const int work_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int work_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; int work_item; int Cb = handle->blocksifm; #if 0 int Kb = handle->blocksofm; #endif int R = handle->desc.R; int S = handle->desc.S; if (handle->upd_avoid_rim_fmas == 0) { const int IFH = (handle->upd_pack_input == 1) ? handle->ifhp/handle->desc.u : IFHP; const int IFW = (handle->upd_pack_input == 1) ? handle->ifwp/handle->desc.v : IFWP; element_input_type *input_ptr_base = (handle->upd_pack_input == 1) ? (element_input_type*) ((char*)handle->scratch + handle->upd_packing_padding_scratch_offset) : (element_input_type*)input_ptr_to_use; LIBXSMM_VLA_DECL(5, element_input_type, input_use, (element_input_type*)input_ptr_base, handle->blocksifm, IFH, IFW, handle->ifmblock); const float beta = ((handle->desc.N == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 
0.f : 1.f; gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb * handle->upd_ofh_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); /* If requested, pack input to avoid strided accesses */ if (handle->upd_pack_input == 1) { LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, IFHP, IFWP, handle->ifmblock); const int img_chunk = (handle->desc.N % handle->desc.threads == 0) ? handle->desc.N/handle->desc.threads : (handle->desc.N/handle->desc.threads) + 1; const int img_copy_start = LIBXSMM_MIN(ltid*img_chunk, handle->desc.N); const int img_copy_end = LIBXSMM_MIN((ltid+1)*img_chunk, handle->desc.N); for (img = img_copy_start; img < img_copy_end; img++) { for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { for (oj = 0; oj < handle->ofh; oj++) { for (oi = 0; oi < handle->ofw; oi++) { ij = oj * handle->desc.u; ii = oi * handle->desc.v; LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input_use, img, ifm1, oj, oi, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFHP, IFWP, handle->ifmblock); } } } } } libxsmm_barrier_wait(handle->barrier, ltid); } /* Initialize weights to zero */ if (!((handle->desc.N == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw))) { for (work_item = work_begin; work_item < work_end; work_item++) { ofm1 = work_item/(Cb*R*S); ifm1 = (work_item%(Cb*R*S))/(R*S); kj = ((work_item%(Cb*R*S))%(R*S))/S; ki = ((work_item%(Cb*R*S))%(R*S))%S; for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) = (element_filter_type)0; } } } } for (img = 0; img < handle->desc.N; 
img++) { for (work_item = work_begin; work_item < work_end; work_item++) { ofm1 = work_item/(Cb*R*S); ifm1 = (work_item%(Cb*R*S))/(R*S); kj = ((work_item%(Cb*R*S))%(R*S))/S; ki = ((work_item%(Cb*R*S))%(R*S))%S; oi = 0; ii = ki; for (oj = 0; oj < handle->ofh; oj += handle->upd_ofh_rb) { ij = oj * handle->desc.u + kj; gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &LIBXSMM_VLA_ACCESS(5, input_use, img, ifm1, ij, ii, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) ); } } } } else { const float beta = ((handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 0.f : 1.f; gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb-1, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); for (work_item = work_begin; work_item < work_end; work_item++) { ofm1 = work_item/(Cb*R*S); ifm1 = (work_item%(Cb*R*S))/(R*S); kj = ((work_item%(Cb*R*S))%(R*S))/S; ki = ((work_item%(Cb*R*S))%(R*S))%S; oi = 0; oj = 0; ii = oi * handle->desc.u + ki; ij = oj * handle->desc.v + kj; img = 0; img_block_size = handle->desc.N; if (kj == 0) { ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 1; j_br < handle->upd_ofh_rb; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); ind++; } } n_blocks = ind; 
br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); } else if (ki == 0) { ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii + 1, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); ind++; } } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); ind++; } } n_blocks = ind; br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); } else { if (kj == handle->desc.R-1) { ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 0; j_br < handle->upd_ofh_rb-1; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, 
IFHP, IFWP, handle->ifmblock); ind++; } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); } else { ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); ind++; } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); } } } } } else { /* Here we are using batch-reduce kernel and hybrid minibatch/FM parallelization */ /* FIXME: Hardcoed logic for N=27 */ int group_size = (handle->desc.threads == 27 && handle->desc.N == 27 && handle->ofw == 14 && handle->desc.R == 1 && handle->desc.u == 1 && ltid >= 24) ? 3 : LIBXSMM_UPDIV(handle->desc.threads, handle->weight_copies); int tile_id = ltid / LIBXSMM_UPDIV(handle->desc.threads, handle->weight_copies); int tiles = handle->weight_copies; int img_per_tile = LIBXSMM_UPDIV(handle->desc.N, tiles); int my_in_tile_id = ltid % group_size; int ifms_per_thread = LIBXSMM_UPDIV(handle->blocksifm, group_size); int ofms_per_thread = LIBXSMM_UPDIV(handle->blocksofm, group_size); int my_R_start = 0; int my_R_end = handle->desc.R; const float beta = ((handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 
0.f : 1.f; gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); const float beta_flat = 0.0; gemm_br_function br_gemm_kernel_flat = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb, &LDA, &LDB, &LDC, NULL, &beta_flat, &l_flags, &prefetch_mode); element_filter_type *weight_ptr_group = (handle->weight_copies > 1) ? (element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset) + tile_id * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; LIBXSMM_VLA_DECL(6, element_filter_type, weight_private_group, (element_filter_type*)weight_ptr_group, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); my_img_start = LIBXSMM_MIN(tile_id * img_per_tile, handle->desc.N); my_img_end = LIBXSMM_MIN((tile_id+1) * img_per_tile, handle->desc.N); my_ifm_start = LIBXSMM_MIN(my_in_tile_id * ifms_per_thread, handle->blocksifm ); my_ifm_end = LIBXSMM_MIN((my_in_tile_id+1) * ifms_per_thread, handle->blocksifm ); my_ofm_start = 0; my_ofm_end = handle->blocksofm; /* FIXME: Hardcoed logic for N=27 */ if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.C == 256 && handle->desc.K == 1024 && handle->ofh == 14 && handle->desc.u == 1) { my_ofm_start = LIBXSMM_MIN(my_in_tile_id * ofms_per_thread, handle->blocksofm); my_ofm_end = LIBXSMM_MIN((my_in_tile_id+1) * ofms_per_thread, handle->blocksofm); my_ifm_start = 0; my_ifm_end = handle->blocksifm; } if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.R == 3 && handle->desc.S == 3 && handle->ofh == 14) { int r_per_tile = LIBXSMM_UPDIV(handle->desc.R, group_size); my_ifm_start = 0; my_ifm_end = handle->blocksifm; my_ofm_start = 0; my_ofm_end = handle->blocksofm; my_R_start = LIBXSMM_MIN(my_in_tile_id * r_per_tile, 
handle->desc.R); my_R_end = LIBXSMM_MIN((my_in_tile_id+1) * r_per_tile, handle->desc.R); } if (handle->desc.threads == 92 && handle->desc.N == 92 && handle->desc.C == 512 && handle->desc.K == 512 && handle->ofh == 7 && handle->desc.u == 1 && handle->desc.R == 3) { my_ofm_start = LIBXSMM_MIN(my_in_tile_id * ofms_per_thread, handle->blocksofm); my_ofm_end = LIBXSMM_MIN((my_in_tile_id+1) * ofms_per_thread, handle->blocksofm); my_ifm_start = 0; my_ifm_end = handle->blocksifm; } block_ofm = my_ofm_end-my_ofm_start+1; block_ifm = my_ifm_end-my_ifm_start+1; img_block_size = my_img_end - my_img_start; if (handle->desc.N != handle->desc.threads) { /* Use "flat" parallelism + reduction */ const int work = handle->desc.R * handle->desc.S * handle->blocksofm * handle->blocksifm * handle->desc.N; const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : (work / handle->desc.threads) + 1; const int work_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int work_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; int work_item; int Cb = handle->blocksifm; int Kb = handle->blocksofm; int R = handle->desc.R; int S = handle->desc.S; const int IFH = (handle->upd_pack_input == 1) ? handle->ifhp/handle->desc.u : IFHP; const int IFW = (handle->upd_pack_input == 1) ? handle->ifwp/handle->desc.v : IFWP; element_input_type *input_ptr_base = (handle->upd_pack_input == 1) ? 
(element_input_type*) ((char*)handle->scratch + handle->upd_packing_padding_scratch_offset) : (element_input_type*)input_ptr_to_use; LIBXSMM_VLA_DECL(5, element_input_type, input_use, (element_input_type*)input_ptr_base, handle->blocksifm, IFH, IFW, handle->ifmblock); /* If requested, pack input to avoid strided accesses */ if (handle->upd_pack_input == 1) { LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); const int img_chunk = (handle->desc.N % handle->desc.threads == 0) ? handle->desc.N/handle->desc.threads : (handle->desc.N/handle->desc.threads) + 1; const int img_copy_start = LIBXSMM_MIN(ltid*img_chunk, handle->desc.N); const int img_copy_end = LIBXSMM_MIN((ltid+1)*img_chunk, handle->desc.N); for (img = img_copy_start; img < img_copy_end; img++) { for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { for (oj = 0; oj < handle->ofh; oj++) { for (oi = 0; oi < handle->ofw; oi++) { ij = oj * handle->desc.u; ii = oi * handle->desc.v; LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input_use, img, ifm1, oj, oi, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); } } } } } libxsmm_barrier_wait(handle->barrier, ltid); } /* Initialize weights to zero */ if (handle->upd_ofw_rb != handle->ofw) { for (work_item = work_begin; work_item < work_end; work_item++) { img = work_item/(Cb*Kb*R*S); ofm1 = (work_item%(Cb*Kb*R*S))/(Cb*R*S); ifm1 = ((work_item%(Cb*Kb*R*S))%(Cb*R*S))/(R*S); kj = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))/S; ki = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))%S; { element_filter_type *weight_ptr_current = (handle->weight_copies > 1) ? 
(element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset) + img * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; LIBXSMM_VLA_DECL(6, element_filter_type, weight_current, (element_filter_type*)weight_ptr_current, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { LIBXSMM_VLA_ACCESS(6, weight_current, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) = (element_filter_type)0; } } } } } for (work_item = work_begin; work_item < work_end; work_item++) { img = work_item/(Cb*Kb*R*S); ofm1 = (work_item%(Cb*Kb*R*S))/(Cb*R*S); ifm1 = ((work_item%(Cb*Kb*R*S))%(Cb*R*S))/(R*S); kj = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))/S; ki = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))%S; ii = 0 + ki; ij = 0 + kj; oj = 0; oi = 0; { element_filter_type *weight_ptr_current = (handle->weight_copies > 1) ? 
(element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset) + img * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; LIBXSMM_VLA_DECL(6, element_filter_type, weight_current, (element_filter_type*)weight_ptr_current, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); ind = 0; for (j_br = 0; j_br < handle->ofh; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img , ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input_use, img, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); ind++; } n_blocks = ind; br_gemm_kernel_flat(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_current, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); } } } else { /* May need to initialized private weights to zero */ if (!((handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw))) { for (ofm1 = my_ofm_start; ofm1 < my_ofm_end; ofm1++ ) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (kj = my_R_start; kj < my_R_end; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++ ) { for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(6, weight_private_group, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) = (element_filter_type)0; } } } } } } } if (handle->upd_loop_order == 0) { for (img = my_img_start; img < my_img_end; img += img_block_size) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += block_ofm) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += block_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+block_ofm, my_ofm_end); ofm1++ ) { for (ifm1 = 
ifmb; ifm1 < LIBXSMM_MIN(ifmb+block_ifm, my_ifm_end); ifm1++) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { for (kj = my_R_start; kj < my_R_end; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { ii = oi * handle->desc.u + ki; ij = oj * handle->desc.v + kj; ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); ind++; } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_private_group, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); } } } } } } } } } } } else { for (img = my_img_start; img < my_img_end; img += img_block_size) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += block_ifm) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += block_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+block_ifm, my_ifm_end); ifm1++) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+block_ofm, my_ofm_end); ofm1++ ) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { for (kj = my_R_start; kj < my_R_end; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { ii = oi * handle->desc.u + ki; ij = oj * handle->desc.v + kj; ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, 
handle->ofwp, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); ind++; } } n_blocks = ind; br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_private_group, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); } } } } } } } } } } } } } } if (handle->weight_copies > 1) { /* reduce work-related variables */ const int fm_blocking = (handle->ofmblock % 16 == 0) ? 16 : handle->ofmblock; const int reduce_work = handle->blocksofm * handle->blocksifm * handle->desc.R * handle->desc.S * (handle->ofmblock/fm_blocking) * handle->ifmblock; const int reduce_chunksize = (reduce_work % handle->desc.threads == 0) ? (reduce_work / handle->desc.threads) : (reduce_work / handle->desc.threads) + 1; const int reduce_thr_begin = (ltid * reduce_chunksize < reduce_work) ? (ltid * reduce_chunksize) : reduce_work; const int reduce_thr_end = ((ltid + 1) * reduce_chunksize < reduce_work) ? 
((ltid + 1) * reduce_chunksize) : reduce_work; /* Perform reduction here */ libxsmm_barrier_wait(handle->barrier, ltid); for ( ij = reduce_thr_begin; ij < reduce_thr_end; ij++ ) { element_filter_type *weight_ptr_glb = (element_filter_type*) handle->grad_filter->data; #if 1 float weight_sum[64]; int wtcnt = 0; assert( handle->ofmblock <= 64 ); LIBXSMM_PRAGMA_SIMD for ( wtcnt = 0; wtcnt < fm_blocking; ++wtcnt ) { weight_sum[wtcnt] = 0.0f; } for ( ii = 0; ii < handle->weight_copies; ii++ ) { element_filter_type *weight_ptr_src = (element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset) + ii * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S + ij * fm_blocking; LIBXSMM_PRAGMA_SIMD for ( wtcnt = 0; wtcnt < fm_blocking; ++wtcnt ) { weight_sum[wtcnt] += weight_ptr_src[wtcnt]; } } LIBXSMM_PRAGMA_SIMD for ( wtcnt = 0; wtcnt < fm_blocking; ++wtcnt ) { weight_ptr_glb[(ij*fm_blocking) + wtcnt] = weight_sum[wtcnt]; } #else __m512 weight_sum = _mm512_setzero_ps(); for ( ii = 0; ii < handle->weight_copies; ii++ ) { element_filter_type *weight_ptr_src = (element_filter_type*)handle->scratch7 + ii * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S + ij * 16; weight_sum = _mm512_add_ps(weight_sum, LIBXSMM_INTRINSICS_MM512_LOAD_PS(weight_ptr_src)); } _mm512_storeu_ps(&weight_ptr_glb[ij*16], weight_sum); #endif } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16.tpl.c000066400000000000000000001230411415223013700315610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.) ******************************************************************************/ #define TRANS_OUTPUT_TO_VNNI_FORMAT(img, ofm1) do {\ __m512i zero_reg = _mm512_setzero_si512();\ src_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, 0, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock);\ tr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, 0, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2);\ for (pixel_pair = 0; pixel_pair < n_full_pixel_pairs; pixel_pair++) {\ for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2+=32) {\ pixel_0 = _mm512_loadu_si512((element_output_type*)src_out+ofm2);\ pixel_1 = _mm512_loadu_si512(((element_output_type*)src_out+handle->ofmblock+ofm2));\ ofms_lo = _mm512_permutex2var_epi16(pixel_0, idx_lo, pixel_1);\ ofms_hi = _mm512_permutex2var_epi16(pixel_0, idx_hi, pixel_1);\ _mm512_storeu_si512(tr_out+ofm2*2, ofms_lo);\ _mm512_storeu_si512((element_output_type*)tr_out+32+ofm2*2, ofms_hi);\ }\ src_out += 2* handle->ofmblock;\ tr_out += 2*handle->ofmblock;\ }\ if (half_pixel_pair == 1) {\ for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2+=32) {\ pixel_0 = _mm512_loadu_si512((element_output_type*)src_out+ofm2);\ pixel_1 = _mm512_setzero_si512();\ ofms_lo = _mm512_permutex2var_epi16(pixel_0, idx_lo, pixel_1);\ ofms_hi = _mm512_permutex2var_epi16(pixel_0, idx_hi, pixel_1);\ _mm512_storeu_si512(tr_out+ofm2*2, ofms_lo);\ _mm512_storeu_si512((element_output_type*)tr_out+32+ofm2*2, ofms_hi);\ }\ }\ for (oi = ((handle->compute_pixels+1)/2)*2; oi < handle->output_pixels; oi+=2) {\ for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2+=32) {\ tr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, oi/2, ofm2, 0, handle->blocksofm, handle->output_pixels/2, 
handle->ofmblock, 2);\ _mm512_storeu_si512((element_output_type*)tr_out, zero_reg);\ _mm512_storeu_si512((element_output_type*)tr_out+32, zero_reg);\ }\ }\ } while(0) #define TRANS_OUTPUT_W_TO_VNNI_FORMAT(img, ofm1, oj, H) do {\ int h, w_pixel_pair, w_full_pixel_pairs = handle->ofwp/2;\ for (h=0; hblocksofm, handle->ofhp, handle->ofwp, handle->ofmblock);\ tr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(6, tr_output_2, img, 0, h, 0, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp_extended/2, handle->ofmblock, 2);\ for (w_pixel_pair = 0; w_pixel_pair < w_full_pixel_pairs; w_pixel_pair++) {\ for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2+=32) {\ pixel_0 = _mm512_loadu_si512((element_output_type*)src_out+ofm2);\ pixel_1 = _mm512_loadu_si512(((element_output_type*)src_out+handle->ofmblock+ofm2));\ ofms_lo = _mm512_permutex2var_epi16(pixel_0, idx_lo, pixel_1);\ ofms_hi = _mm512_permutex2var_epi16(pixel_0, idx_hi, pixel_1);\ _mm512_storeu_si512(tr_out+ofm2*2, ofms_lo);\ _mm512_storeu_si512((element_output_type*)tr_out+32+ofm2*2, ofms_hi);\ }\ src_out += 2* handle->ofmblock;\ tr_out += 2*handle->ofmblock;\ }\ }\ } while(0) int img, my_img_start, my_img_end, ofmb, ifmb, ofm1, ifm1, ifm2, ofm2, oj, oi, ii, ij, kj, ki, j_br, img_br, i, j, img_block_size = 1, my_ofm_start, my_ofm_end, my_ifm_start, my_ifm_end, block_ofm, block_ifm, pix; /* computing first logical thread */ const int ltid = tid - start_thread; element_output_type *const out = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; LIBXSMM_VLA_DECL(5, const element_output_type, output, (const element_output_type*)out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); LIBXSMM_VLA_DECL(5, const element_input_type, input, (const element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); element_filter_type *weight_ptr = 
(element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset) + ltid * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; element_filter_type *filter_dst_ptr = (handle->weight_copies > 1) ? (element_filter_type*)weight_ptr : (element_filter_type*)handle->grad_filter->data; LIBXSMM_VLA_DECL(7, element_filter_type, weight_dst, (element_filter_type*)filter_dst_ptr, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2); /* This intermediate tensor is used when pixels are NOT fully accumulated */ float *weight_ptr_f32 = (float*) ((char*)handle->scratch + handle->upd_lp_filter_full_scratch_offset) + ltid * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; LIBXSMM_VLA_DECL(6, float, weight_private_f32, (float*)weight_ptr_f32, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); /* Accumulation scratch is used when pixels are ully accumulated */ element_filter_type *filter_scratch = (element_filter_type*)((char*)handle->scratch + handle->upd_lp_filter_full_scratch_offset) + ltid * handle->ofmblock * handle->ifmblock * 2; LIBXSMM_VLA_DECL(2, float, filter_tmp, (float*)filter_scratch, handle->ofmblock); element_input_type *scratch_tr_input = (element_input_type*)((char*)handle->scratch + handle->upd_lp_input_full_scratch_offset); element_input_type *zero_ptr_in; LIBXSMM_VLA_DECL(4, element_input_type, tr_input, (element_input_type*) scratch_tr_input, handle->blocksifm, handle->ifmblock, handle->input_pixels); LIBXSMM_VLA_DECL(5, element_input_type, tr_input_2, (element_input_type*) scratch_tr_input, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended); element_output_type *scratch_tr_output = (element_input_type*)((char*)handle->scratch + handle->upd_lp_output_full_scratch_offset); LIBXSMM_VLA_DECL(5, element_output_type, tr_output, (element_output_type*) scratch_tr_output, handle->blocksofm, handle->output_pixels/2, 
handle->ofmblock, 2); LIBXSMM_VLA_DECL(6, element_output_type, tr_output_2, (element_output_type*) scratch_tr_output, handle->blocksofm, handle->ofhp, handle->ofwp_extended/2, handle->ofmblock, 2); #if 0 element_output_type *out_ptr = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; element_output_type *zero_ptr_out; #endif /* transpose, copy and reduce work-related variables */ const int reduce_work = (handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S)/16; const int reduce_chunksize = (reduce_work % handle->desc.threads == 0) ? (reduce_work / handle->desc.threads) : (reduce_work / handle->desc.threads) + 1; const int reduce_thr_begin = (ltid * reduce_chunksize < reduce_work) ? (ltid * reduce_chunksize) : reduce_work; const int reduce_thr_end = ((ltid + 1) * reduce_chunksize < reduce_work) ? ((ltid + 1) * reduce_chunksize) : reduce_work; const float beta = (handle->use_intermediate_f32_wt_tensor ? 
1.f : 0.f); float *dst_ptr; gemm_br_function br_gemm_kernel = 0; /* These are used for the vnni reformatting of the f32 output */ __m512i c01 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); const __m512i perm_index = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); /* Related to the output transpose */ int n_full_pixel_pairs = handle->compute_pixels/2, half_pixel_pair = handle->compute_pixels%2, pixel_pair; element_output_type *tr_out, *src_out; const __m512i selector = LIBXSMM_INTRINSICS_MM512_SET_EPI16(32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0); const __m512i offsets_lo = LIBXSMM_INTRINSICS_MM512_SET_EPI16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); const __m512i offsets_hi = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 25, 25, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16); const __m512i idx_lo = _mm512_or_epi32(selector, offsets_lo); const __m512i idx_hi = _mm512_or_epi32(selector, offsets_hi); __m512i pixel_0, pixel_1, ofms_lo, ofms_hi; /* Batch reduce related variables */ const element_output_type *A_ptrs[1024]; const element_input_type *B_ptrs[1024]; unsigned long long n_blocks; libxsmm_blasint LDA = handle->ofmblock; libxsmm_blasint LDB = handle->input_pixels; libxsmm_blasint LDC = handle->ofmblock; int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); int l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); const int img_work = handle->desc.N; const int img_chunksize = (img_work % handle->desc.threads == 0) ? (img_work / handle->desc.threads) : (img_work / handle->desc.threads) + 1; my_img_start = (ltid * img_chunksize < img_work) ? (ltid * img_chunksize) : img_work; my_img_end = ((ltid + 1) * img_chunksize < img_work) ? 
((ltid + 1) * img_chunksize) : img_work; libxsmm_barrier_init(handle->barrier, ltid); if (handle->upd_linearized_pixels == 1) { /* First transpose input and output */ if (handle->use_hybrid_imgofm_parallelization == 1) { if (handle->upd_pack_input_upfront == 0) { for (img = my_img_start; img < my_img_end; img++) { #if 0 zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(4, tr_input, img, 0, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels); memset(zero_ptr_in, 0, handle->desc.C * handle->input_pixels * sizeof(element_input_type)); #endif for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { transpose_input_pixels_bf16( (element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, 0, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), (element_input_type*)&LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels), handle->ifmblock, handle->ifhp*handle->ifwp, handle->ifmblock, handle->input_pixels ); #if 0 for (ij = 0; ij < handle->ifhp; ij++) { for (ii = 0; ii < handle->ifwp; ii++) { for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, ifm2, ij * handle->ifwp + ii, handle->blocksifm, handle->ifmblock, handle->input_pixels) = LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); } } } #endif } } } else { for (img = my_img_start; img < my_img_end; img++) { #if 0 zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(4, tr_input, img, 0, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels); memset(zero_ptr_in, 0, handle->desc.C * handle->input_pixels * sizeof(element_input_type)); #endif for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { for (ij = 0; ij < handle->ifhp/handle->desc.u; ij++) { transpose_input_pixels_bf16( (element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij*handle->desc.u, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, 
handle->ifmblock), (element_input_type*)&LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, ij * (handle->ifwp/handle->desc.v), handle->blocksifm, handle->ifmblock, handle->input_pixels), handle->ifmblock, handle->ifwp/handle->desc.v, 2*handle->ifmblock, handle->input_pixels ); #if 0 for (ii = 0; ii < handle->ifwp/handle->desc.v; ii++) { for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, ifm2, ij * (handle->ifwp/handle->desc.v) + ii, handle->blocksifm, handle->ifmblock, handle->input_pixels) = LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij*handle->desc.u, ii*handle->desc.v, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); } } #endif } } } } for (img = my_img_start; img < my_img_end; img++) { for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { TRANS_OUTPUT_TO_VNNI_FORMAT(img, ofm1); } } } #if 0 for (img = my_img_start; img < my_img_end; img++) { zero_ptr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, tr_output, img, 0, 0, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2); memset(zero_ptr_out, 0, handle->desc.K * handle->output_pixels * sizeof(element_output_type)); for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { for (oi = 0; oi < handle->n_used_pixels; oi++) { for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, oi/2, ofm2, oi%2, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2) = *((element_output_type*)out_ptr + img * handle->blocksofm * handle->ofwp * handle->ofhp * handle->ofmblock + ofm1 * handle->ofwp * handle->ofhp * handle->ofmblock + oi * handle->ofmblock + ofm2); } } } } #endif } else { if (handle->upd_trans_w_only == 0) { if (handle->on_the_fly_input_packing == 0) { for (img = my_img_start; img < my_img_end; img++) { zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, 0, 0, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended); memset(zero_ptr_in, 0, handle->desc.C * 
handle->ifhp * handle->ifwp_extended * sizeof(element_input_type)); for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { for (ij = 0; ij < handle->ifhp; ij++) { for (ii = 0; ii < handle->ifwp; ii++) { for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, tr_input_2, img, ifm1, ifm2, ij, ii, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended) = LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); } } } } } } for (img = my_img_start; img < my_img_end; img++) { for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { #if 0 TRANS_OUTPUT_W_TO_VNNI_FORMAT(img, ofm1, 0, handle->ofh); #else for (oj = 0; oj < handle->ofh; oj++) { for (oi = 0; oi < handle->ofw; oi++) { for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { LIBXSMM_VLA_ACCESS(6, tr_output_2, img, ofm1, oj, oi/2, ofm2, oi%2, handle->blocksofm, handle->ofhp, handle->ofwp_extended/2, handle->ofmblock, 2) = LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, ofm2, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); } } } if (handle->ofw % 2 == 1) { for (oj = 0; oj < handle->ofh; oj++) { for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { LIBXSMM_VLA_ACCESS(6, tr_output_2, img, ofm1, oj, handle->ofw/2, ofm2, handle->ofw%2, handle->blocksofm, handle->ofhp, handle->ofwp_extended/2, handle->ofmblock, 2) = (element_output_type)0; } } } #endif } } } } /* Make sure we initialize intermediate weights to zero */ if (handle->use_intermediate_f32_wt_tensor == 1 && handle->use_hybrid_imgofm_parallelization == 0) { memset(weight_ptr_f32, 0, handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S * sizeof(float)); } if (handle->upd_linearized_pixels == 0) { if (handle->upd_trans_w_only == 1) { LDA = handle->ofmblock; LDB = handle->ifhp*handle->ifwp_extended; LDC = handle->ofmblock; prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); n_blocks = 
handle->batchreduce_h_pixels; br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->ofw, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); for (img = my_img_start; img < my_img_end; img++) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { for (oj = 0; oj < handle->ofh; oj += handle->batchreduce_h_pixels){ for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { /* Transpose output block */ TRANS_OUTPUT_W_TO_VNNI_FORMAT(img, ofm1, oj, handle->batchreduce_h_pixels); for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { /* Transpose input block */ for (j=0; j < handle->batchreduce_h_pixels; j++) { transpose_input_pixels_bf16( (element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj+j, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), (element_input_type*)&LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, j, 0, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended), handle->ifmblock, handle->ifwp_extended, handle->ifmblock, handle->ifhp*handle->ifwp_extended ); } for (kj = 0; kj < handle->desc.R; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { /* Determine if destination is the accumulation scratch or the intermediate fp32 weight tensor */ if (handle->use_intermediate_f32_wt_tensor == 1) { dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(6, weight_private_f32, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); } else { dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(2, filter_tmp, 0, 0, handle->ofmblock); } for (j_br = 0; j_br < handle->batchreduce_h_pixels; j_br++) { A_ptrs[j_br] = (element_output_type*) &LIBXSMM_VLA_ACCESS(6, tr_output_2, img, 0, j_br, 0, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp_extended/2, handle->ofmblock, 2); B_ptrs[j_br] 
= (element_input_type*) &LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, j_br, 0, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended); } br_gemm_kernel(A_ptrs, B_ptrs, dst_ptr, &n_blocks); /* Convert fully accumulated buffer to bf16 weight buffer in case of full accumulation has happened */ if ((oj + handle->batchreduce_h_pixels >= handle->ofh) && (img == my_img_end - 1)) { LIBXSMM_VLA_DECL(2, float, filter_acc_buffer, (float*)dst_ptr, handle->ofmblock); for (ij = 0; ij < handle->ifmblock; ij+=2) { for (ii = 0; ii < handle->ofmblock; ii+=16) { c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij+1, ii, handle->ofmblock)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij, ii, handle->ofmblock)) ); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(7, weight_dst, ofm1, ifm1, kj, ki, ij/2, ii, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2), _mm512_permutexvar_epi16(perm_index, c01)); } } } } } } } } } } } } else { int fast_trans = (handle->ofw == 112 && handle->desc.v == 2 && handle->ifmblock == 4 && handle->batchreduce_h_pixels == 1) ? 1 : 0; const __m512i skipper = LIBXSMM_INTRINSICS_MM512_SET_EPI16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0); __m512i p0, p1, p2, p3; __m256i _p0, _p1, _p2, _p3; __m256i r0 = _mm256_undefined_si256(); __m256i r1 = _mm256_undefined_si256(); __m256i r2 = _mm256_undefined_si256(); __m256i r3 = _mm256_undefined_si256(); LDA = handle->ofmblock; LDB = handle->ifhp*handle->ifwp_extended; LDC = handle->ofmblock; prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); n_blocks = handle->batchreduce_h_pixels; /* Handle case when ofw is odd number... 
*/ if (handle->ofw % 2 == 1) { br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->ofw+1, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); } else { br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->ofw, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); } for (img = my_img_start; img < my_img_end; img++) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { for (oj = 0; oj < handle->ofh; oj += handle->batchreduce_h_pixels){ for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { for (kj = 0; kj < handle->desc.R; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { /* Determine if destination is the accumulation scratch or the intermediate fp32 weight tensor */ if (handle->use_intermediate_f32_wt_tensor == 1) { dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(6, weight_private_f32, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); } else { dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(2, filter_tmp, 0, 0, handle->ofmblock); } /* Copy the input in such a way that we ignore "w-pixels" based on ki value */ if (handle->on_the_fly_input_packing == 1) { if (fast_trans == 1) { for (ii = 0; ii < handle->ofw*2; ii+=32) { p0 = _mm512_loadu_si512((element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj*handle->desc.u+kj, ii+ki, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock)); p0 = _mm512_permutexvar_epi16(skipper, p0); _p0 = LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(p0, 0); p1 = _mm512_loadu_si512((element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj*handle->desc.u+kj, ii+8+ki, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock)); p1 = 
_mm512_permutexvar_epi16(skipper, p1); _p1 = LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(p1, 0); p2 = _mm512_loadu_si512((element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj*handle->desc.u+kj, ii+16+ki, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock)); p2 = _mm512_permutexvar_epi16(skipper, p2); _p2 = LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(p2, 0); p3 = _mm512_loadu_si512((element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj*handle->desc.u+kj, ii+24+ki, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock)); p3 = _mm512_permutexvar_epi16(skipper, p3); _p3 = LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(p3, 0); r0 = _mm256_insert_epi64 (r0, _mm256_extract_epi64(_p0, 0), 0); r0 = _mm256_insert_epi64 (r0, _mm256_extract_epi64(_p1, 0), 1); r0 = _mm256_insert_epi64 (r0, _mm256_extract_epi64(_p2, 0), 2); r0 = _mm256_insert_epi64 (r0, _mm256_extract_epi64(_p3, 0), 3); _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, 0, ii/2, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended), r0); r1 = _mm256_insert_epi64 (r1, _mm256_extract_epi64(_p0, 1), 0); r1 = _mm256_insert_epi64 (r1, _mm256_extract_epi64(_p1, 1), 1); r1 = _mm256_insert_epi64 (r1, _mm256_extract_epi64(_p2, 1), 2); r1 = _mm256_insert_epi64 (r1, _mm256_extract_epi64(_p3, 1), 3); _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 1, 0, ii/2, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended), r1); r2 = _mm256_insert_epi64 (r2, _mm256_extract_epi64(_p0, 2), 0); r2 = _mm256_insert_epi64 (r2, _mm256_extract_epi64(_p1, 2), 1); r2 = _mm256_insert_epi64 (r2, _mm256_extract_epi64(_p2, 2), 2); r2 = _mm256_insert_epi64 (r2, _mm256_extract_epi64(_p3, 2), 3); _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 2, 0, ii/2, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended), r2); r3 = _mm256_insert_epi64 (r3, _mm256_extract_epi64(_p0, 3), 
0); r3 = _mm256_insert_epi64 (r3, _mm256_extract_epi64(_p1, 3), 1); r3 = _mm256_insert_epi64 (r3, _mm256_extract_epi64(_p2, 3), 2); r3 = _mm256_insert_epi64 (r3, _mm256_extract_epi64(_p3, 3), 3); _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 3, 0, ii/2, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended), r3); } } else { for (ij = 0; ij < handle->batchreduce_h_pixels; ij++) { for (ii = 0; ii < handle->ofw; ii++) { for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, ifm2, ij, ii, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended) = LIBXSMM_VLA_ACCESS(5, input, img, ifm1, (oj+ij)*handle->desc.u+kj, ii*handle->desc.v+ki, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); } } } } } for (j_br = 0; j_br < handle->batchreduce_h_pixels; j_br++) { A_ptrs[j_br] = (element_output_type*) &LIBXSMM_VLA_ACCESS(6, tr_output_2, img, ofm1, oj+j_br, 0, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp_extended/2, handle->ofmblock, 2); B_ptrs[j_br] = (element_input_type*) &LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, j_br, 0, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended); } br_gemm_kernel(A_ptrs, B_ptrs, dst_ptr, &n_blocks); /* Convert fully accumulated buffer to bf16 weight buffer in case of full accumulation has happened */ if ((oj + handle->batchreduce_h_pixels >= handle->ofh) && (img == my_img_end - 1)) { LIBXSMM_VLA_DECL(2, float, filter_acc_buffer, (float*)dst_ptr, handle->ofmblock); for (ij = 0; ij < handle->ifmblock; ij+=2) { for (ii = 0; ii < handle->ofmblock; ii+=16) { c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij+1, ii, handle->ofmblock)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij, ii, handle->ofmblock))); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(7, weight_dst, ofm1, ifm1, kj, ki, ij/2, ii, 0, 
handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2), _mm512_permutexvar_epi16(perm_index, c01)); } } } } } } } } } } } } } else { LDA = handle->ofmblock; LDB = handle->input_pixels; LDC = handle->ofmblock; prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); if (handle->use_hybrid_imgofm_parallelization == 1) { /* Here we are using batch-reduce kernel and hybrid minibatch/FM parallelization */ /* FIXME: Hardcoed logic for N=27 */ int group_size = (handle->desc.threads == 27 && handle->desc.N == 27 && handle->ofw == 14 && handle->desc.R == 1 && handle->desc.u == 1 && ltid >= 24) ? 3 : LIBXSMM_UPDIV(handle->desc.threads, handle->weight_copies); int tile_id = ltid / LIBXSMM_UPDIV(handle->desc.threads, handle->weight_copies); int tiles = handle->weight_copies; int img_per_tile = LIBXSMM_UPDIV(handle->desc.N, tiles); int my_in_tile_id = ltid % group_size; int ifms_per_thread = LIBXSMM_UPDIV(handle->blocksifm, group_size); int ofms_per_thread = LIBXSMM_UPDIV(handle->blocksofm, group_size); int my_R_start = 0; int my_R_end = handle->desc.R; element_filter_type *weight_ptr_group = (handle->weight_copies > 1) ? 
(element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset) + tile_id * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; LIBXSMM_VLA_DECL(7, element_filter_type, weight_private_group, (element_filter_type*)weight_ptr_group, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2); /* This intermediate tensor is used when pixels are NOT fully accumulated */ float *weight_tile_ptr_f32 = (float*)((char*)handle->scratch + handle->upd_lp_filter_full_scratch_offset) + tile_id * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; LIBXSMM_VLA_DECL(6, float, weight_private_tile_f32, (float*)weight_tile_ptr_f32, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); my_img_start = LIBXSMM_MIN(tile_id * img_per_tile, handle->desc.N); my_img_end = LIBXSMM_MIN((tile_id+1) * img_per_tile, handle->desc.N); my_ifm_start = LIBXSMM_MIN(my_in_tile_id * ifms_per_thread, handle->blocksifm ); my_ifm_end = LIBXSMM_MIN((my_in_tile_id+1) * ifms_per_thread, handle->blocksifm ); my_ofm_start = 0; my_ofm_end = handle->blocksofm; /* FIXME: Hardcoed logic for N=27 */ if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.C == 256 && handle->desc.K == 1024 && handle->ofh == 14 && handle->desc.u == 1) { my_ofm_start = LIBXSMM_MIN(my_in_tile_id * ofms_per_thread, handle->blocksofm); my_ofm_end = LIBXSMM_MIN((my_in_tile_id+1) * ofms_per_thread, handle->blocksofm); my_ifm_start = 0; my_ifm_end = handle->blocksifm; } if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.R == 3 && handle->desc.S == 3 && handle->ofh == 14) { int r_per_tile = LIBXSMM_UPDIV(handle->desc.R, group_size); my_ifm_start = 0; my_ifm_end = handle->blocksifm; my_ofm_start = 0; my_ofm_end = handle->blocksofm; my_R_start = LIBXSMM_MIN(my_in_tile_id * r_per_tile, handle->desc.R); my_R_end = LIBXSMM_MIN((my_in_tile_id+1) * 
r_per_tile, handle->desc.R); } block_ofm = my_ofm_end-my_ofm_start+1; block_ifm = my_ifm_end-my_ifm_start+1; img_block_size = my_img_end - my_img_start; br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); n_blocks = img_block_size; /* Make sure we initialize intermediate weights to zero */ if (handle->use_intermediate_f32_wt_tensor == 1) { for (ofm1 = my_ofm_start; ofm1 < my_ofm_end; ofm1++ ) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (kj = my_R_start; kj < my_R_end; ++kj) { memset((float*)&LIBXSMM_VLA_ACCESS(6, weight_private_tile_f32, ofm1, ifm1, kj, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), 0, handle->ofmblock * handle->ifmblock * handle->desc.S * sizeof(float)); } } } } libxsmm_barrier_wait(handle->barrier, ltid); for (img = my_img_start; img < my_img_end; img += img_block_size) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += block_ofm) { for (pix = 0; pix < handle->n_used_pixels; pix += handle->pixel_blocking){ for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += block_ifm) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+block_ofm, my_ofm_end); ofm1++ ) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+block_ifm, my_ifm_end); ifm1++) { for (kj = my_R_start; kj < my_R_end; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { /* Determine if destination is the accumulation scratch or the intermediate fp32 weight tensor */ if (handle->use_intermediate_f32_wt_tensor == 1) { dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(6, weight_private_tile_f32, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); } else { dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(2, filter_tmp, 0, 0, handle->ofmblock); } for (img_br = 0; img_br < img_block_size; img_br++) { A_ptrs[img_br] = &LIBXSMM_VLA_ACCESS(5, tr_output, img + img_br, ofm1, pix/2, 0, 0, 
handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2); B_ptrs[img_br] = &LIBXSMM_VLA_ACCESS(4, tr_input, img + img_br, ifm1, 0, pix + kj * handle->ifwp + ki, handle->blocksifm, handle->ifmblock, handle->input_pixels); } br_gemm_kernel(A_ptrs, B_ptrs, dst_ptr, &n_blocks); /* Convert fully caccumulated buffer to bf16 weight buffer in case of full accumulation has happened */ if ((pix + handle->pixel_blocking >= handle->n_used_pixels) && (img == my_img_end - img_block_size)) { LIBXSMM_VLA_DECL(2, float, filter_acc_buffer, (float*)dst_ptr, handle->ofmblock); for (ij = 0; ij < handle->ifmblock; ij+=2) { for (ii = 0; ii < handle->ofmblock; ii+=16) { c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij+1, ii, handle->ofmblock)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij, ii, handle->ofmblock)) ); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(7, weight_private_group, ofm1, ifm1, kj, ki, ij/2, ii, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2), _mm512_permutexvar_epi16(perm_index, c01)); } } } } } } } } } } } } else { gemm_function gemm_kernel = libxsmm_bsmmdispatch(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); for (img = my_img_start; img < my_img_end; img++) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { for (pix = 0; pix < handle->n_used_pixels; pix += handle->pixel_blocking){ for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { /* Transpose output block */ if (pix == 0 && ifmb == 0) { TRANS_OUTPUT_TO_VNNI_FORMAT(img, ofm1); } for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { /* Transpose input block */ if (pix == 0 && ofmb == 0 && ofm1 == 0) { if 
(handle->upd_pack_input_upfront == 0) { transpose_input_pixels_bf16( (element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, 0, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), (element_input_type*)&LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels), handle->ifmblock, handle->ifhp*handle->ifwp, handle->ifmblock, handle->input_pixels ); } else { for (ij = 0; ij < handle->ifhp/handle->desc.u; ij++) { transpose_input_pixels_bf16( (element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij*handle->desc.u, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), (element_input_type*)&LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, ij * (handle->ifwp/handle->desc.v), handle->blocksifm, handle->ifmblock, handle->input_pixels), handle->ifmblock, handle->ifwp/handle->desc.v, 2*handle->ifmblock, handle->input_pixels ); } } } for (kj = 0; kj < handle->desc.R; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { /* Determine if destination is the accumulation scratch or the intermediate fp32 weight tensor */ if (handle->use_intermediate_f32_wt_tensor == 1) { dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(6, weight_private_f32, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); } else { dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(2, filter_tmp, 0, 0, handle->ofmblock); } gemm_kernel( &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, pix/2, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2), &LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, pix + kj * handle->ifwp + ki, handle->blocksifm, handle->ifmblock, handle->input_pixels), dst_ptr); /* Convert fully accumulated buffer to bf16 weight buffer in case of full accumulation has happened */ if ((pix + handle->pixel_blocking >= handle->n_used_pixels) && (img == my_img_end - 1)) { LIBXSMM_VLA_DECL(2, float, filter_acc_buffer, (float*)dst_ptr, handle->ofmblock); for (ij = 0; 
ij < handle->ifmblock; ij+=2) { for (ii = 0; ii < handle->ofmblock; ii+=16) { c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij+1, ii, handle->ofmblock)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij, ii, handle->ofmblock)) ); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(7, weight_dst, ofm1, ifm1, kj, ki, ij/2, ii, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2), _mm512_permutexvar_epi16(perm_index, c01)); } } } } } } } } } } } } } libxsmm_barrier_wait(handle->barrier, ltid); if (handle->weight_copies > 1) { int active_copies = handle->weight_copies; const int filter_size = handle->desc.R * handle->desc.S * handle->desc.C * handle->desc.K; LIBXSMM_VLA_DECL(2, element_filter_type, weight_copies_buffer, (element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset), filter_size); element_filter_type *weight_global_ptr = (element_filter_type*) handle->grad_filter->data; /* In this case calculate how many weight copies have been indeed computed */ if (handle->desc.N != handle->desc.threads) { active_copies = 1; while (active_copies * img_chunksize < handle->desc.N) { active_copies++; } } for ( j = reduce_thr_begin; j < reduce_thr_end; j++) { __m512 weight_sum = _mm512_setzero_ps(); for ( i = 0; i < active_copies; i++ ) { weight_sum = _mm512_add_ps(weight_sum, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, weight_copies_buffer, i, j*16, filter_size)))); } _mm256_storeu_si256((__m256i*)(((libxsmm_bfloat16*) weight_global_ptr) + j*16), LIBXSMM_INTRINSICS_MM512_CVT_FP32_BF16(weight_sum)); } libxsmm_barrier_wait(handle->barrier, ltid); } #undef TRANS_OUTPUT_W_TO_VNNI_FORMAT #undef TRANS_OUTPUT_TO_VNNI_FORMAT 
libxsmm-1.17/src/template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c000066400000000000000000001226241415223013700313360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke (Intel Corp.) ******************************************************************************/ int img, my_img_start, my_img_end, ofmb, ifmb, ojb, ofm1, ifm1, ifm2 = 0, ofm2 = 0, oj, oi, ii, ij, kj, ki, ind, j_br, img_br, img_block_size = 1, my_ofm_start, my_ofm_end, my_ifm_start, my_ifm_end, block_ofm, block_ifm; /* computing first logical thread */ const int ltid = tid - start_thread; libxsmm_blasint LDA = handle->blocksofm * handle->ofmblock; libxsmm_blasint LDB = (handle->upd_pack_input == 1) ? handle->blocksifm * handle->ifmblock : handle->desc.v * handle->blocksifm * handle->ifmblock; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) libxsmm_blasint LDC = handle->ofmblock; #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) libxsmm_blasint LDC = handle->blocksofm * handle->ofmblock; #endif int l_flags = LIBXSMM_GEMM_FLAGS('N', 'T'); element_output_type *const out = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->blocksofm * handle->ofmblock; LIBXSMM_VLA_DECL(5, const element_output_type, output, (const element_output_type*)out, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); const int IFWP = (handle->upd_padding_copy == 1) ? 
handle->ifwp + 2*handle->desc.pad_w : handle->ifwp; const int IFHP = (handle->upd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : handle->ifhp; element_input_type *input_ptr_to_use = (handle->upd_padding_copy == 1) ? (element_input_type*) ((char*)handle->scratch + handle->upd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type*) input_ptr_to_use, IFHP, IFWP, handle->blocksifm, handle->ifmblock); #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) LIBXSMM_VLA_DECL(6, element_filter_type, weight_global, (element_filter_type*)handle->grad_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) LIBXSMM_VLA_DECL(6, element_filter_type, weight_global, (element_filter_type*)handle->grad_filter->data, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif element_filter_type *weight_ptr = (handle->weight_copies == 1) ? (element_filter_type*)handle->grad_filter->data : (element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset) + ltid * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) LIBXSMM_VLA_DECL(6, element_filter_type, weight_private, (element_filter_type*)weight_ptr, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) LIBXSMM_VLA_DECL(6, element_filter_type, weight_private, (element_filter_type*)weight_ptr, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif int prefetch_mode = (handle->desc.u == 2 || (handle->desc.R == 3 && handle->ofw == 7) ) ? 
libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE) : libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BL1); /* Batch reduce related variables */ const element_output_type *A_ptrs[1024]; const element_input_type *B_ptrs[1024]; unsigned long long n_blocks; libxsmm_barrier_init(handle->barrier, ltid); /* physical pad input */ if (handle->upd_padding_copy == 1) { LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); my_ifm_start = 0; my_ifm_end = handle->blocksifm; for (img = my_img_start; img < my_img_end; img++) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { /* copy the inner part */ for (ij = 0; ij < handle->ifhp+(2*handle->desc.pad_h); ij++) { for (ii = 0; ii < handle->ifwp+(2*handle->desc.pad_w); ii++) { if ( (ij >= handle->desc.pad_h) && (ii >= handle->desc.pad_w) && (ij < handle->ifhp+handle->desc.pad_h) && (ii < handle->ifwp+handle->desc.pad_w) ) { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, ifm2, IFHP, IFWP, handle->blocksifm, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ij-handle->desc.pad_h, ii-handle->desc.pad_w, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); } } else { LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, ifm2, IFHP, IFWP, handle->blocksifm, handle->ifmblock) = (element_input_type)0; } } } } } } libxsmm_barrier_wait(handle->barrier, ltid); } if (handle->upd_use_batchreduce == 0 && handle->upd_linearized_tasklist == 0) { /* Parallelize over minibatch */ const int img_work = handle->desc.N; const int img_chunksize = (img_work % handle->desc.threads == 0) ? 
(img_work / handle->desc.threads) : (img_work / handle->desc.threads) + 1; const float beta = ((img_chunksize == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 0.f : 1.f; gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb * handle->upd_ofh_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); my_img_start = (ltid * img_chunksize < img_work) ? (ltid * img_chunksize) : img_work; my_img_end = ((ltid + 1) * img_chunksize < img_work) ? ((ltid + 1) * img_chunksize) : img_work; if (!((img_chunksize == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw))) { memset(weight_ptr, 0, handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S * sizeof(element_filter_type)); } if (handle->upd_loop_order == 0) { for (img = my_img_start; img < my_img_end; img++) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { for (kj = 0; kj < handle->desc.R; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { ii = oi * handle->desc.u + ki; ij = oj * handle->desc.v + kj; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock), &LIBXSMM_VLA_ACCESS(6, weight_private, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, 
handle->desc.S, handle->ifmblock, handle->ofmblock) ); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock), &LIBXSMM_VLA_ACCESS(6, weight_private, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock) ); #endif } } } } } } } } } } } if (handle->upd_loop_order == 1) { for (img = my_img_start; img < my_img_end; img++) { for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { for (kj = 0; kj < handle->desc.R; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { ii = oi * handle->desc.u + ki; ij = oj * handle->desc.v + kj; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock), &LIBXSMM_VLA_ACCESS(6, weight_private, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) ); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &LIBXSMM_VLA_ACCESS(5, 
input, img, ij, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock), &LIBXSMM_VLA_ACCESS(6, weight_private, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock) ); #endif } } } } } } } } } } } } else { if (handle->upd_linearized_tasklist == 1) { /* Amount of work when using linearized view of tasks */ const int work = handle->desc.R * handle->desc.S * handle->blocksofm * handle->blocksifm; const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : (work / handle->desc.threads) + 1; const int work_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int work_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; int work_item; int Cb = handle->blocksifm; #if 0 int Kb = handle->blocksofm; #endif int R = handle->desc.R; int S = handle->desc.S; if (handle->upd_avoid_rim_fmas == 0) { const int IFH = (handle->upd_pack_input == 1) ? handle->ifhp/handle->desc.u : IFHP; const int IFW = (handle->upd_pack_input == 1) ? handle->ifwp/handle->desc.v : IFWP; element_input_type *input_ptr_base = (handle->upd_pack_input == 1) ? (element_input_type*)((char*)handle->scratch + handle->upd_packing_padding_scratch_offset) : (element_input_type*)input_ptr_to_use; LIBXSMM_VLA_DECL(5, element_input_type, input_use, (element_input_type*)input_ptr_base, IFH, IFW, handle->blocksifm, handle->ifmblock); const float beta = ((handle->desc.N == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 
0.f : 1.f; gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb * handle->upd_ofh_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); /* If requested, pack input to avoid strided accesses */ if (handle->upd_pack_input == 1) { LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); const int img_chunk = (handle->desc.N % handle->desc.threads == 0) ? handle->desc.N/handle->desc.threads : (handle->desc.N/handle->desc.threads) + 1; const int img_copy_start = LIBXSMM_MIN(ltid*img_chunk, handle->desc.N); const int img_copy_end = LIBXSMM_MIN((ltid+1)*img_chunk, handle->desc.N); for (img = img_copy_start; img < img_copy_end; img++) { for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { for (oj = 0; oj < handle->ofh; oj++) { for (oi = 0; oi < handle->ofw; oi++) { ij = oj * handle->desc.u; ii = oi * handle->desc.v; LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input_use, img, oj, oi, ifm1, ifm2, IFH, IFW, handle->blocksifm, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ij, ii, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); } } } } } libxsmm_barrier_wait(handle->barrier, ltid); } /* Initialize weights to zero */ if (!((handle->desc.N == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw))) { for (work_item = work_begin; work_item < work_end; work_item++) { ofm1 = work_item/(Cb*R*S); ifm1 = (work_item%(Cb*R*S))/(R*S); kj = ((work_item%(Cb*R*S))%(R*S))/S; ki = ((work_item%(Cb*R*S))%(R*S))%S; for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, 
handle->ifmblock, handle->ofmblock) = (element_filter_type)0; #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, ifm2, ofm1, ofm2, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock) = (element_filter_type)0; #endif } } } } for (img = 0; img < handle->desc.N; img++) { for (work_item = work_begin; work_item < work_end; work_item++) { ofm1 = work_item/(Cb*R*S); ifm1 = (work_item%(Cb*R*S))/(R*S); kj = ((work_item%(Cb*R*S))%(R*S))/S; ki = ((work_item%(Cb*R*S))%(R*S))%S; oi = 0; ii = ki; for (oj = 0; oj < handle->ofh; oj += handle->upd_ofh_rb) { ij = oj * handle->desc.u + kj; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &LIBXSMM_VLA_ACCESS(5, input_use, img, ij, ii, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) ); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &LIBXSMM_VLA_ACCESS(5, input_use, img, ij, ii, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), &LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock) ); #endif } } } } else { const float beta = ((handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 
0.f : 1.f; gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb-1, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); for (work_item = work_begin; work_item < work_end; work_item++) { ofm1 = work_item/(Cb*R*S); ifm1 = (work_item%(Cb*R*S))/(R*S); kj = ((work_item%(Cb*R*S))%(R*S))/S; ki = ((work_item%(Cb*R*S))%(R*S))%S; oi = 0; oj = 0; ii = oi * handle->desc.u + ki; ij = oj * handle->desc.v + kj; img = 0; img_block_size = handle->desc.N; if (kj == 0) { ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 1; j_br < handle->upd_ofh_rb; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); ind++; } } n_blocks = ind; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); #endif } else if (ki == 0) { ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi + 1, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); B_ptrs[ind] = 
&LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii + 1, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); ind++; } } n_blocks = ind; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); #endif } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); ind++; } } n_blocks = ind; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); #endif } else { if (kj == handle->desc.R-1) { ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 0; j_br < handle->upd_ofh_rb-1; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi, ofm1, 0, handle->ofhp, 
handle->ofwp, handle->blocksofm, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); ind++; } } n_blocks = ind; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); #endif } else { ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); ind++; } } n_blocks = ind; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); #endif } } } } } else { /* Here we are using batch-reduce kernel and hybrid minibatch/FM parallelization */ /* FIXME: Hardcoed logic for N=27 */ int group_size = (handle->desc.threads == 27 && handle->desc.N == 27 && handle->ofw == 14 && handle->desc.R == 1 && handle->desc.u == 
1 && ltid >= 24) ? 3 : LIBXSMM_UPDIV(handle->desc.threads, handle->weight_copies); int tile_id = ltid / LIBXSMM_UPDIV(handle->desc.threads, handle->weight_copies); int tiles = handle->weight_copies; int img_per_tile = LIBXSMM_UPDIV(handle->desc.N, tiles); int my_in_tile_id = ltid % group_size; int ifms_per_thread = LIBXSMM_UPDIV(handle->blocksifm, group_size); int ofms_per_thread = LIBXSMM_UPDIV(handle->blocksofm, group_size); int my_R_start = 0; int my_R_end = handle->desc.R; const float beta = ((handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 0.f : 1.f; gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); const float beta_flat = 0.0; gemm_br_function br_gemm_kernel_flat = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb, &LDA, &LDB, &LDC, NULL, &beta_flat, &l_flags, &prefetch_mode); element_filter_type *weight_ptr_group = (handle->weight_copies > 1) ? 
(element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset) + tile_id * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) LIBXSMM_VLA_DECL(6, element_filter_type, weight_private_group, (element_filter_type*)weight_ptr_group, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) LIBXSMM_VLA_DECL(6, element_filter_type, weight_private_group, (element_filter_type*)weight_ptr_group, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif my_img_start = LIBXSMM_MIN(tile_id * img_per_tile, handle->desc.N); my_img_end = LIBXSMM_MIN((tile_id+1) * img_per_tile, handle->desc.N); my_ifm_start = LIBXSMM_MIN(my_in_tile_id * ifms_per_thread, handle->blocksifm ); my_ifm_end = LIBXSMM_MIN((my_in_tile_id+1) * ifms_per_thread, handle->blocksifm ); my_ofm_start = 0; my_ofm_end = handle->blocksofm; /* FIXME: Hardcoed logic for N=27 */ if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.C == 256 && handle->desc.K == 1024 && handle->ofh == 14 && handle->desc.u == 1) { my_ofm_start = LIBXSMM_MIN(my_in_tile_id * ofms_per_thread, handle->blocksofm); my_ofm_end = LIBXSMM_MIN((my_in_tile_id+1) * ofms_per_thread, handle->blocksofm); my_ifm_start = 0; my_ifm_end = handle->blocksifm; } if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.R == 3 && handle->desc.S == 3 && handle->ofh == 14) { int r_per_tile = LIBXSMM_UPDIV(handle->desc.R, group_size); my_ifm_start = 0; my_ifm_end = handle->blocksifm; my_ofm_start = 0; my_ofm_end = handle->blocksofm; my_R_start = LIBXSMM_MIN(my_in_tile_id * r_per_tile, handle->desc.R); my_R_end = LIBXSMM_MIN((my_in_tile_id+1) * r_per_tile, handle->desc.R); } block_ofm = my_ofm_end-my_ofm_start+1; block_ifm = my_ifm_end-my_ifm_start+1; 
img_block_size = my_img_end - my_img_start; if (handle->desc.N != handle->desc.threads) { /* Use "flat" parallelism + reduction */ const int work = handle->desc.R * handle->desc.S * handle->blocksofm * handle->blocksifm * handle->desc.N; const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : (work / handle->desc.threads) + 1; const int work_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int work_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; int work_item; int Cb = handle->blocksifm; int Kb = handle->blocksofm; int R = handle->desc.R; int S = handle->desc.S; const int IFH = (handle->upd_pack_input == 1) ? handle->ifhp/handle->desc.u : IFHP; const int IFW = (handle->upd_pack_input == 1) ? handle->ifwp/handle->desc.v : IFWP; element_input_type *input_ptr_base = (handle->upd_pack_input == 1) ? (element_input_type*)((char*)handle->scratch + handle->upd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; LIBXSMM_VLA_DECL(5, element_input_type, input_use, (element_input_type*)input_ptr_base, IFH, IFW, handle->blocksifm, handle->ifmblock); /* If requested, pack input to avoid strided accesses */ if (handle->upd_pack_input == 1) { LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); const int img_chunk = (handle->desc.N % handle->desc.threads == 0) ? 
handle->desc.N/handle->desc.threads : (handle->desc.N/handle->desc.threads) + 1; const int img_copy_start = LIBXSMM_MIN(ltid*img_chunk, handle->desc.N); const int img_copy_end = LIBXSMM_MIN((ltid+1)*img_chunk, handle->desc.N); for (img = img_copy_start; img < img_copy_end; img++) { for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { for (oj = 0; oj < handle->ofh; oj++) { for (oi = 0; oi < handle->ofw; oi++) { ij = oj * handle->desc.u; ii = oi * handle->desc.v; LIBXSMM_PRAGMA_SIMD for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_VLA_ACCESS(5, input_use, img, oj, oi, ifm1, ifm2, IFH, IFW, handle->blocksifm, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ij, ii, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); } } } } } libxsmm_barrier_wait(handle->barrier, ltid); } /* Initialize weights to zero */ if (handle->upd_ofw_rb != handle->ofw) { for (work_item = work_begin; work_item < work_end; work_item++) { img = work_item/(Cb*Kb*R*S); ofm1 = (work_item%(Cb*Kb*R*S))/(Cb*R*S); ifm1 = ((work_item%(Cb*Kb*R*S))%(Cb*R*S))/(R*S); kj = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))/S; ki = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))%S; { element_filter_type *weight_ptr_current = (handle->weight_copies > 1) ? 
(element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset)+ img * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) LIBXSMM_VLA_DECL(6, element_filter_type, weight_current, (element_filter_type*)weight_ptr_current, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) LIBXSMM_VLA_DECL(6, element_filter_type, weight_current, (element_filter_type*)weight_ptr_current, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { LIBXSMM_PRAGMA_SIMD for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) LIBXSMM_VLA_ACCESS(6, weight_current, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) = (element_filter_type)0; #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) LIBXSMM_VLA_ACCESS(6, weight_current, kj, ki, ifm1, ifm2, ofm1, ofm2, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock) = (element_filter_type)0; #endif } } } } } for (work_item = work_begin; work_item < work_end; work_item++) { img = work_item/(Cb*Kb*R*S); ofm1 = (work_item%(Cb*Kb*R*S))/(Cb*R*S); ifm1 = ((work_item%(Cb*Kb*R*S))%(Cb*R*S))/(R*S); kj = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))/S; ki = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))%S; ii = 0 + ki; ij = 0 + kj; oj = 0; oi = 0; { element_filter_type *weight_ptr_current = (handle->weight_copies > 1) ? 
(element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset) + img * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) LIBXSMM_VLA_DECL(6, element_filter_type, weight_current, (element_filter_type*)weight_ptr_current, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) LIBXSMM_VLA_DECL(6, element_filter_type, weight_current, (element_filter_type*)weight_ptr_current, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); #endif ind = 0; for (j_br = 0; j_br < handle->ofh; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img , oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input_use, img, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); ind++; } n_blocks = ind; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) br_gemm_kernel_flat(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_current, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) br_gemm_kernel_flat(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_current, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); #endif } } } else { /* May need to initialized private weights to zero */ if (!((handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw))) { for (ofm1 = my_ofm_start; ofm1 < my_ofm_end; ofm1++ ) { for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { for (kj = my_R_start; kj < my_R_end; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { for (ofm2 = 0; ofm2 < 
handle->ofmblock; ofm2++ ) { for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) LIBXSMM_VLA_ACCESS(6, weight_private_group, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) = (element_filter_type)0; #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) LIBXSMM_VLA_ACCESS(6, weight_private_group, kj, ki, ifm1, ifm2, ofm1, ofm2, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock) = (element_filter_type)0; #endif } } } } } } } if (handle->upd_loop_order == 0) { for (img = my_img_start; img < my_img_end; img += img_block_size) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += block_ofm) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += block_ifm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+block_ofm, my_ofm_end); ofm1++ ) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+block_ifm, my_ifm_end); ifm1++) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { for (kj = my_R_start; kj < my_R_end; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { ii = oi * handle->desc.u + ki; ij = oj * handle->desc.v + kj; ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); ind++; } } n_blocks = ind; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_private_group, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, 
handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_private_group, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); #endif } } } } } } } } } } } else { for (img = my_img_start; img < my_img_end; img += img_block_size) { for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += block_ifm) { for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += block_ofm) { for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+block_ifm, my_ifm_end); ifm1++) { for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+block_ofm, my_ofm_end); ofm1++ ) { for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { for (kj = my_R_start; kj < my_R_end; ++kj) { for (ki = 0; ki < handle->desc.S; ++ki) { ii = oi * handle->desc.u + ki; ij = oj * handle->desc.v + kj; ind = 0; for (img_br = 0; img_br < img_block_size; img_br++) { for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); ind++; } } n_blocks = ind; #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_private_group, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); #endif #if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_private_group, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, 
handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); #endif } } } } } } } } } } } } } } if (handle->weight_copies > 1) { /* reduce work-related variables */ const int fm_blocking = (handle->ofmblock % 16 == 0) ? 16 : handle->ofmblock; const int reduce_work = handle->blocksofm * handle->blocksifm * handle->desc.R * handle->desc.S * (handle->ofmblock/fm_blocking) * handle->ifmblock; const int reduce_chunksize = (reduce_work % handle->desc.threads == 0) ? (reduce_work / handle->desc.threads) : (reduce_work / handle->desc.threads) + 1; const int reduce_thr_begin = (ltid * reduce_chunksize < reduce_work) ? (ltid * reduce_chunksize) : reduce_work; const int reduce_thr_end = ((ltid + 1) * reduce_chunksize < reduce_work) ? ((ltid + 1) * reduce_chunksize) : reduce_work; /* Perform reduction here */ libxsmm_barrier_wait(handle->barrier, ltid); for ( ij = reduce_thr_begin; ij < reduce_thr_end; ij++ ) { element_filter_type *weight_ptr_glb = (element_filter_type*) handle->grad_filter->data; #if 1 float weight_sum[64]; int wtcnt = 0; assert( handle->ofmblock <= 64 ); LIBXSMM_PRAGMA_SIMD for ( wtcnt = 0; wtcnt < fm_blocking; ++wtcnt ) { weight_sum[wtcnt] = 0.0f; } for ( ii = 0; ii < handle->weight_copies; ii++ ) { element_filter_type *weight_ptr_src = (element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset)+ ii * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S + ij * fm_blocking; LIBXSMM_PRAGMA_SIMD for ( wtcnt = 0; wtcnt < fm_blocking; ++wtcnt ) { weight_sum[wtcnt] += weight_ptr_src[wtcnt]; } } LIBXSMM_PRAGMA_SIMD for ( wtcnt = 0; wtcnt < fm_blocking; ++wtcnt ) { weight_ptr_glb[(ij*fm_blocking) + wtcnt] = weight_sum[wtcnt]; } #else __m512 weight_sum = _mm512_setzero_ps(); for ( ii = 0; ii < handle->weight_copies; ii++ ) { element_filter_type *weight_ptr_src = (element_filter_type*)handle->scratch7 + ii * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S + ij * 16; weight_sum = _mm512_add_ps(weight_sum, 
LIBXSMM_INTRINSICS_MM512_LOAD_PS(weight_ptr_src)); } _mm512_storeu_ps(&weight_ptr_glb[ij*16], weight_sum); #endif } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c000066400000000000000000000300651415223013700312340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { /* size variables, all const */ /* here we assume that input and output blocking is similar */ const int nBlocksIFm = handle->blocksifm; const int nIFmBlock = handle->ifmblock; const int nBlocksOFm = handle->blocksofm; const int nOFmBlock = handle->ofmblock; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nBlocksIFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks for transpose that could be run in parallel */ const int transpose_work = nBlocksIFm * nBlocksOFm; /* compute chunk size */ const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? 
(transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; /* loop variables */ int ofm1 = 0; int ofm2 = 0; int ifm1 = 0; int ifm2 = 0; int ifm1ofm1 = 0; LIBXSMM_VLA_DECL(3, const element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksOFm, nOFmBlock); LIBXSMM_VLA_DECL(4, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, nIFmBlock, nOFmBlock); #if defined(LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32) float* dinput_f32_ptr = (float*)handle->scratch; float* filter_f32_ptr = ((float*)handle->scratch)+((size_t)handle->desc.N*(size_t)handle->desc.C); LIBXSMM_VLA_DECL(3, float, dinput, dinput_f32_ptr, nBlocksIFm, nIFmBlock); LIBXSMM_VLA_DECL(4, float, filter_tr, filter_f32_ptr, nBlocksOFm, nOFmBlock, nIFmBlock); /* number of tasks that could be run in parallel */ const int work_input = handle->desc.N * handle->desc.C; /* compute chunk size */ const int chunksize_input = (work_input % handle->desc.threads == 0) ? (work_input / handle->desc.threads) : ((work_input / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin_input = (ltid * chunksize_input < work_input) ? (ltid * chunksize_input) : work_input; const int thr_end_input = ((ltid + 1) * chunksize_input < work_input) ? 
((ltid + 1) * chunksize_input) : work_input; #else LIBXSMM_VLA_DECL(3, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksIFm, nIFmBlock); LIBXSMM_VLA_DECL(4, element_filter_type, filter_tr, (element_filter_type*)handle->scratch, nBlocksOFm, nOFmBlock, nIFmBlock); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { ofm1 = ifm1ofm1 / nBlocksIFm; ifm1 = ifm1ofm1 % nBlocksIFm; for (ofm2 = 0; ofm2 < nOFmBlock; ++ofm2) { for (ifm2 = 0; ifm2 < nIFmBlock; ++ifm2) { #if defined(LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32) union libxsmm_bfloat16_hp filter_f32; filter_f32.i[0] = 0; filter_f32.i[1] = LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1, ifm2, ofm2, nBlocksIFm, nIFmBlock, nOFmBlock); LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1, ofm2, ifm2, nBlocksOFm, nOFmBlock, nIFmBlock) = filter_f32.f; #else LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1, ofm2, ifm2, nBlocksOFm, nOFmBlock, nIFmBlock) = LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1, ifm2, ofm2, nBlocksIFm, nIFmBlock, nOFmBlock); #endif } } } /* wait for transpose to finish */ libxsmm_barrier_wait(handle->barrier, ltid); for ( ifm1 = thr_begin; ifm1 < thr_end; ++ifm1 ) { /* outer GEMM m-loop */ #if 1 gemm_kernel_bwd( &LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, 0, 0, 0, nBlocksOFm, nOFmBlock, nIFmBlock), &LIBXSMM_VLA_ACCESS(3, doutput, 0, 0, 0, nBlocksOFm, nOFmBlock), &LIBXSMM_VLA_ACCESS(3, dinput, 0, ifm1, 0, nBlocksIFm, nIFmBlock) ); #else const int nImg = handle->desc.N; int img2; /* this is a simple replacement code using regular loops */ for ( img2 = 0; img2 < nImg; ++img2 ) { LIBXSMM_PRAGMA_SIMD for ( ifm2 = 0; ifm2 < nIFmBlock; ++ifm2 ) { LIBXSMM_VLA_ACCESS(3, dinput, img2, ifm1, ifm2, nBlocksIFm, nIFmBlock) = (element_output_type)0; } } for ( ofm1 = 0; ofm1 < nBlocksOFm; ++ofm1 ) { /* outer GEMM k-loop */ for ( ofm2 = 0; ofm2 < nOFmBlock; ++ofm2 ) { /* GEMM K-loop */ for ( img2 = 0; img2 < nImg; 
++img2 ) { /* GEMM n-loop */ LIBXSMM_PRAGMA_SIMD for ( ifm2 = 0; ifm2 < nIFmBlock; ++ifm2 ) { /* GEMM m-loop */ LIBXSMM_VLA_ACCESS(3, dinput, img2, ifm1, ifm2, nBlocksIFm, nIFmBlock) += LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1, ofm2, ifm2, nBlocksOFm, nOFmBlock, nIFmBlock) * LIBXSMM_VLA_ACCESS(3, doutput, img2, ofm1, ofm2, nBlocksOFm, nOFmBlock); } } } } #endif } #if defined(LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32) libxsmm_barrier_wait(handle->barrier, ltid); libxsmm_rne_convert_fp32_bf16( dinput_f32_ptr+thr_begin_input, ((element_input_type*)handle->grad_input->data)+thr_begin_input, thr_end_input-thr_begin_input ); #endif libxsmm_barrier_wait(handle->barrier, ltid); } if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { /* size variables, all const */ const int nImg = handle->desc.N; /* here we assume that input and output blocking is similar */ const int nBlocksIFm = handle->blocksifm; const int nIFmBlock = handle->ifmblock; const int nBlocksOFm = handle->blocksofm; const int nOFmBlock = handle->ofmblock; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nBlocksIFm * nBlocksOFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks for transpose that could be run in parallel */ const int transpose_work = nBlocksIFm; /* compute chunk size */ const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? 
(transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; /* loop variables */ int img2 = 0; int ifm1ofm1 = 0; int ofm1 = 0; int ifm1 = 0; int ifm2 = 0; LIBXSMM_VLA_DECL(3, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, nIFmBlock); LIBXSMM_VLA_DECL(3, const element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksOFm, nOFmBlock); #if defined(LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32) float* input_f32_ptr = (float*)handle->scratch; float* dfilter_f32_ptr = ((float*)handle->scratch)+((size_t)handle->desc.N*(size_t)handle->desc.C); LIBXSMM_VLA_DECL(3, float, input_tr, input_f32_ptr, nIFmBlock, nImg); LIBXSMM_VLA_DECL(4, float, dfilter, dfilter_f32_ptr, nBlocksIFm, nIFmBlock, nOFmBlock); /* number of tasks that could be run in parallel */ const int work_filter = handle->desc.C * handle->desc.K; /* compute chunk size */ const int chunksize_filter = (work_filter % handle->desc.threads == 0) ? (work_filter / handle->desc.threads) : ((work_filter / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin_filter = (ltid * chunksize_filter < work_filter) ? (ltid * chunksize_filter) : work_filter; const int thr_end_filter = ((ltid + 1) * chunksize_filter < work_filter) ? 
((ltid + 1) * chunksize_filter) : work_filter; #else LIBXSMM_VLA_DECL(4, element_filter_type, dfilter, (element_filter_type*)handle->grad_filter->data, nBlocksIFm, nIFmBlock, nOFmBlock); LIBXSMM_VLA_DECL(3, element_input_type, input_tr, (element_input_type* )handle->scratch, nIFmBlock, nImg); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for (ifm1 = transpose_thr_begin; ifm1 < transpose_thr_end; ++ifm1) { for (ifm2 = 0; ifm2 < nIFmBlock; ++ifm2) { for (img2 = 0; img2 < nImg; ++img2) { #if defined(LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32) union libxsmm_bfloat16_hp input_f32; input_f32.i[0] = 0; input_f32.i[1] = LIBXSMM_VLA_ACCESS(3, input, img2, ifm1, ifm2, nBlocksIFm, nIFmBlock); LIBXSMM_VLA_ACCESS(3, input_tr, ifm1, ifm2, img2, nIFmBlock, nImg) = input_f32.f; #else LIBXSMM_VLA_ACCESS(3, input_tr, ifm1, ifm2, img2, nIFmBlock, nImg) = LIBXSMM_VLA_ACCESS(3, input, img2, ifm1, ifm2, nBlocksIFm, nIFmBlock); #endif } } } /* wait for transpose to finish */ libxsmm_barrier_wait(handle->barrier, ltid); for ( ifm1ofm1 = thr_begin; ifm1ofm1 < thr_end; ++ifm1ofm1 ) { /* outer GEMM m/n-loop */ ofm1 = ifm1ofm1 / nBlocksIFm; ifm1 = ifm1ofm1 % nBlocksIFm; #if 1 gemm_kernel_upd( &LIBXSMM_VLA_ACCESS(3, doutput, 0, ofm1, 0, nBlocksOFm, nOFmBlock), &LIBXSMM_VLA_ACCESS(3, input_tr, ifm1, 0, 0, nIFmBlock, nImg), &LIBXSMM_VLA_ACCESS(4, dfilter, ofm1, ifm1, 0, 0, nBlocksIFm, nIFmBlock, nOFmBlock) ); #else { const int nImg = handle->desc.N; int ifm2, ofm2; /* this is a simple replacement code using regular loops */ for ( ifm2 = 0; ifm2 < nIFmBlock; ++ifm2 ) { LIBXSMM_PRAGMA_SIMD for ( ofm2 = 0; ofm2 < nOFmBlock; ++ofm2 ) { LIBXSMM_VLA_ACCESS(4, dfilter, ofm1, ifm1, ifm2, ofm2, nBlocksIFm, nIFmBlock, nOFmBlock) = (element_output_type)0; } } for ( img2 = 0; img2 < nImg; ++img2 ) { /* GEMM k-loop */ for ( ifm2 = 0; ifm2 < nIFmBlock; ++ifm2 ) { /* GEMM n-loop */ LIBXSMM_PRAGMA_SIMD for ( ofm2 = 0; ofm2 < nOFmBlock; ++ofm2 ) { /* GEMM m-loop */ 
LIBXSMM_VLA_ACCESS(4, dfilter, ofm1, ifm1, ifm2, ofm2, nBlocksIFm, nIFmBlock, nOFmBlock) += LIBXSMM_VLA_ACCESS(3, doutput, img2, ofm1, ofm2, nBlocksOFm, nOFmBlock) * LIBXSMM_VLA_ACCESS(3, input_tr, ifm1, ifm2, img2, nIFmBlock, nImg); } } } } #endif } #if defined(LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32) libxsmm_barrier_wait(handle->barrier, ltid); libxsmm_rne_convert_fp32_bf16( dfilter_f32_ptr+thr_begin_filter, ((element_input_type*)handle->grad_filter->data)+thr_begin_filter, thr_end_filter-thr_begin_filter ); #endif libxsmm_barrier_wait(handle->barrier, ltid); } libxsmm-1.17/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c000066400000000000000000000410241415223013700316330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ /* here we assume that input and output blocking is similar */ const int bn = handle->bn; const int bk = handle->bk; const int bc = handle->bc; const int nBlocksIFm = handle->desc.C / bc; const int nBlocksOFm = handle->desc.K / bk; const int nBlocksMB = handle->desc.N / bn; /* computing first logical thread */ const int ltid = tid - start_thread; /* Transpose kernel to transpose filters */ libxsmm_xtransfunction tr_kernel = handle->tr_kernel; #if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) /* number of tasks for transpose that could be run in parallel */ const int eltwise_work = nBlocksOFm * nBlocksMB; /* compute chunk size */ const int eltwise_chunksize = (eltwise_work % handle->desc.threads == 0) ? (eltwise_work / handle->desc.threads) : ((eltwise_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int eltwise_thr_begin = (ltid * eltwise_chunksize < eltwise_work) ? (ltid * eltwise_chunksize) : eltwise_work; const int eltwise_thr_end = ((ltid + 1) * eltwise_chunksize < eltwise_work) ? ((ltid + 1) * eltwise_chunksize) : eltwise_work; int mb1ofm1; #endif #ifdef LIBXSMM_DNN_FC_BWD_FUSE_BIAS /* number of tasks for transpose that could be run in parallel */ const int dbias_work = nBlocksOFm; /* compute chunk size */ const int dbias_chunksize = (dbias_work % handle->desc.threads == 0) ? (dbias_work / handle->desc.threads) : ((dbias_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int dbias_thr_begin = (ltid * dbias_chunksize < dbias_work) ? (ltid * dbias_chunksize) : dbias_work; const int dbias_thr_end = ((ltid + 1) * dbias_chunksize < dbias_work) ? 
((ltid + 1) * dbias_chunksize) : dbias_work; #endif /* loop variables */ int ofm1 = 0, mb1 = 0, ofm2 = 0, mb2 = 0; #if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) element_output_type *grad_output_ptr = ((element_output_type*)handle->scratch)+(handle->desc.C*handle->desc.K); LIBXSMM_VLA_DECL(4, const element_output_type, doutput_orig, (element_output_type*)handle->grad_output->data, nBlocksOFm, bn, bk); #else element_output_type *grad_output_ptr = (element_output_type*)handle->grad_output->data; #endif LIBXSMM_VLA_DECL(4, element_output_type, doutput, grad_output_ptr, nBlocksOFm, bn, bk); #ifdef LIBXSMM_DNN_FC_BWD_FUSE_BIAS LIBXSMM_VLA_DECL(2, float, dbias, (float*) handle->grad_bias->data, handle->bk); #endif #ifdef LIBXSMM_DNN_FC_BWD_FUSE_RELU LIBXSMM_VLA_DECL(4, unsigned char, relumask, (unsigned char*) handle->relumask->data, nBlocksOFm, handle->bn, handle->bk); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); #if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) for ( mb1ofm1 = eltwise_thr_begin; mb1ofm1 < eltwise_thr_end; ++mb1ofm1 ) { mb1 = mb1ofm1%nBlocksMB; ofm1 = mb1ofm1/nBlocksMB; for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { float l_cur_out = LIBXSMM_VLA_ACCESS(4, doutput_orig, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); #ifdef LIBXSMM_DNN_FC_BWD_FUSE_RELU l_cur_out = (LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) != 0) ? 
l_cur_out : (element_output_type)0; #endif #ifdef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID l_cur_out = l_cur_out*(1.0f - l_cur_out); #endif LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; } } } /* wait for eltwise to finish */ libxsmm_barrier_wait(handle->barrier, ltid); #endif #if defined(LIBXSMM_DNN_FC_BWD_FUSE_BIAS) for ( ofm1 = dbias_thr_begin; ofm1 < dbias_thr_end; ++ofm1 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { LIBXSMM_VLA_ACCESS( 2, dbias, ofm1, ofm2, handle->bk ) = 0.0f; } for ( mb1 = 0; mb1 < nBlocksMB; ++mb1 ) { for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { LIBXSMM_VLA_ACCESS( 2, dbias, ofm1, ofm2, handle->bk ) += LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); } } } } /* wait for eltwise to finish */ libxsmm_barrier_wait(handle->barrier, ltid); #endif if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { const int use_2d_blocking = handle->bwd_2d_blocking; /* number of tasks that could be run in parallel */ const int work = nBlocksIFm * nBlocksMB; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks for transpose that could be run in parallel */ const int transpose_work = nBlocksIFm * nBlocksOFm; /* compute chunk size */ const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? 
(ltid * transpose_chunksize) : transpose_work; const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; /* loop variables */ int ifm1 = 0, ifm2 = 0, ifm1ofm1 = 0, mb1ifm1 = 0; int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; LIBXSMM_VLA_DECL(4, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, bc, bk); LIBXSMM_VLA_DECL(4, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksIFm, bn, bc); LIBXSMM_VLA_DECL(4, element_filter_type, filter_tr, (element_filter_type*)handle->scratch, nBlocksOFm, bk, bc); unsigned long long blocks = nBlocksOFm; int KB_BLOCKS = nBlocksOFm, BF = 1; BF = handle->bwd_bf; KB_BLOCKS = nBlocksOFm/BF; blocks = KB_BLOCKS; if (use_2d_blocking == 1) { row_teams = handle->bwd_row_teams; column_teams = handle->bwd_column_teams; my_col_id = ltid % column_teams; my_row_id = ltid / column_teams; im_tasks_per_thread = LIBXSMM_UPDIV(nBlocksMB, row_teams); in_tasks_per_thread = LIBXSMM_UPDIV(nBlocksIFm, column_teams); my_im_start = LIBXSMM_MIN(my_row_id * im_tasks_per_thread, nBlocksMB); my_im_end = LIBXSMM_MIN((my_row_id+1) * im_tasks_per_thread, nBlocksMB); my_in_start = LIBXSMM_MIN(my_col_id * in_tasks_per_thread, nBlocksIFm); my_in_end = LIBXSMM_MIN((my_col_id+1) * in_tasks_per_thread, nBlocksIFm); } /* transpose weight */ for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { const unsigned int ubk = (unsigned int)bk; const unsigned int ubc = (unsigned int)bc; ofm1 = ifm1ofm1 / nBlocksIFm; ifm1 = ifm1ofm1 % nBlocksIFm; tr_kernel(&LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1, 0, 0, nBlocksIFm, bc, bk), &ubk, &LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1, 0, 0, nBlocksOFm, bk, bc), &ubc); #if 0 for (ofm2 = 0; ofm2 < bk; ++ofm2) { for (ifm2 = 0; ifm2 < 
bc; ++ifm2) { LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1, ofm2, ifm2, nBlocksOFm, bk, bc) = LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1, ifm2, ofm2, nBlocksIFm, bc, bk); } } #endif } /* wait for transpose to finish */ libxsmm_barrier_wait(handle->barrier, ltid); if (use_2d_blocking == 1) { if (BF > 1) { for ( ofm1 = 0; ofm1 < BF; ++ofm1 ) { for (ifm1 = my_in_start; ifm1 < my_in_end; ++ifm1) { for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { /* Initialize intermediate f32 tensor */ if ( ofm1 == 0 ) { for ( mb2 = 0; mb2 < bn; ++mb2 ) { for ( ifm2 = 0; ifm2 < bc; ++ifm2 ) { LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, mb2, ifm2, nBlocksIFm, bn, bc) = (element_input_type)0; } } } batchreduce_kernel_bwd( &LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bk, bc ), &LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); } } } } else { for (ifm1 = my_in_start; ifm1 < my_in_end; ++ifm1) { for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { batchreduce_kernel_bwd_zerobeta( &LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, 0, 0, 0, nBlocksOFm, bk, bc), &LIBXSMM_VLA_ACCESS(4, doutput, mb1, 0, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); } } } } else { if (BF > 1) { for ( ofm1 = 0; ofm1 < BF; ++ofm1 ) { for ( mb1ifm1 = thr_begin; mb1ifm1 < thr_end; ++mb1ifm1 ) { mb1 = mb1ifm1%nBlocksMB; ifm1 = mb1ifm1/nBlocksMB; /* Initialize intermediate f32 tensor */ if ( ofm1 == 0 ) { for ( mb2 = 0; mb2 < bn; ++mb2 ) { for ( ifm2 = 0; ifm2 < bc; ++ifm2 ) { LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, mb2, ifm2, nBlocksIFm, bn, bc) = (element_input_type)0; } } } batchreduce_kernel_bwd( &LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bk, bc ), &LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); } } 
} else { for ( mb1ifm1 = thr_begin; mb1ifm1 < thr_end; ++mb1ifm1 ) { mb1 = mb1ifm1%nBlocksMB; ifm1 = mb1ifm1/nBlocksMB; batchreduce_kernel_bwd_zerobeta( &LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, 0, 0, 0, nBlocksOFm, bk, bc ), &LIBXSMM_VLA_ACCESS(4, doutput, mb1, 0, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); } } } libxsmm_barrier_wait(handle->barrier, ltid); } if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { /* number of tasks that could be run in parallel */ const int ofm_subtasks = (handle->upd_2d_blocking == 1) ? 1 : handle->ofm_subtasks; const int ifm_subtasks = (handle->upd_2d_blocking == 1) ? 1 : handle->ifm_subtasks; const int bbk = (handle->upd_2d_blocking == 1) ? bk : bk/ofm_subtasks; const int bbc = (handle->upd_2d_blocking == 1) ? bc : bc/ifm_subtasks; const int work = nBlocksIFm * ifm_subtasks * nBlocksOFm * ofm_subtasks; const int Cck_work = nBlocksIFm * ifm_subtasks * ofm_subtasks; const int Cc_work = nBlocksIFm * ifm_subtasks; /* 2D blocking parameters */ int use_2d_blocking = handle->upd_2d_blocking; int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; int BF = handle->upd_bf; /* loop variables */ int ifm1ofm1 = 0, ifm1 = 0, ifm2 = 0, bfn = 0, ii = 0, jj = 0; /* Batch reduce related variables */ unsigned long long blocks = nBlocksMB/BF; LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, bn, bc); LIBXSMM_VLA_DECL(4, element_filter_type, dfilter, (element_filter_type*)handle->grad_filter->data, nBlocksIFm, bc, bk); if (use_2d_blocking == 1) { row_teams = handle->upd_row_teams; column_teams = handle->upd_column_teams; my_col_id = ltid % column_teams; my_row_id = ltid / column_teams; im_tasks_per_thread = LIBXSMM_UPDIV(nBlocksIFm, row_teams); in_tasks_per_thread = LIBXSMM_UPDIV(nBlocksOFm, column_teams); my_im_start = LIBXSMM_MIN(my_row_id * im_tasks_per_thread, nBlocksIFm); my_im_end = LIBXSMM_MIN((my_row_id+1) * im_tasks_per_thread, nBlocksIFm); my_in_start = LIBXSMM_MIN(my_col_id * in_tasks_per_thread, nBlocksOFm); my_in_end = LIBXSMM_MIN((my_col_id+1) * in_tasks_per_thread, nBlocksOFm); } if (use_2d_blocking == 1) { if (BF == 1) { for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { for (ifm1 = my_im_start; ifm1 < my_im_end; ++ifm1) { batchreduce_kernel_upd_zerobeta(&LIBXSMM_VLA_ACCESS(4, doutput, 0, ofm1, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(4, input, 0, ifm1, 0, 0, nBlocksIFm, bn, bc), &LIBXSMM_VLA_ACCESS(4, dfilter, ofm1, ifm1, 0, 0, nBlocksIFm, bc, bk), &blocks); } } } else { for (bfn = 0; bfn < BF; bfn++) { for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { for (ifm1 = my_im_start; ifm1 < my_im_end; ++ifm1) { /* initialize current work task to zero */ if (bfn == 0) { for (ii = 0; iibarrier, ltid); } libxsmm-1.17/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c000066400000000000000000001032521415223013700324530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. 
* * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke (Intel Corp.) ******************************************************************************/ /* size variables, all const */ /* here we assume that input and output blocking is similar */ const int bn = handle->bn; const int bk = handle->bk; const int bc = handle->bc; int lpb = 2; const int bc_lp = bc/lpb; const int bk_lp = bk/lpb; const int bn_lp = bn/lpb; const int nBlocksIFm = handle->desc.C / handle->bc; const int nBlocksOFm = handle->desc.K / handle->bk; const int nBlocksMB = handle->desc.N / handle->bn; int mb1ofm1 = 0, mb1 = 0, ofm1 = 0, mb2 = 0, ofm2 = 0; #if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) || defined(LIBXSMM_DNN_FC_BWD_FUSE_BIAS) int iteri = 0, iterj = 0; #endif int performed_doutput_transpose = 0; /* computing first logical thread */ const int ltid = tid - start_thread; #if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) /* number of tasks for transpose that could be run in parallel */ const int eltwise_work = nBlocksOFm * nBlocksMB; /* compute chunk size */ const int eltwise_chunksize = (eltwise_work % handle->desc.threads == 0) ? (eltwise_work / handle->desc.threads) : ((eltwise_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int eltwise_thr_begin = (ltid * eltwise_chunksize < eltwise_work) ? (ltid * eltwise_chunksize) : eltwise_work; const int eltwise_thr_end = ((ltid + 1) * eltwise_chunksize < eltwise_work) ? 
((ltid + 1) * eltwise_chunksize) : eltwise_work; #endif #ifdef LIBXSMM_DNN_FC_BWD_FUSE_BIAS /* number of tasks for transpose that could be run in parallel */ const int dbias_work = nBlocksOFm; /* compute chunk size */ const int dbias_chunksize = (dbias_work % handle->desc.threads == 0) ? (dbias_work / handle->desc.threads) : ((dbias_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int dbias_thr_begin = (ltid * dbias_chunksize < dbias_work) ? (ltid * dbias_chunksize) : dbias_work; const int dbias_thr_end = ((ltid + 1) * dbias_chunksize < dbias_work) ? ((ltid + 1) * dbias_chunksize) : dbias_work; #endif #ifdef LIBXSMM_DNN_FC_BWD_FUSE_BIAS LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, dbias, (libxsmm_bfloat16*) handle->grad_bias->data, handle->bk); #endif #ifdef LIBXSMM_DNN_FC_BWD_FUSE_RELU LIBXSMM_VLA_DECL(4, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksOFm, handle->bn, handle->bk); LIBXSMM_VLA_DECL(4, __mmask32, relubitmask, (__mmask32*)handle->relumask->data, nBlocksOFm, handle->bn, handle->bk/32); #endif #if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) element_output_type *grad_output_ptr = (element_output_type*)((char*)handle->scratch + handle->doutput_scratch_mark); element_output_type *tr_doutput_ptr = (element_output_type*)grad_output_ptr + handle->desc.N * handle->desc.K; LIBXSMM_VLA_DECL(4, const element_output_type, doutput_orig, (element_output_type*)handle->grad_output->data, nBlocksOFm, bn, bk); #else element_output_type *grad_output_ptr = (element_output_type*)handle->grad_output->data; element_output_type *tr_doutput_ptr = (element_output_type*)handle->scratch; #endif LIBXSMM_VLA_DECL(4, element_output_type, doutput, grad_output_ptr, nBlocksOFm, bn, bk); LIBXSMM_VLA_DECL(5, element_output_type, doutput_tr, tr_doutput_ptr, nBlocksMB, bn_lp, bk, lpb); /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); /* Apply to doutput potential fusions */ #if 
defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) if (bk % 32 == 0) { for ( mb1ofm1 = eltwise_thr_begin; mb1ofm1 < eltwise_thr_end; ++mb1ofm1 ) { mb1 = mb1ofm1%nBlocksMB; ofm1 = mb1ofm1/nBlocksMB; for ( iteri = 0; iteri < handle->bn; ++iteri ) { for ( iterj = 0; iterj < handle->bk; iterj += 32 ) { __m512i cur_out_reg = _mm512_loadu_si512(&LIBXSMM_VLA_ACCESS(4, doutput_orig, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk)); #ifdef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID __m512 cur_out_reg_0, cur_out_reg_1; const __m512 ones = _mm512_set1_ps(1.0f); #endif #ifdef LIBXSMM_DNN_FC_BWD_FUSE_RELU __m512i zero_reg = _mm512_setzero_si512(); __mmask32 relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK32 (&LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, iteri, iterj/32, nBlocksOFm, handle->bn, handle->bk/32)); cur_out_reg = _mm512_mask_blend_epi16 (relumask, zero_reg, cur_out_reg); #endif #ifdef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID cur_out_reg_0 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(cur_out_reg, 0)),16)); cur_out_reg_1 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(cur_out_reg, 1)),16)); cur_out_reg_0 = _mm512_mul_ps(cur_out_reg_0, _mm512_sub_ps(ones, cur_out_reg_0)); cur_out_reg_1 = _mm512_mul_ps(cur_out_reg_1, _mm512_sub_ps(ones, cur_out_reg_1)); cur_out_reg = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(cur_out_reg_1, cur_out_reg_0); #endif _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk), cur_out_reg); } } /* If in UPD pass, also perform transpose of doutput */ if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { bf16_vnni_reformat((element_output_type*)&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, 0, 0, 0, nBlocksMB, bn_lp, bk, lpb), bk, bn, bk, bn); } } } 
else { for ( mb1ofm1 = eltwise_thr_begin; mb1ofm1 < eltwise_thr_end; ++mb1ofm1 ) { mb1 = mb1ofm1%nBlocksMB; ofm1 = mb1ofm1/nBlocksMB; for ( iteri = 0; iteri < handle->bn; ++iteri ) { for ( iterj = 0; iterj < handle->bk; ++iterj ) { element_output_type l_cur_out = LIBXSMM_VLA_ACCESS(4, doutput_orig, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk); #ifdef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID float l_cur_out_f32 = 0; libxsmm_bfloat16_hp tmp; #endif #ifdef LIBXSMM_DNN_FC_BWD_FUSE_RELU l_cur_out = (element_output_type)((LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk) != 0) ? l_cur_out : (element_output_type)0); #endif #ifdef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID tmp.i[0] = 0; tmp.i[1] = l_cur_out; l_cur_out_f32 = tmp.f; l_cur_out_f32 = l_cur_out_f32*(1.0f - l_cur_out_f32); libxsmm_rne_convert_fp32_bf16(&l_cur_out_f32, &l_cur_out, 1); #endif LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; } } /* If in UPD pass, also perform transpose of doutput */ if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { for (mb2 = 0; mb2 < bn; mb2++) { for (ofm2 = 0; ofm2 < bk; ofm2++) { LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, mb2/lpb, ofm2, mb2%lpb, nBlocksMB, bn_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, mb2, ofm2, nBlocksOFm, bn, bk); } } } } } if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { performed_doutput_transpose = 1; } libxsmm_barrier_wait(handle->barrier, ltid); #endif #if defined(LIBXSMM_DNN_FC_BWD_FUSE_BIAS) /* Accumulation of bias happens in f32 */ { float *scratch_dbias = (float*) ((element_output_type*)handle->scratch + handle->desc.N * (handle->desc.K + handle->desc.C) + ltid * bk * 2); if (handle->bk % 16 == 0) { __m512 zero_reg = _mm512_setzero_ps(); __m512 doutput_reg = _mm512_setzero_ps(); __m512 dbias_reg = _mm512_setzero_ps(); for ( ofm1 = dbias_thr_begin; ofm1 < 
dbias_thr_end; ++ofm1 ) { for ( iterj = 0; iterj < handle->bk; iterj += 16 ) { _mm512_storeu_ps(scratch_dbias+iterj, zero_reg); } for ( mb1 = 0; mb1 < nBlocksMB; ++mb1 ) { for ( iteri = 0; iteri < handle->bn; ++iteri ) { for ( iterj = 0; iterj < handle->bk; iterj += 16 ) { doutput_reg = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk))); dbias_reg = LIBXSMM_INTRINSICS_MM512_LOAD_PS(scratch_dbias+iterj); dbias_reg = _mm512_add_ps(dbias_reg, doutput_reg); _mm512_storeu_ps(scratch_dbias+iterj, dbias_reg); } } } for ( iterj = 0; iterj < handle->bk; iterj += 16 ) { _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS( 2, dbias, ofm1, iterj, handle->bk ), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(scratch_dbias+iterj)) ); } } } else { for ( ofm1 = dbias_thr_begin; ofm1 < dbias_thr_end; ++ofm1 ) { for ( iterj = 0; iterj < handle->bk; ++iterj ) { scratch_dbias[iterj] = 0.0; } for ( mb1 = 0; mb1 < nBlocksMB; ++mb1 ) { for ( iteri = 0; iteri < handle->bn; ++iteri ) { for ( iterj = 0; iterj < handle->bk; ++iterj ) { float doutput_f32 = 0; libxsmm_bfloat16_hp tmp; tmp.i[0] = 0; tmp.i[1] = LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk); doutput_f32 = tmp.f; scratch_dbias[iterj] += doutput_f32; } } } libxsmm_rne_convert_fp32_bf16(scratch_dbias, &LIBXSMM_VLA_ACCESS( 2, dbias, ofm1, 0, handle->bk ), handle->bk); } } } /* wait for eltwise to finish */ libxsmm_barrier_wait(handle->barrier, ltid); #endif if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ){ int use_2d_blocking = handle->bwd_2d_blocking; /* number of tasks that could be run in parallel */ const int work = nBlocksIFm * nBlocksMB; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks for transpose that could be run in parallel */ const int transpose_work = nBlocksIFm * nBlocksOFm; /* compute chunk size */ const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; /* loop variables */ int ifm1 = 0, ifm2 = 0, ifm1ofm1 = 0, mb1ifm1 = 0; int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; LIBXSMM_VLA_DECL(5, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(4, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksIFm, bn, bc); LIBXSMM_VLA_DECL(5, element_filter_type, filter_tr, (element_filter_type*)handle->scratch, nBlocksOFm, bk_lp, bc, lpb); float* temp_output = (float*)handle->scratch + (handle->desc.C * handle->desc.K)/2; LIBXSMM_VLA_DECL(4, float, dinput_f32, (float*) temp_output, nBlocksIFm, bn, bc); unsigned long long blocks = nBlocksOFm; int KB_BLOCKS = nBlocksOFm, BF = 1; BF = handle->bwd_bf; KB_BLOCKS = nBlocksOFm/BF; blocks = KB_BLOCKS; if (use_2d_blocking == 1) { row_teams = handle->bwd_row_teams; column_teams = handle->bwd_column_teams; my_col_id = ltid % column_teams; my_row_id = ltid / column_teams; 
im_tasks_per_thread = LIBXSMM_UPDIV(nBlocksMB, row_teams); in_tasks_per_thread = LIBXSMM_UPDIV(nBlocksIFm, column_teams); my_im_start = LIBXSMM_MIN(my_row_id * im_tasks_per_thread, nBlocksMB); my_im_end = LIBXSMM_MIN((my_row_id+1) * im_tasks_per_thread, nBlocksMB); my_in_start = LIBXSMM_MIN(my_col_id * in_tasks_per_thread, nBlocksIFm); my_in_end = LIBXSMM_MIN((my_col_id+1) * in_tasks_per_thread, nBlocksIFm); } if (handle->desc.K > 1) { /* transpose weight */ if ((bk % 16 == 0) && (bc % 16 == 0)) { for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { ofm1 = ifm1ofm1 / nBlocksIFm; ifm1 = ifm1ofm1 % nBlocksIFm; bf16_vnni_transpose((element_filter_type*)&LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1, 0, 0, 0, nBlocksIFm, bc_lp, bk, lpb), (element_filter_type*)&LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, ofm1, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), bk, bc, bk, bc); } } else { for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { ofm1 = ifm1ofm1 / nBlocksIFm; ifm1 = ifm1ofm1 % nBlocksIFm; for (ofm2 = 0; ofm2 < bk; ++ofm2) { for (ifm2 = 0; ifm2 < bc; ++ifm2) { LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, ofm1, ofm2/lpb, ifm2, ofm2%lpb, nBlocksOFm, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1, ifm2/lpb, ofm2, ifm2%lpb, nBlocksIFm, bc_lp, bk, lpb); } } } } /* wait for transpose to finish */ libxsmm_barrier_wait(handle->barrier, ltid); if (use_2d_blocking == 1) { if (BF > 1) { for ( ofm1 = 0; ofm1 < BF; ++ofm1 ) { for (ifm1 = my_in_start; ifm1 < my_in_end; ++ifm1) { for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { /* Initialize intermediate f32 tensor */ if ( ofm1 == 0 ) { memset(&LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), 0, bn*bc*sizeof(float)); } batchreduce_kernel_bwd( &LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, ofm1*KB_BLOCKS, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), &LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, 
nBlocksIFm, bn, bc), &blocks); /* downconvert intermediate f32 tensor to bf 16 and store to final C */ if ( ofm1 == BF-1 ) { LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16(&LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), bn*bc); } } } } } else { for (ifm1 = my_in_start; ifm1 < my_in_end; ++ifm1) { for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { batchreduce_kernel_bwd_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, 0, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), &LIBXSMM_VLA_ACCESS(4, doutput, mb1, 0, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); } } } } else { if (BF > 1) { for ( ofm1 = 0; ofm1 < BF; ++ofm1 ) { for ( mb1ifm1 = thr_begin; mb1ifm1 < thr_end; ++mb1ifm1 ) { mb1 = mb1ifm1%nBlocksMB; ifm1 = mb1ifm1/nBlocksMB; /* Initialize intermediate f32 tensor */ if ( ofm1 == 0 ) { memset(&LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), 0, bn*bc*sizeof(float)); } batchreduce_kernel_bwd( &LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, ofm1*KB_BLOCKS, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), &LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); /* downconvert intermediate f32 tensor to bf 16 and store to final C */ if ( ofm1 == BF-1 ) { LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16(&LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), bn*bc); } } } } else { for ( mb1ifm1 = thr_begin; mb1ifm1 < thr_end; ++mb1ifm1 ) { mb1 = mb1ifm1%nBlocksMB; ifm1 = mb1ifm1/nBlocksMB; batchreduce_kernel_bwd_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, 0, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), &LIBXSMM_VLA_ACCESS(4, doutput, mb1, 0, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); } } } } else { 
/* Special case when K = 1 */ /* number of tasks for doutput copy that could be run in parallel */ const int copy_work_output = nBlocksMB * nBlocksOFm; /* compute chunk size */ const int copy_chunksize = (copy_work_output % handle->desc.threads == 0) ? (copy_work_output / handle->desc.threads) : ((copy_work_output / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int copy_thr_begin = (ltid * copy_chunksize < copy_work_output) ? (ltid * copy_chunksize) : copy_work_output; const int copy_thr_end = ((ltid + 1) * copy_chunksize < copy_work_output) ? ((ltid + 1) * copy_chunksize) : copy_work_output; LIBXSMM_VLA_DECL(5, element_filter_type, filter_tr_padded, (element_filter_type*)handle->scratch, nBlocksOFm, 1, bc, lpb); LIBXSMM_VLA_DECL(4, element_output_type, doutput_padded, (element_output_type*)handle->scratch + handle->desc.C * 2, nBlocksOFm, bn, lpb); /* Copy in weights and doutput in a padded buffer */ for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { ofm1 = ifm1ofm1 / nBlocksIFm; ifm1 = ifm1ofm1 % nBlocksIFm; ofm2 = 0; for (ifm2 = 0; ifm2 < bc; ++ifm2) { LIBXSMM_VLA_ACCESS(5, filter_tr_padded, ifm1, ofm1, ofm2/lpb, ifm2, ofm2%lpb, nBlocksOFm, 1, bc, lpb) = LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1, ifm2/lpb, ofm2, ifm2%lpb, nBlocksIFm, bc_lp, bk, lpb); LIBXSMM_VLA_ACCESS(5, filter_tr_padded, ifm1, ofm1, ofm2/lpb, ifm2, 1, nBlocksOFm, 1, bc, lpb) = (element_filter_type)0; } } for (mb1ofm1 = copy_thr_begin; mb1ofm1 < copy_thr_end; ++mb1ofm1) { mb1 = mb1ofm1 / nBlocksOFm; ofm1 = mb1ofm1 % nBlocksOFm; ofm2 = 0; for (mb2 = 0; mb2 < bn; ++mb2) { LIBXSMM_VLA_ACCESS(4, doutput_padded, mb1, ofm1, mb2, 0, nBlocksOFm, bn, 2) = LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, mb2, 0, nBlocksOFm, bn, bk); LIBXSMM_VLA_ACCESS(4, doutput_padded, mb1, ofm1, mb2, 1, nBlocksOFm, bn, 2) = (element_output_type)0; } } libxsmm_barrier_wait(handle->barrier, ltid); for ( mb1ifm1 = thr_begin; mb1ifm1 < thr_end; ++mb1ifm1 ) { mb1 = 
mb1ifm1%nBlocksMB; ifm1 = mb1ifm1/nBlocksMB; batchreduce_kernel_bwd_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter_tr_padded, ifm1, 0, 0, 0, 0, nBlocksOFm, 1, bc, lpb), &LIBXSMM_VLA_ACCESS(4, doutput_padded, mb1, 0, 0, 0, nBlocksOFm, bn, 2), &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); } } libxsmm_barrier_wait(handle->barrier, ltid); } if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { /* number of tasks that could be run in parallel */ const int ofm_subtasks = (handle->upd_2d_blocking == 1) ? 1 : handle->ofm_subtasks; const int ifm_subtasks = (handle->upd_2d_blocking == 1) ? 1 : handle->ifm_subtasks; const int bbk = (handle->upd_2d_blocking == 1) ? bk : bk/ofm_subtasks; const int bbc = (handle->upd_2d_blocking == 1) ? bc : bc/ifm_subtasks; const int work = nBlocksIFm * ifm_subtasks * nBlocksOFm * ofm_subtasks; const int Cck_work = nBlocksIFm * ifm_subtasks * ofm_subtasks; const int Cc_work = nBlocksIFm * ifm_subtasks; /* 2D blocking parameters */ int use_2d_blocking = handle->upd_2d_blocking; int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; int BF = handle->upd_bf; /* loop variables */ int ifm1ofm1 = 0, ifm1 = 0, ifm2 = 0, bfn = 0, ii = 0, jj = 0, mb1ifm1 = 0, jc = 0, jk = 0; /* Batch reduce related variables */ unsigned long long blocks = nBlocksMB/BF; LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, bn, bc); LIBXSMM_VLA_DECL(5, element_filter_type, dfilter, (element_filter_type*)handle->grad_filter->data, nBlocksIFm, bc_lp, bk, lpb); /* Set up tensors for transposing/scratch before vnni reformatting dfilter */ element_input_type *tr_inp_ptr = (element_input_type*) ((element_output_type*)handle->scratch + handle->desc.N * handle->desc.K); float *dfilter_f32_ptr = (float*) ((element_input_type*)tr_inp_ptr + handle->desc.N * handle->desc.C); element_filter_type *dfilter_scratch = (element_filter_type*) ((float*)dfilter_f32_ptr + handle->desc.C * handle->desc.K) + ltid * bc * bk; LIBXSMM_VLA_DECL(4, element_input_type, input_tr, (element_input_type*)tr_inp_ptr, nBlocksMB, bc, bn); LIBXSMM_VLA_DECL(4, float, dfilter_f32, (float*)dfilter_f32_ptr, nBlocksIFm, bc, bk); LIBXSMM_VLA_DECL(2, element_filter_type, dfilter_block, (element_filter_type*)dfilter_scratch, bk); const int tr_out_work = nBlocksMB * nBlocksOFm; const int tr_out_chunksize = (tr_out_work % handle->desc.threads == 0) ? (tr_out_work / handle->desc.threads) : ((tr_out_work / handle->desc.threads) + 1); const int tr_out_thr_begin = (ltid * tr_out_chunksize < tr_out_work) ? (ltid * tr_out_chunksize) : tr_out_work; const int tr_out_thr_end = ((ltid + 1) * tr_out_chunksize < tr_out_work) ? ((ltid + 1) * tr_out_chunksize) : tr_out_work; const int tr_inp_work = nBlocksMB * nBlocksIFm; const int tr_inp_chunksize = (tr_inp_work % handle->desc.threads == 0) ? (tr_inp_work / handle->desc.threads) : ((tr_inp_work / handle->desc.threads) + 1); const int tr_inp_thr_begin = (ltid * tr_inp_chunksize < tr_inp_work) ? 
(ltid * tr_inp_chunksize) : tr_inp_work; const int tr_inp_thr_end = ((ltid + 1) * tr_inp_chunksize < tr_inp_work) ? ((ltid + 1) * tr_inp_chunksize) : tr_inp_work; /* These are used for the vnni reformatting of the f32 output */ __m256i c0, c1; __m512 a01, b01; __m512i c01 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); const __m512i perm_index = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); if (use_2d_blocking == 1) { row_teams = handle->upd_row_teams; column_teams = handle->upd_column_teams; my_col_id = ltid % column_teams; my_row_id = ltid / column_teams; im_tasks_per_thread = LIBXSMM_UPDIV(nBlocksIFm, row_teams); in_tasks_per_thread = LIBXSMM_UPDIV(nBlocksOFm, column_teams); my_im_start = LIBXSMM_MIN(my_row_id * im_tasks_per_thread, nBlocksIFm); my_im_end = LIBXSMM_MIN((my_row_id+1) * im_tasks_per_thread, nBlocksIFm); my_in_start = LIBXSMM_MIN(my_col_id * in_tasks_per_thread, nBlocksOFm); my_in_end = LIBXSMM_MIN((my_col_id+1) * in_tasks_per_thread, nBlocksOFm); } /* Required upfront tranposes */ if (bc % 32 == 0) { for (mb1ifm1 = tr_inp_thr_begin; mb1ifm1 < tr_inp_thr_end; mb1ifm1++) { mb1 = mb1ifm1%nBlocksMB; ifm1 = mb1ifm1/nBlocksMB; bf16_transpose((element_input_type*)&LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &LIBXSMM_VLA_ACCESS(4, input_tr, ifm1, mb1, 0, 0, nBlocksMB, bc, bn), bc, bn, bc, bn); } } else { for (mb1ifm1 = tr_inp_thr_begin; mb1ifm1 < tr_inp_thr_end; mb1ifm1++) { mb1 = mb1ifm1%nBlocksMB; ifm1 = mb1ifm1/nBlocksMB; for (mb2 = 0; mb2 < bn; mb2++) { for (ifm2 = 0; ifm2 < bc; ifm2++) { LIBXSMM_VLA_ACCESS(4, input_tr, ifm1, mb1, ifm2, mb2, nBlocksMB, bc, bn) = LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1, mb2, ifm2, nBlocksIFm, bn, bc); } } } } if (performed_doutput_transpose == 0) { if (bk % 32 == 0) { for (mb1ofm1 = tr_out_thr_begin; mb1ofm1 < tr_out_thr_end; mb1ofm1++) { mb1 = mb1ofm1%nBlocksMB; ofm1 = mb1ofm1/nBlocksMB; 
bf16_vnni_reformat((element_output_type*)&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, 0, 0, 0, nBlocksMB, bn_lp, bk, lpb), bk, bn, bk, bn); } } else { for (mb1ofm1 = tr_out_thr_begin; mb1ofm1 < tr_out_thr_end; mb1ofm1++) { mb1 = mb1ofm1%nBlocksMB; ofm1 = mb1ofm1/nBlocksMB; for (mb2 = 0; mb2 < bn; mb2++) { for (ofm2 = 0; ofm2 < bk; ofm2++) { LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, mb2/lpb, ofm2, mb2%lpb, nBlocksMB, bn_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, mb2, ofm2, nBlocksOFm, bn, bk); } } } } } libxsmm_barrier_wait(handle->barrier, ltid); if (use_2d_blocking == 1) { if (BF == 1) { for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { for (ifm1 = my_im_start; ifm1 < my_im_end; ++ifm1) { batchreduce_kernel_upd_zerobeta(&LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, 0, 0, 0, 0, nBlocksMB, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(4, input_tr, ifm1, 0, 0, 0, nBlocksMB, bc, bn), &LIBXSMM_VLA_ACCESS(2, dfilter_block, 0, 0, bk), &blocks); /* TODO: Make this vnni reformating in the kernel... 
*/ /* Copy result back to vnni format */ if ((bc % 2 == 0) && (bk % 16 == 0)) { for (jc = 0; jc < bc; jc+=2) { for (jk = 0; jk < bk; jk+=16) { c1 = _mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dfilter_block, jc+1,jk, bk)); c0 = _mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dfilter_block, jc, jk, bk)); c01 = _mm512_inserti64x4(c01, c0, 0); c01 = _mm512_inserti64x4(c01, c1, 1); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dfilter, ofm1, ifm1, jc/lpb, jk, 0, nBlocksIFm, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); } } } else { for (ii = 0; ii < bc; ii++) { for (jj = 0; jj < bk; jj++) { LIBXSMM_VLA_ACCESS(5, dfilter, ofm1, ifm1, ii/lpb, jj, ii%lpb, nBlocksIFm, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, dfilter_block, ii, jj, bk); } } } } } } else { for (bfn = 0; bfn < BF; bfn++) { for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { for (ifm1 = my_im_start; ifm1 < my_im_end; ++ifm1) { /* initialize current work task to zero */ if (bfn == 0) { for (ii = 0; iibarrier, ltid); } libxsmm-1.17/src/template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c000066400000000000000000000123761415223013700305340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ /* size variables, all const */ /* here we assume that input and output blocking is similar */ const int nBlocksIFm = handle->blocksifm; const int nIFmBlock = handle->ifmblock; const int nBlocksOFm = handle->blocksofm; const int nOFmBlock = handle->ofmblock; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nBlocksOFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* loop variables */ int ofm1 = 0; LIBXSMM_VLA_DECL(3, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksOFm, nOFmBlock); #if defined(LIBXSMM_DNN_FULLYCONNECTED_FWD_BF16_F32) float* input_f32_ptr = (float*)handle->scratch; float* filter_f32_ptr = ((float*)handle->scratch)+((size_t)handle->desc.N*(size_t)handle->desc.C); LIBXSMM_VLA_DECL(3, const float, input, input_f32_ptr, nBlocksIFm, nIFmBlock); LIBXSMM_VLA_DECL(4, const float, filter, filter_f32_ptr, nBlocksIFm, nIFmBlock, nOFmBlock); /* number of tasks that could be run in parallel */ const int work_input = handle->desc.N * handle->desc.C; /* compute chunk size */ const int chunksize_input = (work_input % handle->desc.threads == 0) ? (work_input / handle->desc.threads) : ((work_input / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin_input = (ltid * chunksize_input < work_input) ? (ltid * chunksize_input) : work_input; const int thr_end_input = ((ltid + 1) * chunksize_input < work_input) ? 
((ltid + 1) * chunksize_input) : work_input; /* number of tasks that could be run in parallel */ const int work_filter = handle->desc.C * handle->desc.K; /* compute chunk size */ const int chunksize_filter = (work_filter % handle->desc.threads == 0) ? (work_filter / handle->desc.threads) : ((work_filter / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin_filter = (ltid * chunksize_filter < work_filter) ? (ltid * chunksize_filter) : work_filter; const int thr_end_filter = ((ltid + 1) * chunksize_filter < work_filter) ? ((ltid + 1) * chunksize_filter) : work_filter; #else LIBXSMM_VLA_DECL(3, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, nIFmBlock); LIBXSMM_VLA_DECL(4, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, nIFmBlock, nOFmBlock); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); #if defined(LIBXSMM_DNN_FULLYCONNECTED_FWD_BF16_F32) libxsmm_convert_bf16_f32( ((element_input_type*)handle->reg_input->data)+thr_begin_input, input_f32_ptr+thr_begin_input, thr_end_input - thr_begin_input ); libxsmm_convert_bf16_f32( ((element_filter_type*)handle->reg_filter->data)+thr_begin_filter, filter_f32_ptr+thr_begin_filter, thr_end_filter - thr_begin_filter ); libxsmm_barrier_wait(handle->barrier, ltid); #endif for ( ofm1 = thr_begin; ofm1 < thr_end; ++ofm1 ) { /* outer GEMM m-loop */ #if 1 gemm_kernel( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, 0, 0, 0, nBlocksIFm, nIFmBlock, nOFmBlock), &LIBXSMM_VLA_ACCESS(3, input, 0, 0, 0, nBlocksIFm, nIFmBlock), &LIBXSMM_VLA_ACCESS(3, output, 0, ofm1, 0, nBlocksOFm, nOFmBlock) ); #else { const int nImg = handle->desc.N; int img2, ifm1, ifm2, ofm2; /* this is a simple replacement code using regular loops */ for ( img2 = 0; img2 < nImg; ++img2 ) { LIBXSMM_PRAGMA_SIMD for ( ofm2 = 0; ofm2 < nOFmBlock; ++ofm2 ) { LIBXSMM_VLA_ACCESS(3, output, img2, ofm1, ofm2, nBlocksOFm, nOFmBlock) = 
(element_output_type)0; } } for ( ifm1 = 0; ifm1 < nBlocksIFm; ++ifm1 ) { /* outer GEMM k-loop */ for ( ifm2 = 0; ifm2 < nIFmBlock; ++ifm2 ) { /* GEMM K-loop */ for ( img2 = 0; img2 < nImg; ++img2 ) { /* GEMM n-loop */ LIBXSMM_PRAGMA_SIMD for ( ofm2 = 0; ofm2 < nOFmBlock; ++ofm2 ) { /* GEMM m-loop */ LIBXSMM_VLA_ACCESS(3, output, img2, ofm1, ofm2, nBlocksOFm, nOFmBlock) += LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1, ifm2, ofm2, nBlocksIFm, nIFmBlock, nOFmBlock) * LIBXSMM_VLA_ACCESS(3, input, img2, ifm1, ifm2, nBlocksIFm, nIFmBlock); } } } } } #endif } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c000066400000000000000000000277211415223013700311360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke (Intel Corp.) ******************************************************************************/ /* size variables, all const */ /* here we assume that input and output blocking is similar */ const int nBlocksIFm = handle->desc.C / handle->bc; const int nBlocksOFm = handle->desc.K / handle->bk; const int nBlocksMB = handle->desc.N / handle->bn; int use_2d_blocking = handle->fwd_2d_blocking; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nBlocksOFm * nBlocksMB; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* loop variables */ int mb1ofm1 = 0, mb1 = 0, ofm1 = 0, ifm1 = 0; int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; int mb2 = 0, ofm2 = 0; LIBXSMM_VLA_DECL(4, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksOFm, handle->bn, handle->bk); LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, handle->bn, handle->bc); LIBXSMM_VLA_DECL(4, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, handle->bc, handle->bk); #ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE #ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS LIBXSMM_VLA_DECL(2, const element_output_type, bias, (element_output_type*)handle->reg_bias->data, handle->bk); #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU LIBXSMM_VLA_DECL(4, unsigned char, relumask, (unsigned char*) handle->relumask->data, nBlocksOFm, handle->bn, handle->bk); #endif #endif unsigned long long blocks = nBlocksIFm; int CB_BLOCKS = nBlocksIFm, BF = 1; BF = handle->fwd_bf; CB_BLOCKS = nBlocksIFm/BF; blocks = CB_BLOCKS; if (use_2d_blocking == 1) { row_teams = handle->fwd_row_teams; column_teams = handle->fwd_column_teams; my_col_id = ltid % column_teams; my_row_id = ltid / column_teams; im_tasks_per_thread = LIBXSMM_UPDIV(nBlocksMB, row_teams); in_tasks_per_thread = LIBXSMM_UPDIV(nBlocksOFm, column_teams); my_im_start = LIBXSMM_MIN(my_row_id * im_tasks_per_thread, nBlocksMB); my_im_end = LIBXSMM_MIN((my_row_id+1) * im_tasks_per_thread, nBlocksMB); my_in_start = LIBXSMM_MIN(my_col_id * in_tasks_per_thread, nBlocksOFm); my_in_end = 
LIBXSMM_MIN((my_col_id+1) * in_tasks_per_thread, nBlocksOFm); } /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); if (use_2d_blocking == 1) { if (BF > 1) { for ( ifm1 = 0; ifm1 < BF; ++ifm1 ) { for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { /* Initialize intermediate f32 tensor */ if ( ifm1 == 0 ) { #ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, bias, ofm1, ofm2, handle->bk); } } #else for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (element_output_type)0; } } #endif } batchreduce_kernel_beta( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bc, handle->bk), &LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bn, handle->bc), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); /* downconvert intermediate f32 tensor to bf 16 and store to final C */ #ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE if ( ifm1 == BF-1 ) { for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { float l_cur_out = LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( l_cur_out > (element_output_type)0 ) ? 1 : 0); l_cur_out = (l_cur_out > (element_output_type)0) ? 
l_cur_out : (element_output_type)0; #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID /* we ar using Pade 7/8 approximation */ l_cur_out = (libxsmm_stanh_pade78( l_cur_out / 2.0f ) + 1.0f) / 2.0f; #endif LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; } } } #endif } } } } else { for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { #ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, bias, ofm1, ofm2, handle->bk); } } batchreduce_kernel_beta( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, 0, 0, 0, nBlocksIFm, handle->bc, handle->bk), &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); #else batchreduce_kernel_zerobeta( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, 0, 0, 0, nBlocksIFm, handle->bc, handle->bk), &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); #endif #ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { element_output_type l_cur_out = LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( l_cur_out > (element_output_type)0 ) ? 1 : 0); l_cur_out = ( l_cur_out > (element_output_type)0 ) ? 
l_cur_out : (element_output_type)0; #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID /* we ar using Pade 7/8 approximation */ l_cur_out = (libxsmm_stanh_pade78( l_cur_out / 2.0f ) + 1.0f) / 2.0f; #endif LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; } } #endif } } } } else { if (BF > 1) { for ( ifm1 = 0; ifm1 < BF; ++ifm1 ) { for ( mb1ofm1 = thr_begin; mb1ofm1 < thr_end; ++mb1ofm1 ) { mb1 = mb1ofm1%nBlocksMB; ofm1 = mb1ofm1/nBlocksMB; /* Initialize intermediate f32 tensor */ if ( ifm1 == 0 ) { #ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, bias, ofm1, ofm2, handle->bk); } } #else for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (element_output_type)0; } } #endif } batchreduce_kernel_beta( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bc, handle->bk), &LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bn, handle->bc), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); /* downconvert intermediate f32 tensor to bf 16 and store to final C */ #ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE if ( ifm1 == BF-1 ) { for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { float l_cur_out = LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( l_cur_out > (element_output_type)0 ) ? 1 : 0); l_cur_out = (l_cur_out > (element_output_type)0) ? 
l_cur_out : (element_output_type)0; #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID /* we ar using Pade 7/8 approximation */ l_cur_out = (libxsmm_stanh_pade78( l_cur_out / 2.0f ) + 1.0f) / 2.0f; #endif LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; } } } #endif } } } else { for ( mb1ofm1 = thr_begin; mb1ofm1 < thr_end; ++mb1ofm1 ) { mb1 = mb1ofm1%nBlocksMB; ofm1 = mb1ofm1/nBlocksMB; #ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, bias, ofm1, ofm2, handle->bk); } } batchreduce_kernel_beta( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, 0, 0, 0, nBlocksIFm, handle->bc, handle->bk), &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); #else batchreduce_kernel_zerobeta( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, 0, 0, 0, nBlocksIFm, handle->bc, handle->bk), &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); #endif #ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { element_output_type l_cur_out = LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( l_cur_out > (element_output_type)0 ) ? 1 : 0); l_cur_out = ( l_cur_out > (element_output_type)0 ) ? 
l_cur_out : (element_output_type)0; #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID /* we ar using Pade 7/8 approximation */ l_cur_out = (libxsmm_stanh_pade78( l_cur_out / 2.0f ) + 1.0f) / 2.0f; #endif LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; } } #endif } } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c000066400000000000000000000535071415223013700317550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke (Intel Corp.) ******************************************************************************/ /* size variables, all const */ /* here we assume that input and output blocking is similar */ const int nBlocksIFm = handle->desc.C / handle->bc; const int nBlocksOFm = handle->desc.K / handle->bk; const int nBlocksMB = handle->desc.N / handle->bn; int lpb = 2; const int bc_lp = handle->bc/lpb; /* const int bc = handle->bc;*/ int use_2d_blocking = handle->fwd_2d_blocking; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nBlocksOFm * nBlocksMB; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; /* loop variables */ int mb1ofm1 = 0, mb1 = 0, ofm1 = 0, ifm1 = 0; int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; #ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE int mb2 = 0, ofm2 = 0; #endif LIBXSMM_VLA_DECL(4, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksOFm, handle->bn, handle->bk); LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, handle->bn, handle->bc); LIBXSMM_VLA_DECL(5, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, bc_lp, handle->bk, lpb); float* temp_output = (float*)handle->scratch; LIBXSMM_VLA_DECL(4, float, output_f32, (float*) temp_output, nBlocksOFm,handle->bn,handle->bk); #ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE #ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS LIBXSMM_VLA_DECL(2, const element_input_type, bias, (element_input_type*) handle->reg_bias->data, handle->bk); #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU LIBXSMM_VLA_DECL(4, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksOFm, handle->bn, handle->bk); LIBXSMM_VLA_DECL(4, __mmask16, relubitmask, (__mmask16*)handle->relumask->data, nBlocksOFm, handle->bn, handle->bk/16); #endif #endif unsigned long long blocks = nBlocksIFm; int CB_BLOCKS = nBlocksIFm, BF = 1; BF = handle->fwd_bf; CB_BLOCKS = nBlocksIFm/BF; blocks = CB_BLOCKS; if (use_2d_blocking == 1) { row_teams = handle->fwd_row_teams; column_teams = handle->fwd_column_teams; my_col_id = ltid % column_teams; my_row_id = ltid / column_teams; im_tasks_per_thread = LIBXSMM_UPDIV(nBlocksMB, row_teams); in_tasks_per_thread = LIBXSMM_UPDIV(nBlocksOFm, column_teams); my_im_start = LIBXSMM_MIN(my_row_id * im_tasks_per_thread, nBlocksMB); my_im_end = LIBXSMM_MIN((my_row_id+1) * im_tasks_per_thread, nBlocksMB); my_in_start = LIBXSMM_MIN(my_col_id 
* in_tasks_per_thread, nBlocksOFm); my_in_end = LIBXSMM_MIN((my_col_id+1) * in_tasks_per_thread, nBlocksOFm); } /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); if (use_2d_blocking == 1) { if (BF > 1) { for ( ifm1 = 0; ifm1 < BF; ++ifm1 ) { for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { /* Initialize intermediate f32 tensor */ if ( ifm1 == 0 ) { #ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS for ( mb2 = 0; mb2 bn; ++mb2 ) { LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32( &LIBXSMM_VLA_ACCESS(2, bias, ofm1, 0,handle->bk), &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, 0, nBlocksOFm,handle->bn,handle->bk), handle->bk ); } #else memset(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), 0, handle->bn*handle->bk*sizeof(float)); #endif } batchreduce_kernel( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1*CB_BLOCKS, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), &LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bn, handle->bc), &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); /* downconvert intermediate f32 tensor to bf 16 and store to final C */ if ( ifm1 == BF-1 ) { #ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE if (handle->bk % 32 == 0) { __m512 cur_out_0 = _mm512_setzero_ps(); __m512 cur_out_1 = _mm512_setzero_ps(); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU __mmask16 relumask0; __mmask16 relumask1; #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID __m512 ones = _mm512_set1_ps(1.0); __m512 halves = _mm512_set1_ps(0.5); #endif for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ofm2 += 32 ) { cur_out_0 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk)); cur_out_1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2+16, nBlocksOFm, handle->bn, handle->bk)); #ifdef 
LIBXSMM_DNN_FC_FWD_FUSE_RELU relumask0 = _mm512_cmp_ps_mask( cur_out_0, _mm512_setzero_ps(), _CMP_GT_OQ ); relumask1 = _mm512_cmp_ps_mask( cur_out_1, _mm512_setzero_ps(), _CMP_GT_OQ ); cur_out_0 = _mm512_mask_blend_ps( relumask0, _mm512_setzero_ps(), cur_out_0 ); cur_out_1 = _mm512_mask_blend_ps( relumask1, _mm512_setzero_ps(), cur_out_1 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16, nBlocksOFm, handle->bn, handle->bk/16), relumask0 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16+1, nBlocksOFm, handle->bn, handle->bk/16), relumask1 ); #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID /* we ar using Pade 7/8 approximation */ cur_out_0 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_78(_mm512_mul_ps(cur_out_0, halves)), ones), halves); cur_out_1 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_78(_mm512_mul_ps(cur_out_1, halves)), ones), halves); #endif _mm512_storeu_ps(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk), cur_out_0); _mm512_storeu_ps(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2+16, nBlocksOFm, handle->bn, handle->bk), cur_out_1); } } } else { for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { float l_cur_out = LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( l_cur_out > (float)0 ) ? 1 : 0); l_cur_out = (l_cur_out > (float)0) ? 
l_cur_out : (float)0; #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID /* we ar using Pade 7/8 approximation */ l_cur_out = (libxsmm_stanh_pade78( l_cur_out / 2.0f ) + 1.0f) / 2.0f; #endif LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; } } } #endif LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm,handle->bn,handle->bk), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm,handle->bn,handle->bk),handle->bn*handle->bk); } } } } } else { for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { #ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, bias, ofm1, ofm2, handle->bk); } } batchreduce_kernel_beta( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); #else batchreduce_kernel_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); #endif #ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE if (handle->bk % 32 == 0) { __m512 cur_out_0 = _mm512_setzero_ps(); __m512 cur_out_1 = _mm512_setzero_ps(); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU __mmask16 relumask0; __mmask16 relumask1; #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID __m512 ones = _mm512_set1_ps(1.0); __m512 halves = _mm512_set1_ps(0.5); #endif for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ofm2 += 32 ) { cur_out_0 = 
LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk))); cur_out_1 = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2+16, nBlocksOFm, handle->bn, handle->bk))); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU relumask0 = _mm512_cmp_ps_mask( cur_out_0, _mm512_setzero_ps(), _CMP_GT_OQ ); relumask1 = _mm512_cmp_ps_mask( cur_out_1, _mm512_setzero_ps(), _CMP_GT_OQ ); cur_out_0 = _mm512_mask_blend_ps( relumask0, _mm512_setzero_ps(), cur_out_0 ); cur_out_1 = _mm512_mask_blend_ps( relumask1, _mm512_setzero_ps(), cur_out_1 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16, nBlocksOFm, handle->bn, handle->bk/16), relumask0 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16+1, nBlocksOFm, handle->bn, handle->bk/16), relumask1 ); #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID /* we ar using Pade 7/8 approximation */ cur_out_0 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_78(_mm512_mul_ps(cur_out_0, halves)), ones), halves); cur_out_1 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_78(_mm512_mul_ps(cur_out_1, halves)), ones), halves); #endif _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk), LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( cur_out_1, cur_out_0 )); } } } else { for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID libxsmm_bfloat16_hp t; #endif libxsmm_bfloat16 l_cur_out = LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( (l_cur_out & 0x8000) > 0 ) ? 
0 : 1); l_cur_out = (libxsmm_bfloat16)(( (l_cur_out & 0x8000) > 0 ) ? 0 : l_cur_out); #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID /* we ar using Pade 7/8 approximation */ t.i[1] = l_cur_out; t.i[0] = 0; t.f = (libxsmm_stanh_pade78( t.f / 2.0f ) + 1.0f) / 2.0f; l_cur_out = t.i[1]; #endif LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; } } } #endif } } } } else { if (BF > 1) { for ( ifm1 = 0; ifm1 < BF; ++ifm1 ) { for ( mb1ofm1 = thr_begin; mb1ofm1 < thr_end; ++mb1ofm1 ) { mb1 = mb1ofm1%nBlocksMB; ofm1 = mb1ofm1/nBlocksMB; /* Initialize intermediate f32 tensor */ if ( ifm1 == 0 ) { #ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS for ( mb2 = 0; mb2 bn; ++mb2 ) { LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32( &LIBXSMM_VLA_ACCESS(2, bias, ofm1, 0,handle->bk), &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, 0, nBlocksOFm, handle->bn, handle->bk), handle->bk ); } #else memset(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), 0, handle->bn*handle->bk*sizeof(float)); #endif } batchreduce_kernel( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1*CB_BLOCKS, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), &LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bn, handle->bc), &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); /* downconvert intermediate f32 tensor to bf 16 and store to final C */ if ( ifm1 == BF-1 ) { #ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE if (handle->bk % 32 == 0) { __m512 cur_out_0 = _mm512_setzero_ps(); __m512 cur_out_1 = _mm512_setzero_ps(); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU __mmask16 relumask0; __mmask16 relumask1; #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID __m512 ones = _mm512_set1_ps(1.0); __m512 halves = _mm512_set1_ps(0.5); #endif for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ofm2 += 32 ) { cur_out_0 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, 
ofm2, nBlocksOFm, handle->bn, handle->bk)); cur_out_1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2+16, nBlocksOFm, handle->bn, handle->bk)); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU relumask0 = _mm512_cmp_ps_mask( cur_out_0, _mm512_setzero_ps(), _CMP_GT_OQ ); relumask1 = _mm512_cmp_ps_mask( cur_out_1, _mm512_setzero_ps(), _CMP_GT_OQ ); cur_out_0 = _mm512_mask_blend_ps( relumask0, _mm512_setzero_ps(), cur_out_0 ); cur_out_1 = _mm512_mask_blend_ps( relumask1, _mm512_setzero_ps(), cur_out_1 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16, nBlocksOFm, handle->bn, handle->bk/16), relumask0 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16+1, nBlocksOFm, handle->bn, handle->bk/16), relumask1 ); #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID /* we ar using Pade 7/8 approximation */ cur_out_0 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_78(_mm512_mul_ps(cur_out_0, halves)), ones), halves); cur_out_1 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_78(_mm512_mul_ps(cur_out_1, halves)), ones), halves); #endif _mm512_storeu_ps(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk), cur_out_0); _mm512_storeu_ps(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2+16, nBlocksOFm, handle->bn, handle->bk), cur_out_1); } } } else { for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { float l_cur_out = LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( l_cur_out > 0.0 ) ? 1 : 0); l_cur_out = (l_cur_out > (float)0) ? 
l_cur_out : (float)0; #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID /* we ar using Pade 7/8 approximation */ l_cur_out = (libxsmm_stanh_pade78( l_cur_out / 2.0f ) + 1.0f) / 2.0f; #endif LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; } } } #endif LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), handle->bn*handle->bk); } } } } else { for ( mb1ofm1 = thr_begin; mb1ofm1 < thr_end; ++mb1ofm1 ) { mb1 = mb1ofm1%nBlocksMB; ofm1 = mb1ofm1/nBlocksMB; #ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS for ( mb2 = 0; mb2 bn; ++mb2 ) { for ( ofm2 = 0; ofm2 bk; ++ofm2 ) { LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, bias, ofm1, ofm2, handle->bk); } } batchreduce_kernel_beta( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); #else batchreduce_kernel_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); #endif #ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE if (handle->bk % 32 == 0) { __m512 cur_out_0 = _mm512_setzero_ps(); __m512 cur_out_1 = _mm512_setzero_ps(); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU __mmask16 relumask0; __mmask16 relumask1; #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID __m512 ones = _mm512_set1_ps(1.0); __m512 halves = _mm512_set1_ps(0.5); #endif for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ofm2 += 32 ) { cur_out_0 = 
LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk))); cur_out_1 = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2+16, nBlocksOFm, handle->bn, handle->bk))); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU relumask0 = _mm512_cmp_ps_mask( cur_out_0, _mm512_setzero_ps(), _CMP_GT_OQ ); relumask1 = _mm512_cmp_ps_mask( cur_out_1, _mm512_setzero_ps(), _CMP_GT_OQ ); cur_out_0 = _mm512_mask_blend_ps( relumask0, _mm512_setzero_ps(), cur_out_0 ); cur_out_1 = _mm512_mask_blend_ps( relumask1, _mm512_setzero_ps(), cur_out_1 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16, nBlocksOFm, handle->bn, handle->bk/16), relumask0 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16+1, nBlocksOFm, handle->bn, handle->bk/16), relumask1 ); #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID /* we ar using Pade 7/8 approximation */ cur_out_0 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_78(_mm512_mul_ps(cur_out_0, halves)), ones), halves); cur_out_1 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_78(_mm512_mul_ps(cur_out_1, halves)), ones), halves); #endif _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk), LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( cur_out_1, cur_out_0 )); } } } else { for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID libxsmm_bfloat16_hp t; #endif libxsmm_bfloat16 l_cur_out = LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); #ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( (l_cur_out & 0x8000) > 0 ) ? 
0 : 1); l_cur_out = (libxsmm_bfloat16)(( (l_cur_out & 0x8000) > 0 ) ? 0 : l_cur_out); #endif #ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID /* we ar using Pade 7/8 approximation */ t.i[1] = l_cur_out; t.i[0] = 0; t.f = (libxsmm_stanh_pade78( t.f / 2.0f ) + 1.0f) / 2.0f; l_cur_out = t.i[1]; #endif LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; } } } #endif } } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c000066400000000000000000000316471415223013700320730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.partN; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* 
compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 16); LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 16); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, 16); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 16); #endif LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 16); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 16); LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, 16); LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, 16); 
LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 16); LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 16); LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 16); LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)16), nImg, 16); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 2); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); __m512 lcl_vbmean, lcl_vbrstd; element_stats_type* del_gamma_img_ptr; element_stats_type* del_beta_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 16); del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 16); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16) ); for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) const 
unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 2); #endif const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { __m512 lcl_vdeloutput = _mm512_load_act( del_output_ptr ); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) const __m512 value = _mm512_load_act( output_ptr ); const __mmask16 lcl_relumask = _mm512_cmp_ps_mask( value, _mm512_setzero_ps(), _CMP_NEQ_OQ ); lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); _mm512_store_act( del_output_ptr, lcl_vdeloutput ); output_ptr += 16; #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) const __mmask16 lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); _mm512_store_act( del_output_ptr, lcl_vdeloutput ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput ); del_input_add_ptr += sw*16; #endif lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput ); input_ptr += sw*16; del_output_ptr += 16; } } _mm512_storeu_ps( del_gamma_img_ptr, lcl_vdgamma ); _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); } libxsmm_barrier_wait(handle->barrier, ltid); if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { /* now we need to reduce the del_gamm and del_beta */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { element_stats_type* del_gamma_img_ptr = 
&LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 16); element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 16); __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); for ( img=0; img < nImg; img++ ) { lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); del_gamma_img_ptr += 16; del_beta_img_ptr += 16; } _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 16), lcl_vdgamma ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 16), lcl_vdbeta ); } } else { /* now we need to reduce the del_gamm and del_beta */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 16); element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 16); __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); for ( img=0; img < nImg; img++ ) { lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); del_gamma_img_ptr += 16; del_beta_img_ptr += 16; } _mm512_storeu_ps( del_gamma_img_ptr - (nImg*16), lcl_vdgamma ); _mm512_storeu_ps( del_beta_img_ptr - (nImg*16), lcl_vdbeta ); } } libxsmm_barrier_wait(handle->barrier, ltid); } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { /* now we apply the actual backward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta; __m512 lcl_vnhw = _mm512_set1_ps( nhw ); __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw ); img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 16) ); lcl_vbmean = _mm512_loadu_ps( 
&LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16) ); lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 16) ); lcl_vdbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 16) ); for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { __m512 lcl_vdelinput; lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma ); lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd ); lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, lcl_vdelinput ); lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput ); _mm512_stream_act( del_input_ptr, lcl_vdelinput ); del_input_ptr += sw*16; input_ptr += sw*16; del_output_ptr += 16; } } } libxsmm_barrier_wait(handle->barrier, ltid); } # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c000066400000000000000000000412151415223013700320610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.partN; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; const element_stats_type nhw = 
(element_stats_type)(handle->desc.fullN * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 32); LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 32); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, 32); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 32); #endif LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 32); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 32); LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, 32); LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, 32); LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 32); LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 32); LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 32); LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)32), nImg, 32); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 4); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & 
LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); __m512 lcl_vdgamma2 = _mm512_setzero_ps(); __m512 lcl_vdbeta2 = _mm512_setzero_ps(); __m512 lcl_vbmean, lcl_vbrstd; __m512 lcl_vbmean2, lcl_vbrstd2; element_stats_type* del_gamma_img_ptr; element_stats_type* del_beta_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 32); del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 32); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32) ); lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) ); lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) ); for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 4); #endif const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { __m512 lcl_vdeloutput, lcl_vdeloutput2; #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) || 
defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) const __m512 vzero = _mm512_setzero_ps(); __mmask16 lcl_relumask, lcl_relumask2; #endif lcl_vdeloutput = _mm512_load_act( del_output_ptr ); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) lcl_relumask = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr ), vzero, _CMP_NEQ_OQ ); lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, vzero, lcl_vdeloutput ); _mm512_store_act( del_output_ptr, lcl_vdeloutput ); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, vzero, lcl_vdeloutput ); _mm512_store_act( del_output_ptr, lcl_vdeloutput ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput ); #endif lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput ); lcl_vdeloutput2 = _mm512_load_act( del_output_ptr+16 ); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) lcl_relumask2 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+16 ), vzero, _CMP_NEQ_OQ ); lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, vzero, lcl_vdeloutput2 ); _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); output_ptr += 32; #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) lcl_relumask2 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, vzero, lcl_vdeloutput2 ); _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr+16, lcl_vdeloutput2 ); del_input_add_ptr += sw*32; #endif lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( 
input_ptr+16 ), lcl_vbmean2 ), lcl_vdeloutput2 ), lcl_vbrstd2 ) ); lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdeloutput2 ); input_ptr += sw*32; del_output_ptr += 32; } } _mm512_storeu_ps( del_gamma_img_ptr, lcl_vdgamma ); _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); _mm512_storeu_ps( del_gamma_img_ptr+16, lcl_vdgamma2 ); _mm512_storeu_ps( del_beta_img_ptr+16, lcl_vdbeta2 ); } libxsmm_barrier_wait(handle->barrier, ltid); if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { /* now we need to reduce the del_gamm and del_beta */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 32); element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 32); __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); __m512 lcl_vdgamma2 = _mm512_setzero_ps(); __m512 lcl_vdbeta2 = _mm512_setzero_ps(); for ( img=0; img < nImg; img++ ) { lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_loadu_ps( del_gamma_img_ptr+16 ) ); lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, _mm512_loadu_ps( del_beta_img_ptr+16 ) ); del_gamma_img_ptr += 32; del_beta_img_ptr += 32; } _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 32), lcl_vdgamma ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 32), lcl_vdbeta ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 32), lcl_vdgamma2 ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 16, 32), lcl_vdbeta2 ); } } else { /* now we need to reduce the del_gamm and del_beta */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 32); element_stats_type* del_beta_img_ptr = 
&LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 32); __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); __m512 lcl_vdgamma2 = _mm512_setzero_ps(); __m512 lcl_vdbeta2 = _mm512_setzero_ps(); for ( img=0; img < nImg; img++ ) { lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_loadu_ps( del_gamma_img_ptr+16 ) ); lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, _mm512_loadu_ps( del_beta_img_ptr+16 ) ); del_gamma_img_ptr += 32; del_beta_img_ptr += 32; } _mm512_storeu_ps( del_gamma_img_ptr - (32*nImg), lcl_vdgamma ); _mm512_storeu_ps( del_beta_img_ptr - (32*nImg), lcl_vdbeta ); _mm512_storeu_ps( del_gamma_img_ptr - (32*nImg) + 16, lcl_vdgamma2 ); _mm512_storeu_ps( del_beta_img_ptr - (32*nImg) + 16, lcl_vdbeta2 ); } } libxsmm_barrier_wait(handle->barrier, ltid); } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { /* now we apply the actual backward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta; __m512 lcl_vgamma2, lcl_vbmean2, lcl_vbrstd2, lcl_vdgamma2, lcl_vdbeta2; __m512 lcl_vnhw = _mm512_set1_ps( nhw ); __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw ); img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 32) ); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32) ); lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 32) ); lcl_vdbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 32) ); lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 32) ); lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) ); 
lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) ); lcl_vdgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 32) ); lcl_vdbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 16, 32) ); for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { __m512 lcl_vdelinput; __m512 lcl_vdelinput2; lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma ); lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd ); lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, lcl_vdelinput ); lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput ); lcl_vdelinput2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vdgamma2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vbrstd2 ); lcl_vdelinput2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+16 ) ), lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vbrstd2, lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vgamma2, lcl_vdelinput2 ); _mm512_stream_act( del_input_ptr, lcl_vdelinput ); _mm512_stream_act( del_input_ptr+16, lcl_vdelinput2 ); del_input_ptr += sw*32; 
input_ptr += sw*32; del_output_ptr += 32; } } } libxsmm_barrier_wait(handle->barrier, ltid); } # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c000066400000000000000000000523451415223013700320740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.partN; const int ifh = handle->desc.H; 
const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm * 4; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 64); LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 64); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, 64); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 64); #endif LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 64); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 64); LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, 64); LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, 64); LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 64); LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 64); LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 64); LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)64), nImg, 64); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 8); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & 
LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); __m512 lcl_vdgamma2 = _mm512_setzero_ps(); __m512 lcl_vdbeta2 = _mm512_setzero_ps(); __m512 lcl_vdgamma3 = _mm512_setzero_ps(); __m512 lcl_vdbeta3 = _mm512_setzero_ps(); __m512 lcl_vdgamma4 = _mm512_setzero_ps(); __m512 lcl_vdbeta4 = _mm512_setzero_ps(); __m512 lcl_vbmean, lcl_vbrstd; __m512 lcl_vbmean2, lcl_vbrstd2; __m512 lcl_vbmean3, lcl_vbrstd3; __m512 lcl_vbmean4, lcl_vbrstd4; element_stats_type* del_gamma_img_ptr; element_stats_type* del_beta_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 64); del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 64); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 64) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 64) ); lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 64) ); lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 64) ); lcl_vbmean3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 32, 64) ); lcl_vbrstd3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 32, 64) ); lcl_vbmean4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 48, 64) ); lcl_vbrstd4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 48, 64) ); for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) 
const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 8); #endif const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { __m512 lcl_vdeloutput, lcl_vdeloutput2, lcl_vdeloutput3, lcl_vdeloutput4; #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) || defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) __mmask16 lcl_relumask, lcl_relumask2, lcl_relumask3, lcl_relumask4; const __m512 vzero = _mm512_setzero_ps(); #endif lcl_vdeloutput = _mm512_load_act( del_output_ptr ); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) lcl_relumask = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr ), vzero, _CMP_NEQ_OQ ); lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, vzero, lcl_vdeloutput ); _mm512_store_act( del_output_ptr, lcl_vdeloutput ); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, vzero, lcl_vdeloutput ); _mm512_store_act( del_output_ptr, lcl_vdeloutput ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput ); #endif lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput ); lcl_vdeloutput2 = _mm512_load_act( del_output_ptr+16 ); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) lcl_relumask2 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+16 ), vzero, _CMP_NEQ_OQ ); lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, vzero, lcl_vdeloutput2 ); _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); 
#endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) lcl_relumask2 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, vzero, lcl_vdeloutput2 ); _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr+16, lcl_vdeloutput2 ); #endif lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ), lcl_vdeloutput2 ), lcl_vbrstd2 ) ); lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdeloutput2 ); lcl_vdeloutput3 = _mm512_load_act( del_output_ptr+32 ); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) lcl_relumask3 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+32 ), vzero, _CMP_NEQ_OQ ); lcl_vdeloutput3 = _mm512_mask_blend_ps( lcl_relumask3, vzero, lcl_vdeloutput3 ); _mm512_store_act( del_output_ptr+32, lcl_vdeloutput3 ); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) lcl_relumask3 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput3 = _mm512_mask_blend_ps( lcl_relumask3, vzero, lcl_vdeloutput3 ); _mm512_store_act( del_output_ptr+32, lcl_vdeloutput3 ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr+32, lcl_vdeloutput3 ); #endif lcl_vdgamma3 = _mm512_add_ps( lcl_vdgamma3, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+32 ), lcl_vbmean3 ), lcl_vdeloutput3 ), lcl_vbrstd3 ) ); lcl_vdbeta3 = _mm512_add_ps( lcl_vdbeta3, lcl_vdeloutput3 ); lcl_vdeloutput4 = _mm512_load_act( del_output_ptr+48 ); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) lcl_relumask4 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+48 ), vzero, _CMP_NEQ_OQ ); lcl_vdeloutput4 = _mm512_mask_blend_ps( lcl_relumask4, vzero, lcl_vdeloutput4 ); _mm512_store_act( del_output_ptr+48, lcl_vdeloutput4 ); output_ptr += 64; #endif 
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) lcl_relumask4 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput4 = _mm512_mask_blend_ps( lcl_relumask4, vzero, lcl_vdeloutput4 ); _mm512_store_act( del_output_ptr+48, lcl_vdeloutput4 ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr+48, lcl_vdeloutput4 ); del_input_add_ptr += sw*64; #endif lcl_vdgamma4 = _mm512_add_ps( lcl_vdgamma4, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+48 ), lcl_vbmean4 ), lcl_vdeloutput4 ), lcl_vbrstd4 ) ); lcl_vdbeta4 = _mm512_add_ps( lcl_vdbeta4, lcl_vdeloutput4 ); input_ptr += sw*64; del_output_ptr += 64; } } _mm512_storeu_ps( del_gamma_img_ptr, lcl_vdgamma ); _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); _mm512_storeu_ps( del_gamma_img_ptr+16, lcl_vdgamma2 ); _mm512_storeu_ps( del_beta_img_ptr+16, lcl_vdbeta2 ); _mm512_storeu_ps( del_gamma_img_ptr+32, lcl_vdgamma3 ); _mm512_storeu_ps( del_beta_img_ptr+32, lcl_vdbeta3 ); _mm512_storeu_ps( del_gamma_img_ptr+48, lcl_vdgamma4 ); _mm512_storeu_ps( del_beta_img_ptr+48, lcl_vdbeta4 ); } libxsmm_barrier_wait(handle->barrier, ltid); if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { /* now we need to reduce the del_gamm and del_beta */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, (fm/4), 0, ((fm%4)*16), nImg, 64); element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, (fm/4), 0, ((fm%4)*16), nImg, 64); __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); for ( img=0; img < nImg; img++ ) { lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); del_gamma_img_ptr += 64; del_beta_img_ptr += 64; } _mm512_storeu_ps( 
&LIBXSMM_VLA_ACCESS(2, dgamma, (fm/4), ((fm%4)*16), 64), lcl_vdgamma ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, (fm/4), ((fm%4)*16), 64), lcl_vdbeta ); } } else { /* now we need to reduce the del_gamm and del_beta */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, (fm/4), 0, ((fm%4)*16), nImg, 64); element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, (fm/4), 0, ((fm%4)*16), nImg, 64); __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); for ( img=0; img < nImg; img++ ) { lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); del_gamma_img_ptr += 64; del_beta_img_ptr += 64; } _mm512_storeu_ps( del_gamma_img_ptr - (64*nImg), lcl_vdgamma ); _mm512_storeu_ps( del_beta_img_ptr - (64*nImg), lcl_vdbeta ); } } libxsmm_barrier_wait(handle->barrier, ltid); } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { /* now we apply the actual backward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta; __m512 lcl_vgamma2, lcl_vbmean2, lcl_vbrstd2, lcl_vdgamma2, lcl_vdbeta2; __m512 lcl_vgamma3, lcl_vbmean3, lcl_vbrstd3, lcl_vdgamma3, lcl_vdbeta3; __m512 lcl_vgamma4, lcl_vbmean4, lcl_vbrstd4, lcl_vdgamma4, lcl_vdbeta4; __m512 lcl_vnhw = _mm512_set1_ps( nhw ); __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw ); img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 64) ); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 64) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 64) ); lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 64) ); lcl_vdbeta = _mm512_loadu_ps( 
&LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 64) ); lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 64) ); lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 64) ); lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 64) ); lcl_vdgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 64) ); lcl_vdbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 16, 64) ); lcl_vgamma3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 32, 64) ); lcl_vbmean3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 32, 64) ); lcl_vbrstd3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 32, 64) ); lcl_vdgamma3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 32, 64) ); lcl_vdbeta3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 32, 64) ); lcl_vgamma4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 48, 64) ); lcl_vbmean4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 48, 64) ); lcl_vbrstd4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 48, 64) ); lcl_vdgamma4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 48, 64) ); lcl_vdbeta4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 48, 64) ); for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { __m512 lcl_vdelinput; __m512 lcl_vdelinput2; __m512 lcl_vdelinput3; __m512 lcl_vdelinput4; lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma ); lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd ); lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, 
lcl_vdelinput ); lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput ); lcl_vdelinput2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vdgamma2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vbrstd2 ); lcl_vdelinput2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+16 ) ), lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vbrstd2, lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vgamma2, lcl_vdelinput2 ); lcl_vdelinput3 = _mm512_sub_ps( _mm512_load_act( input_ptr+32 ), lcl_vbmean3 ); lcl_vdelinput3 = _mm512_mul_ps( lcl_vdelinput3, lcl_vdgamma3 ); lcl_vdelinput3 = _mm512_mul_ps( lcl_vdelinput3, lcl_vbrstd3 ); lcl_vdelinput3 = _mm512_add_ps( lcl_vdbeta3, lcl_vdelinput3 ); lcl_vdelinput3 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+32 ) ), lcl_vdelinput3 ); lcl_vdelinput3 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput3 ); lcl_vdelinput3 = _mm512_mul_ps( lcl_vbrstd3, lcl_vdelinput3 ); lcl_vdelinput3 = _mm512_mul_ps( lcl_vgamma3, lcl_vdelinput3 ); lcl_vdelinput4 = _mm512_sub_ps( _mm512_load_act( input_ptr+48 ), lcl_vbmean4 ); lcl_vdelinput4 = _mm512_mul_ps( lcl_vdelinput4, lcl_vdgamma4 ); lcl_vdelinput4 = _mm512_mul_ps( lcl_vdelinput4, lcl_vbrstd4 ); lcl_vdelinput4 = _mm512_add_ps( lcl_vdbeta4, lcl_vdelinput4 ); lcl_vdelinput4 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+48 ) ), lcl_vdelinput4 ); lcl_vdelinput4 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput4 ); lcl_vdelinput4 = _mm512_mul_ps( lcl_vbrstd4, lcl_vdelinput4 ); lcl_vdelinput4 = 
_mm512_mul_ps( lcl_vgamma4, lcl_vdelinput4 ); _mm512_stream_act( del_input_ptr, lcl_vdelinput ); _mm512_stream_act( del_input_ptr+16, lcl_vdelinput2 ); _mm512_stream_act( del_input_ptr+32, lcl_vdelinput3 ); _mm512_stream_act( del_input_ptr+48, lcl_vdelinput4 ); del_input_ptr += sw*64; input_ptr += sw*64; del_output_ptr += 64; } } } libxsmm_barrier_wait(handle->barrier, ltid); } # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c000066400000000000000000000315711415223013700305140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ /* size variables, all const */ const int nImg = handle->desc.partN; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; const int nFmBlock = handle->ifmblock; const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int v = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, nFmBlock); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); #endif LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, nFmBlock); LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, nFmBlock); LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, nFmBlock); LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, nFmBlock); LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, nFmBlock); LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, nFmBlock); LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock); #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, nFmBlock); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) union libxsmm_bfloat16_hp 
input_f32; union libxsmm_bfloat16_hp del_input_f32; union libxsmm_bfloat16_hp del_output_f32; #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) union libxsmm_bfloat16_hp output_f32; output_f32.i[1] = 0; output_f32.i[0] = 0; #endif input_f32.i[1] = 0; input_f32.i[0] = 0; del_output_f32.i[1] = 0; del_output_f32.i[0] = 0; del_input_f32.i[1] = 0; del_input_f32.i[0] = 0; #endif assert( nFmBlock <= 64 ); /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { /* @TODO check if we can bake this in into scratch */ element_stats_type lcl_gamma_ptr[64]; element_stats_type lcl_beta_ptr[64]; element_stats_type* del_gamma_img_ptr; element_stats_type* del_beta_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, nFmBlock); del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { lcl_gamma_ptr[v] = 0.0f; lcl_beta_ptr[v] = 0.0f; } for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); #endif const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, 
img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); const element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, nFmBlock); const element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, nFmBlock); #if !defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) LIBXSMM_PRAGMA_SIMD #endif for ( v=0; v < nFmBlock; v++ ) { #if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) del_output_f32.i[1] = del_output_ptr[v]; del_output_f32.i[0] = 0; #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) output_f32.i[1] = output_ptr[v]; del_output_f32.f = LIBXSMM_FEQ(output_f32.f, 0) ? 0 : del_output_f32.f; del_output_ptr[v] = del_output_f32.i[1]; #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) del_output_ptr[v] = (element_output_type)(relumask_ptr[v] == 1 ? del_output_ptr[v] : 0); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) del_input_add_ptr[v] = del_output_ptr[v]; #endif input_f32.i[1] = input_ptr[v]; lcl_gamma_ptr[v] += (input_f32.f - bmean_ptr[v]) * del_output_f32.f * brstd_ptr[v]; lcl_beta_ptr[v] += del_output_f32.f; #else #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) del_output_ptr[v] = LIBXSMM_FEQ(output_ptr[v], 0) ? 0 : del_output_ptr[v]; #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) del_output_ptr[v] = (element_output_type)(relumask_ptr[v] == 1 ? 
del_output_ptr[v] : 0); #endif #if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) del_input_add_ptr[v] = del_output_ptr[v]; #endif lcl_gamma_ptr[v] += (input_ptr[v] - bmean_ptr[v]) * del_output_ptr[v] * brstd_ptr[v]; lcl_beta_ptr[v] += del_output_ptr[v]; #endif } } } LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { del_gamma_img_ptr[v] = lcl_gamma_ptr[v]; del_beta_img_ptr[v] = lcl_beta_ptr[v]; } } libxsmm_barrier_wait(handle->barrier, ltid); if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { /* now we need to reduce the del_gamm and del_beta */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { element_stats_type* del_gamma_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, nFmBlock); element_stats_type* del_beta_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { del_gamma_ptr[v] = (element_stats_type)0; del_beta_ptr[v] = (element_stats_type)0; } for ( img=0; img < nImg; img++ ) { element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, nFmBlock); element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { del_gamma_ptr[v] += del_gamma_img_ptr[v]; del_beta_ptr[v] += del_beta_img_ptr[v]; } } } } else { /* now we need to reduce the del_gamm and del_beta */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { element_stats_type* del_gamma_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, nFmBlock); element_stats_type* del_beta_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, nFmBlock); for ( img=1; img < nImg; img++ ) { element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, nFmBlock); element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { del_gamma_ptr[v] += 
del_gamma_img_ptr[v]; del_beta_ptr[v] += del_beta_img_ptr[v]; } } } } libxsmm_barrier_wait(handle->barrier, ltid); } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { /* now we apply the actual backward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); const element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, nFmBlock); const element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, nFmBlock); const element_stats_type* gamma_ptr = &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, nFmBlock); const element_stats_type* del_gamma_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, nFmBlock); const element_stats_type* del_beta_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, nFmBlock); #if !defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) LIBXSMM_PRAGMA_SIMD #endif for ( v=0; v < nFmBlock; v++ ) { #if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) del_output_f32.i[1] = del_output_ptr[v]; input_f32.i[1] = input_ptr[v]; del_input_f32.f = gamma_ptr[v] * brstd_ptr[v] * recp_nhw * (nhw*del_output_f32.f - (del_beta_ptr[v] + (input_f32.f - bmean_ptr[v]) * del_gamma_ptr[v] * brstd_ptr[v])); del_input_ptr[v] = del_input_f32.i[1]; #else del_input_ptr[v] = gamma_ptr[v] * brstd_ptr[v] * recp_nhw * (nhw*del_output_ptr[v] - (del_beta_ptr[v] + (input_ptr[v] - bmean_ptr[v]) * del_gamma_ptr[v] * brstd_ptr[v])); #endif } } } } libxsmm_barrier_wait(handle->barrier, ltid); } 
libxsmm-1.17/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c000066400000000000000000000277771415223013700321100ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.partN; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = 
handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; /* eps to avoid sqrt of zero */ const element_stats_type sqrt_eps = 1e-7f; const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 16); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, 16); #endif LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 16); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 16); LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, 16); LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 16); LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 16); LIBXSMM_VLA_DECL(2, element_stats_type, variance, (element_stats_type*)handle->variance->data, 16); LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, 16); LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * 16), nImg, 16); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 2); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { for ( 
imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vsum = _mm512_setzero_ps(); __m512 lcl_vsumsq = _mm512_setzero_ps(); element_stats_type* sum_img_ptr; element_stats_type* sumsq_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, 16); sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, 16); for ( hi=iph; hi < (ifh + iph); hi++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); for ( wi=ipw; wi < (ifw + ipw); wi++ ) { __m512 lcl_vinput = _mm512_load_act( input_ptr ); lcl_vsum = _mm512_add_ps( lcl_vsum, lcl_vinput ); lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_mul_ps( lcl_vinput, lcl_vinput ) ); input_ptr += 16; } } _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we need to reduce the sum and sum^2, we use the final */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { __m512 lcl_vsum = _mm512_setzero_ps(); __m512 lcl_vsumsq = _mm512_setzero_ps(); element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, 0, 0, nImg, 16); element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, 0, 0, nImg, 16); for ( img=0; img < nImg; img++ ) { lcl_vsum = _mm512_add_ps( lcl_vsum, _mm512_loadu_ps( sum_img_ptr ) ); lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_loadu_ps( sumsq_img_ptr ) ); sum_img_ptr += 16; sumsq_img_ptr += 16; } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { __m512 lcl_vsqrt_eps = _mm512_set1_ps(sqrt_eps); __m512 lcl_vrec_nhw = _mm512_set1_ps(recp_nhw); __m512 lcl_vone = _mm512_set1_ps(1.0); __m512 lcl_vbmean, lcl_vbmeansq, lcl_vsqbmean, lcl_vbrstd, lcl_vvar; lcl_vbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum ); /* E(X) */ lcl_vbmeansq = _mm512_mul_ps( lcl_vbmean, lcl_vbmean ); /* E(X)^2 */ 
lcl_vsqbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq ); /* E(X^2) */ lcl_vvar = _mm512_sub_ps( lcl_vsqbmean, lcl_vbmeansq ); /* variance */ #if 0 { __m512d lcl_voned = _mm512_set1_pd(1.0); __m512d lcl_vepsd = _mm512_set1_pd(1e-7); __m512d lcl_vlo = _mm512_cvtps_pd( _mm256_castpd_ps( _mm512_extractf64x4_pd( _mm512_castps_pd( lcl_vvar ), 0 ) ) ); __m512d lcl_vhi = _mm512_cvtps_pd( _mm256_castpd_ps( _mm512_extractf64x4_pd( _mm512_castps_pd( lcl_vvar ), 1 ) ) ); lcl_vlo = _mm512_sqrt_pd( _mm512_add_pd( lcl_vlo, lcl_vepsd ) ); lcl_vhi = _mm512_sqrt_pd( _mm512_add_pd( lcl_vhi, lcl_vepsd ) ); lcl_vlo = _mm512_div_pd( lcl_voned, lcl_vlo ); lcl_vhi = _mm512_div_pd( lcl_voned, lcl_vhi ); lcl_vbrstd = _mm512_castpd_ps( _mm512_insertf64x4( _mm512_setzero_pd(), _mm256_castps_pd( _mm512_cvtpd_ps( lcl_vlo ) ), 0 ) ); lcl_vbrstd = _mm512_castpd_ps( _mm512_insertf64x4( _mm512_castps_pd( lcl_vbrstd ), _mm256_castps_pd( _mm512_cvtpd_ps( lcl_vhi ) ), 1 ) ); } #else lcl_vbrstd = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar, lcl_vsqrt_eps ) ) ); #endif _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16), lcl_vbmean ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16), lcl_vbrstd ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, fm, 0, 16), lcl_vvar ); } else { sum_img_ptr -= 16*nImg; sumsq_img_ptr -= 16*nImg; _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); } } libxsmm_barrier_wait(handle->barrier, ltid); } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { /* now we apply the actual forward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vgamma, lcl_vbeta, lcl_vbmean, lcl_vbrstd; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 16) ); lcl_vbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, 16) ); lcl_vbmean = 
_mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16) ); for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); #endif element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 2); #endif for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { __m512 lcl_vo; #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) __mmask16 lcl_relumask; #endif /* BN + scale (gamma, beta) */ lcl_vo = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); lcl_vo = _mm512_mul_ps( lcl_vgamma, lcl_vo ); lcl_vo = _mm512_fmadd_ps( lcl_vo, lcl_vbrstd, lcl_vbeta ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) lcl_vo = _mm512_add_ps( lcl_vo, _mm512_load_act( input_add_ptr ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) lcl_vo = _mm512_max_ps( lcl_vo, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask = _mm512_cmp_ps_mask( lcl_vo, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vo ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask ); relumask_ptr += 2; #endif _mm512_stream_act( output_ptr, lcl_vo ); input_ptr += sw*16; #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) input_add_ptr += sw*16; #endif output_ptr += 16; } } } libxsmm_barrier_wait(handle->barrier, ltid); } # undef _mm512_load_act # undef _mm512_stream_act # undef 
_mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c000066400000000000000000000340571415223013700320730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.partN; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph 
= handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; /* eps to avoid sqrt of zero */ const element_stats_type sqrt_eps = 1e-7f; const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 32); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, 32); #endif LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 32); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 32); LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, 32); LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 32); LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 32); LIBXSMM_VLA_DECL(2, element_stats_type, variance, (element_stats_type*)handle->variance->data, 32); LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, 32); LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * 32), nImg, 32); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 4); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { for ( 
imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vsum = _mm512_setzero_ps(); __m512 lcl_vsumsq = _mm512_setzero_ps(); __m512 lcl_vsum2 = _mm512_setzero_ps(); __m512 lcl_vsumsq2 = _mm512_setzero_ps(); element_stats_type* sum_img_ptr; element_stats_type* sumsq_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, 32); sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, 32); for ( hi=iph; hi < (ifh + iph); hi++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); for ( wi=ipw; wi < (ifw + ipw); wi++ ) { __m512 lcl_vinput = _mm512_load_act( input_ptr ); __m512 lcl_vinput2 = _mm512_load_act( input_ptr+16 ); lcl_vsum = _mm512_add_ps( lcl_vsum, lcl_vinput ); lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_mul_ps( lcl_vinput, lcl_vinput ) ); lcl_vsum2 = _mm512_add_ps( lcl_vsum2, lcl_vinput2 ); lcl_vsumsq2 = _mm512_add_ps( lcl_vsumsq2, _mm512_mul_ps( lcl_vinput2, lcl_vinput2 ) ); input_ptr += 32; } } _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); _mm512_storeu_ps( sum_img_ptr+16, lcl_vsum2 ); _mm512_storeu_ps( sumsq_img_ptr+16, lcl_vsumsq2 ); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we need to reduce the sum and sum^2, we use the final */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { __m512 lcl_vsum = _mm512_setzero_ps(); __m512 lcl_vsumsq = _mm512_setzero_ps(); __m512 lcl_vsum2 = _mm512_setzero_ps(); __m512 lcl_vsumsq2 = _mm512_setzero_ps(); element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, 0, 0, nImg, 32); element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, 0, 0, nImg, 32); for ( img=0; img < nImg; img++ ) { lcl_vsum = _mm512_add_ps( lcl_vsum, _mm512_loadu_ps( sum_img_ptr ) ); lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_loadu_ps( sumsq_img_ptr ) ); lcl_vsum2 = _mm512_add_ps( lcl_vsum2, _mm512_loadu_ps( 
sum_img_ptr+16 ) ); lcl_vsumsq2 = _mm512_add_ps( lcl_vsumsq2, _mm512_loadu_ps( sumsq_img_ptr+16 ) ); sum_img_ptr += 32; sumsq_img_ptr += 32; } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { __m512 lcl_vsqrt_eps = _mm512_set1_ps(sqrt_eps); __m512 lcl_vrec_nhw = _mm512_set1_ps(recp_nhw); __m512 lcl_vone = _mm512_set1_ps(1.0); __m512 lcl_vbmean, lcl_vbmeansq, lcl_vsqbmean, lcl_vbrstd, lcl_vvar; __m512 lcl_vbmean2, lcl_vbmeansq2, lcl_vsqbmean2, lcl_vbrstd2, lcl_vvar2; lcl_vbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum ); /* E(X) */ lcl_vbmeansq = _mm512_mul_ps( lcl_vbmean, lcl_vbmean ); /* E(X)^2 */ lcl_vsqbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq ); /* E(X^2) */ lcl_vvar = _mm512_sub_ps( lcl_vsqbmean, lcl_vbmeansq ); /* variance */ lcl_vbrstd = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar, lcl_vsqrt_eps ) ) ); lcl_vbmean2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum2 ); /* E(X) */ lcl_vbmeansq2 = _mm512_mul_ps( lcl_vbmean2, lcl_vbmean2 ); /* E(X)^2 */ lcl_vsqbmean2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq2 ); /* E(X^2) */ lcl_vvar2 = _mm512_sub_ps( lcl_vsqbmean2, lcl_vbmeansq2 ); /* variance */ lcl_vbrstd2 = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar2, lcl_vsqrt_eps ) ) ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32), lcl_vbmean ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32), lcl_vbrstd ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, fm, 0, 32), lcl_vvar ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32), lcl_vbmean2 ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32), lcl_vbrstd2 ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, fm, 16, 32), lcl_vvar2 ); } else { sum_img_ptr -= 32*nImg; sumsq_img_ptr -= 32*nImg; _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); _mm512_storeu_ps( sum_img_ptr+16, lcl_vsum2 ); _mm512_storeu_ps( 
sumsq_img_ptr+16, lcl_vsumsq2 ); } } libxsmm_barrier_wait(handle->barrier, ltid); } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { /* now we apply the actual forward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vgamma, lcl_vbeta, lcl_vbmean, lcl_vbrstd; __m512 lcl_vgamma2, lcl_vbeta2, lcl_vbmean2, lcl_vbrstd2; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 32) ); lcl_vbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, 32) ); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32) ); lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 32) ); lcl_vbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 16, 32) ); lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) ); lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) ); for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); #endif element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 4); #endif for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { __m512 lcl_vo; __m512 lcl_vo2; #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) __mmask16 lcl_relumask; __mmask16 lcl_relumask2; #endif /* BN + scale (gamma, beta) */ lcl_vo = _mm512_sub_ps( _mm512_load_act( 
input_ptr ), lcl_vbmean ); lcl_vo = _mm512_mul_ps( lcl_vgamma, lcl_vo ); lcl_vo = _mm512_fmadd_ps( lcl_vo, lcl_vbrstd, lcl_vbeta ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) lcl_vo = _mm512_add_ps( lcl_vo, _mm512_load_act( input_add_ptr ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) lcl_vo = _mm512_max_ps( lcl_vo, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask = _mm512_cmp_ps_mask( lcl_vo, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vo ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask ); relumask_ptr += 2; #endif /* BN + scale (gamma, beta) */ lcl_vo2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); lcl_vo2 = _mm512_mul_ps( lcl_vgamma2, lcl_vo2 ); lcl_vo2 = _mm512_fmadd_ps( lcl_vo2, lcl_vbrstd2, lcl_vbeta2 ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) lcl_vo2 = _mm512_add_ps( lcl_vo2, _mm512_load_act( input_add_ptr+16 ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) lcl_vo2 = _mm512_max_ps( lcl_vo2, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask2 = _mm512_cmp_ps_mask( lcl_vo2, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vo2 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask2 ); relumask_ptr += 2; #endif _mm512_stream_act( output_ptr, lcl_vo ); _mm512_stream_act( output_ptr+16, lcl_vo2 ); input_ptr += sw*32; #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) input_add_ptr += sw*32; #endif output_ptr += 32; } } } libxsmm_barrier_wait(handle->barrier, ltid); } # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act 
libxsmm-1.17/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c000066400000000000000000000410761415223013700320770ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.partN; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = 
handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm * 4; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; /* eps to avoid sqrt of zero */ const element_stats_type sqrt_eps = 1e-7f; const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 64); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, 64); #endif LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 64); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 64); LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, 64); LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 64); LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 64); LIBXSMM_VLA_DECL(2, element_stats_type, variance, (element_stats_type*)handle->variance->data, 64); LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, 64); LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * 64), nImg, 64); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 8); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { for ( 
imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vsum = _mm512_setzero_ps(); __m512 lcl_vsumsq = _mm512_setzero_ps(); __m512 lcl_vsum2 = _mm512_setzero_ps(); __m512 lcl_vsumsq2 = _mm512_setzero_ps(); __m512 lcl_vsum3 = _mm512_setzero_ps(); __m512 lcl_vsumsq3 = _mm512_setzero_ps(); __m512 lcl_vsum4 = _mm512_setzero_ps(); __m512 lcl_vsumsq4 = _mm512_setzero_ps(); element_stats_type* sum_img_ptr; element_stats_type* sumsq_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, 64); sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, 64); for ( hi=iph; hi < (ifh + iph); hi++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); for ( wi=ipw; wi < (ifw + ipw); wi++ ) { __m512 lcl_vinput = _mm512_load_act( input_ptr ); __m512 lcl_vinput2 = _mm512_load_act( input_ptr+16 ); __m512 lcl_vinput3 = _mm512_load_act( input_ptr+32 ); __m512 lcl_vinput4 = _mm512_load_act( input_ptr+48 ); lcl_vsum = _mm512_add_ps( lcl_vsum, lcl_vinput ); lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_mul_ps( lcl_vinput, lcl_vinput ) ); lcl_vsum2 = _mm512_add_ps( lcl_vsum2, lcl_vinput2 ); lcl_vsumsq2 = _mm512_add_ps( lcl_vsumsq2, _mm512_mul_ps( lcl_vinput2, lcl_vinput2 ) ); lcl_vsum3 = _mm512_add_ps( lcl_vsum3, lcl_vinput3 ); lcl_vsumsq3 = _mm512_add_ps( lcl_vsumsq3, _mm512_mul_ps( lcl_vinput3, lcl_vinput3 ) ); lcl_vsum4 = _mm512_add_ps( lcl_vsum4, lcl_vinput4 ); lcl_vsumsq4 = _mm512_add_ps( lcl_vsumsq4, _mm512_mul_ps( lcl_vinput4, lcl_vinput4 ) ); input_ptr += 64; } } _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); _mm512_storeu_ps( sum_img_ptr+16, lcl_vsum2 ); _mm512_storeu_ps( sumsq_img_ptr+16, lcl_vsumsq2 ); _mm512_storeu_ps( sum_img_ptr+32, lcl_vsum3 ); _mm512_storeu_ps( sumsq_img_ptr+32, lcl_vsumsq3 ); _mm512_storeu_ps( sum_img_ptr+48, lcl_vsum4 ); _mm512_storeu_ps( sumsq_img_ptr+48, lcl_vsumsq4 
); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we need to reduce the sum and sum^2, we use the final */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { __m512 lcl_vsum = _mm512_setzero_ps(); __m512 lcl_vsumsq = _mm512_setzero_ps(); element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, (fm/4), 0, ((fm%4)*16), nImg, 64); element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, (fm/4), 0, ((fm%4)*16), nImg, 64); for ( img=0; img < nImg; img++ ) { lcl_vsum = _mm512_add_ps( lcl_vsum, _mm512_loadu_ps( sum_img_ptr ) ); lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_loadu_ps( sumsq_img_ptr ) ); sum_img_ptr += 64; sumsq_img_ptr += 64; } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { __m512 lcl_vsqrt_eps = _mm512_set1_ps(sqrt_eps); __m512 lcl_vrec_nhw = _mm512_set1_ps(recp_nhw); __m512 lcl_vone = _mm512_set1_ps(1.0); __m512 lcl_vbmean, lcl_vbmeansq, lcl_vsqbmean, lcl_vbrstd, lcl_vvar; lcl_vbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum ); /* E(X) */ lcl_vbmeansq = _mm512_mul_ps( lcl_vbmean, lcl_vbmean ); /* E(X)^2 */ lcl_vsqbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq ); /* E(X^2) */ lcl_vvar = _mm512_sub_ps( lcl_vsqbmean, lcl_vbmeansq ); /* variance */ lcl_vbrstd = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar, lcl_vsqrt_eps ) ) ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, (fm/4), ((fm%4)*16), 64), lcl_vbmean ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, (fm/4), ((fm%4)*16), 64), lcl_vbrstd ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, (fm/4), ((fm%4)*16), 64), lcl_vvar ); } else { sum_img_ptr -= 64*nImg; sumsq_img_ptr -= 64*nImg; _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); } } libxsmm_barrier_wait(handle->barrier, ltid); } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { /* now we 
apply the actual forward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vgamma, lcl_vbeta, lcl_vbmean, lcl_vbrstd; __m512 lcl_vgamma2, lcl_vbeta2, lcl_vbmean2, lcl_vbrstd2; __m512 lcl_vgamma3, lcl_vbeta3, lcl_vbmean3, lcl_vbrstd3; __m512 lcl_vgamma4, lcl_vbeta4, lcl_vbmean4, lcl_vbrstd4; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 64) ); lcl_vbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, 64) ); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 64) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 64) ); lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 64) ); lcl_vbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 16, 64) ); lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 64) ); lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 64) ); lcl_vgamma3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 32, 64) ); lcl_vbeta3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 32, 64) ); lcl_vbmean3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 32, 64) ); lcl_vbrstd3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 32, 64) ); lcl_vgamma4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 48, 64) ); lcl_vbeta4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 48, 64) ); lcl_vbmean4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 48, 64) ); lcl_vbrstd4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 48, 64) ); for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); #endif element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, 
ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 8); #endif for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { __m512 lcl_vo; __m512 lcl_vo2; __m512 lcl_vo3; __m512 lcl_vo4; #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) __mmask16 lcl_relumask; __mmask16 lcl_relumask2; __mmask16 lcl_relumask3; __mmask16 lcl_relumask4; #endif /* BN + scale (gamma, beta) */ lcl_vo = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); lcl_vo = _mm512_mul_ps( lcl_vgamma, lcl_vo ); lcl_vo = _mm512_fmadd_ps( lcl_vo, lcl_vbrstd, lcl_vbeta ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) lcl_vo = _mm512_add_ps( lcl_vo, _mm512_load_act( input_add_ptr ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) lcl_vo = _mm512_max_ps( lcl_vo, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask = _mm512_cmp_ps_mask( lcl_vo, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vo ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask ); relumask_ptr += 2; #endif /* BN + scale (gamma, beta) */ lcl_vo2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); lcl_vo2 = _mm512_mul_ps( lcl_vgamma2, lcl_vo2 ); lcl_vo2 = _mm512_fmadd_ps( lcl_vo2, lcl_vbrstd2, lcl_vbeta2 ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) lcl_vo2 = _mm512_add_ps( lcl_vo2, _mm512_load_act( input_add_ptr+16 ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) lcl_vo2 = _mm512_max_ps( lcl_vo2, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask2 = _mm512_cmp_ps_mask( lcl_vo2, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vo2 ); 
LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask2 ); relumask_ptr += 2; #endif /* BN + scale (gamma, beta) */ lcl_vo3 = _mm512_sub_ps( _mm512_load_act( input_ptr+32 ), lcl_vbmean3 ); lcl_vo3 = _mm512_mul_ps( lcl_vgamma3, lcl_vo3 ); lcl_vo3 = _mm512_fmadd_ps( lcl_vo3, lcl_vbrstd3, lcl_vbeta3 ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) lcl_vo3 = _mm512_add_ps( lcl_vo3, _mm512_load_act( input_add_ptr+32 ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) lcl_vo3 = _mm512_max_ps( lcl_vo3, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask3 = _mm512_cmp_ps_mask( lcl_vo3, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo3 = _mm512_mask_blend_ps( lcl_relumask3, _mm512_setzero_ps(), lcl_vo3 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask3 ); relumask_ptr += 2; #endif /* BN + scale (gamma, beta) */ lcl_vo4 = _mm512_sub_ps( _mm512_load_act( input_ptr+48 ), lcl_vbmean4 ); lcl_vo4 = _mm512_mul_ps( lcl_vgamma4, lcl_vo4 ); lcl_vo4 = _mm512_fmadd_ps( lcl_vo4, lcl_vbrstd4, lcl_vbeta4 ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) lcl_vo4 = _mm512_add_ps( lcl_vo4, _mm512_load_act( input_add_ptr+48 ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) lcl_vo4 = _mm512_max_ps( lcl_vo4, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask4 = _mm512_cmp_ps_mask( lcl_vo4, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo4 = _mm512_mask_blend_ps( lcl_relumask4, _mm512_setzero_ps(), lcl_vo4 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask4 ); relumask_ptr += 2; #endif _mm512_stream_act( output_ptr, lcl_vo ); _mm512_stream_act( output_ptr+16, lcl_vo2 ); _mm512_stream_act( output_ptr+32, lcl_vo3 ); _mm512_stream_act( output_ptr+48, lcl_vo4 ); input_ptr += sw*64; #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) input_add_ptr += sw*64; #endif output_ptr += 
64; } } } libxsmm_barrier_wait(handle->barrier, ltid); } # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c000066400000000000000000000263561415223013700305250ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ /* size variables, all const */ const int nImg = handle->desc.partN; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; const int nFmBlock = handle->ifmblock; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; /* eps to avoid sqrt of zero */ const element_stats_type sqrt_eps = 1e-7f; const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int v = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, nFmBlock); #endif LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, nFmBlock); LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, nFmBlock); LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, nFmBlock); LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, nFmBlock); LIBXSMM_VLA_DECL(2, element_stats_type, variance, (element_stats_type*)handle->variance->data, nFmBlock); LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, nFmBlock); LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, 
((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, nFmBlock); #endif #if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) union libxsmm_bfloat16_hp input_f32; union libxsmm_bfloat16_hp output_f32; #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) union libxsmm_bfloat16_hp input_add_f32; input_add_f32.i[1] = 0; input_add_f32.i[0] = 0; #endif input_f32.i[1] = 0; input_f32.i[0] = 0; output_f32.i[1] = 0; output_f32.i[0] = 0; #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { /* @TODO check if we can bake this in into scratch */ element_stats_type lcl_sum_ptr[64]; element_stats_type lcl_sumsq_ptr[64]; element_stats_type* sum_img_ptr; element_stats_type* sumsq_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, nFmBlock); sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { lcl_sum_ptr[v] = (element_stats_type)0; lcl_sumsq_ptr[v] = (element_stats_type)0; } for ( hi=iph; hi < (ifh + iph); hi++ ) { for ( wi=ipw; wi < (ifw + ipw); wi++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); #if !defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) LIBXSMM_PRAGMA_SIMD #endif for (v=0; v < nFmBlock; v++) { #if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) input_f32.i[1] = input_ptr[v]; lcl_sum_ptr[v] += input_f32.f; lcl_sumsq_ptr[v] += (input_f32.f * input_f32.f); #else lcl_sum_ptr[v] 
+= input_ptr[v]; lcl_sumsq_ptr[v] += (input_ptr[v] * input_ptr[v]); #endif } } } LIBXSMM_PRAGMA_SIMD for (v=0; v < nFmBlock; v++) { sum_img_ptr[v] = lcl_sum_ptr[v]; sumsq_img_ptr[v] = lcl_sumsq_ptr[v]; } } libxsmm_barrier_wait(handle->barrier, ltid); /* now we need to reduce the sum and sum^2, we use the final */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { /* @TODO check if we can bake this in into scratch */ element_stats_type lcl_sum_ptr[64]; element_stats_type lcl_sumsq_ptr[64]; element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, nFmBlock); element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, nFmBlock); element_stats_type* tvar_ptr = &LIBXSMM_VLA_ACCESS(2, variance, fm, 0, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { lcl_sum_ptr[v] = (element_stats_type)0; lcl_sumsq_ptr[v] = (element_stats_type)0; } for ( img=0; img < nImg; img++ ) { element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, nFmBlock); element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { lcl_sum_ptr[v] += sum_img_ptr[v]; lcl_sumsq_ptr[v] += sumsq_img_ptr[v]; } } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { const element_stats_type tbmean = (recp_nhw * lcl_sum_ptr[v]); const element_stats_type tbmeansq = tbmean * tbmean; const element_stats_type tsqbmean = recp_nhw * lcl_sumsq_ptr[v]; const element_stats_type tvar = tsqbmean - tbmeansq; const element_stats_type tbrstd = (element_stats_type)(1.0/sqrt((double)tvar + sqrt_eps)); bmean_ptr[v] = tbmean; brstd_ptr[v] = tbrstd; tvar_ptr[v] = tvar; } } else { element_stats_type* sum_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, 0, 0, nImg, nFmBlock); element_stats_type* sumsq_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, 0, 0, nImg, nFmBlock); 
LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { sum_ptr[v] = lcl_sum_ptr[v]; sumsq_ptr[v] = lcl_sumsq_ptr[v]; } } } libxsmm_barrier_wait(handle->barrier, ltid); } if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { /* now we apply the actual forward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); #endif const element_stats_type* gamma_ptr = &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, nFmBlock); const element_stats_type* beta_ptr = &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, nFmBlock); const element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, nFmBlock); const element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, nFmBlock); element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); #endif float o; #if !defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) LIBXSMM_PRAGMA_SIMD #endif for (v = 0; v < nFmBlock; v++ ) { #if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) input_f32.i[1] = input_ptr[v]; o = gamma_ptr[v]*(input_f32.f - bmean_ptr[v])*brstd_ptr[v] + beta_ptr[v]; #else /* BN + scale (gamma, beta) */ o = gamma_ptr[v]*(input_ptr[v] - bmean_ptr[v])*brstd_ptr[v] + beta_ptr[v]; #endif /* Eltwise */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) #if 
defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) input_add_f32.i[1] = input_add_ptr[v]; o += input_add_f32.f; #else o += input_add_ptr[v]; #endif #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) o = ( o > 0.0f ) ? o : 0.0f; #endif #if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) o = ( o > 0.0f ) ? o : 0.0f; relumask_ptr[v] = (unsigned char)(o > 0.0f ? 1 : 0); #endif #if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) output_f32.f = o; output_ptr[v] = output_f32.i[1]; #else output_ptr[v] = o; #endif } } } } libxsmm_barrier_wait(handle->barrier, ltid); } libxsmm-1.17/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c000066400000000000000000000267721415223013700321510ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; const element_stats_type nhw = (element_stats_type)(handle->desc.N * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk 
size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 16); LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 16); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, 16); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 16); #endif LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 16); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 16); LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, 16); LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, 16); LIBXSMM_VLA_DECL(2, const 
element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 16); LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 16); LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 16); LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)16), nImg, 16); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 2); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); __m512 lcl_vbmean, lcl_vbrstd; element_stats_type* del_gamma_img_ptr; element_stats_type* del_beta_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 16); del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 16); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16) ); for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 2); #endif const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); 
element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { __m512 lcl_vdeloutput = _mm512_load_act( del_output_ptr ); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) const __mmask16 lcl_relumask = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); _mm512_store_act( del_output_ptr, lcl_vdeloutput ); output_ptr += 16; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) const __mmask16 lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); _mm512_store_act( del_output_ptr, lcl_vdeloutput ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput ); del_input_add_ptr += sw*16; #endif lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput ); input_ptr += sw*16; del_output_ptr += 16; } } _mm512_storeu_ps( del_gamma_img_ptr, lcl_vdgamma ); _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we need to reduce the del_gamm and del_beta */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 16); element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 16); __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); for ( img=0; img < nImg; img++ ) { lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( 
del_beta_img_ptr ) ); del_gamma_img_ptr += 16; del_beta_img_ptr += 16; } _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 16), lcl_vdgamma ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 16), lcl_vdbeta ); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we apply the actual backward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta; __m512 lcl_vnhw = _mm512_set1_ps( nhw ); __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw ); img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 16) ); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16) ); lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 16) ); lcl_vdbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 16) ); for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { __m512 lcl_vdelinput; lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma ); lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd ); lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, lcl_vdelinput ); lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput ); 
_mm512_stream_act( del_input_ptr, lcl_vdelinput ); del_input_ptr += sw*16; input_ptr += sw*16; del_output_ptr += 16; } } } libxsmm_barrier_wait(handle->barrier, ltid); # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c000066400000000000000000000356251415223013700321440ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #if defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const 
*/ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; const element_stats_type nhw = (element_stats_type)(handle->desc.N * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 32); LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 32); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, 32); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 32); #endif LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 32); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 32); LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, 32); LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, 32); LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 32); LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 32); LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 32); LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)32), nImg, 32); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 4); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vdgamma = _mm512_setzero_ps(); 
__m512 lcl_vdbeta = _mm512_setzero_ps(); __m512 lcl_vdgamma2 = _mm512_setzero_ps(); __m512 lcl_vdbeta2 = _mm512_setzero_ps(); __m512 lcl_vbmean, lcl_vbrstd; __m512 lcl_vbmean2, lcl_vbrstd2; element_stats_type* del_gamma_img_ptr; element_stats_type* del_beta_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 32); del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 32); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32) ); lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) ); lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) ); for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 4); #endif const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { __m512 lcl_vdeloutput, lcl_vdeloutput2; #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) __mmask16 lcl_relumask, lcl_relumask2; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) __mmask16 lcl_relumask, lcl_relumask2; #endif lcl_vdeloutput = _mm512_load_act( del_output_ptr ); #if 
defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) lcl_relumask = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); _mm512_store_act( del_output_ptr, lcl_vdeloutput ); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); _mm512_store_act( del_output_ptr, lcl_vdeloutput ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput ); #endif lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput ); lcl_vdeloutput2 = _mm512_load_act( del_output_ptr+16 ); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) lcl_relumask2 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+16 ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vdeloutput2 ); _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); output_ptr += 32; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) lcl_relumask2 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vdeloutput2 ); _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr+16, lcl_vdeloutput2 ); del_input_add_ptr += sw*32; #endif lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ), lcl_vdeloutput2 ), lcl_vbrstd2 ) ); lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, 
lcl_vdeloutput2 ); input_ptr += sw*32; del_output_ptr += 32; } } _mm512_storeu_ps( del_gamma_img_ptr, lcl_vdgamma ); _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); _mm512_storeu_ps( del_gamma_img_ptr+16, lcl_vdgamma2 ); _mm512_storeu_ps( del_beta_img_ptr+16, lcl_vdbeta2 ); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we need to reduce the del_gamm and del_beta */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 32); element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 32); __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); __m512 lcl_vdgamma2 = _mm512_setzero_ps(); __m512 lcl_vdbeta2 = _mm512_setzero_ps(); for ( img=0; img < nImg; img++ ) { lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_loadu_ps( del_gamma_img_ptr+16 ) ); lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, _mm512_loadu_ps( del_beta_img_ptr+16 ) ); del_gamma_img_ptr += 32; del_beta_img_ptr += 32; } _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 32), lcl_vdgamma ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 32), lcl_vdbeta ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 32), lcl_vdgamma2 ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 16, 32), lcl_vdbeta2 ); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we apply the actual backward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta; __m512 lcl_vgamma2, lcl_vbmean2, lcl_vbrstd2, lcl_vdgamma2, lcl_vdbeta2; __m512 lcl_vnhw = _mm512_set1_ps( nhw ); __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw ); img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, 
fm, 0, 32) ); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32) ); lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 32) ); lcl_vdbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 32) ); lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 32) ); lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) ); lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) ); lcl_vdgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 32) ); lcl_vdbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 16, 32) ); for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { __m512 lcl_vdelinput; __m512 lcl_vdelinput2; lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma ); lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd ); lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, lcl_vdelinput ); lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput ); lcl_vdelinput2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vdgamma2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vbrstd2 ); lcl_vdelinput2 = _mm512_add_ps( 
lcl_vdbeta2, lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+16 ) ), lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vbrstd2, lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vgamma2, lcl_vdelinput2 ); _mm512_stream_act( del_input_ptr, lcl_vdelinput ); _mm512_stream_act( del_input_ptr+16, lcl_vdelinput2 ); del_input_ptr += sw*32; input_ptr += sw*32; del_output_ptr += 32; } } } libxsmm_barrier_wait(handle->barrier, ltid); # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c000066400000000000000000000500051415223013700321360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; const element_stats_type nhw = (element_stats_type)(handle->desc.N * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk 
size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm * 4; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 64); LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 64); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, 64); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 64); #endif LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 64); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 64); LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, 64); LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, 64); LIBXSMM_VLA_DECL(2, 
const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 64); LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 64); LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 64); LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)64), nImg, 64); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 8); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); __m512 lcl_vdgamma2 = _mm512_setzero_ps(); __m512 lcl_vdbeta2 = _mm512_setzero_ps(); __m512 lcl_vdgamma3 = _mm512_setzero_ps(); __m512 lcl_vdbeta3 = _mm512_setzero_ps(); __m512 lcl_vdgamma4 = _mm512_setzero_ps(); __m512 lcl_vdbeta4 = _mm512_setzero_ps(); __m512 lcl_vbmean, lcl_vbrstd; __m512 lcl_vbmean2, lcl_vbrstd2; __m512 lcl_vbmean3, lcl_vbrstd3; __m512 lcl_vbmean4, lcl_vbrstd4; element_stats_type* del_gamma_img_ptr; element_stats_type* del_beta_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 64); del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 64); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 64) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 64) ); lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 64) ); lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 64) ); lcl_vbmean3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 32, 64) ); lcl_vbrstd3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 32, 64) ); lcl_vbmean4 = _mm512_loadu_ps( 
&LIBXSMM_VLA_ACCESS(2, bmean, fm, 48, 64) ); lcl_vbrstd4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 48, 64) ); for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 8); #endif const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { __m512 lcl_vdeloutput, lcl_vdeloutput2, lcl_vdeloutput3, lcl_vdeloutput4; #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) __mmask16 lcl_relumask, lcl_relumask2, lcl_relumask3, lcl_relumask4; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) __mmask16 lcl_relumask, lcl_relumask2, lcl_relumask3, lcl_relumask4; #endif lcl_vdeloutput = _mm512_load_act( del_output_ptr ); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) lcl_relumask = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); _mm512_store_act( del_output_ptr, lcl_vdeloutput ); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); _mm512_store_act( del_output_ptr, lcl_vdeloutput ); relumask_ptr += 2; #endif 
#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput ); #endif lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput ); lcl_vdeloutput2 = _mm512_load_act( del_output_ptr+16 ); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) lcl_relumask2 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+16 ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vdeloutput2 ); _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) lcl_relumask2 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vdeloutput2 ); _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr+16, lcl_vdeloutput2 ); #endif lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ), lcl_vdeloutput2 ), lcl_vbrstd2 ) ); lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdeloutput2 ); lcl_vdeloutput3 = _mm512_load_act( del_output_ptr+32 ); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) lcl_relumask3 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+32 ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); lcl_vdeloutput3 = _mm512_mask_blend_ps( lcl_relumask3, _mm512_setzero_ps(), lcl_vdeloutput3 ); _mm512_store_act( del_output_ptr+32, lcl_vdeloutput3 ); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) lcl_relumask3 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput3 = _mm512_mask_blend_ps( lcl_relumask3, _mm512_setzero_ps(), lcl_vdeloutput3 ); _mm512_store_act( del_output_ptr+32, 
lcl_vdeloutput3 ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr+32, lcl_vdeloutput3 ); #endif lcl_vdgamma3 = _mm512_add_ps( lcl_vdgamma3, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+32 ), lcl_vbmean3 ), lcl_vdeloutput3 ), lcl_vbrstd3 ) ); lcl_vdbeta3 = _mm512_add_ps( lcl_vdbeta3, lcl_vdeloutput3 ); lcl_vdeloutput4 = _mm512_load_act( del_output_ptr+48 ); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) lcl_relumask4 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+48 ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); lcl_vdeloutput4 = _mm512_mask_blend_ps( lcl_relumask4, _mm512_setzero_ps(), lcl_vdeloutput4 ); _mm512_store_act( del_output_ptr+48, lcl_vdeloutput4 ); output_ptr += 64; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) lcl_relumask4 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); lcl_vdeloutput4 = _mm512_mask_blend_ps( lcl_relumask4, _mm512_setzero_ps(), lcl_vdeloutput4 ); _mm512_store_act( del_output_ptr+48, lcl_vdeloutput4 ); relumask_ptr += 2; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) _mm512_stream_act( del_input_add_ptr+48, lcl_vdeloutput4 ); del_input_add_ptr += sw*64; #endif lcl_vdgamma4 = _mm512_add_ps( lcl_vdgamma4, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+48 ), lcl_vbmean4 ), lcl_vdeloutput4 ), lcl_vbrstd4 ) ); lcl_vdbeta4 = _mm512_add_ps( lcl_vdbeta4, lcl_vdeloutput4 ); input_ptr += sw*64; del_output_ptr += 64; } } _mm512_storeu_ps( del_gamma_img_ptr, lcl_vdgamma ); _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); _mm512_storeu_ps( del_gamma_img_ptr+16, lcl_vdgamma2 ); _mm512_storeu_ps( del_beta_img_ptr+16, lcl_vdbeta2 ); _mm512_storeu_ps( del_gamma_img_ptr+32, lcl_vdgamma3 ); _mm512_storeu_ps( del_beta_img_ptr+32, lcl_vdbeta3 ); _mm512_storeu_ps( del_gamma_img_ptr+48, lcl_vdgamma4 ); _mm512_storeu_ps( del_beta_img_ptr+48, lcl_vdbeta4 ); } libxsmm_barrier_wait(handle->barrier, ltid); /* 
now we need to reduce the del_gamm and del_beta */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, (fm/4), 0, ((fm%4)*16), nImg, 64); element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, (fm/4), 0, ((fm%4)*16), nImg, 64); __m512 lcl_vdgamma = _mm512_setzero_ps(); __m512 lcl_vdbeta = _mm512_setzero_ps(); for ( img=0; img < nImg; img++ ) { lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); del_gamma_img_ptr += 64; del_beta_img_ptr += 64; } _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, (fm/4), ((fm%4)*16), 64), lcl_vdgamma ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, (fm/4), ((fm%4)*16), 64), lcl_vdbeta ); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we apply the actual backward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta; __m512 lcl_vgamma2, lcl_vbmean2, lcl_vbrstd2, lcl_vdgamma2, lcl_vdbeta2; __m512 lcl_vgamma3, lcl_vbmean3, lcl_vbrstd3, lcl_vdgamma3, lcl_vdbeta3; __m512 lcl_vgamma4, lcl_vbmean4, lcl_vbrstd4, lcl_vdgamma4, lcl_vdbeta4; __m512 lcl_vnhw = _mm512_set1_ps( nhw ); __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw ); img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 64) ); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 64) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 64) ); lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 64) ); lcl_vdbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 64) ); lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 64) ); lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 64) ); lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 64) 
); lcl_vdgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 64) ); lcl_vdbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 16, 64) ); lcl_vgamma3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 32, 64) ); lcl_vbmean3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 32, 64) ); lcl_vbrstd3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 32, 64) ); lcl_vdgamma3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 32, 64) ); lcl_vdbeta3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 32, 64) ); lcl_vgamma4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 48, 64) ); lcl_vbmean4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 48, 64) ); lcl_vbrstd4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 48, 64) ); lcl_vdgamma4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 48, 64) ); lcl_vdbeta4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 48, 64) ); for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { __m512 lcl_vdelinput; __m512 lcl_vdelinput2; __m512 lcl_vdelinput3; __m512 lcl_vdelinput4; lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma ); lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd ); lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, lcl_vdelinput ); lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput ); lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput ); lcl_vdelinput = 
_mm512_mul_ps( lcl_vgamma, lcl_vdelinput ); lcl_vdelinput2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vdgamma2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vbrstd2 ); lcl_vdelinput2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+16 ) ), lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vbrstd2, lcl_vdelinput2 ); lcl_vdelinput2 = _mm512_mul_ps( lcl_vgamma2, lcl_vdelinput2 ); lcl_vdelinput3 = _mm512_sub_ps( _mm512_load_act( input_ptr+32 ), lcl_vbmean3 ); lcl_vdelinput3 = _mm512_mul_ps( lcl_vdelinput3, lcl_vdgamma3 ); lcl_vdelinput3 = _mm512_mul_ps( lcl_vdelinput3, lcl_vbrstd3 ); lcl_vdelinput3 = _mm512_add_ps( lcl_vdbeta3, lcl_vdelinput3 ); lcl_vdelinput3 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+32 ) ), lcl_vdelinput3 ); lcl_vdelinput3 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput3 ); lcl_vdelinput3 = _mm512_mul_ps( lcl_vbrstd3, lcl_vdelinput3 ); lcl_vdelinput3 = _mm512_mul_ps( lcl_vgamma3, lcl_vdelinput3 ); lcl_vdelinput4 = _mm512_sub_ps( _mm512_load_act( input_ptr+48 ), lcl_vbmean4 ); lcl_vdelinput4 = _mm512_mul_ps( lcl_vdelinput4, lcl_vdgamma4 ); lcl_vdelinput4 = _mm512_mul_ps( lcl_vdelinput4, lcl_vbrstd4 ); lcl_vdelinput4 = _mm512_add_ps( lcl_vdbeta4, lcl_vdelinput4 ); lcl_vdelinput4 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+48 ) ), lcl_vdelinput4 ); lcl_vdelinput4 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput4 ); lcl_vdelinput4 = _mm512_mul_ps( lcl_vbrstd4, lcl_vdelinput4 ); lcl_vdelinput4 = _mm512_mul_ps( lcl_vgamma4, lcl_vdelinput4 ); _mm512_stream_act( del_input_ptr, lcl_vdelinput ); _mm512_stream_act( del_input_ptr+16, lcl_vdelinput2 ); _mm512_stream_act( del_input_ptr+32, lcl_vdelinput3 ); _mm512_stream_act( del_input_ptr+48, lcl_vdelinput4 ); del_input_ptr 
+= sw*64; input_ptr += sw*64; del_output_ptr += 64; } } } libxsmm_barrier_wait(handle->barrier, ltid); # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c000066400000000000000000000311621415223013700305630ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ /* size variables, all const */ const int nImg = handle->desc.N; const int nG = handle->desc.G; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; const int nFmBlock = handle->ifmblock; /* derive channels per group */ const int nFmG = (nBlocksFm * nFmBlock) / nG; /* size of sample */ const element_stats_type ghw = (element_stats_type)(nFmG * ifh * ifw); const element_stats_type recp_ghw = 1.0f/ghw; const element_stats_type eps = 1e-7f; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ /* @TODO let's fix 
parallelization to include channel groups while avoiding conflict misses */ const int work = nImg; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; /* loop variables */ int img = 0; int fm = 0; /*int imgfm = 0;*/ int hi = 0; int wi = 0; int v = 0; int ho = 0; int wo = 0; int g = 0; LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, nFmBlock); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); #endif LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, nFmBlock); LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, 
(element_stats_type*)handle->grad_gamma->data, nFmBlock); LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, nFmBlock); LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, nG); LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, nG); LIBXSMM_VLA_DECL(2, const element_stats_type, variance, (element_stats_type*)handle->variance->data, nG); LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, nFmBlock); LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock); LIBXSMM_VLA_DECL(2, element_stats_type, d1_val_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * 2 * (size_t)nBlocksFm * (size_t)nFmBlock), nG); LIBXSMM_VLA_DECL(2, element_stats_type, d2_val_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * 2 * (size_t)nBlocksFm * (size_t)nFmBlock) + ((size_t)nImg*(size_t)nG), nG); #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, nFmBlock); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16) union libxsmm_bfloat16_hp input_f32; union libxsmm_bfloat16_hp del_input_f32; union libxsmm_bfloat16_hp del_output_f32; #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) union libxsmm_bfloat16_hp output_f32; output_f32.i[1] = 0; output_f32.i[0] = 0; #endif input_f32.i[1] = 0; input_f32.i[0] = 0; del_output_f32.i[1] = 0; del_output_f32.i[0] = 0; del_input_f32.i[1] = 0; del_input_f32.i[0] = 0; #endif assert( nFmBlock <= 64 ); /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for ( img = thr_begin; img < thr_end; ++img ) { element_stats_type* d1_val_img_ptr = &LIBXSMM_VLA_ACCESS(2, d1_val_img, img, 0, nG); element_stats_type* d2_val_img_ptr = 
&LIBXSMM_VLA_ACCESS(2, d2_val_img, img, 0, nG); for ( g = 0; g < nG; ++g ) { d1_val_img_ptr[g] = 0.0f; d2_val_img_ptr[g] = 0.0f; } for ( fm = 0; fm < nBlocksFm; ++fm ) { /* @TODO check if we can bake this in into scratch */ element_stats_type lcl_gamma_ptr[64]; element_stats_type lcl_beta_ptr[64]; element_stats_type* del_gamma_img_ptr; element_stats_type* del_beta_img_ptr; del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, nFmBlock); del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { lcl_gamma_ptr[v] = 0.0f; lcl_beta_ptr[v] = 0.0f; } for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); #endif const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); const element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, img, 0, nG); const element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, img, 0, nG); const element_stats_type* gamma_ptr = &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, nFmBlock); for ( v=0; v < nFmBlock; v++ ) { g = ((fm*nFmBlock)+v)/nFmG; #if defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16) del_output_f32.i[1] = del_output_ptr[v]; del_output_f32.i[0] = 0; #if 
defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) output_f32.i[1] = output_ptr[v]; del_output_f32.f = LIBXSMM_FEQ(output_f32.f, 0) ? 0 : del_output_f32.f; del_output_ptr[v] = del_output_f32.i[1]; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) del_output_ptr[v] = (element_output_type)(relumask_ptr[v] == 1 ? del_output_ptr[v] : 0); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) del_input_add_ptr[v] = del_output_ptr[v]; #endif input_f32.i[1] = input_ptr[v]; lcl_gamma_ptr[v] += (input_f32.f - bmean_ptr[g]) * del_output_f32.f * brstd_ptr[g]; lcl_beta_ptr[v] += del_output_f32.f; d1_val_img_ptr[g] += (input_f32.f - bmean_ptr[g]) * del_output_f32.f * gamma_ptr[v]; d2_val_img_ptr[g] += del_output_f32.f * gamma_ptr[v]; #else #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) del_output_ptr[v] = LIBXSMM_FEQ(output_ptr[v], 0) ? 0 : del_output_ptr[v]; #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) del_output_ptr[v] = (element_output_type)(relumask_ptr[v] == 1 ? 
del_output_ptr[v] : 0); #endif #if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) del_input_add_ptr[v] = del_output_ptr[v]; #endif lcl_gamma_ptr[v] += (input_ptr[v] - bmean_ptr[g]) * del_output_ptr[v] * brstd_ptr[g]; lcl_beta_ptr[v] += del_output_ptr[v]; d1_val_img_ptr[g] += (input_ptr[v] - bmean_ptr[g]) * del_output_ptr[v] * gamma_ptr[v]; d2_val_img_ptr[g] += del_output_ptr[v] * gamma_ptr[v]; #endif } } } LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { del_gamma_img_ptr[v] = lcl_gamma_ptr[v]; del_beta_img_ptr[v] = lcl_beta_ptr[v]; } } for ( fm = 0; fm < nBlocksFm; ++fm ) { for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); const element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, img, 0, nG); const element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, img, 0, nG); const element_stats_type* variance_ptr = &LIBXSMM_VLA_ACCESS(2, variance, img, 0, nG); const element_stats_type* gamma_ptr = &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, nFmBlock); #if 0 #if !defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16) LIBXSMM_PRAGMA_SIMD #endif #endif for ( v=0; v < nFmBlock; v++ ) { element_stats_type t0_val; g = ((fm*nFmBlock)+v)/nFmG; t0_val = brstd_ptr[g] * recp_ghw; #if defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16) del_output_f32.i[1] = del_output_ptr[v]; input_f32.i[1] = input_ptr[v]; del_input_f32.f = t0_val * ((gamma_ptr[v] * ghw * del_output_f32.f) - d2_val_img_ptr[g] - ((input_f32.f - bmean_ptr[g]) * d1_val_img_ptr[g] * (1.0f/(variance_ptr[g] + eps)))); del_input_ptr[v] = del_input_f32.i[1]; #else del_input_ptr[v] = t0_val * ((gamma_ptr[v] * ghw * 
del_output_ptr[v]) - d2_val_img_ptr[g] - ((input_ptr[v] - bmean_ptr[g]) * d1_val_img_ptr[g] * (1.0f/(variance_ptr[g] + eps)))); #endif } } } } } libxsmm_barrier_wait(handle->barrier, ltid); /* now we need to reduce the del_gamm and del_beta */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { element_stats_type* del_gamma_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, nFmBlock); element_stats_type* del_beta_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { del_gamma_ptr[v] = (element_stats_type)0; del_beta_ptr[v] = (element_stats_type)0; } for ( img=0; img < nImg; img++ ) { element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, nFmBlock); element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { del_gamma_ptr[v] += del_gamma_img_ptr[v]; del_beta_ptr[v] += del_beta_img_ptr[v]; } } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c000066400000000000000000000264271415223013700321520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; /* eps to avoid sqrt of zero */ const element_stats_type sqrt_eps = 1e-7f; const element_stats_type nhw = (element_stats_type)(handle->desc.N * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 16); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, 16); #endif LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 16); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 16); LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, 16); LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 16); LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 16); LIBXSMM_VLA_DECL(2, element_stats_type, variance, (element_stats_type*)handle->variance->data, 
16); LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, 16); LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * 16), nImg, 16); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 2); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vsum = _mm512_setzero_ps(); __m512 lcl_vsumsq = _mm512_setzero_ps(); element_stats_type* sum_img_ptr; element_stats_type* sumsq_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, 16); sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, 16); for ( hi=iph; hi < (ifh + iph); hi++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); for ( wi=ipw; wi < (ifw + ipw); wi++ ) { __m512 lcl_vinput = _mm512_load_act( input_ptr ); lcl_vsum = _mm512_add_ps( lcl_vsum, lcl_vinput ); lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_mul_ps( lcl_vinput, lcl_vinput ) ); input_ptr += 16; } } _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we need to reduce the sum and sum^2, we use the final */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { __m512 lcl_vsum = _mm512_setzero_ps(); __m512 lcl_vsumsq = _mm512_setzero_ps(); element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, 0, 0, nImg, 16); element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, 0, 0, nImg, 16); for ( img=0; img < nImg; img++ ) { lcl_vsum = _mm512_add_ps( lcl_vsum, _mm512_loadu_ps( sum_img_ptr ) ); lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_loadu_ps( sumsq_img_ptr ) ); sum_img_ptr += 16; 
sumsq_img_ptr += 16; } __m512 lcl_vsqrt_eps = _mm512_set1_ps(sqrt_eps); __m512 lcl_vrec_nhw = _mm512_set1_ps(recp_nhw); __m512 lcl_vone = _mm512_set1_ps(1.0); __m512 lcl_vbmean, lcl_vbmeansq, lcl_vsqbmean, lcl_vbrstd, lcl_vvar; lcl_vbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum ); /* E(X) */ lcl_vbmeansq = _mm512_mul_ps( lcl_vbmean, lcl_vbmean ); /* E(X)^2 */ lcl_vsqbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq ); /* E(X^2) */ lcl_vvar = _mm512_sub_ps( lcl_vsqbmean, lcl_vbmeansq ); /* variance */ #if 0 { __m512d lcl_voned = _mm512_set1_pd(1.0); __m512d lcl_vepsd = _mm512_set1_pd(1e-7); __m512d lcl_vlo = _mm512_cvtps_pd( _mm256_castpd_ps( _mm512_extractf64x4_pd( _mm512_castps_pd( lcl_vvar ), 0 ) ) ); __m512d lcl_vhi = _mm512_cvtps_pd( _mm256_castpd_ps( _mm512_extractf64x4_pd( _mm512_castps_pd( lcl_vvar ), 1 ) ) ); lcl_vlo = _mm512_sqrt_pd( _mm512_add_pd( lcl_vlo, lcl_vepsd ) ); lcl_vhi = _mm512_sqrt_pd( _mm512_add_pd( lcl_vhi, lcl_vepsd ) ); lcl_vlo = _mm512_div_pd( lcl_voned, lcl_vlo ); lcl_vhi = _mm512_div_pd( lcl_voned, lcl_vhi ); lcl_vbrstd = _mm512_castpd_ps( _mm512_insertf64x4( _mm512_setzero_pd(), _mm256_castps_pd( _mm512_cvtpd_ps( lcl_vlo ) ), 0 ) ); lcl_vbrstd = _mm512_castpd_ps( _mm512_insertf64x4( _mm512_castps_pd( lcl_vbrstd ), _mm256_castps_pd( _mm512_cvtpd_ps( lcl_vhi ) ), 1 ) ); } #else lcl_vbrstd = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar, lcl_vsqrt_eps ) ) ); #endif _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16), lcl_vbmean ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16), lcl_vbrstd ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, fm, 0, 16), lcl_vvar ); libxsmm_barrier_wait(handle->barrier, ltid); /* now we apply the actual forward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vgamma, lcl_vbeta, lcl_vbmean, lcl_vbrstd; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 16) ); lcl_vbeta = 
_mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, 16) ); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16) ); for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); #endif element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 2); #endif for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { __m512 lcl_vo; #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) __mmask16 lcl_relumask; #endif /* BN + scale (gamma, beta) */ lcl_vo = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); lcl_vo = _mm512_mul_ps( lcl_vgamma, lcl_vo ); lcl_vo = _mm512_fmadd_ps( lcl_vo, lcl_vbrstd, lcl_vbeta ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) lcl_vo = _mm512_add_ps( lcl_vo, _mm512_load_act( input_add_ptr ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) lcl_vo = _mm512_max_ps( lcl_vo, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask = _mm512_cmp_ps_mask( lcl_vo, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vo ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask ); relumask_ptr += 2; #endif _mm512_stream_act( output_ptr, lcl_vo ); input_ptr += sw*16; #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) input_add_ptr += sw*16; #endif output_ptr += 16; } } } libxsmm_barrier_wait(handle->barrier, 
ltid); } # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c000066400000000000000000000323221415223013700321370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = 
handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; /* eps to avoid sqrt of zero */ const element_stats_type sqrt_eps = 1e-7f; const element_stats_type nhw = (element_stats_type)(handle->desc.N * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 32); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, 32); #endif LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 32); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 32); LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, 32); LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 32); LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 32); LIBXSMM_VLA_DECL(2, element_stats_type, variance, (element_stats_type*)handle->variance->data, 32); LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, 32); LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * 32), nImg, 32); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 4); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vsum = _mm512_setzero_ps(); __m512 lcl_vsumsq = _mm512_setzero_ps(); __m512 lcl_vsum2 = _mm512_setzero_ps(); __m512 lcl_vsumsq2 = _mm512_setzero_ps(); 
element_stats_type* sum_img_ptr; element_stats_type* sumsq_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, 32); sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, 32); for ( hi=iph; hi < (ifh + iph); hi++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); for ( wi=ipw; wi < (ifw + ipw); wi++ ) { __m512 lcl_vinput = _mm512_load_act( input_ptr ); __m512 lcl_vinput2 = _mm512_load_act( input_ptr+16 ); lcl_vsum = _mm512_add_ps( lcl_vsum, lcl_vinput ); lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_mul_ps( lcl_vinput, lcl_vinput ) ); lcl_vsum2 = _mm512_add_ps( lcl_vsum2, lcl_vinput2 ); lcl_vsumsq2 = _mm512_add_ps( lcl_vsumsq2, _mm512_mul_ps( lcl_vinput2, lcl_vinput2 ) ); input_ptr += 32; } } _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); _mm512_storeu_ps( sum_img_ptr+16, lcl_vsum2 ); _mm512_storeu_ps( sumsq_img_ptr+16, lcl_vsumsq2 ); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we need to reduce the sum and sum^2, we use the final */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { __m512 lcl_vsum = _mm512_setzero_ps(); __m512 lcl_vsumsq = _mm512_setzero_ps(); __m512 lcl_vsum2 = _mm512_setzero_ps(); __m512 lcl_vsumsq2 = _mm512_setzero_ps(); element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, 0, 0, nImg, 32); element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, 0, 0, nImg, 32); for ( img=0; img < nImg; img++ ) { lcl_vsum = _mm512_add_ps( lcl_vsum, _mm512_loadu_ps( sum_img_ptr ) ); lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_loadu_ps( sumsq_img_ptr ) ); lcl_vsum2 = _mm512_add_ps( lcl_vsum2, _mm512_loadu_ps( sum_img_ptr+16 ) ); lcl_vsumsq2 = _mm512_add_ps( lcl_vsumsq2, _mm512_loadu_ps( sumsq_img_ptr+16 ) ); sum_img_ptr += 32; sumsq_img_ptr += 32; } __m512 lcl_vsqrt_eps = _mm512_set1_ps(sqrt_eps); __m512 lcl_vrec_nhw = 
_mm512_set1_ps(recp_nhw); __m512 lcl_vone = _mm512_set1_ps(1.0); __m512 lcl_vbmean, lcl_vbmeansq, lcl_vsqbmean, lcl_vbrstd, lcl_vvar; __m512 lcl_vbmean2, lcl_vbmeansq2, lcl_vsqbmean2, lcl_vbrstd2, lcl_vvar2; lcl_vbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum ); /* E(X) */ lcl_vbmeansq = _mm512_mul_ps( lcl_vbmean, lcl_vbmean ); /* E(X)^2 */ lcl_vsqbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq ); /* E(X^2) */ lcl_vvar = _mm512_sub_ps( lcl_vsqbmean, lcl_vbmeansq ); /* variance */ lcl_vbrstd = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar, lcl_vsqrt_eps ) ) ); lcl_vbmean2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum2 ); /* E(X) */ lcl_vbmeansq2 = _mm512_mul_ps( lcl_vbmean2, lcl_vbmean2 ); /* E(X)^2 */ lcl_vsqbmean2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq2 ); /* E(X^2) */ lcl_vvar2 = _mm512_sub_ps( lcl_vsqbmean2, lcl_vbmeansq2 ); /* variance */ lcl_vbrstd2 = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar2, lcl_vsqrt_eps ) ) ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32), lcl_vbmean ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32), lcl_vbrstd ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, fm, 0, 32), lcl_vvar ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32), lcl_vbmean2 ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32), lcl_vbrstd2 ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, fm, 16, 32), lcl_vvar2 ); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we apply the actual forward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vgamma, lcl_vbeta, lcl_vbmean, lcl_vbrstd; __m512 lcl_vgamma2, lcl_vbeta2, lcl_vbmean2, lcl_vbrstd2; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 32) ); lcl_vbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, 32) ); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32) ); lcl_vbrstd = _mm512_loadu_ps( 
&LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32) ); lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 32) ); lcl_vbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 16, 32) ); lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) ); lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) ); for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); #endif element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 4); #endif for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { __m512 lcl_vo; __m512 lcl_vo2; #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) __mmask16 lcl_relumask; __mmask16 lcl_relumask2; #endif /* BN + scale (gamma, beta) */ lcl_vo = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); lcl_vo = _mm512_mul_ps( lcl_vgamma, lcl_vo ); lcl_vo = _mm512_fmadd_ps( lcl_vo, lcl_vbrstd, lcl_vbeta ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) lcl_vo = _mm512_add_ps( lcl_vo, _mm512_load_act( input_add_ptr ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) lcl_vo = _mm512_max_ps( lcl_vo, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask = _mm512_cmp_ps_mask( lcl_vo, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vo ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask ); relumask_ptr += 2; #endif /* BN + scale (gamma, 
beta) */ lcl_vo2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); lcl_vo2 = _mm512_mul_ps( lcl_vgamma2, lcl_vo2 ); lcl_vo2 = _mm512_fmadd_ps( lcl_vo2, lcl_vbrstd2, lcl_vbeta2 ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) lcl_vo2 = _mm512_add_ps( lcl_vo2, _mm512_load_act( input_add_ptr+16 ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) lcl_vo2 = _mm512_max_ps( lcl_vo2, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask2 = _mm512_cmp_ps_mask( lcl_vo2, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vo2 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask2 ); relumask_ptr += 2; #endif _mm512_stream_act( output_ptr, lcl_vo ); _mm512_stream_act( output_ptr+16, lcl_vo2 ); input_ptr += sw*32; #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) input_add_ptr += sw*32; #endif output_ptr += 32; } } } libxsmm_barrier_wait(handle->barrier, ltid); # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c000066400000000000000000000375301415223013700321520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel, delta gamma and beta reduction */ const int work2 = nBlocksFm * 4; /* compute chunk size */ const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; /* eps to avoid sqrt of zero */ const element_stats_type sqrt_eps = 1e-7f; const element_stats_type nhw = (element_stats_type)(handle->desc.N * ifh * ifw); const element_stats_type recp_nhw = 1.0f/nhw; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int hi = 0; int wi = 0; int ho = 0; int wo = 0; LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 64); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, 64); #endif LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 64); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 64); LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, 64); LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 64); LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 64); LIBXSMM_VLA_DECL(2, element_stats_type, variance, 
(element_stats_type*)handle->variance->data, 64); LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, 64); LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * 64), nImg, 64); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 8); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vsum = _mm512_setzero_ps(); __m512 lcl_vsumsq = _mm512_setzero_ps(); __m512 lcl_vsum2 = _mm512_setzero_ps(); __m512 lcl_vsumsq2 = _mm512_setzero_ps(); __m512 lcl_vsum3 = _mm512_setzero_ps(); __m512 lcl_vsumsq3 = _mm512_setzero_ps(); __m512 lcl_vsum4 = _mm512_setzero_ps(); __m512 lcl_vsumsq4 = _mm512_setzero_ps(); element_stats_type* sum_img_ptr; element_stats_type* sumsq_img_ptr; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, 64); sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, 64); for ( hi=iph; hi < (ifh + iph); hi++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); for ( wi=ipw; wi < (ifw + ipw); wi++ ) { __m512 lcl_vinput = _mm512_load_act( input_ptr ); __m512 lcl_vinput2 = _mm512_load_act( input_ptr+16 ); __m512 lcl_vinput3 = _mm512_load_act( input_ptr+32 ); __m512 lcl_vinput4 = _mm512_load_act( input_ptr+48 ); lcl_vsum = _mm512_add_ps( lcl_vsum, lcl_vinput ); lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_mul_ps( lcl_vinput, lcl_vinput ) ); lcl_vsum2 = _mm512_add_ps( lcl_vsum2, lcl_vinput2 ); lcl_vsumsq2 = _mm512_add_ps( lcl_vsumsq2, _mm512_mul_ps( lcl_vinput2, lcl_vinput2 ) ); lcl_vsum3 = _mm512_add_ps( lcl_vsum3, lcl_vinput3 ); lcl_vsumsq3 = _mm512_add_ps( lcl_vsumsq3, _mm512_mul_ps( lcl_vinput3, lcl_vinput3 ) 
); lcl_vsum4 = _mm512_add_ps( lcl_vsum4, lcl_vinput4 ); lcl_vsumsq4 = _mm512_add_ps( lcl_vsumsq4, _mm512_mul_ps( lcl_vinput4, lcl_vinput4 ) ); input_ptr += 64; } } _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); _mm512_storeu_ps( sum_img_ptr+16, lcl_vsum2 ); _mm512_storeu_ps( sumsq_img_ptr+16, lcl_vsumsq2 ); _mm512_storeu_ps( sum_img_ptr+32, lcl_vsum3 ); _mm512_storeu_ps( sumsq_img_ptr+32, lcl_vsumsq3 ); _mm512_storeu_ps( sum_img_ptr+48, lcl_vsum4 ); _mm512_storeu_ps( sumsq_img_ptr+48, lcl_vsumsq4 ); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we need to reduce the sum and sum^2, we use the final */ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { __m512 lcl_vsum = _mm512_setzero_ps(); __m512 lcl_vsumsq = _mm512_setzero_ps(); element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, (fm/4), 0, ((fm%4)*16), nImg, 64); element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, (fm/4), 0, ((fm%4)*16), nImg, 64); for ( img=0; img < nImg; img++ ) { lcl_vsum = _mm512_add_ps( lcl_vsum, _mm512_loadu_ps( sum_img_ptr ) ); lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_loadu_ps( sumsq_img_ptr ) ); sum_img_ptr += 64; sumsq_img_ptr += 64; } __m512 lcl_vsqrt_eps = _mm512_set1_ps(sqrt_eps); __m512 lcl_vrec_nhw = _mm512_set1_ps(recp_nhw); __m512 lcl_vone = _mm512_set1_ps(1.0); __m512 lcl_vbmean, lcl_vbmeansq, lcl_vsqbmean, lcl_vbrstd, lcl_vvar; lcl_vbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum ); /* E(X) */ lcl_vbmeansq = _mm512_mul_ps( lcl_vbmean, lcl_vbmean ); /* E(X)^2 */ lcl_vsqbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq ); /* E(X^2) */ lcl_vvar = _mm512_sub_ps( lcl_vsqbmean, lcl_vbmeansq ); /* variance */ lcl_vbrstd = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar, lcl_vsqrt_eps ) ) ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, (fm/4), ((fm%4)*16), 64), lcl_vbmean ); _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, (fm/4), ((fm%4)*16), 64), lcl_vbrstd ); _mm512_storeu_ps( 
&LIBXSMM_VLA_ACCESS(2, variance, (fm/4), ((fm%4)*16), 64), lcl_vvar ); } libxsmm_barrier_wait(handle->barrier, ltid); /* now we apply the actual forward batch norm */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { __m512 lcl_vgamma, lcl_vbeta, lcl_vbmean, lcl_vbrstd; __m512 lcl_vgamma2, lcl_vbeta2, lcl_vbmean2, lcl_vbrstd2; __m512 lcl_vgamma3, lcl_vbeta3, lcl_vbmean3, lcl_vbrstd3; __m512 lcl_vgamma4, lcl_vbeta4, lcl_vbmean4, lcl_vbrstd4; img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 64) ); lcl_vbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, 64) ); lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 64) ); lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 64) ); lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 64) ); lcl_vbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 16, 64) ); lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 64) ); lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 64) ); lcl_vgamma3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 32, 64) ); lcl_vbeta3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 32, 64) ); lcl_vbmean3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 32, 64) ); lcl_vbrstd3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 32, 64) ); lcl_vgamma4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 48, 64) ); lcl_vbeta4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 48, 64) ); lcl_vbmean4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 48, 64) ); lcl_vbrstd4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 48, 64) ); for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, 
img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); #endif element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 8); #endif for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { __m512 lcl_vo; __m512 lcl_vo2; __m512 lcl_vo3; __m512 lcl_vo4; #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) __mmask16 lcl_relumask; __mmask16 lcl_relumask2; __mmask16 lcl_relumask3; __mmask16 lcl_relumask4; #endif /* BN + scale (gamma, beta) */ lcl_vo = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); lcl_vo = _mm512_mul_ps( lcl_vgamma, lcl_vo ); lcl_vo = _mm512_fmadd_ps( lcl_vo, lcl_vbrstd, lcl_vbeta ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) lcl_vo = _mm512_add_ps( lcl_vo, _mm512_load_act( input_add_ptr ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) lcl_vo = _mm512_max_ps( lcl_vo, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask = _mm512_cmp_ps_mask( lcl_vo, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vo ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask ); relumask_ptr += 2; #endif /* BN + scale (gamma, beta) */ lcl_vo2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); lcl_vo2 = _mm512_mul_ps( lcl_vgamma2, lcl_vo2 ); lcl_vo2 = _mm512_fmadd_ps( lcl_vo2, lcl_vbrstd2, lcl_vbeta2 ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) lcl_vo2 = _mm512_add_ps( lcl_vo2, _mm512_load_act( input_add_ptr+16 ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) lcl_vo2 = _mm512_max_ps( lcl_vo2, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask2 = _mm512_cmp_ps_mask( 
lcl_vo2, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vo2 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask2 ); relumask_ptr += 2; #endif /* BN + scale (gamma, beta) */ lcl_vo3 = _mm512_sub_ps( _mm512_load_act( input_ptr+32 ), lcl_vbmean3 ); lcl_vo3 = _mm512_mul_ps( lcl_vgamma3, lcl_vo3 ); lcl_vo3 = _mm512_fmadd_ps( lcl_vo3, lcl_vbrstd3, lcl_vbeta3 ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) lcl_vo3 = _mm512_add_ps( lcl_vo3, _mm512_load_act( input_add_ptr+32 ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) lcl_vo3 = _mm512_max_ps( lcl_vo3, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask3 = _mm512_cmp_ps_mask( lcl_vo3, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo3 = _mm512_mask_blend_ps( lcl_relumask3, _mm512_setzero_ps(), lcl_vo3 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask3 ); relumask_ptr += 2; #endif /* BN + scale (gamma, beta) */ lcl_vo4 = _mm512_sub_ps( _mm512_load_act( input_ptr+48 ), lcl_vbmean4 ); lcl_vo4 = _mm512_mul_ps( lcl_vgamma4, lcl_vo4 ); lcl_vo4 = _mm512_fmadd_ps( lcl_vo4, lcl_vbrstd4, lcl_vbeta4 ); /* eltwise add */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) lcl_vo4 = _mm512_add_ps( lcl_vo4, _mm512_load_act( input_add_ptr+48 ) ); #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) lcl_vo4 = _mm512_max_ps( lcl_vo4, _mm512_setzero_ps() ); #endif #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) lcl_relumask4 = _mm512_cmp_ps_mask( lcl_vo4, _mm512_setzero_ps(), _CMP_GT_OQ ); lcl_vo4 = _mm512_mask_blend_ps( lcl_relumask4, _mm512_setzero_ps(), lcl_vo4 ); LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask4 ); relumask_ptr += 2; #endif _mm512_stream_act( output_ptr, lcl_vo ); _mm512_stream_act( output_ptr+16, lcl_vo2 ); _mm512_stream_act( output_ptr+32, lcl_vo3 ); _mm512_stream_act( output_ptr+48, lcl_vo4 
); input_ptr += sw*64; #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) input_add_ptr += sw*64; #endif output_ptr += 64; } } } libxsmm_barrier_wait(handle->barrier, ltid); # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c000066400000000000000000000225051415223013700305700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ /* size variables, all const */ const int nImg = handle->desc.N; const int nG = handle->desc.G; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = ifh/sh; const int ofw = ifw/sw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; const int nFmBlock = handle->ifmblock; /* derive channels per group */ const int nFmG = (nBlocksFm * nFmBlock) / nG; /* size of sample */ const element_stats_type ghw = (element_stats_type)(nFmG * ifh * ifw); const element_stats_type recp_ghw = 1.0f/ghw; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in 
parallel */ /* @TODO let's fix parallelization to include channel groups while avoiding conflict misses */ const int work = nImg; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* eps to avoid sqrt of zero */ const element_stats_type sqrt_eps = 1e-7f; /* loop variables */ int img = 0; int fm = 0; /*int imgfm = 0;*/ int hi = 0; int wi = 0; int v = 0; int ho = 0; int wo = 0; int g = 0; LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, nFmBlock); #endif LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, nFmBlock); LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, nFmBlock); LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, nG); LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, nG); LIBXSMM_VLA_DECL(2, element_stats_type, variance, (element_stats_type*)handle->variance->data, nG); LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nBlocksFm, nFmBlock); LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nBlocksFm, nFmBlock); #if 
defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, nFmBlock); #endif #if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) union libxsmm_bfloat16_hp input_f32; union libxsmm_bfloat16_hp output_f32; #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) union libxsmm_bfloat16_hp input_add_f32; input_add_f32.i[1] = 0; input_add_f32.i[0] = 0; #endif input_f32.i[1] = 0; input_f32.i[0] = 0; output_f32.i[1] = 0; output_f32.i[0] = 0; #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for ( img = thr_begin; img < thr_end; ++img ) { element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, img, 0, nG); element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, img, 0, nG); element_stats_type* tvar_ptr = &LIBXSMM_VLA_ACCESS(2, variance, img, 0, nG); element_stats_type* sum_img_ptr = NULL; element_stats_type* sumsq_img_ptr = NULL; /* create reduction over all pixels per channel */ for ( fm = 0; fm < nBlocksFm; ++fm ) { /* @TODO check if we can bake this in into scratch */ element_stats_type lcl_sum_ptr[64]; element_stats_type lcl_sumsq_ptr[64]; sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, img, fm, 0, nBlocksFm, nFmBlock); sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, img, fm, 0, nBlocksFm, nFmBlock); LIBXSMM_PRAGMA_SIMD for ( v=0; v < nFmBlock; v++ ) { lcl_sum_ptr[v] = (element_stats_type)0; lcl_sumsq_ptr[v] = (element_stats_type)0; } for ( hi=iph; hi < (ifh + iph); hi++ ) { for ( wi=ipw; wi < (ifw + ipw); wi++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); #if !defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) LIBXSMM_PRAGMA_SIMD #endif for (v=0; v < nFmBlock; v++) { #if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) input_f32.i[1] = input_ptr[v]; lcl_sum_ptr[v] += input_f32.f; lcl_sumsq_ptr[v] += (input_f32.f * input_f32.f); #else lcl_sum_ptr[v] += input_ptr[v]; lcl_sumsq_ptr[v] += 
(input_ptr[v] * input_ptr[v]); #endif } } } LIBXSMM_PRAGMA_SIMD for (v=0; v < nFmBlock; v++) { sum_img_ptr[v] = lcl_sum_ptr[v]; sumsq_img_ptr[v] = lcl_sumsq_ptr[v]; } } /* new we compute mean, variance and rstd per channel group */ sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, img, 0, 0, nImg, nFmBlock); sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, img, 0, 0, nImg, nFmBlock); for ( g = 0; g < nG; ++g ) { element_stats_type lcl_fm_sum = 0.0f; element_stats_type lcl_fm_sumsq = 0.0f; for ( fm = g*nFmG; fm < (g+1)*nFmG; ++fm ) { lcl_fm_sum += sum_img_ptr[fm]; lcl_fm_sumsq += sumsq_img_ptr[fm]; } { const element_stats_type tbmean = (recp_ghw * lcl_fm_sum); const element_stats_type tbmeansq = tbmean * tbmean; const element_stats_type tsqbmean = recp_ghw * lcl_fm_sumsq; const element_stats_type tvar = tsqbmean - tbmeansq; const element_stats_type tbrstd = (element_stats_type)(1.0/sqrt((double)tvar + sqrt_eps)); bmean_ptr[g] = tbmean; brstd_ptr[g] = tbrstd; tvar_ptr[g] = tvar; } } /* let's scale the data */ for ( fm = 0; fm < nBlocksFm; ++fm ) { for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); #endif const element_stats_type* gamma_ptr = &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, nFmBlock); const element_stats_type* beta_ptr = &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, nFmBlock); element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); #endif float o; #if 0 #if 
!defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) LIBXSMM_PRAGMA_SIMD #endif #endif for (v = 0; v < nFmBlock; v++ ) { g = ((fm*nFmBlock)+v)/nFmG; #if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) input_f32.i[1] = input_ptr[v]; o = gamma_ptr[v]*(input_f32.f - bmean_ptr[g])*brstd_ptr[g] + beta_ptr[v]; #else /* BN + scale (gamma, beta) */ o = gamma_ptr[v]*(input_ptr[v] - bmean_ptr[g])*brstd_ptr[g] + beta_ptr[v]; #endif /* Eltwise */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) #if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) input_add_f32.i[1] = input_add_ptr[v]; o += input_add_f32.f; #else o += input_add_ptr[v]; #endif #endif /* ReLU */ #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) o = ( o > 0.0f ) ? o : 0.0f; #endif #if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) o = ( o > 0.0f ) ? o : 0.0f; relumask_ptr[v] = (unsigned char)(o > 0.0f ? 1 : 0); #endif #if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) output_f32.f = o; output_ptr[v] = output_f32.i[1]; #else output_ptr[v] = o; #endif } } } } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c000066400000000000000000000074341415223013700261420ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512) # define _mm512_load_fil(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) # define _mm512_store_fil(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16((B)),16))) #endif /* loop counters */ libxsmm_blasint i; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could run in parallel for the filters */ const int work = handle->desc.C * handle->desc.K; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; element_filter_type* filter = (element_filter_type*)handle->reg_filter->data; element_filter_type* dfilter = (element_filter_type*)handle->grad_filter->data; #if defined(LIBXSMM_DNN_OPTIMIZER_SGD_BF16) || defined(LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512) element_master_type* master = (element_master_type*)handle->master_filter->data; #endif /* lazy barrier init */ libxsmm_barrier_init( handle->barrier, ltid ); #if defined(LIBXSMM_DNN_OPTIMIZER_SGD_BF16) || defined(LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512) #if defined(LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512) { libxsmm_blasint iv = ( (thr_end-thr_begin)/16 ) * 16; /* compute iterations which are vectorizable */ __m512 vlr = _mm512_set1_ps( handle->desc.learning_rate ); for ( i = thr_begin; i < iv; i+=16 ) { __m512 newfilter = _mm512_sub_ps( _mm512_loadu_ps( master+i ), _mm512_mul_ps( vlr, _mm512_load_fil( dfilter + i ) ) ); _mm512_store_fil( filter+i, newfilter ); _mm512_storeu_ps( master+i, newfilter ); } for ( i = iv; i < thr_end; ++i ) { 
libxsmm_bfloat16_hp t1, t2; t1.i[0] =0; t1.i[1] = dfilter[i]; master[i] = master[i] - (handle->desc.learning_rate*t1.f); t2.f = master[i]; filter[i] = t2.i[1]; } } #undef _mm512_load_fil #undef _mm512_store_fil #else for ( i = thr_begin; i < thr_end; ++i ) { libxsmm_bfloat16_hp t1, t2; t1.i[0] =0; t1.i[1] = dfilter[i]; master[i] = master[i] - (handle->desc.learning_rate*t1.f); t2.f = master[i]; filter[i] = t2.i[1]; } #endif #else #if defined(LIBXSMM_DNN_OPTIMIZER_SGD_F32_AVX512) { libxsmm_blasint iv = ( (thr_end-thr_begin)/16 ) * 16; /* compute iterations which are vectorizable */ __m512 vlr = _mm512_set1_ps( handle->desc.learning_rate ); for ( i = thr_begin; i < iv; i+=16 ) { _mm512_storeu_ps( filter+i, _mm512_sub_ps( _mm512_loadu_ps( filter+i ), _mm512_mul_ps( vlr, _mm512_loadu_ps( dfilter + i ) ) ) ) ; } for ( i = iv; i < thr_end; ++i ) { filter[i] = filter[i] - (handle->desc.learning_rate*dfilter[i]); } } #else for ( i = thr_begin; i < thr_end; ++i ) { filter[i] = filter[i] - (handle->desc.learning_rate*dfilter[i]); } #endif #endif libxsmm_barrier_wait( handle->barrier, ltid ); libxsmm-1.17/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c000066400000000000000000000157471415223013700305410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; #if defined(LIBXSMM_DNN_POOLING_BWD_AVG) const int sh = handle->desc.u; const int sw = handle->desc.v; #endif const int ofh = handle->ofh; const int ofw = handle->ofw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int ho = 0; int wo = 0; int hi = 0; int wi = 0; int v = 0; #if defined(LIBXSMM_DNN_POOLING_BWD_AVG) int kh = 0; int kw = 0; #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); #else element_input_type recp_pool_size = 1.0f/((element_input_type)handle->desc.R*(element_input_type)handle->desc.S); #endif #endif /* multi-dim arrays declaration */ #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) float* lcl_buffer_ptr = ((float*)handle->scratch)+((size_t)ifh*(size_t)ifw*(size_t)16*(size_t)ltid); LIBXSMM_VLA_DECL(3, float, lcl_dinput, lcl_buffer_ptr, ifw, 16); #else element_output_type* lcl_buffer_ptr = ((element_input_type*)handle->scratch)+((size_t)ifh*(size_t)ifw*(size_t)16*(size_t)ltid); LIBXSMM_VLA_DECL(3, element_input_type, lcl_dinput, lcl_buffer_ptr, ifw, 16); #endif LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 16); LIBXSMM_VLA_DECL(5, const element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 16); #if defined(LIBXSMM_DNN_POOLING_BWD_MAX) LIBXSMM_VLA_DECL(5, const element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, 16); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; for ( v = 0; v < ifh*ifw*16; v += 16 ) { _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_setzero_ps() ); } #if defined(LIBXSMM_DNN_POOLING_BWD_MAX) for ( ho = oph; ho < (ofh+oph); ho++ ) { for ( wo = opw; wo < (ofw+opw); wo++ ) { const 
element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, 16); const element_mask_type* mask_ptr = &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 16); __m512 lcl_vdinput = _mm512_i32gather_ps( _mm512_loadu_si512( mask_ptr ), lcl_buffer_ptr, 4 ); lcl_vdinput = _mm512_add_ps( lcl_vdinput, _mm512_load_act( doutput_ptr ) ); _mm512_i32scatter_ps( lcl_buffer_ptr, _mm512_loadu_si512( mask_ptr ), lcl_vdinput, 4 ); } } #endif #if defined(LIBXSMM_DNN_POOLING_BWD_AVG) for ( ho = oph; ho < (ofh+oph); ho++ ) { hi = ((ho-oph) * sh) - handle->desc.pad_h; for ( wo = opw; wo < (ofw+opw); wo++ ) { wi = ((wo-opw) * sw) - handle->desc.pad_w; for ( kh = 0; kh < handle->desc.R; kh++ ) { if (hi+kh < 0 || hi+kh >= ifh) continue; for ( kw = 0; kw < handle->desc.S; kw++ ) { if (wi+kw < 0 || wi+kw >= ifw) { continue; } else { const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, 16); float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi+kh, wi+kw, 0, ifw, 16); const __m512 recp_pool_size_ps = _mm512_set1_ps( recp_pool_size ); const __m512 lcl_dinput_ps = _mm512_loadu_ps( lcl_dinput_ptr ); _mm512_storeu_ps( lcl_dinput_ptr, _mm512_fmadd_ps( _mm512_load_act( doutput_ptr ), recp_pool_size_ps, lcl_dinput_ps ) ); } } } } } #endif /* copy the local buffer into dinput activations */ for ( hi = iph; hi < (ifh+iph); hi++ ) { for ( wi = ipw; wi < (ifw+ipw); wi++ ) { element_input_type* dinput_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, 16); float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi-iph, wi-ipw, 0, ifw, 16); _mm512_stream_act( dinput_ptr, _mm512_loadu_ps( lcl_dinput_ptr ) ); } } } libxsmm_barrier_wait(handle->barrier, ltid); # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act 
libxsmm-1.17/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c000066400000000000000000000171101415223013700305210ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; #if defined(LIBXSMM_DNN_POOLING_BWD_AVG) const int sh = handle->desc.u; const int sw = handle->desc.v; #endif const int ofh = handle->ofh; const int 
ofw = handle->ofw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int ho = 0; int wo = 0; int hi = 0; int wi = 0; int v = 0; #if defined(LIBXSMM_DNN_POOLING_BWD_AVG) int kh = 0; int kw = 0; #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); #else element_input_type recp_pool_size = 1.0f/((element_input_type)handle->desc.R*(element_input_type)handle->desc.S); #endif #endif /* multi-dim arrays declaration */ #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) float* lcl_buffer_ptr = ((float*)handle->scratch)+((size_t)ifh*(size_t)ifw*(size_t)32*(size_t)ltid); LIBXSMM_VLA_DECL(3, float, lcl_dinput, lcl_buffer_ptr, ifw, 32); #else element_output_type* lcl_buffer_ptr = ((element_input_type*)handle->scratch)+((size_t)ifh*(size_t)ifw*(size_t)32*(size_t)ltid); LIBXSMM_VLA_DECL(3, element_input_type, lcl_dinput, lcl_buffer_ptr, ifw, 32); #endif LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 32); LIBXSMM_VLA_DECL(5, const element_output_type, doutput, 
(element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 32); #if defined(LIBXSMM_DNN_POOLING_BWD_MAX) LIBXSMM_VLA_DECL(5, const element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, 32); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; for( v = 0; v < ifh*ifw*32; v += 16 ) { _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_setzero_ps() ); } #if defined(LIBXSMM_DNN_POOLING_BWD_MAX) for( ho = oph; ho < (ofh+oph); ho++ ) { for( wo = opw; wo < (ofw+opw); wo++ ) { __m512 lcl_vdinput, lcl_vdinput2; const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, 32); const element_mask_type* mask_ptr = &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 32); lcl_vdinput = _mm512_i32gather_ps( _mm512_loadu_si512( mask_ptr ), lcl_buffer_ptr, 4 ); lcl_vdinput = _mm512_add_ps( lcl_vdinput, _mm512_load_act( doutput_ptr ) ); _mm512_i32scatter_ps( lcl_buffer_ptr, _mm512_loadu_si512( mask_ptr ), lcl_vdinput, 4 ); lcl_vdinput2 = _mm512_i32gather_ps( _mm512_loadu_si512( mask_ptr+16 ), lcl_buffer_ptr, 4 ); lcl_vdinput2 = _mm512_add_ps( lcl_vdinput2, _mm512_load_act( doutput_ptr+16 ) ); _mm512_i32scatter_ps( lcl_buffer_ptr, _mm512_loadu_si512( mask_ptr+16 ), lcl_vdinput2, 4 ); } } #endif #if defined(LIBXSMM_DNN_POOLING_BWD_AVG) for( ho = oph; ho < (ofh+oph); ho++ ) { hi = ((ho-oph) * sh) - handle->desc.pad_h; for( wo = opw; wo < (ofw+opw); wo++ ) { wi = ((wo-opw) * sw) - handle->desc.pad_w; for( kh = 0; kh < handle->desc.R; kh++ ) { if (hi+kh < 0 || hi+kh >= ifh) continue; for( kw = 0; kw < handle->desc.S; kw++ ) { if (wi+kw < 0 || wi+kw >= ifw) { continue; } else { const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, 32); float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, 
lcl_dinput, hi+kh, wi+kw, 0, ifw, 32); const __m512 recp_pool_size_ps = _mm512_set1_ps( recp_pool_size ); const __m512 lcl_dinput_ps = _mm512_loadu_ps( lcl_dinput_ptr ); const __m512 lcl_dinput_ps2 = _mm512_loadu_ps( lcl_dinput_ptr+16 ); _mm512_storeu_ps( lcl_dinput_ptr, _mm512_fmadd_ps( _mm512_load_act( doutput_ptr ), recp_pool_size_ps, lcl_dinput_ps ) ); _mm512_storeu_ps( lcl_dinput_ptr+16, _mm512_fmadd_ps( _mm512_load_act( doutput_ptr+16 ), recp_pool_size_ps, lcl_dinput_ps2 ) ); } } } } } #endif /* copy the local buffer into dinput activations */ for( hi = iph; hi < (ifh+iph); hi++ ) { for( wi = ipw; wi < (ifw+ipw); wi++ ) { element_input_type* dinput_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, 32); float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi-iph, wi-ipw, 0, ifw, 32); _mm512_stream_act( dinput_ptr, _mm512_loadu_ps( lcl_dinput_ptr ) ); _mm512_stream_act( dinput_ptr+16, _mm512_loadu_ps( lcl_dinput_ptr+16 ) ); } } } libxsmm_barrier_wait(handle->barrier, ltid); # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c000066400000000000000000000167601415223013700305400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; #if defined(LIBXSMM_DNN_POOLING_BWD_AVG) const int sh = handle->desc.u; const int sw = handle->desc.v; #endif const int ofh = handle->ofh; const int ofw = handle->ofw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm * 4; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) 
? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* loop variables */ int img = 0; int fm1 = 0; int fm2 = 0; int imgfm = 0; int ho = 0; int wo = 0; int hi = 0; int wi = 0; int v = 0; #if defined(LIBXSMM_DNN_POOLING_BWD_AVG) int kh = 0; int kw = 0; #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); #else element_input_type recp_pool_size = 1.0f/((element_input_type)handle->desc.R*(element_input_type)handle->desc.S); #endif #endif /* multi-dim arrays declaration */ #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) float* lcl_buffer_ptr = ((float*)handle->scratch)+((size_t)ifh*(size_t)ifw*(size_t)64*(size_t)ltid); LIBXSMM_VLA_DECL(3, float, lcl_dinput, lcl_buffer_ptr, ifw, 16); #else element_output_type* lcl_buffer_ptr = ((element_input_type*)handle->scratch)+((size_t)ifh*(size_t)ifw*(size_t)64*(size_t)ltid); LIBXSMM_VLA_DECL(3, element_input_type, lcl_dinput, lcl_buffer_ptr, ifw, 16); #endif LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 64); LIBXSMM_VLA_DECL(5, const element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 64); #if defined(LIBXSMM_DNN_POOLING_BWD_MAX) LIBXSMM_VLA_DECL(5, const element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, 64); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { img = imgfm / (nBlocksFm*4); fm1 = imgfm % (nBlocksFm*4); fm2 = imgfm % (nBlocksFm*4); fm1 = fm1/4; fm2 = (fm2%4)*16; for( v = 0; v < ifh*ifw*16; v += 16 ) { _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_setzero_ps() ); } #if defined(LIBXSMM_DNN_POOLING_BWD_MAX) for( ho 
= oph; ho < (ofh+oph); ho++ ) { for( wo = opw; wo < (ofw+opw); wo++ ) { __m512 lcl_vdinput/*, lcl_vdinput2, lcl_vdinput3, lcl_vdinput4*/; const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm1, ho, wo, fm2, nBlocksFm, ofhp, ofwp, 64); const element_mask_type* mask_ptr = &LIBXSMM_VLA_ACCESS(5, mask, img, fm1, ho-oph, wo-opw, fm2, nBlocksFm, ofh, ofw, 64); #if 1 lcl_vdinput = _mm512_i32gather_ps( _mm512_loadu_si512( mask_ptr ), lcl_buffer_ptr, 4 ); lcl_vdinput = _mm512_add_ps( lcl_vdinput, _mm512_load_act( doutput_ptr ) ); _mm512_i32scatter_ps( lcl_buffer_ptr, _mm512_loadu_si512( mask_ptr ), lcl_vdinput, 4 ); #else for ( v = 0; v < 16; ++v ) { #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) union libxsmm_bfloat16_hp del_output_f32; del_output_f32.i[1] = doutput_ptr[v]; del_output_f32.i[0] = 0; lcl_buffer_ptr[mask_ptr[v]] += del_output_f32.f; #else lcl_buffer_ptr[mask_ptr[v]] += doutput_ptr[v]; #endif } #endif } } #endif #if defined(LIBXSMM_DNN_POOLING_BWD_AVG) for( ho = oph; ho < (ofh+oph); ho++ ) { hi = ((ho-oph) * sh) - handle->desc.pad_h; for( wo = opw; wo < (ofw+opw); wo++ ) { wi = ((wo-opw) * sw) - handle->desc.pad_w; for( kh = 0; kh < handle->desc.R; kh++ ) { if (hi+kh < 0 || hi+kh >= ifh) continue; for( kw = 0; kw < handle->desc.S; kw++ ) { if (wi+kw < 0 || wi+kw >= ifw) { continue; } else { const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm1, ho, wo, fm2, nBlocksFm, ofhp, ofwp, 64); float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi+kh, wi+kw, 0, ifw, 16); const __m512 recp_pool_size_ps = _mm512_set1_ps( recp_pool_size ); const __m512 lcl_dinput_ps = _mm512_loadu_ps( lcl_dinput_ptr ); _mm512_storeu_ps( lcl_dinput_ptr, _mm512_fmadd_ps( _mm512_load_act( doutput_ptr ), recp_pool_size_ps, lcl_dinput_ps ) ); } } } } } #endif /* copy the local buffer into dinput activations */ for( hi = iph; hi < (ifh+iph); hi++ ) { for( wi = ipw; wi < (ifw+ipw); wi++ ) { element_input_type* dinput_ptr = 
&LIBXSMM_VLA_ACCESS(5, dinput, img, fm1, hi, wi, fm2, nBlocksFm, ifhp, ifwp, 64); float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi-iph, wi-ipw, 0, ifw, 16); _mm512_stream_act( dinput_ptr, _mm512_loadu_ps( lcl_dinput_ptr ) ); } } } libxsmm_barrier_wait(handle->barrier, ltid); # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c000066400000000000000000000161711415223013700271560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; #if defined(LIBXSMM_DNN_POOLING_BWD_AVG) const int sh = handle->desc.u; const int sw = handle->desc.v; #endif const int ofh = handle->ofh; const int ofw = handle->ofw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; const int nFmBlock = handle->ifmblock; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int 
chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int ho = 0; int wo = 0; int hi = 0; int wi = 0; int v = 0; #if defined(LIBXSMM_DNN_POOLING_BWD_AVG) int kh = 0; int kw = 0; #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); #else element_input_type recp_pool_size = 1.0f/((element_input_type)handle->desc.R*(element_input_type)handle->desc.S); #endif #endif /* multi-dim arrays declaration */ #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) float *const lcl_buffer_ptr = (float*)handle->scratch + (size_t)ifh*ifw*nFmBlock*ltid; LIBXSMM_VLA_DECL(3, float, lcl_dinput, lcl_buffer_ptr, ifw, nFmBlock); #else element_output_type *const lcl_buffer_ptr = (element_input_type*)handle->scratch + (size_t)ifh*ifw*nFmBlock*ltid; LIBXSMM_VLA_DECL(3, element_input_type, lcl_dinput, lcl_buffer_ptr, ifw, nFmBlock); #endif LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); LIBXSMM_VLA_DECL(5, const element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); #if defined(LIBXSMM_DNN_POOLING_BWD_MAX) LIBXSMM_VLA_DECL(5, const element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, nFmBlock); #endif #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) union libxsmm_bfloat16_hp del_input_f32; union libxsmm_bfloat16_hp del_output_f32; del_input_f32.i[1] = 0; del_input_f32.i[0] = 0; del_output_f32.i[1] = 0; del_output_f32.i[0] = 0; #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { img = 
imgfm / nBlocksFm; fm = imgfm % nBlocksFm; LIBXSMM_PRAGMA_SIMD for ( v = 0; v < ifh*ifw*nFmBlock; v++ ) { #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) lcl_buffer_ptr[v] = (float)0; #else lcl_buffer_ptr[v] = (element_input_type)0; #endif } #if defined(LIBXSMM_DNN_POOLING_BWD_MAX) for ( ho = oph; ho < (ofh+oph); ho++ ) { for ( wo = opw; wo < (ofw+opw); wo++ ) { const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); const element_mask_type* mask_ptr = &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, nFmBlock); #if !defined(LIBXSMM_DNN_POOLING_BWD_BF16) LIBXSMM_PRAGMA_SIMD #endif for ( v = 0; v < nFmBlock; v++ ) { #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) del_output_f32.i[1] = doutput_ptr[v]; lcl_buffer_ptr[mask_ptr[v]] += del_output_f32.f; #else lcl_buffer_ptr[mask_ptr[v]] += doutput_ptr[v]; #endif } } } #endif #if defined(LIBXSMM_DNN_POOLING_BWD_AVG) for ( ho = oph; ho < (ofh+oph); ho++ ) { hi = ((ho-oph) * sh) - handle->desc.pad_h; for ( wo = opw; wo < (ofw+opw); wo++ ) { wi = ((wo-opw) * sw) - handle->desc.pad_w; for ( kh = 0; kh < handle->desc.R; kh++ ) { if (hi+kh < 0 || hi+kh >= ifh) continue; for ( kw = 0; kw < handle->desc.S; kw++ ) { if (wi+kw < 0 || wi+kw >= ifw) { continue; } else { const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi+kh, wi+kw, 0, ifw, nFmBlock); #else element_input_type* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi+kh, wi+kw, 0, ifw, nFmBlock); #endif #if !defined(LIBXSMM_DNN_POOLING_BWD_BF16) LIBXSMM_PRAGMA_SIMD #endif for ( v = 0; v < nFmBlock; v++ ) { #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) del_output_f32.i[1] = doutput_ptr[v]; lcl_dinput_ptr[v] += (del_output_f32.f * recp_pool_size); #else lcl_dinput_ptr[v] += (doutput_ptr[v] * recp_pool_size); #endif 
} } } } } } #endif /* copy the local buffer into dinput activations */ for ( hi = iph; hi < (ifh+iph); hi++ ) { for ( wi = ipw; wi < (ifw+ipw); wi++ ) { element_input_type* dinput_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi-iph, wi-ipw, 0, ifw, nFmBlock); #else element_input_type* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi-iph, wi-ipw, 0, ifw, nFmBlock); #endif #if !defined(LIBXSMM_DNN_POOLING_BWD_BF16) LIBXSMM_PRAGMA_SIMD #endif for ( v = 0; v < nFmBlock; v++ ) { #if defined(LIBXSMM_DNN_POOLING_BWD_BF16) del_input_f32.f = lcl_dinput_ptr[v]; dinput_ptr[v] = del_input_f32.i[1]; #else dinput_ptr[v] = lcl_dinput_ptr[v]; #endif } } } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c000066400000000000000000000171701415223013700305350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = handle->ofh; const int ofw = handle->ofw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int ho = 0; int wo = 0; int hi = 0; int wi = 0; int kh = 0; int kw = 0; int v = 0; #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); #else element_output_type recp_pool_size = 1.0f/((element_output_type)handle->desc.R*(element_output_type)handle->desc.S); #endif #endif /* multi-dim arrays declaration */ #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) float* lcl_buffer_ptr = ((float*)handle->scratch)+((size_t)ofh*(size_t)ofw*(size_t)16*(size_t)ltid); LIBXSMM_VLA_DECL(3, float, lcl_output, lcl_buffer_ptr, ofw, 16); #else element_output_type* lcl_buffer_ptr = ((element_output_type*)handle->scratch)+((size_t)ofh*(size_t)ofw*(size_t)16*(size_t)ltid); LIBXSMM_VLA_DECL(3, element_output_type, lcl_output, lcl_buffer_ptr, ofw, 16); #endif LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 16); LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 16); #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) LIBXSMM_VLA_DECL(5, element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, 16); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) __m512i lcl_viadd = _mm512_set_epi32( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); #endif img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; for ( v = 0; v < ofh*ofw*16; v+=16 ) { #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) _mm512_storeu_ps( 
&(lcl_buffer_ptr[v]), _mm512_set1_ps(-FLT_MAX) ); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_setzero_ps() ); #endif } for ( ho = oph; ho < (ofh+oph); ho++ ) { hi = ((ho-oph) * sh) - handle->desc.pad_h; for ( wo = opw; wo < (ofw+opw); wo++ ) { float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, 16); #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) __m512i lcl_vmask = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 16) ); #endif __m512 lcl_voutput = _mm512_loadu_ps( lcl_output_ptr ); wi = ((wo-opw) * sw) - handle->desc.pad_w; for ( kh = 0; kh < handle->desc.R; kh++ ) { if (hi+kh < 0 || hi+kh >= ifh) continue; for ( kw = 0; kw < handle->desc.S; kw++ ) { if (wi+kw < 0 || wi+kw >= ifw) { continue; } else { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi+kh+iph, wi+kw+ipw, 0, nBlocksFm, ifhp, ifwp, 16); #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) __m512i lcl_vnewmask = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*16 + (wi+kw)*16) ); __m512 lcl_vinput = _mm512_load_act( input_ptr ); __mmask16 lcl_mlt = _mm512_cmp_ps_mask( lcl_voutput, lcl_vinput, _CMP_LT_OS ); lcl_voutput = _mm512_mask_blend_ps( lcl_mlt, lcl_voutput, lcl_vinput ); lcl_vmask = _mm512_mask_blend_epi32( lcl_mlt, lcl_vmask, lcl_vnewmask ); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) lcl_voutput = _mm512_add_ps( lcl_voutput, _mm512_load_act( input_ptr ) ); #endif } } } #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 16), lcl_vmask ); #endif _mm512_storeu_ps( lcl_output_ptr, lcl_voutput ); } } /* copy the local buffer into output activations */ for ( ho = oph; ho < (ofh+oph); ho++ ) { element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, 
lcl_output, ho-oph, 0, 0, ofw, 16); for ( wo = opw; wo < (ofw+opw); wo++ ) { #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) const __m512 recp_pool_size_ps = _mm512_set1_ps( recp_pool_size ); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) _mm512_stream_act( output_ptr, _mm512_loadu_ps( lcl_output_ptr ) ); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) _mm512_stream_act( output_ptr, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr ), recp_pool_size_ps ) ); #endif output_ptr += 16; lcl_output_ptr += 16; } } } libxsmm_barrier_wait(handle->barrier, ltid); # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c000066400000000000000000000213221415223013700305250ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = handle->ofh; const int ofw = handle->ofw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int ho = 0; int wo = 0; int hi = 0; int wi = 0; int kh = 0; int kw = 0; int v = 0; #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); #else element_output_type recp_pool_size = 1.0f/((element_output_type)handle->desc.R*(element_output_type)handle->desc.S); #endif #endif /* multi-dim arrays declaration */ #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) float* lcl_buffer_ptr = ((float*)handle->scratch)+((size_t)ofh*(size_t)ofw*(size_t)32*(size_t)ltid); LIBXSMM_VLA_DECL(3, float, lcl_output, lcl_buffer_ptr, ofw, 32); #else element_output_type* lcl_buffer_ptr = ((element_output_type*)handle->scratch)+((size_t)ofh*(size_t)ofw*(size_t)32*(size_t)ltid); LIBXSMM_VLA_DECL(3, element_output_type, lcl_output, lcl_buffer_ptr, ofw, 32); #endif LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 32); LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 32); #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) LIBXSMM_VLA_DECL(5, element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, 32); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) __m512i lcl_viadd = _mm512_set_epi32( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); #endif img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; for( v = 0; v < ofh*ofw*32; v+=16 ) { #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) _mm512_storeu_ps( 
&(lcl_buffer_ptr[v]), _mm512_set1_ps(-FLT_MAX) ); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_setzero_ps() ); #endif } for( ho = oph; ho < (ofh+oph); ho++ ) { hi = ((ho-oph) * sh) - handle->desc.pad_h; for( wo = opw; wo < (ofw+opw); wo++ ) { float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, 32); #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) __m512i lcl_vmask = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 32) ); __m512i lcl_vmask2 = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 16, nBlocksFm, ofh, ofw, 32) ); #endif __m512 lcl_voutput = _mm512_loadu_ps( lcl_output_ptr ); __m512 lcl_voutput2 = _mm512_loadu_ps( lcl_output_ptr+16 ); wi = ((wo-opw) * sw) - handle->desc.pad_w; for( kh = 0; kh < handle->desc.R; kh++ ) { if (hi+kh < 0 || hi+kh >= ifh) continue; for( kw = 0; kw < handle->desc.S; kw++ ) { if (wi+kw < 0 || wi+kw >= ifw) { continue; } else { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi+kh+iph, wi+kw+ipw, 0, nBlocksFm, ifhp, ifwp, 32); #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) __m512i lcl_vnewmask = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*32 + (wi+kw)*32) ); __m512i lcl_vnewmask2 = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*32 + (wi+kw)*32 + 16) ); __m512 lcl_vinput = _mm512_load_act( input_ptr ); __m512 lcl_vinput2 = _mm512_load_act( input_ptr+16 ); __mmask16 lcl_mlt = _mm512_cmp_ps_mask( lcl_voutput, lcl_vinput, _CMP_LT_OS ); __mmask16 lcl_mlt2 = _mm512_cmp_ps_mask( lcl_voutput2, lcl_vinput2, _CMP_LT_OS ); lcl_voutput = _mm512_mask_blend_ps( lcl_mlt, lcl_voutput, lcl_vinput ); lcl_voutput2 = _mm512_mask_blend_ps( lcl_mlt2, lcl_voutput2, lcl_vinput2 ); lcl_vmask = _mm512_mask_blend_epi32( lcl_mlt, lcl_vmask, lcl_vnewmask ); lcl_vmask2 = _mm512_mask_blend_epi32( lcl_mlt2, lcl_vmask2, lcl_vnewmask2 ); #endif #if 
defined(LIBXSMM_DNN_POOLING_FWD_AVG) lcl_voutput = _mm512_add_ps( lcl_voutput, _mm512_load_act( input_ptr ) ); lcl_voutput2 = _mm512_add_ps( lcl_voutput2, _mm512_load_act( input_ptr+16 ) ); #endif } } } #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 32), lcl_vmask ); _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 16, nBlocksFm, ofh, ofw, 32), lcl_vmask2 ); #endif _mm512_storeu_ps( lcl_output_ptr, lcl_voutput ); _mm512_storeu_ps( lcl_output_ptr+16, lcl_voutput2 ); } } /* copy the local buffer into output activations */ for( ho = oph; ho < (ofh+oph); ho++ ) { element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, 0, 0, ofw, 32); for( wo = opw; wo < (ofw+opw); wo++ ) { #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) const __m512 recp_pool_size_ps = _mm512_set1_ps( recp_pool_size ); _mm512_stream_act( output_ptr, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr ), recp_pool_size_ps ) ); _mm512_stream_act( output_ptr+16, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr+16 ), recp_pool_size_ps ) ); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) _mm512_stream_act( output_ptr, _mm512_loadu_ps( lcl_output_ptr ) ); _mm512_stream_act( output_ptr+16, _mm512_loadu_ps( lcl_output_ptr+16 ) ); #endif output_ptr += 32; lcl_output_ptr += 32; } } } libxsmm_barrier_wait(handle->barrier, ltid); # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act libxsmm-1.17/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c000066400000000000000000000256331415223013700305430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. 
* * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) # define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) #if 1 # define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) #else # define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) # define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) #endif #else # define _mm512_load_act(A) _mm512_loadu_ps(A) # define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) # define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) #endif /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = handle->ofh; const int ofw = handle->ofw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = 
handle->blocksifm; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int ho = 0; int wo = 0; int hi = 0; int wi = 0; int kh = 0; int kw = 0; int v = 0; #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); #else element_output_type recp_pool_size = 1.0f/((element_output_type)handle->desc.R*(element_output_type)handle->desc.S); #endif #endif /* multi-dim arrays declaration */ #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) float* lcl_buffer_ptr = ((float*)handle->scratch)+((size_t)ofh*(size_t)ofw*(size_t)64*(size_t)ltid); LIBXSMM_VLA_DECL(3, float, lcl_output, lcl_buffer_ptr, ofw, 64); #else element_output_type* lcl_buffer_ptr = ((element_output_type*)handle->scratch)+((size_t)ofh*(size_t)ofw*(size_t)64*(size_t)ltid); LIBXSMM_VLA_DECL(3, element_output_type, lcl_output, lcl_buffer_ptr, ofw, 64); #endif LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 64); LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 64); #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) LIBXSMM_VLA_DECL(5, element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, 64); #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { #if 
defined(LIBXSMM_DNN_POOLING_FWD_MAX) __m512i lcl_viadd = _mm512_set_epi32( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); #endif img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; for( v = 0; v < ofh*ofw*64; v+=16 ) { #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_set1_ps(-FLT_MAX) ); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_setzero_ps() ); #endif } for( ho = oph; ho < (ofh+oph); ho++ ) { hi = ((ho-oph) * sh) - handle->desc.pad_h; for( wo = opw; wo < (ofw+opw); wo++ ) { float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, 64); #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) __m512i lcl_vmask = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 64) ); __m512i lcl_vmask2 = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 16, nBlocksFm, ofh, ofw, 64) ); __m512i lcl_vmask3 = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 32, nBlocksFm, ofh, ofw, 64) ); __m512i lcl_vmask4 = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 48, nBlocksFm, ofh, ofw, 64) ); #endif __m512 lcl_voutput = _mm512_loadu_ps( lcl_output_ptr ); __m512 lcl_voutput2 = _mm512_loadu_ps( lcl_output_ptr+16 ); __m512 lcl_voutput3 = _mm512_loadu_ps( lcl_output_ptr+32 ); __m512 lcl_voutput4 = _mm512_loadu_ps( lcl_output_ptr+48 ); wi = ((wo-opw) * sw) - handle->desc.pad_w; for( kh = 0; kh < handle->desc.R; kh++ ) { if (hi+kh < 0 || hi+kh >= ifh) continue; for( kw = 0; kw < handle->desc.S; kw++ ) { if (wi+kw < 0 || wi+kw >= ifw) { continue; } else { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi+kh+iph, wi+kw+ipw, 0, nBlocksFm, ifhp, ifwp, 64); #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) __m512i lcl_vnewmask = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*16 + (wi+kw)*16) ); __m512i lcl_vnewmask2 = _mm512_add_epi32( 
lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*16 + (wi+kw)*16) ); __m512i lcl_vnewmask3 = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*16 + (wi+kw)*16) ); __m512i lcl_vnewmask4 = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*16 + (wi+kw)*16) ); __m512 lcl_vinput = _mm512_load_act( input_ptr ); __m512 lcl_vinput2 = _mm512_load_act( input_ptr+16 ); __m512 lcl_vinput3 = _mm512_load_act( input_ptr+32 ); __m512 lcl_vinput4 = _mm512_load_act( input_ptr+48 ); __mmask16 lcl_mlt = _mm512_cmp_ps_mask( lcl_voutput, lcl_vinput, _CMP_LT_OS ); __mmask16 lcl_mlt2 = _mm512_cmp_ps_mask( lcl_voutput2, lcl_vinput2, _CMP_LT_OS ); __mmask16 lcl_mlt3 = _mm512_cmp_ps_mask( lcl_voutput3, lcl_vinput3, _CMP_LT_OS ); __mmask16 lcl_mlt4 = _mm512_cmp_ps_mask( lcl_voutput4, lcl_vinput4, _CMP_LT_OS ); lcl_voutput = _mm512_mask_blend_ps( lcl_mlt, lcl_voutput, lcl_vinput ); lcl_voutput2 = _mm512_mask_blend_ps( lcl_mlt2, lcl_voutput2, lcl_vinput2 ); lcl_voutput3 = _mm512_mask_blend_ps( lcl_mlt3, lcl_voutput3, lcl_vinput3 ); lcl_voutput4 = _mm512_mask_blend_ps( lcl_mlt4, lcl_voutput4, lcl_vinput4 ); lcl_vmask = _mm512_mask_blend_epi32( lcl_mlt, lcl_vmask, lcl_vnewmask ); lcl_vmask2 = _mm512_mask_blend_epi32( lcl_mlt2, lcl_vmask2, lcl_vnewmask2 ); lcl_vmask3 = _mm512_mask_blend_epi32( lcl_mlt3, lcl_vmask3, lcl_vnewmask3 ); lcl_vmask4 = _mm512_mask_blend_epi32( lcl_mlt4, lcl_vmask4, lcl_vnewmask4 ); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) lcl_voutput = _mm512_add_ps( lcl_voutput, _mm512_load_act( input_ptr ) ); lcl_voutput2 = _mm512_add_ps( lcl_voutput2, _mm512_load_act( input_ptr+16 ) ); lcl_voutput3 = _mm512_add_ps( lcl_voutput3, _mm512_load_act( input_ptr+32 ) ); lcl_voutput4 = _mm512_add_ps( lcl_voutput4, _mm512_load_act( input_ptr+48 ) ); #endif } } } #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 64), lcl_vmask ); _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, 
ho-oph, wo-opw, 16, nBlocksFm, ofh, ofw, 64), lcl_vmask2 ); _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 32, nBlocksFm, ofh, ofw, 64), lcl_vmask3 ); _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 48, nBlocksFm, ofh, ofw, 64), lcl_vmask4 ); #endif _mm512_storeu_ps( lcl_output_ptr, lcl_voutput ); _mm512_storeu_ps( lcl_output_ptr+16, lcl_voutput2 ); _mm512_storeu_ps( lcl_output_ptr+32, lcl_voutput3 ); _mm512_storeu_ps( lcl_output_ptr+48, lcl_voutput4 ); } } /* copy the local buffer into output activations */ for( ho = oph; ho < (ofh+oph); ho++ ) { element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, 0, 0, ofw, 64); for( wo = opw; wo < (ofw+opw); wo++ ) { #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) const __m512 recp_pool_size_ps = _mm512_set1_ps( recp_pool_size ); _mm512_stream_act( output_ptr, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr ), recp_pool_size_ps ) ); _mm512_stream_act( output_ptr+16, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr+16 ), recp_pool_size_ps ) ); _mm512_stream_act( output_ptr+32, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr+32 ), recp_pool_size_ps ) ); _mm512_stream_act( output_ptr+48, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr+48 ), recp_pool_size_ps ) ); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) _mm512_stream_act( output_ptr, _mm512_loadu_ps( lcl_output_ptr ) ); _mm512_stream_act( output_ptr+16, _mm512_loadu_ps( lcl_output_ptr+16 ) ); _mm512_stream_act( output_ptr+32, _mm512_loadu_ps( lcl_output_ptr+32 ) ); _mm512_stream_act( output_ptr+48, _mm512_loadu_ps( lcl_output_ptr+48 ) ); #endif output_ptr += 64; lcl_output_ptr += 64; } } } libxsmm_barrier_wait(handle->barrier, ltid); # undef _mm512_load_act # undef _mm512_stream_act # undef _mm512_store_act 
libxsmm-1.17/src/template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c000066400000000000000000000167431415223013700271670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) ******************************************************************************/ /* size variables, all const */ const int nImg = handle->desc.N; const int ifh = handle->desc.H; const int ifw = handle->desc.W; const int sh = handle->desc.u; const int sw = handle->desc.v; const int ofh = handle->ofh; const int ofw = handle->ofw; const int iph = handle->desc.pad_h_in; const int ipw = handle->desc.pad_w_in; const int oph = handle->desc.pad_h_out; const int opw = handle->desc.pad_w_out; const int ofhp = ofh + 2*oph; const int ofwp = ofw + 2*opw; const int ifhp = ifh + 2*iph; const int ifwp = ifw + 2*ipw; /* here we assume that input and output blocking is similar */ const int nBlocksFm = handle->blocksifm; const int nFmBlock = handle->ifmblock; /* computing first logical thread */ const int ltid = tid - start_thread; /* number of tasks that could be run in parallel */ const int work = nImg * nBlocksFm; /* compute chunk size */ const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; /* loop variables */ int img = 0; int fm = 0; int imgfm = 0; int ho = 0; int wo = 0; int hi = 0; int wi = 0; int kh = 0; int kw = 0; int v = 0; #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); #else element_output_type recp_pool_size = 1.0f/((element_output_type)handle->desc.R*(element_output_type)handle->desc.S); #endif #endif /* multi-dim arrays declaration */ #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) float *const lcl_buffer_ptr = (float*)handle->scratch + (size_t)ofh*ofw*nFmBlock*ltid; LIBXSMM_VLA_DECL(3, float, lcl_output, lcl_buffer_ptr, ofw, nFmBlock); #else element_output_type *const lcl_buffer_ptr = (element_output_type*)handle->scratch + (size_t)ofh*ofw*nFmBlock*ltid; LIBXSMM_VLA_DECL(3, element_output_type, lcl_output, lcl_buffer_ptr, ofw, nFmBlock); #endif LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) LIBXSMM_VLA_DECL(5, element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, nFmBlock); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) union libxsmm_bfloat16_hp input_f32; union libxsmm_bfloat16_hp output_f32; input_f32.i[1] = 0; input_f32.i[0] = 0; output_f32.i[1] = 0; output_f32.i[0] = 0; #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, ltid); for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { img = imgfm / nBlocksFm; fm = imgfm % nBlocksFm; LIBXSMM_PRAGMA_SIMD for ( v = 0; v < ofh*ofw*nFmBlock; v++ ) { #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) lcl_buffer_ptr[v] = -FLT_MAX; #endif #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) lcl_buffer_ptr[v] = (float)0.0; #else 
lcl_buffer_ptr[v] = (element_output_type)0.0; #endif #endif } for ( ho = oph; ho < (ofh+oph); ho++ ) { hi = ((ho-oph) * sh) - handle->desc.pad_h; for ( wo = opw; wo < (ofw+opw); wo++ ) { wi = ((wo-opw) * sw) - handle->desc.pad_w; for ( kh = 0; kh < handle->desc.R; kh++ ) { if (hi+kh < 0 || hi+kh >= ifh) continue; for ( kw = 0; kw < handle->desc.S; kw++ ) { if (wi+kw < 0 || wi+kw >= ifw) { continue; } else { const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi+kh+iph, wi+kw+ipw, 0, nBlocksFm, ifhp, ifwp, nFmBlock); #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, nFmBlock); #else element_output_type* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, nFmBlock); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) const int idx = (hi+kh)*ifw*nFmBlock + (wi+kw)*nFmBlock; element_mask_type* mask_ptr = &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, nFmBlock); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) for ( v = 0; v < nFmBlock; v++ ) { input_f32.i[1] = input_ptr[v]; #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) if ( input_f32.f > lcl_output_ptr[v] ) { lcl_output_ptr[v] = input_f32.f; mask_ptr[v] = idx + v; } #endif #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) lcl_output_ptr[v] += input_f32.f; #endif } #else LIBXSMM_PRAGMA_SIMD for ( v = 0; v < nFmBlock; v++ ) { #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) if ( input_ptr[v] > lcl_output_ptr[v] ) { lcl_output_ptr[v] = input_ptr[v]; mask_ptr[v] = idx + v; } #endif #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) lcl_output_ptr[v] += input_ptr[v]; #endif } #endif } } } } } /* copy the local buffer into output activations */ for ( ho = oph; ho < (ofh+oph); ho++ ) { for ( wo = opw; wo < (ofw+opw); wo++ ) { element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) float* lcl_output_ptr = 
&LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, nFmBlock); #else element_output_type* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, nFmBlock); #endif #if defined(LIBXSMM_DNN_POOLING_FWD_BF16) for ( v = 0; v < nFmBlock; v++ ) { #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) output_f32.f = lcl_output_ptr[v]; #endif #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) output_f32.f = lcl_output_ptr[v] * recp_pool_size; #endif output_ptr[v] = output_f32.i[1]; } #else LIBXSMM_PRAGMA_SIMD for ( v = 0; v < nFmBlock; v++ ) { #if defined(LIBXSMM_DNN_POOLING_FWD_MAX) output_ptr[v] = lcl_output_ptr[v]; #endif #if defined(LIBXSMM_DNN_POOLING_FWD_AVG) output_ptr[v] = lcl_output_ptr[v] * recp_pool_size; #endif } #endif } } } libxsmm_barrier_wait(handle->barrier, ltid); libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_ck_generic.tpl.c000066400000000000000000000772551415223013700303070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.) 
******************************************************************************/ /* helper variables */ libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; /* tensor dimensions */ libxsmm_blasint K = handle->desc.K; libxsmm_blasint N = handle->desc.N; libxsmm_blasint C = handle->desc.C; libxsmm_blasint t = handle->T; libxsmm_blasint bk = handle->bk; libxsmm_blasint bn = handle->bn; libxsmm_blasint bc = handle->bc; libxsmm_blasint K3 = K * 3; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; const libxsmm_blasint nBlocks = N/bn; unsigned long long blocks; /* tensor raw pointers */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *w = (element_filter_type*)handle->w->data; element_filter_type *r = (element_filter_type*)handle->r->data; element_output_type *ht = (element_output_type*)(handle->ht ? handle->ht->data : NULL); element_output_type *it = (element_output_type*)handle->it->data; element_output_type *ct = (element_output_type*)handle->cit->data; element_output_type *ft = (element_output_type*)handle->ft->data; element_output_type *ot = (element_output_type*)handle->ot->data; element_input_type *dxt = (element_input_type* )handle->dxt->data; element_input_type *dhpD = (element_input_type* )handle->dhp->data; element_filter_type *dw = (element_filter_type*)handle->dw->data; element_filter_type *dr = (element_filter_type*)handle->dr->data; element_output_type *db = (element_output_type*)handle->db->data; element_output_type *dht = (element_output_type*)handle->dht->data; element_output_type *diD = (element_output_type*)handle->scratch_di; element_output_type *dcD = (element_output_type*)handle->scratch_dci; element_output_type *dfD = (element_output_type*)handle->scratch_df; element_output_type *doD = (element_output_type*)handle->scratch_do; element_output_type *doutD = 
(element_output_type*)handle->scratch_deltat; element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; element_output_type *scratch_oT = (element_output_type*)handle->scratch_dpB; element_filter_type *w_scratch = (element_filter_type*)handle->scratch_w; element_filter_type *r_scratch = (element_filter_type*)handle->scratch_r; element_filter_type *wiD = &(w[0]); element_filter_type *wcD = &(w[K]); element_filter_type *wfD = &(w[2*K]); element_filter_type *riD = &(r[0]); element_filter_type *rcD = &(r[K]); element_filter_type *rfD = &(r[2*K]); element_filter_type *dwiD = &(dw[0]); element_filter_type *dwcD = &(dw[K]); element_filter_type *dwfD = &(dw[2*K]); element_filter_type *driD = &(dr[0]); element_filter_type *drcD = &(dr[K]); element_filter_type *drfD = &(dr[2*K]); element_filter_type *dwiD_scratch = &(w_scratch[0]); element_filter_type *dwcD_scratch = &(w_scratch[C*K]); element_filter_type *dwfD_scratch = &(w_scratch[2*C*K]); element_filter_type *driD_scratch = &(r_scratch[0]); element_filter_type *drcD_scratch = &(r_scratch[K*K]); element_filter_type *drfD_scratch = &(r_scratch[2*K*K]); element_output_type *dbi = &(db[0]); element_output_type *dbc = &(db[K]); element_output_type *dbf = &(db[2*K]); element_filter_type *scratch_wiT = &(scratch_wT[0]); element_filter_type *scratch_wcT = &(scratch_wT[C*K]); element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); element_filter_type *scratch_riT = &(scratch_rT[0]); element_filter_type *scratch_rcT = &(scratch_rT[K*K]); element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); element_output_type *t1D = (element_output_type*)handle->scratch_t1; element_output_type *t2D = (element_output_type*)handle->scratch_t2; /* multidimensional arrays */ LIBXSMM_VLA_DECL(2, 
element_output_type, t1, t1D, K); LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K); LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(2, element_filter_type, wi, wiD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, wc, wcD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, wf, wfD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, ri, riD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, rc, rcD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, rf, rfD, K3); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); LIBXSMM_VLA_DECL(3, element_output_type, c, ct, N, K); LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); LIBXSMM_VLA_DECL(3, element_input_type, dx, dxt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); LIBXSMM_VLA_DECL(4, element_filter_type, dwi, dwiD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dwc, dwcD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dwf, dwfD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dri, driD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, drc, drcD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, drf, drfD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(2, element_filter_type, dwi_ck, dwiD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, dwc_ck, dwcD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, dwf_ck, dwfD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, dri_ck, driD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, drc_ck, drcD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, drf_ck, drfD, K3); LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); LIBXSMM_VLA_DECL(2, element_output_type, dc, dcD, K); LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); LIBXSMM_VLA_DECL(2, element_output_type, 
dp, doD, K); LIBXSMM_VLA_DECL(2, element_output_type, dout, doutD, K); LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); LIBXSMM_VLA_DECL(4, element_filter_type, wiT, scratch_wiT, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, wcT, scratch_wcT, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, wfT, scratch_wfT, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, riT, scratch_riT, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rcT, scratch_rcT, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rfT, scratch_rfT, kBlocks, bk, bk); LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); LIBXSMM_VLA_DECL(2, element_output_type, oT, scratch_oT, N); element_output_type *dout_ptr = NULL; /* define batch-reduce gemm kernels */ const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, NULL, NULL, NULL ); #if 0 const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL ); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL ); #endif const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb1 = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &K, &N, &bk, NULL, NULL, NULL, NULL ); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc1 = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &K, &N, &bk, NULL, NULL, NULL, NULL ); const libxsmm_smmfunction_reducebatch_addr batchreduce_kerneld = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL ); /* Auxiliary arrays for batch-reduce gemm calls */ const element_filter_type *A_array[1024]; const element_output_type *B_array[1024]; #if 0 LIBXSMM_VLA_DECL(4, element_output_type, diB, (element_output_type*)handle->scratch_diB, kBlocks, bn, bk); 
LIBXSMM_VLA_DECL(4, element_output_type, dcB, (element_output_type*)handle->scratch_dciB, kBlocks, bn, bk); LIBXSMM_VLA_DECL(4, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, kBlocks, bn, bk); #endif /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel for N and K blocks*/ const libxsmm_blasint work_nk = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; /* number of tasks that could be run in parallel for N and C blocks*/ const libxsmm_blasint work_nc = (N/bn) * (C/bc); /* compute chunk size */ const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; /* number of tasks that could be run in parallel for C and K blocks*/ const libxsmm_blasint work_ck = (C/bc) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? 
(ltid * chunksize_ck) : work_ck; const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; /* number of tasks that could be run in parallel for K and K blocks*/ const libxsmm_blasint work_kk = (K/bk) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; /* number of tasks that could be run in parallel for K blocks*/ /* compute chunk size */ const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? (K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? ((ltid + 1) * chunksize_k) : K; /* int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 
1 : 0; */ libxsmm_blasint ikic, inic, inik, icin, ikin; /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* Blocking reduction domain if it is too large */ BF = 1; if (K > 1024 && K <= 2048) { BF = 8; while (kBlocks % BF != 0) { BF--; } } if (K > 2048) { BF = 16; while (kBlocks % BF != 0) { BF--; } } KB_BLOCKS = kBlocks/BF; /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(N*C*t, dxt, start_thread, tid, handle->desc.threads); } /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(C*K*3, w_scratch, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*K*3, r_scratch, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*3, db, start_thread, tid, handle->desc.threads); } /* transpose W */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ic = (ikic / (K/bk)); ik = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bc; ++jc) { LIBXSMM_VLA_ACCESS(4, wiT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, wi, ic*bc+jc, ik*bk+jk, K3); LIBXSMM_VLA_ACCESS(4, wcT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, wc, ic*bc+jc, ik*bk+jk, K3); LIBXSMM_VLA_ACCESS(4, wfT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, wf, ic*bc+jc, ik*bk+jk, K3); } } } /* transpose R */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ik = (ikic / (K/bk)); ic = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bk; ++jc) { LIBXSMM_VLA_ACCESS(4, riT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, ri, ic*bk+jc, ik*bk+jk, K3); LIBXSMM_VLA_ACCESS(4, rcT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rc, ic*bk+jc, ik*bk+jk, K3); LIBXSMM_VLA_ACCESS(4, rfT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rf, ic*bk+jc, 
ik*bk+jk, K3); } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); for (j = t-1; j >= 0; --j) { /* let's run the cell in blocks for good locality */ for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik % (N/bn))*bn; ik = (inik / (N/bn))*bk; /* compute dhp */ if (j == t-1) { libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); } else { libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); } /* df = dout . (1 - c) . (1 - (f . f)) */ libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); /* dc = dout . (hp - f) . c . 
(1 - c) */ libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); if (0 == j) { libxsmm_internal_matrix_sub_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); } else { LIBXSMM_ASSERT(NULL != ht); /* coverity[var_deref_op] */ libxsmm_internal_matrix_sub_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); } libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K) ); } if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* transpose xt for current timestep */ for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { in = (icin / (C/bc))*bn; ic = (icin % (C/bc))*bc; for (jc = 0; jc < bc; ++jc) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ec = ic + jc; LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, j, en, ec, N, C); } } } /* transpose ht for current timestep */ if (j == 0) { for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { in = (ikin / (K/bk))*bn; ik = (ikin % (K/bk))*bk; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); } } } } else { for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { in = (ikin / (K/bk))*bn; ik = (ikin % (K/bk))*bk; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, j-1, en, ek, N, K); } } } } /* transpose ot for current timestep */ for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { in = (ikin / (K/bk))*bn; ik = (ikin % (K/bk))*bk; for (jk = 0; jk < bk; ++jk) { 
for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, oT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, o, j, en, ek, N, K); } } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); /* do = {R_f}^T * df */ for (KB = 0; KB < BF; KB++) { for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; if (KB == 0) libxsmm_internal_matrix_zero_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K) ); for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rfT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, df, in, ic + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kerneld(A_array, B_array, &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &blocks); } } libxsmm_barrier_wait(handle->barrier, (int)ltid); /* di = do . hp . i . (1 - i) */ for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik % (N/bn))*bn; ik = (inik / (N/bn))*bk; libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); if (0 == j) { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); } else { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); } libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); } libxsmm_barrier_wait(handle->barrier, (int)ltid); if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || 
(LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* dx = W^T * dicf */ for (KB = 0; KB < BF; KB++) { for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { in = (inic % (N/bn))*bn; icb = inic / (N/bn); ic = icb*bc; for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wiT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wcT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wfT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); } } } for (KB = 0; KB < BF; KB++) { for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; dout_ptr = (j > 0) ? 
(element_output_type*) &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) : (element_output_type*) &LIBXSMM_VLA_ACCESS(2, dhp, in, ik, K); if (0 == KB) { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), dout_ptr ); } /* dhp += R^T * dic */ for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, riT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, di, in, ic + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rcT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ic + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); } } if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { if ((C == K) && (bc == bk) /*&& (bcbk_multiples_of_16 == 1)*/) { #if 0 if (K % 2048 != 0) { #endif /* Interleave computation of dr = dicf * o^T/h^T and dw = dicf * x^T to take advantage of temporal locality */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; blocks = nBlocks; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, 
bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } #if 0 } else { /* Interleave computation of dr = dicf * o^T/h^T and dw = dicf * x^T to take advantage of temporal locality */ /* Use blocked format for di, dc, df */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; blocks = nBlocks; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); } batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < 
N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dcB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); } batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dcB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } } #endif } else { /* dr = dicf * o^T/h^T */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); } blocks = nBlocks; batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); 
B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); } /* dw = dicf * x^T */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { icb = ikic / (K/bk); ic = icb*bc; ikb = ikic % (K/bk); ik = ikb*bk; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } blocks = nBlocks; batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } } /* gradient bias */ for (ik = thr_begin_k; ik < thr_end_k; ik++) { for (in = 0; in < N; in++) { dbi[ik] += LIBXSMM_VLA_ACCESS(2, di, in, ik, K); dbc[ik] += LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); dbf[ik] += LIBXSMM_VLA_ACCESS(2, df, in, ik, K); } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* Store result weight matrices in CK format */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { icb = ikic / (K/bk); ic = icb*bc; ikb = ikic % (K/bk); ik = ikb*bk; for (jc = 0; jc 
< bc; ++jc) { for (jk = 0; jk < bk; ++jk) { LIBXSMM_VLA_ACCESS(2, dwi_ck, ic+jc, ik+jk, K3) = LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(2, dwc_ck, ic+jc, ik+jk, K3) = LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(2, dwf_ck, ic+jc, ik+jk, K3) = LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk); } } } for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; for (jc = 0; jc < bk; ++jc) { for (jk = 0; jk < bk; ++jk) { LIBXSMM_VLA_ACCESS(2, dri_ck, ic+jc, ik+jk, K3) = LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(2, drc_ck, ic+jc, ik+jk, K3) = LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(2, drf_ck, ic+jc, ik+jk, K3) = LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk); } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_kcck.tpl.c000066400000000000000000001011661415223013700271160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.) 
******************************************************************************/ /* helper variables */ libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; /* tensor dimensions */ libxsmm_blasint K = handle->desc.K; libxsmm_blasint N = handle->desc.N; libxsmm_blasint C = handle->desc.C; libxsmm_blasint t = handle->T; libxsmm_blasint bk = handle->bk; libxsmm_blasint bn = handle->bn; libxsmm_blasint bc = handle->bc; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; const libxsmm_blasint nBlocks = N/bn; unsigned long long blocks; /* tensor raw pointers */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *w = (element_filter_type*)handle->w->data; element_filter_type *r = (element_filter_type*)handle->r->data; element_output_type *ht = handle->ht ? (element_output_type*)handle->ht->data : (element_output_type*)NULL; element_output_type *it = (element_output_type*)handle->it->data; element_output_type *ct = (element_output_type*)handle->cit->data; element_output_type *ft = (element_output_type*)handle->ft->data; element_output_type *ot = (element_output_type*)handle->ot->data; element_input_type *dxt = (element_input_type* )handle->dxt->data; element_input_type *dhpD = (element_input_type* )handle->dhp->data; element_filter_type *dw = (element_filter_type*)handle->dw->data; element_filter_type *dr = (element_filter_type*)handle->dr->data; element_output_type *db = (element_output_type*)handle->db->data; element_output_type *dht = (element_output_type*)handle->dht->data; element_output_type *diD = (element_output_type*)handle->scratch_di; element_output_type *dcD = (element_output_type*)handle->scratch_dci; element_output_type *dfD = (element_output_type*)handle->scratch_df; element_output_type *doD = (element_output_type*)handle->scratch_do; element_output_type *doutD = 
(element_output_type*)handle->scratch_deltat; element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; element_output_type *scratch_oT = (element_output_type*)handle->scratch_dpB; element_filter_type *wiD = &(w[0]); element_filter_type *wcD = &(w[C*K]); element_filter_type *wfD = &(w[2*C*K]); element_filter_type *riD = &(r[0]); element_filter_type *rcD = &(r[K*K]); element_filter_type *rfD = &(r[2*K*K]); element_filter_type *dwiD = &(dw[0]); element_filter_type *dwcD = &(dw[C*K]); element_filter_type *dwfD = &(dw[2*C*K]); element_filter_type *driD = &(dr[0]); element_filter_type *drcD = &(dr[K*K]); element_filter_type *drfD = &(dr[2*K*K]); element_output_type *dbi = &(db[0]); element_output_type *dbc = &(db[K]); element_output_type *dbf = &(db[2*K]); element_filter_type *scratch_wiT = &(scratch_wT[0]); element_filter_type *scratch_wcT = &(scratch_wT[C*K]); element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); element_filter_type *scratch_riT = &(scratch_rT[0]); element_filter_type *scratch_rcT = &(scratch_rT[K*K]); element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); element_output_type *t1D = (element_output_type*)handle->scratch_t1; element_output_type *t2D = (element_output_type*)handle->scratch_t2; /* multidimensional arrays */ LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K); LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K); LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(4, element_filter_type, wi, wiD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wc, wcD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wf, wfD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, ri, riD, kBlocks, bk, bk); 
LIBXSMM_VLA_DECL(4, element_filter_type, rc, rcD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rf, rfD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); LIBXSMM_VLA_DECL(3, element_output_type, c, ct, N, K); LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); LIBXSMM_VLA_DECL(3, element_input_type, dx, dxt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); LIBXSMM_VLA_DECL(4, element_filter_type, dwi, dwiD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dwc, dwcD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dwf, dwfD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dri, driD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, drc, drcD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, drf, drfD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); LIBXSMM_VLA_DECL(2, element_output_type, dc, dcD, K); LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); LIBXSMM_VLA_DECL(2, element_output_type, dout, doutD, K); LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); LIBXSMM_VLA_DECL(4, element_filter_type, wiT, scratch_wiT, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, wcT, scratch_wcT, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, wfT, scratch_wfT, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, riT, scratch_riT, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rcT, scratch_rcT, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rfT, scratch_rfT, kBlocks, bk, bk); LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); LIBXSMM_VLA_DECL(2, element_output_type, oT, scratch_oT, N); element_output_type *dout_ptr = NULL; /* define batch-reduce gemm kernels */ const 
libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, NULL, NULL, NULL ); #if 0 const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL ); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL ); #endif const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb1 = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &K, &N, &bk, NULL, NULL, NULL, NULL ); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc1 = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &K, &N, &bk, NULL, NULL, NULL, NULL ); const libxsmm_smmfunction_reducebatch_addr batchreduce_kerneld = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL ); /* Auxiliary arrays for batch-reduce gemm calls */ const element_filter_type *A_array[1024]; const element_output_type *B_array[1024]; #if 0 LIBXSMM_VLA_DECL(4, element_output_type, diB, (element_output_type*)handle->scratch_diB, kBlocks, bn, bk); LIBXSMM_VLA_DECL(4, element_output_type, dcB, (element_output_type*)handle->scratch_dciB, kBlocks, bn, bk); LIBXSMM_VLA_DECL(4, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, kBlocks, bn, bk); #endif /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel for N and K blocks*/ const libxsmm_blasint work_nk = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? 
(ltid * chunksize_nk) : work_nk; const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; /* number of tasks that could be run in parallel for N and C blocks*/ const libxsmm_blasint work_nc = (N/bn) * (C/bc); /* compute chunk size */ const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; /* number of tasks that could be run in parallel for C and K blocks*/ const libxsmm_blasint work_ck = (C/bc) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; /* number of tasks that could be run in parallel for K and K blocks*/ const libxsmm_blasint work_kk = (K/bk) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? 
((ltid + 1) * chunksize_kk) : work_kk; /* number of tasks that could be run in parallel for K blocks*/ /* compute chunk size */ const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? (K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? ((ltid + 1) * chunksize_k) : K; libxsmm_blasint ikic, inic, inik, icin, ikin; #if defined(LIBXSMM_RNN_CELL_AVX512) int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 1 : 0; #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* Blocking reduction domain if it is too large */ BF = 1; if (K >= 1024 && K%2==0) { BF = 2; } if (K >= 2048 && K%4==0) { BF = 4; } if (K >= 4096 && K%8==0) { BF = 8; } KB_BLOCKS = kBlocks/BF; /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(N*C*t, dxt, start_thread, tid, handle->desc.threads); } /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(C*K*3, dw, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*K*3, dr, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*3, db, start_thread, tid, handle->desc.threads); } /* transpose W */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ic = (ikic / (K/bk)); ik = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bc; ++jc) { LIBXSMM_VLA_ACCESS(4, wiT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wi, ik, ic, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(4, wcT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wc, ik, ic, jc, jk, cBlocks, bc, bk); 
LIBXSMM_VLA_ACCESS(4, wfT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wf, ik, ic, jc, jk, cBlocks, bc, bk); } } } /* transpose R */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ik = (ikic / (K/bk)); ic = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bk; ++jc) { LIBXSMM_VLA_ACCESS(4, riT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, ri, ik, ic, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(4, rcT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, rc, ik, ic, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(4, rfT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, rf, ik, ic, jc, jk, kBlocks, bk, bk); } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); for (j = t-1; j >= 0; --j) { /* let's run the cell in blocks for good locality */ for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik % (N/bn))*bn; ik = (inik / (N/bn))*bk; #if defined(LIBXSMM_RNN_CELL_AVX512) if (bcbk_multiples_of_16) { #include "libxsmm_internal_gru_bwdupd_fused_eltwise_1.tpl.c" } else { /* compute dhp */ if (j == t-1) { libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); } else { libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); } /* df = dout . (1 - c) . (1 - (f . 
f)) */ libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); /* dc = dout . (hp - f) . c . (1 - c) */ libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); if (0 == j) { libxsmm_internal_matrix_sub_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); } else { libxsmm_internal_matrix_sub_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); } libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K) ); } #else /* compute dhp */ if (j == t-1) { libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); } else { libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); } /* df = dout . (1 - c) . (1 - (f . 
f)) */ libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); /* dc = dout . (hp - f) . c . (1 - c) */ libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); if (0 == j) { libxsmm_internal_matrix_sub_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); } else { libxsmm_internal_matrix_sub_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); } libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K) ); #endif } if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* transpose xt for current timestep */ for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { in = (icin / (C/bc))*bn; ic = (icin % (C/bc))*bc; for (jc = 0; jc < bc; ++jc) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ec = ic + jc; LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, j, en, ec, N, C); } } } /* transpose ht for current timestep */ if (j == 0) { for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { in = (ikin / (K/bk))*bn; ik = (ikin % (K/bk))*bk; for (jk = 0; jk < bk; ++jk) { for 
(jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); } } } } else { for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { in = (ikin / (K/bk))*bn; ik = (ikin % (K/bk))*bk; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, j-1, en, ek, N, K); } } } } /* transpose ot for current timestep */ for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { in = (ikin / (K/bk))*bn; ik = (ikin % (K/bk))*bk; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, oT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, o, j, en, ek, N, K); } } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); /* do = {R_f}^T * df */ for (KB = 0; KB < BF; KB++) { for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; if (KB == 0) libxsmm_internal_matrix_zero_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K) ); for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rfT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, df, in, ic + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kerneld(A_array, B_array, &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &blocks); } } libxsmm_barrier_wait(handle->barrier, (int)ltid); /* di = do . hp . i . 
(1 - i) */ for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik % (N/bn))*bn; ik = (inik / (N/bn))*bk; #if defined(LIBXSMM_RNN_CELL_AVX512) if (bcbk_multiples_of_16) { #include "libxsmm_internal_gru_bwdupd_fused_eltwise_2.tpl.c" } else { libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); if (0 == j) { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); } else { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); } libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); } #else libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); if (0 == j) { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); } else { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); } libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); #endif } 
libxsmm_barrier_wait(handle->barrier, (int)ltid); if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* dx = W^T * dicf */ for (KB = 0; KB < BF; KB++) { for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { in = (inic % (N/bn))*bn; icb = inic / (N/bn); ic = icb*bc; for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wiT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wcT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wfT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); } } } for (KB = 0; KB < BF; KB++) { for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; dout_ptr = (j > 0) ? 
(element_output_type*) &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) : (element_output_type*) &LIBXSMM_VLA_ACCESS(2, dhp, in, ik, K); if (0 == KB) { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), dout_ptr ); } /* dhp += R^T * dic */ for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, riT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, di, in, ic + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rcT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ic + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); } } if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { if ((C == K) && (bc == bk) /*&& (bcbk_multiples_of_16 == 1)*/) { #if 0 if (K % 2048 != 0) { #endif /* Interleave computation of dr = dicf * o^T/h^T and dw = dicf * x^T to take advantage of temporal locality */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; blocks = nBlocks; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, 
bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } #if 0 } else { /* Interleave computation of dr = dicf * o^T/h^T and dw = dicf * x^T to take advantage of temporal locality */ /* Use blocked format for di, dc, df */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; blocks = nBlocks; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); } batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < 
N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dcB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); } batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dcB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } } #endif } else { /* dr = dicf * o^T/h^T */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); } blocks = nBlocks; batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); 
B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); } /* dw = dicf * x^T */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { icb = ikic / (K/bk); ic = icb*bc; ikb = ikic % (K/bk); ik = ikb*bk; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } blocks = nBlocks; batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } } /* gradient bias */ for (ik = thr_begin_k; ik < thr_end_k; ik++) { for (in = 0; in < N; in++) { dbi[ik] += LIBXSMM_VLA_ACCESS(2, di, in, ik, K); dbc[ik] += LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); dbf[ik] += LIBXSMM_VLA_ACCESS(2, df, in, ik, K); } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_gru_fwd_nc_ck_generic.tpl.c000066400000000000000000000344231415223013700275700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All 
rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.) ******************************************************************************/ /* helper variables */ libxsmm_blasint j, ik, ikb, in, ic, icb, inik, BF, CB, CB_BLOCKS, KB_BLOCKS, ikic, jk, jc; /* input sizes */ const libxsmm_blasint K = handle->desc.K; const libxsmm_blasint N = handle->desc.N; const libxsmm_blasint C = handle->desc.C; const libxsmm_blasint t = handle->T; const libxsmm_blasint bk = handle->bk; const libxsmm_blasint bn = handle->bn; const libxsmm_blasint bc = handle->bc; const libxsmm_blasint K3 = K * 3; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; unsigned long long blocks; /* define tensors */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *w = (element_filter_type*)handle->w->data; element_filter_type *r = (element_filter_type*)handle->r->data; element_filter_type *w_scratch = (element_filter_type*)handle->scratch_w; element_filter_type *r_scratch = (element_filter_type*)handle->scratch_r; element_output_type *b = (element_output_type*)handle->b->data; element_output_type *ht = (element_output_type*)handle->ht->data; element_output_type *it = (element_output_type*)handle->it->data; element_output_type *ct = (element_output_type*)handle->cit->data; element_output_type *ft = (element_output_type*)handle->ft->data; element_output_type *ot = (element_output_type*)handle->ot->data; element_filter_type *wiD = &(w[0]); element_filter_type *wcD = &(w[K]); element_filter_type *wfD = &(w[2*K]); element_filter_type *riD = &(r[0]); element_filter_type *rcD = &(r[K]); element_filter_type *rfD = 
&(r[2*K]); element_filter_type *wiD_scratch = &(w_scratch[0]); element_filter_type *wcD_scratch = &(w_scratch[C*K]); element_filter_type *wfD_scratch = &(w_scratch[2*C*K]); element_filter_type *riD_scratch = &(r_scratch[0]); element_filter_type *rcD_scratch = &(r_scratch[K*K]); element_filter_type *rfD_scratch = &(r_scratch[2*K*K]); element_output_type *bi = &(b[0]); element_output_type *bd = &(b[K]); element_output_type *bf = &(b[2*K]); LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(4, element_filter_type, wi, wiD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wc, wcD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wf, wfD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, ri, riD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rc, rcD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rf, rfD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(2, element_filter_type, wi_ck, wiD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, wc_ck, wcD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, wf_ck, wfD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, ri_ck, riD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, rc_ck, rcD, K3); LIBXSMM_VLA_DECL(2, element_filter_type, rf_ck, rfD, K3); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); LIBXSMM_VLA_DECL(3, element_output_type, c, ct, N, K); LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); /* define batch-reduce gemm kernels */ const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, NULL, NULL ); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL ); /* define gemm kernels 
*/ /* Auxiliary arrays for batch-reduce gemms */ const element_filter_type *A_array[1024]; const element_input_type *B_array[1024]; /* parallelize over C-blocks */ /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel */ const libxsmm_blasint work = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel for C and K blocks*/ const libxsmm_blasint work_ck = (C/bc) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; /* number of tasks that could be run in parallel for K and K blocks*/ const libxsmm_blasint work_kk = (K/bk) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? 
((ltid + 1) * chunksize_kk) : work_kk; #if 0 const int use_fused_implementation = (C == 2048 && K == 2048) ? 1 : 0; #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* Blocking reduction domain if it is too large */ BF = 1; if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { BF = 8; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C > 2048 || K > 2048) { BF = 16; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C == 2048 && K == 1024) { BF = 2; } CB_BLOCKS = cBlocks/BF; KB_BLOCKS = kBlocks/BF; /* Upfront reformatting of W and R */ /* reformat W */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ic = (ikic / (K/bk)); ik = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bc; ++jc) { LIBXSMM_VLA_ACCESS(4, wi, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wi_ck, ic*bc+jc, ik*bk+jk, 3*K); LIBXSMM_VLA_ACCESS(4, wc, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wc_ck, ic*bc+jc, ik*bk+jk, 3*K); LIBXSMM_VLA_ACCESS(4, wf, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wf_ck, ic*bc+jc, ik*bk+jk, 3*K); } } } /* reformat R */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ik = (ikic / (K/bk)); ic = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bk; ++jc) { LIBXSMM_VLA_ACCESS(4, ri, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, ri_ck, ic*bk+jc, ik*bk+jk, 3*K); LIBXSMM_VLA_ACCESS(4, rc, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rc_ck, ic*bk+jc, ik*bk+jk, 3*K); LIBXSMM_VLA_ACCESS(4, rf, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rf_ck, ic*bk+jc, ik*bk+jk, 3*K); } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* All data is in column-major format */ for (j = 0; j < t; ++j) { /* let's run the cell in blocks for good locality */ /* Block reduction loop if requested */ for (CB = 0; CB < BF; CB++) { for 
(inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; /* initialize i with bi */ if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); /* i += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wi, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); /* i += R.hp */ if (0 == j) { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); } } else { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); } } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); /* initialize c with bd */ if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &bd[ik] ); /* c += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wc, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &blocks); /* c += R.hp */ if (0 == j) { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = 
&LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); } } else { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); } } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &blocks); if (CB == BF-1) { /* i = sigmoid(i) */ libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); /* o = hp . i */ if (0 == j) { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); } else { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); } } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); /* We need a barrier here to ensure all elements of o are computed before f can be computed */ for (CB = 0; CB < BF; CB++) { for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; /* initialize f with bf */ if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik] ); /* f += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wf, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); /* f += R.o */ for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rf, ikb, icb + CB*KB_BLOCKS, 0, 0, 
kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, o, j, in, ic + CB*KB_BLOCKS*bk, N, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); if (CB == BF-1) { /* f = tanh(f) */ libxsmm_internal_matrix_tanh_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); /* c = sigmoid(c) */ libxsmm_internal_matrix_sigmoid_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K) ); /* h = (1 - c) . f */ libxsmm_internal_matrix_complement_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); /* h += c . hp */ if (0 == j) { libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); } else { libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); } } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_gru_fwd_nc_kcck.tpl.c000066400000000000000000000256071415223013700264160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.) 
******************************************************************************/ /* helper variables */ libxsmm_blasint j, ik, ikb, in, ic, icb, inik, BF, CB, CB_BLOCKS, KB_BLOCKS; /* input sizes */ const libxsmm_blasint K = handle->desc.K; const libxsmm_blasint N = handle->desc.N; const libxsmm_blasint C = handle->desc.C; const libxsmm_blasint t = handle->T; const libxsmm_blasint bk = handle->bk; const libxsmm_blasint bn = handle->bn; const libxsmm_blasint bc = handle->bc; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; unsigned long long blocks; /* define tensors */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *w = (element_filter_type*)handle->w->data; element_filter_type *r = (element_filter_type*)handle->r->data; element_output_type *b = (element_output_type*)handle->b->data; element_output_type *ht = (element_output_type*)handle->ht->data; element_output_type *it = (element_output_type*)handle->it->data; element_output_type *ct = (element_output_type*)handle->cit->data; element_output_type *ft = (element_output_type*)handle->ft->data; element_output_type *ot = (element_output_type*)handle->ot->data; element_filter_type *wiD = &(w[0]); element_filter_type *wcD = &(w[C*K]); element_filter_type *wfD = &(w[2*C*K]); element_filter_type *riD = &(r[0]); element_filter_type *rcD = &(r[K*K]); element_filter_type *rfD = &(r[2*K*K]); element_output_type *bi = &(b[0]); element_output_type *bd = &(b[K]); element_output_type *bf = &(b[2*K]); LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(4, element_filter_type, wi, wiD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wc, wcD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wf, wfD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, ri, riD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, 
element_filter_type, rc, rcD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rf, rfD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); LIBXSMM_VLA_DECL(3, element_output_type, c, ct, N, K); LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); /* define batch-reduce gemm kernels */ const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, NULL, NULL ); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL ); /* define gemm kernels */ /* Auxiliary arrays for batch-reduce gemms */ const element_filter_type *A_array[1024]; const element_input_type *B_array[1024]; /* parallelize over C-blocks */ /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel */ const libxsmm_blasint work = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; #if 0 const int use_fused_implementation = (C == 2048 && K == 2048) ? 
1 : 0; #endif BF = 1; if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { BF = 8; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C > 2048 || K > 2048) { BF = 16; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C == 2048 && K == 1024) { BF = 2; } CB_BLOCKS = cBlocks/BF; KB_BLOCKS = kBlocks/BF; /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* All data is in column-major format */ for (j = 0; j < t; ++j) { /* let's run the cell in blocks for good locality */ /* Block reduction loop if requested */ for (CB = 0; CB < BF; CB++) { for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; /* initialize i with bi */ if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); /* i += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wi, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); /* i += R.hp */ if (0 == j) { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); } } else { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); } } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); /* initialize c with bd */ if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, 
&LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &bd[ik] ); /* c += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wc, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &blocks); /* c += R.hp */ if (0 == j) { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); } } else { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); } } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &blocks); if (CB == BF-1) { /* i = sigmoid(i) */ libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); /* o = hp . 
i */ if (0 == j) { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); } else { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); } } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); /* We need a barrier here to ensure all elements of o are computed before f can be computed */ for (CB = 0; CB < BF; CB++) { for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; /* initialize f with bf */ if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik] ); /* f += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wf, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); /* f += R.o */ for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rf, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, o, j, in, ic + CB*KB_BLOCKS*bk, N, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); if (CB == BF-1) { /* f = tanh(f) */ libxsmm_internal_matrix_tanh_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); /* c = sigmoid(c) */ libxsmm_internal_matrix_sigmoid_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K) ); /* h = (1 - c) . 
f */ libxsmm_internal_matrix_complement_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); /* h += c . hp */ if (0 == j) { libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); } else { libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); } } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic.tpl.c000066400000000000000000000501001415223013700304450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ #if 0 #define PROFILE #endif /* helper variables */ libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; /* tensor dimensions */ libxsmm_blasint K = handle->desc.K; libxsmm_blasint N = handle->desc.N; libxsmm_blasint C = handle->desc.C; libxsmm_blasint t = handle->T; libxsmm_blasint bk = handle->bk; libxsmm_blasint bn = handle->bn; libxsmm_blasint bc = handle->bc; libxsmm_blasint K4 = K * 4; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; const libxsmm_blasint nBlocks = N/bn; unsigned long long blocks; /* tensor raw pointers */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *csp = (element_input_type* )handle->csp->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *w = (element_filter_type*)handle->w->data; element_filter_type *r = (element_filter_type*)handle->r->data; element_output_type *cst = (element_output_type*)handle->cst->data; element_output_type *ht = handle->ht ? 
(element_output_type*)handle->ht->data : (element_output_type*)NULL; element_output_type *it = (element_output_type*)handle->it->data; element_output_type *ft = (element_output_type*)handle->ft->data; element_output_type *ot = (element_output_type*)handle->ot->data; element_output_type *cit = (element_output_type*)handle->cit->data; element_output_type *cot = (element_output_type*)handle->cot->data; element_input_type *dxt = (element_input_type*)handle->dxt->data; element_input_type *dcsp = (element_input_type* )handle->dcsp->data; element_input_type *dhpD = (element_input_type* )handle->dhp->data; element_filter_type *dw = (element_filter_type*)handle->dw->data; element_filter_type *dr = (element_filter_type*)handle->dr->data; element_output_type *db = (element_output_type*)handle->db->data; element_output_type *dcsD = (element_output_type*)handle->dcs->data; element_output_type *dht = (element_output_type*)handle->dht->data; element_output_type *diD = (element_output_type*)handle->scratch_di; element_output_type *dfD = (element_output_type*)handle->scratch_df; element_output_type *doD = (element_output_type*)handle->scratch_do; element_output_type *dciD = (element_output_type*)handle->scratch_dci; element_output_type *doutD = (element_output_type*)handle->scratch_deltat; element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; element_filter_type *w_scratch = (element_filter_type*)handle->scratch_w; element_filter_type *r_scratch = (element_filter_type*)handle->scratch_r; element_filter_type *wiD = &(w[0]); element_filter_type *wcD = &(w[K]); element_filter_type *wfD = &(w[2*K]); element_filter_type *woD = &(w[3*K]); element_filter_type *riD = &(r[0]); element_filter_type *rcD = &(r[K]); element_filter_type *rfD = 
&(r[2*K]); element_filter_type *roD = &(r[3*K]); element_filter_type *dwiD = &(dw[0]); element_filter_type *dwcD = &(dw[K]); element_filter_type *dwfD = &(dw[2*K]); element_filter_type *dwoD = &(dw[3*K]); element_filter_type *driD = &(dr[0]); element_filter_type *drcD = &(dr[K]); element_filter_type *drfD = &(dr[2*K]); element_filter_type *droD = &(dr[3*K]); element_filter_type *dwiD_scratch = &(w_scratch[0]); element_filter_type *dwcD_scratch = &(w_scratch[C*K]); element_filter_type *dwfD_scratch = &(w_scratch[2*C*K]); element_filter_type *dwoD_scratch = &(w_scratch[3*C*K]); element_filter_type *driD_scratch = &(r_scratch[0]); element_filter_type *drcD_scratch = &(r_scratch[K*K]); element_filter_type *drfD_scratch = &(r_scratch[2*K*K]); element_filter_type *droD_scratch = &(r_scratch[3*K*K]); element_output_type *dbi = &(db[0]); element_output_type *dbc = &(db[K]); element_output_type *dbf = &(db[2*K]); element_output_type *dbo = &(db[3*K]); element_filter_type *scratch_wiT = &(scratch_wT[0]); element_filter_type *scratch_wcT = &(scratch_wT[C*K]); element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); element_filter_type *scratch_woT = &(scratch_wT[3*C*K]); element_filter_type *scratch_riT = &(scratch_rT[0]); element_filter_type *scratch_rcT = &(scratch_rT[K*K]); element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); element_filter_type *scratch_roT = &(scratch_rT[3*K*K]); element_output_type *t1D = (element_output_type*)handle->scratch_t1; element_output_type *t2D = (element_output_type*)handle->scratch_t2; /* multidimensional arrays */ LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K); LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K); LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, cp, csp, K); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(2, element_filter_type, wi, wiD, K4); LIBXSMM_VLA_DECL(2, element_filter_type, wf, wfD, K4); LIBXSMM_VLA_DECL(2, element_filter_type, wo, woD, 
K4); LIBXSMM_VLA_DECL(2, element_filter_type, wc, wcD, K4); LIBXSMM_VLA_DECL(2, element_filter_type, ri, riD, K4); LIBXSMM_VLA_DECL(2, element_filter_type, rf, rfD, K4); LIBXSMM_VLA_DECL(2, element_filter_type, ro, roD, K4); LIBXSMM_VLA_DECL(2, element_filter_type, rc, rcD, K4); LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); LIBXSMM_VLA_DECL(3, element_input_type, dx, dxt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, dcp, dcsp, K); LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); LIBXSMM_VLA_DECL(4, element_filter_type, dwi, dwiD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dwf, dwfD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dwo, dwoD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dwc, dwcD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dri, driD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, drf, drfD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dro, droD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, drc, drcD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(2, element_filter_type, dwi_ck, dwiD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, dwf_ck, dwfD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, dwo_ck, dwoD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, dwc_ck, dwcD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, dri_ck, driD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, drf_ck, drfD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, dro_ck, droD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, drc_ck, drcD, 4*K); LIBXSMM_VLA_DECL(2, element_output_type, dcs, 
dcsD, K); LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); LIBXSMM_VLA_DECL(2, element_output_type, dci, dciD, K); LIBXSMM_VLA_DECL(2, element_output_type, dout, doutD, K); LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); LIBXSMM_VLA_DECL(4, element_filter_type, wiT, scratch_wiT, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, wcT, scratch_wcT, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, wfT, scratch_wfT, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, woT, scratch_woT, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, riT, scratch_riT, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rcT, scratch_rcT, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rfT, scratch_rfT, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, roT, scratch_roT, kBlocks, bk, bk); LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); element_output_type *dout_ptr = NULL; /* define batch-reduce gemm kernels */ const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb1 = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &K, &N, &bk, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc1 = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &K, &N, &bk, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kerneld = 
libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL); /* Auxiliary arrays for batch-reduce gemm calls */ const element_filter_type *A_array[1024]; const element_output_type *B_array[1024]; LIBXSMM_VLA_DECL(4, element_output_type, diB, (element_output_type*)handle->scratch_diB, kBlocks, bn, bk); LIBXSMM_VLA_DECL(4, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, kBlocks, bn, bk); LIBXSMM_VLA_DECL(4, element_output_type, dpB, (element_output_type*)handle->scratch_dpB, kBlocks, bn, bk); LIBXSMM_VLA_DECL(4, element_output_type, dciB, (element_output_type*)handle->scratch_dciB, kBlocks, bn, bk); /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel for N and K blocks*/ const libxsmm_blasint work_nk = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; /* number of tasks that could be run in parallel for N and C blocks*/ const libxsmm_blasint work_nc = (N/bn) * (C/bc); /* compute chunk size */ const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? 
((ltid + 1) * chunksize_nc) : work_nc; /* number of tasks that could be run in parallel for C and K blocks*/ const libxsmm_blasint work_ck = (C/bc) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; /* number of tasks that could be run in parallel for K and K blocks*/ const libxsmm_blasint work_kk = (K/bk) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; #if defined(LIBXSMM_RNN_CELL_AVX512) element_output_type *cps_ptr = NULL; int k_tasks = K/16; int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? (ltid * k_chunksize * 16) : K; const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? ((ltid + 1) * k_chunksize * 16) : K;__m512 dbi_sum, dbf_sum, dbo_sum, dbc_sum; #endif /* number of tasks that could be run in parallel for K blocks*/ /* compute chunk size */ const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? 
(K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? ((ltid + 1) * chunksize_k) : K; #ifdef PROFILE __int64_t _start, _end, eltwise_cycles = 0, dout_cycles = 0, weight_trans_cycles = 0, act_trans_cycles = 0, dx_cycles = 0, dwdr_cycles = 0, gradient_cycles = 0, reformat_cycles = 0; float total_time = 0.0; #endif int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 1 : 0; libxsmm_blasint ikic, inic, inik, icin, ikin; /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* Blocking reduction domain if it is too large */ BF = 1; if (K > 1024 && K <= 2048) { BF = 8; while (kBlocks % BF != 0) { BF--; } } if (K > 2048) { BF = 16; while (kBlocks % BF != 0) { BF--; } } KB_BLOCKS = kBlocks/BF; /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(N*C*t, dxt, start_thread, tid, handle->desc.threads); } /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(C*K*4, w_scratch, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*K*4, r_scratch, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*4, db, start_thread, tid, handle->desc.threads); } #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* transpose W */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ic = (ikic / (K/bk)); ik = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bc; ++jc) { LIBXSMM_VLA_ACCESS(4, wiT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, wi, ic*bc+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(4, wcT, ic, ik, jk, jc, kBlocks, bk, bc) = 
LIBXSMM_VLA_ACCESS(2, wc, ic*bc+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(4, wfT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, wf, ic*bc+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(4, woT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, wo, ic*bc+jc, ik*bk+jk, 4*K); } } } /* transpose R */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ik = (ikic / (K/bk)); ic = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bk; ++jc) { LIBXSMM_VLA_ACCESS(4, riT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, ri, ic*bk+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(4, rcT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rc, ic*bk+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(4, rfT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rf, ic*bk+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(4, roT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, ro, ic*bk+jc, ik*bk+jk, 4*K); } } } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); weight_trans_cycles += _end - _start; } #endif #include "libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core.tpl.c" if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* Store result weight matrices in CK format */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { icb = ikic / (K/bk); ic = icb*bc; ikb = ikic % (K/bk); ik = ikb*bk; for (jc = 0; jc < bc; ++jc) { for (jk = 0; jk < bk; ++jk) { LIBXSMM_VLA_ACCESS(2, dwi_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(2, dwc_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(2, dwf_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(2, dwo_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk); } } } for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / 
(K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; for (jc = 0; jc < bk; ++jc) { for (jk = 0; jk < bk; ++jk) { LIBXSMM_VLA_ACCESS(2, dri_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(2, drc_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(2, drf_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(2, dro_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, kBlocks, bk, bk); } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); reformat_cycles += _end - _start; } #endif } #ifdef PROFILE if (ltid == 0) { printf("----- PROFILING LSTM BWD/UPD (N = %d, C = %d, K = %d, bn = %d. bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); total_time = (gradient_cycles+dwdr_cycles+dx_cycles+act_trans_cycles+weight_trans_cycles+dout_cycles+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; printf("Transpose weights time is %f ms (%.2f%%)\n", weight_trans_cycles/(2.5 * 1e9)*1000.0f, weight_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Dx GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dx_cycles/(2.5 * 1e9)*1000.0f, dx_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*C*K*4/1e9/(dx_cycles/(2.5 * 1e9))); printf("Dh GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dout_cycles/(2.5 * 1e9)*1000.0f, dout_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*K*K*4/1e9/(dout_cycles/(2.5 * 1e9))); printf("Transpose input activations time is %f ms (%.2f%%)\n", act_trans_cycles/(2.5 * 1e9)*1000.0f, act_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Dwdr GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dwdr_cycles/(2.5 * 1e9)*1000.0f, dwdr_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, 
t*2.0*(N*K*K*2.0+N*C*K*2.0)*2.0/1e9/(dwdr_cycles/(2.5 * 1e9))); printf("Gradient bias calculation time is %f ms (%.2f%%)\n", gradient_cycles/(2.5 * 1e9)*1000.0f, gradient_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Reformat dwdr time is %f ms (%.2f%%)\n\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); } #undef PROFILE #endif libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16.tpl.c000066400000000000000000000513421415223013700312740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ #if 0 #define PROFILE #endif /* helper variables */ libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; /* tensor dimensions */ libxsmm_blasint K = handle->desc.K; libxsmm_blasint N = handle->desc.N; libxsmm_blasint C = handle->desc.C; libxsmm_blasint t = handle->T; libxsmm_blasint bk = handle->bk; libxsmm_blasint bn = handle->bn; libxsmm_blasint bc = handle->bc; libxsmm_blasint K4 = K * 4; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; const libxsmm_blasint nBlocks = N/bn; const int lpb = handle->lpb; /*const int bc_lp = bc/lpb;*/ const int bk_lp = bk/lpb; const int bn_lp = bn/lpb; unsigned long long blocks; /* tensor raw pointers */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *csp = (element_input_type* )handle->csp->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *w = (element_filter_type*)handle->w->data; element_filter_type *r = (element_filter_type*)handle->r->data; element_output_type *cst = (element_output_type*)handle->cst->data; element_output_type *ht = handle->ht ? 
(element_output_type*)handle->ht->data : (element_output_type*)NULL; element_output_type *it = (element_output_type*)handle->it->data; element_output_type *ft = (element_output_type*)handle->ft->data; element_output_type *ot = (element_output_type*)handle->ot->data; element_output_type *cit = (element_output_type*)handle->cit->data; element_output_type *cot = (element_output_type*)handle->cot->data; element_input_type *dxt = (element_input_type*)handle->dxt->data; element_input_type *dcsp = (element_input_type* )handle->dcsp->data; element_input_type *dhpD = (element_input_type* )handle->dhp->data; element_filter_type *dw = (element_filter_type*)handle->dw->data; element_filter_type *dr = (element_filter_type*)handle->dr->data; element_output_type *db_bf16 = (element_output_type*)handle->db->data; element_output_type *dcsD = (element_output_type*)handle->dcs->data; element_output_type *dht = (element_output_type*)handle->dht->data; element_output_type *diD = (element_output_type*)handle->scratch_di; element_output_type *dfD = (element_output_type*)handle->scratch_df; element_output_type *doD = (element_output_type*)handle->scratch_do; element_output_type *dciD = (element_output_type*)handle->scratch_dci; float *dxD = (float*)handle->scratch_dx; float *doutD = (float*)handle->scratch_deltat; float *dhpD_f32 = (float*)handle->scratch_dhp; float *db = (float*)handle->scratch_db; element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; float *w_scratch = (float*)handle->scratch_w; float *r_scratch = (float*)handle->scratch_r; element_filter_type *wiD = &(w[0]); element_filter_type *wcD = &(w[K]); element_filter_type *wfD = &(w[2*K]); element_filter_type *woD = &(w[3*K]); element_filter_type *riD = &(r[0]); element_filter_type 
*rcD = &(r[K]); element_filter_type *rfD = &(r[2*K]); element_filter_type *roD = &(r[3*K]); element_filter_type *dwiD = &(dw[0]); element_filter_type *dwcD = &(dw[K]); element_filter_type *dwfD = &(dw[2*K]); element_filter_type *dwoD = &(dw[3*K]); element_filter_type *driD = &(dr[0]); element_filter_type *drcD = &(dr[K]); element_filter_type *drfD = &(dr[2*K]); element_filter_type *droD = &(dr[3*K]); float *dwiD_scratch = &(w_scratch[0]); float *dwcD_scratch = &(w_scratch[C*K]); float *dwfD_scratch = &(w_scratch[2*C*K]); float *dwoD_scratch = &(w_scratch[3*C*K]); float *driD_scratch = &(r_scratch[0]); float *drcD_scratch = &(r_scratch[K*K]); float *drfD_scratch = &(r_scratch[2*K*K]); float *droD_scratch = &(r_scratch[3*K*K]); float *dbi = &(db[0]); float *dbc = &(db[K]); float *dbf = &(db[2*K]); float *dbo = &(db[3*K]); element_output_type *dbi_bf16 = &(db_bf16[0]); element_output_type *dbc_bf16 = &(db_bf16[K]); element_output_type *dbf_bf16 = &(db_bf16[2*K]); element_output_type *dbo_bf16 = &(db_bf16[3*K]); element_filter_type *scratch_wiT = &(scratch_wT[0]); element_filter_type *scratch_wcT = &(scratch_wT[C*K]); element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); element_filter_type *scratch_woT = &(scratch_wT[3*C*K]); element_filter_type *scratch_riT = &(scratch_rT[0]); element_filter_type *scratch_rcT = &(scratch_rT[K*K]); element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); element_filter_type *scratch_roT = &(scratch_rT[3*K*K]); /*element_output_type *t1D = (element_output_type*)handle->scratch_t1;*/ /*element_output_type *t2D = (element_output_type*)handle->scratch_t2;*/ /* multidimensional arrays */ /*LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K);*/ /*LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K);*/ LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, cp, csp, K); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(2, element_filter_type, wi, wiD, K4); LIBXSMM_VLA_DECL(2, 
element_filter_type, wf, wfD, K4); LIBXSMM_VLA_DECL(2, element_filter_type, wo, woD, K4); LIBXSMM_VLA_DECL(2, element_filter_type, wc, wcD, K4); LIBXSMM_VLA_DECL(2, element_filter_type, ri, riD, K4); LIBXSMM_VLA_DECL(2, element_filter_type, rf, rfD, K4); LIBXSMM_VLA_DECL(2, element_filter_type, ro, roD, K4); LIBXSMM_VLA_DECL(2, element_filter_type, rc, rcD, K4); LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); LIBXSMM_VLA_DECL(3, float, dx, dxD, N, C); LIBXSMM_VLA_DECL(3, element_input_type, dx_bf16, dxt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, dcp, dcsp, K); LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); LIBXSMM_VLA_DECL(2, float, dhp_f32, dhpD_f32, K); LIBXSMM_VLA_DECL(4, float, dwi, dwiD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, float, dwf, dwfD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, float, dwo, dwoD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, float, dwc, dwcD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, float, dri, driD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, float, drf, drfD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, float, dro, droD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, float, drc, drcD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(2, element_filter_type, dwi_ck, dwiD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, dwf_ck, dwfD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, dwo_ck, dwoD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, dwc_ck, dwcD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, dri_ck, driD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, drf_ck, drfD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, dro_ck, droD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, 
drc_ck, drcD, 4*K); LIBXSMM_VLA_DECL(2, element_output_type, dcs, dcsD, K); LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); LIBXSMM_VLA_DECL(2, element_output_type, dci, dciD, K); LIBXSMM_VLA_DECL(5, element_output_type, diB, (element_output_type*)handle->scratch_diB, nBlocks, bn_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, nBlocks, bn_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_output_type, dpB, (element_output_type*)handle->scratch_dpB, nBlocks, bn_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_output_type, dciB, (element_output_type*)handle->scratch_dciB, nBlocks, bn_lp, bk, lpb); LIBXSMM_VLA_DECL(2, float, dout, doutD, K); LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); LIBXSMM_VLA_DECL(5, element_filter_type, wiT, scratch_wiT, kBlocks, bk_lp, bc, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wcT, scratch_wcT, kBlocks, bk_lp, bc, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wfT, scratch_wfT, kBlocks, bk_lp, bc, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, woT, scratch_woT, kBlocks, bk_lp, bc, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, riT, scratch_riT, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, rcT, scratch_rcT, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, rfT, scratch_rfT, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, roT, scratch_roT, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); float *dout_ptr = NULL; /* define batch-reduce gemm kernels */ const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->bwdupd_kernela; const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->bwdupd_kernelb; const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelc = handle->bwdupd_kernelc; const 
libxsmm_bsmmfunction_reducebatch_strd batchreduce_kerneld = handle->bwdupd_kerneld; /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel for N and K blocks*/ const libxsmm_blasint work_nk = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; /* number of tasks that could be run in parallel for N and C blocks*/ const libxsmm_blasint work_nc = (N/bn) * (C/bc); /* compute chunk size */ const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; /* number of tasks that could be run in parallel for C and K blocks*/ const libxsmm_blasint work_ck = (C/bc) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? 
((ltid + 1) * chunksize_ck) : work_ck; /* number of tasks that could be run in parallel for K and K blocks*/ const libxsmm_blasint work_kk = (K/bk) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; element_output_type *cps_ptr = NULL; int k_tasks = K/16; int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? (ltid * k_chunksize * 16) : K; const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? ((ltid + 1) * k_chunksize * 16) : K; __m512 dbi_sum, dbf_sum, dbo_sum, dbc_sum; /* number of tasks that could be run in parallel for K blocks*/ /* compute chunk size */ #if 0 const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? (K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? ((ltid + 1) * chunksize_k) : K; #endif #ifdef PROFILE __int64_t _start, _end, eltwise_cycles = 0, dout_cycles = 0, weight_trans_cycles = 0, act_trans_cycles = 0, dx_cycles = 0, dwdr_cycles = 0, gradient_cycles = 0, reformat_cycles = 0; float total_time = 0.0; #endif int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 
1 : 0; libxsmm_blasint ikic, inic, inik, icin, ikin; /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* Blocking reduction domain if it is too large */ BF = 1; if (K > 1024 && K <= 2048) { BF = 8; while (kBlocks % BF != 0) { BF--; } } if (K > 2048) { BF = 16; while (kBlocks % BF != 0) { BF--; } } KB_BLOCKS = kBlocks/BF; /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(N*C*t, dxD, start_thread, tid, handle->desc.threads); } /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(C*K*4, w_scratch, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*K*4, r_scratch, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*4, db, start_thread, tid, handle->desc.threads); } #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* transpose W */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ic = (ikic / (K/bk)); ik = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bc; ++jc) { LIBXSMM_VLA_ACCESS(5, wiT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(2, wi, ic*bc+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(5, wcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(2, wc, ic*bc+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(5, wfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(2, wf, ic*bc+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(5, woT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(2, wo, ic*bc+jc, ik*bk+jk, 4*K); } } } /* transpose R */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ik = (ikic / (K/bk)); ic = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bk; ++jc) { LIBXSMM_VLA_ACCESS(5, riT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, 
lpb) = LIBXSMM_VLA_ACCESS(2, ri, ic*bk+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(5, rcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, rc, ic*bk+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(5, rfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, rf, ic*bk+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(5, roT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, ro, ic*bk+jc, ik*bk+jk, 4*K); } } } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); weight_trans_cycles += _end - _start; } #endif #include "libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16.tpl.c" if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* Store result weight matrices in CK format and downcovert to bf16 */ #if defined(LIBXSMM_RNN_CELL_AVX512) for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { icb = ikic / (K/bk); ic = icb*bc; ikb = ikic % (K/bk); ik = ikb*bk; for (jc = 0; jc < bc; ++jc) { for (jk = 0; jk < bk; jk += 16) { _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dwi_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk)))); _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dwc_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk)))); _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dwf_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk)))); _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dwo_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk)))); } } } for (ikic = thr_begin_kk; ikic < 
thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; for (jc = 0; jc < bk; ++jc) { for (jk = 0; jk < bk; jk += 16) { _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dri_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk)))); _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, drc_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk)))); _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, drf_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk)))); _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dro_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, kBlocks, bk, bk)))); } } } #else /* TODO: Add here non AVX512 replacement code */ #endif libxsmm_barrier_wait(handle->barrier, (int)ltid); #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); reformat_cycles += _end - _start; } #endif } #ifdef PROFILE if (ltid == 0) { printf("----- PROFILING LSTM BWD/UPD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); total_time = (gradient_cycles+dwdr_cycles+dx_cycles+act_trans_cycles+weight_trans_cycles+dout_cycles+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; printf("Transpose weights time is %f ms (%.2f%%)\n", weight_trans_cycles/(2.5 * 1e9)*1000.0f, weight_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Dx GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dx_cycles/(2.5 * 1e9)*1000.0f, dx_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*C*K*4/1e9/(dx_cycles/(2.5 * 1e9))); printf("Dh GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dout_cycles/(2.5 * 1e9)*1000.0f, dout_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*K*K*4/1e9/(dout_cycles/(2.5 * 1e9))); printf("Transpose input activations time is %f ms (%.2f%%)\n", act_trans_cycles/(2.5 * 1e9)*1000.0f, act_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Dwdr GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dwdr_cycles/(2.5 * 1e9)*1000.0f, dwdr_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*(N*K*K*2.0+N*C*K*2.0)*2.0/1e9/(dwdr_cycles/(2.5 * 1e9))); printf("Gradient bias calculation time is %f ms (%.2f%%)\n", gradient_cycles/(2.5 * 1e9)*1000.0f, gradient_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Reformat dwdr time is %f ms (%.2f%%)\n\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); } #undef PROFILE #endif libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck.tpl.c000066400000000000000000000422341415223013700273000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Kunal Banerjee (Intel Corp.) ******************************************************************************/ #if 0 #define PROFILE #endif /* helper variables */ libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; /* tensor dimensions */ libxsmm_blasint K = handle->desc.K; libxsmm_blasint N = handle->desc.N; libxsmm_blasint C = handle->desc.C; libxsmm_blasint t = handle->T; libxsmm_blasint bk = handle->bk; libxsmm_blasint bn = handle->bn; libxsmm_blasint bc = handle->bc; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; const libxsmm_blasint nBlocks = N/bn; unsigned long long blocks; /* tensor raw pointers */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *csp = (element_input_type* )handle->csp->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *wt = (element_filter_type*)handle->wt->data; element_filter_type *rt = (element_filter_type*)handle->rt->data; element_output_type *cst = (element_output_type*)handle->cst->data; element_output_type *ht = handle->ht ? 
(element_output_type*)handle->ht->data : (element_output_type*)NULL; element_output_type *it = (element_output_type*)handle->it->data; element_output_type *ft = (element_output_type*)handle->ft->data; element_output_type *ot = (element_output_type*)handle->ot->data; element_output_type *cit = (element_output_type*)handle->cit->data; element_output_type *cot = (element_output_type*)handle->cot->data; element_input_type *dxt = (element_input_type*)handle->dxt->data; element_input_type *dcsp = (element_input_type* )handle->dcsp->data; element_input_type *dhpD = (element_input_type* )handle->dhp->data; element_filter_type *dw = (element_filter_type*)handle->dw->data; element_filter_type *dr = (element_filter_type*)handle->dr->data; element_output_type *db = (element_output_type*)handle->db->data; element_output_type *dcsD = (element_output_type*)handle->dcs->data; element_output_type *dht = (element_output_type*)handle->dht->data; element_output_type *diD = (element_output_type*)handle->scratch_di; element_output_type *dfD = (element_output_type*)handle->scratch_df; element_output_type *doD = (element_output_type*)handle->scratch_do; element_output_type *dciD = (element_output_type*)handle->scratch_dci; element_output_type *doutD = (element_output_type*)handle->scratch_deltat; element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; #if 0 element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; #endif element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; element_filter_type *witD = &(wt[0]); element_filter_type *wctD = &(wt[C*K]); element_filter_type *wftD = &(wt[2*C*K]); element_filter_type *wotD = &(wt[3*C*K]); element_filter_type *ritD = &(rt[0]); element_filter_type *rctD = &(rt[K*K]); element_filter_type *rftD = &(rt[2*K*K]); element_filter_type *rotD = &(rt[3*K*K]); element_filter_type *dwiD = &(dw[0]); element_filter_type *dwcD = 
&(dw[C*K]); element_filter_type *dwfD = &(dw[2*C*K]); element_filter_type *dwoD = &(dw[3*C*K]); element_filter_type *driD = &(dr[0]); element_filter_type *drcD = &(dr[K*K]); element_filter_type *drfD = &(dr[2*K*K]); element_filter_type *droD = &(dr[3*K*K]); element_output_type *dbi = &(db[0]); element_output_type *dbc = &(db[K]); element_output_type *dbf = &(db[2*K]); element_output_type *dbo = &(db[3*K]); #if 0 element_filter_type *scratch_wiT = &(scratch_wT[0]); element_filter_type *scratch_wcT = &(scratch_wT[C*K]); element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); element_filter_type *scratch_woT = &(scratch_wT[3*C*K]); element_filter_type *scratch_riT = &(scratch_rT[0]); element_filter_type *scratch_rcT = &(scratch_rT[K*K]); element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); element_filter_type *scratch_roT = &(scratch_rT[3*K*K]); #endif element_output_type *t1D = (element_output_type*)handle->scratch_t1; element_output_type *t2D = (element_output_type*)handle->scratch_t2; /* multidimensional arrays */ LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K); LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K); LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, cp, csp, K); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); #if 0 LIBXSMM_VLA_DECL(4, element_filter_type, wi, wiD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wf, wfD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wo, woD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wc, wcD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, ri, riD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rf, rfD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, ro, roD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rc, rcD, kBlocks, bk, bk); #endif LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, 
element_output_type, i, it, N, K); LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); LIBXSMM_VLA_DECL(3, element_input_type, dx, dxt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, dcp, dcsp, K); LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); LIBXSMM_VLA_DECL(4, element_filter_type, dwi, dwiD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dwf, dwfD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dwo, dwoD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dwc, dwcD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dri, driD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, drf, drfD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dro, droD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, drc, drcD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(2, element_output_type, dcs, dcsD, K); LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); LIBXSMM_VLA_DECL(2, element_output_type, dci, dciD, K); LIBXSMM_VLA_DECL(2, element_output_type, dout, doutD, K); LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); LIBXSMM_VLA_DECL(4, element_filter_type, wiT, witD, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, wcT, wctD, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, wfT, wftD, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, woT, wotD, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, riT, ritD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rcT, rctD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rfT, rftD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, roT, rotD, kBlocks, bk, bk); 
LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); element_output_type *dout_ptr = NULL; /* define batch-reduce gemm kernels */ const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb1 = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &K, &N, &bk, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc1 = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &K, &N, &bk, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kerneld = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL); /* Auxiliary arrays for batch-reduce gemm calls */ const element_filter_type *A_array[1024]; const element_output_type *B_array[1024]; LIBXSMM_VLA_DECL(4, element_output_type, diB, (element_output_type*)handle->scratch_diB, kBlocks, bn, bk); LIBXSMM_VLA_DECL(4, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, kBlocks, bn, bk); LIBXSMM_VLA_DECL(4, element_output_type, dpB, (element_output_type*)handle->scratch_dpB, kBlocks, bn, bk); LIBXSMM_VLA_DECL(4, element_output_type, dciB, (element_output_type*)handle->scratch_dciB, kBlocks, bn, bk); /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel for N and K blocks*/ const libxsmm_blasint work_nk = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? 
(work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; /* number of tasks that could be run in parallel for N and C blocks*/ const libxsmm_blasint work_nc = (N/bn) * (C/bc); /* compute chunk size */ const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; /* number of tasks that could be run in parallel for C and K blocks*/ const libxsmm_blasint work_ck = (C/bc) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; /* number of tasks that could be run in parallel for K and K blocks*/ const libxsmm_blasint work_kk = (K/bk) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? 
(work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; #if defined(LIBXSMM_RNN_CELL_AVX512) element_output_type *cps_ptr = NULL; int k_tasks = K/16; int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? (ltid * k_chunksize * 16) : K; const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? ((ltid + 1) * k_chunksize * 16) : K;__m512 dbi_sum, dbf_sum, dbo_sum, dbc_sum; #endif /* number of tasks that could be run in parallel for K blocks*/ /* compute chunk size */ const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? (K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? ((ltid + 1) * chunksize_k) : K; #ifdef PROFILE __int64_t _start, _end, eltwise_cycles = 0, dout_cycles = 0, weight_trans_cycles = 0, act_trans_cycles = 0, dx_cycles = 0, dwdr_cycles = 0, gradient_cycles = 0; float total_time = 0.0; #endif int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 
1 : 0; libxsmm_blasint ikic, inic, inik, icin, ikin; /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* Blocking reduction domain if it is too large */ BF = 1; if (K > 1024 && K <= 2048) { BF = 8; while (kBlocks % BF != 0) { BF--; } } if (K > 2048) { BF = 16; while (kBlocks % BF != 0) { BF--; } } KB_BLOCKS = kBlocks/BF; /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(N*C*t, dxt, start_thread, tid, handle->desc.threads); } /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(C*K*4, dw, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*K*4, dr, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*4, db, start_thread, tid, handle->desc.threads); } #if 0 #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* transpose W */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ic = (ikic / (K/bk)); ik = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bc; ++jc) { LIBXSMM_VLA_ACCESS(4, wiT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wi, ik, ic, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(4, wcT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wc, ik, ic, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(4, wfT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wf, ik, ic, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(4, woT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wo, ik, ic, jc, jk, cBlocks, bc, bk); } } } /* transpose R */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ik = (ikic / (K/bk)); ic = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bk; ++jc) { LIBXSMM_VLA_ACCESS(4, riT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, ri, ik, ic, jc, jk, kBlocks, bk, bk); 
LIBXSMM_VLA_ACCESS(4, rcT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, rc, ik, ic, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(4, rfT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, rf, ik, ic, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(4, roT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, ro, ik, ic, jc, jk, kBlocks, bk, bk); } } } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); weight_trans_cycles += _end - _start; } #endif #endif #include "libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core.tpl.c" #ifdef PROFILE if (ltid == 0) { printf("----- PROFILING LSTM BWD/UPD (N = %d, C = %d, K = %d, bn = %d. bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); total_time = (gradient_cycles+dwdr_cycles+dx_cycles+act_trans_cycles+weight_trans_cycles+dout_cycles+eltwise_cycles)/(2.5 * 1e9)*1000.0f; printf("Transpose weights time is %f ms (%.2f%%)\n", weight_trans_cycles/(2.5 * 1e9)*1000.0f, weight_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Dx GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dx_cycles/(2.5 * 1e9)*1000.0f, dx_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*C*K*4/1e9/(dx_cycles/(2.5 * 1e9))); printf("Dh GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dout_cycles/(2.5 * 1e9)*1000.0f, dout_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*K*K*4/1e9/(dout_cycles/(2.5 * 1e9))); printf("Transpose input activations time is %f ms (%.2f%%)\n", act_trans_cycles/(2.5 * 1e9)*1000.0f, act_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Dwdr GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dwdr_cycles/(2.5 * 1e9)*1000.0f, dwdr_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*(N*K*K*2.0+N*C*K*2.0)*2.0/1e9/(dwdr_cycles/(2.5 * 1e9))); printf("Gradient bias calculation time is %f ms (%.2f%%)\n", gradient_cycles/(2.5 * 1e9)*1000.0f, gradient_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time 
); } #undef PROFILE #endif libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16.tpl.c000066400000000000000000000625561415223013700301270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Kunal Banerjee (Intel Corp.) ******************************************************************************/ #if 0 #define PROFILE #endif /* helper variables */ libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; /* tensor dimensions */ libxsmm_blasint K = handle->desc.K; libxsmm_blasint N = handle->desc.N; libxsmm_blasint C = handle->desc.C; libxsmm_blasint t = handle->T; libxsmm_blasint bk = handle->bk; libxsmm_blasint bn = handle->bn; libxsmm_blasint bc = handle->bc; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; const libxsmm_blasint nBlocks = N/bn; const int lpb = handle->lpb; const int bc_lp = bc/lpb; const int bk_lp = bk/lpb; const int bn_lp = bn/lpb; unsigned long long blocks; /* tensor raw pointers */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *csp = (element_input_type* )handle->csp->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *wt = (element_filter_type*)handle->wt->data; element_filter_type *rt = (element_filter_type*)handle->rt->data; element_output_type *cst = (element_output_type*)handle->cst->data; element_output_type *ht = handle->ht ? 
(element_output_type*)handle->ht->data : (element_output_type*)NULL; element_output_type *it = (element_output_type*)handle->it->data; element_output_type *ft = (element_output_type*)handle->ft->data; element_output_type *ot = (element_output_type*)handle->ot->data; element_output_type *cit = (element_output_type*)handle->cit->data; element_output_type *cot = (element_output_type*)handle->cot->data; element_input_type *dxt = (element_input_type*)handle->dxt->data; element_input_type *dcsp = (element_input_type* )handle->dcsp->data; element_input_type *dhpD = (element_input_type* )handle->dhp->data; element_filter_type *dw = (element_filter_type*)handle->dw->data; element_filter_type *dr = (element_filter_type*)handle->dr->data; element_output_type *db_bf16 = (element_output_type*)handle->db->data; element_output_type *dcsD = (element_output_type*)handle->dcs->data; element_output_type *dht = (element_output_type*)handle->dht->data; element_output_type *diD = (element_output_type*)handle->scratch_di; element_output_type *dfD = (element_output_type*)handle->scratch_df; element_output_type *doD = (element_output_type*)handle->scratch_do; element_output_type *dciD = (element_output_type*)handle->scratch_dci; float *dxD = (float*)handle->scratch_dx; float *doutD = (float*)handle->scratch_deltat; float *dhpD_f32 = (float*)handle->scratch_dhp; float *db = (float*)handle->scratch_db; element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; #if 0 element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; #endif element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; float *w_scratch = (float*)handle->scratch_w; float *r_scratch = (float*)handle->scratch_r; element_filter_type *witD = &(wt[0]); element_filter_type *wctD = &(wt[C*K]); element_filter_type *wftD = &(wt[2*C*K]); element_filter_type *wotD = &(wt[3*C*K]); element_filter_type *ritD = 
&(rt[0]); element_filter_type *rctD = &(rt[K*K]); element_filter_type *rftD = &(rt[2*K*K]); element_filter_type *rotD = &(rt[3*K*K]); element_filter_type *dwiD = &(dw[0]); element_filter_type *dwcD = &(dw[C*K]); element_filter_type *dwfD = &(dw[2*C*K]); element_filter_type *dwoD = &(dw[3*C*K]); element_filter_type *driD = &(dr[0]); element_filter_type *drcD = &(dr[K*K]); element_filter_type *drfD = &(dr[2*K*K]); element_filter_type *droD = &(dr[3*K*K]); float *dwiD_scratch = &(w_scratch[0]); float *dwcD_scratch = &(w_scratch[C*K]); float *dwfD_scratch = &(w_scratch[2*C*K]); float *dwoD_scratch = &(w_scratch[3*C*K]); float *driD_scratch = &(r_scratch[0]); float *drcD_scratch = &(r_scratch[K*K]); float *drfD_scratch = &(r_scratch[2*K*K]); float *droD_scratch = &(r_scratch[3*K*K]); float *dbi = &(db[0]); float *dbc = &(db[K]); float *dbf = &(db[2*K]); float *dbo = &(db[3*K]); element_output_type *dbi_bf16 = &(db_bf16[0]); element_output_type *dbc_bf16 = &(db_bf16[K]); element_output_type *dbf_bf16 = &(db_bf16[2*K]); element_output_type *dbo_bf16 = &(db_bf16[3*K]); #if 0 element_filter_type *scratch_wiT = &(scratch_wT[0]); element_filter_type *scratch_wcT = &(scratch_wT[C*K]); element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); element_filter_type *scratch_woT = &(scratch_wT[3*C*K]); element_filter_type *scratch_riT = &(scratch_rT[0]); element_filter_type *scratch_rcT = &(scratch_rT[K*K]); element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); element_filter_type *scratch_roT = &(scratch_rT[3*K*K]); #endif /*element_output_type *t1D = (element_output_type*)handle->scratch_t1;*/ /*element_output_type *t2D = (element_output_type*)handle->scratch_t2;*/ /* multidimensional arrays */ /*LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K);*/ /*LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K);*/ LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, cp, csp, K); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); #if 0 
LIBXSMM_VLA_DECL(5, element_filter_type, wi, wiD, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wc, wcD, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wf, wfD, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wo, woD, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, ri, riD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, rc, rcD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, rf, rfD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, ro, roD, kBlocks, bk_lp, bk, lpb); #endif LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); LIBXSMM_VLA_DECL(3, float, dx, dxD, N, C); LIBXSMM_VLA_DECL(3, element_input_type, dx_bf16, dxt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, dcp, dcsp, K); LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); LIBXSMM_VLA_DECL(2, float, dhp_f32, dhpD_f32, K); LIBXSMM_VLA_DECL(4, float, dwi, dwiD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, float, dwf, dwfD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, float, dwo, dwoD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, float, dwc, dwcD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, float, dri, driD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, float, drf, drfD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, float, dro, droD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, float, drc, drcD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(5, element_filter_type, dwi_bf16, dwiD, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, dwc_bf16, dwcD, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, dwf_bf16, 
dwfD, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, dwo_bf16, dwoD, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, dri_bf16, driD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, drc_bf16, drcD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, drf_bf16, drfD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, dro_bf16, droD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(2, element_output_type, dcs, dcsD, K); LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); LIBXSMM_VLA_DECL(2, element_output_type, dci, dciD, K); LIBXSMM_VLA_DECL(5, element_output_type, diB, (element_output_type*)handle->scratch_diB, nBlocks, bn_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, nBlocks, bn_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_output_type, dpB, (element_output_type*)handle->scratch_dpB, nBlocks, bn_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_output_type, dciB, (element_output_type*)handle->scratch_dciB, nBlocks, bn_lp, bk, lpb); LIBXSMM_VLA_DECL(2, float, dout, doutD, K); LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); LIBXSMM_VLA_DECL(5, element_filter_type, wiT, witD, kBlocks, bk_lp, bc, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wcT, wctD, kBlocks, bk_lp, bc, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wfT, wftD, kBlocks, bk_lp, bc, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, woT, wotD, kBlocks, bk_lp, bc, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, riT, ritD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, rcT, rctD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, rfT, rftD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, roT, rotD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(2, 
element_output_type, hT, scratch_hT, N); float *dout_ptr = NULL; /* define batch-reduce gemm kernels */ const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->bwdupd_kernela; const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->bwdupd_kernelb; const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelc = handle->bwdupd_kernelc; const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kerneld = handle->bwdupd_kerneld; /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel for N and K blocks*/ const libxsmm_blasint work_nk = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; /* number of tasks that could be run in parallel for N and C blocks*/ const libxsmm_blasint work_nc = (N/bn) * (C/bc); /* compute chunk size */ const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? 
((ltid + 1) * chunksize_nc) : work_nc; /* number of tasks that could be run in parallel for C and K blocks*/ const libxsmm_blasint work_ck = (C/bc) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; /* number of tasks that could be run in parallel for K and K blocks*/ const libxsmm_blasint work_kk = (K/bk) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; #if defined(LIBXSMM_RNN_CELL_AVX512) element_output_type *cps_ptr = NULL; int k_tasks = K/16; int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? (ltid * k_chunksize * 16) : K; const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? ((ltid + 1) * k_chunksize * 16) : K; __m512 dbi_sum, dbf_sum, dbo_sum, dbc_sum; #endif /* number of tasks that could be run in parallel for K blocks*/ /* compute chunk size */ #if 0 const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? 
(K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? ((ltid + 1) * chunksize_k) : K; #endif #ifdef PROFILE __int64_t _start, _end, eltwise_cycles = 0, dout_cycles = 0, weight_trans_cycles = 0, act_trans_cycles = 0, dx_cycles = 0, dwdr_cycles = 0, gradient_cycles = 0, reformat_cycles = 0; float total_time = 0.0; #endif int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 1 : 0; libxsmm_blasint ikic, inic, inik, icin, ikin; /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* Blocking reduction domain if it is too large */ BF = 1; if (K > 1024 && K <= 2048) { BF = 8; while (kBlocks % BF != 0) { BF--; } } if (K > 2048) { BF = 16; while (kBlocks % BF != 0) { BF--; } } KB_BLOCKS = kBlocks/BF; /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(N*C*t, dxD, start_thread, tid, handle->desc.threads); } /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(C*K*4, w_scratch, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*K*4, r_scratch, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*4, db, start_thread, tid, handle->desc.threads); } #if 0 #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* transpose W */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ic = (ikic / (K/bk)); ik = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bc; ++jc) { LIBXSMM_VLA_ACCESS(5, wiT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wi, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); 
LIBXSMM_VLA_ACCESS(5, wcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wc, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_ACCESS(5, wfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wf, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_ACCESS(5, woT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wo, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); } } } /* transpose R */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ik = (ikic / (K/bk)); ic = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bk; ++jc) { LIBXSMM_VLA_ACCESS(5, riT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, ri, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_ACCESS(5, rcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, rc, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_ACCESS(5, rfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, rf, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_ACCESS(5, roT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, ro, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); } } } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); weight_trans_cycles += _end - _start; } #endif #endif #include "libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16.tpl.c" if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* Store result weight matrices in KCCK bf16 format and downcovert to bf16 */ #if defined(LIBXSMM_RNN_CELL_AVX512) #if 0 for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { icb = ikic / (K/bk); ikb = ikic % (K/bk); ik = ikb*bk; for (jc = 0; jc < bc; jc++) { for (jk = 0; jk < bk; jk++) { libxsmm_bfloat16_hp tmp; tmp.f = LIBXSMM_VLA_ACCESS(4, dwi, 
ikb, icb, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(5, dwi_bf16, ikb, icb, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = tmp.i[1]; tmp.f = LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(5, dwc_bf16, ikb, icb, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = tmp.i[1]; tmp.f = LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(5, dwf_bf16, ikb, icb, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = tmp.i[1]; tmp.f = LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk); LIBXSMM_VLA_ACCESS(5, dwo_bf16, ikb, icb, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = tmp.i[1]; } } } for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ikb = ikic % (K/bk); ik = ikb*bk; for (jc = 0; jc < bk; jc++) { for (jk = 0; jk < bk; jk++) { libxsmm_bfloat16_hp tmp; tmp.f = LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(5, dri_bf16, ikb, icb, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = tmp.i[1]; tmp.f = LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(5, drc_bf16, ikb, icb, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = tmp.i[1]; tmp.f = LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(5, drf_bf16, ikb, icb, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = tmp.i[1]; tmp.f = LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, kBlocks, bk, bk); LIBXSMM_VLA_ACCESS(5, dro_bf16, ikb, icb, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = tmp.i[1]; } } } #endif __m512 a01, b01; __m512i c01; const __m512i perm_index = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { icb = ikic / (K/bk); ikb = ikic % (K/bk); ik = ikb*bk; for (jc = 0; jc < bc; jc+=2) { for (jk = 0; jk < bk; jk+=16) { a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, 
icb, jc+1, jk, cBlocks, bc, bk)); b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk)); c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dwi_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc+1, jk, cBlocks, bc, bk)); b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk)); c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dwc_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc+1, jk, cBlocks, bc, bk)); b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk)); c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dwf_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc+1, jk, cBlocks, bc, bk)); b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk)); c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dwo_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); } } } for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ikb = ikic % (K/bk); ik = ikb*bk; for (jc = 0; jc < bk; jc+=2) { for (jk = 0; jk < bk; jk+=16) { a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc+1, jk, cBlocks, bc, bk)); b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, cBlocks, bc, bk)); c01 = 
LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dri_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc+1, jk, cBlocks, bc, bk)); b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, cBlocks, bc, bk)); c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, drc_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc+1, jk, cBlocks, bc, bk)); b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, cBlocks, bc, bk)); c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, drf_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc+1, jk, cBlocks, bc, bk)); b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, cBlocks, bc, bk)); c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dro_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); } } } #else /* TODO: Add here non AVX512 replacement code */ LIBXSMM_UNUSED(thr_begin_kk); LIBXSMM_UNUSED(thr_begin_ck); LIBXSMM_UNUSED(ikic); LIBXSMM_UNUSED(jk); LIBXSMM_UNUSED(jc); LIBXSMM_UNUSED(thr_end_ck); LIBXSMM_UNUSED(thr_end_kk); #endif libxsmm_barrier_wait(handle->barrier, (int)ltid); #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); reformat_cycles += _end - _start; } #endif } #ifdef PROFILE if (ltid == 0) { printf("----- PROFILING LSTM BWD/UPD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); total_time = (gradient_cycles+dwdr_cycles+dx_cycles+act_trans_cycles+weight_trans_cycles+dout_cycles+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; printf("Transpose weights time is %f ms (%.2f%%)\n", weight_trans_cycles/(2.5 * 1e9)*1000.0f, weight_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Dx GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dx_cycles/(2.5 * 1e9)*1000.0f, dx_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*C*K*4/1e9/(dx_cycles/(2.5 * 1e9))); printf("Dh GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dout_cycles/(2.5 * 1e9)*1000.0f, dout_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*K*K*4/1e9/(dout_cycles/(2.5 * 1e9))); printf("Transpose input activations time is %f ms (%.2f%%)\n", act_trans_cycles/(2.5 * 1e9)*1000.0f, act_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Dwdr GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dwdr_cycles/(2.5 * 1e9)*1000.0f, dwdr_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*(N*K*K*2.0+N*C*K*2.0)*2.0/1e9/(dwdr_cycles/(2.5 * 1e9))); printf("Gradient bias calculation time is %f ms (%.2f%%)\n", gradient_cycles/(2.5 * 1e9)*1000.0f, gradient_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Reformat dwdr time is %f ms (%.2f%%)\n\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); } #undef PROFILE #endif libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core.tpl.c000066400000000000000000000663231415223013700303150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.) ******************************************************************************/ for (j = t-1; j >= 0; --j) { /* let's run the cell in blocks for good locality */ #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { inb = inik % (N/bn); ikb = inik / (N/bn); in = (inik % (N/bn))*bn; ik = (inik / (N/bn))*bk; #if defined(LIBXSMM_RNN_CELL_AVX512) /* Compute dcp, dci, di, df, dp */ cps_ptr = (j == 0) ? &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K) : &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); if (bcbk_multiples_of_16) { if (K % 2048 != 0 || LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) { #include "libxsmm_internal_lstm_bwdupd_fused_eltwise.tpl.c" } else { /* Also reformat di, dci, df and dp to be used in the UPD pass in blocked format ... 
*/ #include "libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat.tpl.c" } } else { /* compute dhp */ if (j == t-1) { libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); } else { libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); } /* compute dcp */ libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); if (j == t-1) { libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcs, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K) ); } else { libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K) ); } /* compute dci */ libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K) ); /* compute di */ libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, 
K) ); libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); /* compute df */ if (j == 0) { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); } else { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); } libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); /* compute dp */ libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, 
K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K) ); /* update dcp */ libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K) ); } #else /* compute dhp */ if (j == t-1) { libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); } else { libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); } /* compute dcp */ libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); if (j == t-1) { libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcs, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K) ); } else { libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K) ); } /* compute dci */ libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K) ); /* compute di */ 
libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); /* compute df */ if (j == 0) { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); } else { libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); } libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); /* compute dp */ libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, 
K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K) ); /* update dcp */ libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K) ); #endif } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); eltwise_cycles += _end - _start; } #endif if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* transpose xt for current timestep */ for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { in = (icin / (C/bc))*bn; ic = (icin % (C/bc))*bc; for (jc = 0; jc < bc; ++jc) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ec = ic + jc; LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, j, en, ec, N, C); } } } /* transpose ht for current timestep */ if (j == 0) { for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { in = (ikin / (K/bk))*bn; ik = (ikin % (K/bk))*bk; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); } } } } else { for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { in = (ikin / (K/bk))*bn; ik = (ikin % (K/bk))*bk; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, j-1, en, ek, N, K); } } } } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); act_trans_cycles += _end - _start; } #endif } libxsmm_barrier_wait(handle->barrier, (int)ltid); if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* dx = W^T * difoc */ for (KB = 0; KB < BF; KB++) { for (inic = thr_begin_nc; inic < thr_end_nc; 
++inic ) { in = (inic % (N/bn))*bn; icb = inic / (N/bn); ic = icb*bc; for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wiT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C) , &blocks); for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wcT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, dci, in, ik + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C) , &blocks); for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wfT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C) , &blocks); for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, woT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, dp, in, ik + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C) , &blocks); } } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); dx_cycles += _end - _start; } #endif } #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif for (KB = 0; KB < BF; KB++) { for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; dout_ptr = (j > 0) ? 
(element_output_type*) &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) : (element_output_type*) &LIBXSMM_VLA_ACCESS(2, dhp, in, ik, K); if (KB == 0) libxsmm_internal_matrix_zero_ld( bk, bn, K, dout_ptr); /* dout += R^T * difoc */ for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, riT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, di, in, ic + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rcT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, dci, in, ic + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rfT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, df, in, ic + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, roT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, dp, in, ic + KB*KB_BLOCKS*bk, K); } /* Reduce batch gemm call */ batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); } } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); dout_cycles += _end - _start; } #endif if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif if ((C == K) && (bc == bk) && (bcbk_multiples_of_16 == 1)) { if (K % 2048 != 0) { /* Interleave computation of dr = difoc * h^T and dw = difoc * x^T to take advantage of temporal locality */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic 
= icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; blocks = nBlocks; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, 
inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } } else { /* Interleave computation of dr = difoc * h^T and dw = difoc * x^T to take advantage of temporal locality */ /* Use blocked format for di, dci, df and dp */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; blocks = nBlocks; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dciB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dciB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), 
&blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dpB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dpB, inb, ikb, 0, 0, kBlocks, bn, bk); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } } } else { /* dr = difoc * h^T */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } blocks = nBlocks; batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); B_array[inb] = 
&LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); } /* dw = difoc * x^T */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { icb = ikic / (K/bk); ic = icb*bc; ikb = ikic % (K/bk); ik = ikb*bk; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } blocks = nBlocks; batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); dwdr_cycles += _end - _start; } #endif #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* gradient bias */ #if defined(LIBXSMM_RNN_CELL_AVX512) if (bcbk_multiples_of_16) { for (ik = k_thr_begin; ik < k_thr_end; ik += 16) { dbi_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbi[ik]); dbf_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbf[ik]); dbo_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbo[ik]); dbc_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbc[ik]); for (in = 0; in < N; in++) { dbi_sum = _mm512_add_ps(dbi_sum, 
LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, di, in, ik, K))); dbf_sum = _mm512_add_ps(dbf_sum, LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, df, in, ik, K))); dbo_sum = _mm512_add_ps(dbo_sum, LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, dp, in, ik, K))); dbc_sum = _mm512_add_ps(dbc_sum, LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, dci, in, ik, K))); } _mm512_storeu_ps(&dbi[ik], dbi_sum); _mm512_storeu_ps(&dbf[ik], dbf_sum); _mm512_storeu_ps(&dbo[ik], dbo_sum); _mm512_storeu_ps(&dbc[ik], dbc_sum); } } else { for (ik = thr_begin_k; ik < thr_end_k; ik++) { for (in = 0; in < N; in++) { dbi[ik] += LIBXSMM_VLA_ACCESS(2, di, in, ik, K); dbf[ik] += LIBXSMM_VLA_ACCESS(2, df, in, ik, K); dbo[ik] += LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); dbc[ik] += LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); } } } #else for (ik = thr_begin_k; ik < thr_end_k; ik++) { for (in = 0; in < N; in++) { dbi[ik] += LIBXSMM_VLA_ACCESS(2, di, in, ik, K); dbf[ik] += LIBXSMM_VLA_ACCESS(2, df, in, ik, K); dbo[ik] += LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); dbc[ik] += LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); } } #endif #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); gradient_cycles += _end - _start; } #endif } libxsmm_barrier_wait(handle->barrier, (int)ltid); } libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16.tpl.c000066400000000000000000000352521415223013700311300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.) 
******************************************************************************/ #define NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(m, n, ld, _src, _dst) \ do { \ float *const src = _src; \ libxsmm_bfloat16 *const dst = _dst; \ libxsmm_blasint __i, __j; \ __m512i packed_result; \ for ( __j = 0; __j < n; ++__j ) { \ for ( __i = 0; __i < m; __i+=32 ) { \ packed_result = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&src[(__j*ld)+__i+16]), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&src[(__j*ld)+__i])); \ _mm512_storeu_si512(&dst[(__j*ld)+__i], packed_result); \ } \ } \ } while (0) for (j = t-1; j >= 0; --j) { /* let's run the cell in blocks for good locality */ #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { inb = inik % (N/bn); ikb = inik / (N/bn); in = (inik % (N/bn))*bn; ik = (inik / (N/bn))*bk; #if defined(LIBXSMM_RNN_CELL_AVX512) /* Compute dcp, dci, di, df, dp */ cps_ptr = (j == 0) ? &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K) : &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); if (bcbk_multiples_of_16) { /* Also reformat di, dci, df and dp to be used in the UPD pass in blocked format ... 
*/ #include "libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat_bf16.tpl.c" } else { /* TODO: Add alternative path here */ } #else /* TODO: Add alternative path here */ #endif } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); eltwise_cycles += _end - _start; } #endif if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* transpose xt for current timestep */ for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { in = (icin / (C/bc))*bn; ic = (icin % (C/bc))*bc; for (jc = 0; jc < bc; ++jc) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ec = ic + jc; LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, j, en, ec, N, C); } } } /* transpose ht for current timestep */ if (j == 0) { for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { in = (ikin / (K/bk))*bn; ik = (ikin % (K/bk))*bk; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); } } } } else { for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { in = (ikin / (K/bk))*bn; ik = (ikin % (K/bk))*bk; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, j-1, en, ek, N, K); } } } } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); act_trans_cycles += _end - _start; } #endif } libxsmm_barrier_wait(handle->barrier, (int)ltid); if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* dx = W^T * difoc */ blocks = KB_BLOCKS; for (KB = 0; KB < BF; KB++) { for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { in = (inic % (N/bn))*bn; icb = inic / (N/bn); ic = icb*bc; batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wiT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), &LIBXSMM_VLA_ACCESS(2, di, in, KB*KB_BLOCKS*bk, 
K), &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wcT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), &LIBXSMM_VLA_ACCESS(2, dci, in, KB*KB_BLOCKS*bk, K), &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wfT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), &LIBXSMM_VLA_ACCESS(2, df, in, KB*KB_BLOCKS*bk, K), &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, woT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), &LIBXSMM_VLA_ACCESS(2, dp, in, KB*KB_BLOCKS*bk, K), &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); /* If last block, make sure we downconvert dx to bf16 */ if (KB == BF-1) { NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bc, bn, C, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &LIBXSMM_VLA_ACCESS(3, dx_bf16, j, in, ic, N, C)); } } } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); dx_cycles += _end - _start; } #endif } #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif blocks = KB_BLOCKS; for (KB = 0; KB < BF; KB++) { for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; dout_ptr = (j > 0) ? 
(float*) &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) : (float*) &LIBXSMM_VLA_ACCESS(2, dhp_f32, in, ik, K); if (KB == 0) libxsmm_internal_matrix_zero_ld( bk, bn, K, dout_ptr); /* dout += R^T * difoc */ batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, riT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, di, in, KB*KB_BLOCKS*bk, K), dout_ptr, &blocks); batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, rcT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, dci, in, KB*KB_BLOCKS*bk, K), dout_ptr, &blocks); batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, rfT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, df, in, KB*KB_BLOCKS*bk, K), dout_ptr, &blocks); batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, roT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, dp, in, KB*KB_BLOCKS*bk, K), dout_ptr, &blocks); /* Make sure when last and j == 0 to downconvert dhp to BF16 */ if ((j == 0) && (KB == BF-1)) { NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, dout_ptr, &LIBXSMM_VLA_ACCESS(2, dhp, in, ik, K)); } } } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); dout_cycles += _end - _start; } #endif if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif blocks = nBlocks; if ((C == K) && (bc == bk) && (bcbk_multiples_of_16 == 1)) { /* Interleave computation of dr = difoc * h^T and dw = difoc * x^T to take advantage of temporal locality */ /* Use blocked format for di, dci, df and db */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), 
&LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } } else { /* dr = difoc * h^T */ /* Use blocked format for di, dci, df and db */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, 
nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); } /* dw = difoc * x^T */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { icb = ikic / (K/bk); ic = icb*bc; ikb = ikic % (K/bk); ik = ikb*bk; batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } } #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); dwdr_cycles += _end - _start; } #endif #ifdef PROFILE if (ltid == 0) _start = _rdtsc(); #endif /* gradient bias */ #if defined(LIBXSMM_RNN_CELL_AVX512) if (bcbk_multiples_of_16) { for (ik = k_thr_begin; ik < k_thr_end; ik += 16) { dbi_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbi[ik]); dbf_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbf[ik]); dbo_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbo[ik]); dbc_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbc[ik]); for (in = 0; in < N; in++) { dbi_sum = _mm512_add_ps(dbi_sum, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, di, in, ik, K)))); dbf_sum = _mm512_add_ps(dbf_sum, 
LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, df, in, ik, K)))); dbo_sum = _mm512_add_ps(dbo_sum, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dp, in, ik, K)))); dbc_sum = _mm512_add_ps(dbc_sum, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dci, in, ik, K)))); } _mm512_storeu_ps(&dbi[ik], dbi_sum); _mm512_storeu_ps(&dbf[ik], dbf_sum); _mm512_storeu_ps(&dbo[ik], dbo_sum); _mm512_storeu_ps(&dbc[ik], dbc_sum); /* Downconvert delta bias to bf16 if done with all timesteps */ if (j == 0) { _mm256_storeu_si256((__m256i*)&dbi_bf16[ik], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(dbi_sum)); _mm256_storeu_si256((__m256i*)&dbf_bf16[ik], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(dbf_sum)); _mm256_storeu_si256((__m256i*)&dbo_bf16[ik], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(dbo_sum)); _mm256_storeu_si256((__m256i*)&dbc_bf16[ik], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(dbc_sum)); } } } else { /* TODO: Add alternative path here */ } #else /* TODO: Add alternative path here */ #endif #ifdef PROFILE if (ltid == 0) { _end = _rdtsc(); gradient_cycles += _end - _start; } #endif } libxsmm_barrier_wait(handle->barrier, (int)ltid); } #undef NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic.tpl.c000066400000000000000000000251201415223013700277440ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ #if 0 #define PROFILE #endif /* helper variables */ libxsmm_blasint j, ik, ikb, in, ic, icb, inik, BF, CB, CB_BLOCKS, KB_BLOCKS, ikic, jk, jc; /* input sizes */ const libxsmm_blasint K = handle->desc.K; const libxsmm_blasint N = handle->desc.N; const libxsmm_blasint C = handle->desc.C; const libxsmm_blasint t = handle->T; const libxsmm_blasint bk = handle->bk; const libxsmm_blasint bn = handle->bn; const libxsmm_blasint bc = handle->bc; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; unsigned long long blocks; /* define tensors */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *csp = (element_input_type* )handle->csp->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *w = (element_filter_type*)handle->w->data; element_filter_type *r = (element_filter_type*)handle->r->data; element_filter_type *w_scratch = (element_filter_type*)handle->scratch_w; element_filter_type *r_scratch = (element_filter_type*)handle->scratch_r; element_output_type *b = (element_output_type*)handle->b->data; element_output_type *cst = (element_output_type*)handle->cst->data; element_output_type *ht = (element_output_type*)handle->ht->data; element_output_type *it = (element_output_type*)handle->it->data; element_output_type *ft = (element_output_type*)handle->ft->data; element_output_type *ot = (element_output_type*)handle->ot->data; element_output_type *cit = (element_output_type*)handle->cit->data; element_output_type *cot = (element_output_type*)handle->cot->data; element_filter_type *wiD = &(w[0]); element_filter_type *wcD = &(w[K]); element_filter_type *wfD = &(w[2*K]); element_filter_type *woD = &(w[3*K]); element_filter_type *riD = &(r[0]); element_filter_type *rcD = &(r[K]); element_filter_type *rfD = &(r[2*K]); element_filter_type *roD = &(r[3*K]); element_filter_type *wiD_scratch = &(w_scratch[0]); 
element_filter_type *wcD_scratch = &(w_scratch[C*K]); element_filter_type *wfD_scratch = &(w_scratch[2*C*K]); element_filter_type *woD_scratch = &(w_scratch[3*C*K]); element_filter_type *riD_scratch = &(r_scratch[0]); element_filter_type *rcD_scratch = &(r_scratch[K*K]); element_filter_type *rfD_scratch = &(r_scratch[2*K*K]); element_filter_type *roD_scratch = &(r_scratch[3*K*K]); element_output_type *bi = &(b[0]); element_output_type *bd = &(b[K]); element_output_type *bf = &(b[2*K]); element_output_type *bo = &(b[3*K]); LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, cp, csp, K); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(4, element_filter_type, wi, wiD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wf, wfD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wo, woD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wc, wcD_scratch, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, ri, riD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rf, rfD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, ro, roD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rc, rcD_scratch, kBlocks, bk, bk); LIBXSMM_VLA_DECL(2, element_filter_type, wi_ck, wiD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, wf_ck, wfD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, wo_ck, woD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, wc_ck, wcD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, ri_ck, riD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, rf_ck, rfD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, ro_ck, roD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, rc_ck, rcD, 4*K); LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); 
LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); /* define batch-reduce gemm kernels */ const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, NULL, NULL ); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL ); /* Auxiliary arrays for batch-reduce gemms */ const element_filter_type *A_array[1024]; const element_input_type *B_array[1024]; element_output_type *cps_ptr = NULL; /* parallelize over C-blocks */ /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel */ const libxsmm_blasint work = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel for C and K blocks*/ const libxsmm_blasint work_ck = (C/bc) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? 
((ltid + 1) * chunksize_ck) : work_ck; /* number of tasks that could be run in parallel for K and K blocks*/ const libxsmm_blasint work_kk = (K/bk) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; const int use_fused_implementation = (C == 2048 && K == 2048) ? 1 : 0; #ifdef PROFILE __int64_t eltwise_start, eltwise_end, eltwise_cycles = 0, gemm_start, gemm_end, gemm_cycles = 0, gemm_cycles2 = 0, reformat_start, reformat_end, reformat_cycles = 0; float total_time = 0.0; #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* Blocking reduction domain if it is too large */ BF = 1; if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { BF = 8; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C > 2048 || K > 2048) { BF = 16; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C == 2048 && K == 1024) { BF = 2; } CB_BLOCKS = cBlocks/BF; KB_BLOCKS = kBlocks/BF; /* Upfront reformatting of W and R */ /* reformat W */ #ifdef PROFILE if (ltid == 0) reformat_start = _rdtsc(); #endif for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ic = (ikic / (K/bk)); ik = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bc; ++jc) { LIBXSMM_VLA_ACCESS(4, wi, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wi_ck, ic*bc+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(4, wc, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wc_ck, ic*bc+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(4, wf, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wf_ck, ic*bc+jc, ik*bk+jk, 
4*K); LIBXSMM_VLA_ACCESS(4, wo, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wo_ck, ic*bc+jc, ik*bk+jk, 4*K); } } } /* reformat R */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ik = (ikic / (K/bk)); ic = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bk; ++jc) { LIBXSMM_VLA_ACCESS(4, ri, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, ri_ck, ic*bk+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(4, rc, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rc_ck, ic*bk+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(4, rf, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rf_ck, ic*bk+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(4, ro, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, ro_ck, ic*bk+jc, ik*bk+jk, 4*K); } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); #ifdef PROFILE if (ltid == 0) { reformat_end = _rdtsc(); reformat_cycles = reformat_end - reformat_start; } #endif if (use_fused_implementation) { #include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused.tpl.c" } else { #include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused.tpl.c" } #ifdef PROFILE if (ltid == 0) { printf("----- PROFILING LSTM FWD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); total_time = (gemm_cycles+gemm_cycles2+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Reformat weights time is %f ms (%.2f%%)\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("GEMM W*x time is %f ms (%.2f%%) at %f GFLOPS\n", gemm_cycles/(2.5 * 1e9)*1000.0f, gemm_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*C*K*2.0)*4.0/1e9/(gemm_cycles/(2.5 * 1e9))); printf("GEMM R*h time is %f ms (%.2f%%) at %f GFLOPS\n\n", gemm_cycles2/(2.5 * 1e9)*1000.0f, gemm_cycles2/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*K*K*2.0)*4.0/1e9/(gemm_cycles2/(2.5 * 1e9))); } #undef PROFILE #endif libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16.tpl.c000066400000000000000000000323201415223013700305620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ #if 0 #define PROFILE #endif #define MATRIX_CVT_BF16_FP32_LD(m, n, ld, _src, _dst) \ do { \ libxsmm_bfloat16 *src = _src; \ float *dst = _dst; \ libxsmm_blasint __i,__j; \ for ( __j = 0; __j < n; ++__j ) { \ for ( __i = 0; __i < m; __i+=16 ) { \ _mm512_storeu_ps((float*)&dst[(__j*ld)+__i], LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&src[(__j*ld)+__i]))); \ } \ } \ } while (0) #define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD(m, n, ld, _srcdst, _colv) \ do { \ libxsmm_bfloat16 *colv = _colv; \ float *srcdst = _srcdst; \ libxsmm_blasint __i,__j; \ for ( __j = 0; __j < n; ++__j ) { \ for ( __i = 0; __i < m; __i+=16 ) { \ _mm512_storeu_ps((float*)&srcdst[(__j*ld)+__i], LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&colv[__i]))); \ } \ } \ } while (0) #define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD(m, n, ld, _srcdst, _colv, const_bias) \ do { \ libxsmm_bfloat16 *colv = _colv; \ float *srcdst = _srcdst; \ libxsmm_blasint __i,__j; \ __m512 vbias = _mm512_set1_ps(const_bias); \ for ( __j = 0; __j < n; ++__j ) { \ for ( __i = 0; __i < m; __i+=16 ) { \ _mm512_storeu_ps((float*)&srcdst[(__j*ld)+__i], _mm512_add_ps(vbias, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&colv[__i])))); \ } \ } \ } while (0) /* helper variables */ libxsmm_blasint j, ik, ikb, in, ic, /*icb,*/ inik, BF, CB, CB_BLOCKS, KB_BLOCKS, ikic, jk, jc; /* input sizes */ const libxsmm_blasint K = handle->desc.K; const libxsmm_blasint N = handle->desc.N; const libxsmm_blasint C = handle->desc.C; const libxsmm_blasint t = handle->T; const libxsmm_blasint bk = handle->bk; const libxsmm_blasint bn = handle->bn; const libxsmm_blasint bc = handle->bc; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; const int lpb = handle->lpb; const int bc_lp = bc/lpb; const int bk_lp = bk/lpb; unsigned long long blocks, blocksa, blocksb; /* define tensors */ 
element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_output_type *b = (element_output_type*)handle->b->data; element_input_type *csp = (element_input_type* )handle->csp->data; element_filter_type *w = (element_filter_type*)handle->w->data; element_filter_type *r = (element_filter_type*)handle->r->data; element_filter_type *w_scratch = (element_filter_type*)handle->scratch_w; element_filter_type *r_scratch = (element_filter_type*)handle->scratch_r; /* These buffers are scratch for fp32 output of gemms (intermmediate results) */ float *cst = (float*)handle->cst_scratch; float *ht = (float*)handle->ht_scratch; float *it = (float*)handle->it_scratch; float *ft = (float*)handle->ft_scratch; float *ot = (float*)handle->ot_scratch; float *cit = (float*)handle->cit_scratch; float *cot = (float*)handle->cot_scratch; /* This has to be also upconverted since it is used in the elementwise functions */ float *csp_f32 = (float*)handle->csp_scratch; /* These are the output bf16 data */ element_output_type *cst_bf16 = (element_output_type*)handle->cst->data; element_output_type *ht_bf16 = (element_output_type*)handle->ht->data; element_output_type *it_bf16 = (element_output_type*)handle->it->data; element_output_type *ft_bf16 = (element_output_type*)handle->ft->data; element_output_type *ot_bf16 = (element_output_type*)handle->ot->data; element_output_type *cit_bf16 = (element_output_type*)handle->cit->data; element_output_type *cot_bf16 = (element_output_type*)handle->cot->data; element_filter_type *wiD = &(w[0]); element_filter_type *wcD = &(w[K]); element_filter_type *wfD = &(w[2*K]); element_filter_type *woD = &(w[3*K]); element_filter_type *riD = &(r[0]); element_filter_type *rcD = &(r[K]); element_filter_type *rfD = &(r[2*K]); element_filter_type *roD = &(r[3*K]); element_filter_type *wiD_scratch = &(w_scratch[0]); element_filter_type *wcD_scratch = &(w_scratch[C*K]); element_filter_type 
*wfD_scratch = &(w_scratch[2*C*K]); element_filter_type *woD_scratch = &(w_scratch[3*C*K]); element_filter_type *riD_scratch = &(r_scratch[0]); element_filter_type *rcD_scratch = &(r_scratch[K*K]); element_filter_type *rfD_scratch = &(r_scratch[2*K*K]); element_filter_type *roD_scratch = &(r_scratch[3*K*K]); element_output_type *bi = &(b[0]); element_output_type *bd = &(b[K]); element_output_type *bf = &(b[2*K]); element_output_type *bo = &(b[3*K]); LIBXSMM_VLA_DECL(2, float, cp, csp_f32, K); LIBXSMM_VLA_DECL(2, element_input_type, cp_bf16, csp, K); LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(5, element_filter_type, wi, wiD_scratch, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wf, wfD_scratch, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wo, woD_scratch, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wc, wcD_scratch, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, ri, riD_scratch, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, rf, rfD_scratch, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, ro, roD_scratch, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, rc, rcD_scratch, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(2, element_filter_type, wi_ck, wiD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, wf_ck, wfD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, wo_ck, woD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, wc_ck, wcD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, ri_ck, riD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, rf_ck, rfD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, ro_ck, roD, 4*K); LIBXSMM_VLA_DECL(2, element_filter_type, rc_ck, rcD, 4*K); LIBXSMM_VLA_DECL(3, float, cs, cst, N, K); LIBXSMM_VLA_DECL(3, float, h, ht, N, K); LIBXSMM_VLA_DECL(3, float, i, it, N, K); LIBXSMM_VLA_DECL(3, float, f, ft, N, K); LIBXSMM_VLA_DECL(3, float, 
o, ot, N, K); LIBXSMM_VLA_DECL(3, float, ci, cit, N, K); LIBXSMM_VLA_DECL(3, float, co, cot, N, K); LIBXSMM_VLA_DECL(3, element_output_type, cs_out, cst_bf16, N, K); LIBXSMM_VLA_DECL(3, element_output_type, h_out, ht_bf16, N, K); LIBXSMM_VLA_DECL(3, element_output_type, i_out, it_bf16, N, K); LIBXSMM_VLA_DECL(3, element_output_type, f_out, ft_bf16, N, K); LIBXSMM_VLA_DECL(3, element_output_type, o_out, ot_bf16, N, K); LIBXSMM_VLA_DECL(3, element_output_type, ci_out, cit_bf16, N, K); LIBXSMM_VLA_DECL(3, element_output_type, co_out, cot_bf16, N, K); /* define batch-reduce gemm kernels */ const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->fwd_kernela; const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->fwd_kernelb; float *cps_ptr = NULL; /* parallelize over C-blocks */ /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel */ const libxsmm_blasint work = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* number of tasks that could be run in parallel for C and K blocks*/ const libxsmm_blasint work_ck = (C/bc) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? 
(ltid * chunksize_ck) : work_ck; const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; /* number of tasks that could be run in parallel for K and K blocks*/ const libxsmm_blasint work_kk = (K/bk) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; const int use_fused_implementation = (C == 2048 && K == 2048) ? 1 : 0; #ifdef PROFILE __int64_t eltwise_start, eltwise_end, eltwise_cycles = 0, gemm_start, gemm_end, gemm_cycles = 0, gemm_cycles2 = 0, reformat_start, reformat_end, reformat_cycles = 0; float total_time = 0.0; #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* Blocking reduction domain if it is too large */ BF = 1; if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { BF = 8; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C > 2048 || K > 2048) { BF = 16; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C == 2048 && K == 1024) { BF = 2; } CB_BLOCKS = cBlocks/BF; KB_BLOCKS = kBlocks/BF; /* Upfront reformatting of W and R */ /* reformat W */ #ifdef PROFILE if (ltid == 0) reformat_start = _rdtsc(); #endif for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ic = (ikic / (K/bk)); ik = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bc;++jc) { LIBXSMM_VLA_ACCESS(5, wi, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, wi_ck, ic*bc+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(5, wc, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = 
LIBXSMM_VLA_ACCESS(2, wc_ck, ic*bc+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(5, wf, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, wf_ck, ic*bc+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(5, wo, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, wo_ck, ic*bc+jc, ik*bk+jk, 4*K); } } } /* reformat R */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ik = (ikic / (K/bk)); ic = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bk; ++jc) { LIBXSMM_VLA_ACCESS(5, ri, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, ri_ck, ic*bk+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(5, rc, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, rc_ck, ic*bk+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(5, rf, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, rf_ck, ic*bk+jc, ik*bk+jk, 4*K); LIBXSMM_VLA_ACCESS(5, ro, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, ro_ck, ic*bk+jc, ik*bk+jk, 4*K); } } } /* Upconvert the cp input to fp32 that is used for elementwise stuff */ for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; libxsmm_internal_matrix_cvt_bf16_fp32_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, cp_bf16, in, ik, K), &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K)); } libxsmm_barrier_wait(handle->barrier, (int)ltid); #ifdef PROFILE if (ltid == 0) { reformat_end = _rdtsc(); reformat_cycles = reformat_end - reformat_start; } #endif if (use_fused_implementation) { #include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16.tpl.c" } else { #include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16.tpl.c" } #ifdef PROFILE if (ltid == 0) { printf("----- PROFILING LSTM FWD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); total_time = (gemm_cycles+gemm_cycles2+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Reformat weights time is %f ms (%.2f%%)\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("GEMM W*x time is %f ms (%.2f%%) at %f GFLOPS\n", gemm_cycles/(2.5 * 1e9)*1000.0f, gemm_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*C*K*2.0)*4.0/1e9/(gemm_cycles/(2.5 * 1e9))); printf("GEMM R*h time is %f ms (%.2f%%) at %f GFLOPS\n\n", gemm_cycles2/(2.5 * 1e9)*1000.0f, gemm_cycles2/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*K*K*2.0)*4.0/1e9/(gemm_cycles2/(2.5 * 1e9))); } #undef PROFILE #endif libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck.tpl.c000066400000000000000000000147471415223013700266030ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ #if 0 #define PROFILE #endif /* helper variables */ libxsmm_blasint j, ik, ikb, in, ic, icb, inik, BF, CB, CB_BLOCKS, KB_BLOCKS; /* input sizes */ const libxsmm_blasint K = handle->desc.K; const libxsmm_blasint N = handle->desc.N; const libxsmm_blasint C = handle->desc.C; const libxsmm_blasint t = handle->T; const libxsmm_blasint bk = handle->bk; const libxsmm_blasint bn = handle->bn; const libxsmm_blasint bc = handle->bc; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; unsigned long long blocks; /* define tensors */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *csp = (element_input_type* )handle->csp->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *w = (element_filter_type*)handle->w->data; element_filter_type *r = (element_filter_type*)handle->r->data; element_output_type *b = (element_output_type*)handle->b->data; element_output_type *cst = (element_output_type*)handle->cst->data; element_output_type *ht = (element_output_type*)handle->ht->data; element_output_type *it = (element_output_type*)handle->it->data; element_output_type *ft = (element_output_type*)handle->ft->data; element_output_type *ot = (element_output_type*)handle->ot->data; element_output_type *cit = (element_output_type*)handle->cit->data; element_output_type *cot = (element_output_type*)handle->cot->data; element_filter_type *wiD = &(w[0]); element_filter_type *wcD = &(w[C*K]); element_filter_type *wfD = &(w[2*C*K]); element_filter_type *woD = &(w[3*C*K]); element_filter_type *riD = &(r[0]); element_filter_type *rcD = &(r[K*K]); element_filter_type *rfD = &(r[2*K*K]); element_filter_type *roD = &(r[3*K*K]); element_output_type *bi = &(b[0]); element_output_type *bd = &(b[K]); element_output_type *bf = &(b[2*K]); element_output_type *bo = &(b[3*K]); LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); 
LIBXSMM_VLA_DECL(2, element_input_type, cp, csp, K); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(4, element_filter_type, wi, wiD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wf, wfD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wo, woD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, wc, wcD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, ri, riD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rf, rfD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, ro, roD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(4, element_filter_type, rc, rcD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); /* define batch-reduce gemm kernels */ const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, NULL, NULL ); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL ); /* Auxiliary arrays for batch-reduce gemms */ const element_filter_type *A_array[1024]; const element_input_type *B_array[1024]; element_output_type *cps_ptr = NULL; /* parallelize over C-blocks */ /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel */ const libxsmm_blasint work = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? 
(work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; const int use_fused_implementation = (C == 2048 && K == 2048) ? 1 : 0; #ifdef PROFILE __int64_t eltwise_start, eltwise_end, eltwise_cycles = 0, gemm_start, gemm_end, gemm_cycles = 0, gemm_cycles2 = 0; float total_time = 0.0; #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* Blocking reduction domain if it is too large */ BF = 1; if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { BF = 8; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C > 2048 || K > 2048) { BF = 16; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C == 2048 && K == 1024) { BF = 2; } CB_BLOCKS = cBlocks/BF; KB_BLOCKS = kBlocks/BF; if (use_fused_implementation) { #include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused.tpl.c" } else { #include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused.tpl.c" } #ifdef PROFILE if (ltid == 0) { printf("----- PROFILING LSTM FWD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); total_time = (gemm_cycles+gemm_cycles2+eltwise_cycles)/(2.5 * 1e9)*1000.0f; printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("GEMM W*x time is %f ms (%.2f%%) at %f GFLOPS\n", gemm_cycles/(2.5 * 1e9)*1000.0f, gemm_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*C*K*2.0)*4.0/1e9/(gemm_cycles/(2.5 * 1e9))); printf("GEMM R*h time is %f ms (%.2f%%) at %f GFLOPS\n\n", gemm_cycles2/(2.5 * 1e9)*1000.0f, gemm_cycles2/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*K*K*2.0)*4.0/1e9/(gemm_cycles2/(2.5 * 1e9))); } #undef PROFILE #endif libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16.tpl.c000066400000000000000000000252541415223013700274140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ #if 0 #define PROFILE #endif #define MATRIX_CVT_BF16_FP32_LD(m, n, ld, _src, _dst) \ do { \ libxsmm_bfloat16 *src = _src; \ float *dst = _dst; \ libxsmm_blasint __i,__j; \ for ( __j = 0; __j < n; ++__j ) { \ for ( __i = 0; __i < m; __i+=16 ) { \ _mm512_storeu_ps((float*)&dst[(__j*ld)+__i], LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&src[(__j*ld)+__i]))); \ } \ } \ } while (0) #define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD(m, n, ld, _srcdst, _colv) \ do { \ libxsmm_bfloat16 *colv = _colv; \ float *srcdst = _srcdst; \ libxsmm_blasint __i,__j; \ for ( __j = 0; __j < n; ++__j ) { \ for ( __i = 0; __i < m; __i+=16 ) { \ _mm512_storeu_ps((float*)&srcdst[(__j*ld)+__i], LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&colv[__i]))); \ } \ } \ } while (0) #define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD(m, n, ld, _srcdst, _colv, const_bias) \ do { \ libxsmm_bfloat16 *colv = _colv; \ float *srcdst = _srcdst; \ libxsmm_blasint __i,__j; \ __m512 vbias = _mm512_set1_ps(const_bias); \ for ( __j = 0; __j < n; ++__j ) { \ for ( __i = 0; __i < m; __i+=16 ) { \ _mm512_storeu_ps((float*)&srcdst[(__j*ld)+__i], _mm512_add_ps(vbias, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&colv[__i])))); \ } \ } \ } while (0) /* helper variables */ libxsmm_blasint j, ik, ikb, in, /*ic, icb,*/ inik, BF, CB, CB_BLOCKS, KB_BLOCKS; /* input sizes */ const libxsmm_blasint K = handle->desc.K; const libxsmm_blasint N = handle->desc.N; const libxsmm_blasint C = handle->desc.C; const libxsmm_blasint t = handle->T; const libxsmm_blasint bk = handle->bk; const libxsmm_blasint bn = handle->bn; const libxsmm_blasint bc = handle->bc; const libxsmm_blasint cBlocks = C/bc; const libxsmm_blasint kBlocks = K/bk; int lpb = 2; const int bc_lp = bc/lpb; const int bk_lp = bk/lpb; unsigned long long blocks, blocksa, blocksb; /* define tensors */ element_input_type *xt = 
(element_input_type* )handle->xt->data; element_input_type *csp = (element_input_type* )handle->csp->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *w = (element_filter_type*)handle->w->data; element_filter_type *r = (element_filter_type*)handle->r->data; element_output_type *b = (element_output_type*)handle->b->data; /* These buffers are scratch for fp32 output of gemms (intermmediate results) */ float *cst = (float*)handle->cst_scratch; float *ht = (float*)handle->ht_scratch; float *it = (float*)handle->it_scratch; float *ft = (float*)handle->ft_scratch; float *ot = (float*)handle->ot_scratch; float *cit = (float*)handle->cit_scratch; float *cot = (float*)handle->cot_scratch; /* This has to be also upconverted since it is used in the elementwise functions */ float *csp_f32 = (float*)handle->csp_scratch; /* These are the output bf16 data */ element_output_type *cst_bf16 = (element_output_type*)handle->cst->data; element_output_type *ht_bf16 = (element_output_type*)handle->ht->data; element_output_type *it_bf16 = (element_output_type*)handle->it->data; element_output_type *ft_bf16 = (element_output_type*)handle->ft->data; element_output_type *ot_bf16 = (element_output_type*)handle->ot->data; element_output_type *cit_bf16 = (element_output_type*)handle->cit->data; element_output_type *cot_bf16 = (element_output_type*)handle->cot->data; element_filter_type *wiD = &(w[0]); element_filter_type *wcD = &(w[C*K]); element_filter_type *wfD = &(w[2*C*K]); element_filter_type *woD = &(w[3*C*K]); element_filter_type *riD = &(r[0]); element_filter_type *rcD = &(r[K*K]); element_filter_type *rfD = &(r[2*K*K]); element_filter_type *roD = &(r[3*K*K]); element_output_type *bi = &(b[0]); element_output_type *bd = &(b[K]); element_output_type *bf = &(b[2*K]); element_output_type *bo = &(b[3*K]); LIBXSMM_VLA_DECL(2, float, cp, csp_f32, K); LIBXSMM_VLA_DECL(2, element_input_type, cp_bf16, csp, K); LIBXSMM_VLA_DECL(3, element_input_type, x, 
xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(5, element_filter_type, wi, wiD, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wf, wfD, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wo, woD, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, wc, wcD, cBlocks, bc_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, ri, riD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, rf, rfD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, ro, roD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(5, element_filter_type, rc, rcD, kBlocks, bk_lp, bk, lpb); LIBXSMM_VLA_DECL(3, float, cs, cst, N, K); LIBXSMM_VLA_DECL(3, float, h, ht, N, K); LIBXSMM_VLA_DECL(3, float, i, it, N, K); LIBXSMM_VLA_DECL(3, float, f, ft, N, K); LIBXSMM_VLA_DECL(3, float, o, ot, N, K); LIBXSMM_VLA_DECL(3, float, ci, cit, N, K); LIBXSMM_VLA_DECL(3, float, co, cot, N, K); LIBXSMM_VLA_DECL(3, element_output_type, cs_out, cst_bf16, N, K); LIBXSMM_VLA_DECL(3, element_output_type, h_out, ht_bf16, N, K); LIBXSMM_VLA_DECL(3, element_output_type, i_out, it_bf16, N, K); LIBXSMM_VLA_DECL(3, element_output_type, f_out, ft_bf16, N, K); LIBXSMM_VLA_DECL(3, element_output_type, o_out, ot_bf16, N, K); LIBXSMM_VLA_DECL(3, element_output_type, ci_out, cit_bf16, N, K); LIBXSMM_VLA_DECL(3, element_output_type, co_out, cot_bf16, N, K); /* define batch-reduce gemm kernels */ const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->fwd_kernela; const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->fwd_kernelb; float *cps_ptr = NULL; /* parallelize over C-blocks */ /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel */ const libxsmm_blasint work = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize = (work % 
(libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; #if 0 /* number of tasks that could be run in parallel for C and K blocks*/ const libxsmm_blasint work_ck = (C/bc) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; /* number of tasks that could be run in parallel for K and K blocks*/ const libxsmm_blasint work_kk = (K/bk) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; #endif const int use_fused_implementation = (C == 2048 && K == 2048) ? 
1 : 0; #ifdef PROFILE __int64_t eltwise_start, eltwise_end, eltwise_cycles = 0, gemm_start, gemm_end, gemm_cycles = 0, gemm_cycles2 = 0, reformat_start, reformat_end, reformat_cycles = 0; float total_time = 0.0; #endif /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* Blocking reduction domain if it is too large */ BF = 1; if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { BF = 8; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C > 2048 || K > 2048) { BF = 16; while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { BF--; } } if (C == 2048 && K == 1024) { BF = 2; } CB_BLOCKS = cBlocks/BF; KB_BLOCKS = kBlocks/BF; #ifdef PROFILE if (ltid == 0) reformat_start = _rdtsc(); #endif /* Upconvert the cp input to fp32 that is used for elementwise stuff */ for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; MATRIX_CVT_BF16_FP32_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, cp_bf16, in, ik, K), &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K)); } libxsmm_barrier_wait(handle->barrier, (int)ltid); #ifdef PROFILE if (ltid == 0) { reformat_end = _rdtsc(); reformat_cycles = reformat_end - reformat_start; } #endif if (use_fused_implementation) { #include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16.tpl.c" } else { #include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16.tpl.c" } #ifdef PROFILE if (ltid == 0) { printf("----- PROFILING LSTM FWD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); total_time = (gemm_cycles+gemm_cycles2+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("Reformat weights time is %f ms (%.2f%%)\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); printf("GEMM W*x time is %f ms (%.2f%%) at %f GFLOPS\n", gemm_cycles/(2.5 * 1e9)*1000.0f, gemm_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*C*K*2.0)*4.0/1e9/(gemm_cycles/(2.5 * 1e9))); printf("GEMM R*h time is %f ms (%.2f%%) at %f GFLOPS\n\n", gemm_cycles2/(2.5 * 1e9)*1000.0f, gemm_cycles2/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*K*K*2.0)*4.0/1e9/(gemm_cycles2/(2.5 * 1e9))); } #undef PROFILE #endif #undef MATRIX_CVT_BF16_FP32_LD #undef MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD #undef MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused.tpl.c000066400000000000000000000266431415223013700304520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.) 
******************************************************************************/ /* First perform the W*x part of the output */ for (j = 0; j < t; ++j) { /* let's run the cell in blocks for good locality */ /* Block reduction loop if requested */ for (CB = 0; CB < BF; CB++) { for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; /* initialize i with bi */ #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); /* i += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wi, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize ci with bd */ if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); /* ci += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wc, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize f with (bf + forget_bias) */ if (CB == 0) libxsmm_internal_matrix_bcst_colvector_const_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], 
handle->forget_bias ); /* f += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wf, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize o with bo */ if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); /* o += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wo, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif } } } /* Compute the R*h part of the output */ for (j = 0; j < t; ++j) { /* let's run the cell in blocks for good locality */ /* Block reduction loop if requested */ for (CB = 0; CB < BF; CB++) { for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* i += R.h */ if (0 == j) { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); } } else { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = 
&LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); } } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* ci += R.h */ if (0 == j) { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); } } else { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); } } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* f += R.h */ if (0 == j) { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rf, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); } } else { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rf, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); } } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* o += R.h */ if (0 == j) { for (ic = 0, 
icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ro, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); } } else { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ro, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); } } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif if (CB == BF-1) { #ifdef PROFILE if (ltid == 0) { eltwise_start = _rdtsc(); } #endif cps_ptr = (j == 0) ? &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K) : &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); /* Compute i, ci, f, o, cs, co and h */ #if defined(LIBXSMM_RNN_CELL_AVX512) if (bk % 16 == 0 && bc % 16 == 0) { #include "libxsmm_internal_lstm_fwd_fused_eltwise.tpl.c" } else { libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, 
&LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); } #else libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); #endif #ifdef PROFILE if (ltid == 0) { eltwise_end = _rdtsc(); eltwise_cycles += eltwise_end-eltwise_start; } #endif } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16.tpl.c000066400000000000000000000414471415223013700312670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. 
* * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.) ******************************************************************************/ #define NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(m, n, ld, _src, _dst) \ do { \ float *const src = _src; \ libxsmm_bfloat16 *const dst = _dst; \ libxsmm_blasint __i, __j; \ __m512i packed_result; \ for ( __j = 0; __j < n; ++__j ) { \ for ( __i = 0; __i < m; __i+=32 ) { \ packed_result = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&src[(__j*ld)+__i+16]), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&src[(__j*ld)+__i])); \ _mm512_storeu_si512(&dst[(__j*ld)+__i], packed_result); \ } \ } \ } while (0) /* First perform the W*x part of the output */ blocks = CB_BLOCKS; for (j = 0; j < t; ++j) { /* let's run the cell in blocks for good locality */ /* Block reduction loop if requested */ for (CB = 0; CB < BF; CB++) { for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; /* initialize i with bi */ #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); /* i += W.x */ batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wi, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize ci with bd */ if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); 
/* ci += W.x */ batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wc, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize f with (bf + forget_bias) */ if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], handle->forget_bias ); /* f += W.x */ batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wf, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize o with bo */ if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); /* o += W.x */ batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wo, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } /* Compute the R*h part of the output */ blocks = KB_BLOCKS; /* Peel off the t=0 iteration to hoist the innermost if conditions */ j = 0; /* let's run the cell in blocks for good locality */ /* Block reduction loop if requested */ for (CB = 0; CB < BF; CB++) { for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* i += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 
0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), &LIBXSMM_VLA_ACCESS(3, i, 0, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* ci += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), &LIBXSMM_VLA_ACCESS(3, ci, 0, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* f += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), &LIBXSMM_VLA_ACCESS(3, f, 0, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* o += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), &LIBXSMM_VLA_ACCESS(3, o, 0, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif if (CB == BF-1) { #ifdef PROFILE if (ltid == 0) { eltwise_start = _rdtsc(); } #endif cps_ptr = &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K); /* Compute i, ci, f, o, cs, co and h */ #if defined(LIBXSMM_RNN_CELL_AVX512) if (bk % 16 == 0 && bc % 16 == 0) { #include "libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c" } else { libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); 
libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); } #else libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); 
libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); #endif /* Downconvert computed results to bf16 output buffers */ NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co_out, j, in, ik, N, K)); #ifdef PROFILE if (ltid == 0) { eltwise_end = _rdtsc(); eltwise_cycles += eltwise_end-eltwise_start; } #endif } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); for (j = 1; j < t; ++j) { /* let's run the cell in blocks for good locality */ /* Block reduction loop if requested */ for (CB = 0; CB < BF; CB++) { for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* i += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = 
_rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* ci += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* f += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* o += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif if (CB == BF-1) { #ifdef PROFILE if (ltid == 0) { eltwise_start = _rdtsc(); } #endif cps_ptr = &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); /* Compute i, ci, f, o, cs, co and h */ #if defined(LIBXSMM_RNN_CELL_AVX512) if (bk % 16 == 0 && bc % 16 == 0) { #include "libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c" } else { libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); 
libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); } #else libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, 
in, ik, N, K) ); #endif /* Downconvert computed results to bf16 output buffers */ NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co_out, j, in, ik, N, K)); #ifdef PROFILE if (ltid == 0) { eltwise_end = _rdtsc(); eltwise_cycles += eltwise_end-eltwise_start; } #endif } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } #undef NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused.tpl.c000066400000000000000000000261031415223013700277560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.) 
******************************************************************************/ /* All data is in column-major format */ for (j = 0; j < t; ++j) { /* let's run the cell in blocks for good locality */ /* Block reduction loop if requested */ for (CB = 0; CB < BF; CB++) { for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; /* initialize i with bi */ #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); /* i += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wi, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* i += R.h */ if (0 == j) { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); } } else { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); } } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize ci with bd */ if (CB == 0) 
libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); /* ci += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wc, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* ci += R.h */ if (0 == j) { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); } } else { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); } } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize f with (bf + forget_bias) */ if (CB == 0) libxsmm_internal_matrix_bcst_colvector_const_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], handle->forget_bias ); /* f += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wf, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, 
in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* f += R.h */ if (0 == j) { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rf, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); } } else { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rf, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); } } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize o with bo */ if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); /* o += W.x */ for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wo, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* o += R.h */ if (0 == j) { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ro, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); } } else { for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { A_array[icb] 
= &LIBXSMM_VLA_ACCESS(4, ro, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); } } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif if (CB == BF-1) { #ifdef PROFILE if (ltid == 0) { eltwise_start = _rdtsc(); } #endif cps_ptr = (j == 0) ? &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K) : &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); /* Compute i, ci, f, o, cs, co and h */ #if defined(LIBXSMM_RNN_CELL_AVX512) if (bk % 16 == 0 && bc % 16 == 0) { #include "libxsmm_internal_lstm_fwd_fused_eltwise.tpl.c" } else { libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); } #else 
libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); #endif #ifdef PROFILE if (ltid == 0) { eltwise_end = _rdtsc(); eltwise_cycles += eltwise_end-eltwise_start; } #endif } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16.tpl.c000066400000000000000000000450571415223013700306050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.) ******************************************************************************/ #define NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(m, n, ld, _src, _dst) \ do { \ float *const src = _src; \ libxsmm_bfloat16 *const dst = _dst; \ libxsmm_blasint __i, __j; \ __m512i packed_result; \ for ( __j = 0; __j < n; ++__j ) { \ for ( __i = 0; __i < m; __i+=32 ) { \ packed_result = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&src[(__j*ld)+__i+16]), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&src[(__j*ld)+__i])); \ _mm512_storeu_si512(&dst[(__j*ld)+__i], packed_result); \ } \ } \ } while (0) blocksa = CB_BLOCKS; blocksb = KB_BLOCKS; /* All data is in column-major format */ /* Peel off the t=0 iteration to hoist the innermost if conditions */ j = 0; for (CB = 0; CB < BF; CB++) { for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; /* initialize i with bi */ #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); /* i += W.x */ batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wi, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocksa); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* i += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), &LIBXSMM_VLA_ACCESS(3, i, 0, in, ik, N, K), &blocksb); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); 
gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize ci with bd */ if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); /* ci += W.x */ batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wc, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocksa); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* ci += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), &LIBXSMM_VLA_ACCESS(3, ci, 0, in, ik, N, K), &blocksb); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize f with (bf + forget_bias) */ if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], handle->forget_bias ); /* f += W.x */ batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wf, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocksa); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* f += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), &LIBXSMM_VLA_ACCESS(3, f, 0, in, ik, N, K), &blocksb); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize o 
with bo */ if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); /* o += W.x */ batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wo, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocksa); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* o += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), &LIBXSMM_VLA_ACCESS(3, o, 0, in, ik, N, K), &blocksb); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif if (CB == BF-1) { #ifdef PROFILE if (ltid == 0) { eltwise_start = _rdtsc(); } #endif cps_ptr = &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K); /* Compute i, ci, f, o, cs, co and h */ #if defined(LIBXSMM_RNN_CELL_AVX512) if (bk % 16 == 0 && bc % 16 == 0) { #include "libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c" } else { libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), 
&LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); } #else libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); #endif /* Downconvert computed results to bf16 output buffers */ NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, 
&LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co_out, j, in, ik, N, K)); #ifdef PROFILE if (ltid == 0) { eltwise_end = _rdtsc(); eltwise_cycles += eltwise_end-eltwise_start; } #endif } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); for (j = 1; j < t; ++j) { /* let's run the cell in blocks for good locality */ /* Block reduction loop if requested */ for (CB = 0; CB < BF; CB++) { for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik % (N/bn))*bn; ikb = inik / (N/bn); ik = ikb*bk; /* initialize i with bi */ #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); /* i += W.x */ batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wi, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocksa); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* i += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocksb); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } 
#endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize ci with bd */ if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); /* ci += W.x */ batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wc, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocksa); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* ci += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocksb); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize f with (bf + forget_bias) */ if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], handle->forget_bias ); /* f += W.x */ batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wf, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocksa); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* f += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocksb); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* initialize o with bo */ if (CB 
== 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); /* o += W.x */ batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wo, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocksa); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles += gemm_end-gemm_start; } #endif #ifdef PROFILE if (ltid == 0) gemm_start = _rdtsc(); #endif /* o += R.h */ batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocksb); #ifdef PROFILE if (ltid == 0) { gemm_end = _rdtsc(); gemm_cycles2 += gemm_end-gemm_start; } #endif if (CB == BF-1) { #ifdef PROFILE if (ltid == 0) { eltwise_start = _rdtsc(); } #endif cps_ptr = &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); /* Compute i, ci, f, o, cs, co and h */ #if defined(LIBXSMM_RNN_CELL_AVX512) if (bk % 16 == 0 && bc % 16 == 0) { #include "libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c" } else { libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), 
&LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); } #else libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); #endif /* Downconvert computed results to bf16 output buffers */ NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, 
&LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci_out, j, in, ik, N, K)); NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co_out, j, in, ik, N, K)); #ifdef PROFILE if (ltid == 0) { eltwise_end = _rdtsc(); eltwise_cycles += eltwise_end-eltwise_start; } #endif } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } #undef NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c000066400000000000000000000362171415223013700303000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ /* helper variables */ libxsmm_blasint i, ik, in, ic, jk, jb/*jn shadows global variable*/, jc, ek, en, ec; /* tensor dimensions */ libxsmm_blasint K = handle->desc.K; libxsmm_blasint N = handle->desc.N; libxsmm_blasint C = handle->desc.C; libxsmm_blasint t = handle->T; libxsmm_blasint bk = handle->bk; libxsmm_blasint bn = handle->bn; libxsmm_blasint bc = handle->bc; /* tensor raw pointers */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *wD = (element_filter_type*)handle->w->data; element_filter_type *rD = (element_filter_type*)handle->r->data; element_output_type *ht = (element_output_type*)handle->ht->data; element_input_type *dxt = (element_input_type*)handle->dxt->data; element_filter_type *dwD = (element_filter_type*)handle->dw->data; element_filter_type *drD = (element_filter_type*)handle->dr->data; element_output_type *db = (element_output_type*)handle->db->data; element_output_type *dht = (element_output_type*)handle->dht->data; element_output_type *deltat = (element_output_type*)handle->scratch_deltat; element_input_type *scratch_xT = (element_input_type*)handle->scratch_xT; element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; /* multidimensional arrays */ LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(2, element_filter_type, w, wD, K); LIBXSMM_VLA_DECL(2, element_filter_type, r, rD, K); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_input_type, dx, dxt, N, C); LIBXSMM_VLA_DECL(2, element_filter_type, dw, dwD, K); LIBXSMM_VLA_DECL(2, element_filter_type, dr, drD, K); LIBXSMM_VLA_DECL(3, 
element_output_type, dh, dht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, delta, deltat, N, K); LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); LIBXSMM_VLA_DECL(2, element_filter_type, wT, scratch_wT, C); LIBXSMM_VLA_DECL(2, element_filter_type, rT, scratch_rT, K); LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); #if defined(LIBXSMM_DNN_RNN_RELU_BWDUPD) || defined(LIBXSMM_DNN_RNN_SIGMOID_BWDUPD) || defined(LIBXSMM_DNN_RNN_TANH_BWDUPD) element_output_type *zt = (element_output_type*)handle->internal_z; LIBXSMM_VLA_DECL(3, element_output_type, z, zt, N, K); #endif /* define gemm kernels */ libxsmm_smmfunction gemmkernela = libxsmm_smmdispatch( bc, bn, bk, &C, &K, &C, NULL, NULL, NULL, NULL ); libxsmm_smmfunction gemmkernelb = libxsmm_smmdispatch( bk, bk, bn, &K, &N, &K, NULL, NULL, NULL, NULL ); libxsmm_smmfunction gemmkernelc = libxsmm_smmdispatch( bk, bc, bn, &K, &N, &K, NULL, NULL, NULL, NULL ); libxsmm_smmfunction gemmkerneld = libxsmm_smmdispatch( bk, bn, bk, &K, &K, &K, NULL, NULL, NULL, NULL ); /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel for N and K blocks*/ const libxsmm_blasint work_nk = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? 
((ltid + 1) * chunksize_nk) : work_nk; /* number of tasks that could be run in parallel for N and C blocks*/ const libxsmm_blasint work_nc = (N/bn) * (C/bc); /* compute chunk size */ const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; /* number of tasks that could be run in parallel for C and K blocks*/ const libxsmm_blasint work_ck = (C/bc) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; /* number of tasks that could be run in parallel for K and K blocks*/ const libxsmm_blasint work_kk = (K/bk) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? 
((ltid + 1) * chunksize_kk) : work_kk; /* number of tasks that could be run in parallel for K blocks*/ /* compute chunk size */ const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? (K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? ((ltid + 1) * chunksize_k) : K; libxsmm_blasint ikic, inic, inik, icin, ikin; /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* initialization is done at the beginning */ if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(N*C*t, dxt, start_thread, tid, handle->desc.threads); } if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { libxsmm_internal_matrix_zero(C*K, dwD, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K*K, drD, start_thread, tid, handle->desc.threads); libxsmm_internal_matrix_zero(K, db, start_thread, tid, handle->desc.threads); } /* transpose W */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ik = (ikic / (C/bc))*bk; ic = (ikic % (C/bc))*bc; for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bc; ++jc) { ek = ik + jk; ec = ic + jc; LIBXSMM_VLA_ACCESS(2, wT, ek, ec, C) = LIBXSMM_VLA_ACCESS(2, w, ec, ek, K); } } } /* transpose R */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ik = (ikic / (K/bk))*bk; ic = (ikic % (K/bk))*bk; for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bk; ++jc) { ek = ik + jk; ec = ic + jc; LIBXSMM_VLA_ACCESS(2, rT, ek, ec, K) = LIBXSMM_VLA_ACCESS(2, r, ec, ek, K); } } } /* transpose xt for current timestep */ for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { ic = (icin / (N/bn))*bc; in = (icin % (N/bn))*bn; for (jc = 0; jc < bc; ++jc) { for (jb = 0; jb < 
bn; ++jb) { en = in + jb; ec = ic + jc; LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, t-1, en, ec, N, C); } } } /* transpose ht for current timestep */ for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { ik = (ikin / (N/bn))*bk; in = (ikin % (N/bn))*bn; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, t-2, en, ek, N, K); } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); /* The following code is for time step t-1 */ for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik / (K/bk))*bn; ik = (inik % (K/bk))*bk; #if defined(LIBXSMM_DNN_RNN_RELU_BWDUPD) libxsmm_internal_matrix_relu_inverse_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); #endif #if defined(LIBXSMM_DNN_RNN_SIGMOID_BWDUPD) libxsmm_internal_matrix_sigmoid_inverse_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); #endif #if defined(LIBXSMM_DNN_RNN_TANH_BWDUPD) libxsmm_internal_matrix_tanh_inverse_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); #endif libxsmm_internal_matrix_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); } libxsmm_barrier_wait(handle->barrier, (int)ltid); if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* gemm kernel bwd_d */ for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { in = (inic / (C/bc))*bn; ic = (inic % (C/bc))*bc; for (ik = 0; ik < K; ik += bk) { gemmkernela( &LIBXSMM_VLA_ACCESS(2, wT, ik, ic, C), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, dx, t-1, in, ic, N, C) ); } } } if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* gradient bias */ for 
(ik = thr_begin_k; ik < thr_end_k; ik++) { for (in = 0; in < N; in++) { db[ik] += LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K); } } /* dr = delta * h^T */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ic = (ikic / (K/bk))*bk; ik = (ikic % (K/bk))*bk; for (in = 0; in < N; in += bn) { gemmkernelb( &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N), &LIBXSMM_VLA_ACCESS(2, dr, ic, ik, K) ); } } /* dw = delta * x^T */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ic = (ikic / (K/bk))*bc; ik = (ikic % (K/bk))*bk; for (in = 0; in < N; in += bn ) { gemmkernelc( &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N), &LIBXSMM_VLA_ACCESS(2, dw, ic, ik, K) ); } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); for (i = t-2; i >= 0; --i) { /* transpose xt for current timestep */ for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { ic = (icin / (N/bn))*bc; in = (icin % (N/bn))*bn; for (jc = 0; jc < bc; ++jc) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ec = ic + jc; LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, i, en, ec, N, C); } } } /* transpose ht for current timestep */ if (0 == i) { for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { ik = (ikin / (N/bn))*bk; in = (ikin % (N/bn))*bn; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); } } } } else { for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { ik = (ikin / (N/bn))*bk; in = (ikin % (N/bn))*bn; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, i-1, en, ek, N, K); } } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); /* let's run the cell in blocks for good locality */ for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik / (K/bk))*bn; ik = (inik % (K/bk))*bk; /* 
delta = dh */ libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); /* delta += R^T * delta+1 */ for (ic = 0; ic < K; ic += bk) { gemmkerneld( &LIBXSMM_VLA_ACCESS(2, rT, ic, ik, K), &LIBXSMM_VLA_ACCESS(3, delta, i+1, in, ic, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); } /* run inverse non-linear op */ #if defined(LIBXSMM_DNN_RNN_RELU_BWDUPD) libxsmm_internal_matrix_relu_inverse_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); #endif #if defined(LIBXSMM_DNN_RNN_SIGMOID_BWDUPD) libxsmm_internal_matrix_sigmoid_inverse_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); #endif #if defined(LIBXSMM_DNN_RNN_TANH_BWDUPD) libxsmm_internal_matrix_tanh_inverse_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); #endif } libxsmm_barrier_wait(handle->barrier, (int)ltid); if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* dx = W^T * delta */ for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { in = (inic / (C/bc))*bn; ic = (inic % (C/bc))*bc; for (ik = 0; ik < K; ik += bk) { gemmkernela( &LIBXSMM_VLA_ACCESS(2, wT, ik, ic, C), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, dx, i, in, ic, N, C) ); } } } if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* gradient bias */ for (ik = thr_begin_k; ik < thr_end_k; ik++) { for (in = 0; in < N; in++) { db[ik] += LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K); } } /* dr = delta * h^T */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ic = (ikic / (K/bk))*bk; ik = (ikic % (K/bk))*bk; for (in = 0; in < N; in += bn) { gemmkernelb( &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K), 
&LIBXSMM_VLA_ACCESS(2, hT, ic, in, N), &LIBXSMM_VLA_ACCESS(2, dr, ic, ik, K) ); } } /* dw = delta * x^T */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ic = (ikic / (K/bk))*bc; ik = (ikic % (K/bk))*bk; for (in = 0; in < N; in += bn ) { gemmkernelc( &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N), &LIBXSMM_VLA_ACCESS(2, dw, ic, ik, K) ); } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c000066400000000000000000000450201415223013700271120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ /* helper variables */ libxsmm_blasint i, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; /* tensor dimensions */ libxsmm_blasint K = handle->desc.K; libxsmm_blasint N = handle->desc.N; libxsmm_blasint C = handle->desc.C; libxsmm_blasint t = handle->T; libxsmm_blasint bk = handle->bk; libxsmm_blasint bn = handle->bn; libxsmm_blasint bc = handle->bc; /* tensor raw pointers */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *hpD = (element_input_type* )handle->hp->data; element_filter_type *wtD = (element_filter_type*)handle->wt->data; element_filter_type *rtD = (element_filter_type*)handle->rt->data; element_output_type *ht = (element_output_type*)handle->ht->data; element_input_type *dxt = (element_input_type*)handle->dxt->data; element_filter_type *dwD = (element_filter_type*)handle->dw->data; element_filter_type *drD = (element_filter_type*)handle->dr->data; element_output_type *db = (element_output_type*)handle->db->data; element_output_type *dht = (element_output_type*)handle->dht->data; element_output_type *deltat = (element_output_type*)handle->scratch_deltat; element_input_type *scratch_xT = (element_input_type*)handle->scratch_xT; #if 0 element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; #endif element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; /* Auxiliary variables for bact-reduce calls */ libxsmm_blasint nBlocks = N/bn; libxsmm_blasint cBlocks = C/bc; libxsmm_blasint kBlocks = K/bk; unsigned long long blocks; const float beta = 0.0; /* multidimensional arrays */ LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(4, element_filter_type, wT, wtD, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, 
element_filter_type, rT, rtD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_input_type, dx, dxt, N, C); LIBXSMM_VLA_DECL(4, element_filter_type, dw, dwD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, dr, drD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, delta, deltat, N, K); LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); #if 0 LIBXSMM_VLA_DECL(4, element_filter_type, wT, scratch_wT, kBlocks, bk, bc); LIBXSMM_VLA_DECL(4, element_filter_type, rT, scratch_rT, kBlocks, bk, bk); #endif LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); #if defined(LIBXSMM_DNN_RNN_RELU_BWDUPD) || defined(LIBXSMM_DNN_RNN_SIGMOID_BWDUPD) || defined(LIBXSMM_DNN_RNN_TANH_BWDUPD) element_output_type *zt = (element_output_type*)handle->internal_z; LIBXSMM_VLA_DECL(3, element_output_type, z, zt, N, K); #endif /* define batch-reduce gemm kernels */ /*const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelaz = libxsmm_smmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, &beta, NULL, NULL);*/ const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelbz = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &K, &N, &bk, NULL, &beta, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelcz = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &K, &N, &bk, NULL, &beta, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &K, &N, &bk, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &K, &N, &bk, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kerneld = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = 
libxsmm_smmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, NULL, NULL, NULL); /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel for N and K blocks*/ const libxsmm_blasint work_nk = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; /* number of tasks that could be run in parallel for N and C blocks*/ const libxsmm_blasint work_nc = (N/bn) * (C/bc); /* compute chunk size */ const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; /* number of tasks that could be run in parallel for C and K blocks*/ const libxsmm_blasint work_ck = (C/bc) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? 
((ltid + 1) * chunksize_ck) : work_ck; /* number of tasks that could be run in parallel for K and K blocks*/ const libxsmm_blasint work_kk = (K/bk) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; #if defined(LIBXSMM_RNN_CELL_AVX512) int k_tasks = K/16; int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? (ltid * k_chunksize * 16) : K; const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? ((ltid + 1) * k_chunksize * 16) : K; __m512 db_sum; #else /* number of tasks that could be run in parallel for K blocks*/ /* compute chunk size */ const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? (K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? 
((ltid + 1) * chunksize_k) : K; #endif libxsmm_blasint ikic, inic, inik, icin, ikin; /* Auxiliary arrays for batch-reduce gemm calls */ const element_filter_type *A_array[1024]; const element_output_type *B_array[1024]; /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* Blocking reduction domain if it is too large */ BF = 1; if (C >= 512 && K >= 512 && C%2 == 0 && K%2 == 0) { BF = 2; } if (C >= 2048 && K >= 2048 && C%8 == 0 && K%8 == 0) { BF = 8; } KB_BLOCKS = kBlocks/BF; #if 0 if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* transpose W */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { ik = (ikic / (C/bc)); ic = (ikic % (C/bc)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bc; ++jc) { LIBXSMM_VLA_ACCESS(4, wT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, w, ik, ic, jc, jk, cBlocks, bc, bk); } } } } /* transpose R */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { ik = (ikic / (K/bk)); ic = (ikic % (K/bk)); for (jk = 0; jk < bk; ++jk) { for (jc = 0; jc < bk; ++jc) { LIBXSMM_VLA_ACCESS(4, rT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, r, ik, ic, jc, jk, kBlocks, bk, bk); } } } #endif if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* transpose xt for current timestep */ for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { ic = (icin / (N/bn))*bc; in = (icin % (N/bn))*bn; for (jc = 0; jc < bc; ++jc) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ec = ic + jc; LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, t-1, en, ec, N, C); } } } /* transpose ht for current timestep */ for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { ik = (ikin / (N/bn))*bk; in = (ikin % (N/bn))*bn; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, t-2, en, ek, N, K); } } } } /* The following code is for time 
step t-1 */ for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik / (K/bk))*bn; ik = (inik % (K/bk))*bk; #if defined(LIBXSMM_DNN_RNN_RELU_BWDUPD) libxsmm_internal_matrix_relu_inverse_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); #endif #if defined(LIBXSMM_DNN_RNN_SIGMOID_BWDUPD) libxsmm_internal_matrix_sigmoid_inverse_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); #endif #if defined(LIBXSMM_DNN_RNN_TANH_BWDUPD) libxsmm_internal_matrix_tanh_inverse_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); #endif libxsmm_internal_matrix_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); } libxsmm_barrier_wait(handle->barrier, (int)ltid); if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* gemm kernel bwd_d */ for (KB = 0; KB < BF; KB++) { for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { in = (inic / (C/bc))*bn; icb = (inic % (C/bc)); ic = icb * bc; /* Prepare arguments for batch-reduce call */ for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik+=bk, ikb++) { A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); B_array[ikb] = &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik + KB*KB_BLOCKS*bk, N, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, t-1, in, ic, N, C), &blocks); } } } if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* dr = delta * h^T */ for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K); B_array[inb] = 
&LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } blocks = nBlocks; batchreduce_kernelbz(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dr, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); } /* dw = delta * x^T */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { icb = ikic / (K/bk); ic = icb*bc; ikb = ikic % (K/bk); ik = ikb*bk; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } blocks = nBlocks; batchreduce_kernelcz(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dw, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } } for (i = t-2; i >= 0; --i) { /* let's run the cell in blocks for good locality */ for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { in = (inik / (K/bk))*bn; ikb = (inik % (K/bk)); ik = ikb*bk; /* delta = dh */ libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); /* delta += R^T * delta+1 */ for (ic = 0; ic < kBlocks; ic++) { A_array[ic] = &LIBXSMM_VLA_ACCESS(4, rT, ikb, ic, 0, 0, kBlocks, bk, bk); B_array[ic] = &LIBXSMM_VLA_ACCESS(3, delta, i+1, in, ic*bk, N, K); } /* Reduce batch gemm call */ blocks = kBlocks; batchreduce_kerneld(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) , &blocks); /* run inverse non-linear op */ #if defined(LIBXSMM_DNN_RNN_RELU_BWDUPD) libxsmm_internal_matrix_relu_inverse_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); #endif #if defined(LIBXSMM_DNN_RNN_SIGMOID_BWDUPD) libxsmm_internal_matrix_sigmoid_inverse_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); #endif #if defined(LIBXSMM_DNN_RNN_TANH_BWDUPD) libxsmm_internal_matrix_tanh_inverse_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) 
); #endif } libxsmm_barrier_wait(handle->barrier, (int)ltid); if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* transpose xt for current timestep */ for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { ic = (icin / (N/bn))*bc; in = (icin % (N/bn))*bn; for (jc = 0; jc < bc; ++jc) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ec = ic + jc; LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, i, en, ec, N, C); } } } /* transpose ht for current timestep */ if (0 == i) { for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { ik = (ikin / (N/bn))*bk; in = (ikin % (N/bn))*bn; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); } } } } else { for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { ik = (ikin / (N/bn))*bk; in = (ikin % (N/bn))*bn; for (jk = 0; jk < bk; ++jk) { for (jb = 0; jb < bn; ++jb) { en = in + jb; ek = ik + jk; LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, i-1, en, ek, N, K); } } } } } if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* dx = W^T * delta */ for (KB = 0; KB < BF; KB++) { for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { in = (inic / (C/bc))*bn; icb = (inic % (C/bc)); ic = icb * bc; /* Prepare arguments for batch-reduce call */ for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik+=bk, ikb++) { A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); B_array[ikb] = &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik + KB*KB_BLOCKS*bk, N, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, i, in, ic, N, C), &blocks); } } } libxsmm_barrier_wait(handle->barrier, (int)ltid); if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { /* dr = delta * h^T */ for (ikic = thr_begin_kk; ikic < 
thr_end_kk; ++ikic ) { icb = ikic / (K/bk); ic = icb*bk; ikb = ikic % (K/bk); ik = ikb*bk; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); } blocks = nBlocks; batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dr, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); } /* dw = delta * x^T */ for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { icb = ikic / (K/bk); ic = icb*bc; ikb = ikic % (K/bk); ik = ikb*bk; for (in = 0, inb = 0; in < N; in += bn, inb++) { A_array[inb] = &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K); B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); } blocks = nBlocks; batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dw, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); } } } /* gradient bias */ if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { #if defined(LIBXSMM_RNN_CELL_AVX512) for (ik = k_thr_begin; ik < k_thr_end; ik += 16) { db_sum = _mm512_setzero_ps(); for (i = 0; i < t; i++) { for (in = 0; in < N; in++) { db_sum = _mm512_add_ps(db_sum, LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K))); } } LIBXSMM_INTRINSICS_MM512_STREAM_PS(&db[ik], db_sum); } #else for (i = 0; i < t; i++) { for (ik = thr_begin_k; ik < thr_end_k; ik++) { for (in = 0; in < N; in++) { db[ik] += LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K); } } } #endif } libxsmm_barrier_wait(handle->barrier, (int)ltid); libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c000066400000000000000000000111301415223013700275560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Kunal Banerjee (Intel Corp.) ******************************************************************************/ /* helper variables */ libxsmm_blasint i, ik, in, ic, inik; /* input sizes */ const libxsmm_blasint K = handle->desc.K; const libxsmm_blasint N = handle->desc.N; const libxsmm_blasint C = handle->desc.C; const libxsmm_blasint t = handle->T; const libxsmm_blasint bk = handle->bk; const libxsmm_blasint bn = handle->bn; const libxsmm_blasint bc = handle->bc; /* define tensors */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *hpD= (element_input_type* )handle->hp->data; element_filter_type *wD = (element_filter_type*)handle->w->data; element_filter_type *rD = (element_filter_type*)handle->r->data; element_output_type *b = (element_output_type*)handle->b->data; element_output_type *ht = (element_output_type*)handle->ht->data; element_output_type *zt = (element_output_type*)handle->internal_z; LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(2, element_filter_type, w, wD, K); LIBXSMM_VLA_DECL(2, element_filter_type, r, rD, K); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, z, zt, N, K); /* define gemm kernels */ libxsmm_smmfunction gemmkernela = libxsmm_smmdispatch( bk, bn, bc, &K, &C, &K, NULL, NULL, NULL, NULL ); libxsmm_smmfunction gemmkernelb = libxsmm_smmdispatch( bk, bn, bk, &K, &K, &K, NULL, NULL, NULL, NULL ); /* parallelize over C-blocks */ /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel */ const libxsmm_blasint work = (N/bn) * (K/bk); /* compute chunk size */ const 
libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* All data is in column-major format */ for (i = 0; i < t; ++i) { /* let's run the cell in blocks for good locality */ for (inik = thr_begin; inik < thr_end; ++inik ) { in = (inik / (K/bk))*bn; ik = (inik % (K/bk))*bk; /* z = per_col(b) */ libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &b[ik] ); /* z += W.x */ for (ic = 0; ic < C; ic += bc) { /* this is a small matmul */ gemmkernela( &LIBXSMM_VLA_ACCESS(2, w, ic, ik, K), &LIBXSMM_VLA_ACCESS(3, x, i, in, ic, N, C), &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K) ); } /* z += U.h */ if (0 == i) { for (ic = 0; ic < K; ic += bk) { /* this is a small matmul */ gemmkernelb( &LIBXSMM_VLA_ACCESS(2, r, ic, ik, K), &LIBXSMM_VLA_ACCESS(2, hp, in, ic, K), &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K) ); } } else { for (ic = 0; ic < K; ic += bk) { /* this is a small matmul */ gemmkernelb( &LIBXSMM_VLA_ACCESS(2, r, ic, ik, K), &LIBXSMM_VLA_ACCESS(3, h, i-1, in, ic, N, K), &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K) ); } } #if defined(LIBXSMM_DNN_RNN_RELU_FWD) libxsmm_internal_matrix_relu_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, i, in, ik, N, K) ); #endif #if defined(LIBXSMM_DNN_RNN_SIGMOID_FWD) libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, i, in, ik, N, K) ); #endif #if defined(LIBXSMM_DNN_RNN_TANH_FWD) libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), 
&LIBXSMM_VLA_ACCESS(3, h, i, in, ik, N, K) ); #endif } libxsmm_barrier_wait(handle->barrier, (int)ltid); } libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c000066400000000000000000000144721415223013700264140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke, Kunal Banerjee (Intel Corp.) ******************************************************************************/ /* helper variables */ libxsmm_blasint i, ik, in, ic, inik, BF, CB, CB_BLOCKS, KB_BLOCKS; /* input sizes */ const libxsmm_blasint K = handle->desc.K; const libxsmm_blasint N = handle->desc.N; const libxsmm_blasint C = handle->desc.C; const libxsmm_blasint t = handle->T; const libxsmm_blasint bk = handle->bk; const libxsmm_blasint bn = handle->bn; const libxsmm_blasint bc = handle->bc; /* define tensors */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *hpD= (element_input_type* )handle->hp->data; element_filter_type *wD = (element_filter_type*)handle->w->data; element_filter_type *rD = (element_filter_type*)handle->r->data; element_output_type *b = (element_output_type*)handle->b->data; element_output_type *ht = (element_output_type*)handle->ht->data; element_output_type *zt = (element_output_type*)handle->internal_z; /*libxsmm_blasint nBlocks = N/bn;*/ libxsmm_blasint cBlocks = C/bc; libxsmm_blasint kBlocks = K/bk; unsigned long long blocks; LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); LIBXSMM_VLA_DECL(4, element_filter_type, w, wD, cBlocks, bc, 
bk); LIBXSMM_VLA_DECL(4, element_filter_type, r, rD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); LIBXSMM_VLA_DECL(3, element_output_type, z, zt, N, K); int prefetch_mode = LIBXSMM_GEMM_PREFETCH_NONE/*LIBXSMM_GEMM_PREFETCH_AL1_BL1*/; /* define gemm kernels */ const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, NULL, &prefetch_mode ); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, &prefetch_mode ); /* Auxiliary arrays for batch-reduce gemms */ const element_input_type *A_array[1024]; const element_input_type *B_array[1024]; const element_input_type *A_array2[1024]; const element_input_type *B_array2[1024]; /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel */ const libxsmm_blasint work = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; /* Blocking reduction domain if it is too large */ BF = 1; if (C >= 2048 && K >= 2048 && C%2 == 0 && K%2 == 0) { BF = 2; } CB_BLOCKS = cBlocks/BF; KB_BLOCKS = kBlocks/BF; assert(CB_BLOCKS <= 1024); assert(KB_BLOCKS <= 1024); /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* All data is in column-major format */ for (i = 0; i < t; ++i) { /* let's run the cell in blocks for good locality */ for (CB = 0; CB < BF; CB++) { for (inik = thr_begin; inik < thr_end; ++inik ) { if (C >= 2048 && K >= 2048) { in = inik % (N/bn); ik = inik / (N/bn); } else { in = inik / (K/bk); ik = inik % (K/bk); } /* z = per_col(b) */ if (0 == CB) { libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &b[ik*bk] ); } /* z += W.x */ /* Prepare arrays for the call */ for (ic = 0; ic < CB_BLOCKS; ic++) { /* this is a small matmul */ A_array[ic] = &LIBXSMM_VLA_ACCESS(4, w, ik, ic + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); B_array[ic] = &LIBXSMM_VLA_ACCESS(3, x, i, in*bn, (ic + CB*CB_BLOCKS)*bc, N, C); } /* Reduce batch gemm call */ blocks = CB_BLOCKS; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &blocks); /* z += U.h */ if (0 == i) { /* Prepare arrays for the call */ for (ic = 0; ic < KB_BLOCKS; ic++) { A_array2[ic] = &LIBXSMM_VLA_ACCESS(4, r, ik, ic + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array2[ic] = &LIBXSMM_VLA_ACCESS(2, hp, in*bn, (ic + CB*KB_BLOCKS)*bk, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array2, B_array2, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &blocks); } else { /* Prepare arrays for the call */ for (ic = 0; ic < KB_BLOCKS; ic++) { A_array2[ic] = &LIBXSMM_VLA_ACCESS(4, r, ik, ic + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); B_array2[ic] = &LIBXSMM_VLA_ACCESS(3, h, i-1, in*bn, (ic + CB*KB_BLOCKS)*bk, N, K); } /* Reduce batch gemm call */ blocks = KB_BLOCKS; batchreduce_kernelb(A_array2, B_array2, 
&LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &blocks); } #if defined(LIBXSMM_DNN_RNN_RELU_FWD) libxsmm_internal_matrix_relu_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &LIBXSMM_VLA_ACCESS(3, h, i, in*bn, ik*bk, N, K) ); #endif #if defined(LIBXSMM_DNN_RNN_SIGMOID_FWD) libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &LIBXSMM_VLA_ACCESS(3, h, i, in*bn, ik*bk, N, K) ); #endif #if defined(LIBXSMM_DNN_RNN_TANH_FWD) libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &LIBXSMM_VLA_ACCESS(3, h, i, in*bn, ik*bk, N, K) ); #endif } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } libxsmm-1.17/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c000066400000000000000000000274071415223013700267370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas, Alexander Heinecke, Kunal Banerjee (Intel Corp.) 
******************************************************************************/ /* helper variables */ libxsmm_blasint i, ik, in, ic, inik; /* input sizes */ const libxsmm_blasint K = handle->desc.K; const libxsmm_blasint N = handle->desc.N; const libxsmm_blasint C = handle->desc.C; const libxsmm_blasint t = handle->T; const libxsmm_blasint bk = handle->bk; const libxsmm_blasint bn = handle->bn; const libxsmm_blasint bc = handle->bc; /* define tensors */ element_input_type *xt = (element_input_type* )handle->xt->data; element_input_type *hpD= (element_input_type* )handle->hp->data; element_filter_type *wD = (element_filter_type*)handle->w->data; element_filter_type *rD = (element_filter_type*)handle->r->data; element_output_type *b = (element_output_type*)handle->b->data; element_output_type *ht = (element_output_type*)handle->ht->data; element_output_type *zt = (element_output_type*)handle->internal_z; libxsmm_blasint nBlocks = N/bn; libxsmm_blasint cBlocks = C/bc; libxsmm_blasint kBlocks = K/bk; unsigned long long blocks; LIBXSMM_VLA_DECL(5, element_input_type, x, xt, nBlocks, cBlocks, bn, bc); LIBXSMM_VLA_DECL(4, element_input_type, hp, hpD, kBlocks, bn, bk); LIBXSMM_VLA_DECL(4, element_filter_type, w, wD, cBlocks, bc, bk); LIBXSMM_VLA_DECL(4, element_filter_type, r, rD, kBlocks, bk, bk); LIBXSMM_VLA_DECL(5, element_output_type, h, ht, nBlocks, kBlocks, bn, bk); LIBXSMM_VLA_DECL(5, element_output_type, z, zt, nBlocks, kBlocks, bn, bk); int prefetch_mode = LIBXSMM_GEMM_PREFETCH_NONE/*LIBXSMM_GEMM_PREFETCH_AL1_BL1*/; /* define gemm kernels */ const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bc, &bk, &bc, &bk, NULL, NULL, NULL, &prefetch_mode ); const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &bk, &bk, NULL, NULL, NULL, &prefetch_mode ); /* computing first logical thread */ const libxsmm_blasint ltid = (libxsmm_blasint)tid - 
(libxsmm_blasint)start_thread; /* number of tasks that could be run in parallel */ const libxsmm_blasint work = (N/bn) * (K/bk); /* compute chunk size */ const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); /* compute thr_begin and thr_end */ libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; /* The snippet below does a 2D domain decomposition of output IF the number of threads and the number of work items are compatible */ /* TODO: For now 2D decomposition targets single socket SKX */ int row_teams = 7; int column_teams = 4; libxsmm_blasint my_col_id = ltid % column_teams; libxsmm_blasint my_row_id = ltid / column_teams; int in_tasks = (int)(N/bn); int ik_tasks = (int)(K/bk); int in_tasks_per_thread = in_tasks/row_teams; int ik_tasks_per_thread = ik_tasks/column_teams; libxsmm_blasint my_in_start = my_row_id * in_tasks_per_thread; libxsmm_blasint my_in_end = (my_row_id+1) * in_tasks_per_thread; libxsmm_blasint my_ik_start = my_col_id * ik_tasks_per_thread; libxsmm_blasint my_ik_end = (my_col_id+1) * ik_tasks_per_thread; int perform_2d_decomp = (in_tasks % row_teams == 0 && ik_tasks % column_teams == 0 && row_teams*column_teams == handle->desc.threads && cBlocks <= 32 && kBlocks <= 32 && ik_tasks_per_thread <= 16 && in_tasks_per_thread <= 2 ) ? 
1 : 0; if (perform_2d_decomp) { /* Auxiliary arrays for batch-reduce gemms and potential prefetch */ const element_input_type *A_array[16][2][32]; const element_input_type *B_array[16][2][32]; const element_input_type *A_array2[16][2][32]; const element_input_type *B_array2[16][2][32]; const element_input_type *A_array_pf[16][2][32]; const element_input_type *B_array_pf[16][2][32]; const element_input_type *A_array2_pf[16][2][32]; const element_input_type *B_array2_pf[16][2][32]; int ii, jj; /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* All data is in column-major format */ for (i = 0; i < t; ++i) { /* Prepare arrays for the batch-reduce calls */ for (ik = my_ik_start, ii = 0; ik < my_ik_end; ++ik, ii++ ) { for (in = my_in_start, jj = 0; in < my_in_end; ++in, jj++ ) { /* Prepare arrays for the call */ for (ic = 0; ic < cBlocks; ic++) { /* this is a small matmul */ A_array[ii][jj][ic] = &LIBXSMM_VLA_ACCESS(4, w, ik, ic, 0, 0, cBlocks, bc, bk); B_array[ii][jj][ic] = &LIBXSMM_VLA_ACCESS(5, x, i, in, ic, 0, 0, nBlocks, cBlocks, bn, bc); } /* z += U.h */ if (0 == i) { /* Prepare arrays for the call */ for (ic = 0; ic < kBlocks; ic++) { A_array2[ii][jj][ic] = &LIBXSMM_VLA_ACCESS(4, r, ik, ic, 0, 0, kBlocks, bk, bk); B_array2[ii][jj][ic] = &LIBXSMM_VLA_ACCESS(4, hp, in, ic, 0, 0, kBlocks, bn, bk); } } else { /* Prepare arrays for the call */ for (ic = 0; ic < kBlocks; ic++) { A_array2[ii][jj][ic] = &LIBXSMM_VLA_ACCESS(4, r, ik, ic, 0, 0, kBlocks, bk, bk); B_array2[ii][jj][ic] = &LIBXSMM_VLA_ACCESS(5, h, i-1, in, ic, 0, 0, nBlocks, kBlocks, bn, bk); } } } } if (prefetch_mode != LIBXSMM_GEMM_PREFETCH_NONE) { /* coverity[dead_error_begin] */ /* Prepare additional prefetch arrays that are shifted images of regular ones when external prefetching is requested */ int pf_dist_A = 2; int pf_dist_B = 4; libxsmm_blasint total_blocks = in_tasks_per_thread*ik_tasks_per_thread*cBlocks; const element_input_type **src_ptr = &A_array[0][0][0]; const 
element_input_type **dst_ptr = &A_array_pf[0][0][0]; for (ii = 0; ii < total_blocks - pf_dist_A; ii++) { dst_ptr[ii] = src_ptr[ii+pf_dist_A]; } src_ptr = &B_array[0][0][0]; dst_ptr = &B_array_pf[0][0][0]; for (ii = 0; ii < total_blocks - pf_dist_B; ii++) { dst_ptr[ii] = src_ptr[ii+pf_dist_B]; } total_blocks = in_tasks_per_thread*ik_tasks_per_thread*kBlocks; src_ptr = &A_array2[0][0][0]; dst_ptr = &A_array2_pf[0][0][0]; for (ii = 0; ii < total_blocks - pf_dist_A; ii++) { dst_ptr[ii] = src_ptr[ii+pf_dist_A]; } src_ptr = &B_array2[0][0][0]; dst_ptr = &B_array2_pf[0][0][0]; for (ii = 0; ii < total_blocks - pf_dist_B; ii++) { dst_ptr[ii] = src_ptr[ii+pf_dist_B]; } } /* let's run the cell in blocks for good locality */ for (ik = my_ik_start, ii = 0; ik < my_ik_end; ++ik, ii++ ) { for (in = my_in_start, jj = 0; in < my_in_end; ++in, jj++ ) { /* z = per_col(b) */ libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &b[ik*bk]); /* z += W.x */ blocks = cBlocks; batchreduce_kernela(&A_array[ii][jj][0], &B_array[ii][jj][0], &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &blocks, &A_array_pf[ii][jj][0], &B_array_pf[ii][jj][0]); /* z += U.h */ blocks = kBlocks; batchreduce_kernelb(&A_array2[ii][jj][0], &B_array2[ii][jj][0], &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &blocks, &A_array2_pf[ii][jj][0], &B_array2_pf[ii][jj][0]); #if defined(LIBXSMM_DNN_RNN_RELU_FWD) libxsmm_internal_matrix_relu_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(5, h, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk)); #endif #if defined(LIBXSMM_DNN_RNN_SIGMOID_FWD) libxsmm_internal_matrix_sigmoid_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(5, h, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk)); #endif #if defined(LIBXSMM_DNN_RNN_TANH_FWD) libxsmm_internal_matrix_tanh_ld( bk, 
bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(5, h, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk)); #endif } } libxsmm_barrier_wait(handle->barrier, (int)ltid); } } else { /* Auxiliary arrays for batch-reduce gemms */ const element_input_type *A_array[1024]; const element_input_type *B_array[1024]; const element_input_type *A_array2[1024]; const element_input_type *B_array2[1024]; assert(kBlocks <= 1024); assert(cBlocks <= 1024); /* lazy barrier init */ libxsmm_barrier_init(handle->barrier, (int)ltid); /* All data is in column-major format */ for (i = 0; i < t; ++i) { /* let's run the cell in blocks for good locality */ for (inik = thr_begin; inik < thr_end; ++inik ) { in = inik / (K/bk); ik = inik % (K/bk); /* z = per_col(b) */ libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &b[ik*bk]); /* z += W.x */ /* Prepare arrays for the call */ for (ic = 0; ic < cBlocks; ic++) { /* this is a small matmul */ A_array[ic] = &LIBXSMM_VLA_ACCESS(4, w, ik, ic, 0, 0, cBlocks, bc, bk); B_array[ic] = &LIBXSMM_VLA_ACCESS(5, x, i, in, ic, 0, 0, nBlocks, cBlocks, bn, bc); } /* Reduce batch gemm call */ blocks = cBlocks; batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); /* z += U.h */ if (0 == i) { /* Prepare arrays for the call */ for (ic = 0; ic < kBlocks; ic++) { A_array2[ic] = &LIBXSMM_VLA_ACCESS(4, r, ik, ic, 0, 0, kBlocks, bk, bk); B_array2[ic] = &LIBXSMM_VLA_ACCESS(4, hp, in, ic, 0, 0, kBlocks, bn, bk); } /* Reduce batch gemm call */ blocks = kBlocks; batchreduce_kernelb(A_array2, B_array2, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); } else { /* Prepare arrays for the call */ for (ic = 0; ic < kBlocks; ic++) { A_array2[ic] = &LIBXSMM_VLA_ACCESS(4, r, ik, ic, 0, 0, kBlocks, bk, bk); B_array2[ic] = &LIBXSMM_VLA_ACCESS(5, h, i-1, in, ic, 0, 0, nBlocks, 
kBlocks, bn, bk); } /* Reduce batch gemm call */ blocks = kBlocks; batchreduce_kernelb(A_array2, B_array2, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); } #if defined(LIBXSMM_DNN_RNN_RELU_FWD) libxsmm_internal_matrix_relu_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(5, h, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk)); #endif #if defined(LIBXSMM_DNN_RNN_SIGMOID_FWD) libxsmm_internal_matrix_sigmoid_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(5, h, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk)); #endif #if defined(LIBXSMM_DNN_RNN_TANH_FWD) libxsmm_internal_matrix_tanh_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(5, h, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk)); #endif } libxsmm_barrier_wait(handle->barrier, (int)ltid); } } libxsmm-1.17/src/template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c000066400000000000000000000146541415223013700275040ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16_AVX512) #define LIBXSMM_DNN_CONVERT_F32_BF16(in, out, length) do { \ unsigned int full_chunks = length / 16; \ unsigned int remainder = length % 16; \ int __i = 0; \ if (remainder == 0) { \ for ( __i = 0; __i < length; __i+= 16) { \ _mm256_storeu_si256((__m256i*)(out+__i), _mm512_cvtepi32_epi16( _mm512_srai_epi32( LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16( LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i) ),16)) ); \ } \ } else { \ unsigned int chunk; \ for ( chunk = 0; chunk < full_chunks; chunk++) { \ __i = chunk * 16; \ _mm256_storeu_si256((__m256i*)(out+__i), _mm512_cvtepi32_epi16( _mm512_srai_epi32( LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16( LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i) ),16)) ); \ } \ libxsmm_rne_convert_fp32_bf16((const float*)in+16*full_chunks, (libxsmm_bfloat16*)out+16*full_chunks, remainder); \ } \ } while(0) #define LIBXSMM_DNN_CONVERT_BF16_F32(in, out, length) do { \ unsigned int full_chunks = length / 16; \ unsigned int remainder = length % 16; \ int __i = 0; \ if (remainder == 0) { \ for ( __i = 0; __i < length; __i+= 16) { \ _mm512_storeu_ps( out+__i, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(in+__i))),16))); \ } \ } else { \ unsigned int chunk; \ for ( chunk = 0; chunk < full_chunks; chunk++) { \ __i = chunk * 16; \ _mm512_storeu_ps( out+__i, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(in+__i))),16))); \ } \ libxsmm_convert_bf16_f32( (const libxsmm_bfloat16*)in+16*full_chunks, (float*)out+16*full_chunks, remainder); \ } \ } while(0) #endif libxsmm_blasint bn = handle->bn; libxsmm_blasint Bn = handle->Bn; libxsmm_blasint bc = handle->bc; libxsmm_blasint Bc = handle->Bc; /* loop counters */ int i = 0; libxsmm_blasint img1, img2, ifm1, ifm2; float rcp_N = 1.0f/handle->desc.N; /* computing first logical thread */ 
const int ltid = tid - start_thread; /* number of tasks that could run in parallel for the batch */ const int n_work = Bn * bn; /* compute chunk size */ const int n_chunksize = (n_work % handle->desc.threads == 0) ? (n_work / handle->desc.threads) : ((n_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int n_thr_begin = (ltid * n_chunksize < n_work) ? (ltid * n_chunksize) : n_work; const int n_thr_end = ((ltid + 1) * n_chunksize < n_work) ? ((ltid + 1) * n_chunksize) : n_work; #if defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16) || defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16_AVX512) /* number of tasks that could run in parallel for the batch */ const int nc_work = Bn * bn; /* compute chunk size */ const int nc_chunksize = (nc_work % handle->desc.threads == 0) ? (nc_work / handle->desc.threads) : ((nc_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int nc_thr_begin = (ltid * nc_chunksize < nc_work) ? (ltid * nc_chunksize) : nc_work; const int nc_thr_end = ((ltid + 1) * nc_chunksize < nc_work) ? 
((ltid + 1) * nc_chunksize) : nc_work; libxsmm_bfloat16* poutput_bf16 = (element_output_type*)handle->reg_output->data; libxsmm_bfloat16* pdinput_bf16 = (element_input_type*)handle->grad_input->data; float* poutput_fp32 = (float*)handle->scratch; float* pdinput_fp32 = ((float*)handle->scratch)+(handle->desc.N*handle->desc.C); LIBXSMM_VLA_DECL(4, const float, output, poutput_fp32, Bc, bn, bc); LIBXSMM_VLA_DECL(4, float, dinput, pdinput_fp32, Bc, bn, bc); #else LIBXSMM_VLA_DECL(4, const element_output_type, output, (element_output_type*)handle->reg_output->data, Bc, bn, bc); LIBXSMM_VLA_DECL(4, element_input_type, dinput, (element_input_type*)handle->grad_input->data, Bc, bn, bc); #endif LIBXSMM_VLA_DECL(2, const element_label_type, label, (element_label_type*)handle->label->data, bn); /* lazy barrier init */ libxsmm_barrier_init( handle->barrier, ltid ); #if defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16) for ( i = nc_thr_begin; i < nc_thr_end; ++i ) { libxsmm_bfloat16_hp out; out.i[0] = 0; out.i[1] = poutput_bf16[i]; poutput_fp32[i] = out.f; } libxsmm_barrier_wait( handle->barrier, ltid ); #endif #if defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16_AVX512) LIBXSMM_DNN_CONVERT_BF16_F32(poutput_bf16+nc_thr_begin, poutput_fp32+nc_thr_begin, nc_thr_end-nc_thr_begin); libxsmm_barrier_wait( handle->barrier, ltid ); #endif for ( i = n_thr_begin; i < n_thr_end; ++i ) { img1 = i/bn; img2 = i%bn; /* set output to input and set compute max per image */ for ( ifm1 = 0; ifm1 < Bc; ++ifm1 ) { for ( ifm2 = 0; ifm2 < bc; ++ifm2 ) { if ( (ifm1*Bc)+ifm2 == (libxsmm_blasint)LIBXSMM_VLA_ACCESS( 2, label, img1, img2, bn ) ) { LIBXSMM_VLA_ACCESS( 4, dinput, img1, ifm1, img2, ifm2, Bc, bn, bc ) = ( LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ) - 1.0f ) * rcp_N * handle->desc.loss_weight; } else { LIBXSMM_VLA_ACCESS( 4, dinput, img1, ifm1, img2, ifm2, Bc, bn, bc ) = LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ) * rcp_N * handle->desc.loss_weight; } } } } 
libxsmm_barrier_wait( handle->barrier, ltid ); #if defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16) for ( i = nc_thr_begin; i < nc_thr_end; ++i ) { libxsmm_bfloat16_hp din; din.i[0] = 0; din.i[1] = pdinput_bf16[i]; pdinput_fp32[i] = din.f; } libxsmm_barrier_wait( handle->barrier, ltid ); #endif #if defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16_AVX512) LIBXSMM_DNN_CONVERT_F32_BF16(pdinput_fp32+nc_thr_begin, pdinput_bf16+nc_thr_begin, nc_thr_end-nc_thr_begin); libxsmm_barrier_wait( handle->barrier, ltid ); #undef LIBXSMM_DNN_CONVERT_F32_BF16 #undef LIBXSMM_DNN_CONVERT_BF16_F32 #endif libxsmm-1.17/src/template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c000066400000000000000000000171041415223013700275010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ #if defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16_AVX512) #define LIBXSMM_DNN_CONVERT_F32_BF16(in, out, length) do { \ unsigned int full_chunks = length / 16; \ unsigned int remainder = length % 16; \ int __i = 0; \ if (remainder == 0) { \ for ( __i = 0; __i < length; __i+= 16) { \ _mm256_storeu_si256((__m256i*)(out+__i), _mm512_cvtepi32_epi16( _mm512_srai_epi32( LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16( LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i) ),16)) ); \ } \ } else { \ unsigned int chunk; \ for ( chunk = 0; chunk < full_chunks; chunk++) { \ __i = chunk * 16; \ _mm256_storeu_si256((__m256i*)(out+__i), _mm512_cvtepi32_epi16( _mm512_srai_epi32( LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16( LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i) ),16)) ); \ } \ libxsmm_rne_convert_fp32_bf16((const float*)in+16*full_chunks, (libxsmm_bfloat16*)out+16*full_chunks, remainder); \ } \ } while(0) #define LIBXSMM_DNN_CONVERT_BF16_F32(in, out, length) do { \ unsigned int full_chunks = length / 16; \ unsigned int remainder = length % 16; \ int __i = 0; \ if (remainder == 0) { \ for ( __i = 0; __i < length; __i+= 16) { \ _mm512_storeu_ps( out+__i, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(in+__i))),16))); \ } \ } else { \ unsigned int chunk; \ for ( chunk = 0; chunk < full_chunks; chunk++) { \ __i = chunk * 16; \ _mm512_storeu_ps( out+__i, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(in+__i))),16))); \ } \ libxsmm_convert_bf16_f32( (const libxsmm_bfloat16*)in+16*full_chunks, (float*)out+16*full_chunks, remainder); \ } \ } while(0) #endif libxsmm_blasint bn = handle->bn; libxsmm_blasint Bn = handle->Bn; libxsmm_blasint bc = handle->bc; libxsmm_blasint Bc = handle->Bc; /* loop counters */ int i = 0; libxsmm_blasint img1, img2, ifm1, ifm2; /* computing first logical thread */ const int ltid = tid - start_thread; 
/* number of tasks that could run in parallel for the batch */ const int n_work = Bn * bn; /* compute chunk size */ const int n_chunksize = (n_work % handle->desc.threads == 0) ? (n_work / handle->desc.threads) : ((n_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int n_thr_begin = (ltid * n_chunksize < n_work) ? (ltid * n_chunksize) : n_work; const int n_thr_end = ((ltid + 1) * n_chunksize < n_work) ? ((ltid + 1) * n_chunksize) : n_work; #if defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16) || defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16_AVX512) /* number of tasks that could run in parallel for the batch */ const int nc_work = Bn * bn; /* compute chunk size */ const int nc_chunksize = (nc_work % handle->desc.threads == 0) ? (nc_work / handle->desc.threads) : ((nc_work / handle->desc.threads) + 1); /* compute thr_begin and thr_end */ const int nc_thr_begin = (ltid * nc_chunksize < nc_work) ? (ltid * nc_chunksize) : nc_work; const int nc_thr_end = ((ltid + 1) * nc_chunksize < nc_work) ? 
((ltid + 1) * nc_chunksize) : nc_work; libxsmm_bfloat16* poutput_bf16 = (element_output_type*)handle->reg_output->data; libxsmm_bfloat16* pinput_bf16 = (element_input_type*)handle->reg_input->data; float* poutput_fp32 = (float*)handle->scratch; float* pinput_fp32 = ((float*)handle->scratch)+(handle->desc.N*handle->desc.C); LIBXSMM_VLA_DECL(4, float, output, poutput_fp32, Bc, bn, bc); LIBXSMM_VLA_DECL(4, const float, input, pinput_fp32, Bc, bn, bc); #else LIBXSMM_VLA_DECL(4, element_output_type, output, (element_output_type*)handle->reg_output->data, Bc, bn, bc); LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type*)handle->reg_input->data, Bc, bn, bc); #endif LIBXSMM_VLA_DECL(2, const element_label_type, label, (element_label_type*)handle->label->data, bn); /* lazy barrier init */ libxsmm_barrier_init( handle->barrier, ltid ); #if defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16) for ( i = nc_thr_begin; i < nc_thr_end; ++i ) { libxsmm_bfloat16_hp in; in.i[0] = 0; in.i[1] = pinput_bf16[i]; pinput_fp32[i] = in.f; } libxsmm_barrier_wait( handle->barrier, ltid ); #endif #if defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16_AVX512) LIBXSMM_DNN_CONVERT_BF16_F32(pinput_bf16+nc_thr_begin, pinput_fp32+nc_thr_begin, nc_thr_end-nc_thr_begin); libxsmm_barrier_wait( handle->barrier, ltid ); #endif for ( i = n_thr_begin; i < n_thr_end; ++i ) { float max = FLT_MIN; float sum_of_exp = 0.0f; img1 = i/bn; img2 = i%bn; /* set output to input and set compute max per image */ for ( ifm1 = 0; ifm1 < Bc; ++ifm1 ) { for ( ifm2 = 0; ifm2 < bc; ++ifm2 ) { LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ) = LIBXSMM_VLA_ACCESS( 4, input, img1, ifm1, img2, ifm2, Bc, bn, bc ); if ( LIBXSMM_VLA_ACCESS( 4, input, img1, ifm1, img2, ifm2, Bc, bn, bc ) > max ) { max = LIBXSMM_VLA_ACCESS( 4, input, img1, ifm1, img2, ifm2, Bc, bn, bc ); } } } /* sum exp over outputs */ for ( ifm1 = 0; ifm1 < Bc; ++ifm1 ) { for ( ifm2 = 0; ifm2 < bc; ++ifm2 ) { LIBXSMM_VLA_ACCESS( 4, output, img1, 
ifm1, img2, ifm2, Bc, bn, bc ) = (float)exp( (double)(LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ) - max) ); sum_of_exp += LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ); } } /* scale output */ sum_of_exp = 1.0f/sum_of_exp; for ( ifm1 = 0; ifm1 < Bc; ++ifm1 ) { for ( ifm2 = 0; ifm2 < bc; ++ifm2 ) { LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ) = LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ) * sum_of_exp; } } } libxsmm_barrier_wait( handle->barrier, ltid ); /* calculate loss single threaded */ if ( ltid == 0 ) { handle->loss = 0.0f; for ( img1 = 0; img1 < Bn; ++img1 ) { for ( img2 = 0; img2 FLT_MIN ) ? LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1b, img2, ifm2b, Bc, bn, bc ) : FLT_MIN; handle->loss = LIBXSMM_LOGF( val ); } } handle->loss = ((-1.0f)*handle->loss)/handle->desc.N; } libxsmm_barrier_wait( handle->barrier, ltid ); #if defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16) for ( i = nc_thr_begin; i < nc_thr_end; ++i ) { libxsmm_bfloat16_hp in; in.i[0] = 0; in.i[1] = poutput_bf16[i]; poutput_fp32[i] = in.f; } libxsmm_barrier_wait( handle->barrier, ltid ); #endif #if defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16_AVX512) LIBXSMM_DNN_CONVERT_F32_BF16(poutput_fp32+nc_thr_begin, poutput_bf16+nc_thr_begin, nc_thr_end-nc_thr_begin); libxsmm_barrier_wait( handle->barrier, ltid ); #undef LIBXSMM_DNN_CONVERT_F32_BF16 #undef LIBXSMM_DNN_CONVERT_BF16_F32 #endif libxsmm-1.17/src/template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c000066400000000000000000000026731415223013700261300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ /* use for-loops to potentially leverage NUMA in the future */ int i1, i2, i3; #if defined(LIBXSMM_DNN_COPY_LOW_PRECISION) int lpb = tensor->layout->dim_size[0]; int bfm = tensor->layout->dim_size[1]; int fmb = tensor->layout->dim_size[2]; #else int lpb = 1; int bfm = tensor->layout->dim_size[0]; int fmb = tensor->layout->dim_size[1]; #endif const element_type* user_data = (const element_type*)data; LIBXSMM_VLA_DECL(3, element_type, handle_data, (element_type*)tensor->data, bfm, lpb); for (i1 = 0; i1 < fmb; ++i1) { for (i2 = 0; i2 < bfm; ++i2) { for (i3 = 0; i3 < lpb; ++i3) { LIBXSMM_VLA_ACCESS(3, handle_data, i1, i2, i3, bfm, lpb) = user_data[(i1*bfm*lpb) + (i2*lpb) + i3]; } } } libxsmm-1.17/src/template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c000066400000000000000000000026731415223013700263310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ /* use for-loops to potentially leverage NUMA in the future */ int i1, i2, i3; #if defined(LIBXSMM_DNN_COPY_LOW_PRECISION) int lpb = tensor->layout->dim_size[0]; int bfm = tensor->layout->dim_size[1]; int fmb = tensor->layout->dim_size[2]; #else int lpb = 1; int bfm = tensor->layout->dim_size[0]; int fmb = tensor->layout->dim_size[1]; #endif element_type* user_data = (element_type*)data; LIBXSMM_VLA_DECL(3, const element_type, handle_data, (const element_type*)tensor->data, bfm, lpb); for (i1 = 0; i1 < fmb; ++i1) { for (i2 = 0; i2 < bfm; ++i2) { for (i3 = 0; i3 < lpb; ++i3) { user_data[(i1*bfm*lpb) + (i2*lpb) + i3] = LIBXSMM_VLA_ACCESS(3, handle_data, i1, i2, i3, bfm, lpb); } } } libxsmm-1.17/src/template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c000066400000000000000000000041411415223013700264530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas, Hans Pabst (Intel Corp.) 
******************************************************************************/ int i1, i2, i3, i4, i5, i6; int lpb, bfm, W, H, fmb, N, C; /* low precision formatting */ if ( tensor->layout->num_dims == 6 ) { lpb = tensor->layout->dim_size[0]; bfm = tensor->layout->dim_size[1]; W = tensor->layout->dim_size[2]; H = tensor->layout->dim_size[3]; fmb = tensor->layout->dim_size[4]; N = tensor->layout->dim_size[5]; } else { lpb = 1; bfm = tensor->layout->dim_size[0]; W = tensor->layout->dim_size[1]; H = tensor->layout->dim_size[2]; fmb = tensor->layout->dim_size[3]; N = tensor->layout->dim_size[4]; } C = fmb * bfm * lpb; /*printf(" layout act copy in N %i fmb %i H %i W %i bfm %i lpb %i \n", N, fmb, H, W, bfm, lpb);*/ { LIBXSMM_VLA_DECL(6, element_type, handle_data_1, (element_type*)tensor->data, fmb, H, W, bfm, lpb); LIBXSMM_VLA_DECL(4, const element_type, user_data, (const element_type*)data, C, H, W); for (i1 = 0; i1 < N; ++i1) { for (i2 = 0; i2 < fmb; ++i2) { for (i3 = 0; i3 < H; ++i3) { for (i4 = 0; i4 < W; ++i4) { for (i5 = 0; i5 < bfm; ++i5) { for (i6 = 0; i6 < lpb; ++i6) { LIBXSMM_VLA_ACCESS(6, handle_data_1, i1, i2, i3, i4, i5, i6, fmb, H, W, bfm, lpb) = LIBXSMM_VLA_ACCESS(4, user_data, i1, ((size_t)i2*bfm*lpb) + ((size_t)i5*lpb) + i6, i3, i4, C, H, W); } } } } } } } libxsmm-1.17/src/template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c000066400000000000000000000041441415223013700266570ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas, Hans Pabst (Intel Corp.) 
******************************************************************************/ int i1, i2, i3, i4, i5, i6; int lpb, bfm, W, H, fmb, N, C; /* low precision formatting */ if ( tensor->layout->num_dims == 6 ) { lpb = tensor->layout->dim_size[0]; bfm = tensor->layout->dim_size[1]; W = tensor->layout->dim_size[2]; H = tensor->layout->dim_size[3]; fmb = tensor->layout->dim_size[4]; N = tensor->layout->dim_size[5]; } else { lpb = 1; bfm = tensor->layout->dim_size[0]; W = tensor->layout->dim_size[1]; H = tensor->layout->dim_size[2]; fmb = tensor->layout->dim_size[3]; N = tensor->layout->dim_size[4]; } C = fmb * bfm * lpb; /* printf(" layout act copy out N %i fmb %i H %i W %i bfm %i lpb %i \n", N, fmb, H, W, bfm, lpb); */ { LIBXSMM_VLA_DECL(6, const element_type, handle_data_1, (const element_type*)tensor->data, fmb, H, W, bfm, lpb); LIBXSMM_VLA_DECL(4, element_type, user_data, (element_type*)data, C, H, W); for (i1 = 0; i1 < N; ++i1) { for (i2 = 0; i2 < fmb; ++i2) { for (i3 = 0; i3 < H; ++i3) { for (i4 = 0; i4 < W; ++i4) { for (i5 = 0; i5 < bfm; ++i5) { for (i6 = 0; i6 < lpb; ++i6) { LIBXSMM_VLA_ACCESS(4, user_data, i1, ((size_t)i2*bfm*lpb) + ((size_t)i5*lpb) + i6, i3, i4, C, H, W) = LIBXSMM_VLA_ACCESS(6, handle_data_1, i1, i2, i3, i4, i5, i6, fmb, H, W, bfm, lpb); } } } } } } } libxsmm-1.17/src/template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c000066400000000000000000000050411415223013700264720ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas, Hans Pabst (Intel Corp.) 
******************************************************************************/ /* @TODO: use for-loops to potentially leverage NUMA in the future */ int i1, i2, i3, i4, i5, i6, i7; int lpb = 0; int bofm = 0; int bifm = 0; int S = 0; int R = 0; int ifmb = 0; int ofmb = 0; /* low precision formatting */ if ( tensor->layout->num_dims == 7 ) { lpb = tensor->layout->dim_size[0]; bofm = tensor->layout->dim_size[1]; bifm = tensor->layout->dim_size[2]; S = tensor->layout->dim_size[3]; R = tensor->layout->dim_size[4]; ifmb = tensor->layout->dim_size[5]; ofmb = tensor->layout->dim_size[6]; } else if ( tensor->layout->num_dims == 6 ) { lpb = 1; bofm = tensor->layout->dim_size[0]; bifm = tensor->layout->dim_size[1]; S = tensor->layout->dim_size[2]; R = tensor->layout->dim_size[3]; ifmb = tensor->layout->dim_size[4]; ofmb = tensor->layout->dim_size[5]; } else { /* should not happen, @TODO throw ERR */ } /*printf("Layout of filters fil ofmb %i ifmb %i R %i S %i bifm %i bofm %i lpb %i \n", ofmb, ifmb, R, S, bifm, bofm, lpb);*/ { LIBXSMM_VLA_DECL(7, element_type, handle_data_1, (element_type*)tensor->data, ifmb, R, S, bifm, bofm, lpb); LIBXSMM_VLA_DECL(4, const element_type, user_data, (const element_type*)data, ifmb * bifm * lpb, R, S); for (i1 = 0; i1 < ofmb; ++i1) { for (i2 = 0; i2 < ifmb; ++i2) { for (i3 = 0; i3 < R; ++i3) { for (i4 = 0; i4 < S; ++i4) { for (i5 = 0; i5 < bifm; ++i5) { for (i6 = 0; i6 < bofm; ++i6) { for (i7 = 0; i7 < lpb; ++i7) { LIBXSMM_VLA_ACCESS(7, handle_data_1, i1, i2, i3, i4, i5, i6, i7, ifmb, R, S, bifm, bofm, lpb) = LIBXSMM_VLA_ACCESS(4, user_data, i1 * bofm + i6, ((size_t)i2*bifm*lpb) + ((size_t)i5*lpb) + i7, i3, i4, ifmb * bifm * lpb, R, S); } } } } } } } } libxsmm-1.17/src/template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c000066400000000000000000000046451415223013700267040ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. 
* * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke, Evangelos Georganas, Hans Pabst (Intel Corp.) ******************************************************************************/ /* @TODO: use for-loops to potentially leverage NUMA in the future */ int i1, i2, i3, i4, i5, i6, i7; int lpb = 0; int bofm = 0; int bifm = 0; int S = 0; int R = 0; int ifmb = 0; int ofmb = 0; /* low precision formatting */ if ( tensor->layout->num_dims == 7 ) { lpb = tensor->layout->dim_size[0]; bofm = tensor->layout->dim_size[1]; bifm = tensor->layout->dim_size[2]; S = tensor->layout->dim_size[3]; R = tensor->layout->dim_size[4]; ifmb = tensor->layout->dim_size[5]; ofmb = tensor->layout->dim_size[6]; } else if ( tensor->layout->num_dims == 6 ) { lpb = 1; bofm = tensor->layout->dim_size[0]; bifm = tensor->layout->dim_size[1]; S = tensor->layout->dim_size[2]; R = tensor->layout->dim_size[3]; ifmb = tensor->layout->dim_size[4]; ofmb = tensor->layout->dim_size[5]; } else { /* should not happen, @TODO throw ERR */ } { LIBXSMM_VLA_DECL(4, element_type, user_data, (element_type*)data, ifmb * bifm * lpb, R, S); LIBXSMM_VLA_DECL(7, const element_type, handle_data_1, (const element_type*)tensor->data, ifmb, R, S, bifm, bofm, lpb); for (i1 = 0; i1 < ofmb; ++i1) { for (i2 = 0; i2 < ifmb; ++i2) { for (i3 = 0; i3 < R; ++i3) { for (i4 = 0; i4 < S; ++i4) { for (i5 = 0; i5 < bifm; ++i5) { for (i6 = 0; i6 < bofm; ++i6) { for (i7 = 0; i7 < lpb; ++i7) { LIBXSMM_VLA_ACCESS(4, user_data, i1 * bofm + i6, ((size_t)i2*bifm*lpb) + ((size_t)i5*lpb) + i7, i3, i4, ifmb * bifm * lpb, R, S) = LIBXSMM_VLA_ACCESS(7, handle_data_1, i1, i2, i3, i4, i5, i6, i7, ifmb, R, S, bifm, bofm, lpb); } } } } } } } } 
libxsmm-1.17/src/template/libxsmm_dnn_zero_rim_st_input_custom.tpl.c000066400000000000000000000026221415223013700262200ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) ******************************************************************************/ /* this is crappy as it requires a complicated if... */ if (handle->desc.pad_h_in > 0 || handle->desc.pad_w_in > 0) { for ( ij = 0; ij < handle->ifhp; ij++ ) { for ( ii = 0; ii < handle->ifwp; ii++ ) { if ( (ij < handle->desc.pad_h_in) || (ij >= (handle->desc.H+handle->desc.pad_h_in)) || (ii < handle->desc.pad_w_in) || (ii >= (handle->desc.W+handle->desc.pad_w_in)) ) { for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1lpblock, ij, ii, ifm2, handle->blocksifm*handle->fm_lp_block, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; } } } } } libxsmm-1.17/src/template/libxsmm_dnn_zero_rim_st_input_nhwc.tpl.c000066400000000000000000000025671415223013700256550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Alexander Heinecke (Intel Corp.) 
******************************************************************************/ /* this is crappy as it requires a complicated if... */ if (handle->desc.pad_h_in > 0 || handle->desc.pad_w_in > 0) { for ( ij = 0; ij < handle->ifhp; ij++ ) { for ( ii = 0; ii < handle->ifwp; ii++ ) { if ( (ij < handle->desc.pad_h_in) || (ij >= (handle->desc.H+handle->desc.pad_h_in)) || (ii < handle->desc.pad_w_in) || (ii >= (handle->desc.W+handle->desc.pad_w_in)) ) { for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { LIBXSMM_VLA_ACCESS(5, del_input, img, ij, ii, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) = (element_input_type)0; } } } } } libxsmm-1.17/src/template/libxsmm_internal_gru_bwdupd_fused_eltwise_1.tpl.c000066400000000000000000000067461415223013700274270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.) 
******************************************************************************/ { libxsmm_blasint _k, _j; __m512 _vdh, _vdout, _vdf, _vdc, _vf, _vc, _vhp, _vt1, _vt2; element_input_type* _dout = &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K); element_input_type* _hp; element_input_type* _c = &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K); element_input_type* _f = &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K); element_input_type* _dh = &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K); element_input_type* _dc = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); element_input_type* _df = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); const __m512 _vneg_ones = _mm512_set1_ps( (float)-1.0 ); const __m512 _vones = _mm512_set1_ps( (float)1.0 ); if (0 == j) { _hp = &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K); } else { assert(NULL != LIBXSMM_CONCATENATE(h, LIBXSMM_VLA_POSTFIX)); _hp = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K); } if (j == t-1) { for ( _j = 0; _j < bn; ++_j ) { LIBXSMM_PRAGMA_UNROLL_N(4) for ( _k = 0; _k < bk; _k += 16 ) { _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_dh[(_j*K)+_k]); LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_dout[(_j*K)+_k], _vdout); _vc = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_c[(_j*K)+_k]); _vt1 = _mm512_sub_ps(_vones, _vc); _vt1 = _mm512_mul_ps(_vdout, _vt1); _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_f[(_j*K)+_k]); _vt2 = _mm512_fnmsub_ps(_vf, _vf, _vneg_ones); _vdf = _mm512_mul_ps(_vt1, _vt2); LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_df[(_j*K)+_k], _vdf); _vhp = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_hp[(_j*K)+_k]); _vt1 = _mm512_mul_ps(_vt1, _vc); _vt2 = _mm512_sub_ps(_vhp, _vf); _vdc = _mm512_mul_ps(_vt1, _vt2); LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_dc[(_j*K)+_k], _vdc); } } } else { for ( _j = 0; _j < bn; ++_j ) { LIBXSMM_PRAGMA_UNROLL_N(4) for ( _k = 0; _k < bk; _k += 16 ) { _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_dout[(_j*K)+_k]); _vdh = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_dh[(_j*K)+_k]); _vdout = _mm512_add_ps(_vdout, _vdh); LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_dout[(_j*K)+_k], _vdout); _vc 
= LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_c[(_j*K)+_k]); _vt1 = _mm512_sub_ps(_vones, _vc); _vt1 = _mm512_mul_ps(_vdout, _vt1); _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_f[(_j*K)+_k]); _vt2 = _mm512_fnmsub_ps(_vf, _vf, _vneg_ones); _vdf = _mm512_mul_ps( _vt1, _vt2 ); LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_df[(_j*K)+_k], _vdf); _vhp = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_hp[(_j*K)+_k]); _vt1 = _mm512_mul_ps(_vt1, _vc); _vt2 = _mm512_sub_ps(_vhp, _vf); _vdc = _mm512_mul_ps( _vt1, _vt2 ); LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_dc[(_j*K)+_k], _vdc); } } } } libxsmm-1.17/src/template/libxsmm_internal_gru_bwdupd_fused_eltwise_2.tpl.c000066400000000000000000000033631415223013700274200ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Kunal Banerjee (Intel Corp.) 
******************************************************************************/ { libxsmm_blasint _k, _j; __m512 _vdi, _vdo, _vi, _vhp, _vt1, _vt2; element_input_type* _hp; element_input_type* _i = &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K); element_input_type* _di = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); element_input_type* _do = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); const __m512 _vones = _mm512_set1_ps( (float)1.0 ); if (0 == j) { _hp = &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K); } else { _hp = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K); } for ( _j = 0; _j < bn; ++_j ) { LIBXSMM_PRAGMA_UNROLL_N(4) for ( _k = 0; _k < bk; _k += 16 ) { _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_i[(_j*K)+_k]); _vt1 = _mm512_sub_ps(_vones, _vi); _vt1 = _mm512_mul_ps(_vi, _vt1); _vhp = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_hp[(_j*K)+_k]); _vdo = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_do[(_j*K)+_k]); _vt2 = _mm512_mul_ps(_vdo, _vhp); _vdi = _mm512_mul_ps(_vt1, _vt2); LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_di[(_j*K)+_k], _vdi); } } } libxsmm-1.17/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise.tpl.c000066400000000000000000000135721415223013700273640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.), Alexander Heinecke (Intel Corp.) 
******************************************************************************/ { libxsmm_blasint _k, _j; __m512 _vdout, _vdh, _vo, _vt1, _vt2, _vco, _vdcs, _vdcp, _vi, _vci, _vdci, _vdi, _vcps, _vf, _vdf, _vdp; element_input_type* _dout = &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K); element_input_type* _dh = &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K); element_input_type* _o = &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K); element_input_type* _co = &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K); element_input_type* _dcs = &LIBXSMM_VLA_ACCESS(2, dcs, in, ik, K); element_input_type* _i = &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K); element_input_type* _ci = &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K); element_input_type* _dci = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); element_input_type* _di = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); element_input_type* _cps = cps_ptr; element_input_type* _f = &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K); element_input_type* _df = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); element_input_type* _dp = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); element_input_type* _dcp = &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K); const __m512 _vneg_ones = _mm512_set1_ps( (float)-1.0 ); const __m512 _vones = _mm512_set1_ps( (float)1.0 ); if (j == t-1) { for ( _j = 0; _j < bn; ++_j ) { LIBXSMM_PRAGMA_UNROLL_N(4) for ( _k = 0; _k < bk; _k += 16 ) { _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dh[(_j*K)+_k] ); _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*K)+_k] ); _vt1 = _mm512_mul_ps( _vdout, _vo ); _vco = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_co[(_j*K)+_k] ); _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _vneg_ones); _vt1 = _mm512_mul_ps( _vt1, _vt2 ); _vdcs = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dcs[(_j*K)+_k] ); _vdcp = _mm512_add_ps( _vdcs, _vt1 ); _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*K)+_k] ); _vt1 = _mm512_mul_ps( _vi, _vdcp ); _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*K)+_k] ); _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _vneg_ones); _vdci = _mm512_mul_ps( _vt1, _vt2 ); 
LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dci[(_j*K)+_k], _vdci ); _vt1 = _mm512_mul_ps( _vci, _vdcp ); _vt2 = _mm512_sub_ps( _vones, _vi ); _vdi = _mm512_mul_ps( _vi, _vt2); _vdi = _mm512_mul_ps( _vdi, _vt1); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_di[(_j*K)+_k], _vdi ); _vcps = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*K)+_k] ); _vt1 = _mm512_mul_ps( _vcps, _vdcp ); _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*K)+_k] ); _vt2 = _mm512_sub_ps( _vones, _vf ); _vdf = _mm512_mul_ps( _vf, _vt2); _vdf = _mm512_mul_ps( _vdf, _vt1); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_df[(_j*K)+_k], _vdf ); _vt1 = _mm512_mul_ps( _vdout, _vco); _vt2 = _mm512_sub_ps( _vones, _vo ); _vt2 = _mm512_mul_ps( _vo, _vt2); _vdp = _mm512_mul_ps( _vt1, _vt2 ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dp[(_j*K)+_k], _vdp ); _vdcp = _mm512_mul_ps( _vdcp, _vf); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dcp[(_j*K)+_k], _vdcp ); } } } else { for ( _j = 0; _j < bn; ++_j ) { LIBXSMM_PRAGMA_UNROLL_N(4) for ( _k = 0; _k < bk; _k += 16 ) { _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dout[(_j*K)+_k] ); _vdh = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dh[(_j*K)+_k] ); _vdout = _mm512_add_ps( _vdout, _vdh ); _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*K)+_k] ); _vt1 = _mm512_mul_ps( _vdout, _vo ); _vco = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_co[(_j*K)+_k] ); _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _vneg_ones); _vt1 = _mm512_mul_ps( _vt1, _vt2 ); _vdcp = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dcp[(_j*K)+_k] ); _vdcp = _mm512_add_ps( _vdcp, _vt1 ); _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*K)+_k] ); _vt1 = _mm512_mul_ps( _vi, _vdcp ); _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*K)+_k] ); _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _vneg_ones); _vdci = _mm512_mul_ps( _vt1, _vt2 ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dci[(_j*K)+_k], _vdci ); _vt1 = _mm512_mul_ps( _vci, _vdcp ); _vt2 = _mm512_sub_ps( _vones, _vi ); _vdi = _mm512_mul_ps( _vi, _vt2); _vdi = _mm512_mul_ps( _vdi, _vt1); LIBXSMM_INTRINSICS_MM512_STREAM_PS( 
&_di[(_j*K)+_k], _vdi ); _vcps = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*K)+_k] ); _vt1 = _mm512_mul_ps( _vcps, _vdcp ); _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*K)+_k] ); _vt2 = _mm512_sub_ps( _vones, _vf ); _vdf = _mm512_mul_ps( _vf, _vt2); _vdf = _mm512_mul_ps( _vdf, _vt1); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_df[(_j*K)+_k], _vdf ); _vt1 = _mm512_mul_ps( _vdout, _vco); _vt2 = _mm512_sub_ps( _vones, _vo ); _vt2 = _mm512_mul_ps( _vo, _vt2); _vdp = _mm512_mul_ps( _vt1, _vt2 ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dp[(_j*K)+_k], _vdp ); _vdcp = _mm512_mul_ps( _vdcp, _vf); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dcp[(_j*K)+_k], _vdcp ); } } } } libxsmm-1.17/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat.tpl.c000066400000000000000000000154431415223013700312620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.), Alexander Heinecke (Intel Corp.) 
******************************************************************************/ { libxsmm_blasint _k, _j; __m512 _vdout, _vdh, _vo, _vt1, _vt2, _vco, _vdcs, _vdcp, _vi, _vci, _vdci, _vdi, _vcps, _vf, _vdf, _vdp; element_input_type* _dout = &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K); element_input_type* _dh = &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K); element_input_type* _o = &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K); element_input_type* _co = &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K); element_input_type* _dcs = &LIBXSMM_VLA_ACCESS(2, dcs, in, ik, K); element_input_type* _i = &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K); element_input_type* _ci = &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K); element_input_type* _dci = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); element_input_type* _di = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); element_input_type* _cps = cps_ptr; element_input_type* _f = &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K); element_input_type* _df = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); element_input_type* _dp = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); element_input_type* _dcp = &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K); element_input_type* _dciB = &LIBXSMM_VLA_ACCESS(4, dciB, inb, ikb, 0, 0, kBlocks, bn, bk); element_input_type* _diB = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); element_input_type* _dfB = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); element_input_type* _dpB = &LIBXSMM_VLA_ACCESS(4, dpB, inb, ikb, 0, 0, kBlocks, bn, bk); const __m512 _vneg_ones = _mm512_set1_ps( (float)-1.0 ); const __m512 _vones = _mm512_set1_ps( (float)1.0 ); if (j == t-1) { for ( _j = 0; _j < bn; ++_j ) { LIBXSMM_PRAGMA_UNROLL_N(4) for ( _k = 0; _k < bk; _k += 16 ) { _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dh[(_j*K)+_k] ); _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*K)+_k] ); _vt1 = _mm512_mul_ps( _vdout, _vo ); _vco = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_co[(_j*K)+_k] ); _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _vneg_ones); _vt1 = 
_mm512_mul_ps( _vt1, _vt2 ); _vdcs = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dcs[(_j*K)+_k] ); _vdcp = _mm512_add_ps( _vdcs, _vt1 ); _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*K)+_k] ); _vt1 = _mm512_mul_ps( _vi, _vdcp ); _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*K)+_k] ); _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _vneg_ones); _vdci = _mm512_mul_ps( _vt1, _vt2 ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dci[(_j*K)+_k], _vdci ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dciB[(_j*bk)+_k], _vdci ); _vt1 = _mm512_mul_ps( _vci, _vdcp ); _vt2 = _mm512_sub_ps( _vones, _vi ); _vdi = _mm512_mul_ps( _vi, _vt2); _vdi = _mm512_mul_ps( _vdi, _vt1); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_di[(_j*K)+_k], _vdi ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_diB[(_j*bk)+_k], _vdi ); _vcps = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*K)+_k] ); _vt1 = _mm512_mul_ps( _vcps, _vdcp ); _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*K)+_k] ); _vt2 = _mm512_sub_ps( _vones, _vf ); _vdf = _mm512_mul_ps( _vf, _vt2); _vdf = _mm512_mul_ps( _vdf, _vt1); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_df[(_j*K)+_k], _vdf ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dfB[(_j*bk)+_k], _vdf ); _vt1 = _mm512_mul_ps( _vdout, _vco); _vt2 = _mm512_sub_ps( _vones, _vo ); _vt2 = _mm512_mul_ps( _vo, _vt2); _vdp = _mm512_mul_ps( _vt1, _vt2 ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dp[(_j*K)+_k], _vdp ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dpB[(_j*bk)+_k], _vdp ); _vdcp = _mm512_mul_ps( _vdcp, _vf); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dcp[(_j*K)+_k], _vdcp ); } } } else { for ( _j = 0; _j < bn; ++_j ) { LIBXSMM_PRAGMA_UNROLL_N(4) for ( _k = 0; _k < bk; _k += 16 ) { _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dout[(_j*K)+_k] ); _vdh = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dh[(_j*K)+_k] ); _vdout = _mm512_add_ps( _vdout, _vdh ); _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*K)+_k] ); _vt1 = _mm512_mul_ps( _vdout, _vo ); _vco = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_co[(_j*K)+_k] ); _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _vneg_ones); 
_vt1 = _mm512_mul_ps( _vt1, _vt2 ); _vdcp = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dcp[(_j*K)+_k] ); _vdcp = _mm512_add_ps( _vdcp, _vt1 ); _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*K)+_k] ); _vt1 = _mm512_mul_ps( _vi, _vdcp ); _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*K)+_k] ); _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _vneg_ones); _vdci = _mm512_mul_ps( _vt1, _vt2 ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dci[(_j*K)+_k], _vdci ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dciB[(_j*bk)+_k], _vdci ); _vt1 = _mm512_mul_ps( _vci, _vdcp ); _vt2 = _mm512_sub_ps( _vones, _vi ); _vdi = _mm512_mul_ps( _vi, _vt2); _vdi = _mm512_mul_ps( _vdi, _vt1); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_di[(_j*K)+_k], _vdi ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_diB[(_j*bk)+_k], _vdi ); _vcps = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*K)+_k] ); _vt1 = _mm512_mul_ps( _vcps, _vdcp ); _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*K)+_k] ); _vt2 = _mm512_sub_ps( _vones, _vf ); _vdf = _mm512_mul_ps( _vf, _vt2); _vdf = _mm512_mul_ps( _vdf, _vt1); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_df[(_j*K)+_k], _vdf ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dfB[(_j*bk)+_k], _vdf ); _vt1 = _mm512_mul_ps( _vdout, _vco); _vt2 = _mm512_sub_ps( _vones, _vo ); _vt2 = _mm512_mul_ps( _vo, _vt2); _vdp = _mm512_mul_ps( _vt1, _vt2 ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dp[(_j*K)+_k], _vdp ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dpB[(_j*bk)+_k], _vdp ); _vdcp = _mm512_mul_ps( _vdcp, _vf); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dcp[(_j*K)+_k], _vdcp ); } } } } libxsmm-1.17/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat_bf16.tpl.c000066400000000000000000000245371415223013700321040ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.), Alexander Heinecke (Intel Corp.) ******************************************************************************/ { float* _dout = &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K); element_input_type* _dh = &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K); element_input_type* _o = &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K); element_input_type* _co = &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K); element_input_type* _dcs = &LIBXSMM_VLA_ACCESS(2, dcs, in, ik, K); element_input_type* _ii = &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K); element_input_type* _ci = &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K); element_input_type* _dci = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); element_input_type* _di = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); element_input_type* _cps = cps_ptr; element_input_type* _f = &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K); element_input_type* _df = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); element_input_type* _dp = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); element_input_type* _dcp = &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K); element_input_type* _dciB = &LIBXSMM_VLA_ACCESS(5, dciB, ikb, inb, 0, 0, 0, nBlocks, bn_lp, bk, lpb); element_input_type* _diB = &LIBXSMM_VLA_ACCESS(5, diB, ikb, inb, 0, 0, 0, nBlocks, bn_lp, bk, lpb); element_input_type* _dfB = &LIBXSMM_VLA_ACCESS(5, dfB, ikb, inb, 0, 0, 0, nBlocks, bn_lp, bk, lpb); element_input_type* _dpB = &LIBXSMM_VLA_ACCESS(5, dpB, ikb, inb, 0, 0, 0, nBlocks, bn_lp, bk, lpb); libxsmm_blasint _k, _j; __m512 _vdout, _vdh, _vo, _vt1, _vt2, _vco, _vdcs, _vdcp, _vii, _vci, _vdci, _vdi, _vcps, _vf, _vdf, _vdp; const __m512 _neg_ones = _mm512_set1_ps( (float)-1.0 ); const __m512 _ones = _mm512_set1_ps( (float)1.0 ); int _lpb = 2; if (j == t-1) { for ( _j = 0; _j < bn; ++_j ) { LIBXSMM_PRAGMA_UNROLL_N(4) for ( _k = 0; _k < bk; _k 
+= 16 ) { _vdout = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_dh[(_j*K)+_k] )); _vo = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_o[(_j*K)+_k] )); _vt1 = _mm512_mul_ps( _vdout, _vo ); _vco = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_co[(_j*K)+_k] )); _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _neg_ones); _vt1 = _mm512_mul_ps( _vt1, _vt2 ); _vdcs = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_dcs[(_j*K)+_k] )); _vdcp = _mm512_add_ps( _vdcs, _vt1 ); _vii = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_ii[(_j*K)+_k] )); _vt1 = _mm512_mul_ps( _vii, _vdcp ); _vci = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_ci[(_j*K)+_k] )); _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _neg_ones); _vdci = _mm512_mul_ps( _vt1, _vt2 ); _mm256_stream_si256((__m256i*)&_dci[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdci) ); _vt1 = _mm512_mul_ps( _vci, _vdcp ); _vt2 = _mm512_sub_ps( _ones, _vii ); _vdi = _mm512_mul_ps( _vii, _vt2); _vdi = _mm512_mul_ps( _vdi, _vt1); _mm256_stream_si256((__m256i*)&_di[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdi) ); _vcps = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_cps[(_j*K)+_k] )); _vt1 = _mm512_mul_ps( _vcps, _vdcp ); _vf = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_f[(_j*K)+_k] )); _vt2 = _mm512_sub_ps( _ones, _vf ); _vdf = _mm512_mul_ps( _vf, _vt2); _vdf = _mm512_mul_ps( _vdf, _vt1); _mm256_stream_si256((__m256i*)&_df[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdf) ); _vt1 = _mm512_mul_ps( _vdout, _vco); _vt2 = _mm512_sub_ps( _ones, _vo ); _vt2 = _mm512_mul_ps( _vo, _vt2); _vdp = _mm512_mul_ps( _vt1, _vt2 ); _mm256_stream_si256((__m256i*)&_dp[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdp) ); _vdcp = _mm512_mul_ps( _vdcp, _vf); _mm256_stream_si256((__m256i*)&_dcp[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdcp) ); } } } else { for ( _j = 0; _j < 
bn; ++_j ) { LIBXSMM_PRAGMA_UNROLL_N(4) for ( _k = 0; _k < bk; _k += 16 ) { _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dout[(_j*K)+_k] ); _vdh = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_dh[(_j*K)+_k] )); _vdout = _mm512_add_ps( _vdout, _vdh ); _vo = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_o[(_j*K)+_k] )); _vt1 = _mm512_mul_ps( _vdout, _vo ); _vco = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_co[(_j*K)+_k] )); _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _neg_ones); _vt1 = _mm512_mul_ps( _vt1, _vt2 ); _vdcp = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_dcp[(_j*K)+_k] )); _vdcp = _mm512_add_ps( _vdcp, _vt1 ); _vii = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_ii[(_j*K)+_k] )); _vt1 = _mm512_mul_ps( _vii, _vdcp ); _vci = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_ci[(_j*K)+_k] )); _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _neg_ones); _vdci = _mm512_mul_ps( _vt1, _vt2 ); _mm256_stream_si256((__m256i*)&_dci[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdci) ); _vt1 = _mm512_mul_ps( _vci, _vdcp ); _vt2 = _mm512_sub_ps( _ones, _vii ); _vdi = _mm512_mul_ps( _vii, _vt2); _vdi = _mm512_mul_ps( _vdi, _vt1); _mm256_stream_si256((__m256i*)&_di[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdi) ); _vcps = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_cps[(_j*K)+_k] )); _vt1 = _mm512_mul_ps( _vcps, _vdcp ); _vf = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_f[(_j*K)+_k] )); _vt2 = _mm512_sub_ps( _ones, _vf ); _vdf = _mm512_mul_ps( _vf, _vt2); _vdf = _mm512_mul_ps( _vdf, _vt1); _mm256_stream_si256((__m256i*)&_df[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdf) ); _vt1 = _mm512_mul_ps( _vdout, _vco); _vt2 = _mm512_sub_ps( _ones, _vo ); _vt2 = _mm512_mul_ps( _vo, _vt2); _vdp = _mm512_mul_ps( _vt1, _vt2 ); _mm256_stream_si256((__m256i*)&_dp[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdp) 
); _vdcp = _mm512_mul_ps( _vdcp, _vf); _mm256_stream_si256((__m256i*)&_dcp[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdcp) ); } } } { /* Store di/dci/df/dp to diB/dciB/dfB/dpB which is CNNC AND vnni format */ LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, di_, _di, K); LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, df_, _df, K); LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, dp_, _dp, K); LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, dci_, _dci, K); LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, diB_, _diB, bk, _lpb); LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, dfB_, _dfB, bk, _lpb); LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, dpB_, _dpB, bk, _lpb); LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, dciB_, _dciB, bk, _lpb); if ( (bn % 2 == 0) && (bk % 16 == 0) ) { const __m512i perm_idx = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); __m256i c0, c1; __m512i c01; for (_j = 0; _j < bn; _j+=2) { for (_k = 0; _k < bk; _k+=16) { c0 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, di_, _j, _k, K)); c1 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, di_, _j+1, _k, K)); c01 = _mm512_inserti64x4 (LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), c0, 0); c01 = _mm512_inserti64x4 (c01, c1, 1); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(3, diB_, _j/_lpb, _k, 0, bk, _lpb), _mm512_permutexvar_epi16(perm_idx, c01)); c0 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, df_, _j, _k, K)); c1 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, df_, _j+1, _k, K)); c01 = _mm512_inserti64x4 (LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), c0, 0); c01 = _mm512_inserti64x4 (c01, c1, 1); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(3, dfB_, _j/_lpb, _k, 0, bk, _lpb), _mm512_permutexvar_epi16(perm_idx, c01)); c0 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, dp_, _j, _k, K)); c1 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, dp_, _j+1, _k, K)); c01 = _mm512_inserti64x4 
(LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), c0, 0); c01 = _mm512_inserti64x4 (c01, c1, 1); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(3, dpB_, _j/_lpb, _k, 0, bk, _lpb), _mm512_permutexvar_epi16(perm_idx, c01)); c0 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, dci_, _j, _k, K)); c1 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, dci_, _j+1, _k, K)); c01 = _mm512_inserti64x4 (LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), c0, 0); c01 = _mm512_inserti64x4 (c01, c1, 1); _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(3, dciB_, _j/_lpb, _k, 0, bk, _lpb), _mm512_permutexvar_epi16(perm_idx, c01)); } } } else { for (_j = 0; _j < bn; _j++) { for (_k = 0; _k < bk; _k++) { LIBXSMM_VLA_ACCESS(3, diB_, _j / _lpb, _k, _j%_lpb, bk, _lpb) = LIBXSMM_VLA_ACCESS(2, di_, _j, _k, K); LIBXSMM_VLA_ACCESS(3, dfB_, _j / _lpb, _k, _j%_lpb, bk, _lpb) = LIBXSMM_VLA_ACCESS(2, df_, _j, _k, K); LIBXSMM_VLA_ACCESS(3, dpB_, _j / _lpb, _k, _j%_lpb, bk, _lpb) = LIBXSMM_VLA_ACCESS(2, dp_, _j, _k, K); LIBXSMM_VLA_ACCESS(3, dciB_, _j / _lpb, _k, _j%_lpb, bk, _lpb) = LIBXSMM_VLA_ACCESS(2, dci_, _j, _k, K); } } } } } libxsmm-1.17/src/template/libxsmm_internal_lstm_fwd_fused_eltwise.tpl.c000066400000000000000000000054601415223013700266540ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.), Alexander Heinecke (Intel Corp.) 
******************************************************************************/ { libxsmm_blasint _k, _j; element_input_type* _o = &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K); element_input_type* _i = &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K); element_input_type* _f = &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K); element_input_type* _ci = &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K); element_input_type* _cps = cps_ptr; element_input_type* _cs = &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K); element_input_type* _h = &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K); element_input_type* _co = &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K); __m512 _vf, _vcs, _vi, _vci, _vco, _vo, _vh; const __m512 _halves = _mm512_set1_ps( (LIBXSMM_DNN_ELTWISE_FTYPE)0.5 ); for ( _j = 0; _j < bn; ++_j ) { LIBXSMM_PRAGMA_UNROLL_N(4) for ( _k = 0; _k < bk; _k += 16 ) { _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*K)+_k] ); _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*K)+_k] ); _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*K)+_k] ); _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*K)+_k] ); _vcs = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*K)+_k] ); _vo = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS( _mm512_mul_ps( _vo, _halves ) ), _halves, _halves); _vi = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS( _mm512_mul_ps( _vi, _halves ) ), _halves, _halves); _vci = LIBXSMM_INTRINSICS_MM512_TANH_PS( _vci ); _vf = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS( _mm512_mul_ps( _vf, _halves ) ), _halves, _halves); _vcs = _mm512_mul_ps( _vf, _vcs ); _vcs = _mm512_fmadd_ps( _vi, _vci, _vcs ); _vco = LIBXSMM_INTRINSICS_MM512_TANH_PS( _vcs ); _vh = _mm512_mul_ps( _vo, _vco ); _mm512_storeu_ps( &_o[(_j*K)+_k], _vo ); _mm512_storeu_ps( &_i[(_j*K)+_k], _vi ); _mm512_storeu_ps( &_ci[(_j*K)+_k], _vci ); _mm512_storeu_ps( &_f[(_j*K)+_k], _vf ); _mm512_storeu_ps( &_cs[(_j*K)+_k], _vcs ); _mm512_storeu_ps( &_co[(_j*K)+_k], _vco ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_h[(_j*K)+_k], _vh ); } } } 
libxsmm-1.17/src/template/libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c000066400000000000000000000053101415223013700274640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Evangelos Georganas (Intel Corp.), Alexander Heinecke (Intel Corp.) ******************************************************************************/ { libxsmm_blasint _k, _j; float* _o = &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K); float* _i = &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K); float* _f = &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K); float* _ci = &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K); float* _cps = cps_ptr; float* _cs = &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K); float* _h = &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K); float* _co = &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K); __m512 _vf, _vcs, _vi, _vci, _vco, _vo, _vh; const __m512 _halves = _mm512_set1_ps( (LIBXSMM_DNN_ELTWISE_FTYPE)0.5 ); for ( _j = 0; _j < bn; ++_j ) { LIBXSMM_PRAGMA_UNROLL_N(4) for ( _k = 0; _k < bk; _k += 16 ) { _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*K)+_k] ); _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*K)+_k] ); _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*K)+_k] ); _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*K)+_k] ); _vcs = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*K)+_k] ); _vo = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS( _mm512_mul_ps( _vo, _halves ) ), _halves, _halves); _vi = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS( _mm512_mul_ps( _vi, _halves ) ), _halves, _halves); _vci = LIBXSMM_INTRINSICS_MM512_TANH_PS( _vci ); _vf = _mm512_fmadd_ps( 
LIBXSMM_INTRINSICS_MM512_TANH_PS( _mm512_mul_ps( _vf, _halves ) ), _halves, _halves); _vcs = _mm512_mul_ps( _vf, _vcs ); _vcs = _mm512_fmadd_ps( _vi, _vci, _vcs ); _vco = LIBXSMM_INTRINSICS_MM512_TANH_PS( _vcs ); _vh = _mm512_mul_ps( _vo, _vco ); _mm512_storeu_ps( &_o[(_j*K)+_k], _vo ); _mm512_storeu_ps( &_i[(_j*K)+_k], _vi ); _mm512_storeu_ps( &_ci[(_j*K)+_k], _vci ); _mm512_storeu_ps( &_f[(_j*K)+_k], _vf ); _mm512_storeu_ps( &_cs[(_j*K)+_k], _vcs ); _mm512_storeu_ps( &_co[(_j*K)+_k], _vco ); LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_h[(_j*K)+_k], _vh ); } } } libxsmm-1.17/src/template/libxsmm_matdiff.tpl.c000066400000000000000000000157401415223013700216330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ const LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE *const real_ref = (const LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE*)ref; const LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE *const real_tst = (const LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE*)tst; double compf = 0, compfr = 0, compft = 0, normfr = 0, normft = 0, normr = 0, normt = 0; double normrc = 0, normtc = 0, compr = 0, compt = 0, compd = 0; libxsmm_blasint i, j; for (i = 0; i < nn; ++i) { double comprj = 0, comptj = 0, compij = 0; double normrj = 0, normtj = 0, normij = 0; double v0, v1; for (j = 0; j < mm; ++j) { const double ti = (0 != real_tst ? 
real_tst[i*ldt+j] : 0); const double ri = real_ref[i*ldr+j]; const double ta = LIBXSMM_ABS(ti); const double ra = LIBXSMM_ABS(ri); /* minimum/maximum of reference set */ if (ri < info->min_ref) info->min_ref = ri; if (ri > info->max_ref) info->max_ref = ri; if (LIBXSMM_NOTNAN(ti) && inf > ta) { const double di = (0 != real_tst ? (ri < ti ? (ti - ri) : (ri - ti)) : 0); /* minimum/maximum of test set */ if (ti < info->min_tst) info->min_tst = ti; if (ti > info->max_tst) info->max_tst = ti; /* maximum absolute error and location */ if (info->linf_abs < di) { info->linf_abs = di; info->m = j; info->n = i; } /* maximum error relative to current value */ if (0 < ra) { const double dri = di / ra; if (info->linf_rel < dri) info->linf_rel = dri; /* sum of relative differences */ v0 = dri * dri; if (inf > v0) { v0 -= compd; v1 = info->l2_rel + v0; compd = (v1 - info->l2_rel) - v0; info->l2_rel = v1; } } /* row-wise sum of reference values with Kahan compensation */ v0 = ra - comprj; v1 = normrj + v0; comprj = (v1 - normrj) - v0; normrj = v1; /* row-wise sum of test values with Kahan compensation */ v0 = ta - comptj; v1 = normtj + v0; comptj = (v1 - normtj) - v0; normtj = v1; /* row-wise sum of differences with Kahan compensation */ v0 = di - compij; v1 = normij + v0; compij = (v1 - normij) - v0; normij = v1; /* Froebenius-norm of reference matrix with Kahan compensation */ v0 = ri * ri - compfr; v1 = normfr + v0; compfr = (v1 - normfr) - v0; normfr = v1; /* Froebenius-norm of test matrix with Kahan compensation */ v0 = ti * ti - compft; v1 = normft + v0; compft = (v1 - normft) - v0; normft = v1; /* Froebenius-norm of differences with Kahan compensation */ v0 = di * di; if (inf > v0) { v0 -= compf; v1 = info->l2_abs + v0; compf = (v1 - info->l2_abs) - v0; info->l2_abs = v1; } } else { /* NaN */ info->m = j; info->n = i; result_nan = LIBXSMM_NOTNAN(ri) && inf > ra ? 
1 : 2; break; } } if (0 == result_nan) { /* summarize reference values */ v0 = normrj - compr; v1 = info->l1_ref + v0; compr = (v1 - info->l1_ref) - v0; info->l1_ref = v1; /* summarize test values */ v0 = normtj - compt; v1 = info->l1_tst + v0; compt = (v1 - info->l1_tst) - v0; info->l1_tst = v1; /* calculate Infinity-norm of differences */ if (info->normi_abs < normij) info->normi_abs = normij; /* calculate Infinity-norm of reference/test values */ if (normr < normrj) normr = normrj; if (normt < normtj) normt = normtj; } else { break; } } if (0 == result_nan) { const libxsmm_blasint size = mm * nn; double compr_var = 0, compt_var = 0; /* initial variance */ assert(0 == info->var_ref); /* !LIBXSMM_ASSERT */ assert(0 == info->var_tst); /* !LIBXSMM_ASSERT */ if (0 != size) { /* final average */ info->avg_ref = info->l1_ref / size; info->avg_tst = info->l1_tst / size; } /* Infinity-norm relative to reference */ if (0 < normr) { info->normi_rel = info->normi_abs / normr; } else if (0 < normt) { /* relative to test */ info->normi_rel = info->normi_abs / normt; } else { /* should not happen */ info->normi_rel = 0; } /* Froebenius-norm relative to reference */ if (0 < normfr) { info->normf_rel = info->l2_abs / normfr; } else if (0 < normft) { /* relative to test */ info->normf_rel = info->l2_abs / normft; } else { /* should not happen */ info->normf_rel = 0; } for (j = 0; j < mm; ++j) { double compri = 0, compti = 0, comp1 = 0; double normri = 0, normti = 0, norm1 = 0; for (i = 0; i < nn; ++i) { const double ri = real_ref[i*ldr + j], ti = (0 != real_tst ? real_tst[i*ldt + j] : 0); const double di = (0 != real_tst ? (ri < ti ? 
(ti - ri) : (ri - ti)) : 0); const double rd = ri - info->avg_ref, td = ti - info->avg_tst; const double ra = LIBXSMM_ABS(ri), ta = LIBXSMM_ABS(ti); /* variance of reference set with Kahan compensation */ double v0 = rd * rd - compr_var, v1 = info->var_ref + v0; compr_var = (v1 - info->var_ref) - v0; info->var_ref = v1; /* variance of test set with Kahan compensation */ v0 = td * td - compt_var; v1 = info->var_tst + v0; compt_var = (v1 - info->var_tst) - v0; info->var_tst = v1; /* column-wise sum of reference values with Kahan compensation */ v0 = ra - compri; v1 = normri + v0; compri = (v1 - normri) - v0; normri = v1; /* column-wise sum of test values with Kahan compensation */ v0 = ta - compti; v1 = normti + v0; compti = (v1 - normti) - v0; normti = v1; /* column-wise sum of differences with Kahan compensation */ v0 = di - comp1; v1 = norm1 + v0; comp1 = (v1 - norm1) - v0; norm1 = v1; } /* calculate One-norm of differences */ if (info->norm1_abs < norm1) info->norm1_abs = norm1; /* calculate One-norm of reference/test values */ if (normrc < normri) normrc = normri; if (normtc < normti) normtc = normti; } /* One-norm relative to reference */ if (0 < normrc) { info->norm1_rel = info->norm1_abs / normrc; } else if (0 < normtc) { /* relative to test */ info->norm1_rel = info->norm1_abs / normtc; } else { /* should not happen */ info->norm1_rel = 0; } if (0 != size) { /* final variance */ info->var_ref /= size; info->var_tst /= size; } } libxsmm-1.17/src/template/libxsmm_spmdm_compute_bfloat16_thread.tpl.c000066400000000000000000001033241415223013700261160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. 
* * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Nadathur Satish (Intel Corp.) ******************************************************************************/ const int m_blocks = handle->mb; /*const int n_blocks = handle->nb;*/ const int k_blocks = handle->kb; const int m_block_size = handle->bm; const int n_block_size = handle->bn; const int k_block_size = handle->bk; int mb = block_id / handle->nb; int nb = block_id % handle->nb; #define LIBXSMM_SPMDM_COMPUTE_NREGS (6) int m_overall_start = mb*m_block_size; int m_overall_end = (mb + 1)*m_block_size; int num_m; int num_m_aligned; int n_overall_start = nb*n_block_size; int n_overall_end = (nb + 1)*n_block_size; int num_n; int m, n, k, kb; int last_block_n, num_full_regs, last_n_start; int k_overall_start, k_overall_end, num_k; float *const scratch_C = (float *)(handle->base_ptr_scratch_B_scratch_C + (size_t)tid*handle->memory_for_scratch_per_thread); float *const scratch_B = (float *)(handle->base_ptr_scratch_B_scratch_C + (size_t)tid*handle->memory_for_scratch_per_thread + (size_t)m_block_size*n_block_size*sizeof(float)); #if 0 float *const scratch_C = (float *)(handle->spmdm_scratch_C + tid*m_block_size*n_block_size*sizeof(float)); float *const scratch_B = (float *)(handle->spmdm_scratch_B + tid*k_block_size*n_block_size*sizeof(float)); #endif SIMDTYPE_FP32 sum[2*LIBXSMM_SPMDM_COMPUTE_NREGS]; float* LIBXSMM_RESTRICT ptr_result; #if SIMD_WIDTH_FP32 > 1 SIMDTYPE_INT32 vzero = _MM_SETZERO_INT32(); #endif LIBXSMM_UNUSED(nthreads); LIBXSMM_UNUSED(transa); LIBXSMM_UNUSED(alpha); LIBXSMM_UNUSED(beta); LIBXSMM_UNUSED(tid); /* really is twice this */ assert(n_block_size == LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32); if (m_overall_end > handle->m) m_overall_end = handle->m; num_m = (m_overall_end - m_overall_start); num_m_aligned = (num_m / 2) * 2; if (n_overall_end > handle->n) 
n_overall_end = handle->n; num_n = (n_overall_end - n_overall_start); last_block_n = (num_n != n_block_size); num_full_regs = (num_n / SIMD_WIDTH_FP32); if ((num_full_regs > 0) && (num_full_regs%2)) num_full_regs--; last_n_start = num_full_regs*SIMD_WIDTH_FP32; /* Copy in c matrix to buffer */ ptr_result = c + (size_t)m_overall_start*handle->n + n_overall_start; if (LIBXSMM_FEQ(0.f, *beta)) { if (!last_block_n) { for (m = 0; m < num_m; m++) { _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); } } else { for (m = 0; m < num_m; m++) { for (n = 0; n < num_full_regs; n += 2) { _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n)*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); } for (n = last_n_start; n < num_n; n++) { scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = 0; } } } } else if (LIBXSMM_FEQ(1.f, *beta)) { if ('T' == transc || 't' == transc) { int num_m_simd = num_m / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int m2; ptr_result = c + (size_t)n_overall_start*handle->m + m_overall_start; for (m = 0; m < num_m_simd; m += SIMD_WIDTH_FP32) { for (n = 0; n < 
num_n_simd; n += SIMD_WIDTH_FP32) { TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_result + (size_t)n*handle->m + m, handle->m, scratch_C + (size_t)m*n_block_size + n, n_block_size); } /* Transpose a SIMD_WIDTH_FP32 * (num_n - num_n_simd) block of output space - input is of size (num_n - num_n_simd) * SIMD_WIDTH_FP32 */ for (m2 = m; m2 < m + SIMD_WIDTH_FP32; m2++) { for (n = num_n_simd; n < num_n; n++) { scratch_C[m2*n_block_size + n] = ptr_result[n*handle->m + m2]; } } } /* Transpose a (num_m - num_m_simd) * num_n block of output space - input is of size num_n * (num_m - num_m_simd) */ for (m = num_m_simd; m < num_m; m++) { for (n = 0; n < num_n; n++) { scratch_C[m*n_block_size + n] = ptr_result[n*handle->m + m]; } } } else { if (!last_block_n) { for (m = 0; m < num_m; m++) { _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 0*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 1*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 2*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 3*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 4*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 5*SIMD_WIDTH_FP32)); } } else { for (m = 0; m < num_m; m++) { for (n = 0; n < num_full_regs; n += 2) { _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) 
*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + ((size_t)n) *SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + ((size_t)n+1)*SIMD_WIDTH_FP32)); } for (n = last_n_start; n < num_n; n++) { scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32+n] = ptr_result[m*handle->n+n]; } } } } } else { SIMDTYPE_FP32 beta_v = _MM_SET1_FP32(*beta); if ('T' == transc || 't' == transc) { int num_m_simd = num_m / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int m2; ptr_result = c + (size_t)n_overall_start*handle->m + m_overall_start; for (m = 0; m < num_m_simd; m += SIMD_WIDTH_FP32) { for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_result + (size_t)n*handle->m + m, handle->m, scratch_C + (size_t)m*n_block_size + n, n_block_size); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*1, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*1))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*2, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*2))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*3, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*3))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*4, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*4))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*5, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + 
(size_t)n_block_size*5))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*6, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*6))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*7, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*7))); } /* Transpose a SIMD_WIDTH_FP32 * (num_n - num_n_simd) block of output space - input is of size (num_n - num_n_simd) * SIMD_WIDTH_FP32 */ for (m2 = m; m2 < m + SIMD_WIDTH_FP32; m2++) { for (n = num_n_simd; n < num_n; n++) { scratch_C[m2*n_block_size + n] = (*beta)*ptr_result[n*handle->m + m2]; } } } /* Transpose a (num_m - num_m_simd) * num_n block of output space - input is of size num_n * (num_m - num_m_simd) */ for (m = num_m_simd; m < num_m; m++) { for (n = 0; n < num_n; n++) { scratch_C[m*n_block_size + n] = (*beta)*ptr_result[n*handle->m + m]; } } } else { if (!last_block_n) { for (m = 0; m < num_m; m++) { _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 0*SIMD_WIDTH_FP32))); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 1*SIMD_WIDTH_FP32))); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 2*SIMD_WIDTH_FP32))); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 3*SIMD_WIDTH_FP32))); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 4*SIMD_WIDTH_FP32))); 
_MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 5*SIMD_WIDTH_FP32))); } } else { for (m = 0; m < num_m; m++) { for (n = 0; n < num_full_regs; n += 2) { _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + ((size_t)n) *SIMD_WIDTH_FP32))); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + ((size_t)n+1)*SIMD_WIDTH_FP32))); } for (n = last_n_start; n < num_n; n++) { scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = (*beta)*ptr_result[m*handle->n + n]; } } } } } for (kb = 0; kb < k_blocks; kb++) { const uint16_t* LIBXSMM_RESTRICT ptr_dense; float * LIBXSMM_RESTRICT scratch_C_base; const float * LIBXSMM_RESTRICT scratch_B_base; int block_A = kb * m_blocks + mb; libxsmm_CSR_sparseslice slice = a_sparse[block_A]; int m_local = 0; k_overall_start = kb*k_block_size; k_overall_end = (kb+1)*k_block_size; num_k = (k_overall_end - k_overall_start); /* Copy in b matrix */ if ('T' == transb || 't' == transb) { int num_k_simd = num_k / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int k2; ptr_dense = b + (size_t)n_overall_start*handle->k + k_overall_start; for (k = 0; k < num_k_simd; k += SIMD_WIDTH_FP32) { for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_dense + (size_t)n*handle->k + k, handle->k, scratch_B + (size_t)k*n_block_size + n, n_block_size); } /* Transpose a SIMD_WIDTH_FP32 * (num_n - num_n_simd) block of output space - input is of size (num_n - num_n_simd) * SIMD_WIDTH_FP32 */ for (k2 = k; k2 < k + SIMD_WIDTH_FP32; k2++) { for (n = num_n_simd; n < num_n; n++) { uint16_t restmp = 
ptr_dense[n*handle->k + k2]; union { int i; float f; } res; res.i = restmp; res.i <<= 16; scratch_B[k2*n_block_size + n] = res.f; } } } /* Transpose a (num_m - num_m_simd) * num_n block of output space - input is of size num_n * (num_m - num_m_simd) */ for (k = num_k_simd; k < num_k; k++) { for (n = 0; n < num_n; n++) { uint16_t restmp = ptr_dense[n*handle->k + k]; union { int i; float f; } res; res.i = restmp; res.i <<= 16; scratch_B[k*n_block_size + n] = res.f; } } } else { ptr_dense = b + (size_t)k_overall_start*handle->n + n_overall_start; if (!last_block_n) { for (k = 0; k < num_k; k++) { SIMDTYPE_INT32 vload_0 = _MM_LOADU_INT32((const SIMDTYPE_INT32*)(ptr_dense + (size_t)k*handle->n + 2*0*SIMD_WIDTH_FP32)); SIMDTYPE_INT32 vload_1, vload_2; SIMDTYPE_FP32 v1_0, v2_0; SIMDTYPE_FP32 v1_1, v2_1; SIMDTYPE_FP32 v1_2, v2_2; EXPAND_BFLOAT16(vload_0, v1_0, v2_0); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*0*SIMD_WIDTH_FP32, v1_0); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + (2*0+1)*SIMD_WIDTH_FP32, v2_0); vload_1 = _MM_LOADU_INT32((const SIMDTYPE_INT32 *)(ptr_dense + (size_t)k*handle->n + 2*1*SIMD_WIDTH_FP32)); EXPAND_BFLOAT16(vload_1, v1_1, v2_1); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*1*SIMD_WIDTH_FP32, v1_1); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + (2*1+1)*SIMD_WIDTH_FP32, v2_1); vload_2 = _MM_LOADU_INT32((const SIMDTYPE_INT32 *)(ptr_dense + (size_t)k*handle->n + 2*2*SIMD_WIDTH_FP32)); EXPAND_BFLOAT16(vload_2, v1_2, v2_2); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*2*SIMD_WIDTH_FP32, v1_2); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + (2*2+1)*SIMD_WIDTH_FP32, v2_2); } } else { for (k = 0; k < num_k; k++) { for (n = 0; n < num_full_regs; n += 2) { SIMDTYPE_INT32 vload_0 = _MM_LOADU_INT32((const SIMDTYPE_INT32*)(ptr_dense 
+ (size_t)k*handle->n + (size_t)n*SIMD_WIDTH_FP32)); SIMDTYPE_FP32 v1_0, v2_0; EXPAND_BFLOAT16(vload_0, v1_0, v2_0); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, v1_0); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, v2_0); } for (n = last_n_start; n < num_n; n++) { uint16_t restmp = ptr_dense[k*handle->n + n]; union { int i; float f; } res; res.i = restmp; res.i <<= 16; { scratch_B[k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = res.f; } } } } } scratch_C_base = scratch_C - (size_t)m_overall_start*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; scratch_B_base = scratch_B; /* - (size_t)k_overall_start*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; */ for (m = m_overall_start; m < m_overall_start + num_m_aligned; m += 2, m_local += 2) { int start_j, end_j, end_j_2, num_j, num_j_2; const uint16_t *LIBXSMM_RESTRICT sp_c_ptr_base; const uint16_t *LIBXSMM_RESTRICT sp_c_ptr_base_2; const float *LIBXSMM_RESTRICT sp_v_ptr_base; const float *LIBXSMM_RESTRICT sp_v_ptr_base_2; float *const LIBXSMM_RESTRICT result_m_index = scratch_C_base + ((size_t)m) *LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; float *const LIBXSMM_RESTRICT result_m_index_2 = scratch_C_base + ((size_t)m+1)*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; if (m_local >= m_block_size) { block_A++; slice = a_sparse[block_A]; m_local = 0; } start_j = slice.rowidx[m_local]; end_j = slice.rowidx[m_local + 1]; end_j_2 = slice.rowidx[m_local + 2]; num_j = (end_j - start_j); num_j_2 = (end_j_2 - end_j); sp_c_ptr_base = slice.colidx + start_j; sp_c_ptr_base_2 = slice.colidx + end_j; sp_v_ptr_base = (float *)(slice.values) + start_j; sp_v_ptr_base_2 = (float *)(slice.values) + end_j; if (!last_block_n) { int64_t j = 0, j2 = 0; sum[0] = _MM_LOAD_FP32(result_m_index + 0*SIMD_WIDTH_FP32); sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 0*SIMD_WIDTH_FP32); sum[1] = 
_MM_LOAD_FP32(result_m_index + 1*SIMD_WIDTH_FP32); sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 1*SIMD_WIDTH_FP32); sum[2] = _MM_LOAD_FP32(result_m_index + 2*SIMD_WIDTH_FP32); sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 2*SIMD_WIDTH_FP32); sum[3] = _MM_LOAD_FP32(result_m_index + 3*SIMD_WIDTH_FP32); sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 3*SIMD_WIDTH_FP32); sum[4] = _MM_LOAD_FP32(result_m_index + 4*SIMD_WIDTH_FP32); sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 4*SIMD_WIDTH_FP32); sum[5] = _MM_LOAD_FP32(result_m_index + 5*SIMD_WIDTH_FP32); sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 5*SIMD_WIDTH_FP32); for (; j < num_j && j2 < num_j_2; j++, j2++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j] *LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); sum[0] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 0*SIMD_WIDTH_FP32), sum[0]); sum[0 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 0*SIMD_WIDTH_FP32), sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 1*SIMD_WIDTH_FP32), sum[1]); sum[1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 1*SIMD_WIDTH_FP32), sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[2] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 2*SIMD_WIDTH_FP32), sum[2]); sum[2 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 2*SIMD_WIDTH_FP32), sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[3] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index 
+ 3*SIMD_WIDTH_FP32), sum[3]); sum[3 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 3*SIMD_WIDTH_FP32), sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[4] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 4*SIMD_WIDTH_FP32), sum[4]); sum[4 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 4*SIMD_WIDTH_FP32), sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[5] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 5*SIMD_WIDTH_FP32), sum[5]); sum[5 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 5*SIMD_WIDTH_FP32), sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS]); } for (; j < num_j; j++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); sum[0] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 0*SIMD_WIDTH_FP32), sum[0]); sum[1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 1*SIMD_WIDTH_FP32), sum[1]); sum[2] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 2*SIMD_WIDTH_FP32), sum[2]); sum[3] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 3*SIMD_WIDTH_FP32), sum[3]); sum[4] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 4*SIMD_WIDTH_FP32), sum[4]); sum[5] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 5*SIMD_WIDTH_FP32), sum[5]); } for (; j2 < num_j_2; j2++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); sum[0 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 0*SIMD_WIDTH_FP32), sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 1*SIMD_WIDTH_FP32), 
sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[2 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 2*SIMD_WIDTH_FP32), sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[3 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 3*SIMD_WIDTH_FP32), sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[4 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 4*SIMD_WIDTH_FP32), sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[5 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 5*SIMD_WIDTH_FP32), sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS]); } _MM_STORE_FP32(result_m_index + 0*SIMD_WIDTH_FP32, sum[0]); _MM_STORE_FP32(result_m_index_2 + 0*SIMD_WIDTH_FP32, sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS]); _MM_STORE_FP32(result_m_index + 1*SIMD_WIDTH_FP32, sum[1]); _MM_STORE_FP32(result_m_index_2 + 1*SIMD_WIDTH_FP32, sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS]); _MM_STORE_FP32(result_m_index + 2*SIMD_WIDTH_FP32, sum[2]); _MM_STORE_FP32(result_m_index_2 + 2*SIMD_WIDTH_FP32, sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS]); _MM_STORE_FP32(result_m_index + 3*SIMD_WIDTH_FP32, sum[3]); _MM_STORE_FP32(result_m_index_2 + 3*SIMD_WIDTH_FP32, sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS]); _MM_STORE_FP32(result_m_index + 4*SIMD_WIDTH_FP32, sum[4]); _MM_STORE_FP32(result_m_index_2 + 4*SIMD_WIDTH_FP32, sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS]); _MM_STORE_FP32(result_m_index + 5*SIMD_WIDTH_FP32, sum[5]); _MM_STORE_FP32(result_m_index_2 + 5*SIMD_WIDTH_FP32, sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS]); } else { int64_t j = 0, j2 = 0; for (n = 0; n < num_full_regs; n += 2) { sum[n] = _MM_SETZERO_FP32(); sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_SETZERO_FP32(); sum[n+1] = _MM_SETZERO_FP32(); sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_SETZERO_FP32(); } for (; j < num_j && j2 < num_j_2; j++, j2++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j] 
*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); for (n = 0; n < num_full_regs; n += 2) { sum[n] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + (size_t)n*SIMD_WIDTH_FP32), sum[n]); sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + (size_t)n*SIMD_WIDTH_FP32), sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[n+1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1]); sum[n+1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS]); } { float v_v_f = sp_v_ptr_base[j]; float v_v_f_2 = sp_v_ptr_base_2[j2]; for (n = last_n_start; n < num_n; n++) { result_m_index[n] += sp_col_dense_index[n]*v_v_f; result_m_index_2[n] += sp_col_dense_index_2[n]*v_v_f_2; } } } for (; j < num_j; j++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); for (n = 0; n < num_full_regs; n += 2) { sum[n] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n) *SIMD_WIDTH_FP32), sum[n]); sum[n+1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1]); } { float v_v_f = sp_v_ptr_base[j]; for (n = last_n_start; n < num_n; n++) { result_m_index[n] += sp_col_dense_index[n]*v_v_f; } } } for (; j2 < num_j_2; j2++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); for (n = 0; n < num_full_regs; n += 2) 
{ sum[n + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + ((size_t)n) *SIMD_WIDTH_FP32), sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[n+1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS]); } { float v_v_f_2 = sp_v_ptr_base_2[j2]; for (n = last_n_start; n < num_n; n++) { result_m_index_2[n] += sp_col_dense_index_2[n]*v_v_f_2; } } } for (n = 0; n < num_full_regs; n += 2) { _MM_STORE_FP32(result_m_index + ((size_t)n) *SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n], _MM_LOAD_FP32(result_m_index + (size_t)n*SIMD_WIDTH_FP32))); _MM_STORE_FP32(result_m_index_2 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS], _MM_LOAD_FP32(result_m_index_2 + (size_t)n*SIMD_WIDTH_FP32))); _MM_STORE_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+1], _MM_LOAD_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32))); _MM_STORE_FP32(result_m_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS], _MM_LOAD_FP32(result_m_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32))); } } } for (m = m_overall_start + num_m_aligned; m < m_overall_end; m++, m_local++) { int start_j, end_j, num_j; const uint16_t* LIBXSMM_RESTRICT sp_c_ptr_base; const float* LIBXSMM_RESTRICT sp_v_ptr_base; float* LIBXSMM_RESTRICT result_m_index; if (m_local >= m_block_size) { block_A++; slice = a_sparse[block_A]; m_local = 0; } start_j = slice.rowidx[m_local]; end_j = slice.rowidx[m_local + 1]; num_j = (end_j - start_j); sp_c_ptr_base = slice.colidx + start_j; sp_v_ptr_base = slice.values + start_j; result_m_index = scratch_C_base + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; if (!last_block_n) { int64_t j = 0; sum[0] = _MM_LOAD_FP32(result_m_index + 0*SIMD_WIDTH_FP32); sum[1] = _MM_LOAD_FP32(result_m_index + 1*SIMD_WIDTH_FP32); sum[2] = _MM_LOAD_FP32(result_m_index + 2*SIMD_WIDTH_FP32); sum[3] = 
_MM_LOAD_FP32(result_m_index + 3*SIMD_WIDTH_FP32); sum[4] = _MM_LOAD_FP32(result_m_index + 4*SIMD_WIDTH_FP32); sum[5] = _MM_LOAD_FP32(result_m_index + 5*SIMD_WIDTH_FP32); for (; j < num_j; j++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); sum[0] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 0*SIMD_WIDTH_FP32), sum[0]); sum[1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 1*SIMD_WIDTH_FP32), sum[1]); sum[2] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 2*SIMD_WIDTH_FP32), sum[2]); sum[3] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 3*SIMD_WIDTH_FP32), sum[3]); sum[4] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 4*SIMD_WIDTH_FP32), sum[4]); sum[5] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 5*SIMD_WIDTH_FP32), sum[5]); } _MM_STORE_FP32(result_m_index + 0*SIMD_WIDTH_FP32, sum[0]); _MM_STORE_FP32(result_m_index + 1*SIMD_WIDTH_FP32, sum[1]); _MM_STORE_FP32(result_m_index + 2*SIMD_WIDTH_FP32, sum[2]); _MM_STORE_FP32(result_m_index + 3*SIMD_WIDTH_FP32, sum[3]); _MM_STORE_FP32(result_m_index + 4*SIMD_WIDTH_FP32, sum[4]); _MM_STORE_FP32(result_m_index + 5*SIMD_WIDTH_FP32, sum[5]); } else { int64_t j = 0; for (n = 0; n < num_full_regs; n += 2) { sum[n] = _MM_SETZERO_FP32(); sum[n+1] = _MM_SETZERO_FP32(); } for (; j < num_j; j++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); for (n = 0; n < num_full_regs; n += 2) { sum[n] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n) *SIMD_WIDTH_FP32), sum[n]); sum[n+1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1]); } { float v_v_f = sp_v_ptr_base[j]; for (n = last_n_start; n < num_n; n++) { 
result_m_index[n] += sp_col_dense_index[n]*v_v_f; } } } for (n = 0; n < num_full_regs; n += 2) { _MM_STORE_FP32(result_m_index + ((size_t)n) *SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n], _MM_LOAD_FP32(result_m_index + ((size_t)n) *SIMD_WIDTH_FP32))); _MM_STORE_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+1], _MM_LOAD_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32))); } } } } /* kb */ /* Copy out c matrix */ if ('T' == transc || 't' == transc) { int num_m_simd = num_m / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int n2; ptr_result = c + (size_t)n_overall_start*handle->m + m_overall_start; for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { for (m = 0; m < num_m_simd; m += SIMD_WIDTH_FP32) { TRANSPOSE_SIMD_WIDTH_KERNEL(scratch_C + (size_t)m*n_block_size + n, n_block_size, ptr_result + (size_t)n*handle->m + m, handle->m); } /* Transpose a SIMD_WIDTH_FP32 * (num_m - num_m_simd) block of output space - input is of size (num_m - num_m_simd) * SIMD_WIDTH_FP32 */ for (n2 = n; n2 < n + SIMD_WIDTH_FP32; n2++) { for (m = num_m_simd; m < num_m; m++) { ptr_result[n2*handle->m + m] = scratch_C[m*n_block_size + n2]; } } } /* Transpose a (num_n - num_n_simd) * num_m block of output space - input is of size num_m * (num_n - num_n_simd) */ for (n = num_n_simd; n < num_n; n++) { for (m = 0; m < num_m; m++) { ptr_result[n*handle->m + m] = scratch_C[m*n_block_size + n]; } } } else { if (!last_block_n) { for (m = 0; m < num_m; m++) { _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + 0*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32)); _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + 1*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32)); _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + 2*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + 
(size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32)); _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + 3*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32)); _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + 4*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32)); _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + 5*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32)); } } else { for (m = 0; m < num_m; m++) { for (n = 0; n < num_full_regs; n += 2) { _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + ((size_t)n) *SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32)); _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32)); } for (n = last_n_start; n < num_n; n++) { ptr_result[m*handle->n + n] = scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n]; } } } } #undef LIBXSMM_SPMDM_COMPUTE_NREGS libxsmm-1.17/src/template/libxsmm_spmdm_compute_fp32_thread.tpl.c000066400000000000000000001020221415223013700252440ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Nadathur Satish (Intel Corp.) 
******************************************************************************/ const int m_blocks = handle->mb; /* const int n_blocks = handle->nb; */ const int k_blocks = handle->kb; const int m_block_size = handle->bm; const int n_block_size = handle->bn; const int k_block_size = handle->bk; const int handle_m = handle->m; const int handle_n = handle->n; int mb = block_id / handle->nb; int nb = block_id % handle->nb; #define LIBXSMM_SPMDM_COMPUTE_NREGS (6) int m_overall_start = mb*m_block_size; int m_overall_end = (mb + 1)*m_block_size; int num_m; int num_m_aligned; int n_overall_start = nb*n_block_size; int n_overall_end = (nb + 1)*n_block_size; int num_n; int m, n, k, kb; int last_block_n, num_full_regs, last_n_start; int k_overall_start, k_overall_end, num_k; float *const scratch_C = (float*)(handle->base_ptr_scratch_B_scratch_C + (size_t)tid*handle->memory_for_scratch_per_thread); float *const scratch_B = (float*)(handle->base_ptr_scratch_B_scratch_C + (size_t)tid*handle->memory_for_scratch_per_thread + (size_t)m_block_size*n_block_size*sizeof(float)); float* LIBXSMM_RESTRICT ptr_result; LIBXSMM_UNUSED(nthreads); LIBXSMM_UNUSED(transa); LIBXSMM_UNUSED(alpha); LIBXSMM_UNUSED(beta); LIBXSMM_UNUSED(tid); /* really is twice this */ assert(n_block_size == LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32); if (m_overall_end > handle_m) m_overall_end = handle_m; num_m = (m_overall_end - m_overall_start); num_m_aligned = (num_m / 2) * 2; if (n_overall_end > handle_n) n_overall_end = handle_n; num_n = (n_overall_end - n_overall_start); last_block_n = (num_n != n_block_size); num_full_regs = (num_n / SIMD_WIDTH_FP32); if ((num_full_regs > 0) && (num_full_regs%2)) num_full_regs--; last_n_start = num_full_regs*SIMD_WIDTH_FP32; /* Copy in c matrix to buffer*/ ptr_result = c + (size_t)m_overall_start*handle_n + n_overall_start; if (LIBXSMM_FEQ(0.f, *beta)) { if (!last_block_n) { for (m = 0; m < num_m; m++) { _MM_STORE_FP32(scratch_C + 
(size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); } } else { for (m = 0; m < num_m; m++) { for (n = 0; n < num_full_regs; n += 2) { _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); } for (n = last_n_start; n < num_n; n++) { scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = 0; } } } } else if (LIBXSMM_FEQ(1.f, *beta)) { if ('T' == transc || 't' == transc) { int num_m_simd = num_m / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int m2; ptr_result = c + (size_t)n_overall_start*handle_m + m_overall_start; for (m = 0; m < num_m_simd; m += SIMD_WIDTH_FP32) { for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_result + (size_t)n*handle_m + m, handle_m, scratch_C + (size_t)m*n_block_size + n, n_block_size); } /* Transpose a SIMD_WIDTH_FP32 * (num_n - num_n_simd) block of output space - input is of size (num_n - num_n_simd) * SIMD_WIDTH_FP32 */ for (m2 = m; m2 < m + SIMD_WIDTH_FP32; m2++) { for (n = num_n_simd; n < num_n; n++) { scratch_C[m2*n_block_size + n] = ptr_result[n*handle_m + m2]; } } } /* Transpose a (num_m 
- num_m_simd) * num_n block of output space - input is of size num_n * (num_m - num_m_simd) */ for (m = num_m_simd; m < num_m; m++) { for (n = 0; n < num_n; n++) { scratch_C[m*n_block_size + n] = ptr_result[n*handle_m + m]; } } } else { if (!last_block_n) { for (m = 0; m < num_m; m++) { _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 0*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 1*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 2*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 3*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 4*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 5*SIMD_WIDTH_FP32)); } } else { for (m = 0; m < num_m; m++) { for (n = 0; n < num_full_regs; n += 2) { _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + ((size_t)n) *SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + ((size_t)n+1)*SIMD_WIDTH_FP32)); } for (n = last_n_start; n < num_n; n++) { scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = ptr_result[m*handle_n + n]; } } } } } else { SIMDTYPE_FP32 beta_v = _MM_SET1_FP32(*beta); if ('T' == 
transc || 't' == transc) { int num_m_simd = num_m / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int m2; ptr_result = c + (size_t)n_overall_start*handle_m + m_overall_start; for (m = 0; m < num_m_simd; m += SIMD_WIDTH_FP32) { for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_result + (size_t)n*handle_m + m, handle_m, scratch_C + (size_t)m*n_block_size + n, n_block_size); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*1, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*1))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*2, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*2))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*3, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*3))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*4, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*4))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*5, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*5))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*6, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*6))); _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*7, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*7))); } /* Transpose a SIMD_WIDTH_FP32 * (num_n - num_n_simd) block of output space - input is of size (num_n - num_n_simd) * 
SIMD_WIDTH_FP32 */ for (m2 = m; m2 < m + SIMD_WIDTH_FP32; m2++) { for (n = num_n_simd; n < num_n; n++) { scratch_C[m2*n_block_size + n] = (*beta)*ptr_result[n*handle_m + m2]; } } } /* Transpose a (num_m - num_m_simd) * num_n block of output space - input is of size num_n * (num_m - num_m_simd) */ for (m = num_m_simd; m < num_m; m++) { for (n = 0; n < num_n; n++) { scratch_C[m*n_block_size + n] = (*beta)*ptr_result[n*handle_m + m]; } } } else { if (!last_block_n) { for (m = 0; m < num_m; m++) { _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 0*SIMD_WIDTH_FP32))); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 1*SIMD_WIDTH_FP32))); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 2*SIMD_WIDTH_FP32))); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 3*SIMD_WIDTH_FP32))); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 4*SIMD_WIDTH_FP32))); _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 5*SIMD_WIDTH_FP32))); } } else { for (m = 0; m < num_m; m++) { for (n = 0; n < num_full_regs; n += 2) { _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + ((size_t)n) *SIMD_WIDTH_FP32))); _MM_STORE_FP32(scratch_C + 
(size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + ((size_t)n+1)*SIMD_WIDTH_FP32))); } for (n = last_n_start; n < num_n; n++) { scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = (*beta)*ptr_result[m*handle_n + n]; } } } } } for (kb = 0; kb < k_blocks; kb++) { const float * LIBXSMM_RESTRICT ptr_dense; float * LIBXSMM_RESTRICT scratch_C_base; const float * LIBXSMM_RESTRICT scratch_B_base; int block_A = kb * m_blocks + mb; libxsmm_CSR_sparseslice slice = a_sparse[block_A]; int m_local = 0; k_overall_start = kb*k_block_size; k_overall_end = (kb+1)*k_block_size; if (k_overall_end > handle->k) k_overall_end = handle->k; num_k = (k_overall_end - k_overall_start); /* Copy in b matrix*/ if ('T' == transb || 't' == transb) { int num_k_simd = num_k / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int k2; ptr_dense = b + (size_t)n_overall_start*handle->k + k_overall_start; for (k = 0; k < num_k_simd; k += SIMD_WIDTH_FP32) { for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_dense + (size_t)n*handle->k + k, handle->k, scratch_B + (size_t)k*n_block_size + n, n_block_size); } /* Transpose a SIMD_WIDTH_FP32 * (num_n - num_n_simd) block of output space - input is of size (num_n - num_n_simd) * SIMD_WIDTH_FP32 */ for (k2 = k; k2 < k + SIMD_WIDTH_FP32; k2++) { for (n = num_n_simd; n < num_n; n++) { scratch_B[k2*n_block_size + n] = ptr_dense[n*handle->k + k2]; } } } /* Transpose a (num_m - num_m_simd) * num_n block of output space - input is of size num_n * (num_m - num_m_simd) */ for (k = num_k_simd; k < num_k; k++) { for (n = 0; n < num_n; n++) { scratch_B[k*n_block_size + n] = ptr_dense[n*handle->k + k]; } } } else { ptr_dense = b + (size_t)k_overall_start*handle_n + n_overall_start; if (!last_block_n) { for (k = 0; k < num_k; k++) { _MM_STORE_FP32(scratch_B + 
(size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + 0*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + 1*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + 2*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + 3*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + 4*SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + 5*SIMD_WIDTH_FP32)); } } else { for (k = 0; k < num_k; k++) { for (n = 0; n < num_full_regs; n += 2) { _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + ((size_t)n) *SIMD_WIDTH_FP32)); _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + ((size_t)n+1)*SIMD_WIDTH_FP32)); } for (n = last_n_start; n < num_n; n++) { scratch_B[k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = ptr_dense[k*handle_n + n]; } } } } scratch_C_base = scratch_C - (size_t)m_overall_start*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; scratch_B_base = scratch_B; /* - (size_t)k_overall_start*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32;*/ for (m = m_overall_start; m < m_overall_start + num_m_aligned; m += 2, m_local += 2) { int start_j, end_j, end_j_2, num_j, num_j_2; const uint16_t *LIBXSMM_RESTRICT sp_c_ptr_base; const 
uint16_t *LIBXSMM_RESTRICT sp_c_ptr_base_2; const float *LIBXSMM_RESTRICT sp_v_ptr_base; const float *LIBXSMM_RESTRICT sp_v_ptr_base_2; float *LIBXSMM_RESTRICT result_m_index; float *LIBXSMM_RESTRICT result_m_index_2; const uint16_t* rowidx; if (m_local >= m_block_size) { block_A++; slice = a_sparse[block_A]; m_local = 0; } rowidx = slice.rowidx; start_j = rowidx[m_local]; end_j = rowidx[m_local+1]; end_j_2 = rowidx[m_local+2]; num_j = (end_j - start_j); num_j_2 = (end_j_2 - end_j); sp_c_ptr_base = slice.colidx + start_j; sp_c_ptr_base_2 = slice.colidx + end_j; sp_v_ptr_base = (float *)(slice.values) + start_j; sp_v_ptr_base_2 = (float *)(slice.values) + end_j; result_m_index = scratch_C_base + ((size_t)m) *LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; result_m_index_2 = scratch_C_base + ((size_t)m+1)*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; if (!last_block_n) { int64_t j = 0, j2 = 0; SIMDTYPE_FP32 sum[2*LIBXSMM_SPMDM_COMPUTE_NREGS]; sum[0] = _MM_LOAD_FP32(result_m_index + 0*SIMD_WIDTH_FP32); sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 0*SIMD_WIDTH_FP32); sum[1] = _MM_LOAD_FP32(result_m_index + 1*SIMD_WIDTH_FP32); sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 1*SIMD_WIDTH_FP32); sum[2] = _MM_LOAD_FP32(result_m_index + 2*SIMD_WIDTH_FP32); sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 2*SIMD_WIDTH_FP32); sum[3] = _MM_LOAD_FP32(result_m_index + 3*SIMD_WIDTH_FP32); sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 3*SIMD_WIDTH_FP32); sum[4] = _MM_LOAD_FP32(result_m_index + 4*SIMD_WIDTH_FP32); sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 4*SIMD_WIDTH_FP32); sum[5] = _MM_LOAD_FP32(result_m_index + 5*SIMD_WIDTH_FP32); sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 5*SIMD_WIDTH_FP32); for (; j < num_j && j2 < num_j_2; j++, j2++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + 
(size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); sum[0] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 0*SIMD_WIDTH_FP32), sum[0]); sum[0 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 0*SIMD_WIDTH_FP32), sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 1*SIMD_WIDTH_FP32), sum[1]); sum[1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 1*SIMD_WIDTH_FP32), sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[2] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 2*SIMD_WIDTH_FP32), sum[2]); sum[2 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 2*SIMD_WIDTH_FP32), sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[3] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 3*SIMD_WIDTH_FP32), sum[3]); sum[3 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 3*SIMD_WIDTH_FP32), sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[4] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 4*SIMD_WIDTH_FP32), sum[4]); sum[4 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 4*SIMD_WIDTH_FP32), sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[5] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 5*SIMD_WIDTH_FP32), sum[5]); sum[5 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 5*SIMD_WIDTH_FP32), sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS]); } for (; j < num_j; j++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; 
SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); sum[0] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 0*SIMD_WIDTH_FP32), sum[0]); sum[1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 1*SIMD_WIDTH_FP32), sum[1]); sum[2] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 2*SIMD_WIDTH_FP32), sum[2]); sum[3] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 3*SIMD_WIDTH_FP32), sum[3]); sum[4] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 4*SIMD_WIDTH_FP32), sum[4]); sum[5] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 5*SIMD_WIDTH_FP32), sum[5]); } for (; j2 < num_j_2; j2++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); sum[0 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 0*SIMD_WIDTH_FP32), sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 1*SIMD_WIDTH_FP32), sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[2 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 2*SIMD_WIDTH_FP32), sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[3 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 3*SIMD_WIDTH_FP32), sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[4 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 4*SIMD_WIDTH_FP32), sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[5 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 5*SIMD_WIDTH_FP32), sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS]); } _MM_STORE_FP32(result_m_index + 0*SIMD_WIDTH_FP32, sum[0]); _MM_STORE_FP32(result_m_index_2 + 0*SIMD_WIDTH_FP32, sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS]); _MM_STORE_FP32(result_m_index + 1*SIMD_WIDTH_FP32, sum[1]); 
_MM_STORE_FP32(result_m_index_2 + 1*SIMD_WIDTH_FP32, sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS]); _MM_STORE_FP32(result_m_index + 2*SIMD_WIDTH_FP32, sum[2]); _MM_STORE_FP32(result_m_index_2 + 2*SIMD_WIDTH_FP32, sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS]); _MM_STORE_FP32(result_m_index + 3*SIMD_WIDTH_FP32, sum[3]); _MM_STORE_FP32(result_m_index_2 + 3*SIMD_WIDTH_FP32, sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS]); _MM_STORE_FP32(result_m_index + 4*SIMD_WIDTH_FP32, sum[4]); _MM_STORE_FP32(result_m_index_2 + 4*SIMD_WIDTH_FP32, sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS]); _MM_STORE_FP32(result_m_index + 5*SIMD_WIDTH_FP32, sum[5]); _MM_STORE_FP32(result_m_index_2 + 5*SIMD_WIDTH_FP32, sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS]); } else { int64_t j = 0, j2 = 0; SIMDTYPE_FP32 sum[2*LIBXSMM_SPMDM_COMPUTE_NREGS]; for (n = 0; n < num_full_regs; n += 2) { sum[n] = _MM_SETZERO_FP32(); sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_SETZERO_FP32(); sum[n+1] = _MM_SETZERO_FP32(); sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_SETZERO_FP32(); } for (; j < num_j && j2 < num_j_2; j++, j2++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); for (n = 0; n < num_full_regs; n += 2) { sum[n] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + (size_t)n*SIMD_WIDTH_FP32), sum[n]); sum[n + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + (size_t)n*SIMD_WIDTH_FP32), sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[n+1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1]); sum[n+1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32), 
sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS]); } { float v_v_f = sp_v_ptr_base[j]; float v_v_f_2 = sp_v_ptr_base_2[j2]; for (n = last_n_start; n < num_n; n++) { result_m_index[n] += sp_col_dense_index[n]*v_v_f; result_m_index_2[n] += sp_col_dense_index_2[n]*v_v_f_2; } } } for (; j < num_j; j++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); for (n = 0; n < num_full_regs; n += 2) { sum[n] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n) *SIMD_WIDTH_FP32), sum[n]); sum[n+1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1]); } { float v_v_f = sp_v_ptr_base[j]; for (n = last_n_start; n < num_n; n++) { result_m_index[n] += sp_col_dense_index[n]*v_v_f; } } } for (; j2 < num_j_2; j2++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); for (n = 0; n < num_full_regs; n += 2) { sum[n + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + ((size_t)n) *SIMD_WIDTH_FP32), sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS]); sum[n+1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS]); } { float v_v_f_2 = sp_v_ptr_base_2[j2]; for (n = last_n_start; n < num_n; n++) { result_m_index_2[n] += sp_col_dense_index_2[n]*v_v_f_2; } } } for (n = 0; n < num_full_regs; n += 2) { _MM_STORE_FP32(result_m_index + ((size_t)n) *SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n], _MM_LOAD_FP32(result_m_index + (size_t)n*SIMD_WIDTH_FP32))); _MM_STORE_FP32(result_m_index_2 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS], _MM_LOAD_FP32(result_m_index_2 + (size_t)n*SIMD_WIDTH_FP32))); 
_MM_STORE_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+1], _MM_LOAD_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32))); _MM_STORE_FP32(result_m_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS], _MM_LOAD_FP32(result_m_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32))); } } } for (m = m_overall_start + num_m_aligned; m < m_overall_end; m++, m_local++) { int start_j, end_j, num_j; const uint16_t *LIBXSMM_RESTRICT sp_c_ptr_base; const float *LIBXSMM_RESTRICT sp_v_ptr_base; float *LIBXSMM_RESTRICT result_m_index; const uint16_t* rowidx; if (m_local >= m_block_size) { block_A++; slice = a_sparse[block_A]; m_local = 0; } rowidx = slice.rowidx; start_j = rowidx[m_local]; end_j = rowidx[m_local+1]; num_j = (end_j - start_j); sp_c_ptr_base = slice.colidx + start_j; sp_v_ptr_base = slice.values + start_j; result_m_index = scratch_C_base + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; if (!last_block_n) { int64_t j = 0; SIMDTYPE_FP32 sum[2*LIBXSMM_SPMDM_COMPUTE_NREGS]; sum[0] = _MM_LOAD_FP32(result_m_index + 0*SIMD_WIDTH_FP32); sum[1] = _MM_LOAD_FP32(result_m_index + 1*SIMD_WIDTH_FP32); sum[2] = _MM_LOAD_FP32(result_m_index + 2*SIMD_WIDTH_FP32); sum[3] = _MM_LOAD_FP32(result_m_index + 3*SIMD_WIDTH_FP32); sum[4] = _MM_LOAD_FP32(result_m_index + 4*SIMD_WIDTH_FP32); sum[5] = _MM_LOAD_FP32(result_m_index + 5*SIMD_WIDTH_FP32); for (; j < num_j; j++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); sum[0] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 0*SIMD_WIDTH_FP32), sum[0]); sum[1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 1*SIMD_WIDTH_FP32), sum[1]); sum[2] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 2*SIMD_WIDTH_FP32), sum[2]); sum[3] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 3*SIMD_WIDTH_FP32), sum[3]); 
sum[4] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 4*SIMD_WIDTH_FP32), sum[4]); sum[5] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 5*SIMD_WIDTH_FP32), sum[5]); } _MM_STORE_FP32(result_m_index + 0*SIMD_WIDTH_FP32, sum[0]); _MM_STORE_FP32(result_m_index + 1*SIMD_WIDTH_FP32, sum[1]); _MM_STORE_FP32(result_m_index + 2*SIMD_WIDTH_FP32, sum[2]); _MM_STORE_FP32(result_m_index + 3*SIMD_WIDTH_FP32, sum[3]); _MM_STORE_FP32(result_m_index + 4*SIMD_WIDTH_FP32, sum[4]); _MM_STORE_FP32(result_m_index + 5*SIMD_WIDTH_FP32, sum[5]); } else { SIMDTYPE_FP32 sum[2*LIBXSMM_SPMDM_COMPUTE_NREGS]; int64_t j = 0; for (n = 0; n < num_full_regs; n += 2) { sum[n] = _MM_SETZERO_FP32(); sum[n+1] = _MM_SETZERO_FP32(); } for (; j < num_j; j++) { const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); for (n = 0; n < num_full_regs; n += 2) { sum[n] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n) *SIMD_WIDTH_FP32), sum[n]); sum[n+1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1]); } { float v_v_f = sp_v_ptr_base[j]; for (n = last_n_start; n < num_n; n++) { result_m_index[n] += sp_col_dense_index[n]*v_v_f; } } } for (n = 0; n < num_full_regs; n += 2) { _MM_STORE_FP32(result_m_index + ((size_t)n) *SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n], _MM_LOAD_FP32(result_m_index + ((size_t)n) *SIMD_WIDTH_FP32))); _MM_STORE_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+1], _MM_LOAD_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32))); } } } } /* kb */ /* Copy out c matrix */ if ('T' == transc || 't' == transc) { int num_m_simd = num_m / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; int n2; ptr_result = c + (size_t)n_overall_start*handle_m + m_overall_start; for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) 
{ for (m = 0; m < num_m_simd; m += SIMD_WIDTH_FP32) { TRANSPOSE_SIMD_WIDTH_KERNEL(scratch_C + (size_t)m*n_block_size + n, n_block_size, ptr_result + (size_t)n*handle_m + m, handle_m); } /* Transpose a SIMD_WIDTH_FP32 * (num_m - num_m_simd) block of output space - input is of size (num_m - num_m_simd) * SIMD_WIDTH_FP32 */ for (n2 = n; n2 < n + SIMD_WIDTH_FP32; n2++) { for (m = num_m_simd; m < num_m; m++) { ptr_result[n2*handle_m + m] = scratch_C[m*n_block_size + n2]; } } } /* Transpose a (num_n - num_n_simd) * num_m block of output space - input is of size num_m * (num_n - num_n_simd) */ for (n = num_n_simd; n < num_n; n++) { for (m = 0; m < num_m; m++) { ptr_result[n*handle_m + m] = scratch_C[m*n_block_size + n]; } } } else { if (!last_block_n) { for (m = 0; m < num_m; m++) { _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + 0*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32)); _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + 1*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32)); _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + 2*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32)); _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + 3*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32)); _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + 4*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32)); _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + 5*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32)); } } else { for (m = 0; m < num_m; m++) { for (n = 0; n < num_full_regs; n += 2) { _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + ((size_t)n)*SIMD_WIDTH_FP32, 
_MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32)); _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32)); } for (n = last_n_start; n < num_n; n++) { ptr_result[m*handle_n + n] = scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n]; } } } } #undef LIBXSMM_SPMDM_COMPUTE_NREGS libxsmm-1.17/src/template/libxsmm_spmdm_createSparseSlice_bfloat16_thread.tpl.c000066400000000000000000000120351415223013700300410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Nadathur Satish (Intel Corp.) 
******************************************************************************/ int i, k; int mb, kb; #if SIMD_WIDTH_FP32 == 8 const __m256i *const shufmasks = internal_spmdm_shufmasks_32; #endif #if SIMD_WIDTH_FP32 > 1 const __m256i *const shufmasks2 = internal_spmdm_shufmasks_16; #endif int block_offset_base, block_offset; LIBXSMM_UNUSED(nthreads); LIBXSMM_UNUSED(tid); kb = block_id / handle->mb; mb = block_id % handle->mb; if ('T' == transa || 't' == transa) { block_offset_base = mb * handle->bm; block_offset = block_offset_base + kb * handle->m * handle->bk; } else { block_offset_base = kb * handle->bk; block_offset = block_offset_base + mb * handle->k * handle->bm; } { libxsmm_CSR_sparseslice slice = libxsmm_output_csr_a[kb*handle->mb + mb]; int nrows = ((mb + 1)*handle->bm > handle->m)?(handle->m - (mb)*handle->bm):handle->bm; int ncols = ((kb + 1)*handle->bk > handle->k)?(handle->k - (kb)*handle->bk):handle->bk; /*printf("nrows: %d, ncols: %d\n", nrows, ncols);*/ const uint16_t * input_ptr = a + block_offset; uint16_t * rowidx_ptr = slice.rowidx; uint16_t * colidx_ptr = slice.colidx; float * values_ptr = (float *)(slice.values); uint16_t cnt = 0; #if SIMD_WIDTH_FP32 > 1 const SIMDTYPE_INT32 vzero = _MM_SETZERO_INT32(); const SIMDTYPE_FP32 vzerof = _MM_SETZERO_FP32(); const int ncols_aligned = ncols / (4*SIMD_WIDTH_FP32)*(4*SIMD_WIDTH_FP32); #else const int ncols_aligned = 0; #endif for (i = 0; i < nrows; i++) { rowidx_ptr[i] = cnt; if ('T' == transa || 't' == transa) { #if SIMD_WIDTH_FP32 > 1 for (k = 0; k < ncols_aligned; k += 4*SIMD_WIDTH_FP32) { int vals[32]; int kk; for (kk = 0; kk < 4*SIMD_WIDTH_FP32; kk += 2) { vals[kk/2] = (int)input_ptr[(k+kk)*handle->m + i]; vals[kk/2] |= ((int)(input_ptr[(k+kk+1)*handle->m + i]) << 16); } { SIMDTYPE_INT32 v1tmp = _MM_LOADU_INT32(vals); SIMDTYPE_INT32 v2tmp = _MM_LOADU_INT32(vals + SIMD_WIDTH_FP32); SIMDTYPE_FP32 v1, v2, v3, v4; SIMDMASKTYPE_FP32 m1, m2, m3, m4; EXPAND_BFLOAT16(v1tmp, v1, v2); EXPAND_BFLOAT16(v2tmp, 
v3, v4); m1 = _MM_CMPNEQ_FP32(v1, vzerof); m2 = _MM_CMPNEQ_FP32(v2, vzerof); m3 = _MM_CMPNEQ_FP32(v3, vzerof); m4 = _MM_CMPNEQ_FP32(v4, vzerof); COMPRESS_FP32(v1, k, m1, cnt); COMPRESS_FP32(v2, k + SIMD_WIDTH_FP32, m2, cnt); COMPRESS_FP32(v3, k + 2*SIMD_WIDTH_FP32, m3, cnt); COMPRESS_FP32(v4, k + 3*SIMD_WIDTH_FP32, m4, cnt); } } #endif for (k = ncols_aligned; k < ncols; k++) { uint16_t v1tmp = input_ptr[k*handle->m + i]; union {int i; float f; } v1tmp_int; v1tmp_int.i = v1tmp; v1tmp_int.i <<= 16; { const int m1 = LIBXSMM_FEQ(0, v1tmp_int.f) ? 0 : 1; if (m1) { colidx_ptr[cnt] = (uint16_t)k; values_ptr[cnt] = v1tmp_int.f; cnt++; } } } } else { #if SIMD_WIDTH_FP32 > 1 for (k = 0; k < ncols_aligned; k += 4*SIMD_WIDTH_FP32) { SIMDTYPE_INT32 v1tmp, v2tmp; SIMDTYPE_FP32 v1, v2, v3, v4; SIMDMASKTYPE_FP32 m1, m2, m3, m4; v1tmp = _MM_LOADU_INT32((const SIMDTYPE_INT32*)(input_ptr + (size_t)i*handle->k + k)); _MM_PREFETCH((char *)(input_ptr + ((size_t)i+2)*handle->k + k), _MM_HINT_T0); v2tmp = _MM_LOADU_INT32((const SIMDTYPE_INT32*)(input_ptr + (size_t)i*handle->k + k + 2*SIMD_WIDTH_FP32)); _MM_PREFETCH((char *)(input_ptr + ((size_t)i+2)*handle->k + k + SIMD_WIDTH_FP32), _MM_HINT_T0); EXPAND_BFLOAT16(v1tmp, v1, v2); EXPAND_BFLOAT16(v2tmp, v3, v4); m1 = _MM_CMPNEQ_FP32(v1, vzerof); m2 = _MM_CMPNEQ_FP32(v2, vzerof); m3 = _MM_CMPNEQ_FP32(v3, vzerof); m4 = _MM_CMPNEQ_FP32(v4, vzerof); COMPRESS_FP32(v1, k, m1, cnt); COMPRESS_FP32(v2, k + SIMD_WIDTH_FP32, m2, cnt); COMPRESS_FP32(v3, k + 2*SIMD_WIDTH_FP32, m3, cnt); COMPRESS_FP32(v4, k + 3*SIMD_WIDTH_FP32, m4, cnt); } #endif for (k = ncols_aligned; k < ncols; k++) { uint16_t v1tmp = input_ptr[i*handle->k + k]; union {int i; float f; } v1tmp_int; v1tmp_int.i = v1tmp; v1tmp_int.i <<= 16; { int m1 = LIBXSMM_FEQ(0, v1tmp_int.f) ? 
0 : 1; if (m1) { colidx_ptr[cnt] = (uint16_t)k; values_ptr[cnt] = v1tmp_int.f; cnt++; } } } } } rowidx_ptr[nrows] = cnt; } libxsmm-1.17/src/template/libxsmm_spmdm_createSparseSlice_fp32_thread.tpl.c000066400000000000000000000137041415223013700272010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Nadathur Satish (Intel Corp.) ******************************************************************************/ int i, k; int mb, kb; #if SIMD_WIDTH_FP32 == 8 const __m256i *const shufmasks = internal_spmdm_shufmasks_32; #endif #if SIMD_WIDTH_FP32 > 1 const __m256i *const shufmasks2 = internal_spmdm_shufmasks_16; SIMDTYPE_INT32 vindex = _MM_SETZERO_INT32(); int idx_array[16]; #endif int block_offset_base, block_offset; LIBXSMM_UNUSED(nthreads); LIBXSMM_UNUSED(tid); kb = block_id / handle->mb; mb = block_id % handle->mb; if ('T' == transa || 't' == transa) { #if SIMD_WIDTH_FP32 > 1 int kk; for (kk = 0; kk < SIMD_WIDTH_FP32; kk++) idx_array[kk] = kk * handle->m; vindex = _MM_LOADU_INT32(idx_array); #endif block_offset_base = mb * handle->bm; block_offset = block_offset_base + kb * handle->m * handle->bk; } else { block_offset_base = kb * handle->bk; block_offset = block_offset_base + mb * handle->k * handle->bm; } { libxsmm_CSR_sparseslice slice = libxsmm_output_csr_a[kb*handle->mb + mb]; int nrows = ((mb + 1)*handle->bm > handle->m)?(handle->m - (mb)*handle->bm):handle->bm; int ncols = ((kb + 1)*handle->bk > handle->k)?(handle->k - (kb)*handle->bk):handle->bk; /*printf("nrows: %d, ncols: %d\n", nrows, ncols);*/ const float * input_ptr = a + block_offset; uint16_t * 
rowidx_ptr = slice.rowidx; uint16_t * colidx_ptr = slice.colidx; float * values_ptr = (float *)(slice.values); uint16_t cnt = 0; #if SIMD_WIDTH_FP32 > 1 const SIMDTYPE_FP32 vzero = _MM_SETZERO_FP32(); const int ncols_aligned = ncols / (4*SIMD_WIDTH_FP32)*(4*SIMD_WIDTH_FP32); const int ncols_aligned_2 = ncols / (SIMD_WIDTH_FP32)*(SIMD_WIDTH_FP32); #else const int ncols_aligned_2 = 0; #endif for (i = 0; i < nrows; i++) { rowidx_ptr[i] = cnt; if ('T' == transa || 't' == transa) { #if SIMD_WIDTH_FP32 > 1 for (k = 0; k < ncols_aligned; k += 4*SIMD_WIDTH_FP32) { SIMDTYPE_FP32 v1 = _MM_GATHER_FP32(input_ptr + (size_t)k * handle->m + i, vindex, 4); SIMDTYPE_FP32 v2 = _MM_GATHER_FP32(input_ptr + ((size_t)k+1*SIMD_WIDTH_FP32) * handle->m + i, vindex, 4); SIMDTYPE_FP32 v3 = _MM_GATHER_FP32(input_ptr + ((size_t)k+2*SIMD_WIDTH_FP32) * handle->m + i, vindex, 4); SIMDTYPE_FP32 v4 = _MM_GATHER_FP32(input_ptr + ((size_t)k+3*SIMD_WIDTH_FP32) * handle->m + i, vindex, 4); SIMDMASKTYPE_FP32 m1 = _MM_CMPNEQ_FP32(v1, vzero); SIMDMASKTYPE_FP32 m2 = _MM_CMPNEQ_FP32(v2, vzero); SIMDMASKTYPE_FP32 m3 = _MM_CMPNEQ_FP32(v3, vzero); SIMDMASKTYPE_FP32 m4 = _MM_CMPNEQ_FP32(v4, vzero); COMPRESS_FP32(v1, k, m1, cnt); COMPRESS_FP32(v2, k + SIMD_WIDTH_FP32, m2, cnt); COMPRESS_FP32(v3, k + 2*SIMD_WIDTH_FP32, m3, cnt); COMPRESS_FP32(v4, k + 3*SIMD_WIDTH_FP32, m4, cnt); } for (k = ncols_aligned; k < ncols_aligned_2; k += SIMD_WIDTH_FP32) { SIMDTYPE_FP32 v1 = _MM_GATHER_FP32(input_ptr + (size_t)k * handle->m + i, vindex, 4); SIMDMASKTYPE_FP32 m1 = _MM_CMPNEQ_FP32(v1, vzero); COMPRESS_FP32(v1, k, m1, cnt); } #endif for (k = ncols_aligned_2; k < ncols; k++) { const float v1 = input_ptr[i + k*handle->m]; const int m1 = LIBXSMM_FEQ(0, v1) ? 
0 : 1; if (m1) { colidx_ptr[cnt] = (uint16_t)k; values_ptr[cnt] = v1; cnt++; } } } else { #if SIMD_WIDTH_FP32 > 1 for (k = 0; k < ncols_aligned; k += 4*SIMD_WIDTH_FP32) { SIMDTYPE_FP32 v1, v2, v3, v4; SIMDMASKTYPE_FP32 m1, m2, m3, m4; v1 = _MM_LOADU_FP32(input_ptr + ((size_t)i) * handle->k + (size_t)k); _MM_PREFETCH((char*)input_ptr + ((size_t)i+2) * handle->k + (size_t)k, _MM_HINT_T0); v2 = _MM_LOADU_FP32(input_ptr + ((size_t)i) * handle->k + (size_t)k + (size_t)SIMD_WIDTH_FP32); _MM_PREFETCH((char*)input_ptr + ((size_t)i+2) * handle->k + (size_t)k + (size_t)SIMD_WIDTH_FP32, _MM_HINT_T0); v3 = _MM_LOADU_FP32(input_ptr + ((size_t)i) * handle->k + (size_t)k + (size_t)2 * SIMD_WIDTH_FP32); _MM_PREFETCH((char*)input_ptr + ((size_t)i+2) * handle->k + (size_t)k + (size_t)2 * SIMD_WIDTH_FP32, _MM_HINT_T0); v4 = _MM_LOADU_FP32(input_ptr + ((size_t)i) * handle->k + (size_t)k + (size_t)3 * SIMD_WIDTH_FP32); _MM_PREFETCH((char*)input_ptr + ((size_t)i+2) * handle->k + (size_t)k + (size_t)3 * SIMD_WIDTH_FP32, _MM_HINT_T0); m1 = _MM_CMPNEQ_FP32(v1, vzero); m2 = _MM_CMPNEQ_FP32(v2, vzero); m3 = _MM_CMPNEQ_FP32(v3, vzero); m4 = _MM_CMPNEQ_FP32(v4, vzero); COMPRESS_FP32(v1, k, m1, cnt); COMPRESS_FP32(v2, k + SIMD_WIDTH_FP32, m2, cnt); COMPRESS_FP32(v3, k + 2*SIMD_WIDTH_FP32, m3, cnt); COMPRESS_FP32(v4, k + 3*SIMD_WIDTH_FP32, m4, cnt); } for (k = ncols_aligned; k < ncols_aligned_2; k += SIMD_WIDTH_FP32) { SIMDTYPE_FP32 v1; SIMDMASKTYPE_FP32 m1; v1 = _MM_LOADU_FP32(input_ptr + ((size_t)i) * handle->k + (size_t)k); _MM_PREFETCH((char*)input_ptr + ((size_t)i+2) * handle->k + (size_t)k, _MM_HINT_T0); m1 = _MM_CMPNEQ_FP32(v1, vzero); COMPRESS_FP32(v1, k, m1, cnt); } #endif for (k = ncols_aligned_2; k < ncols; k++) { const float v1 = input_ptr[i*handle->k + k]; const int m1 = LIBXSMM_FEQ(0, v1) ? 
0 : 1; if (m1) { colidx_ptr[cnt] = (uint16_t)k; values_ptr[cnt] = v1; cnt++; } } } } rowidx_ptr[nrows] = cnt; } libxsmm-1.17/src/template/libxsmm_version.h000066400000000000000000000005511415223013700211070ustar00rootroot00000000000000#ifndef LIBXSMM_VERSION_H #define LIBXSMM_VERSION_H #define LIBXSMM_CONFIG_VERSION "$VERSION" #define LIBXSMM_CONFIG_BRANCH "$BRANCH" #define LIBXSMM_CONFIG_VERSION_MAJOR $MAJOR #define LIBXSMM_CONFIG_VERSION_MINOR $MINOR #define LIBXSMM_CONFIG_VERSION_UPDATE $UPDATE #define LIBXSMM_CONFIG_VERSION_PATCH $PATCH #define LIBXSMM_CONFIG_BUILD_DATE $DATE #endif libxsmm-1.17/src/template/transpose.tpl.c000066400000000000000000000130231415223013700204740ustar00rootroot00000000000000{ __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; __m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; r0 = _mm512_loadu_si512(mat + 0*16); r1 = _mm512_loadu_si512(mat + 1*16); r2 = _mm512_loadu_si512(mat + 2*16); r3 = _mm512_loadu_si512(mat + 3*16); r4 = _mm512_loadu_si512(mat + 4*16); r5 = _mm512_loadu_si512(mat + 5*16); r6 = _mm512_loadu_si512(mat + 6*16); r7 = _mm512_loadu_si512(mat + 7*16); r8 = _mm512_loadu_si512(mat + 8*16); r9 = _mm512_loadu_si512(mat + 9*16); ra = _mm512_loadu_si512(mat + 10*16); rb = _mm512_loadu_si512(mat + 11*16); rc = _mm512_loadu_si512(mat + 12*16); rd = _mm512_loadu_si512(mat + 13*16); re = _mm512_loadu_si512(mat + 14*16); rf = _mm512_loadu_si512(mat + 15*16); t0 = _mm512_unpacklo_epi32(r0,r1); /* 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 */ t1 = _mm512_unpackhi_epi32(r0,r1); /* 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 */ t2 = _mm512_unpacklo_epi32(r2,r3); /* 32 48 33 49 ... */ t3 = _mm512_unpackhi_epi32(r2,r3); /* 34 50 35 51 ... */ t4 = _mm512_unpacklo_epi32(r4,r5); /* 64 80 65 81 ... */ t5 = _mm512_unpackhi_epi32(r4,r5); /* 66 82 67 83 ... */ t6 = _mm512_unpacklo_epi32(r6,r7); /* 96 112 97 113 ... */ t7 = _mm512_unpackhi_epi32(r6,r7); /* 98 114 99 115 ... 
*/ t8 = _mm512_unpacklo_epi32(r8,r9); /* 128 ... */ t9 = _mm512_unpackhi_epi32(r8,r9); /* 130 ... */ ta = _mm512_unpacklo_epi32(ra,rb); /* 160 ... */ tb = _mm512_unpackhi_epi32(ra,rb); /* 162 ... */ tc = _mm512_unpacklo_epi32(rc,rd); /* 196 ... */ td = _mm512_unpackhi_epi32(rc,rd); /* 198 ... */ te = _mm512_unpacklo_epi32(re,rf); /* 228 ... */ tf = _mm512_unpackhi_epi32(re,rf); /* 230 ... */ r0 = _mm512_unpacklo_epi64(t0,t2); /* 0 16 32 48 ... */ r1 = _mm512_unpackhi_epi64(t0,t2); /* 1 17 33 49 ... */ r2 = _mm512_unpacklo_epi64(t1,t3); /* 2 18 34 49 ... */ r3 = _mm512_unpackhi_epi64(t1,t3); /* 3 19 35 51 ... */ r4 = _mm512_unpacklo_epi64(t4,t6); /* 64 80 96 112 ... */ r5 = _mm512_unpackhi_epi64(t4,t6); /* 65 81 97 114 ... */ r6 = _mm512_unpacklo_epi64(t5,t7); /* 66 82 98 113 ... */ r7 = _mm512_unpackhi_epi64(t5,t7); /* 67 83 99 115 ... */ r8 = _mm512_unpacklo_epi64(t8,ta); /* 128 144 160 176 ... */ r9 = _mm512_unpackhi_epi64(t8,ta); /* 129 145 161 178 ... */ ra = _mm512_unpacklo_epi64(t9,tb); /* 130 146 162 177 ... */ rb = _mm512_unpackhi_epi64(t9,tb); /* 131 147 163 179 ... */ rc = _mm512_unpacklo_epi64(tc,te); /* 192 208 228 240 ... */ rd = _mm512_unpackhi_epi64(tc,te); /* 193 209 229 241 ... */ re = _mm512_unpacklo_epi64(td,tf); /* 194 210 230 242 ... */ rf = _mm512_unpackhi_epi64(td,tf); /* 195 211 231 243 ... */ t0 = _mm512_shuffle_i32x4(r0, r4, 0x88); /* 0 16 32 48 8 24 40 56 64 80 96 112 ... */ t1 = _mm512_shuffle_i32x4(r1, r5, 0x88); /* 1 17 33 49 ... */ t2 = _mm512_shuffle_i32x4(r2, r6, 0x88); /* 2 18 34 50 ... */ t3 = _mm512_shuffle_i32x4(r3, r7, 0x88); /* 3 19 35 51 ... */ t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd); /* 4 20 36 52 ... */ t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd); /* 5 21 37 53 ... */ t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd); /* 6 22 38 54 ... */ t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd); /* 7 23 39 55 ... */ t8 = _mm512_shuffle_i32x4(r8, rc, 0x88); /* 128 144 160 176 ... */ t9 = _mm512_shuffle_i32x4(r9, rd, 0x88); /* 129 145 161 177 ... 
*/ ta = _mm512_shuffle_i32x4(ra, re, 0x88); /* 130 146 162 178 ... */ tb = _mm512_shuffle_i32x4(rb, rf, 0x88); /* 131 147 163 179 ... */ tc = _mm512_shuffle_i32x4(r8, rc, 0xdd); /* 132 148 164 180 ... */ td = _mm512_shuffle_i32x4(r9, rd, 0xdd); /* 133 149 165 181 ... */ te = _mm512_shuffle_i32x4(ra, re, 0xdd); /* 134 150 166 182 ... */ tf = _mm512_shuffle_i32x4(rb, rf, 0xdd); /* 135 151 167 183 ... */ r0 = _mm512_shuffle_i32x4(t0, t8, 0x88); /* 0 16 32 48 64 80 96 112 ... 240 */ r1 = _mm512_shuffle_i32x4(t1, t9, 0x88); /* 1 17 33 49 66 81 97 113 ... 241 */ r2 = _mm512_shuffle_i32x4(t2, ta, 0x88); /* 2 18 34 50 67 82 98 114 ... 242 */ r3 = _mm512_shuffle_i32x4(t3, tb, 0x88); /* 3 19 35 51 68 83 99 115 ... 243 */ r4 = _mm512_shuffle_i32x4(t4, tc, 0x88); /* 4 ... */ r5 = _mm512_shuffle_i32x4(t5, td, 0x88); /* 5 ... */ r6 = _mm512_shuffle_i32x4(t6, te, 0x88); /* 6 ... */ r7 = _mm512_shuffle_i32x4(t7, tf, 0x88); /* 7 ... */ r8 = _mm512_shuffle_i32x4(t0, t8, 0xdd); /* 8 ... */ r9 = _mm512_shuffle_i32x4(t1, t9, 0xdd); /* 9 ... */ ra = _mm512_shuffle_i32x4(t2, ta, 0xdd); /* 10 ... */ rb = _mm512_shuffle_i32x4(t3, tb, 0xdd); /* 11 ... */ rc = _mm512_shuffle_i32x4(t4, tc, 0xdd); /* 12 ... */ rd = _mm512_shuffle_i32x4(t5, td, 0xdd); /* 13 ... */ re = _mm512_shuffle_i32x4(t6, te, 0xdd); /* 14 ... */ rf = _mm512_shuffle_i32x4(t7, tf, 0xdd); /* 15 31 47 63 79 96 111 127 ... 
255 */ _mm512_storeu_si512(matT + 0*16, r0); _mm512_storeu_si512(matT + 1*16, r1); _mm512_storeu_si512(matT + 2*16, r2); _mm512_storeu_si512(matT + 3*16, r3); _mm512_storeu_si512(matT + 4*16, r4); _mm512_storeu_si512(matT + 5*16, r5); _mm512_storeu_si512(matT + 6*16, r6); _mm512_storeu_si512(matT + 7*16, r7); _mm512_storeu_si512(matT + 8*16, r8); _mm512_storeu_si512(matT + 9*16, r9); _mm512_storeu_si512(matT + 10*16, ra); _mm512_storeu_si512(matT + 11*16, rb); _mm512_storeu_si512(matT + 12*16, rc); _mm512_storeu_si512(matT + 13*16, rd); _mm512_storeu_si512(matT + 14*16, re); _mm512_storeu_si512(matT + 15*16, rf); } libxsmm-1.17/tests/000077500000000000000000000000001415223013700142555ustar00rootroot00000000000000libxsmm-1.17/tests/Makefile000066400000000000000000000155511415223013700157240ustar00rootroot00000000000000ROOTDIR = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) DEPDIR = .. SRCDIR = . INCDIR = $(ROOTDIR) BLDDIR = obj OUTDIR = . CXXFLAGS = $(NULL) CFLAGS = $(NULL) DFLAGS = -DLIBXSMM_BLAS_CONST # PEDANTIC=2: OpenBLAS headers can cause warnings override PEDANTIC = 1 BLAS ?= 1 OMP ?= 1 SYM ?= 1 # include common Makefile artifacts include $(DEPDIR)/Makefile.inc # necessary include directories IFLAGS += -I$(call quote,$(INCDIR)) IFLAGS += -I$(call quote,$(DEPDIR)/include) #IFLAGS += -I$(call quote,$(DEPDIR)/src) ifneq (0,$(shell echo "$$((100000>$(GCC_VERSION_NUM)))")) MIX ?= 1 else MIX ?= 0 endif OUTNAME := $(shell basename "$(ROOTDIR)") HEADERS := $(wildcard $(INCDIR)/*.h) $(wildcard $(INCDIR)/*.hpp) $(wildcard $(INCDIR)/*.hxx) $(wildcard $(INCDIR)/*.hh) \ $(wildcard $(SRCDIR)/*.h) $(wildcard $(SRCDIR)/*.hpp) $(wildcard $(SRCDIR)/*.hxx) $(wildcard $(SRCDIR)/*.hh) \ $(DEPDIR)/include/libxsmm_source.h CPPSRCS := $(shell grep -L '$(CMAIN)' $(SRCDIR)/*.cpp 2>/dev/null | tr -s "\n" " ") CPPSRCX := $(shell grep -l '$(CMAIN)' $(SRCDIR)/*.cpp 2>/dev/null | tr -s "\n" " ") CXXSRCS := $(shell grep -L '$(CMAIN)' $(SRCDIR)/*.cxx 2>/dev/null | tr -s "\n" " ") CXXSRCX := 
$(shell grep -l '$(CMAIN)' $(SRCDIR)/*.cxx 2>/dev/null | tr -s "\n" " ") CCXSRCS := $(shell grep -L '$(CMAIN)' $(SRCDIR)/*.cc 2>/dev/null | tr -s "\n" " ") CCXSRCX := $(shell grep -l '$(CMAIN)' $(SRCDIR)/*.cc 2>/dev/null | tr -s "\n" " ") CSOURCS := $(shell grep -L '$(CMAIN)' $(SRCDIR)/*.c 2>/dev/null | tr -s "\n" " ") CSOURCX := $(shell grep -l '$(CMAIN)' $(SRCDIR)/*.c 2>/dev/null | tr -s "\n" " ") FXXSRCS := $(shell grep -L '$(FMAIN)' $(SRCDIR)/*.f 2>/dev/null | tr -s "\n" " ") FXXSRCX := $(shell grep -l '$(FMAIN)' $(SRCDIR)/*.f 2>/dev/null | tr -s "\n" " ") F77SRCS := $(shell grep -L '$(FMAIN)' $(SRCDIR)/*.F 2>/dev/null | tr -s "\n" " ") F77SRCX := $(shell grep -l '$(FMAIN)' $(SRCDIR)/*.F 2>/dev/null | tr -s "\n" " ") F90SRCS := $(shell grep -L '$(FMAIN)' $(SRCDIR)/*.f90 2>/dev/null | tr -s "\n" " ") \ $(shell grep -L '$(FMAIN)' $(SRCDIR)/*.F90 2>/dev/null | tr -s "\n" " ") F90SRCX := $(shell grep -l '$(FMAIN)' $(SRCDIR)/*.f90 2>/dev/null | tr -s "\n" " ") \ $(shell grep -l '$(FMAIN)' $(SRCDIR)/*.F90 2>/dev/null | tr -s "\n" " ") MODULES := $(addsuffix .mod,$(basename $(FXXSRCS) $(F77SRCS) $(F90SRCS))) \ $(addsuffix .modmic,$(basename $(FXXSRCS) $(F77SRCS) $(F90SRCS))) OBJECTS := $(call objname,$(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS)) OBJECTX := $(call objname,$(CPPSRCX) $(CXXSRCX) $(CCXSRCX) $(CSOURCX)) FTNOBJS := $(call objname,$(FXXSRCS) $(F77SRCS) $(F90SRCS)) FTNOBJX := $(call objname,$(FXXSRCX) $(F77SRCX) $(F90SRCX)) XFILES := $(addprefix $(OUTDIR)/,$(basename $(notdir \ $(CPPSRCX) $(CXXSRCX) $(CCXSRCX) $(CSOURCX) \ $(FXXSRCX) $(F77SRCX) $(F90SRCX)))) .PHONY: all all: $(XFILES) .PHONY: compile compile: $(OBJECTS) $(FTNOBJS) .PHONY: tests tests: test .PHONY: test test: $(OUTDIR)/.make $(OUTDIR)/test.sh $(XFILES) @$(OUTDIR)/test.sh $(TEST) # determine header-only tests (to avoid linking against LIBXSMM libraries; see below) HEADER_ONLY = $(basename $(notdir $(shell grep -H libxsmm_source *.c | cut -d: -f1))) define DEFINE_LINK_LD_RULE ifneq (,$(wildcard 
$(LIBDEP))) ifneq (,$(wildcard $(EXTDEP))) $(1): $(2) $(OBJECTX) $(dir $(1))/.make $(if $(filter $(1),$(HEADER_ONLY)),$(NOBLASDEP),$(LIBDEP) $(EXTDEP)) $(LD) $(SLDFLAGS) -o $(1) $(2) \ $(if $(filter $(1),$(HEADER_ONLY)),$(NOBLASLIB),$(EXTLIB) $(MAINLIB)) \ $(call cleanld,$(LDFLAGS) $(CLDFLAGS)) else .PHONY: $(1) endif else .PHONY: $(1) endif endef define DEFINE_LINK_FC_RULE ifneq (,$(strip $(FC))) ifneq (,$(wildcard $(LIBDEP))) ifneq (,$(wildcard $(FORTDEP))) ifneq (,$(wildcard $(EXTDEP))) $(1): $(2) $(FTNOBJS) $(FORTDEP) $(LIBDEP) $(EXTDEP) $(dir $(1))/.make $(FC) $(SLDFLAGS) -o $(1) $(2) $(MAINLIB) $(FORTLIB) $(EXTLIB) \ $(FCMTFLAGS) $(call cleanld,$(LDFLAGS) $(FLDFLAGS) $(ELDFLAGS)) else .PHONY: $(1) endif else .PHONY: $(1) endif else .PHONY: $(1) endif else .PHONY: $(1) endif endef $(foreach SRC, $(filter-out $(SRCDIR)/headeronly.c,$(CPPSRCX) $(CXXSRCX) $(CCXSRCX) $(CSOURCX)), \ $(eval $(call DEFINE_LINK_LD_RULE, $(basename $(notdir $(SRC))), $(call objname,$(SRC))))) $(foreach SRC, $(FXXSRCX) $(F77SRCX) $(F90SRCX), \ $(eval $(call DEFINE_LINK_FC_RULE, $(basename $(notdir $(SRC))), $(call objname,$(SRC))))) ifeq (0,$(MIX)) $(OUTDIR)/headeronly: $(OUTDIR)/.make $(BLDDIR)/headeronly-c.o $(BLDDIR)/headeronly_aux-c.o $(NOBLASDEP) $(LD) -o $@ $(BLDDIR)/headeronly-c.o $(BLDDIR)/headeronly_aux-c.o \ $(call cleanld,$(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(NOBLASLIB)) else # compile headeronly_aux.c as C++ translation unit $(OUTDIR)/headeronly: $(OUTDIR)/.make $(BLDDIR)/headeronly-c.o $(NOBLASDEP) @$(CP) $(SRCDIR)/headeronly_aux.c $(SRCDIR)/headeronly_aux.cpp $(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $(SRCDIR)/headeronly_aux.cpp -o $(BLDDIR)/headeronly_aux-cpp.o $(XLD) -o $@ $(BLDDIR)/headeronly_aux-cpp.o $(BLDDIR)/headeronly-c.o \ $(call cleanld,$(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) $(NOBLASLIB)) @rm -f headeronly_aux.cpp endif $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc #$(DEPDIR)/include/libxsmm_source.h 
$(CXX) $(DFLAGS) $(IFLAGS) $(CXXFLAGS) $(CTARGET) -c $< -o $@ $(BLDDIR)/%-c.o: $(SRCDIR)/%.c .state $(BLDDIR)/.make $(HEADERS) Makefile $(DEPDIR)/Makefile.inc $(DEPDIR)/include/libxsmm_source.h $(CC) $(DFLAGS) $(IFLAGS) $(CFLAGS) $(CTARGET) -c $< -o $@ #$(BLDDIR)/%-f.o: $(SRCDIR)/%.f .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(DEPDIR)/include/libxsmm_source.h #$(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ #$(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(DEPDIR)/include/libxsmm_source.h #$(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ #$(BLDDIR)/%-f90.o: $(SRCDIR)/%.F90 .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(DEPDIR)/include/libxsmm_source.h #$(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ #$(BLDDIR)/%-f77.o: $(SRCDIR)/%.F .state $(BLDDIR)/.make Makefile $(DEPDIR)/Makefile.inc $(DEPDIR)/include/libxsmm_source.h #$(FC) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(FCFLAGS) $(FTARGET) -c $< -o $@ .PHONY: clean clean: ifneq ($(call qapath,$(BLDDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(BLDDIR)),$(call qapath,.)) @rm -rf $(BLDDIR) endif endif ifneq (,$(wildcard $(BLDDIR))) # still exists @rm -f $(OBJECTS) $(OBJECTX) $(FTNOBJS) $(FTNOBJX) *__genmod.* fit.log *.dat @rm -f $(BLDDIR)/*.gcno $(BLDDIR)/*.gcda $(BLDDIR)/*.gcov endif @rm -f .make .state .PHONY: realclean realclean: clean ifneq ($(call qapath,$(OUTDIR)),$(ROOTDIR)) ifneq ($(call qapath,$(OUTDIR)),$(call qapath,.)) @rm -rf $(OUTDIR) endif endif ifneq (,$(wildcard $(OUTDIR))) # still exists @rm -f $(OUTDIR)/libxsmm.$(DLIBEXT) $(OUTDIR)/*.stackdump @rm -f $(XFILES) $(MODULES) endif libxsmm-1.17/tests/README.md000066400000000000000000000003321415223013700155320ustar00rootroot00000000000000# LIBXSMM Test Suite (INTERNAL) This directory contains test cases which are exercising internals of LIBXSMM. 
This is not collection of code samples since the functionality used might be not part of the LIBXSMM API. libxsmm-1.17/tests/atomics.c000066400000000000000000000045061415223013700160650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include #if !defined(ATOMIC_KIND) # define ATOMIC_KIND LIBXSMM_ATOMIC_RELAXED #endif int main(void) { LIBXSMM_ALIGNED(LIBXSMM_ATOMIC_LOCKTYPE lock = 0/*unlocked*/, LIBXSMM_ALIGNMENT); int result = EXIT_SUCCESS; int mh = 1051981, hp, tmp; LIBXSMM_NONATOMIC_STORE(&hp, 25071975, ATOMIC_KIND); tmp = LIBXSMM_NONATOMIC_LOAD(&hp, ATOMIC_KIND); if (tmp != LIBXSMM_ATOMIC_LOAD(&hp, ATOMIC_KIND)) { result = EXIT_FAILURE; } if (mh != LIBXSMM_NONATOMIC_SUB_FETCH(&hp, 24019994, ATOMIC_KIND)) { result = EXIT_FAILURE; } if (mh != LIBXSMM_ATOMIC_FETCH_ADD(&hp, 24019994, ATOMIC_KIND)) { result = EXIT_FAILURE; } LIBXSMM_ATOMIC_STORE(&tmp, mh, ATOMIC_KIND); if (25071975 != LIBXSMM_NONATOMIC_FETCH_OR(&hp, tmp, ATOMIC_KIND)) { result = EXIT_FAILURE; } if ((25071975 | mh) != hp) { result = EXIT_FAILURE; } /* check if non-atomic and atomic are compatible */ if (LIBXSMM_NONATOMIC_TRYLOCK(&lock, ATOMIC_KIND)) { if (LIBXSMM_ATOMIC_TRYLOCK(&lock, ATOMIC_KIND)) { result = EXIT_FAILURE; } LIBXSMM_NONATOMIC_RELEASE(&lock, ATOMIC_KIND); if (0 != lock) result = EXIT_FAILURE; } else { result = EXIT_FAILURE; } LIBXSMM_ATOMIC_ACQUIRE(&lock, LIBXSMM_SYNC_NPAUSE, ATOMIC_KIND); if (0 == lock) result = EXIT_FAILURE; if (LIBXSMM_ATOMIC_TRYLOCK(&lock, ATOMIC_KIND)) { 
result = EXIT_FAILURE; } if (LIBXSMM_ATOMIC_TRYLOCK(&lock, ATOMIC_KIND)) { result = EXIT_FAILURE; } if (0 == lock) result = EXIT_FAILURE; LIBXSMM_ATOMIC_RELEASE(&lock, ATOMIC_KIND); if (0 != lock) result = EXIT_FAILURE; return result; } libxsmm-1.17/tests/atomics.vcxproj000066400000000000000000000544601415223013700173420ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 atomics {E29F73D9-3474-4C67-800A-075D5EC7C2AB} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 
0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true 
libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console libxsmm-1.17/tests/gemm.c000066400000000000000000000275301415223013700153550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #include #if !defined(ITYPE) # define ITYPE double #endif #if !defined(OTYPE) # define OTYPE ITYPE #endif #if !defined(CHECK_FPE) # define CHECK_FPE #endif #if !defined(GEMM_GOLD) # define GEMM_GOLD LIBXSMM_GEMM_SYMBOL #endif #if !defined(GEMM) # define GEMM LIBXSMM_XGEMM_SYMBOL #endif #if !defined(GEMM2) # define GEMM2 LIBXSMM_YGEMM_SYMBOL #endif #if !defined(SMM) # define SMM LIBXSMM_XGEMM_SYMBOL #endif #if !defined(GEMM_NO_BYPASS) # define SMM_NO_BYPASS(FLAGS, ALPHA, BETA) LIBXSMM_GEMM_NO_BYPASS(FLAGS, ALPHA, BETA) #endif #if (LIBXSMM_EQUAL(ITYPE, float) || LIBXSMM_EQUAL(ITYPE, double)) \ && !defined(MKL_DIRECT_CALL_SEQ) && !defined(MKL_DIRECT_CALL) LIBXSMM_BLAS_SYMBOL_DECL(ITYPE, gemm) #endif int main(void) { /* test#: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 */ /* index: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 */ libxsmm_blasint m[] = { 0, 1, 0, 0, 1, 1, 2, 3, 3, 1, 4, 8, 64, 64, 16, 80, 80, 80, 80, 16, 260, 260, 260, 260, 350, 350, 350, 350, 350, 5, 10, 12, 20, 32, 9, 13, 5 }; libxsmm_blasint n[] = { 0, 0, 1, 0, 1, 2, 2, 3, 1, 3, 1, 1, 8, 239, 13824, 1, 3, 5, 7, 65792, 1, 3, 5, 7, 16, 1, 25, 4, 9, 13, 1, 10, 6, 33, 9, 13, 5 }; libxsmm_blasint k[] = { 0, 0, 0, 1, 1, 2, 2, 3, 2, 2, 4, 0, 64, 64, 16, 1, 3, 6, 10, 16, 1, 3, 6, 10, 20, 1, 35, 4, 10, 70, 1, 12, 6, 192, 1742, 13, 5 }; libxsmm_blasint lda[] = { 1, 1, 1, 1, 1, 1, 2, 3, 3, 1, 4, 8, 64, 64, 16, 80, 80, 80, 80, 16, 260, 260, 260, 260, 350, 350, 350, 350, 350, 5, 22, 22, 22, 32, 9, 13, 5 }; libxsmm_blasint ldb[] = { 1, 1, 1, 1, 1, 2, 2, 3, 2, 2, 4, 8, 9216, 240, 16, 1, 3, 5, 5, 16, 1, 3, 5, 7, 35, 35, 35, 35, 35, 70, 1, 20, 8, 2048, 1742, 13, 5 }; libxsmm_blasint ldc[] = { 1, 1, 1, 1, 1, 1, 2, 3, 3, 1, 4, 8, 4096, 240, 16, 80, 80, 80, 80, 16, 260, 260, 260, 260, 350, 350, 350, 350, 350, 5, 22, 12, 
20, 2048, 9, 13, 5 }; OTYPE alpha[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; OTYPE beta[] = { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1 }; #if (!defined(__BLAS) || (0 != __BLAS)) && defined(GEMM_GOLD) char transa[] = "NNNTT"; #else char transa[] = "NN"; #endif char transb[] = "NNTNT"; const int begin = 0, end = sizeof(m) / sizeof(*m), i0 = 0, i1 = sizeof(transa) - 1; libxsmm_blasint max_size_a = 0, max_size_b = 0, max_size_c = 0, block = 1; #if defined(_DEBUG) libxsmm_matdiff_info diff; #endif ITYPE *a = NULL, *b = NULL; OTYPE *c = NULL; #if defined(GEMM) OTYPE *d = NULL; #endif #if (!defined(__BLAS) || (0 != __BLAS)) && defined(GEMM_GOLD) OTYPE *gold = NULL; #endif int result = EXIT_SUCCESS, test, i; #if defined(CHECK_FPE) && defined(_MM_GET_EXCEPTION_MASK) const unsigned int fpemask = _MM_GET_EXCEPTION_MASK(); /* backup FPE mask */ const unsigned int fpcheck = _MM_MASK_INVALID | _MM_MASK_OVERFLOW; unsigned int fpstate = 0; _MM_SET_EXCEPTION_MASK(fpemask & ~fpcheck); #endif LIBXSMM_BLAS_INIT for (test = begin; test < end; ++test) { m[test] = LIBXSMM_UP(m[test], block); n[test] = LIBXSMM_UP(n[test], block); k[test] = LIBXSMM_UP(k[test], block); lda[test] = LIBXSMM_MAX(lda[test], m[test]); ldb[test] = LIBXSMM_MAX(ldb[test], k[test]); ldc[test] = LIBXSMM_MAX(ldc[test], m[test]); } for (test = begin; test < end; ++test) { const libxsmm_blasint size_a = lda[test] * k[test], size_b = ldb[test] * n[test], size_c = ldc[test] * n[test]; LIBXSMM_ASSERT(m[test] <= lda[test] && k[test] <= ldb[test] && m[test] <= ldc[test]); max_size_a = LIBXSMM_MAX(max_size_a, size_a); max_size_b = LIBXSMM_MAX(max_size_b, size_b); max_size_c = LIBXSMM_MAX(max_size_c, size_c); } a = (ITYPE*)libxsmm_malloc((size_t)(max_size_a * sizeof(ITYPE))); b = (ITYPE*)libxsmm_malloc((size_t)(max_size_b * sizeof(ITYPE))); c = 
(OTYPE*)libxsmm_malloc((size_t)(max_size_c * sizeof(OTYPE))); #if defined(GEMM) d = (OTYPE*)libxsmm_malloc((size_t)(max_size_c * sizeof(OTYPE))); LIBXSMM_ASSERT(NULL != d); #endif #if (!defined(__BLAS) || (0 != __BLAS)) && defined(GEMM_GOLD) gold = (OTYPE*)libxsmm_malloc((size_t)(max_size_c * sizeof(OTYPE))); LIBXSMM_ASSERT(NULL != gold); #endif LIBXSMM_ASSERT(NULL != a && NULL != b && NULL != c); LIBXSMM_MATINIT(ITYPE, 42, a, max_size_a, 1, max_size_a, 1.0); LIBXSMM_MATINIT(ITYPE, 24, b, max_size_b, 1, max_size_b, 1.0); #if defined(_DEBUG) libxsmm_matdiff_clear(&diff); #endif for (test = begin; test < end && EXIT_SUCCESS == result; ++test) { for (i = i0; i < i1 && EXIT_SUCCESS == result; ++i) { libxsmm_blasint mi = m[test], ni = n[test], ki = k[test]; const int flags = LIBXSMM_GEMM_FLAGS(transa[i], transb[i]); const int smm = SMM_NO_BYPASS(flags, alpha[test], beta[test]); #if defined(CHECK_FPE) && defined(_MM_GET_EXCEPTION_MASK) _MM_SET_EXCEPTION_STATE(0); #endif if ('N' != transa[i] && 'N' == transb[i]) { /* TN */ mi = ki = LIBXSMM_MIN(mi, ki); } else if ('N' == transa[i] && 'N' != transb[i]) { /* NT */ ki = ni = LIBXSMM_MIN(ki, ni); } else if ('N' != transa[i] && 'N' != transb[i]) { /* TT */ const libxsmm_blasint ti = LIBXSMM_MIN(mi, ni); mi = ni = ki = LIBXSMM_MIN(ti, ki); } if (LIBXSMM_FEQ(0, beta[test])) { #if (!defined(__BLAS) || (0 != __BLAS)) && defined(GEMM_GOLD) memset(gold, -1, (size_t)(sizeof(OTYPE) * max_size_c)); #endif memset(c, -1, (size_t)(sizeof(OTYPE) * max_size_c)); #if defined(GEMM) memset(d, -1, (size_t)(sizeof(OTYPE) * max_size_c)); #endif } else { #if (!defined(__BLAS) || (0 != __BLAS)) && defined(GEMM_GOLD) memset(gold, 0, (size_t)(sizeof(OTYPE) * max_size_c)); #endif memset(c, 0, (size_t)(sizeof(OTYPE) * max_size_c)); #if defined(GEMM) memset(d, 0, (size_t)(sizeof(OTYPE) * max_size_c)); #endif } if (0 != smm) { SMM(ITYPE)(transa + i, transb + i, &mi, &ni, &ki, alpha + test, a, lda + test, b, ldb + test, beta + test, c, ldc + test); } #if 
defined(GEMM) else { GEMM(ITYPE)(transa + i, transb + i, &mi, &ni, &ki, alpha + test, a, lda + test, b, ldb + test, beta + test, c, ldc + test); } # if defined(GEMM2) GEMM2(ITYPE)(transa + i, transb + i, &mi, &ni, &ki, alpha + test, a, lda + test, b, ldb + test, beta + test, d, ldc + test); # else GEMM(ITYPE)(transa + i, transb + i, &mi, &ni, &ki, alpha + test, a, lda + test, b, ldb + test, beta + test, d, ldc + test); # endif #endif #if (0 != LIBXSMM_JIT) if (0 != smm) { /* dispatch kernel and check that it is available */ const LIBXSMM_MMFUNCTION_TYPE(ITYPE) kernel = LIBXSMM_MMDISPATCH_SYMBOL(ITYPE)(mi, ni, ki, lda + test, ldb + test, ldc + test, alpha + test, beta + test, &flags, NULL/*prefetch*/); if (NULL == kernel) { # if defined(_DEBUG) fprintf(stderr, "\nERROR: kernel %i.%i not generated!\n\t", test + 1, i + 1); libxsmm_gemm_print(stderr, LIBXSMM_GEMM_PRECISION(ITYPE), transa + i, transb + i, &mi, &ni, &ki, alpha + test, NULL/*a*/, lda + test, NULL/*b*/, ldb + test, beta + test, NULL/*c*/, ldc + test); fprintf(stderr, "\n"); # endif result = EXIT_FAILURE; break; } } #endif #if defined(CHECK_FPE) && defined(_MM_GET_EXCEPTION_MASK) fpstate = _MM_GET_EXCEPTION_STATE() & fpcheck; result = (0 == fpstate ? EXIT_SUCCESS : EXIT_FAILURE); if (EXIT_SUCCESS != result) { # if defined(_DEBUG) fprintf(stderr, "FPE(%i.%i): state=0x%08x -> invalid=%s overflow=%s\n", test + 1, i + 1, fpstate, 0 != (_MM_MASK_INVALID & fpstate) ? "true" : "false", 0 != (_MM_MASK_OVERFLOW & fpstate) ? 
"true" : "false"); # endif } # if (!defined(__BLAS) || (0 != __BLAS)) && defined(GEMM_GOLD) else # endif #endif #if (!defined(__BLAS) || (0 != __BLAS)) && defined(GEMM_GOLD) # if !defined(GEMM) if (0 != smm) # endif { # if defined(GEMM_GOLD) libxsmm_matdiff_info diff_test; GEMM_GOLD(ITYPE)(transa + i, transb + i, &mi, &ni, &ki, alpha + test, a, lda + test, b, ldb + test, beta + test, gold, ldc + test); result = libxsmm_matdiff(&diff_test, LIBXSMM_DATATYPE(OTYPE), mi, ni, gold, c, ldc + test, ldc + test); if (EXIT_SUCCESS == result) { # if defined(_DEBUG) libxsmm_matdiff_reduce(&diff, &diff_test); # endif if (1.0 < (1000.0 * diff_test.normf_rel)) { # if defined(_DEBUG) if (0 != smm) { fprintf(stderr, "\nERROR: SMM test %i.%i failed!\n\t", test + 1, i + 1); } else { fprintf(stderr, "\nERROR: test %i.%i failed!\n\t", test + 1, i + 1); } libxsmm_gemm_print(stderr, LIBXSMM_GEMM_PRECISION(ITYPE), transa + i, transb + i, &mi, &ni, &ki, alpha + test, NULL/*a*/, lda + test, NULL/*b*/, ldb + test, beta + test, NULL/*c*/, ldc + test); fprintf(stderr, "\n"); # endif result = EXIT_FAILURE; } # if defined(GEMM) else { result = libxsmm_matdiff(&diff_test, LIBXSMM_DATATYPE(OTYPE), mi, ni, gold, d, ldc + test, ldc + test); if (EXIT_SUCCESS == result) { # if defined(_DEBUG) libxsmm_matdiff_reduce(&diff, &diff_test); # endif if (1.0 < (1000.0 * diff_test.normf_rel)) { # if defined(_DEBUG) fprintf(stderr, "\nERROR: test %i.%i failed!\n\t", test + 1, i + 1); libxsmm_gemm_print(stderr, LIBXSMM_GEMM_PRECISION(ITYPE), transa + i, transb + i, &mi, &ni, &ki, alpha + test, NULL/*a*/, lda + test, NULL/*b*/, ldb + test, beta + test, NULL/*c*/, ldc + test); fprintf(stderr, "\n"); # endif result = EXIT_FAILURE; } } } # endif } # endif } # if defined(GEMM_GOLD) /* avoid drift between Gold and test-results */ memcpy(c, gold, (size_t)(sizeof(OTYPE) * max_size_c)); # if defined(GEMM) memcpy(d, gold, (size_t)(sizeof(OTYPE) * max_size_c)); # endif # endif #elif defined(_DEBUG) fprintf(stderr, 
"Warning: skipped the test due to missing BLAS support!\n"); #endif } } #if defined(_DEBUG) fprintf(stderr, "Diff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); #endif #if defined(CHECK_FPE) && defined(_MM_GET_EXCEPTION_MASK) _MM_SET_EXCEPTION_MASK(fpemask); /* restore FPE mask */ _MM_SET_EXCEPTION_STATE(0); /* clear FPE state */ #endif libxsmm_free(a); libxsmm_free(b); libxsmm_free(c); #if defined(GEMM) libxsmm_free(d); #endif #if (!defined(__BLAS) || (0 != __BLAS)) && defined(GEMM_GOLD) libxsmm_free(gold); #endif return result; } libxsmm-1.17/tests/gemm.vcxproj000066400000000000000000000555211415223013700166270ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 gemm {CDCBFA7A-54BC-4D49-B57A-8B9E9A107320} 10.0 Application Disabled Disabled Sequential v142 true Application true true Disabled Disabled Sequential v142 Application true Disabled Disabled Sequential v142 true Application Disabled Disabled Sequential v142 true true Application true Disabled Disabled Sequential v142 Application true Disabled Disabled true Sequential v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) 
libxsmm.lib;libxsmmext.lib;libxsmmext.lib;mkl_rt.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;$(MKLROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console 
$(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;$(MKLROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;mkl_rt.lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/tests/gemmflags.c000066400000000000000000000064331415223013700163710ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include int main(void) { const int defaults[] = { LIBXSMM_GEMM_FLAG_NONE, LIBXSMM_GEMM_FLAG_TRANS_A, LIBXSMM_GEMM_FLAG_TRANS_B, LIBXSMM_GEMM_FLAG_TRANS_A | LIBXSMM_GEMM_FLAG_TRANS_B }; const char trans[] = "NnTtCcX"; const int ndefaults = sizeof(defaults) / sizeof(*defaults), ntrans = sizeof(trans); int result = EXIT_SUCCESS; int i, j = -1, k = -1, flags = 0; for (i = 0; i < ndefaults && EXIT_SUCCESS == result; ++i) { flags = LIBXSMM_GEMM_PFLAGS(0, 0, defaults[i]); if (defaults[i] != flags) { result = EXIT_FAILURE; break; } for (j = 0; j < ntrans && EXIT_SUCCESS == result; ++j) { flags = LIBXSMM_GEMM_PFLAGS(trans + j, 0, defaults[i]); if (0 != (LIBXSMM_GEMM_FLAG_TRANS_A & flags) && ('N' == trans[j] || 'n' == trans[j])) { result = EXIT_FAILURE; break; } if (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & flags) && ('T' == trans[j] || 't' == trans[j])) { result = EXIT_FAILURE; break; } if (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & flags) && ('C' == trans[j] || 'c' == trans[j])) { result = EXIT_FAILURE; break; } for (k = 0; k < ntrans; ++k) { flags = LIBXSMM_GEMM_PFLAGS(0, trans + k, defaults[i]); if (0 != (LIBXSMM_GEMM_FLAG_TRANS_B & 
flags) && ('N' == trans[k] || 'n' == trans[k])) { result = EXIT_FAILURE; break; } if (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & flags) && ('T' == trans[k] || 't' == trans[k])) { result = EXIT_FAILURE; break; } if (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & flags) && ('C' == trans[k] || 'c' == trans[k])) { result = EXIT_FAILURE; break; } flags = LIBXSMM_GEMM_PFLAGS(trans + j, trans + k, defaults[i]); if (0 != (LIBXSMM_GEMM_FLAG_TRANS_A & flags) && ('N' == trans[j] || 'n' == trans[j])) { result = EXIT_FAILURE; break; } if (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & flags) && ('T' == trans[j] || 't' == trans[j])) { result = EXIT_FAILURE; break; } if (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & flags) && ('C' == trans[j] || 'c' == trans[j])) { result = EXIT_FAILURE; break; } if (0 != (LIBXSMM_GEMM_FLAG_TRANS_B & flags) && ('N' == trans[k] || 'n' == trans[k])) { result = EXIT_FAILURE; break; } if (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & flags) && ('T' == trans[k] || 't' == trans[k])) { result = EXIT_FAILURE; break; } if (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & flags) && ('C' == trans[k] || 'c' == trans[k])) { result = EXIT_FAILURE; break; } } } } #if defined(_DEBUG) if (EXIT_SUCCESS != result) { fprintf(stderr, "%c%c -> %i\n", 0 <= j ? trans[j] : '0', 0 <= k ? 
trans[k] : '0', flags); } #endif return result; } libxsmm-1.17/tests/gemmflags.vcxproj000066400000000000000000000544641415223013700176510ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 gemmflags {D54866A7-43B1-40AF-AE02-C96BD5B03B81} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console libxsmmnoblas.lib;%(AdditionalDependencies) 
$(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) 
$(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console libxsmm-1.17/tests/hash.c000066400000000000000000000043151415223013700153470ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #if defined(_DEBUG) # define FPRINTF(STREAM, ...) fprintf(STREAM, __VA_ARGS__) #else # define FPRINTF(STREAM, ...) #endif #if !defined(ELEM_TYPE) # define ELEM_TYPE int #endif /** * This test case is NOT an example of how to use LIBXSMM * since INTERNAL functions are tested which are not part * of the LIBXSMM API. */ int main(void) { const unsigned int seed = 1975, size = 2507; const unsigned int n512 = 512 / (8 * sizeof(ELEM_TYPE)); unsigned int s = LIBXSMM_UP(size, n512), i, h1, h2; int result = EXIT_SUCCESS; const ELEM_TYPE* value; ELEM_TYPE *const data = (ELEM_TYPE*)libxsmm_malloc(sizeof(ELEM_TYPE) * s); if (NULL == data) s = 0; for (i = 0; i < s; ++i) data[i] = (ELEM_TYPE)(rand() - ((RAND_MAX) >> 1)); h1 = libxsmm_crc32_u64(seed, data); h2 = libxsmm_crc32_u32(seed, data); h2 = libxsmm_crc32_u32(h2, (unsigned int*)data + 1); if (h1 != h2) { FPRINTF(stderr, "crc32_u32 or crc32_u64 is wrong\n"); result = EXIT_FAILURE; } h1 = libxsmm_crc32(seed, data, sizeof(ELEM_TYPE) * s); h2 = seed; value = data; for (i = 0; i < s; i += n512) { h2 = libxsmm_crc32_u512(h2, value + i); } if (h1 != h2) { FPRINTF(stderr, "(crc32=%u) != (crc32_sw=%u)\n", h1, h2); result = EXIT_FAILURE; } if (seed != libxsmm_hash(NULL/*data*/, 0/*size*/, seed)) { result = EXIT_FAILURE; } if (0 != libxsmm_hash_string(NULL/*string*/)) { result = EXIT_FAILURE; } libxsmm_free(data); return result; } libxsmm-1.17/tests/hash.vcxproj000066400000000000000000000544521415223013700166270ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 hash {4B3F30DD-1A04-4FA6-B5A4-7B1F86990A9A} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 
<_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true 
libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) 
$(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console libxsmm-1.17/tests/headeronly.c000066400000000000000000000035051415223013700165560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include /* must match definitions in headeronly_aux.c */ #if !defined(ITYPE) # define ITYPE double #endif #if !defined(OTYPE) # define OTYPE ITYPE #endif LIBXSMM_EXTERN_C LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) mmdispatch(int m, int n, int k); int main(void) { const int m = LIBXSMM_MAX_M, n = LIBXSMM_MAX_N, k = LIBXSMM_MAX_K; const LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) fa = LIBXSMM_MMDISPATCH_SYMBOL2(ITYPE, OTYPE)(m, n, k, NULL/*lda*/, NULL/*ldb*/, NULL/*ldc*/, NULL/*alpha*/, NULL/*beta*/, NULL/*flags*/, NULL/*prefetch*/); const LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) fb = mmdispatch(m, n, k); int result = EXIT_SUCCESS; if (fa == fb) { /* test unregistering and freeing kernel */ union { LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) f; const void* p; } kernel; kernel.f = fa; libxsmm_release_kernel(kernel.p); } else { libxsmm_registry_info registry_info; result = libxsmm_get_registry_info(®istry_info); if (EXIT_SUCCESS == result && 2 != registry_info.size) { result = EXIT_FAILURE; } } return result; } libxsmm-1.17/tests/headeronly.vcxproj000066400000000000000000000574521415223013700200410ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 CompileAsCpp CompileAsCpp CompileAsCpp CompileAsCpp CompileAsCpp CompileAsCpp /Zc:twoPhase- %(AdditionalOptions) /Zc:twoPhase- %(AdditionalOptions) /Zc:twoPhase- %(AdditionalOptions) /Zc:twoPhase- %(AdditionalOptions) /Zc:twoPhase- %(AdditionalOptions) /Zc:twoPhase- %(AdditionalOptions) headeronly {88C3F7F5-A3AA-4974-B928-5EE69BC17C31} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) 
$(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 
Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console libxsmm-1.17/tests/headeronly_aux.c000066400000000000000000000030361415223013700174320ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include /* must match definitions in headeronly.c */ #if !defined(ITYPE) # define ITYPE double #endif #if !defined(OTYPE) # define OTYPE ITYPE #endif LIBXSMM_EXTERN_C LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) mmdispatch(int m, int n, int k); LIBXSMM_EXTERN_C LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) mmdispatch(int m, int n, int k) { LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) result; #if defined(__cplusplus) /* C++ by chance: test libxsmm_mmfunction<> wrapper */ const libxsmm_mmfunction mmfunction(m, n, k); result = mmfunction.kernel().LIBXSMM_TPREFIX2(ITYPE, OTYPE, mm); #else result = LIBXSMM_MMDISPATCH_SYMBOL2(ITYPE, OTYPE)(m, n, k, NULL/*lda*/, NULL/*ldb*/, NULL/*ldc*/, NULL/*alpha*/, NULL/*beta*/, NULL/*flags*/, NULL/*prefetch*/); #endif return result; } libxsmm-1.17/tests/malloc.c000066400000000000000000000106061415223013700156730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #include #if !defined(CHECK_SETUP) && 1 # define CHECK_SETUP #endif #if !defined(CHECK_REALLOC) && 1 # define CHECK_REALLOC #endif int main(void) { const size_t size = 2507, alignment = (2U << 20); libxsmm_malloc_info malloc_info; int nerrors = 0; void* p; #if defined(CHECK_SETUP) { /* check allocator setup */ libxsmm_malloc_function malloc_fn; libxsmm_free_function free_fn; const void* context; malloc_fn.function = malloc; free_fn.function = free; libxsmm_set_default_allocator(NULL/*context*/, malloc_fn/*malloc*/, free_fn/*free*/); malloc_fn.function = NULL; free_fn.function = NULL; libxsmm_set_scratch_allocator(NULL/*context*/, malloc_fn/*NULL*/, free_fn/*NULL*/); /* check adoption of the default allocator */ libxsmm_get_scratch_allocator(&context, &malloc_fn, &free_fn); if (NULL != context || malloc != malloc_fn.function || free != free_fn.function) { ++nerrors; } } #endif /* allocate some amount of memory */ p = libxsmm_malloc(size); /* query and check the size of the buffer */ if (NULL != p && (EXIT_SUCCESS != libxsmm_get_malloc_info(p, &malloc_info) || malloc_info.size < size)) { ++nerrors; } #if defined(CHECK_REALLOC) if (NULL != p) { /* reallocate larger amount of memory */ const int palign = 1 << LIBXSMM_INTRINSICS_BITSCANFWD64((uintptr_t)p); unsigned char* c = (unsigned char*)p; size_t i; for (i = 0; i < size; ++i) c[i] = (unsigned char)LIBXSMM_MOD2(i, 256); p = libxsmm_realloc(size * 2, p); /* check that alignment is preserved */ if (0 != (((uintptr_t)p) % palign)) { ++nerrors; } c = (unsigned char*)p; for (i = size; i < (size * 2); ++i) c[i] = (unsigned char)LIBXSMM_MOD2(i, 256); /* reallocate again with same size */ p = libxsmm_realloc(size * 2, p); /* check that alignment is preserved */ if (0 != (((uintptr_t)p) % palign)) { ++nerrors; } c = (unsigned char*)p; for (i = 0; i < (size * 2); ++i) { /* check that content is preserved */ nerrors += (c[i] == (unsigned 
char)LIBXSMM_MOD2(i, 256) ? 0 : 1); } /* reallocate with smaller size */ p = libxsmm_realloc(size / 2, p); /* check that alignment is preserved */ if (0 != (((uintptr_t)p) % palign)) { ++nerrors; } c = (unsigned char*)p; for (i = 0; i < size / 2; ++i) { /* check that content is preserved */ nerrors += (c[i] == (unsigned char)LIBXSMM_MOD2(i, 256) ? 0 : 1); } } /* query and check the size of the buffer */ if (NULL != p && (EXIT_SUCCESS != libxsmm_get_malloc_info(p, &malloc_info) || malloc_info.size < (size / 2))) { ++nerrors; } libxsmm_free(p); /* release buffer */ /* check degenerated reallocation */ p = libxsmm_realloc(size, NULL/*allocation*/); /* query and check the size of the buffer */ if (NULL != p && (EXIT_SUCCESS != libxsmm_get_malloc_info(p, &malloc_info) || malloc_info.size < size)) { ++nerrors; } #endif /* check that a NULL-pointer yields no size */ if (EXIT_SUCCESS != libxsmm_get_malloc_info(NULL, &malloc_info) || 0 != malloc_info.size) { ++nerrors; } /* release NULL pointer */ libxsmm_free(NULL); /* release buffer */ libxsmm_free(p); /* allocate memory with specific alignment */ p = libxsmm_aligned_malloc(size, alignment); /* check the alignment of the allocation */ if (0 != (((uintptr_t)p) % alignment)) { ++nerrors; } /* release aligned memory */ libxsmm_free(p); /* check foreign memory */ if (EXIT_SUCCESS == libxsmm_get_malloc_info(&size/*faulty pointer*/, &malloc_info)) { ++nerrors; } return 0 == nerrors ? 
EXIT_SUCCESS : EXIT_FAILURE; } libxsmm-1.17/tests/malloc.vcxproj000066400000000000000000000545401415223013700171510ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 malloc {C535D386-FC53-42CF-B9FE-34630FB15929} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/tests/matcopy.c000066400000000000000000000162161415223013700161030ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #if !defined(ELEM_TYPE) # define ELEM_TYPE float #endif #if !defined(TEST_MZERO) # define TEST_MZERO #endif #if !defined(TEST_MCOPY) # define TEST_MCOPY #endif #if !defined(TEST_JIT) # define TEST_JIT #endif #if LIBXSMM_EQUAL(ELEM_TYPE, float) || LIBXSMM_EQUAL(ELEM_TYPE, double) # if defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL) # include # define MATCOPY_GOLD(M, N, A, LDI, B, LDO) \ LIBXSMM_CONCATENATE(mkl_, LIBXSMM_TPREFIX(ELEM_TYPE, omatcopy))('C', 'n', \ (size_t)(*(M)), (size_t)(*(N)), (ELEM_TYPE)1, A, (size_t)(*(LDI)), B, (size_t)(*(LDO))) # elif defined(__OPENBLAS77) && 0/* issue #390 */ # include # define MATCOPY_GOLD(M, N, A, LDI, B, LDO) { \ /*const*/char matcopy_gold_tc_ = 'C', matcopy_gold_tt_ = 'n'; \ /*const*/ELEM_TYPE matcopy_gold_alpha_ = 1; \ LIBXSMM_FSYMBOL(LIBXSMM_TPREFIX(ELEM_TYPE, omatcopy))(&matcopy_gold_tc_, &matcopy_gold_tt_, \ (libxsmm_blasint*)(M), (libxsmm_blasint*)(N), &matcopy_gold_alpha_, \ A, (libxsmm_blasint*)(LDI), B, (libxsmm_blasint*)(LDO)); \ } # endif #endif int main(void) { /* test#: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 */ /* index: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 */ const libxsmm_blasint m[] = { 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 4, 6, 6, 6, 6, 9, 9, 9, 8, 16, 63, 16, 16, 2507 }; const libxsmm_blasint n[] = { 0, 0, 1, 0, 1, 6, 7, 7, 2, 4, 3, 4, 1, 1, 1, 1, 5, 9, 23, 250, 16, 31, 500, 448, 1975 }; const libxsmm_blasint ldi[] = { 0, 1, 1, 1, 1, 1, 2, 2, 2, 17, 3, 6, 6, 8, 6, 7, 9, 9, 9, 512, 16, 63, 16, 512, 3000 }; const libxsmm_blasint ldo[] = { 0, 1, 1, 1, 1, 1, 1, 8, 2, 2, 3, 4, 6, 6, 8, 8, 9, 9, 9, 16, 16, 64, 512, 16, 3072 }; #if defined(TEST_JIT) && (0 != LIBXSMM_JIT) const int prefetch[] = { 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1 }; #endif const int start = 0, ntests = sizeof(m) / sizeof(*m); libxsmm_blasint 
max_size_a = 0, max_size_b = 0, i, j; unsigned int nerrors = 0; ELEM_TYPE *a = 0, *b = 0; #if defined(MATCOPY_GOLD) ELEM_TYPE *c = 0; #endif void (*matcopy[])(void*, const void*, unsigned int, libxsmm_blasint, libxsmm_blasint, libxsmm_blasint, libxsmm_blasint) = { libxsmm_matcopy, libxsmm_matcopy_omp }; int test, fun; for (test = start; test < ntests; ++test) { const libxsmm_blasint size_a = ldi[test] * n[test], size_b = ldo[test] * n[test]; assert(m[test] <= ldi[test] && m[test] <= ldo[test]); max_size_a = LIBXSMM_MAX(max_size_a, size_a); max_size_b = LIBXSMM_MAX(max_size_b, size_b); } a = (ELEM_TYPE*)libxsmm_malloc((size_t)(max_size_a * sizeof(ELEM_TYPE))); b = (ELEM_TYPE*)libxsmm_malloc((size_t)(max_size_b * sizeof(ELEM_TYPE))); assert(NULL != a && NULL != b); LIBXSMM_MATINIT_OMP(ELEM_TYPE, 42, a, max_size_a, 1, max_size_a, 1.0); #if defined(MATCOPY_GOLD) c = (ELEM_TYPE*)libxsmm_malloc((size_t)(max_size_b * sizeof(ELEM_TYPE))); assert(NULL != c); #endif for (fun = 0; fun < 2; ++fun) { for (test = start; test < ntests; ++test) { ELEM_TYPE pattern; memset(b, -1, (size_t)(max_size_b * sizeof(ELEM_TYPE))); pattern = b[0]; /* -NaN */ #if defined(TEST_MZERO) matcopy[fun](b, NULL, sizeof(ELEM_TYPE), m[test], n[test], ldi[test], ldo[test]); for (i = 0; i < n[test]; ++i) { for (j = 0; j < m[test]; ++j) { const ELEM_TYPE u = 0; const ELEM_TYPE v = b[i*ldo[test]+j]; if (LIBXSMM_NEQ(u, v)) { ++nerrors; } } for (j = m[test]; j < ldo[test]; ++j) { if (0 != memcmp(&pattern, b + (size_t)i * ldo[test] + j, sizeof(ELEM_TYPE))) { ++nerrors; } } } if (0 != nerrors) { /* break-out */ fun = test = INT_MAX; break; } #endif #if defined(TEST_MCOPY) matcopy[fun](b, a, sizeof(ELEM_TYPE), m[test], n[test], ldi[test], ldo[test]); for (i = 0; i < n[test]; ++i) { for (j = 0; j < m[test]; ++j) { const ELEM_TYPE u = a[i*ldi[test]+j]; const ELEM_TYPE v = b[i*ldo[test]+j]; if (LIBXSMM_NEQ(u, v)) { ++nerrors; } } for (j = m[test]; j < ldo[test]; ++j) { if (0 != memcmp(&pattern, b + (size_t)i * 
ldo[test] + j, sizeof(ELEM_TYPE))) { ++nerrors; } } } if (0 != nerrors) { /* break-out */ fun = test = INT_MAX; break; } #endif #if defined(MATCOPY_GOLD) if (0 == fun) { MATCOPY_GOLD(m + test, n + test, a, ldi + test, c, ldo + test); for (i = 0; i < n[test]; ++i) { for (j = 0; j < m[test]; ++j) { const ELEM_TYPE u = b[i*ldo[test]+j]; const ELEM_TYPE v = c[i*ldo[test]+j]; if (LIBXSMM_NEQ(u, v)) { ++nerrors; } } for (j = m[test]; j < ldo[test]; ++j) { if (0 != memcmp(&pattern, b + (size_t)i * ldo[test] + j, sizeof(ELEM_TYPE))) { ++nerrors; } } } if (0 != nerrors) { /* break-out */ fun = test = INT_MAX; break; } } #endif #if defined(TEST_JIT) && (0 != LIBXSMM_JIT) /* dispatch kernel and check that it is available */ # if 0 /* Issue #354 */ if (0 == fun && LIBXSMM_X86_AVX <= libxsmm_get_target_archid()) # else if (0 == fun && LIBXSMM_X86_AVX512 <= libxsmm_get_target_archid()) # endif { libxsmm_descriptor_blob blob; const libxsmm_mcopy_descriptor *const desc = libxsmm_mcopy_descriptor_init(&blob, sizeof(ELEM_TYPE), (unsigned int)m[test], (unsigned int)n[test], (unsigned int)ldo[test], (unsigned int)ldi[test], LIBXSMM_MATCOPY_FLAG_DEFAULT, prefetch[test], NULL/*unroll*/); const libxsmm_xmcopyfunction kernel = libxsmm_dispatch_mcopy(desc); if (NULL == kernel) { # if defined(_DEBUG) fprintf(stderr, "\nERROR: kernel %i.%i not generated!\n", fun + 1, test + 1); # endif ++nerrors; fun = test = INT_MAX; break; /* break-out */ } } #endif } } libxsmm_free(a); libxsmm_free(b); #if defined(MATCOPY_GOLD) libxsmm_free(c); #endif if (0 == nerrors) { return EXIT_SUCCESS; } else { # if defined(_DEBUG) fprintf(stderr, "errors=%u\n", nerrors); # endif return EXIT_FAILURE; } } libxsmm-1.17/tests/matcopy.vcxproj000066400000000000000000000550001415223013700173460ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 matcopy {A674A92E-D92F-4AA0-9264-F719D39407B8} 10.0 Application Disabled Disabled v142 true Application true true Disabled 
Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true 
true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/tests/matdiff.c000066400000000000000000000125111415223013700160330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #if !defined(ITYPE) # define ITYPE double #endif int main(void) { int result = EXIT_SUCCESS; libxsmm_matdiff_info diff; /* http://www.netlib.org/lapack/lug/node75.html */ const ITYPE a[] = { (ITYPE)1.00, (ITYPE)2.00, (ITYPE)3.00, (ITYPE)4.00, (ITYPE)5.00, (ITYPE)6.00, (ITYPE)7.00, (ITYPE)8.00, (ITYPE)10.0 }; const ITYPE b[] = { (ITYPE)0.44, (ITYPE)2.36, (ITYPE)3.04, (ITYPE)3.09, (ITYPE)5.87, (ITYPE)6.66, (ITYPE)7.36, (ITYPE)7.77, (ITYPE)9.07 }; const ITYPE x[] = { (ITYPE)1.00, (ITYPE)100.0, (ITYPE)9.00 }; const ITYPE y[] = { (ITYPE)1.10, (ITYPE)99.00, (ITYPE)11.0 }; result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(ITYPE), 3/*m*/, 3/*n*/, a/*ref*/, b/*tst*/, NULL/*ldref*/, NULL/*ldtst*/); if (EXIT_SUCCESS == result) { /* One-norm */ if (0.0000003 < LIBXSMM_ABS(diff.norm1_abs - 1.8300000)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.norm1_rel - 0.0963158)) result = EXIT_FAILURE; /* Infinity-norm */ if (0.0000002 < LIBXSMM_ABS(diff.normi_abs - 2.4400000)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.normi_rel - 0.0976000)) result = EXIT_FAILURE; /* Froebenius-norm (relative) */ if (0.0000001 < LIBXSMM_ABS(diff.normf_rel - 0.1074954)) result = EXIT_FAILURE; /* L2-norm */ if (0.0000002 < LIBXSMM_ABS(diff.l2_abs - 1.8742465)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.l2_rel - 0.6726295)) result = EXIT_FAILURE; /** L1-norm */ if (0.0000001 < LIBXSMM_ABS(diff.l1_ref - 46.00)) result = EXIT_FAILURE; if (0.0000007 < LIBXSMM_ABS(diff.l1_tst - 45.66)) result = EXIT_FAILURE; /* Linf-norm */ if (0.0000004 < LIBXSMM_ABS(diff.linf_abs - 0.9300000)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.linf_rel - 0.5600000)) result = EXIT_FAILURE; /* Location of maximum absolute error */ if (2 != diff.m) result = EXIT_FAILURE; if (2 != diff.n) result = EXIT_FAILURE; } result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(ITYPE), 1/*m*/, 3/*n*/, x/*ref*/, 
y/*tst*/, NULL/*ldref*/, NULL/*ldtst*/); if (EXIT_SUCCESS == result) { /* One-norm */ if (0.0000001 < LIBXSMM_ABS(diff.norm1_abs - 3.1000000)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.norm1_rel - 0.0281818)) result = EXIT_FAILURE; /* Infinity-norm */ if (0.0000001 < LIBXSMM_ABS(diff.normi_abs - 2.0000000)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.normi_rel - 0.0200000)) result = EXIT_FAILURE; /* Froebenius-norm (relative) */ if (0.0000001 < LIBXSMM_ABS(diff.normf_rel - 0.0222918)) result = EXIT_FAILURE; /** L2-norm */ if (0.0000001 < LIBXSMM_ABS(diff.l2_abs - 2.2383029)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.l2_rel - 0.2438908)) result = EXIT_FAILURE; /** L1-norm */ if (0.0000001 < LIBXSMM_ABS(diff.l1_ref - 110.0)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.l1_tst - 111.1)) result = EXIT_FAILURE; /* Linf-norm */ if (0.0000001 < LIBXSMM_ABS(diff.linf_abs - 2.0000000)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.linf_rel - 0.2222222)) result = EXIT_FAILURE; /* Location of maximum absolute error */ if (0 != diff.m) result = EXIT_FAILURE; if (2 != diff.n) result = EXIT_FAILURE; } result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(ITYPE), 3/*m*/, 1/*n*/, x/*ref*/, y/*tst*/, NULL/*ldref*/, NULL/*ldtst*/); if (EXIT_SUCCESS == result) { /* One-norm */ if (0.0000001 < LIBXSMM_ABS(diff.norm1_abs - 3.1000000)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.norm1_rel - 0.0281818)) result = EXIT_FAILURE; /* Infinity-norm */ if (0.0000001 < LIBXSMM_ABS(diff.normi_abs - 2.0000000)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.normi_rel - 0.0200000)) result = EXIT_FAILURE; /* Froebenius-norm (relative) */ if (0.0000001 < LIBXSMM_ABS(diff.normf_rel - 0.0222918)) result = EXIT_FAILURE; /** L2-norm */ if (0.0000001 < LIBXSMM_ABS(diff.l2_abs - 2.2383029)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.l2_rel - 0.2438908)) result = EXIT_FAILURE; /** L1-norm */ if (0.0000001 < 
LIBXSMM_ABS(diff.l1_ref - 110.0)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.l1_tst - 111.1)) result = EXIT_FAILURE; /* Linf-norm */ if (0.0000001 < LIBXSMM_ABS(diff.linf_abs - 2.0000000)) result = EXIT_FAILURE; if (0.0000001 < LIBXSMM_ABS(diff.linf_rel - 0.2222222)) result = EXIT_FAILURE; /* Location of maximum absolute error */ if (2 != diff.m) result = EXIT_FAILURE; if (0 != diff.n) result = EXIT_FAILURE; } return result; } libxsmm-1.17/tests/matdiff.vcxproj000066400000000000000000000544601415223013700173150ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 matdiff {978B5AF3-4A2E-4DC9-BF7C-374403865240} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console libxsmm-1.17/tests/math.c000066400000000000000000000262461415223013700153640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 
Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #if !defined(INCLUDE_LIBXSMM_LAST) # include # include #endif #include #if defined(INCLUDE_LIBXSMM_LAST) # include # include #endif #define N 1000000 LIBXSMM_INLINE unsigned int ref_isqrt_u32(unsigned int u32) { const unsigned int r = (unsigned int)(sqrt((double)u32) + 0.5); return ((double)r * r) <= u32 ? r : (r - 1); } LIBXSMM_INLINE unsigned int ref_isqrt_u64(unsigned long long u64) { #if defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__) /*C99*/ const unsigned long long r = (unsigned long long)(sqrtl((long double)u64) + 0.5); #else const unsigned long long r = (unsigned long long)(sqrt((double)u64) + 0.5); #endif return (unsigned int)(((long double)r * r) <= u64 ? r : (r - 1)); } LIBXSMM_INLINE unsigned int ref_icbrt_u32(unsigned int u32) { const unsigned int r = (unsigned int)(pow((double)u32, 1.0 / 3.0) + 0.5); return ((double)r * r * r) <= u32 ? r : (r - 1); } LIBXSMM_INLINE unsigned int ref_icbrt_u64(unsigned long long u64) { #if defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__) /*C99*/ const unsigned long long r = (unsigned long long)(powl((long double)u64, 1.0 / 3.0) + 0.5); #else const unsigned long long r = (unsigned long long)(pow((double)u64, 1.0 / 3.0) + 0.5); #endif return (unsigned int)(((long double)r * r * r) <= u64 ? 
r : (r - 1)); } LIBXSMM_INLINE unsigned int ref_ilog2_u32(unsigned int u32) { return (unsigned int)ceil(LIBXSMM_LOG2(u32)); } int main(/*int argc, char* argv[]*/) { const unsigned long long scale64 = ((unsigned long long)-1) / (RAND_MAX) - 1; const unsigned int scale32 = ((unsigned int)-1) / (RAND_MAX) - 1; int warn_dsqrt = 0, warn_ssqrt = 0, i; for (i = 0; i < 256; ++i) { const float a = libxsmm_sexp2_u8((unsigned char)i); const float b = LIBXSMM_EXP2F((float)i); if (LIBXSMM_NEQ(a, b)) exit(EXIT_FAILURE); } for (i = -128; i < 127; ++i) { const float a = libxsmm_sexp2_i8((signed char)i); const float b = LIBXSMM_EXP2F((float)i); if (LIBXSMM_NEQ(a, b)) exit(EXIT_FAILURE); } for (i = 0; i < (N); ++i) { const int r1 = (0 != i ? rand() : 0), r2 = (1 < i ? rand() : 0); const double rd = 2.0 * (r1 * (r2 - RAND_MAX / 2)) / RAND_MAX; const unsigned long long r64 = scale64 * r1; const unsigned int r32 = scale32 * r1; double d1, d2, e1, e2, e3; unsigned int a, b; if (LIBXSMM_NEQ(LIBXSMM_ROUND((double)r1), LIBXSMM_ROUNDX(double, (double)r1))) exit(EXIT_FAILURE); if (LIBXSMM_NEQ(LIBXSMM_ROUND((double)r2), LIBXSMM_ROUNDX(double, (double)r2))) exit(EXIT_FAILURE); if (LIBXSMM_NEQ(LIBXSMM_ROUND((double)rd), LIBXSMM_ROUNDX(double, (double)rd))) exit(EXIT_FAILURE); if (LIBXSMM_NEQ(LIBXSMM_ROUNDF((float)r1), LIBXSMM_ROUNDX(float, (float)r1))) exit(EXIT_FAILURE); if (LIBXSMM_NEQ(LIBXSMM_ROUNDF((float)r2), LIBXSMM_ROUNDX(float, (float)r2))) exit(EXIT_FAILURE); if (LIBXSMM_NEQ(LIBXSMM_ROUNDF((float)rd), LIBXSMM_ROUNDX(float, (float)rd))) exit(EXIT_FAILURE); d1 = libxsmm_sexp2((float)rd); d2 = LIBXSMM_EXP2F((float)rd); e1 = fabs(d1 - d2); e2 = fabs(d2); e3 = 0 < e2 ? 
(e1 / e2) : 0.0; if (1E-4 < LIBXSMM_MIN(e1, e3)) exit(EXIT_FAILURE); a = libxsmm_isqrt_u32(r32); b = ref_isqrt_u32(r32); if (a != b) exit(EXIT_FAILURE); a = libxsmm_isqrt_u64(r64); b = ref_isqrt_u64(r64); if (a != b) exit(EXIT_FAILURE); d1 = libxsmm_ssqrt((float)fabs(rd)); e1 = fabs(d1 * d1 - fabs(rd)); d2 = LIBXSMM_SQRTF((float)fabs(rd)); e2 = fabs(d2 * d2 - fabs(rd)); if (e2 < e1) { e3 = 0 < e2 ? (e1 / e2) : 0.f; if (1E-2 > LIBXSMM_MIN(fabs(e1 - e2), e3)) { ++warn_ssqrt; } else { exit(EXIT_FAILURE); } } d1 = libxsmm_dsqrt(fabs(rd)); e1 = fabs(d1 * d1 - fabs(rd)); d2 = sqrt(fabs(rd)); e2 = fabs(d2 * d2 - fabs(rd)); if (e2 < e1) { e3 = 0 < e2 ? (e1 / e2) : 0.f; if (1E-11 > LIBXSMM_MIN(fabs(e1 - e2), e3)) { ++warn_dsqrt; } else { exit(EXIT_FAILURE); } } a = libxsmm_icbrt_u32(r32); b = ref_icbrt_u32(r32); if (a != b) exit(EXIT_FAILURE); a = libxsmm_icbrt_u64(r64); b = ref_icbrt_u64(r64); if (a != b) exit(EXIT_FAILURE); a = LIBXSMM_INTRINSICS_BITSCANFWD32(r32); b = LIBXSMM_INTRINSICS_BITSCANFWD32_SW(r32); if (a != b) exit(EXIT_FAILURE); a = LIBXSMM_INTRINSICS_BITSCANBWD32(r32); b = LIBXSMM_INTRINSICS_BITSCANBWD32_SW(r32); if (a != b) exit(EXIT_FAILURE); a = LIBXSMM_INTRINSICS_BITSCANFWD64(r64); b = LIBXSMM_INTRINSICS_BITSCANFWD64_SW(r64); if (a != b) exit(EXIT_FAILURE); a = LIBXSMM_INTRINSICS_BITSCANBWD64(r64); b = LIBXSMM_INTRINSICS_BITSCANBWD64_SW(r64); if (a != b) exit(EXIT_FAILURE); a = LIBXSMM_ILOG2(i); b = ref_ilog2_u32(i); if (0 != i && a != b) exit(EXIT_FAILURE); a = LIBXSMM_ILOG2(r32); b = ref_ilog2_u32(r32); if (0 != r32 && a != b) exit(EXIT_FAILURE); a = LIBXSMM_ISQRT2(i); b = libxsmm_isqrt_u32(i); if (a < LIBXSMM_DELTA(a, b)) exit(EXIT_FAILURE); a = LIBXSMM_ISQRT2(r32); b = libxsmm_isqrt_u32(r32); if (a < LIBXSMM_DELTA(a, b)) exit(EXIT_FAILURE); a = LIBXSMM_ISQRT2(r64); b = libxsmm_isqrt_u64(r64); if (0 != a/*u32-overflow*/ && a < LIBXSMM_DELTA(a, b)) exit(EXIT_FAILURE); } { /* further check LIBXSMM_INTRINSICS_BITSCANBWD32 */ const int npot[] = { 0, 1, 2, 
4, 8, 16, 32, 64, 128, 256, 65536 }; const int n = sizeof(npot) / sizeof(*npot); for (i = 0; i < n; ++i) { const int numpot = npot[i]; const int nbits = LIBXSMM_INTRINSICS_BITSCANBWD32(numpot); const int num = nbits < numpot ? (1 << nbits) : nbits; if (numpot != num) { exit(EXIT_FAILURE); } } } { /* bit operations: specific tests */ unsigned int a, b; a = LIBXSMM_INTRINSICS_BITSCANFWD64(0x2aaaab69ede0); b = LIBXSMM_INTRINSICS_BITSCANFWD64_SW(0x2aaaab69ede0); if (a != b) exit(EXIT_FAILURE); } if (0 < warn_ssqrt || 0 < warn_dsqrt) { fprintf(stderr, "missed bitwise exact result in %.0f%% of the cases!\n", 100.0 * LIBXSMM_MAX(warn_ssqrt, warn_dsqrt) / N); } { /* check LIBXSMM_UP2POT and LIBXSMM_LO2POT */ const size_t a[] = { 0, 1, 10, 100, 127, 128, 129 }; const size_t b[] = { 0, 1, 16, 128, 128, 128, 256 }; const size_t c[] = { 0, 1, 8, 64, 64, 128, 128 }; const int n = sizeof(a) / sizeof(*a); for (i = 0; i < n; ++i) { if (LIBXSMM_UP2POT(a[i]) != b[i]) exit(EXIT_FAILURE); if (LIBXSMM_LO2POT(a[i]) != c[i]) exit(EXIT_FAILURE); } } { /* check LIBXSMM_UPDIV, LIBXSMM_UP and LIBXSMM_UP2 */ const int ai[] = { 0, 1, 3, 5, 127, 3000 }; const int ao[] = { 0, 1, 1, 1, 19, 429 }; const int bi[] = { 0, 1, 3, 5, 127, 3000 }; const int bo[] = { 0, 7, 7, 7, 133, 3003 }; const int ci[] = { 0, 1, 3, 5, 127, 3000 }; const int co[] = { 0, 8, 8, 8, 128, 3000 }; const int n = sizeof(ai) / sizeof(*ai); for (i = 0; i < n; ++i) { if (LIBXSMM_UPDIV(ai[i], 7) != ao[i]) exit(EXIT_FAILURE); if (LIBXSMM_UP( bi[i], 7) != bo[i]) exit(EXIT_FAILURE); if (LIBXSMM_UP2( ci[i], 8) != co[i]) exit(EXIT_FAILURE); } } { /* check GCD */ const size_t a[] = { 0, 1, 0, 100, 10 }; const size_t b[] = { 0, 0, 1, 10, 100 }; const size_t c[] = { 1, 1, 1, 10, 10 }; const int n = sizeof(a) / sizeof(*a); for (i = 0; i < n; ++i) { if (libxsmm_gcd(a[i], b[i]) != c[i]) exit(EXIT_FAILURE); } } { /* check prime factorization */ const unsigned int test[] = { 0, 1, 2, 3, 5, 7, 12, 13, 24, 32, 2057, 120, 14, 997, 65519u * 65521u 
}; const int n = sizeof(test) / sizeof(*test); unsigned int fact[32]; for (i = 0; i < n; ++i) { const int np = libxsmm_primes_u32(test[i], fact); int j; for (j = 1; j < np; ++j) fact[0] *= fact[j]; if (0 < np && fact[0] != test[i]) { exit(EXIT_FAILURE); } } } { /* check shuffle routine */ const unsigned int test[] = { 0, 1, 2, 3, 5, 7, 12, 13, 24, 32, 2057, 120, 14, 997 }; const int n = sizeof(test) / sizeof(*test); for (i = 0; i < n; ++i) { const size_t coprime = libxsmm_shuffle(test[i]); const unsigned int gcd = (unsigned int)libxsmm_gcd(coprime, test[i]); if ((0 != coprime || 1 < test[i]) && (test[i] <= coprime || 1 != gcd)) { exit(EXIT_FAILURE); } } if (libxsmm_shuffle(65423) != 32711) exit(EXIT_FAILURE); if (libxsmm_shuffle(1000) != 499) exit(EXIT_FAILURE); if (libxsmm_shuffle(997) != 498) exit(EXIT_FAILURE); if (libxsmm_shuffle(24) != 11) exit(EXIT_FAILURE); if (libxsmm_shuffle(5) != 2) exit(EXIT_FAILURE); } /* find upper limited product */ if (libxsmm_product_limit(12 * 5 * 7 * 11 * 13 * 17, 231, 0) != (3 * 7 * 11)) exit(EXIT_FAILURE); if (libxsmm_product_limit(12 * 5 * 7, 32, 0) != (2 * 3 * 5)) exit(EXIT_FAILURE); if (libxsmm_product_limit(12 * 13, 13, 0) != 13) exit(EXIT_FAILURE); if (libxsmm_product_limit(12, 6, 0) != 6) exit(EXIT_FAILURE); if (libxsmm_product_limit(0, 48, 0) != 0) exit(EXIT_FAILURE); if (libxsmm_product_limit(0, 1, 0) != 0) exit(EXIT_FAILURE); if (libxsmm_product_limit(0, 0, 0) != 0) exit(EXIT_FAILURE); if (libxsmm_product_limit(1, 0, 0) != 0) exit(EXIT_FAILURE); /* find lower limited product */ if (libxsmm_product_limit(12 * 5 * 7 * 11 * 13 * 17, 231, 1) != (3 * 7 * 11)) exit(EXIT_FAILURE); if (libxsmm_product_limit(12 * 5 * 7, 36, 1) != (2 * 5 * 7)) exit(EXIT_FAILURE); if (libxsmm_product_limit(12 * 13, 13, 1) != 13) exit(EXIT_FAILURE); if (libxsmm_product_limit(320, 300, 1) != 320) exit(EXIT_FAILURE); if (libxsmm_product_limit(320, 65, 1) != 80) exit(EXIT_FAILURE); if (libxsmm_product_limit(320, 33, 1) != 64) exit(EXIT_FAILURE); if 
(libxsmm_product_limit(1000, 6, 1) != 10) exit(EXIT_FAILURE); if (libxsmm_product_limit(1000, 9, 1) != 10) exit(EXIT_FAILURE); if (libxsmm_product_limit(12, 7, 1) != 12) exit(EXIT_FAILURE); if (libxsmm_product_limit(5, 2, 1) != 5) exit(EXIT_FAILURE); if (libxsmm_product_limit(5, 2, 0) != 1) exit(EXIT_FAILURE); if (libxsmm_product_limit(0, 1, 1) != 0) exit(EXIT_FAILURE); if (libxsmm_product_limit(0, 0, 1) != 0) exit(EXIT_FAILURE); if (libxsmm_product_limit(1, 0, 1) != 0) exit(EXIT_FAILURE); if (libxsmm_isqrt2_u32(1024) * 32 != 1024) exit(EXIT_FAILURE); if (libxsmm_isqrt2_u32(1981) * 283 != 1981) exit(EXIT_FAILURE); if (libxsmm_isqrt2_u32(2507) * 109 != 2507) exit(EXIT_FAILURE); if (libxsmm_isqrt2_u32(1975) * 79 != 1975) exit(EXIT_FAILURE); return EXIT_SUCCESS; } libxsmm-1.17/tests/math.vcxproj000066400000000000000000000545341415223013700166360ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 math {2EE779F4-1C04-4FC8-93A3-A721B4C1F095} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) 
$(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) 
$(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console libxsmm-1.17/tests/memory.c000066400000000000000000000033211415223013700157300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include int main(/*int argc, char* argv[]*/) { char item[LIBXSMM_DESCRIPTOR_MAXSIZE]; const libxsmm_blasint isize = sizeof(item); const libxsmm_blasint size = 1000, ntests = 1000; char *const data = (char*)malloc((size_t)isize * size); libxsmm_blasint i, j, k, s; if (NULL == data) return EXIT_FAILURE; libxsmm_rng_seq(data, isize * size); for (i = 0; i < ntests; ++i) { j = (libxsmm_blasint)libxsmm_rng_u32(size); s = libxsmm_rng_u32(isize) + 1; libxsmm_rng_seq(item, s); for (k = s; k < isize; ++k) item[k] = 0; LIBXSMM_MEMCPY127(data + (j * isize), item, isize); k = libxsmm_diff_n(item, data, (unsigned char)s, (unsigned char)isize, 0, size); while (k < j) { k = libxsmm_diff_n(item, data, (unsigned char)s, (unsigned char)isize, k + 1, size); } if (k == j) { continue; } else { free(data); return EXIT_FAILURE; } } free(data); return EXIT_SUCCESS; } libxsmm-1.17/tests/memory.vcxproj000066400000000000000000000545001415223013700172060ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 memory 10.0 {D7AC0AAD-A1B6-4AC2-9EBD-04F25EEBEF2A} Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 
true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 
3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT 
libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__BLAS=0;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console libxsmm-1.17/tests/mhd.c000066400000000000000000000163461415223013700152030ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include int main(int argc, char* argv[]) { const char *const filename = (1 < argc ? argv[1] : "mhd_image.mhd"); /* take some block-sizes, which are used to test leading dimensions */ const int bw = LIBXSMM_MAX(2 < argc ? atoi(argv[2]) : 64, 1); const int bh = LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 64, 1); size_t ndims = 3, size[3], pitch[3], offset[3], ncomponents, header_size, extension_size; libxsmm_mhd_elemtype type; char data_filename[1024]; void* data = NULL; int result; /* Read header information; function includes various sanity checks. 
*/ result = libxsmm_mhd_read_header(filename, sizeof(data_filename), data_filename, &ndims, size, &ncomponents, &type, &header_size, &extension_size); /* Allocate data according to the header information. */ if (EXIT_SUCCESS == result) { size_t typesize; pitch[0] = LIBXSMM_UP(size[0], bw); pitch[1] = LIBXSMM_UP(size[1], bh); pitch[2] = size[2]; /* center the image inside of the (pitched) buffer */ offset[0] = (pitch[0] - size[0]) / 2; offset[1] = (pitch[1] - size[1]) / 2; offset[2] = 0; if (0 != libxsmm_mhd_typename(type, &typesize, NULL/*ctypename*/)) { const size_t nelements = pitch[0] * (1 < ndims ? (pitch[1] * (2 < ndims ? pitch[2] : 1)) : 1); data = malloc(ncomponents * nelements * typesize); } else { result = EXIT_FAILURE; } } /* perform tests with libxsmm_mhd_element_conversion (int2signed) */ if (EXIT_SUCCESS == result) { short src = 2507, src_min = 0, src_max = 5000; float dst_f32; /* destination range is implicit due to type */ signed char dst_i8; /* destination range is implicit due to type */ result = libxsmm_mhd_element_conversion( &dst_f32, LIBXSMM_MHD_ELEMTYPE_F32/*dst_type*/, LIBXSMM_MHD_ELEMTYPE_I16/*src_type*/, &src, NULL/*src_min*/, NULL/*src_max*/); if (EXIT_SUCCESS == result && src != dst_f32) result = EXIT_FAILURE; if (EXIT_SUCCESS == result) { result = libxsmm_mhd_element_conversion( &dst_f32, LIBXSMM_MHD_ELEMTYPE_F32/*dst_type*/, LIBXSMM_MHD_ELEMTYPE_I16/*src_type*/, &src, &src_min, &src_max); if (EXIT_SUCCESS == result && src != dst_f32) result = EXIT_FAILURE; } if (EXIT_SUCCESS == result) { result = libxsmm_mhd_element_conversion( &dst_i8, LIBXSMM_MHD_ELEMTYPE_I8/*dst_type*/, LIBXSMM_MHD_ELEMTYPE_I16/*src_type*/, &src, NULL/*src_min*/, NULL/*src_max*/); if (EXIT_SUCCESS == result && LIBXSMM_MIN(127, src) != dst_i8) result = EXIT_FAILURE; } if (EXIT_SUCCESS == result) { result = libxsmm_mhd_element_conversion( &dst_i8, LIBXSMM_MHD_ELEMTYPE_I8/*dst_type*/, LIBXSMM_MHD_ELEMTYPE_I16/*src_type*/, &src, &src_min, &src_max); if (EXIT_SUCCESS == 
result && 64 != dst_i8) result = EXIT_FAILURE; } } /* perform tests with libxsmm_mhd_element_conversion (float2int) */ if (EXIT_SUCCESS == result) { double src = 1975, src_min = -25071975, src_max = 1981; short dst_i16; /* destination range is implicit due to type */ unsigned char dst_u8; /* destination range is implicit due to type */ result = libxsmm_mhd_element_conversion( &dst_i16, LIBXSMM_MHD_ELEMTYPE_I16/*dst_type*/, LIBXSMM_MHD_ELEMTYPE_F64/*src_type*/, &src, NULL/*src_min*/, NULL/*src_max*/); if (EXIT_SUCCESS == result && src != dst_i16) result = EXIT_FAILURE; if (EXIT_SUCCESS == result) { result = libxsmm_mhd_element_conversion( &dst_i16, LIBXSMM_MHD_ELEMTYPE_I16/*dst_type*/, LIBXSMM_MHD_ELEMTYPE_F64/*src_type*/, &src, &src_min, &src_max); if (EXIT_SUCCESS == result && 2 != dst_i16) result = EXIT_FAILURE; } if (EXIT_SUCCESS == result) { result = libxsmm_mhd_element_conversion( &dst_u8, LIBXSMM_MHD_ELEMTYPE_U8/*dst_type*/, LIBXSMM_MHD_ELEMTYPE_F64/*src_type*/, &src, NULL/*src_min*/, NULL/*src_max*/); if (EXIT_SUCCESS == result && LIBXSMM_MIN(255, src) != dst_u8) result = EXIT_FAILURE; } if (EXIT_SUCCESS == result) { result = libxsmm_mhd_element_conversion( &dst_u8, LIBXSMM_MHD_ELEMTYPE_U8/*dst_type*/, LIBXSMM_MHD_ELEMTYPE_F64/*src_type*/, &src, &src_min, &src_max); if (EXIT_SUCCESS == result && 255 != dst_u8) result = EXIT_FAILURE; } if (EXIT_SUCCESS == result) { src = -src; result = libxsmm_mhd_element_conversion( &dst_u8, LIBXSMM_MHD_ELEMTYPE_U8/*dst_type*/, LIBXSMM_MHD_ELEMTYPE_F64/*src_type*/, &src, &src_min, &src_max); if (EXIT_SUCCESS == result && 0 != dst_u8) result = EXIT_FAILURE; } if (EXIT_SUCCESS == result) { result = libxsmm_mhd_element_conversion( &dst_i16, LIBXSMM_MHD_ELEMTYPE_I16/*dst_type*/, LIBXSMM_MHD_ELEMTYPE_F64/*src_type*/, &src, &src_min, &src_max); if (EXIT_SUCCESS == result && -3 != dst_i16) result = EXIT_FAILURE; } } /* Read the data according to the header into the allocated buffer. 
*/ if (EXIT_SUCCESS == result) { result = libxsmm_mhd_read(data_filename, offset, size, pitch, ndims, ncomponents, header_size, type, NULL/*type_data*/, data, NULL/*handle_element*/, NULL/*extension*/, 0/*extension_size*/); } /* Write the data into a new file; update header_size. */ if (EXIT_SUCCESS == result) { result = libxsmm_mhd_write("mhd_test.mhd", NULL/*offset*/, pitch, pitch, ndims, ncomponents, type, NULL/*no conversion*/, data, &header_size, NULL/*extension_header*/, NULL/*extension*/, 0/*extension_size*/); } /* Check the written data against the buffer. */ if (EXIT_SUCCESS == result) { result = libxsmm_mhd_read(data_filename, offset, size, pitch, ndims, ncomponents, 0/*header_size*/, type, NULL/*type*/, data, libxsmm_mhd_element_comparison, NULL/*extension*/, 0/*extension_size*/); } /* Check the written data against the buffer with conversion. */ if (EXIT_SUCCESS == result) { const libxsmm_mhd_elemtype tcomp = LIBXSMM_MHD_ELEMTYPE_F64; void* buffer = NULL; size_t typesize; if (0 != libxsmm_mhd_typename(tcomp, &typesize, NULL/*ctypename*/)) { const size_t nelements = pitch[0] * (1 < ndims ? (pitch[1] * (2 < ndims ? pitch[2] : 1)) : 1); buffer = malloc(ncomponents * nelements * typesize); } result = libxsmm_mhd_read(data_filename, offset, size, pitch, ndims, ncomponents, 0/*header_size*/, type, &tcomp, buffer, NULL/*libxsmm_mhd_element_comparison*/, NULL/*extension*/, 0/*extension_size*/); free(buffer); } /* Deallocate the buffer. 
*/ free(data); return result; } libxsmm-1.17/tests/mhd.vcxproj000066400000000000000000000545321415223013700164530ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 mhd {0F908964-721C-4952-99A8-EF5C988CCE58} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console MaxSpeed 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console libxsmm-1.17/tests/mhd_image.mhd000066400000000000000000000004601415223013700166610ustar00rootroot00000000000000ObjectType = Image NDims = 3 BinaryData = True BinaryDataByteOrderMSB = False CompressedData = False TransformMatrix = 1 0 0 0 1 0 0 0 1 Offset = 0 0 0 CenterOfRotation = 0 0 0 AnatomicalOrientation = RAI ElementSpacing = 1 1 1 DimSize = 202 134 1 ElementType = MET_SHORT ElementDataFile = mhd_image.raw libxsmm-1.17/tests/mhd_image.raw000066400000000000000000001515701415223013700167130ustar00rootroot00000000000000 "-6@HNS[[[[[[[[[UOH@6,!'Ca}hI) 1Qp|X3 9fz? 
6h|<=nJ .sB Ru)"fmU@5,# &/:Ke8*nmL0%GlE#aZ2 7n= V}S/ ?t&5d-is[X0 -q6*|l*Iq7A4Jv441Oj( AIXo0 lZN3,_CJ ~Q2u R?G;%e-(n, Z 2B NFKnr 3"g"VUUUUUUUUOM\; o j1S P ['f/=|m =j.f/CE'Qei-f/Zi-f/iy)i-H{{{{{{{Xf/dH)=M`i-f/ I~ Ni-f/(p Qi-f/$ Pj-f/b&Pe+f/?1Q'%%%%%%%&"f/$G (f/YFf/cef/re %, f/rD'a________XGa___________________^O;!c^____a7EQf/g 0k. 62%f/\ oi- r f/7J|i- 0Lf/V6A5i- "of/}(i- pf/i- TZf/5qi- -Pf/;i- -f/fUli- %%%%%%%%%?`*%%%%%%QA $hf/ .i- khW f/J`i- @do>f/&i- f/+DYi- ?ff/ Wi- /0)f/34i- eNf/*i- kf/UTi- f/FPi- f/ i- f/f@i- f/?0Ui- f/*)k,i- f/t i-  f/i- [[[[[[[[[[[[[[[[[[[[[[[[[:f//%i- f/.d?i- |f/IVji- Rf/o p[i- Pf/Li-  'g/Y-Mi- w"1a/7Mi- `.}M8O/8P_- dy=,*+,ME<:/-`B- 13C#/t'- L/- c8[/h]- -/4- 89 2/y } *- Q(q;/0dN3- ; {6"/IK- g(M0-I5 j.ybds4e_Yt.5 b7'-EOOOJ3!r ^G) ?KY"$  ne W`+nL`IL`;9V`,l`.:.`L O`q1y`C\`yLP` "C`uP`/h*Za:u ;Z:> 7ec<s%-hm/s8!Lwn U(%=la rD=_PX~X?, #1@Uu2&r\M<,  %0=L\k}db9 N=bH\GV59#gV*5d(9P+-fmC/fe7 'JldI."Ehy_A" *6AK[kwwk\NE;1& libxsmm-1.17/tests/registry.c000066400000000000000000000064531415223013700163010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include int main(/*int argc, char* argv[]*/) { int result = EXIT_SUCCESS; struct { int x, y, z; } key[] = { { 0, 0, 0 }, { 0, 0, 1 }, { 0, 1, 0 }, { 0, 1, 1 }, { 1, 0, 0 }, { 1, 0, 1 }, { 1, 1, 0 }, { 1, 1, 1 } }; /*const*/ char* value[] = { "hello", "world", "libxsmm", "hello world", "hello libxsmm", "value", "next", "last" }; const size_t key_size = sizeof(*key); #if (0 != LIBXSMM_JIT) /* unused variable warning */ const int n = (int)sizeof(key) / (int)key_size; int i; #endif if (EXIT_SUCCESS == result) { /* test for some expected failure */ result = (NULL == libxsmm_xregister(key, /*too large*/LIBXSMM_DESCRIPTOR_MAXSIZE + 1, strlen(value[0]) + 1, value[0]) ? EXIT_SUCCESS : EXIT_FAILURE); } if (EXIT_SUCCESS == result) { /* test for some expected failure */ result = (NULL == libxsmm_xregister(NULL, 16, /* invalid combination */ strlen(value[0]) + 1, value[0]) ? EXIT_SUCCESS : EXIT_FAILURE); } if (EXIT_SUCCESS == result) { /* test for some expected failure */ result = (NULL == libxsmm_xregister(NULL, 0, /* invalid combination */ strlen(value[0]) + 1, value[0]) ? EXIT_SUCCESS : EXIT_FAILURE); } if (EXIT_SUCCESS == result) { /* test for some expected failure */ result = (NULL == libxsmm_xregister(key, key_size, 0, NULL) ? EXIT_SUCCESS : EXIT_FAILURE); } #if (0 != LIBXSMM_JIT) /* registry service only with JIT */ if (EXIT_SUCCESS == result) { /* same key but (larger) payload; initialized later */ result = (NULL != libxsmm_xregister(key, key_size, strlen(value[0]) + 1, NULL) ? EXIT_SUCCESS : EXIT_FAILURE); } if (EXIT_SUCCESS == result) { /* re-register same key with larger payload */ result = (NULL == libxsmm_xregister(key, key_size, strlen(value[3]) + 1, value[0]) ? 
EXIT_SUCCESS : EXIT_FAILURE); } if (EXIT_SUCCESS == result) { /* release registered value */ libxsmm_xrelease(key, key_size); } for (i = 0; i < n && EXIT_SUCCESS == result; ++i) { result = (NULL != libxsmm_xregister(key + i, key_size, strlen(value[i]) + 1, value[i]) ? EXIT_SUCCESS : EXIT_FAILURE); } for (i = 0; i < n && EXIT_SUCCESS == result; ++i) { const char *const v = (char*)libxsmm_xdispatch(key + i, key_size); libxsmm_kernel_info info; result = libxsmm_get_kernel_info(v, &info); if (EXIT_SUCCESS == result) { result = (LIBXSMM_KERNEL_KIND_USER == info.kind ? EXIT_SUCCESS : EXIT_FAILURE); } if (EXIT_SUCCESS == result) { result = strcmp(v, value[i]); } libxsmm_release_kernel(v); } #endif return result; } libxsmm-1.17/tests/registry.vcxproj000066400000000000000000000545441415223013700175560ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 registry 10.0 {988F6B05-1746-43E6-8FEA-B7DB609EE668} Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
$(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) 
$(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) 
$(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console libxsmm-1.17/tests/rng.c000066400000000000000000000100521415223013700152050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst, Alexander Heinecke (Intel Corp.) ******************************************************************************/ #if !defined(INCLUDE_LIBXSMM_LAST) # include #endif #include #if defined(INCLUDE_LIBXSMM_LAST) # include #endif #if !defined(USE_EXPECTED) && 0 # define USE_EXPECTED #else # include #endif int main(/*int argc, char* argv[]*/) { #if defined(USE_EXPECTED) const unsigned int seed = 25071975; const float rngs_expected[] = { 0.438140392f, 0.284636021f, 0.808342457f, 0.140940785f, 0.740890265f, 0.0189954042f, 0.4811354880f, 0.616942167f, 0.273835897f, 0.636928558f, 0.916998625f, 0.260923862f, 0.673431635f, 0.5160189870f, 0.0404732227f, 0.327739120f }; #endif libxsmm_blasint num_rngs = 1000, i; libxsmm_matdiff_info info; int result = EXIT_SUCCESS; float *const rngs = (float*)malloc((size_t)(sizeof(float) * num_rngs)); if (NULL == rngs) num_rngs = 0; /* mute warning about potentially uninitialized variable */ libxsmm_matdiff_clear(&info); #if defined(USE_EXPECTED) /* setup reproducible sequence */ libxsmm_rng_set_seed(seed); /* fill array with random floats */ libxsmm_rng_f32_seq(rngs, num_rngs); /* check expected value (depends on reproducible seed) */ for (i = 0; i < 16; ++i) { if (rngs_expected[i] != rngs[i]) result = EXIT_FAILURE; } /* reset state */ libxsmm_rng_set_seed(seed); /* enforce scalar RNG */ libxsmm_rng_f32_seq(rngs, 
15); /* check expected value matches scalar RNG; check successful reset */ for (i = 0; i < 16; ++i) { if (rngs_expected[i] != rngs[i]) result = EXIT_FAILURE; } if (EXIT_SUCCESS == result) { /* calculate quality of random numbers */ result = libxsmm_matdiff(&info, LIBXSMM_DATATYPE_F32, 1/*m*/, num_rngs, NULL/*ref*/, rngs/*tst*/, NULL/*ldref*/, NULL/*ldtst*/); } #else { int j; for (j = 0; j < 1000; ++j) { /* setup sequence */ libxsmm_rng_set_seed((unsigned int)time(0)); /* fill array with random floats */ switch (j % 2) { case 1: { for (i = 0; i < num_rngs; ++i) rngs[i] = (float)libxsmm_rng_f64(); } break; default: libxsmm_rng_f32_seq(rngs, num_rngs); } if (EXIT_SUCCESS == result) { /* calculate quality of random numbers */ libxsmm_matdiff_info j_info; result = libxsmm_matdiff(&j_info, LIBXSMM_DATATYPE_F32, 1/*m*/, num_rngs, NULL/*ref*/, rngs/*tst*/, NULL/*ldref*/, NULL/*ldtst*/); if (EXIT_SUCCESS == result) libxsmm_matdiff_reduce(&info, &j_info); } #endif if (EXIT_SUCCESS == result) { libxsmm_blasint num_odd = 0, num_even = 0; const double scale = 0xFFFFFFFF; for (i = 0; i < num_rngs; ++i) { const unsigned int u = (unsigned int)LIBXSMM_ROUND(rngs[i] * scale); if (u & 1) { ++num_odd; } else { ++num_even; } } if (num_rngs < 4 * LIBXSMM_DELTA(num_odd, num_even)) result = EXIT_FAILURE; } #if !defined(USE_EXPECTED) }} #endif if (EXIT_SUCCESS == result) { const double range = info.max_tst - info.min_tst, expected = 0.5; if (expected < 5 * LIBXSMM_DELTA(info.avg_tst, expected)) result = EXIT_FAILURE; if (expected < 5 * LIBXSMM_DELTA(0.5 * range, expected)) result = EXIT_FAILURE; } if (EXIT_SUCCESS == result) { const double expected = 1.0 / 12.0; if (expected < 5 * LIBXSMM_DELTA(info.var_tst, expected)) result = EXIT_FAILURE; } free(rngs); return result; } libxsmm-1.17/tests/rng.vcxproj000066400000000000000000000545321415223013700164710ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 rng 
{A561000C-1143-4897-9F30-D6734CC9B293} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console libxsmm-1.17/tests/test.sh000077500000000000000000000063701415223013700156010ustar00rootroot00000000000000#!/usr/bin/env bash ############################################################################### # Copyright (c) Intel Corporation - All rights reserved. # # This file is part of the LIBXSMM library. # # # # For information on the license, see the LICENSE file. # # Further information: https://github.com/hfp/libxsmm/ # # SPDX-License-Identifier: BSD-3-Clause # ############################################################################### # Hans Pabst (Intel Corp.) 
############################################################################### HERE=$(cd "$(dirname "$0")" && pwd -P) GREP=$(command -v grep) SED=$(command -v sed) ENV=$(command -v env) TR=$(command -v tr) WC=$(command -v wc) #Eventually disable a set of tests e.g., TESTS_DISABLED="headeronly" # list of tests that produce "application must be linked against LAPACK/BLAS" in case of BLAS=0 TESTS_NEEDBLAS="gemm.c" # grep pattern based on TESTS_NEEDBLAS TESTS_NEEDBLAS_GREP=$(echo ${TESTS_NEEDBLAS} | ${SED} "s/[[:space:]][[:space:]]*/\\\\|/g" | ${SED} "s/\./\\\\./g") # good-enough pattern to match main functions, and to include translation unit in test set if [ "" = "$*" ]; then TESTS=$(${GREP} -l "main[[:space:]]*(.*)" ${HERE}/*.c 2>/dev/null) else TESTS=$* fi if [ "${TESTS}" ] && [ "$(${GREP} 'BLAS=0' ${HERE}/../.state 2>/dev/null)" ]; then TESTS=$(echo "${TESTS}" | ${GREP} -v "${TESTS_NEEDBLAS_GREP}") fi if [ "Windows_NT" = "${OS}" ]; then # Cygwin's "env" does not set PATH ("Files/Black: No such file or directory") export PATH=${PATH}:${HERE}/../lib:/usr/x86_64-w64-mingw32/sys-root/mingw/bin # Cygwin's ldd hangs with dyn. linked executables or certain shared libraries LDD=$(command -v cygcheck) EXE=.exe else if [ "$(command -v ldd)" ]; then LDD=ldd elif [ "$(command -v otool)" ]; then LDD="otool -L" else LDD=echo fi fi echo "=============" echo "Running tests" echo "=============" NTEST=1 NMAX=$(echo "${TESTS}" | ${WC} -w | ${TR} -d " ") for TEST in ${TESTS}; do NAME=$(basename "${TEST}" .c) echo -n "${NTEST} of ${NMAX} (${NAME})... 
" if [ "0" != "$(echo ${TESTS_DISABLED} | ${GREP} -q ${NAME}; echo $?)" ]; then cd ${HERE} ERROR=$({ if [ "$(${LDD} ${HERE}/${NAME}${EXE} 2>/dev/null | ${GREP} libiomp5\.)" ]; then ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../lib \ KMP_AFFINITY=scatter,granularity=fine,1 \ MIC_KMP_AFFINITY=scatter,granularity=fine \ MIC_ENV_PREFIX=MIC \ OFFLOAD_INIT=on_start \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} ${TOOL_COMMAND_POST} else ${ENV} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HERE}/../lib \ DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${HERE}/../lib \ OMP_PROC_BIND=TRUE \ ${TOOL_COMMAND} ${HERE}/${NAME}${EXE} ${TOOL_COMMAND_POST} fi >/dev/null; } 2>&1) RESULT=$? else ERROR="Test is disabled" RESULT=0 fi if [ 0 != ${RESULT} ]; then echo "FAILED(${RESULT}) ${ERROR}" exit ${RESULT} else echo "OK ${ERROR}" fi NTEST=$((NTEST+1)) done libxsmm-1.17/tests/threadsafety.c000066400000000000000000000205351415223013700171110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #if !defined(INCLUDE_LIBXSMM_LAST) # include #endif #include #if defined(_OPENMP) # include #endif #if defined(INCLUDE_LIBXSMM_LAST) # include #endif #if !defined(MAX_NKERNELS) # define MAX_NKERNELS 800 #endif #if !defined(CHECK_PARALLEL_INIT) # define CHECK_PARALLEL_INIT #endif #if !defined(CHECK_PARALLEL_JIT) # define CHECK_PARALLEL_JIT #endif #if !defined(CHECK_SEPARATE) # define CHECK_SEPARATE #endif #if !defined(USE_VERBOSE) # define USE_VERBOSE #endif #if !defined(ITYPE) # define ITYPE float #endif #if !defined(OTYPE) # define OTYPE ITYPE #endif #if defined(CHECK_SEPARATE) int test(libxsmm_blasint /*m*/, libxsmm_blasint /*n*/, libxsmm_blasint /*k*/); int test(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k) { const OTYPE alpha = 1, beta = 0; LIBXSMM_MMFUNCTION_TYPE2(ITYPE,OTYPE) kernel; int result = EXIT_FAILURE; #if defined(_OPENMP) && !defined(CHECK_PARALLEL_JIT) # pragma omp single #endif kernel = LIBXSMM_MMDISPATCH_SYMBOL2(ITYPE,OTYPE)(m, n, k, NULL/*lda*/, NULL/*ldb*/, NULL/*ldc*/, &alpha, &beta, NULL/*flags*/, NULL/*prefetch*/); if (NULL != kernel) { libxsmm_mmkernel_info info; libxsmm_xmmfunction xmm; xmm.LIBXSMM_TPREFIX2(ITYPE,OTYPE,mm) = kernel; result = libxsmm_get_mmkernel_info(xmm, &info); if (EXIT_SUCCESS == result) { const unsigned int um = (unsigned int)m, un = (unsigned int)n, uk = (unsigned int)k; if ( um != info.m || un != info.n || uk != info.k || um != info.lda || uk != info.ldb || um != info.ldc || LIBXSMM_GEMM_PRECISION(ITYPE) != info.iprecision || LIBXSMM_GEMM_PRECISION(OTYPE) != info.oprecision) { #if defined(_DEBUG) || defined(USE_VERBOSE) fprintf(stderr, "Error: the %" PRIuPTR "x%" PRIuPTR "x%" PRIuPTR "-kernel does not match!\n", (uintptr_t)m, (uintptr_t)n, (uintptr_t)k); #endif result = EXIT_FAILURE; } } #if defined(_DEBUG) || defined(USE_VERBOSE) else { fprintf(stderr, "Error: the %" PRIuPTR "x%" PRIuPTR "x%" PRIuPTR "-kernel is corrupted!\n", 
(uintptr_t)m, (uintptr_t)n, (uintptr_t)k); } #endif } #if !defined(LIBXSMM_JIT) || (0 == LIBXSMM_JIT) else result = EXIT_SUCCESS; #endif return result; } #endif /*defined(CHECK_SEPARATE)*/ int main(void) { union { libxsmm_xmmfunction x; void* p; } f[MAX_NKERNELS]; const OTYPE alpha = LIBXSMM_ALPHA, beta = LIBXSMM_BETA; const int prefetch = LIBXSMM_PREFETCH_AUTO; libxsmm_registry_info registry_info; const int max_shape = LIBXSMM_MAX_M, flags = LIBXSMM_FLAGS; int result = EXIT_SUCCESS, nkernels = MAX_NKERNELS, ndup = 0, i; #if defined(CHECK_SEPARATE) int mnk[3*MAX_NKERNELS] = { 8,8,8, 16,16,8 }; const int shift = 1, nr = 2; /* nr: predefined triplets */ #endif int r[3*MAX_NKERNELS]; #if defined(_OPENMP) const int nthreads = omp_get_max_threads(); #else const int nthreads = 1; #endif /* generate set of random number for parallel region */ for (i = 0; i < (3 * nkernels); i += 3) { r[i+0] = rand(); r[i+1] = rand(); r[i+2] = rand(); } #if defined(CHECK_SEPARATE) /* fill-up set of (m,n,k) for distinct test set */ for (i = 3 * nr; i < (3 * nkernels); ++i) { mnk[i] = (r[i] + shift) % max_shape + 1; } #endif #if defined(CHECK_PARALLEL_INIT) # if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) private(i) # endif for (i = 0; i < MAX_NKERNELS; ++i) { if (0 == (i % 2)) { libxsmm_init(); } else { libxsmm_finalize(); } } #endif libxsmm_init(); result = libxsmm_get_registry_info(®istry_info); if (EXIT_SUCCESS == result) { nkernels = (int)LIBXSMM_MIN((size_t)nkernels, registry_info.capacity); #if defined(CHECK_SEPARATE) for (i = 0; i < nkernels; i += nthreads) { #if defined(_OPENMP) && defined(CHECK_PARALLEL_JIT) # pragma omp parallel num_threads(nthreads) #endif { #if defined(_OPENMP) && defined(CHECK_PARALLEL_JIT) const int tid = omp_get_thread_num(); #else const int tid = 0; #endif const int j = LIBXSMM_MIN(3 * (i + tid), nkernels - 3); const int ri = test(mnk[j+0], mnk[j+1], mnk[j+2]); if (EXIT_SUCCESS != ri) { #if defined(_OPENMP) && defined(CHECK_PARALLEL_JIT) 
# if (201107 <= _OPENMP) # pragma omp atomic write # else # pragma omp critical # endif #endif result = ri; } } } #endif } if (EXIT_SUCCESS == result) { #if defined(_OPENMP) && defined(CHECK_PARALLEL_JIT) # pragma omp parallel for num_threads(nthreads) private(i) #endif for (i = 0; i < nkernels; ++i) { const libxsmm_blasint m = r[3*i+0] % max_shape + 1; const libxsmm_blasint n = r[3*i+1] % max_shape + 1; const libxsmm_blasint k = r[3*i+2] % max_shape + 1; f[i].x.LIBXSMM_TPREFIX2(ITYPE,OTYPE,mm) = LIBXSMM_MMDISPATCH_SYMBOL2(ITYPE,OTYPE)( m, n, k, &m/*lda*/, &k/*ldb*/, &m/*ldc*/, &alpha, &beta, &flags, &prefetch); } } #if defined(_OPENMP) && !defined(CHECK_PARALLEL_JIT) # pragma omp parallel for num_threads(nthreads) private(i) #endif for (i = 0; i < nkernels; ++i) { if (EXIT_SUCCESS == result) { const libxsmm_blasint m = r[3*i+0] % max_shape + 1; const libxsmm_blasint n = r[3*i+1] % max_shape + 1; const libxsmm_blasint k = r[3*i+2] % max_shape + 1; union { libxsmm_xmmfunction x; void* p; } fi; libxsmm_descriptor_blob blob; const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init(&blob, LIBXSMM_GEMM_PRECISION2(ITYPE,OTYPE), m, n, k, m/*lda*/, k/*ldb*/, m/*ldc*/, &alpha, &beta, flags, prefetch); fi.x = libxsmm_xmmdispatch(desc); if (NULL != fi.p && NULL != f[i].p) { if (fi.p != f[i].p) { libxsmm_kernel_info a_info, b_info; const int ra = libxsmm_get_kernel_info(f[i].p, &a_info); const int rb = libxsmm_get_kernel_info(fi.p, &b_info); /* perform deeper check based on another code generation (used as reference) */ if (EXIT_SUCCESS == ra && EXIT_SUCCESS == rb && (a_info.code_size != b_info.code_size || 0 != memcmp(f[i].p, fi.p, a_info.code_size))) { #if defined(_DEBUG) || defined(USE_VERBOSE) fprintf(stderr, "Error: the %" PRIuPTR "x%" PRIuPTR "x%" PRIuPTR "-kernel does not match!\n", (uintptr_t)m, (uintptr_t)n, (uintptr_t)k); #endif #if defined(_OPENMP) && !defined(CHECK_PARALLEL_JIT) # if (201107 <= _OPENMP) # pragma omp atomic write # else # pragma omp 
critical # endif #endif result = EXIT_FAILURE; } #if defined(_OPENMP) && !defined(CHECK_PARALLEL_JIT) # if (201107 <= _OPENMP) # pragma omp atomic write # else # pragma omp critical # endif #endif ++ndup; } } #if (0 != LIBXSMM_JIT) else { # if defined(_DEBUG) || defined(USE_VERBOSE) fprintf(stderr, "Error: no code generated for %" PRIuPTR "x%" PRIuPTR "x%" PRIuPTR "-kernel!\n", (uintptr_t)m, (uintptr_t)n, (uintptr_t)k); # endif # if defined(_OPENMP) && !defined(CHECK_PARALLEL_JIT) # if (201107 <= _OPENMP) # pragma omp atomic write # else # pragma omp critical # endif # endif result = EXIT_FAILURE; } #endif } } #if defined(_DEBUG) || defined(USE_VERBOSE) if (0 != ndup) fprintf(stderr, "Info: %i kernel%s duplicated.\n", ndup, 1 != ndup ? "s" : ""); #endif /* test unregistering and freeing kernels */ if (EXIT_SUCCESS == result) { for (i = 0; i < nkernels; ++i) { int j = i + 1; /* avoid to double-release kernels */ for (; j < nkernels; ++j) { if (f[i].p == f[j].p) f[j].p = NULL; } libxsmm_release_kernel(f[i].p); } } libxsmm_finalize(); return result; } libxsmm-1.17/tests/threadsafety.vcxproj000066400000000000000000000545541415223013700203720ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 threadsafety {25D60DA9-C699-4530-AD01-A1908DE9735F} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ 
obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/tests/timer.c000066400000000000000000000064671415223013700155560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #if !defined(INCLUDE_LIBXSMM_LAST) # include #endif #include #if defined(INCLUDE_LIBXSMM_LAST) # include #endif #if !defined(USE_NOINIT) # define USE_NOINIT #endif #if !defined(MAX_NSECONDS) # define MAX_NSECONDS 16 #endif #if !defined(MAX_TOLPERC) # define MAX_TOLPERC 12 #endif #if defined(_WIN32) # include #else # include #endif LIBXSMM_INLINE int timer_sleep(unsigned int seconds) { int result; #if defined(_WIN32) Sleep((DWORD)(1000 * seconds)); result = EXIT_SUCCESS; #else result = (0 == sleep(seconds) ? EXIT_SUCCESS : EXIT_FAILURE); #endif return result; } int main(int argc, char* argv[]) { const int max_nseconds_input = (1 < argc ? 
atoi(argv[1]) : MAX_NSECONDS); const unsigned int max_nseconds = (unsigned int)LIBXSMM_UP2POT(LIBXSMM_MAX(max_nseconds_input, 1)); const char *const env_delta = getenv("TIMER_DELTA"); const int max_delta = (NULL == env_delta || 0 == *env_delta) #if defined(_DEBUG) ? -1 #else ? 0 #endif : atoi(env_delta); unsigned int n = max_nseconds, ninterrupts = 0; libxsmm_timer_tickint begin, start; double total = 0, delta = 0, d, t; libxsmm_timer_info info; int result; #if !defined(USE_NOINIT) libxsmm_init(); #endif start = begin = libxsmm_timer_tick(); for (n >>= 1; 0 < n; n >>= 1) { if (EXIT_SUCCESS == timer_sleep(n)) { t = libxsmm_timer_duration(start, libxsmm_timer_tick()); d = 100.0 * LIBXSMM_DELTA(t, (double)n) / n; if (delta < d) delta = d; total += t; } else { total += (double)n; ++ninterrupts; } start = libxsmm_timer_tick(); } start = libxsmm_timer_tick(); if (EXIT_SUCCESS == timer_sleep(1)) { t = libxsmm_timer_duration(start, libxsmm_timer_tick()); d = 100.0 * LIBXSMM_DELTA(t, 1.0); if (delta < d) delta = d; total += t; } else { ++ninterrupts; total += 1.0; } start = libxsmm_timer_tick(); d = 100.0 * LIBXSMM_DELTA(total, (double)max_nseconds) / max_nseconds; if (delta < d) delta = d; result = libxsmm_get_timer_info(&info); if (EXIT_SUCCESS == result) { result = (int)LIBXSMM_ROUND(delta); if ((0 != max_delta || 0 == info.tsc) && (0 > max_delta || result <= max_delta)) { d = libxsmm_timer_duration(begin, start); fprintf(stderr, "seconds=%f delta=%s%i%% interrupted=%u tsc=%s\n", d, 0 == result ? "" : (total <= d ? "+" : "-"), result, ninterrupts, 0 != info.tsc ? 
"true" : "false"); result = EXIT_SUCCESS; } else if ((MAX_TOLPERC) >= result) { result = EXIT_SUCCESS; } } return result; } libxsmm-1.17/tests/timer.vcxproj000066400000000000000000000545361415223013700170270ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 timer 10.0 {2582DF4E-C0DE-4DA4-A9E6-EA4988E31620} Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) 
$(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) 
$(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console libxsmm-1.17/tests/trans.c000066400000000000000000000132651415223013700155570ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. * * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) 
******************************************************************************/ #include #if !defined(ELEM_TYPE) # define ELEM_TYPE double #endif int main(void) { /* test #: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 */ /* index: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 */ const libxsmm_blasint m[] = { 0, 1, 1, 1, 1, 2, 3, 4, 5, 5, 5, 5, 5, 13, 13, 16, 22, 63, 64, 16, 16, 75, 2507 }; const libxsmm_blasint n[] = { 0, 1, 7, 7, 7, 2, 3, 1, 1, 1, 1, 5, 13, 5, 13, 16, 22, 31, 64, 500, 32, 130, 1975 }; const libxsmm_blasint ldi[] = { 0, 1, 1, 1, 9, 2, 3, 4, 5, 8, 8, 5, 5, 13, 13, 16, 22, 64, 64, 16, 512, 87, 3000 }; const libxsmm_blasint ldo[] = { 1, 1, 7, 8, 8, 2, 3, 1, 1, 1, 4, 5, 13, 5, 13, 16, 22, 32, 64, 512, 64, 136, 3072 }; const int start = 0, ntests = sizeof(m) / sizeof(*m); libxsmm_blasint max_size_a = 0, max_size_b = 0; ELEM_TYPE *a = NULL, *b = NULL, *c = NULL; const size_t typesize = sizeof(ELEM_TYPE); unsigned int nerrors = 0; int test, fun; void (*otrans[])(void*, const void*, unsigned int, libxsmm_blasint, libxsmm_blasint, libxsmm_blasint, libxsmm_blasint) = { libxsmm_otrans, libxsmm_otrans_omp }; void (*itrans[])(void*, unsigned int, libxsmm_blasint, libxsmm_blasint, libxsmm_blasint) = { libxsmm_itrans, libxsmm_itrans/*_omp*/ }; for (test = start; test < ntests; ++test) { const libxsmm_blasint size_a = ldi[test] * n[test], size_b = ldo[test] * m[test]; LIBXSMM_ASSERT(m[test] <= ldi[test] && n[test] <= ldo[test]); max_size_a = LIBXSMM_MAX(max_size_a, size_a); max_size_b = LIBXSMM_MAX(max_size_b, size_b); } a = (ELEM_TYPE*)libxsmm_malloc(typesize * max_size_a); b = (ELEM_TYPE*)libxsmm_malloc(typesize * max_size_b); c = (ELEM_TYPE*)libxsmm_malloc(typesize * max_size_b); LIBXSMM_ASSERT(NULL != a && NULL != b && NULL != c); /* initialize data */ LIBXSMM_MATINIT(ELEM_TYPE, 42, a, max_size_a, 1, max_size_a, 1.0); LIBXSMM_MATINIT(ELEM_TYPE, 24, b, max_size_b, 1, max_size_b, 1.0); for (fun = 0; fun < 2; ++fun) { for (test = 
start; test < ntests; ++test) { memcpy(c, b, typesize * max_size_b); otrans[fun](b, a, (unsigned int)typesize, m[test], n[test], ldi[test], ldo[test]); { libxsmm_blasint testerrors = 0, i, j; /* validation */ for (i = 0; i < n[test]; ++i) { for (j = 0; j < m[test]; ++j) { const libxsmm_blasint u = i * ldi[test] + j; const libxsmm_blasint v = j * ldo[test] + i; if (LIBXSMM_NEQ(a[u], b[v])) { ++testerrors; i = n[test]; break; } } for (j = m[test]; j < ldi[test] && 0 == testerrors; ++j) { const libxsmm_blasint v = j * ldo[test] + i; if (v < max_size_b && LIBXSMM_NEQ(b[v], c[v])) { ++testerrors; } } } for (i = n[test]; i < ldo[test] && 0 == testerrors; ++i) { for (j = 0; j < m[test]; ++j) { const libxsmm_blasint v = j * ldo[test] + i; if ((v < max_size_b && LIBXSMM_NEQ(b[v], c[v])) || v >= max_size_b) { ++testerrors; break; } } for (j = m[test]; j < ldi[test] && 0 == testerrors; ++j) { const libxsmm_blasint v = j * ldo[test] + i; if (v < max_size_b && LIBXSMM_NEQ(b[v], c[v])) { ++testerrors; } } } #if (0 != LIBXSMM_JIT) /* dispatch kernel and check that it is available */ if (LIBXSMM_X86_AVX <= libxsmm_get_target_archid() && (4 == typesize || 8 == typesize)) { libxsmm_descriptor_blob blob; const libxsmm_trans_descriptor *const desc = libxsmm_trans_descriptor_init( &blob, (unsigned int)typesize, m[test], n[test], ldo[test]); const libxsmm_xtransfunction kernel = libxsmm_dispatch_trans(desc); if (NULL == kernel) { # if defined(_DEBUG) fprintf(stderr, "\nERROR: kernel %i.%i not generated!\n", fun + 1, test + 1); # endif ++testerrors; } } #endif nerrors += testerrors; } if (LIBXSMM_MAX(n[test], 1) > ldi[test] || 0 != fun) continue; #if 1 /* TODO */ if (m[test] != ldi[test] || n[test] != ldi[test]) continue; #endif memcpy(c, b, typesize * max_size_b); itrans[fun](b, (unsigned int)typesize, m[test], n[test], ldi[test]); { libxsmm_blasint testerrors = 0, i, j; /* validation */ for (i = 0; i < n[test]; ++i) { for (j = 0; j < m[test]; ++j) { const libxsmm_blasint u = i * 
ldi[test] + j; if (LIBXSMM_NEQ(a[u], b[u])) { ++testerrors; i = n[test]; break; } } } nerrors += testerrors; } } } libxsmm_free(a); libxsmm_free(b); libxsmm_free(c); if (0 == nerrors) { return EXIT_SUCCESS; } else { # if defined(_DEBUG) fprintf(stderr, "errors=%u\n", nerrors); # endif return EXIT_FAILURE; } } libxsmm-1.17/tests/trans.vcxproj000066400000000000000000000547741415223013700170420ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 trans {C9CA7758-C0C9-4A50-9757-115410B599B0} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false 
true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm.lib;libxsmmext.lib;libxsmmnoblas.lib;%(AdditionalDependencies) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true 
GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) true Console Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) libxsmm-$(Configuration).lib;libxsmmext-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) MSVCRT Console libxsmm-1.17/tests/vla.c000066400000000000000000000044471415223013700152140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) Intel Corporation - All rights reserved. * * This file is part of the LIBXSMM library. 
* * * * For information on the license, see the LICENSE file. * * Further information: https://github.com/hfp/libxsmm/ * * SPDX-License-Identifier: BSD-3-Clause * ******************************************************************************/ /* Hans Pabst (Intel Corp.) ******************************************************************************/ #include #if !defined(ELEM_TYPE) # define ELEM_TYPE short #endif #define VLA_IJK_DECL(DIM, TYPE, ARRAY, DATA, S1, S2) LIBXSMM_VLA_DECL( DIM, TYPE, ARRAY, DATA, S1, S2) #define VLA_IJK_INDX(DIM, ARRAY, I0, I1, I2, S1, S2) LIBXSMM_VLA_ACCESS(DIM, ARRAY, I0, I1, I2, S1, S2) #define VLA_IKJ_DECL(DIM, TYPE, ARRAY, DATA, S1, S2) LIBXSMM_VLA_DECL( DIM, TYPE, ARRAY, DATA, S2, S1) #define VLA_IKJ_INDX(DIM, ARRAY, I0, I1, I2, S1, S2) LIBXSMM_VLA_ACCESS(DIM, ARRAY, I0, I2, I1, S2, S1) int main(/*int argc, char* argv[]*/) { int ni = 9, nj = 7, nk = 3, i, j, k, linear = 0, result = EXIT_SUCCESS; ELEM_TYPE *const input = (ELEM_TYPE*)malloc(sizeof(ELEM_TYPE) * ni * nj * nk); LIBXSMM_VLA_DECL(1, const ELEM_TYPE, in1, input); VLA_IJK_DECL(3, const ELEM_TYPE, jk3, input, nj, nk); VLA_IKJ_DECL(3, const ELEM_TYPE, kj3, input, nj, nk); LIBXSMM_ASSERT(NULL != input); for (i = 0; i < (ni * nj * nk); ++i) input[i] = (ELEM_TYPE)i; for (i = 0; i < ni && EXIT_SUCCESS == result; ++i) { for (j = 0; j < nj; ++j) { for (k = 0; k < nk; ++k) { const ELEM_TYPE gold0 = input[linear]; const ELEM_TYPE test0 = VLA_IJK_INDX(3, jk3, i, j, k, nj, nk); const ELEM_TYPE gold1 = VLA_IJK_INDX(3, kj3, i, k, j, nk, nj); const ELEM_TYPE test1 = VLA_IKJ_INDX(3, kj3, i, j, k, nj, nk); if (gold0 != LIBXSMM_VLA_ACCESS(1, in1, linear) || gold0 != test0 || gold1 != test1) { result = EXIT_FAILURE; j = nj; break; } ++linear; } } } free(input); return result; } libxsmm-1.17/tests/vla.vcxproj000066400000000000000000000546321415223013700164660ustar00rootroot00000000000000 debug Win32 debug x64 symbols Win32 symbols x64 release Win32 release x64 vla 
{326EBB80-794E-4E06-BA06-85CFE26F7C3C} 10.0 Application Disabled Disabled v142 true Application true true Disabled Disabled v142 Application true Disabled Disabled v142 true Application Disabled Disabled v142 true true Application true Disabled Disabled v142 Application true Disabled Disabled true v142 <_ProjectFileVersion>10.0.30319.1 bin\ia32\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\ia32\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ bin\intel64\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) obj\$(Platform)-$(Configuration)\$(ProjectName)\ $(ProjectName)-$(Configuration) $(ProjectName)-$(Configuration) Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) 
__BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true StreamingSIMDExtensions2 None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console true libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Full $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console libxsmmnoblas.lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console X64 MaxSpeed $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;NDEBUG;%(PreprocessorDefinitions) true MultiThreadedDLL false Level4 Fast NoTraps true true None false true GenerateParallelCode SingleFile 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console true libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console Disabled 
$(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) Console X64 Disabled $(SolutionDir)..\include;$(LIBXSMMROOT)\include;$(MKLROOT)\include;%(AdditionalIncludeDirectories) __BLAS=0;__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;__MKL;_DEBUG;%(PreprocessorDefinitions) MultiThreadedDebugDLL Level4 None false true GenerateParallelCode 3948,10373,10382 HOST true 0x0407 $(OutDir)$(TargetName)$(TargetExt) true true Console MSVCRT libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) Console libxsmm-1.17/version.txt000066400000000000000000000000151415223013700153350ustar00rootroot00000000000000release-1.17